pinokiod 7.3.5 → 7.3.6

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -3,13 +3,31 @@
3
3
  const fs = require("fs")
4
4
  const os = require("os")
5
5
  const path = require("path")
6
- const { execFileText, normalizePid } = require("./process_tree")
6
+ const { normalizePid } = require("./process_tree")
7
7
 
8
- const DEFAULT_GPU_TTL_MS = 10000
9
- const DEFAULT_GPU_TIMEOUT_MS = 2500
10
- const DEFAULT_WINDOWS_GPU_COUNTER_TTL_MS = 30000
8
+ const DEFAULT_GPU_TTL_MS = 5000
9
+ const DEFAULT_DRM_FDINFO_MAX_PIDS = 4096
10
+ const DEFAULT_DRM_FDINFO_MAX_FDS_PER_PID = 1024
11
11
  const MIB = 1024 * 1024
12
12
 
13
+ const WINDOWS_GPU_PROCESS_COUNTER = "\\GPU Process Memory(*)\\Dedicated Usage"
14
+ const ERROR_SUCCESS = 0
15
+ const PDH_MORE_DATA = 0x800007D2
16
+ const PDH_INVALID_PATH = 0xC0000BC4
17
+ const PDH_INVALID_DATA = 0xC0000BC6
18
+ const PDH_NO_DATA = 0x800007D5
19
+ const PDH_FMT_LARGE = 0x00000400
20
+
21
+ const NVML_SUCCESS = 0
22
+ const NVML_ERROR_INSUFFICIENT_SIZE = 7
23
+ const NVML_VALUE_NOT_AVAILABLE = 0xFFFFFFFFFFFFFFFFn
24
+
25
+ const AMDSMI_INIT_AMD_GPUS = 1 << 1
26
+ const RSMI_INIT_DEFAULT = 0
27
+
28
+ let koffiModule
29
+ const koffiTypeCache = new WeakMap()
30
+
13
31
  function unique(values) {
14
32
  const seen = new Set()
15
33
  const next = []
@@ -21,54 +39,198 @@ function unique(values) {
21
39
  return next
22
40
  }
23
41
 
24
- function pathExists(filepath) {
42
+ function loadKoffi() {
43
+ if (koffiModule !== undefined) {
44
+ return koffiModule
45
+ }
25
46
  try {
26
- fs.accessSync(filepath, fs.constants.X_OK)
27
- return true
47
+ koffiModule = require("koffi")
28
48
  } catch (_) {
29
- return false
49
+ koffiModule = null
30
50
  }
51
+ return koffiModule
52
+ }
53
+
54
/**
 * Memoizes koffi type definitions per koffi module instance.
 *
 * The first request for `key` on a given `koffi` object runs `factory` and
 * stores its result; later requests return the stored value without calling
 * `factory` again.
 *
 * @param {object} koffi koffi module instance used as the outer cache key.
 * @param {string} key name of the type bundle (e.g. "nvml").
 * @param {Function} factory zero-argument builder for the type bundle.
 * @returns {*} the cached (or freshly built) value for `key`.
 */
function getCachedKoffiTypes(koffi, key, factory) {
  if (!koffiTypeCache.has(koffi)) {
    koffiTypeCache.set(koffi, new Map())
  }
  const typesByKey = koffiTypeCache.get(koffi)
  if (typesByKey.has(key)) {
    return typesByKey.get(key)
  }
  const created = factory()
  typesByKey.set(key, created)
  return created
}
65
+
66
/**
 * Returns the koffi struct types used by the Windows PDH bindings, defining
 * them once per koffi instance under the "windows-pdh" cache key.
 *
 * @param {object} koffi koffi module instance.
 * @returns {{counterValue: *, counterInfo: *}} the two struct types.
 */
function getWindowsPdhTypes(koffi) {
  const counterValueFields = {
    CStatus: "uint32_t",
    largeValue: "int64_t"
  }
  // Only the fields up to and including szFullPath are declared here.
  const counterInfoFields = {
    dwLength: "uint32_t",
    dwType: "uint32_t",
    CVersion: "uint32_t",
    CStatus: "uint32_t",
    lScale: "int32_t",
    lDefaultScale: "int32_t",
    dwUserData: "uintptr_t",
    dwQueryUserData: "uintptr_t",
    szFullPath: "str16"
  }
  const definePdhTypes = () => ({
    counterValue: koffi.struct("PDH_FMT_COUNTERVALUE", counterValueFields),
    counterInfo: koffi.struct("PDH_COUNTER_INFO_W_PREFIX", counterInfoFields)
  })
  return getCachedKoffiTypes(koffi, "windows-pdh", definePdhTypes)
}
86
+
87
/**
 * Returns the koffi struct types for the two NVML process-info layouts,
 * defining them once per koffi instance under the "nvml" cache key.
 *
 * @param {object} koffi koffi module instance.
 * @returns {{processInfoV1: *, processInfoV2: *}} the two struct types.
 */
function getNvmlTypes(koffi) {
  const v1Fields = {
    pid: "uint32_t",
    usedGpuMemory: "uint64_t"
  }
  // The v2 layout is the v1 layout followed by the two instance ids.
  const v2Fields = {
    pid: "uint32_t",
    usedGpuMemory: "uint64_t",
    gpuInstanceId: "uint32_t",
    computeInstanceId: "uint32_t"
  }
  const defineNvmlTypes = () => ({
    processInfoV1: koffi.struct("nvmlProcessInfo_v1_t", v1Fields),
    processInfoV2: koffi.struct("nvmlProcessInfo_v2_t", v2Fields)
  })
  return getCachedKoffiTypes(koffi, "nvml", defineNvmlTypes)
}
102
+
103
/**
 * Returns the koffi struct type describing one AMD SMI process entry,
 * defining it (and its two nested structs) once per koffi instance under the
 * "amdsmi" cache key.
 *
 * @param {object} koffi koffi module instance.
 * @returns {{procInfo: *}} the `amdsmi_proc_info_t` struct type.
 */
function getAmdSmiTypes(koffi) {
  const defineAmdSmiTypes = () => {
    // Nested structs are registered first because procInfo embeds them by value.
    const engineUsage = koffi.struct("amdsmi_engine_usage_process_t", {
      gfx: "uint64_t",
      enc: "uint64_t",
      reserved: koffi.array("uint32_t", 12)
    })
    const memoryUsage = koffi.struct("amdsmi_memory_usage_process_t", {
      gtt_mem: "uint64_t",
      cpu_mem: "uint64_t",
      vram_mem: "uint64_t",
      reserved: koffi.array("uint32_t", 10)
    })
    return {
      procInfo: koffi.struct("amdsmi_proc_info_t", {
        name: koffi.array("char", 256),
        pid: "uint32_t",
        mem: "uint64_t",
        engine_usage: engineUsage,
        memory_usage: memoryUsage,
        container_name: koffi.array("char", 256),
        cu_occupancy: "uint32_t",
        evicted_time: "uint32_t",
        reserved: koffi.array("uint32_t", 10)
      })
    }
  }
  return getCachedKoffiTypes(koffi, "amdsmi", defineAmdSmiTypes)
}
130
+
131
/**
 * Returns the koffi struct type describing one ROCm SMI process entry,
 * defining it once per koffi instance under the "rocm-smi" cache key.
 *
 * @param {object} koffi koffi module instance.
 * @returns {{procInfo: *}} the `rsmi_process_info_t` struct type.
 */
function getRocmSmiTypes(koffi) {
  const processInfoFields = {
    process_id: "uint32_t",
    pasid: "uint32_t",
    vram_usage: "uint64_t",
    sdma_usage: "uint64_t",
    cu_occupancy: "uint32_t"
  }
  const defineRocmSmiTypes = () => ({
    procInfo: koffi.struct("rsmi_process_info_t", processInfoFields)
  })
  return getCachedKoffiTypes(koffi, "rocm-smi", defineRocmSmiTypes)
}
32
143
 
33
- function executableCandidates(candidates) {
144
+ function existingLibraryCandidates(candidates) {
34
145
  return unique(candidates).filter((candidate) => {
35
146
  if (!candidate) return false
36
- if (path.isAbsolute(candidate)) {
37
- return pathExists(candidate)
147
+ if (!path.isAbsolute(candidate)) return true
148
+ try {
149
+ return fs.existsSync(candidate)
150
+ } catch (_) {
151
+ return false
38
152
  }
39
- return true
40
153
  })
41
154
  }
42
155
 
43
- function getPinokioCondaCandidates(kernel, names) {
44
- if (!kernel || !kernel.homedir) {
45
- return []
156
+ function rocmLibraryCandidates(filename) {
157
+ const roots = unique([
158
+ process.env.ROCM_PATH,
159
+ process.env.ROCM_HOME,
160
+ "/opt/rocm",
161
+ "/usr",
162
+ "/usr/local"
163
+ ])
164
+ const candidates = [filename]
165
+ for (const root of roots) {
166
+ candidates.push(
167
+ path.join(root, "lib", filename),
168
+ path.join(root, "lib64", filename)
169
+ )
46
170
  }
47
- const prefix = path.resolve(kernel.homedir, "bin", "miniconda")
48
- const suffixes = os.platform() === "win32"
49
- ? ["", ".exe"]
50
- : [""]
51
- const folders = os.platform() === "win32"
52
- ? ["Library/bin", "Scripts", ""]
53
- : ["bin", "Library/bin", ""]
54
- const candidates = []
55
- for (const name of names) {
56
- for (const folder of folders) {
57
- for (const suffix of suffixes) {
58
- candidates.push(path.resolve(prefix, folder, `${name}${suffix}`))
59
- }
171
+ candidates.push(
172
+ path.join("/usr/lib/x86_64-linux-gnu", filename),
173
+ path.join("/usr/lib/aarch64-linux-gnu", filename),
174
+ path.join("/usr/local/lib", filename)
175
+ )
176
+ return existingLibraryCandidates(candidates)
177
+ }
178
+
179
+ function loadFirstLibrary(koffi, candidates, options = {}) {
180
+ let lastError = null
181
+ for (const candidate of existingLibraryCandidates(candidates)) {
182
+ try {
183
+ return koffi.load(candidate, options)
184
+ } catch (error) {
185
+ lastError = error
60
186
  }
61
187
  }
62
- return candidates
188
+ throw lastError || new Error("native GPU library unavailable")
189
+ }
190
+
191
/**
 * Binds the first prototype in `definitions` that the library accepts.
 *
 * Each definition is passed to `library.func`; a definition whose binding
 * throws is skipped (best effort, the error is intentionally discarded).
 *
 * @param {object} library object exposing `func(definition)`.
 * @param {Iterable<string>} definitions candidate prototypes, in priority order.
 * @returns {*} the first successfully bound function, or null when none bind.
 */
function optionalFunction(library, definitions) {
  for (const prototype of definitions) {
    let bound
    try {
      bound = library.func(prototype)
    } catch (_) {
      // This spelling is not available; try the next one.
      continue
    }
    return bound
  }
  return null
}
199
+
200
/**
 * Normalizes a native status value to an unsigned 32-bit integer so that
 * signed and unsigned representations of the same code compare equal.
 *
 * @param {*} value status as returned by a native call (number, bigint, ...).
 * @returns {number} the value coerced with Number() and reduced to uint32.
 */
function statusCode(value) {
  const numeric = Number(value)
  return numeric >>> 0
}
203
+
204
/**
 * Tests whether a native status equals an expected code, comparing both as
 * unsigned 32-bit integers.
 *
 * @param {*} value status returned by a native call.
 * @param {number} expected status code to compare against.
 * @returns {boolean} true when both denote the same 32-bit code.
 */
function isStatus(value, expected) {
  const actualCode = statusCode(value)
  const expectedCode = expected >>> 0
  return actualCode === expectedCode
}
207
+
208
/**
 * Tests whether a native status is ERROR_SUCCESS.
 *
 * @param {*} value status returned by a native call.
 * @returns {boolean} true when the status equals ERROR_SUCCESS.
 */
function isSuccess(value) {
  const succeeded = isStatus(value, ERROR_SUCCESS)
  return succeeded
}
211
+
212
/**
 * Tests whether a PDH status is one of the codes this module treats as
 * "nothing to report": PDH_INVALID_PATH, PDH_INVALID_DATA or PDH_NO_DATA.
 *
 * @param {*} value status returned by a PDH call.
 * @returns {boolean} true when the status is one of those three codes.
 */
function isNoDataStatus(value) {
  const noDataCodes = [PDH_INVALID_PATH, PDH_INVALID_DATA, PDH_NO_DATA]
  return noDataCodes.some((code) => isStatus(value, code))
}
215
+
216
/**
 * Converts a native numeric value to a non-negative JavaScript number that
 * can be represented exactly.
 *
 * Both bigint and non-bigint inputs are held to the same range
 * (0 .. Number.MAX_SAFE_INTEGER), and null/undefined are rejected instead of
 * being coerced to 0, so "no value" is never reported as zero bytes.
 *
 * @param {*} value bigint, number, or anything Number() can coerce.
 * @returns {number|null} the converted number, or null when the value is
 *   missing, negative, non-finite, or above Number.MAX_SAFE_INTEGER.
 */
function toSafeNumber(value) {
  if (value == null) return null
  if (typeof value === "bigint") {
    if (value < 0n || value > BigInt(Number.MAX_SAFE_INTEGER)) return null
    return Number(value)
  }
  const number = Number(value)
  if (!Number.isFinite(number) || number < 0) return null
  // Mirror the bigint branch: values past 2^53 - 1 have already lost precision.
  if (number > Number.MAX_SAFE_INTEGER) return null
  return number
}
64
225
 
65
226
  function parseMemoryToBytes(value, defaultUnit = "") {
66
227
  if (value == null) return null
67
- if (typeof value === "number") {
68
- if (!Number.isFinite(value) || value < 0) return null
69
- if (defaultUnit === "mib") return Math.round(value * MIB)
70
- if (defaultUnit === "kb") return Math.round(value * 1024)
71
- return Math.round(value)
228
+ if (typeof value === "number" || typeof value === "bigint") {
229
+ const number = toSafeNumber(value)
230
+ if (number == null) return null
231
+ if (defaultUnit === "mib") return Math.round(number * MIB)
232
+ if (defaultUnit === "kb") return Math.round(number * 1024)
233
+ return Math.round(number)
72
234
  }
73
235
  const raw = String(value).trim()
74
236
  if (!raw || /N\/A|not supported|none/i.test(raw)) {
@@ -112,112 +274,706 @@ function mergeGpuProcess(processes, pid, bytes) {
112
274
  processes.set(normalizedPid, current)
113
275
  }
114
276
 
115
- function parseNvidiaCsv(stdout) {
116
- const processes = new Map()
117
- for (const line of String(stdout || "").split(/\r?\n/)) {
118
- const trimmed = line.trim()
119
- if (!trimmed) continue
120
- const parts = trimmed.split(",").map((part) => part.trim())
121
- const pid = normalizePid(parts[0])
122
- const bytes = parseMemoryToBytes(parts[1], "mib")
123
- addGpuProcess(processes, pid, bytes)
277
+ function normalizePidSet(values) {
278
+ const pids = []
279
+ for (const value of values || []) {
280
+ const pid = normalizePid(value)
281
+ if (pid) pids.push(pid)
124
282
  }
125
- return processes
283
+ return Array.from(new Set(pids)).sort((a, b) => a - b)
126
284
  }
127
285
 
128
- function parseCsvLine(line) {
129
- const values = []
130
- let current = ""
131
- let inQuotes = false
132
- const text = String(line || "")
133
- for (let i = 0; i < text.length; i += 1) {
134
- const char = text[i]
135
- if (char === "\"") {
136
- if (inQuotes && text[i + 1] === "\"") {
137
- current += "\""
138
- i += 1
139
- } else {
140
- inQuotes = !inQuotes
141
- }
142
- } else if (char === "," && !inQuotes) {
143
- values.push(current)
144
- current = ""
145
- } else {
146
- current += char
286
+ function filterProcessMap(processes, pids) {
287
+ const targetPids = normalizePidSet(pids)
288
+ if (targetPids.length === 0 && pids != null) {
289
+ return new Map()
290
+ }
291
+ if (targetPids.length === 0) {
292
+ return processes
293
+ }
294
+ const targetSet = new Set(targetPids)
295
+ const filtered = new Map()
296
+ for (const entry of processes.values()) {
297
+ if (targetSet.has(entry.pid)) {
298
+ filtered.set(entry.pid, entry)
147
299
  }
148
300
  }
149
- values.push(current)
150
- return values
301
+ return filtered
302
+ }
303
+
304
/**
 * Collects the keys of a process map into a Set.
 *
 * @param {Map} processes map whose keys are the pids already accounted for.
 * @returns {Set} a new Set containing every key of `processes`.
 */
function coveredPids(processes) {
  const covered = new Set()
  for (const pid of processes.keys()) {
    covered.add(pid)
  }
  return covered
}
307
+
308
/**
 * Reports whether any requested pid is missing from `covered`.
 *
 * A null/undefined `pids` means no explicit target list and always counts as
 * uncovered; a list that normalizes to nothing is never uncovered.
 *
 * @param {Iterable|null|undefined} pids requested pids.
 * @param {Set} covered pids that already have data.
 * @returns {boolean} true when at least one target still lacks data.
 */
function hasUncoveredTarget(pids, covered) {
  if (pids == null) return true
  const wanted = normalizePidSet(pids)
  return wanted.some((pid) => !covered.has(pid))
}
152
317
 
153
- function parseWindowsGpuProcessMemoryCsv(stdout) {
154
- const rows = []
318
/**
 * Extracts the process id embedded in a Windows GPU counter instance name.
 *
 * Looks for a `pid` token (case-insensitive, not preceded by a letter or
 * digit) followed by optional `_`, whitespace or `-` and then digits, and
 * hands the digits to normalizePid.
 *
 * @param {*} instanceName counter instance name or full counter path.
 * @returns {*} whatever normalizePid returns for the captured digits, or for
 *   null when the pattern does not match.
 */
function extractPidFromWindowsGpuInstance(instanceName) {
  const text = String(instanceName || "")
  const found = text.match(/(?:^|[^a-z0-9])pid[_\s-]*(\d+)(?:\D|$)/i)
  return normalizePid(found ? found[1] : found)
}
322
+
323
/**
 * Decodes a UTF-16LE multi-string buffer (NUL-terminated strings, list ended
 * by an empty string) into an array of JavaScript strings.
 *
 * Scanning covers at most `charCount` 16-bit units and never reads past the
 * buffer. Decoding stops at the first empty string; a trailing segment that
 * has no terminating NUL inside the scanned range is not returned.
 *
 * @param {Buffer} buffer raw UTF-16LE data.
 * @param {number} charCount number of 16-bit units that are valid in `buffer`.
 * @returns {string[]} the decoded non-empty strings, in order.
 */
function decodeWindowsMultiSz(buffer, charCount) {
  const availableChars = Math.floor(buffer.length / 2)
  const limit = Math.max(0, Math.min(charCount || 0, availableChars))
  const decoded = []
  let segmentStart = 0
  let cursor = 0
  while (cursor < limit) {
    const isTerminator = buffer.readUInt16LE(cursor * 2) === 0
    if (isTerminator) {
      // A terminator right at the segment start is the empty string that ends the list.
      if (cursor === segmentStart) break
      decoded.push(buffer.toString("utf16le", segmentStart * 2, cursor * 2))
      segmentStart = cursor + 1
    }
    cursor += 1
  }
  return decoded.filter((entry) => entry)
}
336
+
337
/**
 * Decides whether a DRM fdinfo memory-region name denotes dedicated memory.
 *
 * The name is trimmed, lower-cased and reduced to letters and digits. Names
 * of the form system/gtt/memory/shared/stolen/cpu/host (optionally followed
 * by digits) are rejected; only `vram` or `local` (optionally followed by
 * digits) are accepted.
 *
 * @param {*} region region suffix taken from a `drm-*-<region>` key.
 * @returns {boolean} true for vram/local style regions, false otherwise.
 */
function isDedicatedDrmMemoryRegion(region) {
  const label = String(region || "").trim().toLowerCase()
  const dashed = label.replace(/[_\s]+/g, "-")
  const compact = dashed.replace(/[^a-z0-9]/g, "")
  if (compact === "") {
    return false
  }
  const isSharedOrSystem = /^(system|gtt|memory|shared|stolen|cpu|host)\d*$/.test(compact)
  if (isSharedOrSystem) {
    return false
  }
  if (/^vram\d*$/.test(compact)) {
    return true
  }
  return /^local\d*$/.test(compact)
}
348
+
349
+ function parseLinuxDrmFdinfo(stdout) {
350
+ const fields = new Map()
155
351
  for (const line of String(stdout || "").split(/\r?\n/)) {
156
- const trimmed = line.trim()
157
- if (!trimmed || !trimmed.startsWith("\"")) continue
158
- const row = parseCsvLine(trimmed)
159
- if (row.length > 1) rows.push(row)
352
+ const separator = line.indexOf(":")
353
+ if (separator < 0) continue
354
+ const key = line.slice(0, separator).trim().toLowerCase()
355
+ const value = line.slice(separator + 1).trim()
356
+ if (key) fields.set(key, value)
160
357
  }
161
- if (rows.length < 2) {
162
- return new Map()
358
+
359
+ const driver = fields.get("drm-driver")
360
+ if (!driver) {
361
+ return null
362
+ }
363
+
364
+ let residentBytes = 0
365
+ let legacyMemoryBytes = 0
366
+ let hasResidentDedicatedMemory = false
367
+ for (const [key, value] of fields.entries()) {
368
+ const match = /^drm-(resident|memory)-(.+)$/.exec(key)
369
+ if (!match || !isDedicatedDrmMemoryRegion(match[2])) continue
370
+ const bytes = parseMemoryToBytes(value)
371
+ if (!Number.isFinite(bytes) || bytes < 0) continue
372
+ if (match[1] === "resident") {
373
+ hasResidentDedicatedMemory = true
374
+ residentBytes += bytes
375
+ } else {
376
+ legacyMemoryBytes += bytes
377
+ }
378
+ }
379
+
380
+ return {
381
+ driver,
382
+ pdev: fields.get("drm-pdev") || "",
383
+ clientId: fields.get("drm-client-id") || "",
384
+ dedicatedBytes: hasResidentDedicatedMemory ? residentBytes : legacyMemoryBytes
163
385
  }
164
- const headers = rows[0]
165
- const values = rows[rows.length - 1]
386
+ }
387
+
388
+ async function collectLinuxDrmFdinfoProcesses(pids, options = {}) {
389
+ const procRoot = options.procRoot || "/proc"
390
+ const maxPids = options.maxPids || DEFAULT_DRM_FDINFO_MAX_PIDS
391
+ const maxFdsPerPid = options.maxFdsPerPid || DEFAULT_DRM_FDINFO_MAX_FDS_PER_PID
392
+ const targetPids = normalizePidSet(pids).slice(0, maxPids)
393
+ const byClient = new Map()
394
+
395
+ for (const pid of targetPids) {
396
+ const fdinfoDir = path.join(procRoot, String(pid), "fdinfo")
397
+ let entries = []
398
+ try {
399
+ entries = await fs.promises.readdir(fdinfoDir, { withFileTypes: true })
400
+ } catch (_) {
401
+ continue
402
+ }
403
+
404
+ let scannedFds = 0
405
+ for (const entry of entries) {
406
+ const name = entry && entry.name ? entry.name : ""
407
+ if (!/^\d+$/.test(name)) continue
408
+ scannedFds += 1
409
+ if (scannedFds > maxFdsPerPid) break
410
+
411
+ let stdout = ""
412
+ try {
413
+ stdout = await fs.promises.readFile(path.join(fdinfoDir, name), "utf8")
414
+ } catch (_) {
415
+ continue
416
+ }
417
+
418
+ const parsed = parseLinuxDrmFdinfo(stdout)
419
+ if (!parsed || !(parsed.dedicatedBytes > 0)) continue
420
+ const clientKey = parsed.clientId
421
+ ? `client:${parsed.clientId}`
422
+ : "unknown-client"
423
+ const key = [
424
+ pid,
425
+ parsed.driver || "unknown-driver",
426
+ parsed.pdev || "unknown-device",
427
+ clientKey
428
+ ].join(":")
429
+ const current = byClient.get(key)
430
+ byClient.set(key, {
431
+ pid,
432
+ bytes: current ? Math.max(current.bytes, parsed.dedicatedBytes) : parsed.dedicatedBytes
433
+ })
434
+ }
435
+ }
436
+
166
437
  const processes = new Map()
167
- for (let i = 1; i < headers.length && i < values.length; i += 1) {
168
- const instanceName = String(headers[i] || "")
169
- const match = /pid[_\s-]*(\d+)/i.exec(instanceName)
170
- const pid = normalizePid(match && match[1])
171
- const bytes = parseMemoryToBytes(values[i])
172
- addGpuProcess(processes, pid, bytes)
438
+ for (const entry of byClient.values()) {
439
+ addGpuProcess(processes, entry.pid, entry.bytes)
173
440
  }
174
441
  return processes
175
442
  }
176
443
 
177
- function findObjectValue(object, predicate) {
178
- if (!object || typeof object !== "object" || Array.isArray(object)) {
444
/**
 * Reads per-process dedicated GPU memory on Windows from the
 * "GPU Process Memory(*)\Dedicated Usage" performance counters, calling
 * pdh.dll through koffi.
 *
 * One PDH query holding a counter per GPU process instance is kept open and
 * rebuilt once it is older than `counterRefreshMs`.
 */
class WindowsPdhGpuMemoryClient {
  /**
   * @param {object} [options]
   * @param {object} [options.koffi] koffi module to use; defaults to loadKoffi().
   * @param {number} [options.counterRefreshMs] minimum age in milliseconds
   *   before the counter set is rebuilt; defaults to DEFAULT_GPU_TTL_MS.
   */
  constructor(options = {}) {
    this.koffi = options.koffi || loadKoffi()
    this.library = null
    this.query = null
    this.counters = []
    this.counterValueType = null
    this.counterInfoType = null
    this.functions = null
    this.counterRefreshMs = options.counterRefreshMs || DEFAULT_GPU_TTL_MS
    this.lastCounterRefreshAt = 0
  }

  /**
   * Loads pdh.dll and binds the PDH entry points. Does nothing once the
   * bindings exist.
   *
   * @throws {Error} when koffi is unavailable, pdh.dll cannot be loaded, or a
   *   required entry point cannot be bound.
   */
  init() {
    if (this.functions) return
    if (!this.koffi) {
      throw new Error("koffi unavailable")
    }

    const types = getWindowsPdhTypes(this.koffi)
    this.counterValueType = types.counterValue
    this.counterInfoType = types.counterInfo

    this.library = this.koffi.load("pdh.dll")

    // PdhGetFormattedCounterValue takes no string arguments and is exported
    // without an A/W suffix, so the unsuffixed name is tried first. The
    // suffixed spelling is kept as a fallback for backward compatibility.
    const getFormattedCounterValue = optionalFunction(this.library, [
      "uint32_t __stdcall PdhGetFormattedCounterValue(void *hCounter, uint32_t dwFormat, _Out_ uint32_t *lpdwType, _Out_ PDH_FMT_COUNTERVALUE *pValue)",
      "uint32_t __stdcall PdhGetFormattedCounterValueW(void *hCounter, uint32_t dwFormat, _Out_ uint32_t *lpdwType, _Out_ PDH_FMT_COUNTERVALUE *pValue)"
    ])
    if (!getFormattedCounterValue) {
      throw new Error("PdhGetFormattedCounterValue unavailable")
    }

    this.functions = {
      openQuery: this.library.func("uint32_t __stdcall PdhOpenQueryW(const char16_t *szDataSource, uintptr_t dwUserData, _Out_ void **phQuery)"),
      addEnglishCounter: this.library.func("uint32_t __stdcall PdhAddEnglishCounterW(void *hQuery, const char16_t *szFullCounterPath, uintptr_t dwUserData, _Out_ void **phCounter)"),
      addCounter: this.library.func("uint32_t __stdcall PdhAddCounterW(void *hQuery, const char16_t *szFullCounterPath, uintptr_t dwUserData, _Out_ void **phCounter)"),
      collectQueryData: this.library.func("uint32_t __stdcall PdhCollectQueryData(void *hQuery)"),
      getCounterInfo: this.library.func("uint32_t __stdcall PdhGetCounterInfoW(void *hCounter, int bRetrieveExplainText, _Inout_ uint32_t *pdwBufferSize, _Out_ void *lpBuffer)"),
      expandWildCardPath: this.library.func("uint32_t __stdcall PdhExpandWildCardPathW(const char16_t *szDataSource, const char16_t *szWildCardPath, _Out_ char16_t *mszExpandedPathList, _Inout_ uint32_t *pcchPathListLength, uint32_t dwFlags)"),
      getFormattedCounterValue,
      closeQuery: this.library.func("uint32_t __stdcall PdhCloseQuery(void *hQuery)")
    }
  }

  /**
   * Opens a new PDH query.
   *
   * @returns {*} the query handle written by PdhOpenQueryW.
   * @throws {Error} when PdhOpenQueryW does not return success.
   */
  openQuery() {
    const query = [null]
    const status = this.functions.openQuery(null, 0, query)
    if (!isSuccess(status)) {
      throw new Error(`PdhOpenQueryW failed: 0x${statusCode(status).toString(16)}`)
    }
    return query[0]
  }

  /**
   * Closes a PDH query, ignoring any failure. Safe to call with a null
   * handle or before init().
   *
   * @param {*} query query handle, or a falsy value for "nothing to close".
   */
  closeQuery(query) {
    if (!query || !this.functions) return
    try {
      this.functions.closeQuery(query)
    } catch (_) {}
  }

  /**
   * Resolves the wildcard counter path as PDH reports it for this system.
   *
   * The English counter path is added to a temporary query and the counter's
   * info is read back; its `szFullPath` is returned. Falls back to the
   * English constant when PDH reports an empty info buffer or no path.
   *
   * @returns {string} wildcard counter path to expand.
   * @throws {Error} when adding the counter or reading its info fails.
   */
  getLocalizedWildcardPath() {
    const query = this.openQuery()
    const counter = [null]
    try {
      let status = this.functions.addEnglishCounter(query, WINDOWS_GPU_PROCESS_COUNTER, 0, counter)
      if (!isSuccess(status)) {
        throw new Error(`PdhAddEnglishCounterW failed: 0x${statusCode(status).toString(16)}`)
      }

      // First call with a null buffer only asks for the required size.
      const bufferSize = [0]
      status = this.functions.getCounterInfo(counter[0], 0, bufferSize, null)
      if (!isStatus(status, PDH_MORE_DATA) && !isSuccess(status)) {
        throw new Error(`PdhGetCounterInfoW failed: 0x${statusCode(status).toString(16)}`)
      }
      if (bufferSize[0] <= 0) {
        return WINDOWS_GPU_PROCESS_COUNTER
      }

      const buffer = Buffer.alloc(bufferSize[0])
      status = this.functions.getCounterInfo(counter[0], 0, bufferSize, buffer)
      if (!isSuccess(status)) {
        throw new Error(`PdhGetCounterInfoW failed: 0x${statusCode(status).toString(16)}`)
      }

      const info = this.koffi.decode(buffer, this.counterInfoType)
      return info && info.szFullPath ? info.szFullPath : WINDOWS_GPU_PROCESS_COUNTER
    } finally {
      // The temporary query is always released, including on the throw paths.
      this.closeQuery(query)
    }
  }

  /**
   * Expands a wildcard counter path into the concrete counter paths.
   *
   * @param {string} wildcardPath counter path containing a `*` instance.
   * @returns {string[]} expanded paths; empty when PDH reports a no-data
   *   status or a zero-length list.
   * @throws {Error} on any other PDH failure.
   */
  expandWildcardPath(wildcardPath) {
    // First call with a null buffer only asks for the required length.
    const charCount = [0]
    let status = this.functions.expandWildCardPath(null, wildcardPath, null, charCount, 0)
    if (isNoDataStatus(status)) {
      return []
    }
    if (!isStatus(status, PDH_MORE_DATA) && !isSuccess(status)) {
      throw new Error(`PdhExpandWildCardPathW failed: 0x${statusCode(status).toString(16)}`)
    }
    if (charCount[0] <= 0) {
      return []
    }

    // The length is counted in UTF-16 units, hence two bytes per unit.
    const buffer = Buffer.alloc(charCount[0] * 2)
    status = this.functions.expandWildCardPath(null, wildcardPath, buffer, charCount, 0)
    if (isNoDataStatus(status)) {
      return []
    }
    if (!isSuccess(status)) {
      throw new Error(`PdhExpandWildCardPathW failed: 0x${statusCode(status).toString(16)}`)
    }
    return decodeWindowsMultiSz(buffer, charCount[0])
  }

  /**
   * Rebuilds the query and its per-process counters.
   *
   * Skipped when a query already exists and the last rebuild is younger than
   * `counterRefreshMs`, unless `force` is true. The previous query is closed
   * only after the replacement has been built.
   *
   * @param {boolean} [force=false] rebuild regardless of age.
   * @throws {Error} when path resolution, expansion or query creation fails.
   */
  refreshCounters(force = false) {
    const now = Date.now()
    if (!force && this.query && now - this.lastCounterRefreshAt < this.counterRefreshMs) {
      return
    }

    const paths = this.expandWildcardPath(this.getLocalizedWildcardPath())
    const query = this.openQuery()
    const counters = []
    try {
      for (const counterPath of paths) {
        const pid = extractPidFromWindowsGpuInstance(counterPath)
        if (!pid) continue
        const counter = [null]
        const status = this.functions.addCounter(query, counterPath, 0, counter)
        // Paths that cannot be added are skipped rather than failing the refresh.
        if (isSuccess(status) && counter[0]) {
          counters.push({ handle: counter[0], pid })
        }
      }
    } catch (error) {
      this.closeQuery(query)
      throw error
    }

    const previousQuery = this.query
    this.query = counters.length > 0 ? query : null
    this.counters = counters
    this.lastCounterRefreshAt = now
    // A query with no counters is not kept, so release it immediately.
    if (this.query !== query) {
      this.closeQuery(query)
    }
    this.closeQuery(previousQuery)
  }

  /**
   * Reads the current value of one counter as a byte count.
   *
   * @param {{handle: *, pid: number}} counter entry built by refreshCounters().
   * @returns {number|null} the value passed through parseMemoryToBytes, or
   *   null when PDH reports a non-success status for the call or the value.
   */
  readCounterValue(counter) {
    const type = [0]
    const buffer = Buffer.alloc(this.koffi.sizeof(this.counterValueType))
    const status = this.functions.getFormattedCounterValue(counter.handle, PDH_FMT_LARGE, type, buffer)
    if (isNoDataStatus(status)) {
      return null
    }
    if (!isSuccess(status)) {
      return null
    }
    const value = this.koffi.decode(buffer, this.counterValueType)
    if (!value || !isSuccess(value.CStatus)) {
      return null
    }
    return parseMemoryToBytes(value.largeValue)
  }

  /**
   * Samples dedicated GPU memory for the requested processes.
   *
   * @param {Iterable|null|undefined} pids pids to report. null/undefined
   *   means every process with a counter; an explicit list that contains no
   *   valid pid selects nothing, matching filterProcessMap().
   * @returns {Map} process map filled through addGpuProcess().
   * @throws {Error} when initialization, counter refresh or
   *   PdhCollectQueryData fails with a status other than the no-data codes.
   */
  collect(pids) {
    this.init()
    this.refreshCounters(false)
    if (!this.query || this.counters.length === 0) {
      return new Map()
    }

    const status = this.functions.collectQueryData(this.query)
    if (isNoDataStatus(status)) {
      return new Map()
    }
    if (!isSuccess(status)) {
      throw new Error(`PdhCollectQueryData failed: 0x${statusCode(status).toString(16)}`)
    }

    const targetPids = normalizePidSet(pids)
    // Only a missing target list means "everything"; an explicit empty list
    // must not fall through to the unfiltered path.
    if (pids != null && targetPids.length === 0) {
      return new Map()
    }
    const targetSet = targetPids.length > 0 ? new Set(targetPids) : null
    const processes = new Map()
    for (const counter of this.counters) {
      if (!counter || (targetSet && !targetSet.has(counter.pid))) continue
      addGpuProcess(processes, counter.pid, this.readCounterValue(counter))
    }
    return processes
  }

  /**
   * Closes the open query, if any, and forgets its counters. The bound
   * functions are kept, so collect() can be called again afterwards.
   */
  stop() {
    this.closeQuery(this.query)
    this.query = null
    this.counters = []
  }
}
633
+
634
+ class NvmlGpuMemoryClient {
635
+ constructor(options = {}) {
636
+ this.koffi = options.koffi || loadKoffi()
637
+ this.library = null
638
+ this.initialized = false
639
+ this.processInfoV1 = null
640
+ this.processInfoV2 = null
641
+ this.functions = null
642
+ }
643
+
644
+ init() {
645
+ if (this.initialized) return
646
+ if (!this.koffi) {
647
+ throw new Error("koffi unavailable")
648
+ }
649
+
650
+ const types = getNvmlTypes(this.koffi)
651
+ this.processInfoV1 = types.processInfoV1
652
+ this.processInfoV2 = types.processInfoV2
653
+
654
+ this.library = loadFirstLibrary(this.koffi, [
655
+ process.env.NVIDIA_ML,
656
+ "libnvidia-ml.so.1",
657
+ "/usr/lib/x86_64-linux-gnu/libnvidia-ml.so.1",
658
+ "/usr/lib/aarch64-linux-gnu/libnvidia-ml.so.1",
659
+ "/usr/lib64/libnvidia-ml.so.1",
660
+ "/usr/local/nvidia/lib64/libnvidia-ml.so.1"
661
+ ])
662
+ this.functions = {
663
+ init: optionalFunction(this.library, [
664
+ "int nvmlInit_v2(void)",
665
+ "int nvmlInit(void)"
666
+ ]),
667
+ shutdown: optionalFunction(this.library, [
668
+ "int nvmlShutdown(void)"
669
+ ]),
670
+ getCount: optionalFunction(this.library, [
671
+ "int nvmlDeviceGetCount_v2(_Out_ uint32_t *deviceCount)",
672
+ "int nvmlDeviceGetCount(_Out_ uint32_t *deviceCount)"
673
+ ]),
674
+ getHandleByIndex: optionalFunction(this.library, [
675
+ "int nvmlDeviceGetHandleByIndex_v2(uint32_t index, _Out_ void **device)",
676
+ "int nvmlDeviceGetHandleByIndex(uint32_t index, _Out_ void **device)"
677
+ ]),
678
+ compute: this.pickProcessFunction("nvmlDeviceGetComputeRunningProcesses"),
679
+ graphics: this.pickProcessFunction("nvmlDeviceGetGraphicsRunningProcesses"),
680
+ mps: this.pickProcessFunction("nvmlDeviceGetMPSComputeRunningProcesses")
681
+ }
682
+
683
+ if (!this.functions.init || !this.functions.getCount || !this.functions.getHandleByIndex) {
684
+ throw new Error("NVML process API unavailable")
685
+ }
686
+ const status = this.functions.init()
687
+ if (status !== NVML_SUCCESS) {
688
+ throw new Error(`nvmlInit failed: ${status}`)
689
+ }
690
+ this.initialized = true
691
+ }
692
+
693
+ pickProcessFunction(baseName) {
694
+ const candidates = [
695
+ { suffix: "_v3", type: () => this.processInfoV2 },
696
+ { suffix: "_v2", type: () => this.processInfoV2 },
697
+ { suffix: "", type: () => this.processInfoV1 }
698
+ ]
699
+ for (const candidate of candidates) {
700
+ const typeName = candidate.type() === this.processInfoV2 ? "nvmlProcessInfo_v2_t" : "nvmlProcessInfo_v1_t"
701
+ const func = optionalFunction(this.library, [
702
+ `int ${baseName}${candidate.suffix}(void *device, _Inout_ uint32_t *infoCount, _Out_ ${typeName} *infos)`
703
+ ])
704
+ if (func) {
705
+ return { func, type: candidate.type() }
706
+ }
707
+ }
179
708
  return null
180
709
  }
181
- for (const [key, value] of Object.entries(object)) {
182
- if (predicate(key, value)) {
183
- return value
710
+
711
+ getDeviceHandles() {
712
+ const count = [0]
713
+ const status = this.functions.getCount(count)
714
+ if (status !== NVML_SUCCESS) {
715
+ throw new Error(`nvmlDeviceGetCount failed: ${status}`)
184
716
  }
717
+ const handles = []
718
+ for (let i = 0; i < count[0]; i += 1) {
719
+ const handle = [null]
720
+ const handleStatus = this.functions.getHandleByIndex(i, handle)
721
+ if (handleStatus === NVML_SUCCESS && handle[0]) {
722
+ handles.push(handle[0])
723
+ }
724
+ }
725
+ return handles
726
+ }
727
+
728
+ collectProcessList(device, entry) {
729
+ if (!entry || !entry.func) return []
730
+
731
+ let count = [0]
732
+ let status = entry.func(device, count, null)
733
+ if (status === NVML_SUCCESS && count[0] === 0) {
734
+ return []
735
+ }
736
+ if (status !== NVML_SUCCESS && status !== NVML_ERROR_INSUFFICIENT_SIZE) {
737
+ return []
738
+ }
739
+
740
+ let capacity = Math.max(1, count[0] + 8)
741
+ for (let attempt = 0; attempt < 2; attempt += 1) {
742
+ count = [capacity]
743
+ const buffer = Buffer.alloc(this.koffi.sizeof(entry.type) * capacity)
744
+ status = entry.func(device, count, buffer)
745
+ if (status === NVML_SUCCESS) {
746
+ return this.koffi.decode(buffer, entry.type, Math.min(count[0], capacity))
747
+ }
748
+ if (status !== NVML_ERROR_INSUFFICIENT_SIZE || count[0] <= capacity) {
749
+ return []
750
+ }
751
+ capacity = count[0] + 8
752
+ }
753
+ return []
754
+ }
755
+
756
+ collect(pids = null) {
757
+ this.init()
758
+ const processes = new Map()
759
+ for (const device of this.getDeviceHandles()) {
760
+ const deviceProcesses = new Map()
761
+ for (const entry of [this.functions.compute, this.functions.graphics, this.functions.mps]) {
762
+ for (const processInfo of this.collectProcessList(device, entry)) {
763
+ if (!processInfo) continue
764
+ const pid = normalizePid(processInfo.pid)
765
+ if (!pid) continue
766
+ if (typeof processInfo.usedGpuMemory === "bigint" && processInfo.usedGpuMemory === NVML_VALUE_NOT_AVAILABLE) {
767
+ continue
768
+ }
769
+ const bytes = parseMemoryToBytes(processInfo.usedGpuMemory)
770
+ mergeGpuProcess(deviceProcesses, pid, bytes)
771
+ }
772
+ }
773
+ for (const entry of deviceProcesses.values()) {
774
+ addGpuProcess(processes, entry.pid, entry.usedGpuMemoryBytes)
775
+ }
776
+ }
777
+ return filterProcessMap(processes, pids)
778
+ }
779
+
780
+ stop() {
781
+ if (this.initialized && this.functions && this.functions.shutdown) {
782
+ try {
783
+ this.functions.shutdown()
784
+ } catch (_) {}
785
+ }
786
+ this.initialized = false
185
787
  }
186
- return null
187
788
  }
188
789
 
189
- function extractAmdProcessesFromJson(value, processes = new Map()) {
190
- if (Array.isArray(value)) {
191
- for (const item of value) {
192
- extractAmdProcessesFromJson(item, processes)
790
+ class AmdSmiGpuMemoryClient {
791
+ constructor(options = {}) {
792
+ this.koffi = options.koffi || loadKoffi()
793
+ this.library = null
794
+ this.initialized = false
795
+ this.procInfoType = null
796
+ this.functions = null
797
+ }
798
+
799
+ init() {
800
+ if (this.initialized) return
801
+ if (!this.koffi) {
802
+ throw new Error("koffi unavailable")
193
803
  }
194
- return processes
804
+
805
+ this.procInfoType = getAmdSmiTypes(this.koffi).procInfo
806
+
807
+ this.library = loadFirstLibrary(this.koffi, [
808
+ process.env.AMD_SMI_LIBRARY,
809
+ ...rocmLibraryCandidates("libamd_smi.so")
810
+ ])
811
+ this.functions = {
812
+ init: this.library.func("int amdsmi_init(uint64_t init_flags)"),
813
+ shutdown: optionalFunction(this.library, [
814
+ "int amdsmi_shut_down(void)"
815
+ ]),
816
+ getSocketHandles: this.library.func("int amdsmi_get_socket_handles(_Inout_ uint32_t *socket_count, _Out_ void **socket_handles)"),
817
+ getProcessorHandles: this.library.func("int amdsmi_get_processor_handles(void *socket_handle, _Inout_ uint32_t *processor_count, _Out_ void **processor_handles)"),
818
+ getProcessList: this.library.func("int amdsmi_get_gpu_process_list(void *processor_handle, _Inout_ uint32_t *max_processes, _Out_ amdsmi_proc_info_t *list)")
819
+ }
820
+
821
+ const status = this.functions.init(AMDSMI_INIT_AMD_GPUS)
822
+ if (status !== 0) {
823
+ throw new Error(`amdsmi_init failed: ${status}`)
824
+ }
825
+ this.initialized = true
195
826
  }
196
- if (!value || typeof value !== "object") {
197
- return processes
827
+
828
+ readPointerArray(countFunction) {
829
+ let count = [0]
830
+ let status = countFunction(count, null)
831
+ if (status !== 0 && count[0] === 0) {
832
+ return []
833
+ }
834
+ if (count[0] <= 0) {
835
+ return []
836
+ }
837
+ const pointerSize = this.koffi.sizeof("void *")
838
+ const buffer = Buffer.alloc(pointerSize * count[0])
839
+ status = countFunction(count, buffer)
840
+ if (status !== 0) {
841
+ return []
842
+ }
843
+ return this.koffi.decode(buffer, "uintptr_t", count[0]).filter(Boolean)
198
844
  }
199
845
 
200
- const pidValue = findObjectValue(value, (key) => /(^|[_\s-])pid$|process[_\s-]*id/i.test(key))
201
- const memoryValue = findObjectValue(value, (key) => {
202
- const normalized = key.toLowerCase()
203
- if (/total|free|available|limit/.test(normalized)) return false
204
- return /vram|memory/.test(normalized) && /usage|used|mem|size/.test(normalized)
205
- })
206
- const pid = normalizePid(pidValue)
207
- const bytes = parseMemoryToBytes(memoryValue)
208
- addGpuProcess(processes, pid, bytes)
846
+ getProcessorHandles() {
847
+ const sockets = this.readPointerArray((count, buffer) => {
848
+ return this.functions.getSocketHandles(count, buffer)
849
+ })
850
+ const processors = []
851
+ for (const socket of sockets) {
852
+ processors.push(...this.readPointerArray((count, buffer) => {
853
+ return this.functions.getProcessorHandles(socket, count, buffer)
854
+ }))
855
+ }
856
+ return processors
857
+ }
209
858
 
210
- for (const child of Object.values(value)) {
211
- if (child && typeof child === "object") {
212
- extractAmdProcessesFromJson(child, processes)
859
+ collectProcessorProcesses(processor) {
860
+ let count = [0]
861
+ let status = this.functions.getProcessList(processor, count, null)
862
+ if (status !== 0 && count[0] === 0) {
863
+ return []
213
864
  }
865
+ if (count[0] <= 0) {
866
+ return []
867
+ }
868
+
869
+ let capacity = count[0]
870
+ for (let attempt = 0; attempt < 2; attempt += 1) {
871
+ count = [capacity]
872
+ const buffer = Buffer.alloc(this.koffi.sizeof(this.procInfoType) * capacity)
873
+ status = this.functions.getProcessList(processor, count, buffer)
874
+ if (status === 0) {
875
+ return this.koffi.decode(buffer, this.procInfoType, Math.min(count[0], capacity))
876
+ }
877
+ if (count[0] <= capacity) {
878
+ return []
879
+ }
880
+ capacity = count[0]
881
+ }
882
+ return []
883
+ }
884
+
885
+ collect(pids = null) {
886
+ this.init()
887
+ const processes = new Map()
888
+ for (const processor of this.getProcessorHandles()) {
889
+ for (const entry of this.collectProcessorProcesses(processor)) {
890
+ if (!entry) continue
891
+ const bytes = parseMemoryToBytes(entry.memory_usage && entry.memory_usage.vram_mem)
892
+ addGpuProcess(processes, entry.pid, bytes)
893
+ }
894
+ }
895
+ return filterProcessMap(processes, pids)
896
+ }
897
+
898
+ stop() {
899
+ if (this.initialized && this.functions && this.functions.shutdown) {
900
+ try {
901
+ this.functions.shutdown()
902
+ } catch (_) {}
903
+ }
904
+ this.initialized = false
214
905
  }
215
- return processes
216
906
  }
217
907
 
218
- function parseAmdJson(stdout) {
219
- const parsed = JSON.parse(stdout || "[]")
220
- return extractAmdProcessesFromJson(parsed)
908
/**
 * Reads per-process VRAM usage from the ROCm SMI shared library
 * (librocm_smi64.so), bound at runtime with koffi.
 */
class RocmSmiGpuMemoryClient {
  /**
   * @param {object} [options]
   * @param {object} [options.koffi] koffi module to use; defaults to loadKoffi().
   */
  constructor(options = {}) {
    this.koffi = options.koffi || loadKoffi()
    this.library = null
    this.initialized = false
    this.procInfoType = null
    this.functions = null
  }

  /**
   * Loads the library, binds its functions and calls rsmi_init. Does nothing
   * when already initialized.
   *
   * @throws {Error} when koffi is unavailable, no library candidate loads, a
   *   required function cannot be bound, or rsmi_init returns non-zero.
   */
  init() {
    if (this.initialized) return
    if (!this.koffi) {
      throw new Error("koffi unavailable")
    }

    const { procInfo } = getRocmSmiTypes(this.koffi)
    this.procInfoType = procInfo

    // An explicit override from the environment is tried before the default locations.
    const libraryCandidates = [process.env.ROCM_SMI_LIBRARY].concat(
      rocmLibraryCandidates("librocm_smi64.so")
    )
    this.library = loadFirstLibrary(this.koffi, libraryCandidates)

    const bindRequired = (prototype) => this.library.func(prototype)
    this.functions = {
      init: bindRequired("int rsmi_init(uint64_t init_flags)"),
      shutdown: optionalFunction(this.library, [
        "int rsmi_shut_down(void)"
      ]),
      getProcessInfo: bindRequired("int rsmi_compute_process_info_get(_Out_ rsmi_process_info_t *procs, _Inout_ uint32_t *num_items)")
    }

    const initStatus = this.functions.init(RSMI_INIT_DEFAULT)
    if (initStatus !== 0) {
      throw new Error(`rsmi_init failed: ${initStatus}`)
    }
    this.initialized = true
  }

  /**
   * Samples VRAM usage of the processes reported by ROCm SMI.
   *
   * @param {Iterable|null} [pids=null] pids to keep, as interpreted by
   *   filterProcessMap().
   * @returns {Map} process map; empty when the library reports no processes
   *   or the second (data) call returns a non-zero status.
   */
  collect(pids = null) {
    this.init()

    // First call with a null buffer only asks how many entries exist.
    const itemCount = [0]
    this.functions.getProcessInfo(null, itemCount)
    if (itemCount[0] <= 0) {
      return new Map()
    }

    const entrySize = this.koffi.sizeof(this.procInfoType)
    const buffer = Buffer.alloc(entrySize * itemCount[0])
    const fetchStatus = this.functions.getProcessInfo(buffer, itemCount)
    if (fetchStatus !== 0) {
      return new Map()
    }

    const processes = new Map()
    const entries = this.koffi.decode(buffer, this.procInfoType, itemCount[0])
    for (const entry of entries) {
      if (entry) {
        addGpuProcess(processes, entry.process_id, parseMemoryToBytes(entry.vram_usage))
      }
    }
    return filterProcessMap(processes, pids)
  }

  /**
   * Calls rsmi_shut_down when the client is initialized and the function was
   * bound, ignoring any error, then marks the client as not initialized.
   */
  stop() {
    const canShutdown = Boolean(this.initialized && this.functions && this.functions.shutdown)
    if (canShutdown) {
      try {
        this.functions.shutdown()
      } catch (_) {}
    }
    this.initialized = false
  }
}
222
978
 
223
979
  class GpuSampler {
@@ -225,56 +981,19 @@ class GpuSampler {
225
981
  this.kernel = options.kernel || null
226
982
  this.platform = options.platform || (this.kernel && this.kernel.platform) || os.platform()
227
983
  this.ttlMs = options.ttlMs || DEFAULT_GPU_TTL_MS
228
- this.timeoutMs = options.timeoutMs || DEFAULT_GPU_TIMEOUT_MS
229
- this.windowsCounterTtlMs = options.windowsCounterTtlMs || DEFAULT_WINDOWS_GPU_COUNTER_TTL_MS
984
+ this.procRoot = options.procRoot || "/proc"
985
+ this.drmFdinfoMaxPids = options.drmFdinfoMaxPids || DEFAULT_DRM_FDINFO_MAX_PIDS
986
+ this.drmFdinfoMaxFdsPerPid = options.drmFdinfoMaxFdsPerPid || DEFAULT_DRM_FDINFO_MAX_FDS_PER_PID
987
+ this.windowsPdhClient = options.windowsPdhClient || null
988
+ this.nvmlClient = options.nvmlClient || null
989
+ this.amdSmiClient = options.amdSmiClient || null
990
+ this.rocmSmiClient = options.rocmSmiClient || null
230
991
  this.current = null
992
+ this.currentCacheKey = null
231
993
  this.inFlight = null
232
- this.windowsCounterCurrent = null
233
- this.windowsCounterInFlight = null
994
+ this.inFlightCacheKey = null
234
995
  this.providerBackoff = new Map()
235
- }
236
-
237
- nvidiaCandidates() {
238
- const candidates = [
239
- process.env.NVIDIA_SMI,
240
- "nvidia-smi",
241
- ...getPinokioCondaCandidates(this.kernel, ["nvidia-smi"])
242
- ]
243
- if (this.platform === "win32") {
244
- candidates.push(
245
- "C:\\Program Files\\NVIDIA Corporation\\NVSMI\\nvidia-smi.exe",
246
- "C:\\Windows\\System32\\nvidia-smi.exe"
247
- )
248
- } else if (this.platform === "linux") {
249
- candidates.push(
250
- "/usr/bin/nvidia-smi",
251
- "/usr/local/bin/nvidia-smi",
252
- "/usr/local/nvidia/bin/nvidia-smi",
253
- "/usr/local/cuda/bin/nvidia-smi"
254
- )
255
- }
256
- return executableCandidates(candidates)
257
- }
258
-
259
- windowsGpuCounterCandidates() {
260
- return executableCandidates([
261
- process.env.TYPEPERF,
262
- "typeperf",
263
- "C:\\Windows\\System32\\typeperf.exe",
264
- "C:\\Windows\\Sysnative\\typeperf.exe"
265
- ])
266
- }
267
-
268
- amdCandidates() {
269
- const candidates = [
270
- process.env.AMD_SMI,
271
- "amd-smi",
272
- ...getPinokioCondaCandidates(this.kernel, ["amd-smi"])
273
- ]
274
- if (this.platform === "linux") {
275
- candidates.push("/opt/rocm/bin/amd-smi", "/usr/bin/amd-smi", "/usr/local/bin/amd-smi")
276
- }
277
- return executableCandidates(candidates)
996
+ this.providerLogBackoff = new Map()
278
997
  }
279
998
 
280
999
  isBackedOff(provider) {
@@ -286,139 +1005,161 @@ class GpuSampler {
286
1005
  this.providerBackoff.set(provider, Date.now() + ms)
287
1006
  }
288
1007
 
289
- async collectWindowsGpuProcessMemoryOnce() {
290
- if (this.platform !== "win32" || this.isBackedOff("windows-gpu-process-memory")) {
291
- return null
1008
+ logProviderFailure(provider, error, pids, fallbackMessage = "GPU provider unavailable", ms = 60000) {
1009
+ const now = Date.now()
1010
+ const until = this.providerLogBackoff.get(provider) || 0
1011
+ if (now < until) return
1012
+ this.providerLogBackoff.set(provider, now + ms)
1013
+
1014
+ const summary = {
1015
+ provider,
1016
+ platform: this.platform,
1017
+ pid_count: normalizePidSet(pids).length,
1018
+ error: error && error.message ? error.message : fallbackMessage
292
1019
  }
293
- let lastError = null
294
- for (const command of this.windowsGpuCounterCandidates()) {
295
- try {
296
- const { stdout } = await execFileText(command, [
297
- "\\GPU Process Memory(*)\\Dedicated Usage",
298
- "-sc",
299
- "1"
300
- ], { timeoutMs: Math.max(this.timeoutMs, 3000) })
301
- return {
302
- provider: "windows-gpu-process-memory",
303
- processes: parseWindowsGpuProcessMemoryCsv(stdout),
304
- error: null,
305
- collectedAt: Date.now()
306
- }
307
- } catch (error) {
308
- lastError = error
309
- if (error && error.code === "ENOENT") {
310
- continue
311
- }
312
- break
313
- }
1020
+ const code = error && (error.code || error.errno || error.status)
1021
+ if (code != null) {
1022
+ summary.code = String(code)
314
1023
  }
315
- this.backoff("windows-gpu-process-memory", 60000)
316
- return {
317
- provider: "windows-gpu-process-memory",
318
- processes: new Map(),
319
- error: lastError && lastError.message ? lastError.message : "Windows GPU process memory counters unavailable",
320
- collectedAt: Date.now()
1024
+ try {
1025
+ console.warn("[resource-usage:gpu] provider failed", summary)
1026
+ } catch (_) {}
1027
+ }
1028
+
1029
+ getWindowsPdhClient() {
1030
+ if (!this.windowsPdhClient) {
1031
+ this.windowsPdhClient = new WindowsPdhGpuMemoryClient()
321
1032
  }
1033
+ return this.windowsPdhClient
322
1034
  }
323
1035
 
324
- async collectWindowsGpuProcessMemory() {
325
- if (this.platform !== "win32" || this.isBackedOff("windows-gpu-process-memory")) {
326
- return null
1036
+ getNvmlClient() {
1037
+ if (!this.nvmlClient) {
1038
+ this.nvmlClient = new NvmlGpuMemoryClient()
327
1039
  }
328
- const now = Date.now()
329
- if (this.windowsCounterCurrent && now - this.windowsCounterCurrent.collectedAt < this.windowsCounterTtlMs) {
330
- return this.windowsCounterCurrent
1040
+ return this.nvmlClient
1041
+ }
1042
+
1043
+ getAmdSmiClient() {
1044
+ if (!this.amdSmiClient) {
1045
+ this.amdSmiClient = new AmdSmiGpuMemoryClient()
331
1046
  }
332
- if (this.windowsCounterInFlight) {
333
- return this.windowsCounterInFlight
1047
+ return this.amdSmiClient
1048
+ }
1049
+
1050
+ getRocmSmiClient() {
1051
+ if (!this.rocmSmiClient) {
1052
+ this.rocmSmiClient = new RocmSmiGpuMemoryClient()
334
1053
  }
335
- this.windowsCounterInFlight = this.collectWindowsGpuProcessMemoryOnce().then((result) => {
336
- if (result && !result.error) {
337
- this.windowsCounterCurrent = result
338
- }
339
- return result
340
- }).finally(() => {
341
- this.windowsCounterInFlight = null
342
- })
343
- return this.windowsCounterInFlight
1054
+ return this.rocmSmiClient
344
1055
  }
345
1056
 
346
- async collectNvidia() {
347
- if (this.isBackedOff("nvidia")) {
1057
+ async collectWindowsPdh(pids) {
1058
+ if (this.platform !== "win32" || this.isBackedOff("windows-pdh")) {
348
1059
  return null
349
1060
  }
350
- const args = [
351
- "--query-compute-apps=pid,used_gpu_memory",
352
- "--format=csv,noheader,nounits"
353
- ]
354
- let lastError = null
355
- for (const command of this.nvidiaCandidates()) {
356
- try {
357
- const { stdout } = await execFileText(command, args, { timeoutMs: this.timeoutMs })
358
- return {
359
- provider: "nvidia-smi",
360
- processes: parseNvidiaCsv(stdout),
361
- error: null
362
- }
363
- } catch (error) {
364
- lastError = error
365
- if (error && error.code === "ENOENT") {
366
- continue
367
- }
368
- break
1061
+ try {
1062
+ return {
1063
+ provider: "windows-pdh",
1064
+ processes: this.getWindowsPdhClient().collect(pids),
1065
+ error: null
1066
+ }
1067
+ } catch (error) {
1068
+ this.logProviderFailure("windows-pdh", error, pids, "Windows PDH unavailable")
1069
+ this.backoff("windows-pdh", 60000)
1070
+ return {
1071
+ provider: "windows-pdh",
1072
+ processes: new Map(),
1073
+ error: error && error.message ? error.message : "Windows PDH unavailable"
369
1074
  }
370
- }
371
- this.backoff("nvidia", 60000)
372
- return {
373
- provider: "nvidia-smi",
374
- processes: new Map(),
375
- error: lastError && lastError.message ? lastError.message : "nvidia-smi unavailable"
376
1075
  }
377
1076
  }
378
1077
 
379
- async collectAmd() {
380
- if (this.platform !== "linux" || this.isBackedOff("amd")) {
1078
+ async collectLinuxDrmFdinfo(pids) {
1079
+ if (this.platform !== "linux" || this.isBackedOff("linux-drm-fdinfo") || pids == null) {
381
1080
  return null
382
1081
  }
383
- let lastError = null
384
- for (const command of this.amdCandidates()) {
385
- try {
386
- const { stdout } = await execFileText(command, ["process", "--json", "-G"], { timeoutMs: this.timeoutMs })
387
- return {
388
- provider: "amd-smi",
389
- processes: parseAmdJson(stdout),
390
- error: null
391
- }
392
- } catch (error) {
393
- lastError = error
394
- if (error && error.code === "ENOENT") {
395
- continue
396
- }
397
- break
398
- }
1082
+ const targetPids = normalizePidSet(pids)
1083
+ if (targetPids.length === 0) {
1084
+ return null
399
1085
  }
400
- this.backoff("amd", 90000)
401
- return {
402
- provider: "amd-smi",
403
- processes: new Map(),
404
- error: lastError && lastError.message ? lastError.message : "amd-smi unavailable"
1086
+ try {
1087
+ const processes = await collectLinuxDrmFdinfoProcesses(targetPids, {
1088
+ procRoot: this.procRoot,
1089
+ maxPids: this.drmFdinfoMaxPids,
1090
+ maxFdsPerPid: this.drmFdinfoMaxFdsPerPid
1091
+ })
1092
+ if (processes.size === 0) {
1093
+ return null
1094
+ }
1095
+ return {
1096
+ provider: "linux-drm-fdinfo",
1097
+ processes,
1098
+ error: null
1099
+ }
1100
+ } catch (error) {
1101
+ this.logProviderFailure("linux-drm-fdinfo", error, pids, "Linux DRM fdinfo unavailable")
1102
+ this.backoff("linux-drm-fdinfo", 60000)
1103
+ return {
1104
+ provider: "linux-drm-fdinfo",
1105
+ processes: new Map(),
1106
+ error: error && error.message ? error.message : "Linux DRM fdinfo unavailable"
1107
+ }
405
1108
  }
406
1109
  }
407
1110
 
408
- async collect() {
409
- const results = []
1111
+ async collectNvml(pids) {
1112
+ if (this.platform !== "linux" || this.isBackedOff("linux-nvml")) {
1113
+ return null
1114
+ }
1115
+ try {
1116
+ return {
1117
+ provider: "linux-nvml",
1118
+ processes: this.getNvmlClient().collect(pids),
1119
+ error: null
1120
+ }
1121
+ } catch (error) {
1122
+ this.logProviderFailure("linux-nvml", error, pids, "Linux NVML unavailable")
1123
+ this.backoff("linux-nvml", 60000)
1124
+ return null
1125
+ }
1126
+ }
410
1127
 
411
- if (this.platform === "win32") {
412
- const windowsGpuProcessMemory = await this.collectWindowsGpuProcessMemory()
413
- if (windowsGpuProcessMemory) results.push(windowsGpuProcessMemory)
414
- } else {
415
- const nvidia = await this.collectNvidia()
416
- if (nvidia) results.push(nvidia)
1128
+ async collectAmdSmi(pids) {
1129
+ if (this.platform !== "linux" || this.isBackedOff("linux-amdsmi")) {
1130
+ return null
417
1131
  }
1132
+ try {
1133
+ return {
1134
+ provider: "linux-amdsmi",
1135
+ processes: this.getAmdSmiClient().collect(pids),
1136
+ error: null
1137
+ }
1138
+ } catch (error) {
1139
+ this.logProviderFailure("linux-amdsmi", error, pids, "Linux AMD SMI unavailable")
1140
+ this.backoff("linux-amdsmi", 60000)
1141
+ return null
1142
+ }
1143
+ }
418
1144
 
419
- const amd = await this.collectAmd()
420
- if (amd) results.push(amd)
1145
+ async collectRocmSmi(pids) {
1146
+ if (this.platform !== "linux" || this.isBackedOff("linux-rocm-smi")) {
1147
+ return null
1148
+ }
1149
+ try {
1150
+ return {
1151
+ provider: "linux-rocm-smi",
1152
+ processes: this.getRocmSmiClient().collect(pids),
1153
+ error: null
1154
+ }
1155
+ } catch (error) {
1156
+ this.logProviderFailure("linux-rocm-smi", error, pids, "Linux ROCm SMI unavailable")
1157
+ this.backoff("linux-rocm-smi", 60000)
1158
+ return null
1159
+ }
1160
+ }
421
1161
 
1162
+ mergeResults(results) {
422
1163
  const processes = new Map()
423
1164
  const providers = []
424
1165
  const errors = []
@@ -430,6 +1171,53 @@ class GpuSampler {
430
1171
  mergeGpuProcess(processes, entry.pid, entry.usedGpuMemoryBytes)
431
1172
  }
432
1173
  }
1174
+ return { processes, providers, errors }
1175
+ }
1176
+
1177
+ async collect(pids = null) {
1178
+ if (this.platform === "darwin") {
1179
+ return {
1180
+ available: false,
1181
+ stale: false,
1182
+ collectedAt: Date.now(),
1183
+ providers: [],
1184
+ processes: new Map(),
1185
+ errors: []
1186
+ }
1187
+ }
1188
+
1189
+ const results = []
1190
+ if (this.platform === "win32") {
1191
+ const windowsPdh = await this.collectWindowsPdh(pids)
1192
+ if (windowsPdh) results.push(windowsPdh)
1193
+ } else if (this.platform === "linux") {
1194
+ const linuxDrmFdinfo = await this.collectLinuxDrmFdinfo(pids)
1195
+ if (linuxDrmFdinfo) results.push(linuxDrmFdinfo)
1196
+
1197
+ let merged = this.mergeResults(results)
1198
+ const covered = coveredPids(merged.processes)
1199
+
1200
+ if (hasUncoveredTarget(pids, covered)) {
1201
+ const nvml = await this.collectNvml(pids)
1202
+ if (nvml) results.push(nvml)
1203
+ }
1204
+
1205
+ merged = this.mergeResults(results)
1206
+ const afterNvmlCovered = coveredPids(merged.processes)
1207
+ if (hasUncoveredTarget(pids, afterNvmlCovered)) {
1208
+ const amdSmi = await this.collectAmdSmi(pids)
1209
+ if (amdSmi) results.push(amdSmi)
1210
+ }
1211
+
1212
+ merged = this.mergeResults(results)
1213
+ const afterAmdCovered = coveredPids(merged.processes)
1214
+ if (hasUncoveredTarget(pids, afterAmdCovered)) {
1215
+ const rocmSmi = await this.collectRocmSmi(pids)
1216
+ if (rocmSmi) results.push(rocmSmi)
1217
+ }
1218
+ }
1219
+
1220
+ const { processes, providers, errors } = this.mergeResults(results)
433
1221
  return {
434
1222
  available: providers.length > 0 && errors.length < providers.length,
435
1223
  stale: false,
@@ -440,18 +1228,22 @@ class GpuSampler {
440
1228
  }
441
1229
  }
442
1230
 
443
- async getSnapshot() {
1231
+ async getSnapshot(pids = null) {
444
1232
  const now = Date.now()
445
- if (this.current && now - this.current.collectedAt < this.ttlMs) {
1233
+ const cacheKey = this.platform === "darwin" ? "" : normalizePidSet(pids).join(",")
1234
+ if (this.current && this.currentCacheKey === cacheKey && now - this.current.collectedAt < this.ttlMs) {
446
1235
  return this.current
447
1236
  }
448
- if (this.inFlight) {
1237
+ if (this.inFlight && this.inFlightCacheKey === cacheKey) {
449
1238
  return this.inFlight
450
1239
  }
451
- this.inFlight = this.collect().then((snapshot) => {
1240
+ this.inFlightCacheKey = cacheKey
1241
+ this.inFlight = this.collect(pids).then((snapshot) => {
452
1242
  this.current = snapshot
1243
+ this.currentCacheKey = cacheKey
453
1244
  return snapshot
454
1245
  }).catch((error) => {
1246
+ this.logProviderFailure("gpu", error, pids, "GPU sampling unavailable")
455
1247
  if (this.current) {
456
1248
  return { ...this.current, stale: true }
457
1249
  }
@@ -465,9 +1257,18 @@ class GpuSampler {
465
1257
  }
466
1258
  }).finally(() => {
467
1259
  this.inFlight = null
1260
+ this.inFlightCacheKey = null
468
1261
  })
469
1262
  return this.inFlight
470
1263
  }
1264
+
1265
+ stop() {
1266
+ for (const client of [this.windowsPdhClient, this.nvmlClient, this.amdSmiClient, this.rocmSmiClient]) {
1267
+ if (client && typeof client.stop === "function") {
1268
+ client.stop()
1269
+ }
1270
+ }
1271
+ }
471
1272
  }
472
1273
 
473
1274
  function sumGpuMemory(snapshot, pids) {
@@ -483,8 +1284,15 @@ function sumGpuMemory(snapshot, pids) {
483
1284
 
484
1285
  module.exports = {
485
1286
  GpuSampler,
486
- parseNvidiaCsv,
1287
+ WindowsPdhGpuMemoryClient,
1288
+ NvmlGpuMemoryClient,
1289
+ AmdSmiGpuMemoryClient,
1290
+ RocmSmiGpuMemoryClient,
487
1291
  parseMemoryToBytes,
488
- parseWindowsGpuProcessMemoryCsv,
1292
+ decodeWindowsMultiSz,
1293
+ extractPidFromWindowsGpuInstance,
1294
+ collectLinuxDrmFdinfoProcesses,
1295
+ isDedicatedDrmMemoryRegion,
1296
+ parseLinuxDrmFdinfo,
489
1297
  sumGpuMemory
490
1298
  }