pinokiod 7.3.4 → 7.3.6

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -3,12 +3,31 @@
3
3
  const fs = require("fs")
4
4
  const os = require("os")
5
5
  const path = require("path")
6
- const { execFileText, normalizePid } = require("./process_tree")
6
+ const { normalizePid } = require("./process_tree")
7
7
 
8
- const DEFAULT_GPU_TTL_MS = 10000
9
- const DEFAULT_GPU_TIMEOUT_MS = 2500
8
+ const DEFAULT_GPU_TTL_MS = 5000
9
+ const DEFAULT_DRM_FDINFO_MAX_PIDS = 4096
10
+ const DEFAULT_DRM_FDINFO_MAX_FDS_PER_PID = 1024
10
11
  const MIB = 1024 * 1024
11
12
 
13
+ const WINDOWS_GPU_PROCESS_COUNTER = "\\GPU Process Memory(*)\\Dedicated Usage"
14
+ const ERROR_SUCCESS = 0
15
+ const PDH_MORE_DATA = 0x800007D2
16
+ const PDH_INVALID_PATH = 0xC0000BC4
17
+ const PDH_INVALID_DATA = 0xC0000BC6
18
+ const PDH_NO_DATA = 0x800007D5
19
+ const PDH_FMT_LARGE = 0x00000400
20
+
21
+ const NVML_SUCCESS = 0
22
+ const NVML_ERROR_INSUFFICIENT_SIZE = 7
23
+ const NVML_VALUE_NOT_AVAILABLE = 0xFFFFFFFFFFFFFFFFn
24
+
25
+ const AMDSMI_INIT_AMD_GPUS = 1 << 1
26
+ const RSMI_INIT_DEFAULT = 0
27
+
28
+ let koffiModule
29
+ const koffiTypeCache = new WeakMap()
30
+
12
31
  function unique(values) {
13
32
  const seen = new Set()
14
33
  const next = []
@@ -20,54 +39,198 @@ function unique(values) {
20
39
  return next
21
40
  }
22
41
 
23
- function pathExists(filepath) {
42
/**
 * Lazily requires the optional "koffi" FFI module and memoizes the outcome.
 *
 * The module-level `koffiModule` binding holds the state: `undefined` means
 * no attempt has been made yet, `null` means the require threw.
 *
 * @returns {object|null} The koffi module, or null when it cannot be required.
 */
function loadKoffi() {
  if (koffiModule === undefined) {
    try {
      koffiModule = require("koffi")
    } catch (_) {
      // Remember the failure so the require is not retried on every call.
      koffiModule = null
    }
  }
  return koffiModule
}
53
+
54
/**
 * Returns the value cached under `key` for the given koffi instance, building
 * it with `factory` on first use.
 *
 * The cache is a WeakMap keyed by the koffi object, holding one Map per
 * instance, so each factory runs at most once per koffi instance and key.
 *
 * @param {object} koffi - koffi module instance used as the cache owner.
 * @param {string} key - Cache key identifying the type bundle.
 * @param {Function} factory - Zero-argument builder invoked on a cache miss.
 * @returns {*} The cached (or freshly built) value.
 */
function getCachedKoffiTypes(koffi, key, factory) {
  let typesByKey = koffiTypeCache.get(koffi)
  if (!typesByKey) {
    typesByKey = new Map()
    koffiTypeCache.set(koffi, typesByKey)
  }
  if (typesByKey.has(key)) {
    return typesByKey.get(key)
  }
  const created = factory()
  typesByKey.set(key, created)
  return created
}
65
+
66
/**
 * Declares (once per koffi instance) the struct types used with pdh.dll.
 *
 * NOTE(review): the field lists are presumably meant to mirror the leading
 * members of the native PDH_FMT_COUNTERVALUE and PDH_COUNTER_INFO_W
 * structures - confirm against pdh.h.
 *
 * @param {object} koffi - koffi module instance.
 * @returns {{counterValue: object, counterInfo: object}} The declared types.
 */
function getWindowsPdhTypes(koffi) {
  const declareTypes = () => ({
    counterValue: koffi.struct("PDH_FMT_COUNTERVALUE", {
      CStatus: "uint32_t",
      largeValue: "int64_t"
    }),
    counterInfo: koffi.struct("PDH_COUNTER_INFO_W_PREFIX", {
      dwLength: "uint32_t",
      dwType: "uint32_t",
      CVersion: "uint32_t",
      CStatus: "uint32_t",
      lScale: "int32_t",
      lDefaultScale: "int32_t",
      dwUserData: "uintptr_t",
      dwQueryUserData: "uintptr_t",
      szFullPath: "str16"
    })
  })
  return getCachedKoffiTypes(koffi, "windows-pdh", declareTypes)
}
86
+
87
/**
 * Declares (once per koffi instance) the two NVML process-info struct
 * layouts used by the process-listing calls.
 *
 * @param {object} koffi - koffi module instance.
 * @returns {{processInfoV1: object, processInfoV2: object}} The declared types.
 */
function getNvmlTypes(koffi) {
  const declareTypes = () => ({
    processInfoV1: koffi.struct("nvmlProcessInfo_v1_t", {
      pid: "uint32_t",
      usedGpuMemory: "uint64_t"
    }),
    processInfoV2: koffi.struct("nvmlProcessInfo_v2_t", {
      pid: "uint32_t",
      usedGpuMemory: "uint64_t",
      gpuInstanceId: "uint32_t",
      computeInstanceId: "uint32_t"
    })
  })
  return getCachedKoffiTypes(koffi, "nvml", declareTypes)
}
102
+
103
/**
 * Declares (once per koffi instance) the AMD SMI process-info struct and the
 * two nested structs it embeds.
 *
 * NOTE(review): the layout presumably follows amdsmi_proc_info_t from the
 * AMD SMI headers - confirm against the library version in use.
 *
 * @param {object} koffi - koffi module instance.
 * @returns {{procInfo: object}} The declared process-info type.
 */
function getAmdSmiTypes(koffi) {
  const declareTypes = () => {
    const engineUsageType = koffi.struct("amdsmi_engine_usage_process_t", {
      gfx: "uint64_t",
      enc: "uint64_t",
      reserved: koffi.array("uint32_t", 12)
    })
    const memoryUsageType = koffi.struct("amdsmi_memory_usage_process_t", {
      gtt_mem: "uint64_t",
      cpu_mem: "uint64_t",
      vram_mem: "uint64_t",
      reserved: koffi.array("uint32_t", 10)
    })
    return {
      procInfo: koffi.struct("amdsmi_proc_info_t", {
        name: koffi.array("char", 256),
        pid: "uint32_t",
        mem: "uint64_t",
        engine_usage: engineUsageType,
        memory_usage: memoryUsageType,
        container_name: koffi.array("char", 256),
        cu_occupancy: "uint32_t",
        evicted_time: "uint32_t",
        reserved: koffi.array("uint32_t", 10)
      })
    }
  }
  return getCachedKoffiTypes(koffi, "amdsmi", declareTypes)
}
130
+
131
/**
 * Declares (once per koffi instance) the ROCm SMI process-info struct.
 *
 * @param {object} koffi - koffi module instance.
 * @returns {{procInfo: object}} The declared process-info type.
 */
function getRocmSmiTypes(koffi) {
  const declareTypes = () => ({
    procInfo: koffi.struct("rsmi_process_info_t", {
      process_id: "uint32_t",
      pasid: "uint32_t",
      vram_usage: "uint64_t",
      sdma_usage: "uint64_t",
      cu_occupancy: "uint32_t"
    })
  })
  return getCachedKoffiTypes(koffi, "rocm-smi", declareTypes)
}
31
143
 
32
- function executableCandidates(candidates) {
144
/**
 * De-duplicates library candidates and drops the ones that cannot be loaded.
 *
 * Falsy entries are removed. Non-absolute names are always kept (they are
 * passed through for the loader to resolve); absolute paths are kept only
 * when the file exists.
 *
 * @param {Array<string|undefined|null>} candidates - Library names or paths.
 * @returns {string[]} Candidates worth attempting, in their original order.
 */
function existingLibraryCandidates(candidates) {
  const usable = []
  for (const candidate of unique(candidates)) {
    if (!candidate) continue
    if (path.isAbsolute(candidate)) {
      let present
      try {
        present = fs.existsSync(candidate)
      } catch (_) {
        present = false
      }
      if (!present) continue
    }
    usable.push(candidate)
  }
  return usable
}
41
155
 
42
- function getPinokioCondaCandidates(kernel, names) {
43
- if (!kernel || !kernel.homedir) {
44
- return []
156
/**
 * Builds the list of places to look for a ROCm shared library.
 *
 * The bare filename comes first so the system loader's own search path is
 * tried, followed by `lib` and `lib64` under each ROCm root (the ROCM_PATH
 * and ROCM_HOME environment variables, then fixed defaults) and a few
 * distribution-specific library directories. The result is passed through
 * existingLibraryCandidates, which de-duplicates and drops missing absolute
 * paths.
 *
 * @param {string} filename - Shared library file name, e.g. "libamd_smi.so".
 * @returns {string[]} Candidate names/paths in priority order.
 */
function rocmLibraryCandidates(filename) {
  // ROCM_PATH / ROCM_HOME are usually unset. path.join throws a TypeError on
  // a non-string segment and an empty root would produce a bogus relative
  // path, so only non-empty strings are kept as roots.
  const roots = unique([
    process.env.ROCM_PATH,
    process.env.ROCM_HOME,
    "/opt/rocm",
    "/usr",
    "/usr/local"
  ]).filter((root) => typeof root === "string" && root.length > 0)

  const candidates = [filename]
  for (const root of roots) {
    candidates.push(
      path.join(root, "lib", filename),
      path.join(root, "lib64", filename)
    )
  }
  candidates.push(
    path.join("/usr/lib/x86_64-linux-gnu", filename),
    path.join("/usr/lib/aarch64-linux-gnu", filename),
    path.join("/usr/local/lib", filename)
  )
  return existingLibraryCandidates(candidates)
}
178
+
179
/**
 * Loads the first candidate library that koffi can open.
 *
 * Candidates are filtered through existingLibraryCandidates and tried in
 * order; a failure to load one candidate moves on to the next.
 *
 * @param {object} koffi - koffi module instance.
 * @param {Array<string|undefined|null>} candidates - Library names or paths.
 * @param {object} [options={}] - Options forwarded to `koffi.load`.
 * @returns {object} The loaded koffi library handle.
 * @throws The error from the last failed load, or a generic Error when there
 *   was nothing to try.
 */
function loadFirstLibrary(koffi, candidates, options = {}) {
  let failure = null
  for (const libraryPath of existingLibraryCandidates(candidates)) {
    try {
      return koffi.load(libraryPath, options)
    } catch (error) {
      failure = error
    }
  }
  if (failure) {
    throw failure
  }
  throw new Error("native GPU library unavailable")
}
190
+
191
/**
 * Binds the first function prototype that the library can resolve.
 *
 * Each definition is passed to `library.func`; a definition that throws is
 * skipped and the next one is tried.
 *
 * @param {object} library - Loaded koffi library handle.
 * @param {string[]} definitions - Function prototypes, in preference order.
 * @returns {Function|null} The bound function, or null when none resolved.
 */
function optionalFunction(library, definitions) {
  for (const prototype of definitions) {
    let bound
    try {
      bound = library.func(prototype)
    } catch (_) {
      // This variant could not be bound; fall through to the next one.
      continue
    }
    return bound
  }
  return null
}
199
+
200
/**
 * Normalizes a native status value to an unsigned 32-bit integer, so a
 * status returned as a negative signed value compares equal to its
 * hexadecimal constant.
 *
 * @param {*} value - Raw status value.
 * @returns {number} The status as an unsigned 32-bit number.
 */
function statusCode(value) {
  const numeric = Number(value)
  return numeric >>> 0
}
203
+
204
/**
 * Tests whether a native status value equals an expected status constant,
 * comparing both as unsigned 32-bit integers.
 *
 * @param {*} value - Raw status value.
 * @param {number} expected - Status constant to compare against.
 * @returns {boolean} True when both denote the same 32-bit status.
 */
function isStatus(value, expected) {
  const actual = statusCode(value)
  const wanted = expected >>> 0
  return actual === wanted
}
207
+
208
/**
 * Tests whether a native status value equals ERROR_SUCCESS.
 *
 * @param {*} value - Raw status value.
 * @returns {boolean} True when the status is ERROR_SUCCESS.
 */
function isSuccess(value) {
  const succeeded = isStatus(value, ERROR_SUCCESS)
  return succeeded
}
211
+
212
/**
 * Tests whether a PDH status is one of the codes this module treats as
 * "nothing to report": PDH_INVALID_PATH, PDH_INVALID_DATA or PDH_NO_DATA.
 *
 * @param {*} value - Raw PDH status value.
 * @returns {boolean} True when the status is one of those three codes.
 */
function isNoDataStatus(value) {
  const noDataCodes = [PDH_INVALID_PATH, PDH_INVALID_DATA, PDH_NO_DATA]
  return noDataCodes.some((code) => isStatus(value, code))
}
215
+
216
/**
 * Converts a number-like value to a non-negative, finite JavaScript number.
 *
 * BigInt inputs are accepted only within [0, Number.MAX_SAFE_INTEGER]; other
 * inputs go through `Number()` and must be finite and not negative.
 *
 * @param {*} value - Number, BigInt, or anything `Number()` accepts.
 * @returns {number|null} The converted number, or null when out of range.
 */
function toSafeNumber(value) {
  if (typeof value === "bigint") {
    const withinSafeRange = value >= 0n && value <= BigInt(Number.MAX_SAFE_INTEGER)
    return withinSafeRange ? Number(value) : null
  }
  const numeric = Number(value)
  return Number.isFinite(numeric) && numeric >= 0 ? numeric : null
}
63
225
 
64
226
  function parseMemoryToBytes(value, defaultUnit = "") {
65
227
  if (value == null) return null
66
- if (typeof value === "number") {
67
- if (!Number.isFinite(value) || value < 0) return null
68
- if (defaultUnit === "mib") return Math.round(value * MIB)
69
- if (defaultUnit === "kb") return Math.round(value * 1024)
70
- return Math.round(value)
228
+ if (typeof value === "number" || typeof value === "bigint") {
229
+ const number = toSafeNumber(value)
230
+ if (number == null) return null
231
+ if (defaultUnit === "mib") return Math.round(number * MIB)
232
+ if (defaultUnit === "kb") return Math.round(number * 1024)
233
+ return Math.round(number)
71
234
  }
72
235
  const raw = String(value).trim()
73
236
  if (!raw || /N\/A|not supported|none/i.test(raw)) {
@@ -98,108 +261,739 @@ function addGpuProcess(processes, pid, bytes) {
98
261
  processes.set(normalizedPid, current)
99
262
  }
100
263
 
101
- function parseNvidiaCsv(stdout) {
102
- const processes = new Map()
264
/**
 * Records a GPU memory reading for a process, keeping the largest value seen
 * for that pid rather than summing readings.
 *
 * Readings with an invalid pid, or with a byte count that is not a finite
 * non-negative number, are ignored.
 *
 * @param {Map<number, {pid: number, usedGpuMemoryBytes: number}>} processes -
 *   Map updated in place, keyed by normalized pid.
 * @param {*} pid - Process id in any form accepted by normalizePid.
 * @param {number} bytes - GPU memory in bytes.
 * @returns {void}
 */
function mergeGpuProcess(processes, pid, bytes) {
  const key = normalizePid(pid)
  const usable = Boolean(key) && Number.isFinite(bytes) && bytes >= 0
  if (!usable) {
    return
  }
  let record = processes.get(key)
  if (!record) {
    record = { pid: key, usedGpuMemoryBytes: 0 }
  }
  record.usedGpuMemoryBytes = Math.max(record.usedGpuMemoryBytes || 0, bytes)
  processes.set(key, record)
}
276
+
277
/**
 * Normalizes a collection of pid-like values into a sorted list of unique
 * pids. Values that normalizePid rejects are dropped; a null/undefined
 * collection yields an empty list.
 *
 * @param {Iterable<*>|null|undefined} values - Raw pid values.
 * @returns {number[]} Unique normalized pids in ascending order.
 */
function normalizePidSet(values) {
  const distinct = new Set()
  for (const candidate of values || []) {
    const pid = normalizePid(candidate)
    if (pid) distinct.add(pid)
  }
  return [...distinct].sort((left, right) => left - right)
}
285
+
286
/**
 * Restricts a process map to the requested pids.
 *
 * - `pids` null/undefined: no filter, the input map itself is returned.
 * - `pids` given but containing no valid pid: an empty map is returned.
 * - otherwise: a new map holding only the entries whose pid was requested.
 *
 * @param {Map<number, {pid: number}>} processes - Entries keyed by pid.
 * @param {Iterable<*>|null|undefined} pids - Requested pids.
 * @returns {Map<number, {pid: number}>} The filtered (or original) map.
 */
function filterProcessMap(processes, pids) {
  const wanted = new Set(normalizePidSet(pids))
  if (wanted.size === 0) {
    return pids != null ? new Map() : processes
  }
  const selected = new Map()
  for (const record of processes.values()) {
    if (!wanted.has(record.pid)) continue
    selected.set(record.pid, record)
  }
  return selected
}
303
+
304
/**
 * Returns the set of pids present as keys in a process map.
 *
 * @param {Map<number, *>} processes - Entries keyed by pid.
 * @returns {Set<number>} A new set holding every key of the map.
 */
function coveredPids(processes) {
  const covered = new Set()
  for (const pid of processes.keys()) {
    covered.add(pid)
  }
  return covered
}
307
+
308
/**
 * Reports whether any requested pid is still missing from `covered`.
 *
 * A null/undefined `pids` means "no explicit targets" and always counts as
 * uncovered; a list with no valid pid has nothing left to cover.
 *
 * @param {Iterable<*>|null|undefined} pids - Requested pids.
 * @param {Set<number>} covered - Pids that already have data.
 * @returns {boolean} True when at least one target lacks coverage.
 */
function hasUncoveredTarget(pids, covered) {
  const wanted = normalizePidSet(pids)
  if (pids == null) {
    return true
  }
  return wanted.some((pid) => !covered.has(pid))
}
317
+
318
/**
 * Extracts the process id from a Windows GPU counter instance name or path
 * by looking for a "pid" token followed by digits (for example "pid_1234").
 *
 * @param {*} instanceName - Counter instance name or full counter path.
 * @returns {*} Whatever normalizePid returns for the captured digits (or for
 *   null when no pid token is present).
 */
function extractPidFromWindowsGpuInstance(instanceName) {
  const text = String(instanceName || "")
  const found = text.match(/(?:^|[^a-z0-9])pid[_\s-]*(\d+)(?:\D|$)/i)
  return normalizePid(found && found[1])
}
322
+
323
/**
 * Decodes a UTF-16LE MULTI_SZ block (NUL-terminated strings followed by an
 * extra NUL) into an array of strings.
 *
 * Scanning stops at the first empty string or after `charCount` UTF-16 code
 * units (clamped to the buffer size), whichever comes first. A trailing
 * string without a terminating NUL inside that range is not returned.
 *
 * @param {Buffer} buffer - Raw UTF-16LE data.
 * @param {number} charCount - Number of UTF-16 code units to examine.
 * @returns {string[]} The decoded strings, in order.
 */
function decodeWindowsMultiSz(buffer, charCount) {
  const limit = Math.max(0, Math.min(charCount || 0, Math.floor(buffer.length / 2)))
  const strings = []
  let segmentStart = 0
  let index = 0
  while (index < limit) {
    if (buffer.readUInt16LE(index * 2) === 0) {
      // An empty segment is the list terminator.
      if (index === segmentStart) break
      strings.push(buffer.toString("utf16le", segmentStart * 2, index * 2))
      segmentStart = index + 1
    }
    index += 1
  }
  return strings
}
336
+
337
/**
 * Decides whether a DRM fdinfo memory-region name denotes dedicated GPU
 * memory.
 *
 * The name is lower-cased and stripped of everything but letters and digits;
 * it qualifies only when the result is "vram" or "local", optionally
 * followed by digits. Names such as system, gtt, memory, shared, stolen, cpu
 * or host therefore never qualify, nor does an empty name.
 *
 * @param {*} region - Region suffix taken from a drm-* fdinfo key.
 * @returns {boolean} True for vram/local regions.
 */
function isDedicatedDrmMemoryRegion(region) {
  const compact = String(region || "")
    .toLowerCase()
    .replace(/[^a-z0-9]/g, "")
  return /^(?:vram|local)\d*$/.test(compact)
}
348
+
349
/**
 * Parses the text of one /proc/<pid>/fdinfo/<fd> file and extracts the
 * dedicated GPU memory reported for a DRM client.
 *
 * Lines are read as "key: value" pairs (keys lower-cased, later duplicates
 * win). Only `drm-resident-<region>` and `drm-memory-<region>` keys whose
 * region passes isDedicatedDrmMemoryRegion are summed; when at least one
 * resident value was accepted the resident total is reported, otherwise the
 * drm-memory total.
 *
 * @param {*} stdout - File contents.
 * @returns {{driver: string, pdev: string, clientId: string,
 *   dedicatedBytes: number}|null} Parsed info, or null when the text has no
 *   `drm-driver` field.
 */
function parseLinuxDrmFdinfo(stdout) {
  const fields = new Map()
  String(stdout || "").split(/\r?\n/).forEach((line) => {
    const colon = line.indexOf(":")
    if (colon < 0) return
    const name = line.slice(0, colon).trim().toLowerCase()
    if (name) fields.set(name, line.slice(colon + 1).trim())
  })

  const driver = fields.get("drm-driver")
  if (!driver) {
    return null
  }

  const totals = { resident: 0, memory: 0 }
  let sawResident = false
  for (const [name, rawValue] of fields.entries()) {
    const parts = /^drm-(resident|memory)-(.+)$/.exec(name)
    if (!parts) continue
    const kind = parts[1]
    if (!isDedicatedDrmMemoryRegion(parts[2])) continue
    const bytes = parseMemoryToBytes(rawValue)
    if (!Number.isFinite(bytes) || bytes < 0) continue
    totals[kind] += bytes
    if (kind === "resident") sawResident = true
  }

  return {
    driver,
    pdev: fields.get("drm-pdev") || "",
    clientId: fields.get("drm-client-id") || "",
    dedicatedBytes: sawResident ? totals.resident : totals.memory
  }
}
387
+
388
/**
 * Collects per-process dedicated GPU memory by scanning DRM fdinfo files
 * under `<procRoot>/<pid>/fdinfo`.
 *
 * A process can hold several file descriptors for the same DRM client, so
 * readings are first grouped by (pid, driver, device, client id) keeping the
 * largest value per group, and the groups are then added into the per-pid
 * result via addGpuProcess. Unreadable directories and files are skipped.
 *
 * @param {Iterable<*>|null|undefined} pids - Processes to inspect.
 * @param {object} [options={}] - Optional settings.
 * @param {string} [options.procRoot="/proc"] - Root of the proc filesystem.
 * @param {number} [options.maxPids] - Cap on the number of pids scanned.
 * @param {number} [options.maxFdsPerPid] - Cap on numeric fd entries per pid.
 * @returns {Promise<Map>} Map populated through addGpuProcess.
 */
async function collectLinuxDrmFdinfoProcesses(pids, options = {}) {
  const procRoot = options.procRoot || "/proc"
  const pidLimit = options.maxPids || DEFAULT_DRM_FDINFO_MAX_PIDS
  const fdLimit = options.maxFdsPerPid || DEFAULT_DRM_FDINFO_MAX_FDS_PER_PID
  const peakByClient = new Map()

  for (const pid of normalizePidSet(pids).slice(0, pidLimit)) {
    const fdinfoDir = path.join(procRoot, String(pid), "fdinfo")
    let entries
    try {
      entries = await fs.promises.readdir(fdinfoDir, { withFileTypes: true })
    } catch (_) {
      // The process exited or is not accessible.
      continue
    }

    let numericSeen = 0
    for (const entry of entries) {
      const fdName = entry && entry.name ? entry.name : ""
      if (!/^\d+$/.test(fdName)) continue
      numericSeen += 1
      if (numericSeen > fdLimit) break

      let contents
      try {
        contents = await fs.promises.readFile(path.join(fdinfoDir, fdName), "utf8")
      } catch (_) {
        continue
      }

      const info = parseLinuxDrmFdinfo(contents)
      if (!info || !(info.dedicatedBytes > 0)) continue

      const clientPart = info.clientId ? `client:${info.clientId}` : "unknown-client"
      const groupKey = [
        pid,
        info.driver || "unknown-driver",
        info.pdev || "unknown-device",
        clientPart
      ].join(":")
      const previous = peakByClient.get(groupKey)
      const bytes = previous
        ? Math.max(previous.bytes, info.dedicatedBytes)
        : info.dedicatedBytes
      peakByClient.set(groupKey, { pid, bytes })
    }
  }

  const processes = new Map()
  for (const group of peakByClient.values()) {
    addGpuProcess(processes, group.pid, group.bytes)
  }
  return processes
}
113
443
 
114
- function findObjectValue(object, predicate) {
115
- if (!object || typeof object !== "object" || Array.isArray(object)) {
444
/**
 * Reads per-process dedicated GPU memory on Windows from the PDH counter
 * named by WINDOWS_GPU_PROCESS_COUNTER, through koffi bindings to pdh.dll.
 *
 * A PDH query holding one counter per GPU process instance is kept open and
 * rebuilt at most every `counterRefreshMs` milliseconds while it is
 * non-empty; each collect() samples that query.
 */
class WindowsPdhGpuMemoryClient {
  /**
   * @param {object} [options={}] - Optional settings.
   * @param {object} [options.koffi] - koffi instance; loaded lazily if absent.
   * @param {number} [options.counterRefreshMs] - Minimum age of the counter
   *   list before it is rebuilt (defaults to DEFAULT_GPU_TTL_MS).
   */
  constructor(options = {}) {
    this.koffi = options.koffi || loadKoffi()
    this.library = null
    this.query = null
    this.counters = []
    this.counterValueType = null
    this.counterInfoType = null
    this.functions = null
    this.counterRefreshMs = options.counterRefreshMs || DEFAULT_GPU_TTL_MS
    this.lastCounterRefreshAt = 0
  }

  /**
   * Loads pdh.dll and binds the PDH functions. Does nothing once bound.
   *
   * @throws {Error} When koffi is unavailable or a required function cannot
   *   be bound.
   */
  init() {
    if (this.functions) return
    if (!this.koffi) {
      throw new Error("koffi unavailable")
    }

    const types = getWindowsPdhTypes(this.koffi)
    this.counterValueType = types.counterValue
    this.counterInfoType = types.counterInfo

    // Keep an already loaded handle so a failed binding attempt does not load
    // the DLL again on the next call.
    if (!this.library) {
      this.library = this.koffi.load("pdh.dll")
    }
    const library = this.library

    // PdhGetFormattedCounterValue takes no string arguments and is documented
    // without A/W variants, so the un-suffixed name is tried first. The "W"
    // name the code used before is kept as a fallback.
    const getFormattedCounterValue = optionalFunction(library, [
      "uint32_t __stdcall PdhGetFormattedCounterValue(void *hCounter, uint32_t dwFormat, _Out_ uint32_t *lpdwType, _Out_ PDH_FMT_COUNTERVALUE *pValue)",
      "uint32_t __stdcall PdhGetFormattedCounterValueW(void *hCounter, uint32_t dwFormat, _Out_ uint32_t *lpdwType, _Out_ PDH_FMT_COUNTERVALUE *pValue)"
    ])
    if (!getFormattedCounterValue) {
      throw new Error("PdhGetFormattedCounterValue unavailable")
    }

    this.functions = {
      openQuery: library.func("uint32_t __stdcall PdhOpenQueryW(const char16_t *szDataSource, uintptr_t dwUserData, _Out_ void **phQuery)"),
      addEnglishCounter: library.func("uint32_t __stdcall PdhAddEnglishCounterW(void *hQuery, const char16_t *szFullCounterPath, uintptr_t dwUserData, _Out_ void **phCounter)"),
      addCounter: library.func("uint32_t __stdcall PdhAddCounterW(void *hQuery, const char16_t *szFullCounterPath, uintptr_t dwUserData, _Out_ void **phCounter)"),
      collectQueryData: library.func("uint32_t __stdcall PdhCollectQueryData(void *hQuery)"),
      getCounterInfo: library.func("uint32_t __stdcall PdhGetCounterInfoW(void *hCounter, int bRetrieveExplainText, _Inout_ uint32_t *pdwBufferSize, _Out_ void *lpBuffer)"),
      expandWildCardPath: library.func("uint32_t __stdcall PdhExpandWildCardPathW(const char16_t *szDataSource, const char16_t *szWildCardPath, _Out_ char16_t *mszExpandedPathList, _Inout_ uint32_t *pcchPathListLength, uint32_t dwFlags)"),
      getFormattedCounterValue,
      closeQuery: library.func("uint32_t __stdcall PdhCloseQuery(void *hQuery)")
    }
  }

  /**
   * Opens a new PDH query.
   *
   * @returns {*} The query handle.
   * @throws {Error} When PdhOpenQueryW reports a failure.
   */
  openQuery() {
    const query = [null]
    const status = this.functions.openQuery(null, 0, query)
    if (!isSuccess(status)) {
      throw new Error(`PdhOpenQueryW failed: 0x${statusCode(status).toString(16)}`)
    }
    return query[0]
  }

  /**
   * Closes a PDH query, ignoring errors. No-op for a missing handle.
   *
   * @param {*} query - Query handle, or null.
   */
  closeQuery(query) {
    if (!query || !this.functions) return
    try {
      this.functions.closeQuery(query)
    } catch (_) {}
  }

  /**
   * Resolves the English wildcard counter path to the full path PDH reports
   * for it, by adding the English counter to a temporary query and reading
   * back its counter info.
   *
   * @returns {string} The reported path, or WINDOWS_GPU_PROCESS_COUNTER when
   *   PDH returns no info.
   * @throws {Error} When adding the counter or reading its info fails.
   */
  getLocalizedWildcardPath() {
    const query = this.openQuery()
    const counter = [null]
    try {
      let status = this.functions.addEnglishCounter(query, WINDOWS_GPU_PROCESS_COUNTER, 0, counter)
      if (!isSuccess(status)) {
        throw new Error(`PdhAddEnglishCounterW failed: 0x${statusCode(status).toString(16)}`)
      }

      // First call with a null buffer only asks for the required size.
      const bufferSize = [0]
      status = this.functions.getCounterInfo(counter[0], 0, bufferSize, null)
      if (!isStatus(status, PDH_MORE_DATA) && !isSuccess(status)) {
        throw new Error(`PdhGetCounterInfoW failed: 0x${statusCode(status).toString(16)}`)
      }
      if (bufferSize[0] <= 0) {
        return WINDOWS_GPU_PROCESS_COUNTER
      }

      const buffer = Buffer.alloc(bufferSize[0])
      status = this.functions.getCounterInfo(counter[0], 0, bufferSize, buffer)
      if (!isSuccess(status)) {
        throw new Error(`PdhGetCounterInfoW failed: 0x${statusCode(status).toString(16)}`)
      }

      const info = this.koffi.decode(buffer, this.counterInfoType)
      return info && info.szFullPath ? info.szFullPath : WINDOWS_GPU_PROCESS_COUNTER
    } finally {
      this.closeQuery(query)
    }
  }

  /**
   * Expands a wildcard counter path into concrete counter paths.
   *
   * @param {string} wildcardPath - Counter path containing a wildcard.
   * @returns {string[]} Expanded paths; empty when PDH reports no data.
   * @throws {Error} When PdhExpandWildCardPathW fails for another reason.
   */
  expandWildcardPath(wildcardPath) {
    // First call with a null buffer only asks for the required length.
    const charCount = [0]
    let status = this.functions.expandWildCardPath(null, wildcardPath, null, charCount, 0)
    if (isNoDataStatus(status)) {
      return []
    }
    if (!isStatus(status, PDH_MORE_DATA) && !isSuccess(status)) {
      throw new Error(`PdhExpandWildCardPathW failed: 0x${statusCode(status).toString(16)}`)
    }
    if (charCount[0] <= 0) {
      return []
    }

    // The length is in UTF-16 code units, hence two bytes per unit.
    const buffer = Buffer.alloc(charCount[0] * 2)
    status = this.functions.expandWildCardPath(null, wildcardPath, buffer, charCount, 0)
    if (isNoDataStatus(status)) {
      return []
    }
    if (!isSuccess(status)) {
      throw new Error(`PdhExpandWildCardPathW failed: 0x${statusCode(status).toString(16)}`)
    }
    return decodeWindowsMultiSz(buffer, charCount[0])
  }

  /**
   * Rebuilds the query and its per-process counters.
   *
   * The rebuild is skipped while a query exists and is younger than
   * `counterRefreshMs`. When the previous rebuild found no counters there is
   * no query, so the next call rebuilds again regardless of age.
   *
   * @param {boolean} [force=false] - Rebuild even if the query is fresh.
   */
  refreshCounters(force = false) {
    const now = Date.now()
    if (!force && this.query && now - this.lastCounterRefreshAt < this.counterRefreshMs) {
      return
    }

    const paths = this.expandWildcardPath(this.getLocalizedWildcardPath())
    const query = this.openQuery()
    const counters = []
    try {
      for (const counterPath of paths) {
        const pid = extractPidFromWindowsGpuInstance(counterPath)
        if (!pid) continue
        const counter = [null]
        const status = this.functions.addCounter(query, counterPath, 0, counter)
        if (isSuccess(status) && counter[0]) {
          counters.push({ handle: counter[0], pid })
        }
      }
    } catch (error) {
      this.closeQuery(query)
      throw error
    }

    // Swap in the new query first, then release whichever handles are no
    // longer referenced (the new one if it ended up empty, and the old one).
    const previousQuery = this.query
    this.query = counters.length > 0 ? query : null
    this.counters = counters
    this.lastCounterRefreshAt = now
    if (this.query !== query) {
      this.closeQuery(query)
    }
    this.closeQuery(previousQuery)
  }

  /**
   * Reads the formatted 64-bit value of one counter.
   *
   * @param {{handle: *, pid: number}} counter - Counter entry.
   * @returns {number|null} Bytes, or null when the call or the value's own
   *   status is not a success.
   */
  readCounterValue(counter) {
    const type = [0]
    const buffer = Buffer.alloc(this.koffi.sizeof(this.counterValueType))
    const status = this.functions.getFormattedCounterValue(counter.handle, PDH_FMT_LARGE, type, buffer)
    // "No data" statuses and any other failure are handled the same way.
    if (!isSuccess(status)) {
      return null
    }
    const value = this.koffi.decode(buffer, this.counterValueType)
    if (!value || !isSuccess(value.CStatus)) {
      return null
    }
    return parseMemoryToBytes(value.largeValue)
  }

  /**
   * Samples the counters and returns GPU memory per process.
   *
   * @param {Iterable<*>|null|undefined} pids - Processes of interest.
   *   null/undefined means every process; a list with no valid pid yields an
   *   empty map, matching filterProcessMap.
   * @returns {Map} Map populated through addGpuProcess.
   * @throws {Error} When initialization or PdhCollectQueryData fails.
   */
  collect(pids) {
    this.init()

    const targetPids = normalizePidSet(pids)
    if (pids != null && targetPids.length === 0) {
      // An explicit but empty request selects nothing.
      return new Map()
    }

    this.refreshCounters(false)
    if (!this.query || this.counters.length === 0) {
      return new Map()
    }

    const status = this.functions.collectQueryData(this.query)
    if (isNoDataStatus(status)) {
      return new Map()
    }
    if (!isSuccess(status)) {
      throw new Error(`PdhCollectQueryData failed: 0x${statusCode(status).toString(16)}`)
    }

    const targetSet = pids != null ? new Set(targetPids) : null
    const processes = new Map()
    for (const counter of this.counters) {
      if (!counter || (targetSet && !targetSet.has(counter.pid))) continue
      addGpuProcess(processes, counter.pid, this.readCounterValue(counter))
    }
    return processes
  }

  /**
   * Closes the open query and forgets its counters. The bound functions are
   * kept, so collect() can be called again afterwards.
   */
  stop() {
    this.closeQuery(this.query)
    this.query = null
    this.counters = []
  }
}
633
+
634
/**
 * Reads per-process GPU memory from NVIDIA's NVML shared library through
 * koffi bindings.
 */
class NvmlGpuMemoryClient {
  /**
   * @param {object} [options={}] - Optional settings.
   * @param {object} [options.koffi] - koffi instance; loaded lazily if absent.
   */
  constructor(options = {}) {
    this.koffi = options.koffi || loadKoffi()
    this.library = null
    this.initialized = false
    this.processInfoV1 = null
    this.processInfoV2 = null
    this.functions = null
  }

  /**
   * Loads NVML, binds its functions and calls nvmlInit. Does nothing while
   * initialized.
   *
   * The loaded library and the bound functions are reused on later calls
   * (after stop(), or after a failed nvmlInit), so re-initializing does not
   * load the shared library again.
   *
   * @throws {Error} When koffi or NVML is unavailable, required functions
   *   are missing, or nvmlInit reports a failure.
   */
  init() {
    if (this.initialized) return
    if (!this.koffi) {
      throw new Error("koffi unavailable")
    }

    if (!this.functions) {
      const types = getNvmlTypes(this.koffi)
      this.processInfoV1 = types.processInfoV1
      this.processInfoV2 = types.processInfoV2

      if (!this.library) {
        this.library = loadFirstLibrary(this.koffi, [
          process.env.NVIDIA_ML,
          "libnvidia-ml.so.1",
          "/usr/lib/x86_64-linux-gnu/libnvidia-ml.so.1",
          "/usr/lib/aarch64-linux-gnu/libnvidia-ml.so.1",
          "/usr/lib64/libnvidia-ml.so.1",
          "/usr/local/nvidia/lib64/libnvidia-ml.so.1"
        ])
      }

      // Newer symbol versions are preferred; older ones are fallbacks.
      const functions = {
        init: optionalFunction(this.library, [
          "int nvmlInit_v2(void)",
          "int nvmlInit(void)"
        ]),
        shutdown: optionalFunction(this.library, [
          "int nvmlShutdown(void)"
        ]),
        getCount: optionalFunction(this.library, [
          "int nvmlDeviceGetCount_v2(_Out_ uint32_t *deviceCount)",
          "int nvmlDeviceGetCount(_Out_ uint32_t *deviceCount)"
        ]),
        getHandleByIndex: optionalFunction(this.library, [
          "int nvmlDeviceGetHandleByIndex_v2(uint32_t index, _Out_ void **device)",
          "int nvmlDeviceGetHandleByIndex(uint32_t index, _Out_ void **device)"
        ]),
        compute: this.pickProcessFunction("nvmlDeviceGetComputeRunningProcesses"),
        graphics: this.pickProcessFunction("nvmlDeviceGetGraphicsRunningProcesses"),
        mps: this.pickProcessFunction("nvmlDeviceGetMPSComputeRunningProcesses")
      }

      if (!functions.init || !functions.getCount || !functions.getHandleByIndex) {
        throw new Error("NVML process API unavailable")
      }
      // Only a complete set of bindings is remembered.
      this.functions = functions
    }

    const status = this.functions.init()
    if (status !== NVML_SUCCESS) {
      throw new Error(`nvmlInit failed: ${status}`)
    }
    this.initialized = true
  }

  /**
   * Binds the newest available variant of a process-listing function and
   * pairs it with the struct type used to decode its output.
   *
   * @param {string} baseName - Function name without a version suffix.
   * @returns {{func: Function, type: object}|null} The binding, or null when
   *   no variant exists in the library.
   */
  pickProcessFunction(baseName) {
    const variants = [
      { suffix: "_v3", type: this.processInfoV2, typeName: "nvmlProcessInfo_v2_t" },
      { suffix: "_v2", type: this.processInfoV2, typeName: "nvmlProcessInfo_v2_t" },
      { suffix: "", type: this.processInfoV1, typeName: "nvmlProcessInfo_v1_t" }
    ]
    for (const variant of variants) {
      const func = optionalFunction(this.library, [
        `int ${baseName}${variant.suffix}(void *device, _Inout_ uint32_t *infoCount, _Out_ ${variant.typeName} *infos)`
      ])
      if (func) {
        return { func, type: variant.type }
      }
    }
    return null
  }

  /**
   * Returns a handle for every device NVML can open.
   *
   * @returns {Array} Device handles; devices whose handle lookup fails are
   *   skipped.
   * @throws {Error} When the device count cannot be read.
   */
  getDeviceHandles() {
    const count = [0]
    const status = this.functions.getCount(count)
    if (status !== NVML_SUCCESS) {
      throw new Error(`nvmlDeviceGetCount failed: ${status}`)
    }
    const handles = []
    for (let i = 0; i < count[0]; i += 1) {
      const handle = [null]
      const handleStatus = this.functions.getHandleByIndex(i, handle)
      if (handleStatus === NVML_SUCCESS && handle[0]) {
        handles.push(handle[0])
      }
    }
    return handles
  }

  /**
   * Lists the processes one NVML call reports for a device.
   *
   * @param {*} device - Device handle.
   * @param {{func: Function, type: object}|null} entry - Binding returned by
   *   pickProcessFunction.
   * @returns {Array<object>} Decoded process records; empty on any failure.
   */
  collectProcessList(device, entry) {
    if (!entry || !entry.func) return []

    // Probe with a null buffer to learn how many records exist.
    let count = [0]
    let status = entry.func(device, count, null)
    if (status === NVML_SUCCESS && count[0] === 0) {
      return []
    }
    if (status !== NVML_SUCCESS && status !== NVML_ERROR_INSUFFICIENT_SIZE) {
      return []
    }

    // Allocate some headroom because processes may start between the probe
    // and the read; retry once with the larger size NVML asks for.
    let capacity = Math.max(1, count[0] + 8)
    for (let attempt = 0; attempt < 2; attempt += 1) {
      count = [capacity]
      const buffer = Buffer.alloc(this.koffi.sizeof(entry.type) * capacity)
      status = entry.func(device, count, buffer)
      if (status === NVML_SUCCESS) {
        return this.koffi.decode(buffer, entry.type, Math.min(count[0], capacity))
      }
      if (status !== NVML_ERROR_INSUFFICIENT_SIZE || count[0] <= capacity) {
        return []
      }
      capacity = count[0] + 8
    }
    return []
  }

  /**
   * Collects GPU memory per process across all devices.
   *
   * Within one device the compute, graphics and MPS lists are merged by
   * taking the largest value per pid; the per-device results are then added
   * up across devices through addGpuProcess.
   *
   * @param {Iterable<*>|null} [pids=null] - Processes of interest; null
   *   returns every process (see filterProcessMap).
   * @returns {Map} Process map filtered to the requested pids.
   * @throws {Error} When initialization or device enumeration fails.
   */
  collect(pids = null) {
    this.init()
    const processes = new Map()
    for (const device of this.getDeviceHandles()) {
      const deviceProcesses = new Map()
      for (const entry of [this.functions.compute, this.functions.graphics, this.functions.mps]) {
        for (const processInfo of this.collectProcessList(device, entry)) {
          if (!processInfo) continue
          const pid = normalizePid(processInfo.pid)
          if (!pid) continue
          // Skip records whose memory field holds the "not available" marker.
          if (typeof processInfo.usedGpuMemory === "bigint" && processInfo.usedGpuMemory === NVML_VALUE_NOT_AVAILABLE) {
            continue
          }
          const bytes = parseMemoryToBytes(processInfo.usedGpuMemory)
          mergeGpuProcess(deviceProcesses, pid, bytes)
        }
      }
      for (const entry of deviceProcesses.values()) {
        addGpuProcess(processes, entry.pid, entry.usedGpuMemoryBytes)
      }
    }
    return filterProcessMap(processes, pids)
  }

  /**
   * Shuts NVML down (ignoring errors) and marks the client uninitialized.
   * The library and bindings stay loaded for a later init().
   */
  stop() {
    if (this.initialized && this.functions && this.functions.shutdown) {
      try {
        this.functions.shutdown()
      } catch (_) {}
    }
    this.initialized = false
  }
}
125
789
 
126
- function extractAmdProcessesFromJson(value, processes = new Map()) {
127
- if (Array.isArray(value)) {
128
- for (const item of value) {
129
- extractAmdProcessesFromJson(item, processes)
790
/**
 * Reads per-process VRAM usage from the AMD SMI shared library through
 * koffi bindings.
 */
class AmdSmiGpuMemoryClient {
  /**
   * @param {object} [options={}] - Optional settings.
   * @param {object} [options.koffi] - koffi instance; loaded lazily if absent.
   */
  constructor(options = {}) {
    this.koffi = options.koffi || loadKoffi()
    this.library = null
    this.initialized = false
    this.procInfoType = null
    this.functions = null
  }

  /**
   * Loads libamd_smi, binds its functions and calls amdsmi_init. Does
   * nothing while initialized.
   *
   * The loaded library and the bound functions are reused on later calls
   * (after stop(), or after a failed amdsmi_init), so re-initializing does
   * not load the shared library again.
   *
   * @throws {Error} When koffi or the library is unavailable, a function
   *   cannot be bound, or amdsmi_init reports a failure.
   */
  init() {
    if (this.initialized) return
    if (!this.koffi) {
      throw new Error("koffi unavailable")
    }

    if (!this.functions) {
      this.procInfoType = getAmdSmiTypes(this.koffi).procInfo

      if (!this.library) {
        this.library = loadFirstLibrary(this.koffi, [
          process.env.AMD_SMI_LIBRARY,
          ...rocmLibraryCandidates("libamd_smi.so")
        ])
      }
      const library = this.library
      // Assigned only after every binding succeeded.
      this.functions = {
        init: library.func("int amdsmi_init(uint64_t init_flags)"),
        shutdown: optionalFunction(library, [
          "int amdsmi_shut_down(void)"
        ]),
        getSocketHandles: library.func("int amdsmi_get_socket_handles(_Inout_ uint32_t *socket_count, _Out_ void **socket_handles)"),
        getProcessorHandles: library.func("int amdsmi_get_processor_handles(void *socket_handle, _Inout_ uint32_t *processor_count, _Out_ void **processor_handles)"),
        getProcessList: library.func("int amdsmi_get_gpu_process_list(void *processor_handle, _Inout_ uint32_t *max_processes, _Out_ amdsmi_proc_info_t *list)")
      }
    }

    const status = this.functions.init(AMDSMI_INIT_AMD_GPUS)
    if (status !== 0) {
      throw new Error(`amdsmi_init failed: ${status}`)
    }
    this.initialized = true
  }

  /**
   * Runs a "count, then fill" handle query and returns the non-null handles.
   *
   * @param {Function} countFunction - Called as (count, buffer): first with
   *   a null buffer to obtain the count, then with a buffer to fill.
   * @returns {Array} Handle values decoded as uintptr_t; empty when nothing
   *   is reported or the fill call fails.
   */
  readPointerArray(countFunction) {
    const count = [0]
    // Only the reported count matters for the probe call; a zero count means
    // there is nothing to read whatever the status was.
    countFunction(count, null)
    if (count[0] <= 0) {
      return []
    }
    const pointerSize = this.koffi.sizeof("void *")
    const buffer = Buffer.alloc(pointerSize * count[0])
    const status = countFunction(count, buffer)
    if (status !== 0) {
      return []
    }
    return this.koffi.decode(buffer, "uintptr_t", count[0]).filter(Boolean)
  }

  /**
   * Enumerates processor handles across all sockets.
   *
   * @returns {Array} Processor handles.
   */
  getProcessorHandles() {
    const sockets = this.readPointerArray((count, buffer) => {
      return this.functions.getSocketHandles(count, buffer)
    })
    const processors = []
    for (const socket of sockets) {
      processors.push(...this.readPointerArray((count, buffer) => {
        return this.functions.getProcessorHandles(socket, count, buffer)
      }))
    }
    return processors
  }

  /**
   * Lists the processes reported for one processor.
   *
   * @param {*} processor - Processor handle.
   * @returns {Array<object>} Decoded process records; empty on failure.
   */
  collectProcessorProcesses(processor) {
    // Probe with a null list to learn how many records exist.
    let count = [0]
    let status = this.functions.getProcessList(processor, count, null)
    if (count[0] <= 0) {
      return []
    }

    // Retry once with a larger buffer if the list grew after the probe.
    let capacity = count[0]
    for (let attempt = 0; attempt < 2; attempt += 1) {
      count = [capacity]
      const buffer = Buffer.alloc(this.koffi.sizeof(this.procInfoType) * capacity)
      status = this.functions.getProcessList(processor, count, buffer)
      if (status === 0) {
        return this.koffi.decode(buffer, this.procInfoType, Math.min(count[0], capacity))
      }
      if (count[0] <= capacity) {
        return []
      }
      capacity = count[0]
    }
    return []
  }

  /**
   * Collects VRAM usage per process across all processors, added up per pid
   * through addGpuProcess.
   *
   * @param {Iterable<*>|null} [pids=null] - Processes of interest; null
   *   returns every process (see filterProcessMap).
   * @returns {Map} Process map filtered to the requested pids.
   * @throws {Error} When initialization fails.
   */
  collect(pids = null) {
    this.init()
    const processes = new Map()
    for (const processor of this.getProcessorHandles()) {
      for (const entry of this.collectProcessorProcesses(processor)) {
        if (!entry) continue
        const bytes = parseMemoryToBytes(entry.memory_usage && entry.memory_usage.vram_mem)
        addGpuProcess(processes, entry.pid, bytes)
      }
    }
    return filterProcessMap(processes, pids)
  }

  /**
   * Shuts AMD SMI down (ignoring errors) and marks the client uninitialized.
   * The library and bindings stay loaded for a later init().
   */
  stop() {
    if (this.initialized && this.functions && this.functions.shutdown) {
      try {
        this.functions.shutdown()
      } catch (_) {}
    }
    this.initialized = false
  }
}
154
907
 
155
- function parseAmdJson(stdout) {
156
- const parsed = JSON.parse(stdout || "[]")
157
- return extractAmdProcessesFromJson(parsed)
908
/**
 * Reads per-process VRAM usage from the ROCm SMI shared library through koffi.
 *
 * The library is loaded lazily on the first collect() call. Call stop() to
 * run the library's shutdown function when the client is no longer needed.
 */
class RocmSmiGpuMemoryClient {
  /**
   * @param {object} [options]
   * @param {object} [options.koffi] koffi module to use; defaults to loadKoffi()
   */
  constructor(options = {}) {
    this.koffi = options.koffi || loadKoffi()
    this.library = null
    this.initialized = false
    this.procInfoType = null
    this.functions = null
  }

  /**
   * Loads the library, binds the functions used by this client and calls
   * rsmi_init. Does nothing when already initialized.
   *
   * @throws {Error} when koffi is unavailable or rsmi_init returns non-zero
   */
  init() {
    if (this.initialized) return
    if (!this.koffi) {
      throw new Error("koffi unavailable")
    }

    // Must run before binding: the prototypes below refer to rsmi_process_info_t by name.
    this.procInfoType = getRocmSmiTypes(this.koffi).procInfo

    // Load and bind only once, so that retrying after a failed rsmi_init does
    // not open the shared library again on every attempt.
    if (!this.library) {
      this.library = loadFirstLibrary(this.koffi, [
        process.env.ROCM_SMI_LIBRARY,
        ...rocmLibraryCandidates("librocm_smi64.so")
      ])
    }
    if (!this.functions) {
      this.functions = {
        init: this.library.func("int rsmi_init(uint64_t init_flags)"),
        shutdown: optionalFunction(this.library, [
          "int rsmi_shut_down(void)"
        ]),
        getProcessInfo: this.library.func("int rsmi_compute_process_info_get(_Out_ rsmi_process_info_t *procs, _Inout_ uint32_t *num_items)")
      }
    }

    const status = this.functions.init(RSMI_INIT_DEFAULT)
    if (status !== 0) {
      throw new Error(`rsmi_init failed: ${status}`)
    }
    this.initialized = true
  }

  /**
   * Probes the entry count, then fetches and decodes the process table.
   *
   * The probe/fetch pair is attempted at most twice. Status codes are not
   * inspected; a non-zero fetch status is assumed to be possibly caused by
   * the table changing after the probe, so one fresh probe is worth trying.
   *
   * @returns {Array|null} decoded entries, or null when nothing could be read
   */
  readProcessEntries() {
    for (let attempt = 0; attempt < 2; attempt += 1) {
      const count = [0]
      // Probe call: no buffer, only the count written back is used.
      this.functions.getProcessInfo(null, count)
      const capacity = count[0]
      if (capacity <= 0) {
        return null
      }

      const buffer = Buffer.alloc(this.koffi.sizeof(this.procInfoType) * capacity)
      const status = this.functions.getProcessInfo(buffer, count)
      if (status === 0) {
        // Never decode past the allocated buffer, whatever count came back.
        const written = Math.min(count[0], capacity)
        if (written <= 0) {
          return null
        }
        return this.koffi.decode(buffer, this.procInfoType, written)
      }
    }
    return null
  }

  /**
   * Collects VRAM usage per process.
   *
   * @param {*} [pids=null] pid filter forwarded to filterProcessMap()
   * @returns {Map} an empty Map when nothing could be read, otherwise the
   *   result of filterProcessMap() for the collected entries
   */
  collect(pids = null) {
    this.init()
    const entries = this.readProcessEntries()
    if (entries === null) {
      return new Map()
    }

    const processes = new Map()
    for (const entry of entries) {
      if (!entry) continue
      addGpuProcess(processes, entry.process_id, parseMemoryToBytes(entry.vram_usage))
    }
    return filterProcessMap(processes, pids)
  }

  /**
   * Calls the library's shutdown function when initialized and bound, then
   * marks the client as uninitialized. Shutdown errors are ignored.
   */
  stop() {
    if (this.initialized && this.functions && this.functions.shutdown) {
      try {
        this.functions.shutdown()
      } catch (_) {
        // Best-effort teardown: a failing shutdown must not break the caller.
      }
    }
    this.initialized = false
  }
}
159
978
 
160
979
  class GpuSampler {
161
980
  constructor(options = {}) {
162
981
  this.kernel = options.kernel || null
982
+ this.platform = options.platform || (this.kernel && this.kernel.platform) || os.platform()
163
983
  this.ttlMs = options.ttlMs || DEFAULT_GPU_TTL_MS
164
- this.timeoutMs = options.timeoutMs || DEFAULT_GPU_TIMEOUT_MS
984
+ this.procRoot = options.procRoot || "/proc"
985
+ this.drmFdinfoMaxPids = options.drmFdinfoMaxPids || DEFAULT_DRM_FDINFO_MAX_PIDS
986
+ this.drmFdinfoMaxFdsPerPid = options.drmFdinfoMaxFdsPerPid || DEFAULT_DRM_FDINFO_MAX_FDS_PER_PID
987
+ this.windowsPdhClient = options.windowsPdhClient || null
988
+ this.nvmlClient = options.nvmlClient || null
989
+ this.amdSmiClient = options.amdSmiClient || null
990
+ this.rocmSmiClient = options.rocmSmiClient || null
165
991
  this.current = null
992
+ this.currentCacheKey = null
166
993
  this.inFlight = null
994
+ this.inFlightCacheKey = null
167
995
  this.providerBackoff = new Map()
168
- }
169
-
170
- nvidiaCandidates() {
171
- const platform = os.platform()
172
- const candidates = [
173
- process.env.NVIDIA_SMI,
174
- "nvidia-smi",
175
- ...getPinokioCondaCandidates(this.kernel, ["nvidia-smi"])
176
- ]
177
- if (platform === "win32") {
178
- candidates.push(
179
- "C:\\Program Files\\NVIDIA Corporation\\NVSMI\\nvidia-smi.exe",
180
- "C:\\Windows\\System32\\nvidia-smi.exe"
181
- )
182
- } else if (platform === "linux") {
183
- candidates.push(
184
- "/usr/bin/nvidia-smi",
185
- "/usr/local/bin/nvidia-smi",
186
- "/usr/local/nvidia/bin/nvidia-smi",
187
- "/usr/local/cuda/bin/nvidia-smi"
188
- )
189
- }
190
- return executableCandidates(candidates)
191
- }
192
-
193
- amdCandidates() {
194
- const candidates = [
195
- process.env.AMD_SMI,
196
- "amd-smi",
197
- ...getPinokioCondaCandidates(this.kernel, ["amd-smi"])
198
- ]
199
- if (os.platform() === "linux") {
200
- candidates.push("/opt/rocm/bin/amd-smi", "/usr/bin/amd-smi", "/usr/local/bin/amd-smi")
201
- }
202
- return executableCandidates(candidates)
996
+ this.providerLogBackoff = new Map()
203
997
  }
204
998
 
205
999
  isBackedOff(provider) {
@@ -211,75 +1005,161 @@ class GpuSampler {
211
1005
  this.providerBackoff.set(provider, Date.now() + ms)
212
1006
  }
213
1007
 
214
- async collectNvidia() {
215
- if (this.isBackedOff("nvidia")) {
1008
+ logProviderFailure(provider, error, pids, fallbackMessage = "GPU provider unavailable", ms = 60000) {
1009
+ const now = Date.now()
1010
+ const until = this.providerLogBackoff.get(provider) || 0
1011
+ if (now < until) return
1012
+ this.providerLogBackoff.set(provider, now + ms)
1013
+
1014
+ const summary = {
1015
+ provider,
1016
+ platform: this.platform,
1017
+ pid_count: normalizePidSet(pids).length,
1018
+ error: error && error.message ? error.message : fallbackMessage
1019
+ }
1020
+ const code = error && (error.code || error.errno || error.status)
1021
+ if (code != null) {
1022
+ summary.code = String(code)
1023
+ }
1024
+ try {
1025
+ console.warn("[resource-usage:gpu] provider failed", summary)
1026
+ } catch (_) {}
1027
+ }
1028
+
1029
+ getWindowsPdhClient() {
1030
+ if (!this.windowsPdhClient) {
1031
+ this.windowsPdhClient = new WindowsPdhGpuMemoryClient()
1032
+ }
1033
+ return this.windowsPdhClient
1034
+ }
1035
+
1036
+ getNvmlClient() {
1037
+ if (!this.nvmlClient) {
1038
+ this.nvmlClient = new NvmlGpuMemoryClient()
1039
+ }
1040
+ return this.nvmlClient
1041
+ }
1042
+
1043
+ getAmdSmiClient() {
1044
+ if (!this.amdSmiClient) {
1045
+ this.amdSmiClient = new AmdSmiGpuMemoryClient()
1046
+ }
1047
+ return this.amdSmiClient
1048
+ }
1049
+
1050
+ getRocmSmiClient() {
1051
+ if (!this.rocmSmiClient) {
1052
+ this.rocmSmiClient = new RocmSmiGpuMemoryClient()
1053
+ }
1054
+ return this.rocmSmiClient
1055
+ }
1056
+
1057
+ async collectWindowsPdh(pids) {
1058
+ if (this.platform !== "win32" || this.isBackedOff("windows-pdh")) {
216
1059
  return null
217
1060
  }
218
- const args = [
219
- "--query-compute-apps=pid,used_gpu_memory",
220
- "--format=csv,noheader,nounits"
221
- ]
222
- let lastError = null
223
- for (const command of this.nvidiaCandidates()) {
224
- try {
225
- const { stdout } = await execFileText(command, args, { timeoutMs: this.timeoutMs })
226
- return {
227
- provider: "nvidia-smi",
228
- processes: parseNvidiaCsv(stdout),
229
- error: null
230
- }
231
- } catch (error) {
232
- lastError = error
233
- if (error && error.code === "ENOENT") {
234
- continue
235
- }
236
- break
1061
+ try {
1062
+ return {
1063
+ provider: "windows-pdh",
1064
+ processes: this.getWindowsPdhClient().collect(pids),
1065
+ error: null
1066
+ }
1067
+ } catch (error) {
1068
+ this.logProviderFailure("windows-pdh", error, pids, "Windows PDH unavailable")
1069
+ this.backoff("windows-pdh", 60000)
1070
+ return {
1071
+ provider: "windows-pdh",
1072
+ processes: new Map(),
1073
+ error: error && error.message ? error.message : "Windows PDH unavailable"
237
1074
  }
238
1075
  }
239
- this.backoff("nvidia", 60000)
240
- return {
241
- provider: "nvidia-smi",
242
- processes: new Map(),
243
- error: lastError && lastError.message ? lastError.message : "nvidia-smi unavailable"
1076
+ }
1077
+
1078
+ async collectLinuxDrmFdinfo(pids) {
1079
+ if (this.platform !== "linux" || this.isBackedOff("linux-drm-fdinfo") || pids == null) {
1080
+ return null
1081
+ }
1082
+ const targetPids = normalizePidSet(pids)
1083
+ if (targetPids.length === 0) {
1084
+ return null
1085
+ }
1086
+ try {
1087
+ const processes = await collectLinuxDrmFdinfoProcesses(targetPids, {
1088
+ procRoot: this.procRoot,
1089
+ maxPids: this.drmFdinfoMaxPids,
1090
+ maxFdsPerPid: this.drmFdinfoMaxFdsPerPid
1091
+ })
1092
+ if (processes.size === 0) {
1093
+ return null
1094
+ }
1095
+ return {
1096
+ provider: "linux-drm-fdinfo",
1097
+ processes,
1098
+ error: null
1099
+ }
1100
+ } catch (error) {
1101
+ this.logProviderFailure("linux-drm-fdinfo", error, pids, "Linux DRM fdinfo unavailable")
1102
+ this.backoff("linux-drm-fdinfo", 60000)
1103
+ return {
1104
+ provider: "linux-drm-fdinfo",
1105
+ processes: new Map(),
1106
+ error: error && error.message ? error.message : "Linux DRM fdinfo unavailable"
1107
+ }
244
1108
  }
245
1109
  }
246
1110
 
247
- async collectAmd() {
248
- if (os.platform() !== "linux" || this.isBackedOff("amd")) {
1111
+ async collectNvml(pids) {
1112
+ if (this.platform !== "linux" || this.isBackedOff("linux-nvml")) {
249
1113
  return null
250
1114
  }
251
- let lastError = null
252
- for (const command of this.amdCandidates()) {
253
- try {
254
- const { stdout } = await execFileText(command, ["process", "--json", "-G"], { timeoutMs: this.timeoutMs })
255
- return {
256
- provider: "amd-smi",
257
- processes: parseAmdJson(stdout),
258
- error: null
259
- }
260
- } catch (error) {
261
- lastError = error
262
- if (error && error.code === "ENOENT") {
263
- continue
264
- }
265
- break
1115
+ try {
1116
+ return {
1117
+ provider: "linux-nvml",
1118
+ processes: this.getNvmlClient().collect(pids),
1119
+ error: null
266
1120
  }
1121
+ } catch (error) {
1122
+ this.logProviderFailure("linux-nvml", error, pids, "Linux NVML unavailable")
1123
+ this.backoff("linux-nvml", 60000)
1124
+ return null
267
1125
  }
268
- this.backoff("amd", 90000)
269
- return {
270
- provider: "amd-smi",
271
- processes: new Map(),
272
- error: lastError && lastError.message ? lastError.message : "amd-smi unavailable"
1126
+ }
1127
+
1128
+ async collectAmdSmi(pids) {
1129
+ if (this.platform !== "linux" || this.isBackedOff("linux-amdsmi")) {
1130
+ return null
1131
+ }
1132
+ try {
1133
+ return {
1134
+ provider: "linux-amdsmi",
1135
+ processes: this.getAmdSmiClient().collect(pids),
1136
+ error: null
1137
+ }
1138
+ } catch (error) {
1139
+ this.logProviderFailure("linux-amdsmi", error, pids, "Linux AMD SMI unavailable")
1140
+ this.backoff("linux-amdsmi", 60000)
1141
+ return null
273
1142
  }
274
1143
  }
275
1144
 
276
- async collect() {
277
- const results = []
278
- const nvidia = await this.collectNvidia()
279
- if (nvidia) results.push(nvidia)
280
- const amd = await this.collectAmd()
281
- if (amd) results.push(amd)
1145
+ async collectRocmSmi(pids) {
1146
+ if (this.platform !== "linux" || this.isBackedOff("linux-rocm-smi")) {
1147
+ return null
1148
+ }
1149
+ try {
1150
+ return {
1151
+ provider: "linux-rocm-smi",
1152
+ processes: this.getRocmSmiClient().collect(pids),
1153
+ error: null
1154
+ }
1155
+ } catch (error) {
1156
+ this.logProviderFailure("linux-rocm-smi", error, pids, "Linux ROCm SMI unavailable")
1157
+ this.backoff("linux-rocm-smi", 60000)
1158
+ return null
1159
+ }
1160
+ }
282
1161
 
1162
+ mergeResults(results) {
283
1163
  const processes = new Map()
284
1164
  const providers = []
285
1165
  const errors = []
@@ -288,9 +1168,56 @@ class GpuSampler {
288
1168
  if (result.provider) providers.push(result.provider)
289
1169
  if (result.error) errors.push({ provider: result.provider, error: result.error })
290
1170
  for (const entry of result.processes.values()) {
291
- addGpuProcess(processes, entry.pid, entry.usedGpuMemoryBytes)
1171
+ mergeGpuProcess(processes, entry.pid, entry.usedGpuMemoryBytes)
292
1172
  }
293
1173
  }
1174
+ return { processes, providers, errors }
1175
+ }
1176
+
1177
+ async collect(pids = null) {
1178
+ if (this.platform === "darwin") {
1179
+ return {
1180
+ available: false,
1181
+ stale: false,
1182
+ collectedAt: Date.now(),
1183
+ providers: [],
1184
+ processes: new Map(),
1185
+ errors: []
1186
+ }
1187
+ }
1188
+
1189
+ const results = []
1190
+ if (this.platform === "win32") {
1191
+ const windowsPdh = await this.collectWindowsPdh(pids)
1192
+ if (windowsPdh) results.push(windowsPdh)
1193
+ } else if (this.platform === "linux") {
1194
+ const linuxDrmFdinfo = await this.collectLinuxDrmFdinfo(pids)
1195
+ if (linuxDrmFdinfo) results.push(linuxDrmFdinfo)
1196
+
1197
+ let merged = this.mergeResults(results)
1198
+ const covered = coveredPids(merged.processes)
1199
+
1200
+ if (hasUncoveredTarget(pids, covered)) {
1201
+ const nvml = await this.collectNvml(pids)
1202
+ if (nvml) results.push(nvml)
1203
+ }
1204
+
1205
+ merged = this.mergeResults(results)
1206
+ const afterNvmlCovered = coveredPids(merged.processes)
1207
+ if (hasUncoveredTarget(pids, afterNvmlCovered)) {
1208
+ const amdSmi = await this.collectAmdSmi(pids)
1209
+ if (amdSmi) results.push(amdSmi)
1210
+ }
1211
+
1212
+ merged = this.mergeResults(results)
1213
+ const afterAmdCovered = coveredPids(merged.processes)
1214
+ if (hasUncoveredTarget(pids, afterAmdCovered)) {
1215
+ const rocmSmi = await this.collectRocmSmi(pids)
1216
+ if (rocmSmi) results.push(rocmSmi)
1217
+ }
1218
+ }
1219
+
1220
+ const { processes, providers, errors } = this.mergeResults(results)
294
1221
  return {
295
1222
  available: providers.length > 0 && errors.length < providers.length,
296
1223
  stale: false,
@@ -301,18 +1228,22 @@ class GpuSampler {
301
1228
  }
302
1229
  }
303
1230
 
304
- async getSnapshot() {
1231
+ async getSnapshot(pids = null) {
305
1232
  const now = Date.now()
306
- if (this.current && now - this.current.collectedAt < this.ttlMs) {
1233
+ const cacheKey = this.platform === "darwin" ? "" : normalizePidSet(pids).join(",")
1234
+ if (this.current && this.currentCacheKey === cacheKey && now - this.current.collectedAt < this.ttlMs) {
307
1235
  return this.current
308
1236
  }
309
- if (this.inFlight) {
1237
+ if (this.inFlight && this.inFlightCacheKey === cacheKey) {
310
1238
  return this.inFlight
311
1239
  }
312
- this.inFlight = this.collect().then((snapshot) => {
1240
+ this.inFlightCacheKey = cacheKey
1241
+ this.inFlight = this.collect(pids).then((snapshot) => {
313
1242
  this.current = snapshot
1243
+ this.currentCacheKey = cacheKey
314
1244
  return snapshot
315
1245
  }).catch((error) => {
1246
+ this.logProviderFailure("gpu", error, pids, "GPU sampling unavailable")
316
1247
  if (this.current) {
317
1248
  return { ...this.current, stale: true }
318
1249
  }
@@ -326,9 +1257,18 @@ class GpuSampler {
326
1257
  }
327
1258
  }).finally(() => {
328
1259
  this.inFlight = null
1260
+ this.inFlightCacheKey = null
329
1261
  })
330
1262
  return this.inFlight
331
1263
  }
1264
+
1265
+ stop() {
1266
+ for (const client of [this.windowsPdhClient, this.nvmlClient, this.amdSmiClient, this.rocmSmiClient]) {
1267
+ if (client && typeof client.stop === "function") {
1268
+ client.stop()
1269
+ }
1270
+ }
1271
+ }
332
1272
  }
333
1273
 
334
1274
  function sumGpuMemory(snapshot, pids) {
@@ -344,6 +1284,15 @@ function sumGpuMemory(snapshot, pids) {
344
1284
 
345
1285
  module.exports = {
346
1286
  GpuSampler,
1287
+ WindowsPdhGpuMemoryClient,
1288
+ NvmlGpuMemoryClient,
1289
+ AmdSmiGpuMemoryClient,
1290
+ RocmSmiGpuMemoryClient,
347
1291
  parseMemoryToBytes,
1292
+ decodeWindowsMultiSz,
1293
+ extractPidFromWindowsGpuInstance,
1294
+ collectLinuxDrmFdinfoProcesses,
1295
+ isDedicatedDrmMemoryRegion,
1296
+ parseLinuxDrmFdinfo,
348
1297
  sumGpuMemory
349
1298
  }