pinokiod 7.3.5 → 7.3.8
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/kernel/api/index.js +3 -2
- package/kernel/api/script/index.js +1 -0
- package/kernel/resource_usage/gpu.js +1078 -270
- package/kernel/resource_usage/index.js +9 -4
- package/package.json +2 -1
- package/server/index.js +14 -2
- package/server/public/nav.js +1 -1
- package/server/public/style.css +298 -191
- package/server/public/task-launcher.css +16 -20
- package/server/public/universal-launcher.css +0 -113
- package/server/public/universal-launcher.js +1 -1
- package/server/views/app.ejs +592 -298
- package/server/views/autolaunch.ejs +1 -1
- package/server/views/checkpoints.ejs +2 -6
- package/server/views/connect.ejs +1 -1
- package/server/views/explore.ejs +2 -1
- package/server/views/index.ejs +89 -60
- package/server/views/install.ejs +5 -7
- package/server/views/invalid_content.ejs +1 -1
- package/server/views/layout.ejs +8 -2
- package/server/views/logs.ejs +5 -27
- package/server/views/net.ejs +1 -1
- package/server/views/network.ejs +1 -1
- package/server/views/partials/fs_status.ejs +0 -8
- package/server/views/partials/main_sidebar.ejs +108 -44
- package/server/views/plugin_detail.ejs +1 -1
- package/server/views/plugins.ejs +1 -28
- package/server/views/screenshots.ejs +1 -1
- package/server/views/settings.ejs +2 -1
- package/server/views/setup.ejs +15 -1
- package/server/views/skills.ejs +1 -1
- package/server/views/task_builder.ejs +1 -1
- package/server/views/task_install.ejs +1 -1
- package/server/views/task_launch.ejs +1 -1
- package/server/views/task_list.ejs +1 -1
- package/server/views/tools.ejs +1 -1
- package/test/resource-usage-gpu.test.js +320 -70
- package/test/script-api.test.js +90 -0
|
@@ -3,13 +3,31 @@
|
|
|
3
3
|
const fs = require("fs")
|
|
4
4
|
const os = require("os")
|
|
5
5
|
const path = require("path")
|
|
6
|
-
const {
|
|
6
|
+
const { normalizePid } = require("./process_tree")
|
|
7
7
|
|
|
8
|
-
const DEFAULT_GPU_TTL_MS =
|
|
9
|
-
const
|
|
10
|
-
const
|
|
8
|
+
// Default time-to-live in milliseconds; the Windows PDH client also uses it
// as its default counter refresh interval.
const DEFAULT_GPU_TTL_MS = 5000
// Upper bounds applied by the Linux DRM fdinfo scan.
const DEFAULT_DRM_FDINFO_MAX_PIDS = 4096
const DEFAULT_DRM_FDINFO_MAX_FDS_PER_PID = 1024
// Bytes per mebibyte.
const MIB = 1024 * 1024

// English PDH counter path (wildcard over every instance) for per-process
// dedicated GPU memory.
const WINDOWS_GPU_PROCESS_COUNTER = "\\GPU Process Memory(*)\\Dedicated Usage"
// Windows / PDH status codes, compared as unsigned 32-bit values.
const ERROR_SUCCESS = 0
const PDH_MORE_DATA = 0x800007D2
const PDH_INVALID_PATH = 0xC0000BC4
const PDH_INVALID_DATA = 0xC0000BC6
const PDH_NO_DATA = 0x800007D5
// Format flag passed to PdhGetFormattedCounterValue to request a 64-bit integer.
const PDH_FMT_LARGE = 0x00000400

// NVML return codes and the all-ones "value not available" marker.
const NVML_SUCCESS = 0
const NVML_ERROR_INSUFFICIENT_SIZE = 7
const NVML_VALUE_NOT_AVAILABLE = 0xFFFFFFFFFFFFFFFFn

// Init flags for the AMD SMI / ROCm SMI libraries.
const AMDSMI_INIT_AMD_GPUS = 1 << 1
const RSMI_INIT_DEFAULT = 0

// Lazily resolved koffi module: undefined = not tried yet, null = unavailable.
let koffiModule
// Per-koffi-instance cache of declared native types (see getCachedKoffiTypes).
const koffiTypeCache = new WeakMap()
|
|
30
|
+
|
|
13
31
|
function unique(values) {
|
|
14
32
|
const seen = new Set()
|
|
15
33
|
const next = []
|
|
@@ -21,54 +39,198 @@ function unique(values) {
|
|
|
21
39
|
return next
|
|
22
40
|
}
|
|
23
41
|
|
|
24
|
-
function
|
|
42
|
+
/**
 * Resolves the optional "koffi" FFI module once and memoizes the outcome.
 *
 * @returns {object|null} the koffi module, or null when requiring it failed
 */
function loadKoffi() {
  if (koffiModule === undefined) {
    try {
      koffiModule = require("koffi")
    } catch (_) {
      // Remember the failure as null so later calls do not retry the require.
      koffiModule = null
    }
  }
  return koffiModule
}
|
|
53
|
+
|
|
54
|
+
/**
 * Returns the value produced by `factory` for `key`, creating it at most once
 * per koffi instance. Results are stored in a Map held in `koffiTypeCache`
 * under the koffi object itself.
 *
 * @param {object} koffi - koffi module (or stand-in) used as the cache owner
 * @param {string} key - cache key naming the type bundle
 * @param {Function} factory - zero-argument builder invoked on a cache miss
 * @returns {*} the cached or freshly built value
 */
function getCachedKoffiTypes(koffi, key, factory) {
  let perModule = koffiTypeCache.get(koffi)
  if (!perModule) {
    perModule = new Map()
    koffiTypeCache.set(koffi, perModule)
  }
  if (perModule.has(key)) {
    return perModule.get(key)
  }
  const created = factory()
  perModule.set(key, created)
  return created
}
|
|
65
|
+
|
|
66
|
+
/**
 * Declares (once per koffi instance) the PDH structs used by the Windows
 * client and returns them as `{ counterValue, counterInfo }`.
 *
 * @param {object} koffi - koffi module used to declare the structs
 * @returns {{counterValue: object, counterInfo: object}}
 */
function getWindowsPdhTypes(koffi) {
  const declare = () => {
    // Only the 64-bit `largeValue` member is declared; it is the one read
    // when values are requested with PDH_FMT_LARGE.
    const counterValueFields = {
      CStatus: "uint32_t",
      largeValue: "int64_t"
    }
    // Leading fields of the counter-info struct up to the full path string.
    const counterInfoFields = {
      dwLength: "uint32_t",
      dwType: "uint32_t",
      CVersion: "uint32_t",
      CStatus: "uint32_t",
      lScale: "int32_t",
      lDefaultScale: "int32_t",
      dwUserData: "uintptr_t",
      dwQueryUserData: "uintptr_t",
      szFullPath: "str16"
    }
    const counterValue = koffi.struct("PDH_FMT_COUNTERVALUE", counterValueFields)
    const counterInfo = koffi.struct("PDH_COUNTER_INFO_W_PREFIX", counterInfoFields)
    return { counterValue, counterInfo }
  }
  return getCachedKoffiTypes(koffi, "windows-pdh", declare)
}
|
|
86
|
+
|
|
87
|
+
/**
 * Declares (once per koffi instance) the two NVML process-info struct layouts
 * and returns them as `{ processInfoV1, processInfoV2 }`.
 *
 * @param {object} koffi - koffi module used to declare the structs
 * @returns {{processInfoV1: object, processInfoV2: object}}
 */
function getNvmlTypes(koffi) {
  const declare = () => {
    // v1: pid + used memory only.
    const processInfoV1 = koffi.struct("nvmlProcessInfo_v1_t", {
      pid: "uint32_t",
      usedGpuMemory: "uint64_t"
    })
    // v2: v1 plus the GPU / compute instance identifiers.
    const processInfoV2 = koffi.struct("nvmlProcessInfo_v2_t", {
      pid: "uint32_t",
      usedGpuMemory: "uint64_t",
      gpuInstanceId: "uint32_t",
      computeInstanceId: "uint32_t"
    })
    return { processInfoV1, processInfoV2 }
  }
  return getCachedKoffiTypes(koffi, "nvml", declare)
}
|
|
102
|
+
|
|
103
|
+
/**
 * Declares (once per koffi instance) the AMD SMI per-process structs and
 * returns `{ procInfo }`.
 *
 * NOTE(review): the field order and sizes below mirror the native
 * `amdsmi_proc_info_t` as this file understands it — confirm against the
 * amdsmi header version actually installed.
 *
 * @param {object} koffi - koffi module used to declare the structs
 * @returns {{procInfo: object}}
 */
function getAmdSmiTypes(koffi) {
  const declare = () => {
    const engineUsage = koffi.struct("amdsmi_engine_usage_process_t", {
      gfx: "uint64_t",
      enc: "uint64_t",
      reserved: koffi.array("uint32_t", 12)
    })
    const memoryUsage = koffi.struct("amdsmi_memory_usage_process_t", {
      gtt_mem: "uint64_t",
      cpu_mem: "uint64_t",
      vram_mem: "uint64_t",
      reserved: koffi.array("uint32_t", 10)
    })
    // The outer record embeds the two structs declared above.
    const procInfo = koffi.struct("amdsmi_proc_info_t", {
      name: koffi.array("char", 256),
      pid: "uint32_t",
      mem: "uint64_t",
      engine_usage: engineUsage,
      memory_usage: memoryUsage,
      container_name: koffi.array("char", 256),
      cu_occupancy: "uint32_t",
      evicted_time: "uint32_t",
      reserved: koffi.array("uint32_t", 10)
    })
    return { procInfo }
  }
  return getCachedKoffiTypes(koffi, "amdsmi", declare)
}
|
|
130
|
+
|
|
131
|
+
/**
 * Declares (once per koffi instance) the ROCm SMI process-info struct and
 * returns `{ procInfo }`.
 *
 * @param {object} koffi - koffi module used to declare the struct
 * @returns {{procInfo: object}}
 */
function getRocmSmiTypes(koffi) {
  const declare = () => {
    const fields = {
      process_id: "uint32_t",
      pasid: "uint32_t",
      vram_usage: "uint64_t",
      sdma_usage: "uint64_t",
      cu_occupancy: "uint32_t"
    }
    const procInfo = koffi.struct("rsmi_process_info_t", fields)
    return { procInfo }
  }
  return getCachedKoffiTypes(koffi, "rocm-smi", declare)
}
|
|
32
143
|
|
|
33
|
-
function
|
|
144
|
+
/**
 * De-duplicates library candidates and drops the ones that cannot work:
 * falsy entries, and absolute paths that do not exist on disk. Non-absolute
 * names are kept without any filesystem check.
 *
 * @param {Array<string|undefined|null>} candidates - library names or paths
 * @returns {string[]} candidates worth passing to the loader, in order
 */
function existingLibraryCandidates(candidates) {
  const isUsable = (candidate) => {
    if (!candidate) return false
    if (path.isAbsolute(candidate)) {
      try {
        return fs.existsSync(candidate)
      } catch (_) {
        return false
      }
    }
    // Non-absolute names are left for the loader to resolve.
    return true
  }
  return unique(candidates).filter(isUsable)
}
|
|
42
155
|
|
|
43
|
-
function
|
|
44
|
-
|
|
45
|
-
|
|
156
|
+
/**
 * Builds the ordered list of places to look for a ROCm shared library: the
 * bare filename first, then `lib` / `lib64` under each known ROCm root
 * (ROCM_PATH, ROCM_HOME, /opt/rocm, /usr, /usr/local), then a few fixed
 * system library directories. The result is passed through
 * existingLibraryCandidates.
 *
 * @param {string} filename - shared library file name, e.g. "libamd_smi.so"
 * @returns {string[]} filtered candidates in search order
 */
function rocmLibraryCandidates(filename) {
  const roots = unique([
    process.env.ROCM_PATH,
    process.env.ROCM_HOME,
    "/opt/rocm",
    "/usr",
    "/usr/local"
  ])
  const candidates = [filename]
  for (const root of roots) {
    // An unset env var must never reach path.join (it throws on non-strings),
    // and an empty one would produce a cwd-relative "lib/<filename>" entry.
    // Skip both explicitly instead of relying on how `unique` treats them.
    if (typeof root !== "string" || root === "") continue
    candidates.push(
      path.join(root, "lib", filename),
      path.join(root, "lib64", filename)
    )
  }
  candidates.push(
    path.join("/usr/lib/x86_64-linux-gnu", filename),
    path.join("/usr/lib/aarch64-linux-gnu", filename),
    path.join("/usr/local/lib", filename)
  )
  return existingLibraryCandidates(candidates)
}
|
|
178
|
+
|
|
179
|
+
/**
 * Tries each usable candidate in order and returns the first library that
 * koffi manages to load.
 *
 * @param {object} koffi - koffi module providing `load`
 * @param {Array<string|undefined|null>} candidates - names/paths to try
 * @param {object} [options] - forwarded to `koffi.load`
 * @returns {object} the loaded library handle
 * @throws the last load error, or Error("native GPU library unavailable")
 *   when no candidate produced one
 */
function loadFirstLibrary(koffi, candidates, options = {}) {
  const usable = existingLibraryCandidates(candidates)
  let failure = null
  for (let i = 0; i < usable.length; i += 1) {
    try {
      return koffi.load(usable[i], options)
    } catch (error) {
      failure = error
    }
  }
  if (failure) {
    throw failure
  }
  throw new Error("native GPU library unavailable")
}
|
|
190
|
+
|
|
191
|
+
/**
 * Binds the first prototype in `definitions` that the library accepts.
 * Binding failures are deliberately ignored so callers can probe for
 * versioned symbols.
 *
 * @param {{func: Function}} library - loaded library exposing `func`
 * @param {Iterable<string>} definitions - prototypes in order of preference
 * @returns {*} whatever `library.func` returned for the first prototype that
 *   did not throw, or null when every prototype failed
 */
function optionalFunction(library, definitions) {
  let bound = null
  for (const signature of definitions) {
    try {
      bound = library.func(signature)
    } catch (_) {
      // Symbol missing or prototype rejected: try the next one.
      continue
    }
    return bound
  }
  return null
}
|
|
199
|
+
|
|
200
|
+
/**
 * Normalizes a native status value to an unsigned 32-bit integer, so codes
 * returned as negative signed ints compare equal to their hex constants.
 *
 * @param {*} value - status as returned by the native call
 * @returns {number} value coerced with Number() and `>>> 0`
 */
function statusCode(value) {
  const numeric = Number(value)
  return numeric >>> 0
}
|
|
203
|
+
|
|
204
|
+
/**
 * Compares a native status with an expected code after reducing both to
 * unsigned 32-bit integers.
 *
 * @param {*} value - status as returned by the native call
 * @param {number} expected - status constant to compare against
 * @returns {boolean} true when both denote the same 32-bit code
 */
function isStatus(value, expected) {
  // Same normalization as statusCode(): Number() then unsigned 32-bit.
  const actual = Number(value) >>> 0
  const wanted = expected >>> 0
  return actual === wanted
}
|
|
207
|
+
|
|
208
|
+
/**
 * Tells whether a native status equals ERROR_SUCCESS.
 *
 * @param {*} value - status as returned by the native call
 * @returns {boolean}
 */
function isSuccess(value) {
  const succeeded = isStatus(value, ERROR_SUCCESS)
  return succeeded
}
|
|
211
|
+
|
|
212
|
+
/**
 * Tells whether a PDH status is one of the codes this file treats as
 * "nothing to report": PDH_INVALID_PATH, PDH_INVALID_DATA or PDH_NO_DATA.
 *
 * @param {*} value - status as returned by the PDH call
 * @returns {boolean}
 */
function isNoDataStatus(value) {
  const noDataCodes = [PDH_INVALID_PATH, PDH_INVALID_DATA, PDH_NO_DATA]
  return noDataCodes.some((code) => isStatus(value, code))
}
|
|
215
|
+
|
|
216
|
+
/**
 * Converts a native numeric value to a non-negative JavaScript number.
 *
 * @param {*} value - number, bigint, or anything Number() can coerce
 * @returns {number|null} the numeric value, or null when it is missing
 *   (null/undefined), negative, not finite, or a bigint above
 *   Number.MAX_SAFE_INTEGER
 */
function toSafeNumber(value) {
  // Number(null) is 0, which would report a missing value as "0 bytes";
  // treat null the same way undefined is already treated.
  if (value == null) return null
  if (typeof value === "bigint") {
    if (value < 0n || value > BigInt(Number.MAX_SAFE_INTEGER)) return null
    return Number(value)
  }
  const number = Number(value)
  if (!Number.isFinite(number) || number < 0) return null
  return number
}
|
|
64
225
|
|
|
65
226
|
function parseMemoryToBytes(value, defaultUnit = "") {
|
|
66
227
|
if (value == null) return null
|
|
67
|
-
if (typeof value === "number") {
|
|
68
|
-
|
|
69
|
-
if (
|
|
70
|
-
if (defaultUnit === "
|
|
71
|
-
return Math.round(
|
|
228
|
+
if (typeof value === "number" || typeof value === "bigint") {
|
|
229
|
+
const number = toSafeNumber(value)
|
|
230
|
+
if (number == null) return null
|
|
231
|
+
if (defaultUnit === "mib") return Math.round(number * MIB)
|
|
232
|
+
if (defaultUnit === "kb") return Math.round(number * 1024)
|
|
233
|
+
return Math.round(number)
|
|
72
234
|
}
|
|
73
235
|
const raw = String(value).trim()
|
|
74
236
|
if (!raw || /N\/A|not supported|none/i.test(raw)) {
|
|
@@ -112,112 +274,706 @@ function mergeGpuProcess(processes, pid, bytes) {
|
|
|
112
274
|
processes.set(normalizedPid, current)
|
|
113
275
|
}
|
|
114
276
|
|
|
115
|
-
function
|
|
116
|
-
const
|
|
117
|
-
for (const
|
|
118
|
-
const
|
|
119
|
-
if (
|
|
120
|
-
const parts = trimmed.split(",").map((part) => part.trim())
|
|
121
|
-
const pid = normalizePid(parts[0])
|
|
122
|
-
const bytes = parseMemoryToBytes(parts[1], "mib")
|
|
123
|
-
addGpuProcess(processes, pid, bytes)
|
|
277
|
+
/**
 * Normalizes a collection of pid-like values into a sorted list of distinct
 * pids. Values that `normalizePid` maps to a falsy result are dropped.
 *
 * @param {Iterable<*>|null|undefined} values - raw pids; nullish means none
 * @returns {number[]} distinct pids in ascending order
 */
function normalizePidSet(values) {
  const distinct = new Set()
  for (const value of values || []) {
    const pid = normalizePid(value)
    if (pid) distinct.add(pid)
  }
  return Array.from(distinct).sort((a, b) => a - b)
}
|
|
127
285
|
|
|
128
|
-
function
|
|
129
|
-
const
|
|
130
|
-
|
|
131
|
-
|
|
132
|
-
|
|
133
|
-
|
|
134
|
-
|
|
135
|
-
|
|
136
|
-
|
|
137
|
-
|
|
138
|
-
|
|
139
|
-
|
|
140
|
-
|
|
141
|
-
}
|
|
142
|
-
} else if (char === "," && !inQuotes) {
|
|
143
|
-
values.push(current)
|
|
144
|
-
current = ""
|
|
145
|
-
} else {
|
|
146
|
-
current += char
|
|
286
|
+
/**
 * Restricts a pid → entry map to the requested pids.
 *
 * - `pids` nullish: the original map is returned unchanged (same object).
 * - `pids` given but containing no valid pid: an empty map is returned.
 * - otherwise: a new map holding only entries whose `pid` was requested.
 *
 * @param {Map<number, {pid: number}>} processes - collected entries
 * @param {Iterable<*>|null|undefined} pids - target pids, or nullish for all
 * @returns {Map<number, {pid: number}>}
 */
function filterProcessMap(processes, pids) {
  const wanted = new Set(normalizePidSet(pids))
  if (wanted.size === 0) {
    return pids != null ? new Map() : processes
  }
  const filtered = new Map()
  for (const entry of processes.values()) {
    if (!wanted.has(entry.pid)) continue
    filtered.set(entry.pid, entry)
  }
  return filtered
}
|
|
303
|
+
|
|
304
|
+
/**
 * Collects the keys of a process map into a Set.
 *
 * @param {Map<number, *>} processes - pid-keyed map
 * @returns {Set<number>} the map's keys, in the map's iteration order
 */
function coveredPids(processes) {
  const covered = new Set()
  for (const pid of processes.keys()) {
    covered.add(pid)
  }
  return covered
}
|
|
307
|
+
|
|
308
|
+
/**
 * Tells whether at least one requested pid is missing from `covered`.
 *
 * @param {Iterable<*>|null|undefined} pids - target pids; nullish means
 *   "no explicit target" and always counts as uncovered
 * @param {Set<number>} covered - pids that already have data
 * @returns {boolean} true when `pids` is nullish or some valid target pid is
 *   not in `covered`; false when there are no valid targets or all are covered
 */
function hasUncoveredTarget(pids, covered) {
  const targetPids = normalizePidSet(pids)
  if (pids == null) return true
  return targetPids.some((pid) => !covered.has(pid))
}
|
|
152
317
|
|
|
153
|
-
function
|
|
154
|
-
const
|
|
318
|
+
/**
 * Pulls the process id out of a GPU counter instance name or path by looking
 * for a "pid" token followed by digits (e.g. "pid_1234").
 *
 * @param {*} instanceName - counter instance name or full counter path
 * @returns {*} result of `normalizePid` on the captured digits (or on a
 *   falsy value when nothing matched)
 */
function extractPidFromWindowsGpuInstance(instanceName) {
  const text = String(instanceName || "")
  // "pid" must not be preceded by a letter or digit, and the digit run must
  // end at a non-digit or at the end of the string.
  const found = text.match(/(?:^|[^a-z0-9])pid[_\s-]*(\d+)(?:\D|$)/i)
  return normalizePid(found && found[1])
}
|
|
322
|
+
|
|
323
|
+
/**
 * Decodes a UTF-16LE MULTI_SZ buffer (NUL-separated strings ended by an empty
 * string) into an array of strings.
 *
 * @param {Buffer} buffer - raw bytes filled by the native call
 * @param {number} charCount - number of 16-bit characters to examine; clamped
 *   to what the buffer can actually hold
 * @returns {string[]} decoded strings; a trailing segment with no terminating
 *   NUL inside the examined range is not included
 */
function decodeWindowsMultiSz(buffer, charCount) {
  const limit = Math.max(0, Math.min(charCount || 0, Math.floor(buffer.length / 2)))
  const strings = []
  let segmentStart = 0
  let index = 0
  while (index < limit) {
    if (buffer.readUInt16LE(index * 2) === 0) {
      // A NUL right at the start of a segment is the list terminator.
      if (index === segmentStart) break
      strings.push(buffer.toString("utf16le", segmentStart * 2, index * 2))
      segmentStart = index + 1
    }
    index += 1
  }
  return strings.filter(Boolean)
}
|
|
336
|
+
|
|
337
|
+
/**
 * Decides whether a DRM fdinfo memory-region name denotes dedicated device
 * memory. The name is trimmed, lower-cased and stripped of everything but
 * letters and digits; "vram" / "local" (optionally followed by digits) count
 * as dedicated, while system/gtt/memory/shared/stolen/cpu/host do not.
 *
 * @param {*} region - region suffix taken from a `drm-*-<region>` key
 * @returns {boolean}
 */
function isDedicatedDrmMemoryRegion(region) {
  const hyphenated = String(region || "").trim().toLowerCase().replace(/[_\s]+/g, "-")
  const compact = hyphenated.replace(/[^a-z0-9]/g, "")
  if (!compact) return false
  const isSystemLike = /^(system|gtt|memory|shared|stolen|cpu|host)\d*$/.test(compact)
  if (isSystemLike) return false
  if (/^vram\d*$/.test(compact)) return true
  return /^local\d*$/.test(compact)
}
|
|
348
|
+
|
|
349
|
+
/**
 * Parses the text of one `/proc/<pid>/fdinfo/<fd>` file and extracts the DRM
 * client information it contains.
 *
 * Lines are read as `key: value` pairs (keys lower-cased; a repeated key keeps
 * its last value). Dedicated memory is summed over `drm-resident-<region>`
 * keys whose region is dedicated; if no such resident key yields a usable
 * value, the sum over `drm-memory-<region>` keys is used instead.
 *
 * @param {*} stdout - file contents
 * @returns {{driver: string, pdev: string, clientId: string, dedicatedBytes: number}|null}
 *   null when the file has no `drm-driver` field
 */
function parseLinuxDrmFdinfo(stdout) {
  const fields = new Map()
  String(stdout || "").split(/\r?\n/).forEach((line) => {
    const separator = line.indexOf(":")
    if (separator < 0) return
    const key = line.slice(0, separator).trim().toLowerCase()
    const value = line.slice(separator + 1).trim()
    if (key) fields.set(key, value)
  })

  const driver = fields.get("drm-driver")
  if (!driver) {
    return null
  }

  // Running totals per key family: "resident" (preferred) and "memory" (legacy).
  const totals = { resident: 0, memory: 0 }
  let sawResident = false
  for (const [key, value] of fields) {
    const match = /^drm-(resident|memory)-(.+)$/.exec(key)
    if (!match) continue
    const [, family, region] = match
    if (!isDedicatedDrmMemoryRegion(region)) continue
    const bytes = parseMemoryToBytes(value)
    if (!Number.isFinite(bytes) || bytes < 0) continue
    totals[family] += bytes
    if (family === "resident") sawResident = true
  }

  return {
    driver,
    pdev: fields.get("drm-pdev") || "",
    clientId: fields.get("drm-client-id") || "",
    dedicatedBytes: sawResident ? totals.resident : totals.memory
  }
}
|
|
387
|
+
|
|
388
|
+
/**
 * Scans `<procRoot>/<pid>/fdinfo/*` for the given pids and builds a map of
 * per-process dedicated GPU memory from the DRM fields found there.
 *
 * File descriptors that report the same pid, driver, device and DRM client id
 * are counted once, using the largest value seen for that client. The
 * per-client values are then handed to `addGpuProcess` per pid. Unreadable
 * directories and files are skipped.
 *
 * @param {Iterable<*>|null|undefined} pids - processes to inspect
 * @param {object} [options]
 * @param {string} [options.procRoot="/proc"] - root of the proc filesystem
 * @param {number} [options.maxPids] - cap on inspected pids
 * @param {number} [options.maxFdsPerPid] - cap on numeric fd entries per pid
 * @returns {Promise<Map>} map populated through `addGpuProcess`
 */
async function collectLinuxDrmFdinfoProcesses(pids, options = {}) {
  const procRoot = options.procRoot || "/proc"
  const pidLimit = options.maxPids || DEFAULT_DRM_FDINFO_MAX_PIDS
  const fdLimit = options.maxFdsPerPid || DEFAULT_DRM_FDINFO_MAX_FDS_PER_PID
  const targets = normalizePidSet(pids).slice(0, pidLimit)
  const byClient = new Map()

  for (const pid of targets) {
    const fdinfoDir = path.join(procRoot, String(pid), "fdinfo")
    let entries = []
    try {
      entries = await fs.promises.readdir(fdinfoDir, { withFileTypes: true })
    } catch (_) {
      // Process gone or not readable: nothing to report for this pid.
      continue
    }

    let scannedFds = 0
    for (const entry of entries) {
      const name = entry && entry.name ? entry.name : ""
      if (!/^\d+$/.test(name)) continue
      scannedFds += 1
      if (scannedFds > fdLimit) break

      let contents = ""
      try {
        contents = await fs.promises.readFile(path.join(fdinfoDir, name), "utf8")
      } catch (_) {
        // The descriptor may have been closed since readdir.
        continue
      }

      const parsed = parseLinuxDrmFdinfo(contents)
      if (!parsed || !(parsed.dedicatedBytes > 0)) continue

      const clientKey = parsed.clientId ? `client:${parsed.clientId}` : "unknown-client"
      const driverKey = parsed.driver || "unknown-driver"
      const deviceKey = parsed.pdev || "unknown-device"
      const key = `${pid}:${driverKey}:${deviceKey}:${clientKey}`

      // Several fds can refer to the same DRM client; keep the largest value.
      const previous = byClient.get(key)
      const bytes = previous
        ? Math.max(previous.bytes, parsed.dedicatedBytes)
        : parsed.dedicatedBytes
      byClient.set(key, { pid, bytes })
    }
  }

  const processes = new Map()
  for (const { pid, bytes } of byClient.values()) {
    addGpuProcess(processes, pid, bytes)
  }
  return processes
}
|
|
176
443
|
|
|
177
|
-
|
|
178
|
-
|
|
444
|
+
/**
 * Reads per-process dedicated GPU memory on Windows from the PDH
 * "GPU Process Memory(*)\Dedicated Usage" counters, calling pdh.dll through
 * koffi.
 *
 * One PDH query holding a counter per GPU-process instance is kept open and
 * rebuilt at most every `counterRefreshMs` milliseconds while it has counters.
 */
class WindowsPdhGpuMemoryClient {
  /**
   * @param {object} [options]
   * @param {object} [options.koffi] - koffi module; defaults to loadKoffi()
   * @param {number} [options.counterRefreshMs] - minimum age (ms) before the
   *   counter list is rebuilt; defaults to DEFAULT_GPU_TTL_MS
   */
  constructor(options = {}) {
    this.koffi = options.koffi || loadKoffi()
    this.library = null
    this.query = null
    this.counters = []
    this.counterValueType = null
    this.counterInfoType = null
    this.functions = null
    this.counterRefreshMs = options.counterRefreshMs || DEFAULT_GPU_TTL_MS
    this.lastCounterRefreshAt = 0
  }

  /**
   * Loads pdh.dll and binds the PDH functions once.
   * @throws {Error} when koffi is unavailable (binding errors propagate too)
   */
  init() {
    if (this.functions) return
    if (!this.koffi) {
      throw new Error("koffi unavailable")
    }

    const types = getWindowsPdhTypes(this.koffi)
    this.counterValueType = types.counterValue
    this.counterInfoType = types.counterInfo

    this.library = this.koffi.load("pdh.dll")
    this.functions = {
      openQuery: this.library.func("uint32_t __stdcall PdhOpenQueryW(const char16_t *szDataSource, uintptr_t dwUserData, _Out_ void **phQuery)"),
      addEnglishCounter: this.library.func("uint32_t __stdcall PdhAddEnglishCounterW(void *hQuery, const char16_t *szFullCounterPath, uintptr_t dwUserData, _Out_ void **phCounter)"),
      addCounter: this.library.func("uint32_t __stdcall PdhAddCounterW(void *hQuery, const char16_t *szFullCounterPath, uintptr_t dwUserData, _Out_ void **phCounter)"),
      collectQueryData: this.library.func("uint32_t __stdcall PdhCollectQueryData(void *hQuery)"),
      getCounterInfo: this.library.func("uint32_t __stdcall PdhGetCounterInfoW(void *hCounter, int bRetrieveExplainText, _Inout_ uint32_t *pdwBufferSize, _Out_ void *lpBuffer)"),
      expandWildCardPath: this.library.func("uint32_t __stdcall PdhExpandWildCardPathW(const char16_t *szDataSource, const char16_t *szWildCardPath, _Out_ char16_t *mszExpandedPathList, _Inout_ uint32_t *pcchPathListLength, uint32_t dwFlags)"),
      getFormattedCounterValue: this.library.func("uint32_t __stdcall PdhGetFormattedCounterValue(void *hCounter, uint32_t dwFormat, _Out_ uint32_t *lpdwType, _Out_ PDH_FMT_COUNTERVALUE *pValue)"),
      closeQuery: this.library.func("uint32_t __stdcall PdhCloseQuery(void *hQuery)")
    }
  }

  /**
   * Opens a new PDH query.
   * @returns {*} the query handle
   * @throws {Error} when PdhOpenQueryW does not succeed
   */
  openQuery() {
    const query = [null]
    const status = this.functions.openQuery(null, 0, query)
    if (!isSuccess(status)) {
      throw new Error(`PdhOpenQueryW failed: 0x${statusCode(status).toString(16)}`)
    }
    return query[0]
  }

  /**
   * Closes a query handle. Closing is best effort: errors are ignored.
   * @param {*} query - handle from openQuery(); falsy values are ignored
   */
  closeQuery(query) {
    if (!query || !this.functions) return
    try {
      this.functions.closeQuery(query)
    } catch (_) {}
  }

  /**
   * Resolves the counter path PDH reports for the English wildcard path, by
   * adding the English counter to a temporary query and reading its info.
   * @returns {string} the reported full path, or the English path when no
   *   path could be read
   * @throws {Error} when a PDH call fails
   */
  getLocalizedWildcardPath() {
    const query = this.openQuery()
    const counter = [null]
    try {
      let status = this.functions.addEnglishCounter(query, WINDOWS_GPU_PROCESS_COUNTER, 0, counter)
      if (!isSuccess(status)) {
        throw new Error(`PdhAddEnglishCounterW failed: 0x${statusCode(status).toString(16)}`)
      }

      // First call with no buffer asks for the required size.
      const bufferSize = [0]
      status = this.functions.getCounterInfo(counter[0], 0, bufferSize, null)
      if (!isStatus(status, PDH_MORE_DATA) && !isSuccess(status)) {
        throw new Error(`PdhGetCounterInfoW failed: 0x${statusCode(status).toString(16)}`)
      }
      if (bufferSize[0] <= 0) {
        return WINDOWS_GPU_PROCESS_COUNTER
      }

      const buffer = Buffer.alloc(bufferSize[0])
      status = this.functions.getCounterInfo(counter[0], 0, bufferSize, buffer)
      if (!isSuccess(status)) {
        throw new Error(`PdhGetCounterInfoW failed: 0x${statusCode(status).toString(16)}`)
      }

      const info = this.koffi.decode(buffer, this.counterInfoType)
      return info && info.szFullPath ? info.szFullPath : WINDOWS_GPU_PROCESS_COUNTER
    } finally {
      this.closeQuery(query)
    }
  }

  /**
   * Expands a wildcard counter path into concrete counter paths.
   * @param {string} wildcardPath - path containing a `*` instance
   * @returns {string[]} expanded paths; empty when PDH reports no data
   * @throws {Error} when PdhExpandWildCardPathW fails for another reason
   */
  expandWildcardPath(wildcardPath) {
    // First call with no buffer asks for the required length in characters.
    const charCount = [0]
    let status = this.functions.expandWildCardPath(null, wildcardPath, null, charCount, 0)
    if (isNoDataStatus(status)) {
      return []
    }
    if (!isStatus(status, PDH_MORE_DATA) && !isSuccess(status)) {
      throw new Error(`PdhExpandWildCardPathW failed: 0x${statusCode(status).toString(16)}`)
    }
    if (charCount[0] <= 0) {
      return []
    }

    const buffer = Buffer.alloc(charCount[0] * 2)
    status = this.functions.expandWildCardPath(null, wildcardPath, buffer, charCount, 0)
    if (isNoDataStatus(status)) {
      return []
    }
    if (!isSuccess(status)) {
      throw new Error(`PdhExpandWildCardPathW failed: 0x${statusCode(status).toString(16)}`)
    }
    return decodeWindowsMultiSz(buffer, charCount[0])
  }

  /**
   * Rebuilds the query and its per-process counters, unless a query exists
   * and the last rebuild is younger than `counterRefreshMs`.
   * @param {boolean} [force=false] - rebuild regardless of age
   * @throws {Error} when enumerating paths or opening the query fails
   */
  refreshCounters(force = false) {
    const now = Date.now()
    if (!force && this.query && now - this.lastCounterRefreshAt < this.counterRefreshMs) {
      return
    }

    const paths = this.expandWildcardPath(this.getLocalizedWildcardPath())
    const query = this.openQuery()
    const counters = []
    try {
      for (const counterPath of paths) {
        const pid = extractPidFromWindowsGpuInstance(counterPath)
        if (!pid) continue
        const counter = [null]
        const status = this.functions.addCounter(query, counterPath, 0, counter)
        if (isSuccess(status) && counter[0]) {
          counters.push({ handle: counter[0], pid })
        }
      }
    } catch (error) {
      this.closeQuery(query)
      throw error
    }

    // Swap in the new query (or none when it has no counters) and release
    // whichever handles are no longer referenced.
    const previousQuery = this.query
    this.query = counters.length > 0 ? query : null
    this.counters = counters
    this.lastCounterRefreshAt = now
    if (this.query !== query) {
      this.closeQuery(query)
    }
    this.closeQuery(previousQuery)
  }

  /**
   * Reads one counter as a 64-bit value.
   * @param {{handle: *, pid: number}} counter - entry from `this.counters`
   * @returns {number|null} bytes as produced by parseMemoryToBytes, or null
   *   when the call fails or the sample is not marked as valid
   */
  readCounterValue(counter) {
    // PDH marks a usable sample with either of these counter statuses;
    // rejecting NEW_DATA would silently drop valid readings.
    const PDH_CSTATUS_VALID_DATA = 0x00000000
    const PDH_CSTATUS_NEW_DATA = 0x00000001

    const type = [0]
    const buffer = Buffer.alloc(this.koffi.sizeof(this.counterValueType))
    const status = this.functions.getFormattedCounterValue(counter.handle, PDH_FMT_LARGE, type, buffer)
    if (!isSuccess(status)) {
      // Covers the "no data" statuses as well as any other failure.
      return null
    }
    const value = this.koffi.decode(buffer, this.counterValueType)
    if (!value) {
      return null
    }
    const hasValidData = isStatus(value.CStatus, PDH_CSTATUS_VALID_DATA) ||
      isStatus(value.CStatus, PDH_CSTATUS_NEW_DATA)
    if (!hasValidData) {
      return null
    }
    return parseMemoryToBytes(value.largeValue)
  }

  /**
   * Samples the counters and returns dedicated GPU memory per process.
   * @param {Iterable<*>|null|undefined} pids - pids to report; nullish means
   *   every process that has a counter. An explicit target without any valid
   *   pid yields an empty map, matching filterProcessMap.
   * @returns {Map} map populated through addGpuProcess
   * @throws {Error} when initialization, refresh or PdhCollectQueryData fails
   */
  collect(pids) {
    this.init()

    const targetPids = normalizePidSet(pids)
    if (pids != null && targetPids.length === 0) {
      // Explicit but empty target: nothing was asked for, so skip PDH work.
      return new Map()
    }

    this.refreshCounters(false)
    if (!this.query || this.counters.length === 0) {
      return new Map()
    }

    const status = this.functions.collectQueryData(this.query)
    if (isNoDataStatus(status)) {
      return new Map()
    }
    if (!isSuccess(status)) {
      throw new Error(`PdhCollectQueryData failed: 0x${statusCode(status).toString(16)}`)
    }

    const targetSet = targetPids.length > 0 ? new Set(targetPids) : null
    const processes = new Map()
    for (const counter of this.counters) {
      if (!counter || (targetSet && !targetSet.has(counter.pid))) continue
      addGpuProcess(processes, counter.pid, this.readCounterValue(counter))
    }
    return processes
  }

  /** Closes the open query, if any, and forgets its counters. */
  stop() {
    this.closeQuery(this.query)
    this.query = null
    this.counters = []
  }
}
|
|
633
|
+
|
|
634
|
+
/**
 * Reads per-process GPU memory through NVIDIA's NVML shared library, called
 * via koffi. For every device the compute, graphics and MPS process lists are
 * merged per pid, and the per-device results are then added up across devices.
 */
class NvmlGpuMemoryClient {
  /**
   * @param {object} [options]
   * @param {object} [options.koffi] - koffi module; defaults to loadKoffi()
   */
  constructor(options = {}) {
    this.koffi = options.koffi || loadKoffi()
    this.library = null
    this.initialized = false
    this.processInfoV1 = null
    this.processInfoV2 = null
    this.functions = null
  }

  /**
   * Loads the NVML library, binds the needed entry points and calls nvmlInit.
   * Does nothing when already initialized.
   * @throws {Error} when koffi or the library is unavailable, a required
   *   entry point is missing, or nvmlInit returns a non-success status
   */
  init() {
    if (this.initialized) return
    if (!this.koffi) {
      throw new Error("koffi unavailable")
    }

    const { processInfoV1, processInfoV2 } = getNvmlTypes(this.koffi)
    this.processInfoV1 = processInfoV1
    this.processInfoV2 = processInfoV2

    const libraryCandidates = [
      process.env.NVIDIA_ML,
      "libnvidia-ml.so.1",
      "/usr/lib/x86_64-linux-gnu/libnvidia-ml.so.1",
      "/usr/lib/aarch64-linux-gnu/libnvidia-ml.so.1",
      "/usr/lib64/libnvidia-ml.so.1",
      "/usr/local/nvidia/lib64/libnvidia-ml.so.1"
    ]
    this.library = loadFirstLibrary(this.koffi, libraryCandidates)

    // Each entry lists the versioned symbol first and the unversioned fallback after it.
    const bind = (definitions) => optionalFunction(this.library, definitions)
    this.functions = {
      init: bind([
        "int nvmlInit_v2(void)",
        "int nvmlInit(void)"
      ]),
      shutdown: bind([
        "int nvmlShutdown(void)"
      ]),
      getCount: bind([
        "int nvmlDeviceGetCount_v2(_Out_ uint32_t *deviceCount)",
        "int nvmlDeviceGetCount(_Out_ uint32_t *deviceCount)"
      ]),
      getHandleByIndex: bind([
        "int nvmlDeviceGetHandleByIndex_v2(uint32_t index, _Out_ void **device)",
        "int nvmlDeviceGetHandleByIndex(uint32_t index, _Out_ void **device)"
      ]),
      compute: this.pickProcessFunction("nvmlDeviceGetComputeRunningProcesses"),
      graphics: this.pickProcessFunction("nvmlDeviceGetGraphicsRunningProcesses"),
      mps: this.pickProcessFunction("nvmlDeviceGetMPSComputeRunningProcesses")
    }

    const { init, getCount, getHandleByIndex } = this.functions
    if (!init || !getCount || !getHandleByIndex) {
      throw new Error("NVML process API unavailable")
    }
    const status = init()
    if (status !== NVML_SUCCESS) {
      throw new Error(`nvmlInit failed: ${status}`)
    }
    this.initialized = true
  }

  /**
   * Binds the newest available variant of a process-list function together
   * with the struct layout that variant is declared with here.
   * @param {string} baseName - symbol name without version suffix
   * @returns {{func: Function, type: object}|null} null when no variant binds
   */
  pickProcessFunction(baseName) {
    // Suffix → struct layout, newest first.
    const variants = [
      ["_v3", this.processInfoV2],
      ["_v2", this.processInfoV2],
      ["", this.processInfoV1]
    ]
    for (const [suffix, type] of variants) {
      const typeName = type === this.processInfoV2 ? "nvmlProcessInfo_v2_t" : "nvmlProcessInfo_v1_t"
      const func = optionalFunction(this.library, [
        `int ${baseName}${suffix}(void *device, _Inout_ uint32_t *infoCount, _Out_ ${typeName} *infos)`
      ])
      if (func) {
        return { func, type }
      }
    }
    return null
  }

  /**
   * Enumerates device handles; devices whose handle lookup fails are skipped.
   * @returns {Array} device handles
   * @throws {Error} when the device count cannot be read
   */
  getDeviceHandles() {
    const deviceCount = [0]
    const status = this.functions.getCount(deviceCount)
    if (status !== NVML_SUCCESS) {
      throw new Error(`nvmlDeviceGetCount failed: ${status}`)
    }
    const handles = []
    for (let index = 0; index < deviceCount[0]; index += 1) {
      const slot = [null]
      const handleStatus = this.functions.getHandleByIndex(index, slot)
      if (handleStatus === NVML_SUCCESS && slot[0]) {
        handles.push(slot[0])
      }
    }
    return handles
  }

  /**
   * Fetches one process list for a device: a size probe with no buffer,
   * followed by at most two sized attempts.
   * @param {*} device - device handle
   * @param {{func: Function, type: object}|null} entry - bound list function
   * @returns {Array} decoded process records; empty on any failure
   */
  collectProcessList(device, entry) {
    if (!entry || !entry.func) return []

    const probe = [0]
    const probeStatus = entry.func(device, probe, null)
    if (probeStatus === NVML_SUCCESS) {
      if (probe[0] === 0) return []
    } else if (probeStatus !== NVML_ERROR_INSUFFICIENT_SIZE) {
      return []
    }

    // Leave headroom for processes that appear between the probe and the read.
    let capacity = Math.max(1, probe[0] + 8)
    let attemptsLeft = 2
    while (attemptsLeft > 0) {
      attemptsLeft -= 1
      const count = [capacity]
      const buffer = Buffer.alloc(this.koffi.sizeof(entry.type) * capacity)
      const status = entry.func(device, count, buffer)
      if (status === NVML_SUCCESS) {
        return this.koffi.decode(buffer, entry.type, Math.min(count[0], capacity))
      }
      if (status !== NVML_ERROR_INSUFFICIENT_SIZE || count[0] <= capacity) {
        return []
      }
      capacity = count[0] + 8
    }
    return []
  }

  /**
   * Collects GPU memory per process across all devices.
   * @param {Iterable<*>|null} [pids=null] - pids to keep; null keeps all
   * @returns {Map} result of filterProcessMap over the collected processes
   * @throws {Error} when initialization or device enumeration fails
   */
  collect(pids = null) {
    this.init()
    const processes = new Map()
    for (const device of this.getDeviceHandles()) {
      // Merge the three lists per device first, then add the per-device
      // result to the overall map.
      const perDevice = new Map()
      const lists = [this.functions.compute, this.functions.graphics, this.functions.mps]
      for (const entry of lists) {
        for (const info of this.collectProcessList(device, entry)) {
          if (!info) continue
          const pid = normalizePid(info.pid)
          if (!pid) continue
          const used = info.usedGpuMemory
          if (typeof used === "bigint" && used === NVML_VALUE_NOT_AVAILABLE) {
            continue
          }
          mergeGpuProcess(perDevice, pid, parseMemoryToBytes(used))
        }
      }
      for (const merged of perDevice.values()) {
        addGpuProcess(processes, merged.pid, merged.usedGpuMemoryBytes)
      }
    }
    return filterProcessMap(processes, pids)
  }

  /** Calls nvmlShutdown when initialized (errors ignored) and resets the flag. */
  stop() {
    const shutdown = this.functions && this.functions.shutdown
    if (this.initialized && shutdown) {
      try {
        this.functions.shutdown()
      } catch (_) {}
    }
    this.initialized = false
  }
}
|
|
188
789
|
|
|
189
|
-
|
|
190
|
-
|
|
191
|
-
|
|
192
|
-
|
|
790
|
+
/**
 * Reads per-process GPU memory usage through the AMD SMI shared library,
 * bound at runtime with koffi.
 */
class AmdSmiGpuMemoryClient {
  /**
   * @param {object} [options]
   * @param {object} [options.koffi] koffi module to use; when omitted the
   *   result of `loadKoffi()` is used.
   */
  constructor(options = {}) {
    this.koffi = options.koffi || loadKoffi()
    this.library = null
    this.initialized = false
    this.procInfoType = null
    this.functions = null
  }

  /**
   * Load the library, bind the functions used by this client and call
   * `amdsmi_init`. Does nothing when already initialised.
   *
   * @throws {Error} When koffi is unavailable or `amdsmi_init` returns a
   *   non-zero status.
   */
  init() {
    if (this.initialized) return
    if (!this.koffi) {
      throw new Error("koffi unavailable")
    }

    this.procInfoType = getAmdSmiTypes(this.koffi).procInfo

    this.library = loadFirstLibrary(this.koffi, [
      process.env.AMD_SMI_LIBRARY,
      ...rocmLibraryCandidates("libamd_smi.so")
    ])
    this.functions = {
      init: this.library.func("int amdsmi_init(uint64_t init_flags)"),
      shutdown: optionalFunction(this.library, [
        "int amdsmi_shut_down(void)"
      ]),
      getSocketHandles: this.library.func("int amdsmi_get_socket_handles(_Inout_ uint32_t *socket_count, _Out_ void **socket_handles)"),
      getProcessorHandles: this.library.func("int amdsmi_get_processor_handles(void *socket_handle, _Inout_ uint32_t *processor_count, _Out_ void **processor_handles)"),
      getProcessList: this.library.func("int amdsmi_get_gpu_process_list(void *processor_handle, _Inout_ uint32_t *max_processes, _Out_ amdsmi_proc_info_t *list)")
    }

    const status = this.functions.init(AMDSMI_INIT_AMD_GPUS)
    if (status !== 0) {
      throw new Error(`amdsmi_init failed: ${status}`)
    }
    this.initialized = true
  }

  /**
   * Run a "count, then fill" handle query and return the non-null handles.
   *
   * `countFunction(count, buffer)` is called once with a null buffer to
   * learn the element count and once with a buffer sized for that count.
   *
   * @param {Function} countFunction Receives the in/out count array and the
   *   buffer (or null) and returns the library status code.
   * @returns {Array} Decoded pointer values with zero entries removed; `[]`
   *   when the count is zero or the fill call fails.
   */
  readPointerArray(countFunction) {
    const count = [0]
    countFunction(count, null)
    // Nothing to read when no elements were reported, whatever the status.
    if (count[0] <= 0) {
      return []
    }

    const capacity = count[0]
    const pointerSize = this.koffi.sizeof("void *")
    const buffer = Buffer.alloc(pointerSize * capacity)
    const status = countFunction(count, buffer)
    if (status !== 0) {
      return []
    }
    // The fill call may report more elements than were allocated; never
    // decode past the end of the buffer.
    const filled = Math.min(count[0], capacity)
    return this.koffi.decode(buffer, "uintptr_t", filled).filter(Boolean)
  }

  /**
   * Collect the processor handles of every socket.
   *
   * @returns {Array} Processor handles from all sockets, in socket order.
   */
  getProcessorHandles() {
    const sockets = this.readPointerArray((count, buffer) => {
      return this.functions.getSocketHandles(count, buffer)
    })
    const processors = []
    for (const socket of sockets) {
      processors.push(...this.readPointerArray((count, buffer) => {
        return this.functions.getProcessorHandles(socket, count, buffer)
      }))
    }
    return processors
  }

  /**
   * Read the process records for one processor.
   *
   * Probes for the record count, then fills a buffer; if the fill call
   * fails while reporting a larger count, it is retried once with the
   * larger capacity.
   *
   * @param {*} processor Processor handle passed to the library.
   * @returns {Array} Decoded process records, or `[]` on failure.
   */
  collectProcessorProcesses(processor) {
    let count = [0]
    let status = this.functions.getProcessList(processor, count, null)
    if (count[0] <= 0) {
      return []
    }

    let capacity = count[0]
    for (let attempt = 0; attempt < 2; attempt += 1) {
      count = [capacity]
      const buffer = Buffer.alloc(this.koffi.sizeof(this.procInfoType) * capacity)
      status = this.functions.getProcessList(processor, count, buffer)
      if (status === 0) {
        return this.koffi.decode(buffer, this.procInfoType, Math.min(count[0], capacity))
      }
      if (count[0] <= capacity) {
        return []
      }
      capacity = count[0]
    }
    return []
  }

  /**
   * Gather per-process VRAM usage across all processors.
   *
   * @param {*} [pids=null] Passed to `filterProcessMap` to narrow the result.
   * @returns {Map} Result of `filterProcessMap` over the accumulated map.
   */
  collect(pids = null) {
    this.init()
    const processes = new Map()
    for (const processor of this.getProcessorHandles()) {
      for (const entry of this.collectProcessorProcesses(processor)) {
        if (!entry) continue
        const bytes = parseMemoryToBytes(entry.memory_usage && entry.memory_usage.vram_mem)
        addGpuProcess(processes, entry.pid, bytes)
      }
    }
    return filterProcessMap(processes, pids)
  }

  /**
   * Shut the library down (best effort) and mark the client as
   * uninitialised.
   */
  stop() {
    if (this.initialized && this.functions && this.functions.shutdown) {
      try {
        this.functions.shutdown()
      } catch (_) {}
    }
    this.initialized = false
  }
}
|
|
217
907
|
|
|
218
|
-
|
|
219
|
-
|
|
220
|
-
|
|
908
|
+
/**
 * Reads per-process GPU memory usage through the ROCm SMI shared library,
 * bound at runtime with koffi.
 */
class RocmSmiGpuMemoryClient {
  /**
   * @param {object} [options]
   * @param {object} [options.koffi] koffi module to use; when omitted the
   *   result of `loadKoffi()` is used.
   */
  constructor(options = {}) {
    this.koffi = options.koffi || loadKoffi()
    this.library = null
    this.initialized = false
    this.procInfoType = null
    this.functions = null
  }

  /**
   * Load the library, bind the functions used by this client and call
   * `rsmi_init`. Does nothing when already initialised.
   *
   * @throws {Error} When koffi is unavailable or `rsmi_init` returns a
   *   non-zero status.
   */
  init() {
    if (this.initialized) return
    if (!this.koffi) {
      throw new Error("koffi unavailable")
    }

    this.procInfoType = getRocmSmiTypes(this.koffi).procInfo

    this.library = loadFirstLibrary(this.koffi, [
      process.env.ROCM_SMI_LIBRARY,
      ...rocmLibraryCandidates("librocm_smi64.so")
    ])
    this.functions = {
      init: this.library.func("int rsmi_init(uint64_t init_flags)"),
      shutdown: optionalFunction(this.library, [
        "int rsmi_shut_down(void)"
      ]),
      getProcessInfo: this.library.func("int rsmi_compute_process_info_get(_Out_ rsmi_process_info_t *procs, _Inout_ uint32_t *num_items)")
    }

    const status = this.functions.init(RSMI_INIT_DEFAULT)
    if (status !== 0) {
      throw new Error(`rsmi_init failed: ${status}`)
    }
    this.initialized = true
  }

  /**
   * Gather per-process VRAM usage.
   *
   * The library is queried once with a null buffer for the record count
   * and once with a buffer sized for that count. Any failure yields an
   * empty map.
   *
   * @param {*} [pids=null] Passed to `filterProcessMap` to narrow the result.
   * @returns {Map} Empty map on failure, otherwise the result of
   *   `filterProcessMap` over the accumulated map.
   */
  collect(pids = null) {
    this.init()
    const count = [0]
    this.functions.getProcessInfo(null, count)
    // Nothing to read when no records were reported, whatever the status.
    if (count[0] <= 0) {
      return new Map()
    }

    const capacity = count[0]
    const buffer = Buffer.alloc(this.koffi.sizeof(this.procInfoType) * capacity)
    const status = this.functions.getProcessInfo(buffer, count)
    if (status !== 0) {
      return new Map()
    }

    // The fill call may report more records than were allocated; never
    // decode past the end of the buffer.
    const filled = Math.min(count[0], capacity)
    const processes = new Map()
    for (const entry of this.koffi.decode(buffer, this.procInfoType, filled)) {
      if (!entry) continue
      addGpuProcess(processes, entry.process_id, parseMemoryToBytes(entry.vram_usage))
    }
    return filterProcessMap(processes, pids)
  }

  /**
   * Shut the library down (best effort) and mark the client as
   * uninitialised.
   */
  stop() {
    if (this.initialized && this.functions && this.functions.shutdown) {
      try {
        this.functions.shutdown()
      } catch (_) {}
    }
    this.initialized = false
  }
}
|
|
222
978
|
|
|
223
979
|
class GpuSampler {
|
|
@@ -225,56 +981,19 @@ class GpuSampler {
|
|
|
225
981
|
this.kernel = options.kernel || null
|
|
226
982
|
this.platform = options.platform || (this.kernel && this.kernel.platform) || os.platform()
|
|
227
983
|
this.ttlMs = options.ttlMs || DEFAULT_GPU_TTL_MS
|
|
228
|
-
this.
|
|
229
|
-
this.
|
|
984
|
+
this.procRoot = options.procRoot || "/proc"
|
|
985
|
+
this.drmFdinfoMaxPids = options.drmFdinfoMaxPids || DEFAULT_DRM_FDINFO_MAX_PIDS
|
|
986
|
+
this.drmFdinfoMaxFdsPerPid = options.drmFdinfoMaxFdsPerPid || DEFAULT_DRM_FDINFO_MAX_FDS_PER_PID
|
|
987
|
+
this.windowsPdhClient = options.windowsPdhClient || null
|
|
988
|
+
this.nvmlClient = options.nvmlClient || null
|
|
989
|
+
this.amdSmiClient = options.amdSmiClient || null
|
|
990
|
+
this.rocmSmiClient = options.rocmSmiClient || null
|
|
230
991
|
this.current = null
|
|
992
|
+
this.currentCacheKey = null
|
|
231
993
|
this.inFlight = null
|
|
232
|
-
this.
|
|
233
|
-
this.windowsCounterInFlight = null
|
|
994
|
+
this.inFlightCacheKey = null
|
|
234
995
|
this.providerBackoff = new Map()
|
|
235
|
-
|
|
236
|
-
|
|
237
|
-
nvidiaCandidates() {
|
|
238
|
-
const candidates = [
|
|
239
|
-
process.env.NVIDIA_SMI,
|
|
240
|
-
"nvidia-smi",
|
|
241
|
-
...getPinokioCondaCandidates(this.kernel, ["nvidia-smi"])
|
|
242
|
-
]
|
|
243
|
-
if (this.platform === "win32") {
|
|
244
|
-
candidates.push(
|
|
245
|
-
"C:\\Program Files\\NVIDIA Corporation\\NVSMI\\nvidia-smi.exe",
|
|
246
|
-
"C:\\Windows\\System32\\nvidia-smi.exe"
|
|
247
|
-
)
|
|
248
|
-
} else if (this.platform === "linux") {
|
|
249
|
-
candidates.push(
|
|
250
|
-
"/usr/bin/nvidia-smi",
|
|
251
|
-
"/usr/local/bin/nvidia-smi",
|
|
252
|
-
"/usr/local/nvidia/bin/nvidia-smi",
|
|
253
|
-
"/usr/local/cuda/bin/nvidia-smi"
|
|
254
|
-
)
|
|
255
|
-
}
|
|
256
|
-
return executableCandidates(candidates)
|
|
257
|
-
}
|
|
258
|
-
|
|
259
|
-
windowsGpuCounterCandidates() {
|
|
260
|
-
return executableCandidates([
|
|
261
|
-
process.env.TYPEPERF,
|
|
262
|
-
"typeperf",
|
|
263
|
-
"C:\\Windows\\System32\\typeperf.exe",
|
|
264
|
-
"C:\\Windows\\Sysnative\\typeperf.exe"
|
|
265
|
-
])
|
|
266
|
-
}
|
|
267
|
-
|
|
268
|
-
amdCandidates() {
|
|
269
|
-
const candidates = [
|
|
270
|
-
process.env.AMD_SMI,
|
|
271
|
-
"amd-smi",
|
|
272
|
-
...getPinokioCondaCandidates(this.kernel, ["amd-smi"])
|
|
273
|
-
]
|
|
274
|
-
if (this.platform === "linux") {
|
|
275
|
-
candidates.push("/opt/rocm/bin/amd-smi", "/usr/bin/amd-smi", "/usr/local/bin/amd-smi")
|
|
276
|
-
}
|
|
277
|
-
return executableCandidates(candidates)
|
|
996
|
+
this.providerLogBackoff = new Map()
|
|
278
997
|
}
|
|
279
998
|
|
|
280
999
|
isBackedOff(provider) {
|
|
@@ -286,139 +1005,161 @@ class GpuSampler {
|
|
|
286
1005
|
this.providerBackoff.set(provider, Date.now() + ms)
|
|
287
1006
|
}
|
|
288
1007
|
|
|
289
|
-
|
|
290
|
-
|
|
291
|
-
|
|
1008
|
+
/**
 * Emit a rate-limited warning about a failing GPU provider.
 *
 * At most one warning per provider is written within `ms` milliseconds;
 * the next allowed time is tracked in `this.providerLogBackoff`.
 *
 * @param {string} provider Provider key used for rate limiting and logging.
 * @param {*} error Error (or error-like value) raised by the provider.
 * @param {*} pids Requested pids; only their normalised count is logged.
 * @param {string} [fallbackMessage] Used when `error` has no message.
 * @param {number} [ms=60000] Quiet period after a warning is written.
 */
logProviderFailure(provider, error, pids, fallbackMessage = "GPU provider unavailable", ms = 60000) {
  const timestamp = Date.now()
  const quietUntil = this.providerLogBackoff.get(provider) || 0
  if (timestamp < quietUntil) {
    return
  }
  this.providerLogBackoff.set(provider, timestamp + ms)

  const report = {
    provider,
    platform: this.platform,
    pid_count: normalizePidSet(pids).length,
    error: error && error.message ? error.message : fallbackMessage
  }
  // First truthy of code / errno / status, if the error carries one.
  const errorCode = error && (error.code || error.errno || error.status)
  if (errorCode != null) {
    report.code = String(errorCode)
  }
  try {
    console.warn("[resource-usage:gpu] provider failed", report)
  } catch (_) {
    // Logging must never break sampling.
  }
}
|
|
1028
|
+
|
|
1029
|
+
getWindowsPdhClient() {
|
|
1030
|
+
if (!this.windowsPdhClient) {
|
|
1031
|
+
this.windowsPdhClient = new WindowsPdhGpuMemoryClient()
|
|
321
1032
|
}
|
|
1033
|
+
return this.windowsPdhClient
|
|
322
1034
|
}
|
|
323
1035
|
|
|
324
|
-
|
|
325
|
-
if (this.
|
|
326
|
-
|
|
1036
|
+
getNvmlClient() {
|
|
1037
|
+
if (!this.nvmlClient) {
|
|
1038
|
+
this.nvmlClient = new NvmlGpuMemoryClient()
|
|
327
1039
|
}
|
|
328
|
-
|
|
329
|
-
|
|
330
|
-
|
|
1040
|
+
return this.nvmlClient
|
|
1041
|
+
}
|
|
1042
|
+
|
|
1043
|
+
getAmdSmiClient() {
|
|
1044
|
+
if (!this.amdSmiClient) {
|
|
1045
|
+
this.amdSmiClient = new AmdSmiGpuMemoryClient()
|
|
331
1046
|
}
|
|
332
|
-
|
|
333
|
-
|
|
1047
|
+
return this.amdSmiClient
|
|
1048
|
+
}
|
|
1049
|
+
|
|
1050
|
+
getRocmSmiClient() {
|
|
1051
|
+
if (!this.rocmSmiClient) {
|
|
1052
|
+
this.rocmSmiClient = new RocmSmiGpuMemoryClient()
|
|
334
1053
|
}
|
|
335
|
-
|
|
336
|
-
if (result && !result.error) {
|
|
337
|
-
this.windowsCounterCurrent = result
|
|
338
|
-
}
|
|
339
|
-
return result
|
|
340
|
-
}).finally(() => {
|
|
341
|
-
this.windowsCounterInFlight = null
|
|
342
|
-
})
|
|
343
|
-
return this.windowsCounterInFlight
|
|
1054
|
+
return this.rocmSmiClient
|
|
344
1055
|
}
|
|
345
1056
|
|
|
346
|
-
async
|
|
347
|
-
if (this.isBackedOff("
|
|
1057
|
+
async collectWindowsPdh(pids) {
|
|
1058
|
+
if (this.platform !== "win32" || this.isBackedOff("windows-pdh")) {
|
|
348
1059
|
return null
|
|
349
1060
|
}
|
|
350
|
-
|
|
351
|
-
|
|
352
|
-
|
|
353
|
-
|
|
354
|
-
|
|
355
|
-
|
|
356
|
-
|
|
357
|
-
|
|
358
|
-
|
|
359
|
-
|
|
360
|
-
|
|
361
|
-
|
|
362
|
-
|
|
363
|
-
} catch (error) {
|
|
364
|
-
lastError = error
|
|
365
|
-
if (error && error.code === "ENOENT") {
|
|
366
|
-
continue
|
|
367
|
-
}
|
|
368
|
-
break
|
|
1061
|
+
try {
|
|
1062
|
+
return {
|
|
1063
|
+
provider: "windows-pdh",
|
|
1064
|
+
processes: this.getWindowsPdhClient().collect(pids),
|
|
1065
|
+
error: null
|
|
1066
|
+
}
|
|
1067
|
+
} catch (error) {
|
|
1068
|
+
this.logProviderFailure("windows-pdh", error, pids, "Windows PDH unavailable")
|
|
1069
|
+
this.backoff("windows-pdh", 60000)
|
|
1070
|
+
return {
|
|
1071
|
+
provider: "windows-pdh",
|
|
1072
|
+
processes: new Map(),
|
|
1073
|
+
error: error && error.message ? error.message : "Windows PDH unavailable"
|
|
369
1074
|
}
|
|
370
|
-
}
|
|
371
|
-
this.backoff("nvidia", 60000)
|
|
372
|
-
return {
|
|
373
|
-
provider: "nvidia-smi",
|
|
374
|
-
processes: new Map(),
|
|
375
|
-
error: lastError && lastError.message ? lastError.message : "nvidia-smi unavailable"
|
|
376
1075
|
}
|
|
377
1076
|
}
|
|
378
1077
|
|
|
379
|
-
async
|
|
380
|
-
if (this.platform !== "linux" || this.isBackedOff("
|
|
1078
|
+
async collectLinuxDrmFdinfo(pids) {
|
|
1079
|
+
if (this.platform !== "linux" || this.isBackedOff("linux-drm-fdinfo") || pids == null) {
|
|
381
1080
|
return null
|
|
382
1081
|
}
|
|
383
|
-
|
|
384
|
-
|
|
385
|
-
|
|
386
|
-
const { stdout } = await execFileText(command, ["process", "--json", "-G"], { timeoutMs: this.timeoutMs })
|
|
387
|
-
return {
|
|
388
|
-
provider: "amd-smi",
|
|
389
|
-
processes: parseAmdJson(stdout),
|
|
390
|
-
error: null
|
|
391
|
-
}
|
|
392
|
-
} catch (error) {
|
|
393
|
-
lastError = error
|
|
394
|
-
if (error && error.code === "ENOENT") {
|
|
395
|
-
continue
|
|
396
|
-
}
|
|
397
|
-
break
|
|
398
|
-
}
|
|
1082
|
+
const targetPids = normalizePidSet(pids)
|
|
1083
|
+
if (targetPids.length === 0) {
|
|
1084
|
+
return null
|
|
399
1085
|
}
|
|
400
|
-
|
|
401
|
-
|
|
402
|
-
|
|
403
|
-
|
|
404
|
-
|
|
1086
|
+
try {
|
|
1087
|
+
const processes = await collectLinuxDrmFdinfoProcesses(targetPids, {
|
|
1088
|
+
procRoot: this.procRoot,
|
|
1089
|
+
maxPids: this.drmFdinfoMaxPids,
|
|
1090
|
+
maxFdsPerPid: this.drmFdinfoMaxFdsPerPid
|
|
1091
|
+
})
|
|
1092
|
+
if (processes.size === 0) {
|
|
1093
|
+
return null
|
|
1094
|
+
}
|
|
1095
|
+
return {
|
|
1096
|
+
provider: "linux-drm-fdinfo",
|
|
1097
|
+
processes,
|
|
1098
|
+
error: null
|
|
1099
|
+
}
|
|
1100
|
+
} catch (error) {
|
|
1101
|
+
this.logProviderFailure("linux-drm-fdinfo", error, pids, "Linux DRM fdinfo unavailable")
|
|
1102
|
+
this.backoff("linux-drm-fdinfo", 60000)
|
|
1103
|
+
return {
|
|
1104
|
+
provider: "linux-drm-fdinfo",
|
|
1105
|
+
processes: new Map(),
|
|
1106
|
+
error: error && error.message ? error.message : "Linux DRM fdinfo unavailable"
|
|
1107
|
+
}
|
|
405
1108
|
}
|
|
406
1109
|
}
|
|
407
1110
|
|
|
408
|
-
async
|
|
409
|
-
|
|
1111
|
+
async collectNvml(pids) {
|
|
1112
|
+
if (this.platform !== "linux" || this.isBackedOff("linux-nvml")) {
|
|
1113
|
+
return null
|
|
1114
|
+
}
|
|
1115
|
+
try {
|
|
1116
|
+
return {
|
|
1117
|
+
provider: "linux-nvml",
|
|
1118
|
+
processes: this.getNvmlClient().collect(pids),
|
|
1119
|
+
error: null
|
|
1120
|
+
}
|
|
1121
|
+
} catch (error) {
|
|
1122
|
+
this.logProviderFailure("linux-nvml", error, pids, "Linux NVML unavailable")
|
|
1123
|
+
this.backoff("linux-nvml", 60000)
|
|
1124
|
+
return null
|
|
1125
|
+
}
|
|
1126
|
+
}
|
|
410
1127
|
|
|
411
|
-
|
|
412
|
-
|
|
413
|
-
|
|
414
|
-
} else {
|
|
415
|
-
const nvidia = await this.collectNvidia()
|
|
416
|
-
if (nvidia) results.push(nvidia)
|
|
1128
|
+
async collectAmdSmi(pids) {
|
|
1129
|
+
if (this.platform !== "linux" || this.isBackedOff("linux-amdsmi")) {
|
|
1130
|
+
return null
|
|
417
1131
|
}
|
|
1132
|
+
try {
|
|
1133
|
+
return {
|
|
1134
|
+
provider: "linux-amdsmi",
|
|
1135
|
+
processes: this.getAmdSmiClient().collect(pids),
|
|
1136
|
+
error: null
|
|
1137
|
+
}
|
|
1138
|
+
} catch (error) {
|
|
1139
|
+
this.logProviderFailure("linux-amdsmi", error, pids, "Linux AMD SMI unavailable")
|
|
1140
|
+
this.backoff("linux-amdsmi", 60000)
|
|
1141
|
+
return null
|
|
1142
|
+
}
|
|
1143
|
+
}
|
|
418
1144
|
|
|
419
|
-
|
|
420
|
-
if (
|
|
1145
|
+
async collectRocmSmi(pids) {
|
|
1146
|
+
if (this.platform !== "linux" || this.isBackedOff("linux-rocm-smi")) {
|
|
1147
|
+
return null
|
|
1148
|
+
}
|
|
1149
|
+
try {
|
|
1150
|
+
return {
|
|
1151
|
+
provider: "linux-rocm-smi",
|
|
1152
|
+
processes: this.getRocmSmiClient().collect(pids),
|
|
1153
|
+
error: null
|
|
1154
|
+
}
|
|
1155
|
+
} catch (error) {
|
|
1156
|
+
this.logProviderFailure("linux-rocm-smi", error, pids, "Linux ROCm SMI unavailable")
|
|
1157
|
+
this.backoff("linux-rocm-smi", 60000)
|
|
1158
|
+
return null
|
|
1159
|
+
}
|
|
1160
|
+
}
|
|
421
1161
|
|
|
1162
|
+
mergeResults(results) {
|
|
422
1163
|
const processes = new Map()
|
|
423
1164
|
const providers = []
|
|
424
1165
|
const errors = []
|
|
@@ -430,6 +1171,53 @@ class GpuSampler {
|
|
|
430
1171
|
mergeGpuProcess(processes, entry.pid, entry.usedGpuMemoryBytes)
|
|
431
1172
|
}
|
|
432
1173
|
}
|
|
1174
|
+
return { processes, providers, errors }
|
|
1175
|
+
}
|
|
1176
|
+
|
|
1177
|
+
async collect(pids = null) {
|
|
1178
|
+
if (this.platform === "darwin") {
|
|
1179
|
+
return {
|
|
1180
|
+
available: false,
|
|
1181
|
+
stale: false,
|
|
1182
|
+
collectedAt: Date.now(),
|
|
1183
|
+
providers: [],
|
|
1184
|
+
processes: new Map(),
|
|
1185
|
+
errors: []
|
|
1186
|
+
}
|
|
1187
|
+
}
|
|
1188
|
+
|
|
1189
|
+
const results = []
|
|
1190
|
+
if (this.platform === "win32") {
|
|
1191
|
+
const windowsPdh = await this.collectWindowsPdh(pids)
|
|
1192
|
+
if (windowsPdh) results.push(windowsPdh)
|
|
1193
|
+
} else if (this.platform === "linux") {
|
|
1194
|
+
const linuxDrmFdinfo = await this.collectLinuxDrmFdinfo(pids)
|
|
1195
|
+
if (linuxDrmFdinfo) results.push(linuxDrmFdinfo)
|
|
1196
|
+
|
|
1197
|
+
let merged = this.mergeResults(results)
|
|
1198
|
+
const covered = coveredPids(merged.processes)
|
|
1199
|
+
|
|
1200
|
+
if (hasUncoveredTarget(pids, covered)) {
|
|
1201
|
+
const nvml = await this.collectNvml(pids)
|
|
1202
|
+
if (nvml) results.push(nvml)
|
|
1203
|
+
}
|
|
1204
|
+
|
|
1205
|
+
merged = this.mergeResults(results)
|
|
1206
|
+
const afterNvmlCovered = coveredPids(merged.processes)
|
|
1207
|
+
if (hasUncoveredTarget(pids, afterNvmlCovered)) {
|
|
1208
|
+
const amdSmi = await this.collectAmdSmi(pids)
|
|
1209
|
+
if (amdSmi) results.push(amdSmi)
|
|
1210
|
+
}
|
|
1211
|
+
|
|
1212
|
+
merged = this.mergeResults(results)
|
|
1213
|
+
const afterAmdCovered = coveredPids(merged.processes)
|
|
1214
|
+
if (hasUncoveredTarget(pids, afterAmdCovered)) {
|
|
1215
|
+
const rocmSmi = await this.collectRocmSmi(pids)
|
|
1216
|
+
if (rocmSmi) results.push(rocmSmi)
|
|
1217
|
+
}
|
|
1218
|
+
}
|
|
1219
|
+
|
|
1220
|
+
const { processes, providers, errors } = this.mergeResults(results)
|
|
433
1221
|
return {
|
|
434
1222
|
available: providers.length > 0 && errors.length < providers.length,
|
|
435
1223
|
stale: false,
|
|
@@ -440,18 +1228,22 @@ class GpuSampler {
|
|
|
440
1228
|
}
|
|
441
1229
|
}
|
|
442
1230
|
|
|
443
|
-
async getSnapshot() {
|
|
1231
|
+
async getSnapshot(pids = null) {
|
|
444
1232
|
const now = Date.now()
|
|
445
|
-
|
|
1233
|
+
const cacheKey = this.platform === "darwin" ? "" : normalizePidSet(pids).join(",")
|
|
1234
|
+
if (this.current && this.currentCacheKey === cacheKey && now - this.current.collectedAt < this.ttlMs) {
|
|
446
1235
|
return this.current
|
|
447
1236
|
}
|
|
448
|
-
if (this.inFlight) {
|
|
1237
|
+
if (this.inFlight && this.inFlightCacheKey === cacheKey) {
|
|
449
1238
|
return this.inFlight
|
|
450
1239
|
}
|
|
451
|
-
this.
|
|
1240
|
+
this.inFlightCacheKey = cacheKey
|
|
1241
|
+
this.inFlight = this.collect(pids).then((snapshot) => {
|
|
452
1242
|
this.current = snapshot
|
|
1243
|
+
this.currentCacheKey = cacheKey
|
|
453
1244
|
return snapshot
|
|
454
1245
|
}).catch((error) => {
|
|
1246
|
+
this.logProviderFailure("gpu", error, pids, "GPU sampling unavailable")
|
|
455
1247
|
if (this.current) {
|
|
456
1248
|
return { ...this.current, stale: true }
|
|
457
1249
|
}
|
|
@@ -465,9 +1257,18 @@ class GpuSampler {
|
|
|
465
1257
|
}
|
|
466
1258
|
}).finally(() => {
|
|
467
1259
|
this.inFlight = null
|
|
1260
|
+
this.inFlightCacheKey = null
|
|
468
1261
|
})
|
|
469
1262
|
return this.inFlight
|
|
470
1263
|
}
|
|
1264
|
+
|
|
1265
|
+
stop() {
|
|
1266
|
+
for (const client of [this.windowsPdhClient, this.nvmlClient, this.amdSmiClient, this.rocmSmiClient]) {
|
|
1267
|
+
if (client && typeof client.stop === "function") {
|
|
1268
|
+
client.stop()
|
|
1269
|
+
}
|
|
1270
|
+
}
|
|
1271
|
+
}
|
|
471
1272
|
}
|
|
472
1273
|
|
|
473
1274
|
function sumGpuMemory(snapshot, pids) {
|
|
@@ -483,8 +1284,15 @@ function sumGpuMemory(snapshot, pids) {
|
|
|
483
1284
|
|
|
484
1285
|
module.exports = {
|
|
485
1286
|
GpuSampler,
|
|
486
|
-
|
|
1287
|
+
WindowsPdhGpuMemoryClient,
|
|
1288
|
+
NvmlGpuMemoryClient,
|
|
1289
|
+
AmdSmiGpuMemoryClient,
|
|
1290
|
+
RocmSmiGpuMemoryClient,
|
|
487
1291
|
parseMemoryToBytes,
|
|
488
|
-
|
|
1292
|
+
decodeWindowsMultiSz,
|
|
1293
|
+
extractPidFromWindowsGpuInstance,
|
|
1294
|
+
collectLinuxDrmFdinfoProcesses,
|
|
1295
|
+
isDedicatedDrmMemoryRegion,
|
|
1296
|
+
parseLinuxDrmFdinfo,
|
|
489
1297
|
sumGpuMemory
|
|
490
1298
|
}
|