sixtytwo-cli 0.3.6__cp314-cp314-win_amd64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (78)
  1. sixtytwo/__init__.cp314-win_amd64.pyd +0 -0
  2. sixtytwo/__main__.py +5 -0
  3. sixtytwo/agent.cp314-win_amd64.pyd +0 -0
  4. sixtytwo/api.cp314-win_amd64.pyd +0 -0
  5. sixtytwo/bootstrap.cp314-win_amd64.pyd +0 -0
  6. sixtytwo/checks/__init__.cp314-win_amd64.pyd +0 -0
  7. sixtytwo/checks/_base.cp314-win_amd64.pyd +0 -0
  8. sixtytwo/checks/compute.cp314-win_amd64.pyd +0 -0
  9. sixtytwo/checks/gpu.cp314-win_amd64.pyd +0 -0
  10. sixtytwo/checks/host.cp314-win_amd64.pyd +0 -0
  11. sixtytwo/checks/interconnect.cp314-win_amd64.pyd +0 -0
  12. sixtytwo/checks/network.cp314-win_amd64.pyd +0 -0
  13. sixtytwo/checks/storage.cp314-win_amd64.pyd +0 -0
  14. sixtytwo/cli.cp314-win_amd64.pyd +0 -0
  15. sixtytwo/cloud.cp314-win_amd64.pyd +0 -0
  16. sixtytwo/collector.cp314-win_amd64.pyd +0 -0
  17. sixtytwo/config.cp314-win_amd64.pyd +0 -0
  18. sixtytwo/connect.cp314-win_amd64.pyd +0 -0
  19. sixtytwo/console.cp314-win_amd64.pyd +0 -0
  20. sixtytwo/credentials.cp314-win_amd64.pyd +0 -0
  21. sixtytwo/display.cp314-win_amd64.pyd +0 -0
  22. sixtytwo/doctor.cp314-win_amd64.pyd +0 -0
  23. sixtytwo/environment.cp314-win_amd64.pyd +0 -0
  24. sixtytwo/evidence.cp314-win_amd64.pyd +0 -0
  25. sixtytwo/formatting.cp314-win_amd64.pyd +0 -0
  26. sixtytwo/gpu_profiles.cp314-win_amd64.pyd +0 -0
  27. sixtytwo/grafana/sixtytwo-gpu-telemetry.json +579 -0
  28. sixtytwo/grafana/sixtytwo-overview.json +322 -0
  29. sixtytwo/install_agent.cp314-win_amd64.pyd +0 -0
  30. sixtytwo/integration.cp314-win_amd64.pyd +0 -0
  31. sixtytwo/intelligence.cp314-win_amd64.pyd +0 -0
  32. sixtytwo/metrics_exporter.cp314-win_amd64.pyd +0 -0
  33. sixtytwo/monitoring.cp314-win_amd64.pyd +0 -0
  34. sixtytwo/nccl_bench.cp314-win_amd64.pyd +0 -0
  35. sixtytwo/notify.cp314-win_amd64.pyd +0 -0
  36. sixtytwo/optimizer.cp314-win_amd64.pyd +0 -0
  37. sixtytwo/providers/__init__.cp314-win_amd64.pyd +0 -0
  38. sixtytwo/providers/_http.cp314-win_amd64.pyd +0 -0
  39. sixtytwo/providers/base.cp314-win_amd64.pyd +0 -0
  40. sixtytwo/providers/generic.cp314-win_amd64.pyd +0 -0
  41. sixtytwo/providers/lambda_labs.cp314-win_amd64.pyd +0 -0
  42. sixtytwo/providers/region.cp314-win_amd64.pyd +0 -0
  43. sixtytwo/providers/runpod.cp314-win_amd64.pyd +0 -0
  44. sixtytwo/providers/shadeform.cp314-win_amd64.pyd +0 -0
  45. sixtytwo/providers/skypilot.cp314-win_amd64.pyd +0 -0
  46. sixtytwo/providers/slurm.cp314-win_amd64.pyd +0 -0
  47. sixtytwo/providers/ssh_cluster.cp314-win_amd64.pyd +0 -0
  48. sixtytwo/providers/vast.cp314-win_amd64.pyd +0 -0
  49. sixtytwo/recovery.cp314-win_amd64.pyd +0 -0
  50. sixtytwo/registry.cp314-win_amd64.pyd +0 -0
  51. sixtytwo/reports.cp314-win_amd64.pyd +0 -0
  52. sixtytwo/simple_yaml.cp314-win_amd64.pyd +0 -0
  53. sixtytwo/slurm.cp314-win_amd64.pyd +0 -0
  54. sixtytwo/testing.cp314-win_amd64.pyd +0 -0
  55. sixtytwo/topology.cp314-win_amd64.pyd +0 -0
  56. sixtytwo/wandb_integration.cp314-win_amd64.pyd +0 -0
  57. sixtytwo/web_analytics.cp314-win_amd64.pyd +0 -0
  58. sixtytwo_cli/__init__.cp314-win_amd64.pyd +0 -0
  59. sixtytwo_cli/main.cp314-win_amd64.pyd +0 -0
  60. sixtytwo_cli-0.3.6.dist-info/METADATA +175 -0
  61. sixtytwo_cli-0.3.6.dist-info/RECORD +78 -0
  62. sixtytwo_cli-0.3.6.dist-info/WHEEL +5 -0
  63. sixtytwo_cli-0.3.6.dist-info/entry_points.txt +2 -0
  64. sixtytwo_cli-0.3.6.dist-info/licenses/LICENSE +73 -0
  65. sixtytwo_cli-0.3.6.dist-info/top_level.txt +3 -0
  66. sixtytwo_skypilot/__init__.cp314-win_amd64.pyd +0 -0
  67. sixtytwo_skypilot/catalog.cp314-win_amd64.pyd +0 -0
  68. sixtytwo_skypilot/catalog_patches.cp314-win_amd64.pyd +0 -0
  69. sixtytwo_skypilot/client.cp314-win_amd64.pyd +0 -0
  70. sixtytwo_skypilot/cloud.cp314-win_amd64.pyd +0 -0
  71. sixtytwo_skypilot/cluster_store.cp314-win_amd64.pyd +0 -0
  72. sixtytwo_skypilot/credentials.cp314-win_amd64.pyd +0 -0
  73. sixtytwo_skypilot/provisioner.cp314-win_amd64.pyd +0 -0
  74. sixtytwo_skypilot/sky_catalog_module.cp314-win_amd64.pyd +0 -0
  75. sixtytwo_skypilot/status.cp314-win_amd64.pyd +0 -0
  76. sixtytwo_skypilot/template_installer.cp314-win_amd64.pyd +0 -0
  77. sixtytwo_skypilot/templates/__init__.cp314-win_amd64.pyd +0 -0
  78. sixtytwo_skypilot/templates/sixtytwo-ray.yml.j2 +59 -0
Binary file
sixtytwo/__main__.py ADDED
@@ -0,0 +1,5 @@
1
+ from .cli import unified_main as main
2
+
3
+
4
+ if __name__ == "__main__":
5
+ raise SystemExit(main())
Binary file
Binary file
Binary file
Binary file
Binary file
Binary file
Binary file
Binary file
Binary file
Binary file
Binary file
Binary file
Binary file
Binary file
Binary file
Binary file
Binary file
@@ -0,0 +1,579 @@
1
+ {
2
+ "annotations": {"list": []},
3
+ "editable": true,
4
+ "fiscalYearStartMonth": 0,
5
+ "graphTooltip": 1,
6
+ "schemaVersion": 39,
7
+ "tags": ["sixtytwo", "gpu", "telemetry", "hardware"],
8
+ "time": {"from": "now-1h", "to": "now"},
9
+ "timepicker": {},
10
+ "timezone": "",
11
+ "title": "Sixtytwo // GPU Hardware Telemetry",
12
+ "uid": "sixtytwo-gpu-telemetry",
13
+ "version": 1,
14
+ "templating": {
15
+ "list": [
16
+ {
17
+ "name": "datasource",
18
+ "type": "datasource",
19
+ "query": "prometheus",
20
+ "current": {"selected": false, "text": "Prometheus", "value": "Prometheus"},
21
+ "hide": 0
22
+ },
23
+ {
24
+ "name": "node",
25
+ "type": "query",
26
+ "label": "Node",
27
+ "datasource": {"type": "prometheus", "uid": "${datasource}"},
28
+ "query": "label_values(sixtytwo_gpu_up, node_id)",
29
+ "refresh": 2,
30
+ "includeAll": true,
31
+ "multi": true,
32
+ "current": {"selected": true, "text": "All", "value": "$__all"}
33
+ },
34
+ {
35
+ "name": "vendor",
36
+ "type": "query",
37
+ "label": "Vendor",
38
+ "datasource": {"type": "prometheus", "uid": "${datasource}"},
39
+ "query": "label_values(sixtytwo_gpu_up{node_id=~\"$node\"}, vendor)",
40
+ "refresh": 2,
41
+ "includeAll": true,
42
+ "multi": true,
43
+ "current": {"selected": true, "text": "All", "value": "$__all"}
44
+ }
45
+ ]
46
+ },
47
+ "panels": [
48
+ {
49
+ "type": "row",
50
+ "title": "Fleet Overview",
51
+ "id": 100,
52
+ "gridPos": {"h": 1, "w": 24, "x": 0, "y": 0},
53
+ "collapsed": false
54
+ },
55
+ {
56
+ "type": "stat",
57
+ "title": "Total GPUs Online",
58
+ "id": 1,
59
+ "gridPos": {"h": 4, "w": 4, "x": 0, "y": 1},
60
+ "datasource": {"type": "prometheus", "uid": "${datasource}"},
61
+ "targets": [
62
+ {"expr": "count(sixtytwo_gpu_up{node_id=~\"$node\",vendor=~\"$vendor\"} == 1)", "refId": "A", "legendFormat": ""}
63
+ ],
64
+ "options": {"reduceOptions": {"calcs": ["lastNotNull"]}, "graphMode": "none"},
65
+ "fieldConfig": {"defaults": {"thresholds": {"mode": "absolute", "steps": [{"color": "green", "value": null}]}}}
66
+ },
67
+ {
68
+ "type": "stat",
69
+ "title": "GPUs Unreachable",
70
+ "id": 2,
71
+ "gridPos": {"h": 4, "w": 4, "x": 4, "y": 1},
72
+ "datasource": {"type": "prometheus", "uid": "${datasource}"},
73
+ "targets": [
74
+ {"expr": "count(sixtytwo_gpu_up{node_id=~\"$node\",vendor=~\"$vendor\"} == 0) or vector(0)", "refId": "A", "legendFormat": ""}
75
+ ],
76
+ "options": {"reduceOptions": {"calcs": ["lastNotNull"]}, "graphMode": "none"},
77
+ "fieldConfig": {
78
+ "defaults": {
79
+ "thresholds": {"mode": "absolute", "steps": [
80
+ {"color": "green", "value": null},
81
+ {"color": "red", "value": 1}
82
+ ]}
83
+ }
84
+ }
85
+ },
86
+ {
87
+ "type": "stat",
88
+ "title": "Avg GPU Utilization",
89
+ "id": 3,
90
+ "gridPos": {"h": 4, "w": 4, "x": 8, "y": 1},
91
+ "datasource": {"type": "prometheus", "uid": "${datasource}"},
92
+ "targets": [
93
+ {"expr": "avg(sixtytwo_gpu_utilization_ratio{node_id=~\"$node\",vendor=~\"$vendor\"})", "refId": "A"}
94
+ ],
95
+ "fieldConfig": {
96
+ "defaults": {
97
+ "unit": "percentunit",
98
+ "min": 0, "max": 1,
99
+ "thresholds": {"mode": "absolute", "steps": [
100
+ {"color": "red", "value": null},
101
+ {"color": "yellow", "value": 0.5},
102
+ {"color": "green", "value": 0.8}
103
+ ]}
104
+ }
105
+ },
106
+ "options": {"reduceOptions": {"calcs": ["lastNotNull"]}, "graphMode": "area"}
107
+ },
108
+ {
109
+ "type": "stat",
110
+ "title": "GPUs w/ ECC Uncorrected Errors (10m)",
111
+ "id": 4,
112
+ "gridPos": {"h": 4, "w": 4, "x": 12, "y": 1},
113
+ "datasource": {"type": "prometheus", "uid": "${datasource}"},
114
+ "targets": [
115
+ {
116
+ "expr": "count(increase(sixtytwo_gpu_ecc_errors_uncorrected_volatile_total{node_id=~\"$node\"}[10m]) > 0) or vector(0)",
117
+ "refId": "A",
118
+ "legendFormat": ""
119
+ }
120
+ ],
121
+ "options": {"reduceOptions": {"calcs": ["lastNotNull"]}, "graphMode": "none"},
122
+ "fieldConfig": {
123
+ "defaults": {
124
+ "thresholds": {"mode": "absolute", "steps": [
125
+ {"color": "green", "value": null},
126
+ {"color": "red", "value": 1}
127
+ ]}
128
+ }
129
+ }
130
+ },
131
+ {
132
+ "type": "stat",
133
+ "title": "Max GPU Temperature",
134
+ "id": 5,
135
+ "gridPos": {"h": 4, "w": 4, "x": 16, "y": 1},
136
+ "datasource": {"type": "prometheus", "uid": "${datasource}"},
137
+ "targets": [
138
+ {"expr": "max(sixtytwo_gpu_temperature_celsius{node_id=~\"$node\",vendor=~\"$vendor\"})", "refId": "A"}
139
+ ],
140
+ "fieldConfig": {
141
+ "defaults": {
142
+ "unit": "celsius",
143
+ "thresholds": {"mode": "absolute", "steps": [
144
+ {"color": "green", "value": null},
145
+ {"color": "yellow", "value": 75},
146
+ {"color": "red", "value": 85}
147
+ ]}
148
+ }
149
+ },
150
+ "options": {"reduceOptions": {"calcs": ["lastNotNull"]}, "graphMode": "area"}
151
+ },
152
+ {
153
+ "type": "stat",
154
+ "title": "Total Fleet Power Draw",
155
+ "id": 6,
156
+ "gridPos": {"h": 4, "w": 4, "x": 20, "y": 1},
157
+ "datasource": {"type": "prometheus", "uid": "${datasource}"},
158
+ "targets": [
159
+ {"expr": "sum(sixtytwo_gpu_power_watts{node_id=~\"$node\",vendor=~\"$vendor\"})", "refId": "A"}
160
+ ],
161
+ "fieldConfig": {"defaults": {"unit": "watt"}},
162
+ "options": {"reduceOptions": {"calcs": ["lastNotNull"]}, "graphMode": "area"}
163
+ },
164
+ {
165
+ "type": "row",
166
+ "title": "Utilization",
167
+ "id": 200,
168
+ "gridPos": {"h": 1, "w": 24, "x": 0, "y": 5},
169
+ "collapsed": false
170
+ },
171
+ {
172
+ "type": "timeseries",
173
+ "title": "GPU Compute Utilization",
174
+ "id": 10,
175
+ "gridPos": {"h": 8, "w": 12, "x": 0, "y": 6},
176
+ "datasource": {"type": "prometheus", "uid": "${datasource}"},
177
+ "targets": [
178
+ {
179
+ "expr": "sixtytwo_gpu_utilization_ratio{node_id=~\"$node\",vendor=~\"$vendor\"}",
180
+ "refId": "A",
181
+ "legendFormat": "{{node_id}} GPU{{index}} ({{name}})"
182
+ }
183
+ ],
184
+ "fieldConfig": {
185
+ "defaults": {
186
+ "unit": "percentunit",
187
+ "min": 0, "max": 1,
188
+ "custom": {"drawStyle": "line", "lineWidth": 1, "fillOpacity": 10}
189
+ }
190
+ },
191
+ "options": {"legend": {"displayMode": "table", "placement": "bottom"}}
192
+ },
193
+ {
194
+ "type": "timeseries",
195
+ "title": "GPU Memory Utilization",
196
+ "id": 11,
197
+ "gridPos": {"h": 8, "w": 12, "x": 12, "y": 6},
198
+ "datasource": {"type": "prometheus", "uid": "${datasource}"},
199
+ "targets": [
200
+ {
201
+ "expr": "sixtytwo_gpu_memory_utilization_ratio{node_id=~\"$node\",vendor=~\"$vendor\"}",
202
+ "refId": "A",
203
+ "legendFormat": "{{node_id}} GPU{{index}}"
204
+ }
205
+ ],
206
+ "fieldConfig": {
207
+ "defaults": {
208
+ "unit": "percentunit",
209
+ "min": 0, "max": 1,
210
+ "custom": {"drawStyle": "line", "lineWidth": 1, "fillOpacity": 10}
211
+ }
212
+ },
213
+ "options": {"legend": {"displayMode": "table", "placement": "bottom"}}
214
+ },
215
+ {
216
+ "type": "row",
217
+ "title": "Memory",
218
+ "id": 300,
219
+ "gridPos": {"h": 1, "w": 24, "x": 0, "y": 14},
220
+ "collapsed": false
221
+ },
222
+ {
223
+ "type": "timeseries",
224
+ "title": "GPU Memory Used",
225
+ "id": 20,
226
+ "gridPos": {"h": 8, "w": 12, "x": 0, "y": 15},
227
+ "datasource": {"type": "prometheus", "uid": "${datasource}"},
228
+ "targets": [
229
+ {
230
+ "expr": "sixtytwo_gpu_memory_used_bytes{node_id=~\"$node\",vendor=~\"$vendor\"}",
231
+ "refId": "A",
232
+ "legendFormat": "{{node_id}} GPU{{index}}"
233
+ }
234
+ ],
235
+ "fieldConfig": {
236
+ "defaults": {
237
+ "unit": "bytes",
238
+ "custom": {"drawStyle": "line", "lineWidth": 1, "fillOpacity": 10}
239
+ }
240
+ },
241
+ "options": {"legend": {"displayMode": "table", "placement": "bottom"}}
242
+ },
243
+ {
244
+ "type": "timeseries",
245
+ "title": "GPU Memory Used (% of total)",
246
+ "id": 21,
247
+ "gridPos": {"h": 8, "w": 12, "x": 12, "y": 15},
248
+ "datasource": {"type": "prometheus", "uid": "${datasource}"},
249
+ "targets": [
250
+ {
251
+ "expr": "sixtytwo_gpu_memory_used_bytes{node_id=~\"$node\",vendor=~\"$vendor\"} / sixtytwo_gpu_memory_total_bytes{node_id=~\"$node\",vendor=~\"$vendor\"}",
252
+ "refId": "A",
253
+ "legendFormat": "{{node_id}} GPU{{index}}"
254
+ }
255
+ ],
256
+ "fieldConfig": {
257
+ "defaults": {
258
+ "unit": "percentunit",
259
+ "min": 0, "max": 1,
260
+ "thresholds": {"mode": "absolute", "steps": [
261
+ {"color": "green", "value": null},
262
+ {"color": "yellow", "value": 0.85},
263
+ {"color": "red", "value": 0.95}
264
+ ]},
265
+ "custom": {"drawStyle": "line", "lineWidth": 1, "fillOpacity": 10}
266
+ }
267
+ },
268
+ "options": {"legend": {"displayMode": "table", "placement": "bottom"}}
269
+ },
270
+ {
271
+ "type": "row",
272
+ "title": "Thermals & Power",
273
+ "id": 400,
274
+ "gridPos": {"h": 1, "w": 24, "x": 0, "y": 23},
275
+ "collapsed": false
276
+ },
277
+ {
278
+ "type": "timeseries",
279
+ "title": "GPU Temperature",
280
+ "id": 30,
281
+ "gridPos": {"h": 8, "w": 12, "x": 0, "y": 24},
282
+ "datasource": {"type": "prometheus", "uid": "${datasource}"},
283
+ "targets": [
284
+ {
285
+ "expr": "sixtytwo_gpu_temperature_celsius{node_id=~\"$node\",vendor=~\"$vendor\"}",
286
+ "refId": "A",
287
+ "legendFormat": "{{node_id}} GPU{{index}}"
288
+ }
289
+ ],
290
+ "fieldConfig": {
291
+ "defaults": {
292
+ "unit": "celsius",
293
+ "thresholds": {"mode": "absolute", "steps": [
294
+ {"color": "green", "value": null},
295
+ {"color": "yellow", "value": 75},
296
+ {"color": "red", "value": 85}
297
+ ]},
298
+ "custom": {"drawStyle": "line", "lineWidth": 1, "fillOpacity": 10, "thresholdsStyle": {"mode": "line"}}
299
+ }
300
+ },
301
+ "options": {"legend": {"displayMode": "table", "placement": "bottom"}}
302
+ },
303
+ {
304
+ "type": "timeseries",
305
+ "title": "GPU Power Draw vs Limit",
306
+ "id": 31,
307
+ "gridPos": {"h": 8, "w": 12, "x": 12, "y": 24},
308
+ "datasource": {"type": "prometheus", "uid": "${datasource}"},
309
+ "targets": [
310
+ {
311
+ "expr": "sixtytwo_gpu_power_watts{node_id=~\"$node\",vendor=~\"$vendor\"}",
312
+ "refId": "A",
313
+ "legendFormat": "{{node_id}} GPU{{index}} draw"
314
+ },
315
+ {
316
+ "expr": "sixtytwo_gpu_power_limit_watts{node_id=~\"$node\",vendor=~\"$vendor\"}",
317
+ "refId": "B",
318
+ "legendFormat": "{{node_id}} GPU{{index}} limit"
319
+ }
320
+ ],
321
+ "fieldConfig": {
322
+ "defaults": {
323
+ "unit": "watt",
324
+ "custom": {"drawStyle": "line", "lineWidth": 1, "fillOpacity": 5}
325
+ },
326
+ "overrides": [
327
+ {
328
+ "matcher": {"id": "byFrameRefID", "options": "B"},
329
+ "properties": [
330
+ {"id": "custom.lineStyle", "value": {"dash": [10, 10], "fill": "dash"}},
331
+ {"id": "custom.fillOpacity", "value": 0}
332
+ ]
333
+ }
334
+ ]
335
+ },
336
+ "options": {"legend": {"displayMode": "table", "placement": "bottom"}}
337
+ },
338
+ {
339
+ "type": "timeseries",
340
+ "title": "SM & Memory Clock Frequencies",
341
+ "id": 32,
342
+ "gridPos": {"h": 8, "w": 12, "x": 0, "y": 32},
343
+ "datasource": {"type": "prometheus", "uid": "${datasource}"},
344
+ "targets": [
345
+ {
346
+ "expr": "sixtytwo_gpu_sm_clock_mhz{node_id=~\"$node\",vendor=~\"$vendor\"}",
347
+ "refId": "A",
348
+ "legendFormat": "{{node_id}} GPU{{index}} SM"
349
+ },
350
+ {
351
+ "expr": "sixtytwo_gpu_mem_clock_mhz{node_id=~\"$node\",vendor=~\"$vendor\"}",
352
+ "refId": "B",
353
+ "legendFormat": "{{node_id}} GPU{{index}} mem"
354
+ }
355
+ ],
356
+ "fieldConfig": {
357
+ "defaults": {
358
+ "unit": "megahertz",
359
+ "custom": {"drawStyle": "line", "lineWidth": 1, "fillOpacity": 5}
360
+ }
361
+ },
362
+ "options": {"legend": {"displayMode": "table", "placement": "bottom"}}
363
+ },
364
+ {
365
+ "type": "timeseries",
366
+ "title": "Fan Speed",
367
+ "id": 33,
368
+ "gridPos": {"h": 8, "w": 12, "x": 12, "y": 32},
369
+ "datasource": {"type": "prometheus", "uid": "${datasource}"},
370
+ "targets": [
371
+ {
372
+ "expr": "sixtytwo_gpu_fan_speed_ratio{node_id=~\"$node\",vendor=~\"$vendor\"}",
373
+ "refId": "A",
374
+ "legendFormat": "{{node_id}} GPU{{index}}"
375
+ }
376
+ ],
377
+ "fieldConfig": {
378
+ "defaults": {
379
+ "unit": "percentunit",
380
+ "min": 0, "max": 1,
381
+ "custom": {"drawStyle": "line", "lineWidth": 1, "fillOpacity": 5}
382
+ }
383
+ },
384
+ "options": {"legend": {"displayMode": "table", "placement": "bottom"}}
385
+ },
386
+ {
387
+ "type": "row",
388
+ "title": "ECC Errors",
389
+ "id": 500,
390
+ "gridPos": {"h": 1, "w": 24, "x": 0, "y": 40},
391
+ "collapsed": false
392
+ },
393
+ {
394
+ "type": "timeseries",
395
+ "title": "Uncorrectable ECC Errors (volatile, 5m increase)",
396
+ "id": 40,
397
+ "gridPos": {"h": 8, "w": 12, "x": 0, "y": 41},
398
+ "datasource": {"type": "prometheus", "uid": "${datasource}"},
399
+ "targets": [
400
+ {
401
+ "expr": "increase(sixtytwo_gpu_ecc_errors_uncorrected_volatile_total{node_id=~\"$node\",vendor=~\"$vendor\"}[5m])",
402
+ "refId": "A",
403
+ "legendFormat": "{{node_id}} GPU{{index}} DBE"
404
+ }
405
+ ],
406
+ "fieldConfig": {
407
+ "defaults": {
408
+ "unit": "short",
409
+ "thresholds": {"mode": "absolute", "steps": [
410
+ {"color": "green", "value": null},
411
+ {"color": "red", "value": 1}
412
+ ]},
413
+ "custom": {
414
+ "drawStyle": "bars",
415
+ "lineWidth": 1,
416
+ "fillOpacity": 80,
417
+ "thresholdsStyle": {"mode": "area"}
418
+ }
419
+ }
420
+ },
421
+ "options": {"legend": {"displayMode": "table", "placement": "bottom"}}
422
+ },
423
+ {
424
+ "type": "timeseries",
425
+ "title": "Correctable ECC Errors (volatile, 5m increase)",
426
+ "id": 41,
427
+ "gridPos": {"h": 8, "w": 12, "x": 12, "y": 41},
428
+ "datasource": {"type": "prometheus", "uid": "${datasource}"},
429
+ "targets": [
430
+ {
431
+ "expr": "increase(sixtytwo_gpu_ecc_errors_corrected_volatile_total{node_id=~\"$node\",vendor=~\"$vendor\"}[5m])",
432
+ "refId": "A",
433
+ "legendFormat": "{{node_id}} GPU{{index}} SBE"
434
+ }
435
+ ],
436
+ "fieldConfig": {
437
+ "defaults": {
438
+ "unit": "short",
439
+ "custom": {"drawStyle": "bars", "lineWidth": 1, "fillOpacity": 60}
440
+ }
441
+ },
442
+ "options": {"legend": {"displayMode": "table", "placement": "bottom"}}
443
+ },
444
+ {
445
+ "type": "table",
446
+ "title": "Aggregate ECC Error Totals (lifetime)",
447
+ "id": 42,
448
+ "gridPos": {"h": 8, "w": 24, "x": 0, "y": 49},
449
+ "datasource": {"type": "prometheus", "uid": "${datasource}"},
450
+ "targets": [
451
+ {
452
+ "expr": "sixtytwo_gpu_ecc_errors_corrected_aggregate_total{node_id=~\"$node\",vendor=~\"$vendor\"}",
453
+ "refId": "A",
454
+ "instant": true,
455
+ "format": "table",
456
+ "legendFormat": "corrected"
457
+ },
458
+ {
459
+ "expr": "sixtytwo_gpu_ecc_errors_uncorrected_aggregate_total{node_id=~\"$node\",vendor=~\"$vendor\"}",
460
+ "refId": "B",
461
+ "instant": true,
462
+ "format": "table",
463
+ "legendFormat": "uncorrected"
464
+ }
465
+ ],
466
+ "transformations": [
467
+ {"id": "merge", "options": {}},
468
+ {"id": "organize", "options": {"excludeByName": {"Time": true, "__name__": true, "job": true, "instance": true}}}
469
+ ]
470
+ },
471
+ {
472
+ "type": "row",
473
+ "title": "PCIe",
474
+ "id": 600,
475
+ "gridPos": {"h": 1, "w": 24, "x": 0, "y": 57},
476
+ "collapsed": false
477
+ },
478
+ {
479
+ "type": "timeseries",
480
+ "title": "PCIe Link Generation",
481
+ "id": 50,
482
+ "gridPos": {"h": 6, "w": 12, "x": 0, "y": 58},
483
+ "datasource": {"type": "prometheus", "uid": "${datasource}"},
484
+ "targets": [
485
+ {
486
+ "expr": "sixtytwo_gpu_pcie_link_gen{node_id=~\"$node\",vendor=~\"$vendor\"}",
487
+ "refId": "A",
488
+ "legendFormat": "{{node_id}} GPU{{index}}"
489
+ }
490
+ ],
491
+ "fieldConfig": {
492
+ "defaults": {
493
+ "unit": "short",
494
+ "min": 0,
495
+ "thresholds": {"mode": "absolute", "steps": [
496
+ {"color": "red", "value": null},
497
+ {"color": "yellow", "value": 4},
498
+ {"color": "green", "value": 5}
499
+ ]},
500
+ "custom": {"drawStyle": "line", "lineWidth": 1, "thresholdsStyle": {"mode": "line"}}
501
+ }
502
+ },
503
+ "options": {"legend": {"displayMode": "table", "placement": "bottom"}}
504
+ },
505
+ {
506
+ "type": "timeseries",
507
+ "title": "PCIe Link Width (lanes)",
508
+ "id": 51,
509
+ "gridPos": {"h": 6, "w": 12, "x": 12, "y": 58},
510
+ "datasource": {"type": "prometheus", "uid": "${datasource}"},
511
+ "targets": [
512
+ {
513
+ "expr": "sixtytwo_gpu_pcie_link_width{node_id=~\"$node\",vendor=~\"$vendor\"}",
514
+ "refId": "A",
515
+ "legendFormat": "{{node_id}} GPU{{index}}"
516
+ }
517
+ ],
518
+ "fieldConfig": {
519
+ "defaults": {
520
+ "unit": "short",
521
+ "min": 0,
522
+ "thresholds": {"mode": "absolute", "steps": [
523
+ {"color": "red", "value": null},
524
+ {"color": "yellow", "value": 8},
525
+ {"color": "green", "value": 16}
526
+ ]},
527
+ "custom": {"drawStyle": "line", "lineWidth": 1, "thresholdsStyle": {"mode": "line"}}
528
+ }
529
+ },
530
+ "options": {"legend": {"displayMode": "table", "placement": "bottom"}}
531
+ },
532
+ {
533
+ "type": "row",
534
+ "title": "Runtime Monitoring Events",
535
+ "id": 700,
536
+ "gridPos": {"h": 1, "w": 24, "x": 0, "y": 64},
537
+ "collapsed": false
538
+ },
539
+ {
540
+ "type": "timeseries",
541
+ "title": "Monitoring Events by Type (30m increase)",
542
+ "id": 60,
543
+ "gridPos": {"h": 8, "w": 12, "x": 0, "y": 65},
544
+ "datasource": {"type": "prometheus", "uid": "${datasource}"},
545
+ "targets": [
546
+ {
547
+ "expr": "sum by (event_type) (increase(sixtytwo_monitoring_events_total{node_id=~\"$node\"}[30m]))",
548
+ "refId": "A",
549
+ "legendFormat": "{{event_type}}"
550
+ }
551
+ ],
552
+ "fieldConfig": {
553
+ "defaults": {
554
+ "unit": "short",
555
+ "custom": {"drawStyle": "bars", "lineWidth": 1, "fillOpacity": 70, "stacking": {"mode": "value"}}
556
+ }
557
+ },
558
+ "options": {"legend": {"displayMode": "table", "placement": "bottom"}}
559
+ },
560
+ {
561
+ "type": "table",
562
+ "title": "Monitoring Event Counts by Node",
563
+ "id": 61,
564
+ "gridPos": {"h": 8, "w": 12, "x": 12, "y": 65},
565
+ "datasource": {"type": "prometheus", "uid": "${datasource}"},
566
+ "targets": [
567
+ {
568
+ "expr": "sum by (node_id, event_type, severity) (sixtytwo_monitoring_events_total{node_id=~\"$node\"})",
569
+ "refId": "A",
570
+ "instant": true,
571
+ "format": "table"
572
+ }
573
+ ],
574
+ "transformations": [
575
+ {"id": "organize", "options": {"excludeByName": {"Time": true, "__name__": true, "job": true, "instance": true}}}
576
+ ]
577
+ }
578
+ ]
579
+ }