ob-metaflow-extensions-1.1.42rc1.tar.gz → ob-metaflow-extensions-1.1.43.tar.gz

This diff compares the contents of two publicly released versions of the package, as they appear in their public registry. It is provided for informational purposes only.

Potentially problematic release: this version of ob-metaflow-extensions has been flagged by the registry.

Files changed (24)
  1. {ob-metaflow-extensions-1.1.42rc1 → ob-metaflow-extensions-1.1.43}/PKG-INFO +1 -1
  2. {ob-metaflow-extensions-1.1.42rc1 → ob-metaflow-extensions-1.1.43}/metaflow_extensions/outerbounds/__init__.py +0 -7
  3. ob-metaflow-extensions-1.1.43/metaflow_extensions/outerbounds/profilers/__init__.py +1 -0
  4. ob-metaflow-extensions-1.1.43/metaflow_extensions/outerbounds/profilers/gpu.py +701 -0
  5. {ob-metaflow-extensions-1.1.42rc1 → ob-metaflow-extensions-1.1.43}/ob_metaflow_extensions.egg-info/PKG-INFO +1 -1
  6. {ob-metaflow-extensions-1.1.42rc1 → ob-metaflow-extensions-1.1.43}/ob_metaflow_extensions.egg-info/SOURCES.txt +2 -1
  7. {ob-metaflow-extensions-1.1.42rc1 → ob-metaflow-extensions-1.1.43}/setup.py +1 -1
  8. ob-metaflow-extensions-1.1.42rc1/metaflow_extensions/outerbounds/plugins/perimeters.py +0 -27
  9. {ob-metaflow-extensions-1.1.42rc1 → ob-metaflow-extensions-1.1.43}/README.md +0 -0
  10. {ob-metaflow-extensions-1.1.42rc1 → ob-metaflow-extensions-1.1.43}/metaflow_extensions/outerbounds/config/__init__.py +0 -0
  11. {ob-metaflow-extensions-1.1.42rc1 → ob-metaflow-extensions-1.1.43}/metaflow_extensions/outerbounds/plugins/__init__.py +0 -0
  12. {ob-metaflow-extensions-1.1.42rc1 → ob-metaflow-extensions-1.1.43}/metaflow_extensions/outerbounds/plugins/auth_server.py +0 -0
  13. {ob-metaflow-extensions-1.1.42rc1 → ob-metaflow-extensions-1.1.43}/metaflow_extensions/outerbounds/plugins/kubernetes/__init__.py +0 -0
  14. {ob-metaflow-extensions-1.1.42rc1 → ob-metaflow-extensions-1.1.43}/metaflow_extensions/outerbounds/plugins/kubernetes/kubernetes_client.py +0 -0
  15. {ob-metaflow-extensions-1.1.42rc1 → ob-metaflow-extensions-1.1.43}/metaflow_extensions/outerbounds/remote_config.py +0 -0
  16. {ob-metaflow-extensions-1.1.42rc1 → ob-metaflow-extensions-1.1.43}/metaflow_extensions/outerbounds/toplevel/__init__.py +0 -0
  17. {ob-metaflow-extensions-1.1.42rc1 → ob-metaflow-extensions-1.1.43}/metaflow_extensions/outerbounds/toplevel/global_aliases_for_metaflow_package.py +0 -0
  18. {ob-metaflow-extensions-1.1.42rc1 → ob-metaflow-extensions-1.1.43}/metaflow_extensions/outerbounds/toplevel/plugins/azure/__init__.py +0 -0
  19. {ob-metaflow-extensions-1.1.42rc1 → ob-metaflow-extensions-1.1.43}/metaflow_extensions/outerbounds/toplevel/plugins/gcp/__init__.py +0 -0
  20. {ob-metaflow-extensions-1.1.42rc1 → ob-metaflow-extensions-1.1.43}/metaflow_extensions/outerbounds/toplevel/plugins/kubernetes/__init__.py +0 -0
  21. {ob-metaflow-extensions-1.1.42rc1 → ob-metaflow-extensions-1.1.43}/ob_metaflow_extensions.egg-info/dependency_links.txt +0 -0
  22. {ob-metaflow-extensions-1.1.42rc1 → ob-metaflow-extensions-1.1.43}/ob_metaflow_extensions.egg-info/requires.txt +0 -0
  23. {ob-metaflow-extensions-1.1.42rc1 → ob-metaflow-extensions-1.1.43}/ob_metaflow_extensions.egg-info/top_level.txt +0 -0
  24. {ob-metaflow-extensions-1.1.42rc1 → ob-metaflow-extensions-1.1.43}/setup.cfg +0 -0
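
The substantive changes: the import-time perimeter override (`plugins/perimeters.py`) is removed, and a new GPU profiler (`profilers/gpu.py`) is added, exporting a `gpu_profile` step decorator. A minimal usage sketch (hypothetical flow and step names; decorator placement follows the upstream monitorbench project the module says it is adapted from):

```python
from metaflow import FlowSpec, step

# gpu_profile is the decorator added by this release in
# metaflow_extensions/outerbounds/profilers/gpu.py
from metaflow_extensions.outerbounds.profilers import gpu_profile


class TrainFlow(FlowSpec):  # hypothetical flow
    @gpu_profile(interval=1)  # sample nvidia-smi once per second
    @step
    def start(self):
        # GPU work goes here; readings stream into the "gpu_profile" card and,
        # with include_artifacts=True (the default), into the artifacts
        # self.gpu_profile_num_gpus and self.gpu_profile_data.
        self.next(self.end)

    @step
    def end(self):
        pass


if __name__ == "__main__":
    TrainFlow()
```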
{ob-metaflow-extensions-1.1.42rc1 → ob-metaflow-extensions-1.1.43}/PKG-INFO
@@ -1,6 +1,6 @@
  Metadata-Version: 2.1
  Name: ob-metaflow-extensions
- Version: 1.1.42rc1
+ Version: 1.1.43
  Summary: Outerbounds Platform Extensions for Metaflow
  Author: Outerbounds, Inc.
  License: Commercial
{ob-metaflow-extensions-1.1.42rc1 → ob-metaflow-extensions-1.1.43}/metaflow_extensions/outerbounds/__init__.py
@@ -2,13 +2,6 @@ import metaflow.metaflow_config_funcs

  from metaflow_extensions.outerbounds.remote_config import init_config

- from metaflow_extensions.outerbounds.plugins.perimeters import (
-     override_metaflow_profile_with_perimeter,
- )
-
- override_metaflow_profile_with_perimeter()
-
-
  # we want to override OSS Metaflow's initialization behavior with our own to support remote configs
  # we're reassigning the METAFLOW_CONFIG variable because all downstream settings rely on it and
  # users still have the power to override them with environment variables
ob-metaflow-extensions-1.1.43/metaflow_extensions/outerbounds/profilers/__init__.py
@@ -0,0 +1 @@
+ from .gpu import gpu_profile
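
The gpu.py module added below is built around polling `nvidia-smi` in CSV query mode, one row per GPU per sample. A sketch of the row format its parser assumes, with an illustrative sample line:

```python
from datetime import datetime

# Illustrative sample row; the field order matches the --query-gpu flags in
# gpu.py's MONITOR command: pci.bus_id, timestamp, utilization.gpu, memory.used, memory.total
line = "00000000:00:1E.0, 2024/01/01 12:00:00.123, 87, 14500, 16384"

fields = [f.strip() for f in line.split(",")]
reading = dict(
    zip(["gpu_id", "timestamp", "gpu_utilization", "memory_used", "memory_total"], fields)
)
# gpu.py strips subsecond precision before parsing, since it doesn't align across devices:
ts = datetime.strptime(reading["timestamp"].split(".")[0], "%Y/%m/%d %H:%M:%S")
```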
ob-metaflow-extensions-1.1.43/metaflow_extensions/outerbounds/profilers/gpu.py
@@ -0,0 +1,701 @@
+ from metaflow.cards import Markdown, Table, VegaChart
+ from functools import wraps
+ import threading
+ from datetime import datetime
+ from metaflow import current
+ from metaflow.cards import Table, Markdown, VegaChart, Image
+ import time
+ from typing import List, Dict, Any, Union
+ import re
+ import os
+ import uuid
+ import json
+ import sys
+ from tempfile import TemporaryDirectory
+ from subprocess import check_output, Popen
+ from datetime import datetime, timedelta
+ from functools import wraps
+ from collections import namedtuple
+
+ # Card plot styles
+ MEM_COLOR = "#0c64d6"
+ GPU_COLOR = "#ff69b4"
+
+ NVIDIA_TS_FORMAT = "%Y/%m/%d %H:%M:%S"
+
+
+ DRIVER_VER = re.compile(b"Driver Version: (.+?) ")
+ CUDA_VER = re.compile(b"CUDA Version:(.*) ")
+
+ MONITOR_FIELDS = [
+     "timestamp",
+     "gpu_utilization",
+     "memory_used",
+     "memory_total",
+ ]
+
+ MONITOR = """nvidia-smi --query-gpu=pci.bus_id,timestamp,utilization.gpu,memory.used,memory.total --format=csv,noheader,nounits -l {interval};"""
+ ProcessUUID = namedtuple("ProcessUUID", ["uuid", "start_time", "end_time"])
+
+
41
+ def _get_uuid(time_duration=600):
+     frmt_str = "%Y-%m-%d-%H-%M-%S"
+     # Build a datetime range string from the current time (start) to start + time_duration (end)
+     start_date = datetime.now()
+     end_date = start_date + timedelta(seconds=time_duration)
+     datetime_range = (
+         start_date.strftime(frmt_str) + "_" + end_date.strftime(frmt_str)
+     )
+     uuid_str = uuid.uuid4().hex.replace("-", "") + "_" + datetime_range
+     return ProcessUUID(uuid_str, start_date, end_date)
+
+
53
+ class AsyncProcessManager:
+     """
+     This class is responsible for managing the nvidia-smi subprocesses
+     """
+
+     processes: Dict[str, Dict] = {
+         # "procid": {
+         #     "proc": subprocess.Popen,
+         #     "started": time.time()
+         # }
+     }
+
+     @classmethod
+     def _register_process(cls, procid, proc):
+         cls.processes[procid] = {
+             "proc": proc,
+             "started": time.time(),
+         }
+
+     @classmethod
+     def get(cls, procid):
+         proc_dict = cls.processes.get(procid, None)
+         if proc_dict is not None:
+             return proc_dict["proc"], proc_dict["started"]
+         return None, None
+
+     @classmethod
+     def spawn(cls, procid, cmd, file):
+         proc = Popen(cmd, stdout=file)
+         cls._register_process(procid, proc)
+
+     @classmethod
+     def remove(cls, procid, delete_item=True):
+         if procid in cls.processes:
+             if cls.processes[procid]["proc"].stdout is not None:
+                 cls.processes[procid]["proc"].stdout.close()
+             cls.processes[procid]["proc"].terminate()
+             cls.processes[procid]["proc"].wait()
+             if delete_item:
+                 del cls.processes[procid]
+
+     @classmethod
+     def cleanup(cls):
+         for procid in cls.processes:
+             cls.remove(procid, delete_item=False)
+         cls.processes.clear()
+
+     @classmethod
+     def is_running(cls, procid):
+         if procid not in cls.processes:
+             return False
+         return cls.processes[procid]["proc"].poll() is None
+
+
+ def _parse_timestamp(timestamp):
+     try:
+         ts = timestamp.split(".")[0]
+         return datetime.strptime(ts, NVIDIA_TS_FORMAT)
+     except ValueError:
+         return None
+
+
115
+ class GPUMonitor:
+     """
+     The `GPUMonitor` class is designed to monitor GPU usage.
+
+     When an instance of `GPUMonitor` is created, it initializes with a specified `interval` and `duration`.
+     The `duration` is the time period it will run the nvidia-smi command for, and the `interval` is the time period between readings.
+     The class exposes a `_monitor_update_thread` method which runs as a background thread that continuously updates the GPU usage readings.
+     It will keep running until the `_finished` flag is set to `True`.
+
+     The class statefully manages the spawned nvidia-smi processes.
+     It starts a new nvidia-smi process after the current one has run for the specified `duration`.
+     At any time this class only maintains readings for the `_current_process`; all aggregated
+     readings for past processes are stored in the `_past_readings` dictionary.
+     When a process finishes, its readings are appended to `_past_readings` and a new process is started.
+
+     Callers that wish to read the GPU usage can call the `read` method, which returns the readings as a dictionary,
+     aggregating `_current_readings` and `_past_readings`.
+     """
+
134
+     _started_processes: List[ProcessUUID] = []
+
+     _current_process: Union[ProcessUUID, None] = None
+
+     _current_readings: Dict[str, Any] = {}
+
+     _past_readings: Dict[str, Any] = {}
+
+     def __init__(self, interval=1, duration=300) -> None:
+         self._tempdir = TemporaryDirectory(prefix="gpu_card_monitor", dir="./")
+         self._interval = interval
+         self._duration = duration
+         self._finished = False
+
+     @property
+     def _current_file(self):
+         if self._current_process is None:
+             return None
+         return os.path.join(self._tempdir.name, self._current_process.uuid + ".csv")
+
+     def get_file_name(self, uuid):
+         return os.path.join(self._tempdir.name, uuid + ".csv")
+
+     def create_new_monitor(self):
+         uuid = _get_uuid(self._duration)
+         file = open(self.get_file_name(uuid.uuid), "w")
+         cmd = MONITOR.format(interval=self._interval, time_duration=self._duration)
+         AsyncProcessManager.spawn(uuid.uuid, ["bash", "-c", cmd], file)
+         self._started_processes.append(uuid)
+         self._current_process = uuid
+         return uuid
+
+     def clear_current_monitor(self):
+         if self._current_process is None:
+             return
+         AsyncProcessManager.remove(self._current_process.uuid)
+         self._current_process = None
+
+     def current_process_has_ended(self):
+         if self._current_process is None:
+             return True
+         return datetime.now() > self._current_process.end_time
+
+     def current_process_is_running(self):
+         if self._current_process is None:
+             return False
+         return AsyncProcessManager.is_running(self._current_process.uuid)
+
182
+     def _read_monitor(self):
+         """
+         Reads the monitor file and returns the readings in a dictionary format
+         """
+         all_readings = []
+         if self._current_file is None:
+             return None
+         # Extract everything from the CSV file and store it in a list of dictionaries
+         all_fields = ["gpu_id"] + MONITOR_FIELDS
+         with open(self._current_file, "r") as _monitor_out:
+             for line in _monitor_out.readlines():
+                 data = {}
+                 fields = [f.strip() for f in line.split(",")]
+                 if len(fields) == len(all_fields):
+                     # strip subsecond resolution from timestamps that doesn't align across devices
+                     for idx, _f in enumerate(all_fields):
+                         data[_f] = fields[idx]
+                     all_readings.append(data)
+                 else:
+                     # expect that the last line may be truncated
+                     break
+
+         # Convert to dictionary format
+         devdata = {}
+         for reading in all_readings:
+             gpu_id = reading["gpu_id"]
+             if "timestamp" not in reading:
+                 continue
+             if _parse_timestamp(reading["timestamp"]) is None:
+                 continue
+             reading["timestamp"] = reading["timestamp"].split(".")[0]
+             if gpu_id not in devdata:
+                 devdata[gpu_id] = {}
+
+             for i, field in enumerate(MONITOR_FIELDS):
+                 if field not in devdata[gpu_id]:
+                     devdata[gpu_id][field] = []
+                 devdata[gpu_id][field].append(reading[field])
+         return devdata
+
222
+     def _update_readings(self):
+         """
+         Core update function: if the current process has ended, create a new monitor;
+         otherwise set the current readings to the readings from the monitor file.
+         """
+         if self.current_process_has_ended() or not self.current_process_is_running():
+             self._update_past_readings()
+             self.clear_current_monitor()
+             self.create_new_monitor()
+             # Sleep for 1 second to allow the new process to start so we can take a reading
+             time.sleep(1)
+
+         readings = self._read_monitor()
+         if readings is None:
+             return
+         self._current_readings = readings
+
239
+     @staticmethod
+     def _make_full_reading(current, past):
+         if current is None:
+             return past
+         for gpu_id in current:
+             if gpu_id not in past:
+                 past[gpu_id] = {}
+             for field in MONITOR_FIELDS:
+                 if field not in past[gpu_id]:
+                     past[gpu_id][field] = []
+                 past[gpu_id][field].extend(current[gpu_id][field])
+         return past
+
+     def read(self):
+         return self._make_full_reading(
+             self._current_readings, json.loads(json.dumps(self._past_readings))
+         )
+
+     def _update_past_readings(self):
+         if self._current_readings is None:
+             return
+         self._past_readings = self._make_full_reading(
+             self._current_readings, json.loads(json.dumps(self._past_readings))
+         )
+         self._current_readings = None
+
+     def cleanup(self):
+         self._finished = True
+         AsyncProcessManager.cleanup()
+         self._tempdir.cleanup()
+
+     def _monitor_update_thread(self):
+         while not self._finished:
+             self._update_readings()
+             time.sleep(self._interval)
+
+
+ def _get_ts_range(_range):
+     if _range == "":
+         return "*No readings available*"
+     return "*Time range of charts: %s*" % _range
280
+
+
+ def _update_utilization(results, md_dict):
+     for device, data in results["profile"].items():
+         if device not in md_dict:
+             print(
+                 "Device %s not found in the GPU card layout. Skipping..." % device,
+                 file=sys.stderr,
+             )
+             continue
+         md_dict[device]["gpu"].update(
+             "%2.1f%%" % max(map(float, data["gpu_utilization"]))
+         )
+         md_dict[device]["memory"].update("%dMB" % max(map(float, data["memory_used"])))
+
+
+ def _update_charts(results, md_dict):
+     for device, data in results["profile"].items():
+         try:
+             if device not in md_dict:
+                 continue
+             gpu_plot, mem_plot, ts_range = profile_plots(
+                 device,
+                 data["timestamp"],
+                 data["gpu_utilization"],
+                 data["memory_used"],
+                 data["memory_total"],
+             )
+             md_dict[device]["gpu"].update(gpu_plot)
+             md_dict[device]["memory"].update(mem_plot)
+             md_dict[device]["reading_duration"].update(_get_ts_range(ts_range))
+         except ValueError as e:
+             # This is thrown when the date is unparsable. We can just safely ignore this.
+             print("ValueError: Could not parse date \n%s" % str(e), file=sys.stderr)
314
+
+
+ # This code is adapted from: https://github.com/outerbounds/monitorbench
+ class GPUProfiler:
+     def __init__(self, interval=1, monitor_batch_duration=200):
+         self.driver_ver, self.cuda_ver, self.error = self._read_versions()
+         (
+             self.interconnect_data,
+             self.interconnect_legend,
+         ) = self._read_multi_gpu_interconnect()
+         if self.error:
+             self.devices = []
+             return
+         else:
+             self.devices = self._read_devices()
+             self._monitor = GPUMonitor(
+                 interval=interval, duration=monitor_batch_duration
+             )
+             self._monitor_thread = threading.Thread(
+                 target=self._monitor._monitor_update_thread, daemon=True
+             )
+             self._monitor_thread.start()
+             self._interval = interval
+
+         self._card_comps = {"max_utilization": {}, "charts": {}, "reading_duration": {}}
+         self._card_created = False
+
+     def finish(self):
+         ret = {
+             "error": self.error,
+             "cuda_version": self.cuda_ver,
+             "driver_version": self.driver_ver,
+         }
+         if self.error:
+             return ret
+         else:
+             ret["devices"] = self.devices
+             ret["profile"] = self._monitor.read()
+             ret["interconnect"] = {
+                 "data": self.interconnect_data,
+                 "legend": self.interconnect_legend,
+             }
+             self._monitor.cleanup()
+             return ret
+
+     def _make_reading(self):
+         ret = {
+             "error": self.error,
+             "cuda_version": self.cuda_ver,
+             "driver_version": self.driver_ver,
+         }
+         if self.error:
+             return ret
+         else:
+             ret["devices"] = self.devices
+             ret["profile"] = self._monitor.read()
+             ret["interconnect"] = {
+                 "data": self.interconnect_data,
+                 "legend": self.interconnect_legend,
+             }
+             return ret
+
+     def _update_card(self):
+         if len(self.devices) == 0:
+             current.card["gpu_profile"].clear()
+             current.card["gpu_profile"].append(
+                 Markdown("## GPU profile failed: %s" % self.error)
+             )
+             current.card["gpu_profile"].refresh()
+
+             return
+
+         while True:
+             readings = self._make_reading()
+             if readings is None:
+                 print("GPU Profiler readings are none", file=sys.stderr)
+                 time.sleep(self._interval)
+                 continue
+             _update_utilization(readings, self._card_comps["max_utilization"])
+             _update_charts(readings, self._card_comps["charts"])
+             current.card["gpu_profile"].refresh()
+             time.sleep(self._interval)
+
397
+     def _setup_card(self, artifact_name):
+         from metaflow import current
+
+         results = self._make_reading()
+         els = current.card["gpu_profile"]
+
+         def _drivers():
+             els.append(Markdown("## Drivers"))
+             els.append(
+                 Table(
+                     [[results["driver_version"], results["cuda_version"]]],
+                     headers=["NVIDIA driver version", "CUDA version"],
+                 )
+             )
+
+         def _devices():
+             els.append(Markdown("## Devices"))
+             rows = [
+                 [d["device_id"], d["name"], d["memory"]] for d in results["devices"]
+             ]
+             els.append(Table(rows, headers=["Device ID", "Device type", "GPU memory"]))
+
+         def _interconnect():
+             if results["interconnect"]["data"] and results["interconnect"]["legend"]:
+                 els.append(Markdown("## Interconnect"))
+                 interconnect_data = results["interconnect"]["data"]
+                 rows = list(interconnect_data.values())
+                 rows = [list(transpose_row) for transpose_row in list(zip(*rows))]
+                 els.append(Table(rows, headers=list(interconnect_data.keys())))
+                 els.append(Markdown("#### Legend"))
+                 els.append(
+                     Table(
+                         [list(results["interconnect"]["legend"].values())],
+                         headers=list(results["interconnect"]["legend"].keys()),
+                     )
+                 )
+
+         def _utilization():
+             els.append(Markdown("## Maximum utilization"))
+             rows = {}
+             for d in results["devices"]:
+                 rows[d["device_id"]] = {
+                     "gpu": Markdown("0%"),
+                     "memory": Markdown("0MB"),
+                 }
+             _rows = [[Markdown(k)] + list(v.values()) for k, v in rows.items()]
+             els.append(
+                 Table(data=_rows, headers=["Device ID", "Max GPU %", "Max memory"])
+             )
+             els.append(
+                 Markdown(f"Detailed data saved in an artifact `{artifact_name}`")
+             )
+             return rows
+
+         def _plots():
+             els.append(Markdown("## GPU utilization and memory usage over time"))
+
+             rows = {}
+             for d in results["devices"]:
+                 gpu_plot, mem_plot, ts_range = profile_plots(
+                     d["device_id"], [], [], [], []
+                 )
+                 rows[d["device_id"]] = {
+                     "gpu": VegaChart(gpu_plot),
+                     "memory": VegaChart(mem_plot),
+                     "reading_duration": Markdown(_get_ts_range(ts_range)),
+                 }
+             for k, v in rows.items():
+                 els.append(Markdown("### GPU Utilization for device: %s" % k))
+                 els.append(v["reading_duration"])
+                 els.append(
+                     Table(
+                         data=[
+                             [Markdown("GPU Utilization"), v["gpu"]],
+                             [Markdown("Memory usage"), v["memory"]],
+                         ]
+                     )
+                 )
+             return rows
+
+         _drivers()
+         _devices()
+         _interconnect()
+         self._card_comps["max_utilization"] = _utilization()
+         self._card_comps["charts"] = _plots()
+
+     def _read_versions(self):
+         def parse(r, s):
+             return r.search(s).group(1).strip().decode("utf-8")
+
+         try:
+             out = check_output(["nvidia-smi"])
+             return parse(DRIVER_VER, out), parse(CUDA_VER, out), None
+         except FileNotFoundError:
+             return None, None, "nvidia-smi not found"
+         except AttributeError:
+             return None, None, "nvidia-smi output is unexpected"
+         except:
+             return None, None, "nvidia-smi error"
+
+     def _read_devices(self):
+         out = check_output(
+             [
+                 "nvidia-smi",
+                 "--query-gpu=name,pci.bus_id,memory.total",
+                 "--format=csv,noheader",
+             ]
+         )
+         return [
+             dict(
+                 zip(("name", "device_id", "memory"), (x.strip() for x in l.split(",")))
+             )
+             for l in out.decode("utf-8").splitlines()
+         ]
+
512
+     def _read_multi_gpu_interconnect(self):
+         """
+         parse output of `nvidia-smi topo -m`, such as this sample:
+
+                 GPU0    GPU1    CPU Affinity    NUMA Affinity
+         GPU0     X      NV2     0-23            N/A
+         GPU1    NV2      X      0-23            N/A
+
+         returns two dictionaries describing multi-GPU topology:
+         data: {index: [GPU0, GPU1, ...], GPU0: [X, NV2, ...], GPU1: [NV2, X, ...], ...}
+         legend_items: {X: 'Same PCI', NV2: 'NVLink 2', ...}
+         """
+         try:
+             import re
+
+             ansi_escape = re.compile(r"(\x9B|\x1B\[)[0-?]*[ -\/]*[@-~]")
+
+             out = check_output(["nvidia-smi", "topo", "-m"])
+             rows = out.decode("utf-8").split("\n")
+
+             header = ansi_escape.sub("", rows[0]).split("\t")[1:]
+             data = {}
+             data["index"] = []
+             data |= {k: [] for k in header}
+
+             for i, row in enumerate(rows[1:]):
+                 row = ansi_escape.sub("", row).split()
+                 if len(row) == 0:
+                     continue
+                 if row[0].startswith("GPU"):
+                     data["index"].append(row[0])
+                     for key, val in zip(header, row[1:]):
+                         data[key].append(val)
+                 elif row[0].startswith("Legend"):
+                     break
+
+             legend_items = {}
+             for legend_row in rows[i:]:
+                 if legend_row == "" or legend_row.startswith("Legend"):
+                     continue
+                 res = legend_row.strip().split(" = ")
+                 legend_items[res[0].strip()] = res[1].strip()
+
+             return data, legend_items
+
+         except:
+             return None, None
+
+
561
+ class gpu_profile:
+     def __init__(
+         self,
+         include_artifacts=True,
+         artifact_prefix="gpu_profile_",
+         interval=1,
+     ):
+         self.include_artifacts = include_artifacts
+         self.artifact_prefix = artifact_prefix
+         self.interval = interval
+
+     def __call__(self, f):
+         @wraps(f)
+         def func(s):
+             prof = GPUProfiler(interval=self.interval)
+             if self.include_artifacts:
+                 setattr(s, self.artifact_prefix + "num_gpus", len(prof.devices))
+
+             current.card["gpu_profile"].append(
+                 Markdown("# GPU profile for `%s`" % current.pathspec)
+             )
+             current.card["gpu_profile"].append(
+                 Markdown(
+                     "_Started at: %s_"
+                     % datetime.now().astimezone().strftime("%Y-%m-%dT%H:%M:%S %z")
+                 )
+             )
+             prof._setup_card(self.artifact_prefix + "data")
+             current.card["gpu_profile"].refresh()
+             update_thread = threading.Thread(target=prof._update_card, daemon=True)
+             update_thread.start()
+
+             try:
+                 f(s)
+             finally:
+                 try:
+                     results = prof.finish()
+                 except:
+                     results = {"error": "couldn't read profiler results"}
+                 if self.include_artifacts:
+                     setattr(s, self.artifact_prefix + "data", results)
+
+         from metaflow import card
+
+         return card(type="blank", id="gpu_profile", refresh_interval=self.interval)(
+             func
+         )
+
+
610
+ def translate_to_vegalite(
+     tstamps,
+     vals,
+     description,
+     y_label,
+     legend,
+     line_color=None,
+     percentage_format=False,
+ ):
+     # Preprocessing for Vega-Lite
+     # Assuming tstamps is a list of datetime objects and vals is a list of values
+     data = [{"tstamps": str(t), "vals": v} for t, v in zip(tstamps, vals)]
+
+     # Base Vega-Lite spec
+     vega_lite_spec = {
+         "$schema": "https://vega.github.io/schema/vega-lite/v5.json",
+         "description": description,
+         "data": {"values": data},
+         "width": 600,
+         "height": 400,
+         "encoding": {
+             "x": {"field": "tstamps", "type": "temporal", "axis": {"title": "Time"}},
+             "y": {
+                 "field": "vals",
+                 "type": "quantitative",
+                 "axis": {
+                     "title": y_label,
+                     **({"format": "%"} if percentage_format else {}),
+                 },
+             },
+         },
+         "layer": [
+             {
+                 "mark": {
+                     "type": "line",
+                     "color": line_color if line_color else "blue",
+                     "tooltip": True,
+                     "description": legend,  # Adding legend as description
+                 },
+                 "encoding": {"tooltip": [{"field": "tstamps"}, {"field": "vals"}]},
+             }
+         ],
+     }
+
+     return vega_lite_spec
+
+
657
+ def profile_plots(device_id, ts, gpu, mem_used, mem_total):
+     tstamps = [datetime.strptime(t, NVIDIA_TS_FORMAT) for t in ts]
+     gpu = [i / 100 for i in list(map(float, gpu))]
+     mem = [float(used) / float(total) for used, total in zip(mem_used, mem_total)]
+     time_stamp_range = ""
+     if len(tstamps) > 1:
+         max_time = max(tstamps).strftime(NVIDIA_TS_FORMAT)
+         min_time = min(tstamps).strftime(NVIDIA_TS_FORMAT)
+         time_stamp_range = "%s to %s" % (min_time, max_time)
+
+     gpu_plot = translate_to_vegalite(
+         tstamps,
+         gpu,
+         "GPU utilization",
+         "GPU utilization",
+         "device: %s" % device_id,
+         line_color=GPU_COLOR,
+         percentage_format=True,
+     )
+     mem_plot = translate_to_vegalite(
+         tstamps,
+         mem,
+         "Percentage Memory utilization",
+         "Percentage Memory utilization",
+         "device: %s" % device_id,
+         line_color=MEM_COLOR,
+         percentage_format=True,
+     )
+     return gpu_plot, mem_plot, time_stamp_range
+
687
+
+ if __name__ == "__main__":
+     prof = GPUProfiler(monitor_batch_duration=10)
+
+     def _write_json_file(data, filename):
+         with open(filename, "w") as f:
+             json.dump(data, f, indent=4)
+
+     import time
+
+     for i in range(15):
+         time.sleep(1)
+         _write_json_file(prof._monitor.read(), "gpu_profile.json")
+
+     print(json.dumps(prof.finish()))
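
Because the decorator stores its results as step artifacts (under the default `gpu_profile_` prefix), the raw readings can be pulled back after a run with the standard Metaflow client. A sketch, again with hypothetical flow and step names:

```python
from metaflow import Flow

# Hypothetical names; artifact names assume the default "gpu_profile_" prefix.
task = Flow("TrainFlow").latest_run["start"].task
data = task.data.gpu_profile_data  # the dict returned by GPUProfiler.finish()

print(data["driver_version"], data["cuda_version"])
for gpu_id, series in (data.get("profile") or {}).items():
    # each series holds parallel lists: timestamp, gpu_utilization, memory_used, memory_total
    print(gpu_id, "peak utilization %:", max(map(float, series["gpu_utilization"])))
```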
{ob-metaflow-extensions-1.1.42rc1 → ob-metaflow-extensions-1.1.43}/ob_metaflow_extensions.egg-info/PKG-INFO
@@ -1,6 +1,6 @@
  Metadata-Version: 2.1
  Name: ob-metaflow-extensions
- Version: 1.1.42rc1
+ Version: 1.1.43
  Summary: Outerbounds Platform Extensions for Metaflow
  Author: Outerbounds, Inc.
  License: Commercial
{ob-metaflow-extensions-1.1.42rc1 → ob-metaflow-extensions-1.1.43}/ob_metaflow_extensions.egg-info/SOURCES.txt
@@ -5,9 +5,10 @@ metaflow_extensions/outerbounds/remote_config.py
  metaflow_extensions/outerbounds/config/__init__.py
  metaflow_extensions/outerbounds/plugins/__init__.py
  metaflow_extensions/outerbounds/plugins/auth_server.py
- metaflow_extensions/outerbounds/plugins/perimeters.py
  metaflow_extensions/outerbounds/plugins/kubernetes/__init__.py
  metaflow_extensions/outerbounds/plugins/kubernetes/kubernetes_client.py
+ metaflow_extensions/outerbounds/profilers/__init__.py
+ metaflow_extensions/outerbounds/profilers/gpu.py
  metaflow_extensions/outerbounds/toplevel/__init__.py
  metaflow_extensions/outerbounds/toplevel/global_aliases_for_metaflow_package.py
  metaflow_extensions/outerbounds/toplevel/plugins/azure/__init__.py
{ob-metaflow-extensions-1.1.42rc1 → ob-metaflow-extensions-1.1.43}/setup.py
@@ -2,7 +2,7 @@ from setuptools import setup, find_namespace_packages
  from pathlib import Path


- version = "1.1.42rc1"
+ version = "1.1.43"
  this_directory = Path(__file__).parent
  long_description = (this_directory / "README.md").read_text()

ob-metaflow-extensions-1.1.42rc1/metaflow_extensions/outerbounds/plugins/perimeters.py
@@ -1,27 +0,0 @@
- import os
- import fcntl
- from os import path
- import json
-
-
- def override_metaflow_profile_with_perimeter():
-     # If OBP_CONFIG_DIR is set, use that, otherwise use METAFLOW_HOME
-     # If neither is set, use ~/.metaflowconfig
-     obp_config_dir = path.expanduser(
-         os.environ.get(
-             "OBP_CONFIG_DIR", os.environ.get("METAFLOW_HOME", "~/.metaflowconfig")
-         )
-     )
-
-     file_path = os.path.join(obp_config_dir, "ob_config.json")
-
-     if os.path.exists(file_path):
-         # Acquire a shared read lock on the file
-         fd = os.open(file_path, os.O_RDONLY)
-         fcntl.flock(fd, fcntl.LOCK_SH)
-
-         with open(file_path, "r") as f:
-             ob_config = json.loads(f.read())
-
-         if "OB_CURRENT_PERIMETER" in ob_config:
-             os.environ["METAFLOW_PROFILE"] = ob_config["OB_CURRENT_PERIMETER"]
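
Note the behavioral change that comes with this deletion: the package no longer rewrites `METAFLOW_PROFILE` from `ob_config.json` at import time. If anything depended on that side effect, an equivalent (minus the advisory file lock) can be reproduced explicitly before Metaflow is imported; a sketch mirroring the removed code:

```python
import json
import os

# Same lookup order as the removed module: OBP_CONFIG_DIR, then METAFLOW_HOME,
# then ~/.metaflowconfig.
config_dir = os.path.expanduser(
    os.environ.get("OBP_CONFIG_DIR", os.environ.get("METAFLOW_HOME", "~/.metaflowconfig"))
)
config_path = os.path.join(config_dir, "ob_config.json")

if os.path.exists(config_path):
    with open(config_path) as f:
        ob_config = json.load(f)
    if "OB_CURRENT_PERIMETER" in ob_config:
        os.environ["METAFLOW_PROFILE"] = ob_config["OB_CURRENT_PERIMETER"]
```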