ob-metaflow-extensions 1.1.99__tar.gz → 1.1.100__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of ob-metaflow-extensions might be problematic. Click here for more details.
- {ob-metaflow-extensions-1.1.99 → ob-metaflow-extensions-1.1.100}/PKG-INFO +1 -1
- {ob-metaflow-extensions-1.1.99 → ob-metaflow-extensions-1.1.100}/metaflow_extensions/outerbounds/profilers/gpu.py +85 -13
- {ob-metaflow-extensions-1.1.99 → ob-metaflow-extensions-1.1.100}/ob_metaflow_extensions.egg-info/PKG-INFO +1 -1
- {ob-metaflow-extensions-1.1.99 → ob-metaflow-extensions-1.1.100}/setup.py +1 -1
- {ob-metaflow-extensions-1.1.99 → ob-metaflow-extensions-1.1.100}/README.md +0 -0
- {ob-metaflow-extensions-1.1.99 → ob-metaflow-extensions-1.1.100}/metaflow_extensions/outerbounds/__init__.py +0 -0
- {ob-metaflow-extensions-1.1.99 → ob-metaflow-extensions-1.1.100}/metaflow_extensions/outerbounds/config/__init__.py +0 -0
- {ob-metaflow-extensions-1.1.99 → ob-metaflow-extensions-1.1.100}/metaflow_extensions/outerbounds/plugins/__init__.py +0 -0
- {ob-metaflow-extensions-1.1.99 → ob-metaflow-extensions-1.1.100}/metaflow_extensions/outerbounds/plugins/auth_server.py +0 -0
- {ob-metaflow-extensions-1.1.99 → ob-metaflow-extensions-1.1.100}/metaflow_extensions/outerbounds/plugins/fast_bakery/__init__.py +0 -0
- {ob-metaflow-extensions-1.1.99 → ob-metaflow-extensions-1.1.100}/metaflow_extensions/outerbounds/plugins/fast_bakery/docker_environment.py +0 -0
- {ob-metaflow-extensions-1.1.99 → ob-metaflow-extensions-1.1.100}/metaflow_extensions/outerbounds/plugins/fast_bakery/fast_bakery.py +0 -0
- {ob-metaflow-extensions-1.1.99 → ob-metaflow-extensions-1.1.100}/metaflow_extensions/outerbounds/plugins/fast_bakery/fast_bakery_cli.py +0 -0
- {ob-metaflow-extensions-1.1.99 → ob-metaflow-extensions-1.1.100}/metaflow_extensions/outerbounds/plugins/fast_bakery/fast_bakery_decorator.py +0 -0
- {ob-metaflow-extensions-1.1.99 → ob-metaflow-extensions-1.1.100}/metaflow_extensions/outerbounds/plugins/kubernetes/__init__.py +0 -0
- {ob-metaflow-extensions-1.1.99 → ob-metaflow-extensions-1.1.100}/metaflow_extensions/outerbounds/plugins/kubernetes/kubernetes_client.py +0 -0
- {ob-metaflow-extensions-1.1.99 → ob-metaflow-extensions-1.1.100}/metaflow_extensions/outerbounds/plugins/nim/__init__.py +0 -0
- {ob-metaflow-extensions-1.1.99 → ob-metaflow-extensions-1.1.100}/metaflow_extensions/outerbounds/plugins/nim/nim_manager.py +0 -0
- {ob-metaflow-extensions-1.1.99 → ob-metaflow-extensions-1.1.100}/metaflow_extensions/outerbounds/plugins/nvcf/__init__.py +0 -0
- {ob-metaflow-extensions-1.1.99 → ob-metaflow-extensions-1.1.100}/metaflow_extensions/outerbounds/plugins/nvcf/heartbeat_store.py +0 -0
- {ob-metaflow-extensions-1.1.99 → ob-metaflow-extensions-1.1.100}/metaflow_extensions/outerbounds/plugins/nvcf/nvcf.py +0 -0
- {ob-metaflow-extensions-1.1.99 → ob-metaflow-extensions-1.1.100}/metaflow_extensions/outerbounds/plugins/nvcf/nvcf_cli.py +0 -0
- {ob-metaflow-extensions-1.1.99 → ob-metaflow-extensions-1.1.100}/metaflow_extensions/outerbounds/plugins/nvcf/nvcf_decorator.py +0 -0
- {ob-metaflow-extensions-1.1.99 → ob-metaflow-extensions-1.1.100}/metaflow_extensions/outerbounds/plugins/perimeters.py +0 -0
- {ob-metaflow-extensions-1.1.99 → ob-metaflow-extensions-1.1.100}/metaflow_extensions/outerbounds/plugins/snowpark/__init__.py +0 -0
- {ob-metaflow-extensions-1.1.99 → ob-metaflow-extensions-1.1.100}/metaflow_extensions/outerbounds/plugins/snowpark/snowpark.py +0 -0
- {ob-metaflow-extensions-1.1.99 → ob-metaflow-extensions-1.1.100}/metaflow_extensions/outerbounds/plugins/snowpark/snowpark_cli.py +0 -0
- {ob-metaflow-extensions-1.1.99 → ob-metaflow-extensions-1.1.100}/metaflow_extensions/outerbounds/plugins/snowpark/snowpark_client.py +0 -0
- {ob-metaflow-extensions-1.1.99 → ob-metaflow-extensions-1.1.100}/metaflow_extensions/outerbounds/plugins/snowpark/snowpark_decorator.py +0 -0
- {ob-metaflow-extensions-1.1.99 → ob-metaflow-extensions-1.1.100}/metaflow_extensions/outerbounds/plugins/snowpark/snowpark_exceptions.py +0 -0
- {ob-metaflow-extensions-1.1.99 → ob-metaflow-extensions-1.1.100}/metaflow_extensions/outerbounds/plugins/snowpark/snowpark_job.py +0 -0
- {ob-metaflow-extensions-1.1.99 → ob-metaflow-extensions-1.1.100}/metaflow_extensions/outerbounds/plugins/snowpark/snowpark_service_spec.py +0 -0
- {ob-metaflow-extensions-1.1.99 → ob-metaflow-extensions-1.1.100}/metaflow_extensions/outerbounds/plugins/tensorboard/__init__.py +0 -0
- {ob-metaflow-extensions-1.1.99 → ob-metaflow-extensions-1.1.100}/metaflow_extensions/outerbounds/profilers/__init__.py +0 -0
- {ob-metaflow-extensions-1.1.99 → ob-metaflow-extensions-1.1.100}/metaflow_extensions/outerbounds/remote_config.py +0 -0
- {ob-metaflow-extensions-1.1.99 → ob-metaflow-extensions-1.1.100}/metaflow_extensions/outerbounds/toplevel/__init__.py +0 -0
- {ob-metaflow-extensions-1.1.99 → ob-metaflow-extensions-1.1.100}/metaflow_extensions/outerbounds/toplevel/global_aliases_for_metaflow_package.py +0 -0
- {ob-metaflow-extensions-1.1.99 → ob-metaflow-extensions-1.1.100}/metaflow_extensions/outerbounds/toplevel/plugins/azure/__init__.py +0 -0
- {ob-metaflow-extensions-1.1.99 → ob-metaflow-extensions-1.1.100}/metaflow_extensions/outerbounds/toplevel/plugins/gcp/__init__.py +0 -0
- {ob-metaflow-extensions-1.1.99 → ob-metaflow-extensions-1.1.100}/metaflow_extensions/outerbounds/toplevel/plugins/kubernetes/__init__.py +0 -0
- {ob-metaflow-extensions-1.1.99 → ob-metaflow-extensions-1.1.100}/ob_metaflow_extensions.egg-info/SOURCES.txt +0 -0
- {ob-metaflow-extensions-1.1.99 → ob-metaflow-extensions-1.1.100}/ob_metaflow_extensions.egg-info/dependency_links.txt +0 -0
- {ob-metaflow-extensions-1.1.99 → ob-metaflow-extensions-1.1.100}/ob_metaflow_extensions.egg-info/requires.txt +0 -0
- {ob-metaflow-extensions-1.1.99 → ob-metaflow-extensions-1.1.100}/ob_metaflow_extensions.egg-info/top_level.txt +0 -0
- {ob-metaflow-extensions-1.1.99 → ob-metaflow-extensions-1.1.100}/setup.cfg +0 -0
|
@@ -13,6 +13,7 @@ import json
|
|
|
13
13
|
import sys
|
|
14
14
|
from tempfile import TemporaryDirectory
|
|
15
15
|
from subprocess import check_output, Popen
|
|
16
|
+
import subprocess
|
|
16
17
|
from datetime import datetime, timedelta
|
|
17
18
|
from functools import wraps
|
|
18
19
|
from collections import namedtuple
|
|
@@ -315,7 +316,23 @@ def _update_charts(results, md_dict):
|
|
|
315
316
|
|
|
316
317
|
# This code is adapted from: https://github.com/outerbounds/monitorbench
|
|
317
318
|
class GPUProfiler:
|
|
318
|
-
def __init__(
|
|
319
|
+
def __init__(
|
|
320
|
+
self, interval=1, monitor_batch_duration=200, artifact_name="gpu_profile_data"
|
|
321
|
+
):
|
|
322
|
+
self._interval = interval
|
|
323
|
+
self._monitor_batch_duration = monitor_batch_duration
|
|
324
|
+
self.artifact_name = artifact_name
|
|
325
|
+
self._card_setup_finished = False
|
|
326
|
+
self._card_comps = {
|
|
327
|
+
"max_utilization": {},
|
|
328
|
+
"charts": {},
|
|
329
|
+
"reading_duration": {},
|
|
330
|
+
"error_component": None,
|
|
331
|
+
}
|
|
332
|
+
self._monitor_started = False
|
|
333
|
+
self._start_monitor()
|
|
334
|
+
|
|
335
|
+
def _start_monitor(self):
|
|
319
336
|
self.driver_ver, self.cuda_ver, self.error = self._read_versions()
|
|
320
337
|
(
|
|
321
338
|
self.interconnect_data,
|
|
@@ -327,16 +344,14 @@ class GPUProfiler:
|
|
|
327
344
|
else:
|
|
328
345
|
self.devices = self._read_devices()
|
|
329
346
|
self._monitor = GPUMonitor(
|
|
330
|
-
interval=
|
|
347
|
+
interval=self._interval, duration=self._monitor_batch_duration
|
|
331
348
|
)
|
|
332
349
|
self._monitor_thread = threading.Thread(
|
|
333
350
|
target=self._monitor._monitor_update_thread, daemon=True
|
|
334
351
|
)
|
|
352
|
+
self.error = None
|
|
335
353
|
self._monitor_thread.start()
|
|
336
|
-
self.
|
|
337
|
-
|
|
338
|
-
self._card_comps = {"max_utilization": {}, "charts": {}, "reading_duration": {}}
|
|
339
|
-
self._card_created = False
|
|
354
|
+
self._monitor_started = True
|
|
340
355
|
|
|
341
356
|
def finish(self):
|
|
342
357
|
ret = {
|
|
@@ -384,21 +399,60 @@ class GPUProfiler:
|
|
|
384
399
|
return
|
|
385
400
|
|
|
386
401
|
while True:
|
|
402
|
+
# There is a possibility that the `monitor` thread is not started yet
|
|
403
|
+
# because it somehow crashed at the very start.
|
|
404
|
+
if not self._monitor_started:
|
|
405
|
+
self._start_monitor()
|
|
406
|
+
time.sleep(self._interval)
|
|
407
|
+
continue
|
|
408
|
+
|
|
387
409
|
readings = self._make_reading()
|
|
410
|
+
|
|
388
411
|
if readings is None:
|
|
389
412
|
print("GPU Profiler readings are none", file=sys.stderr)
|
|
390
413
|
time.sleep(self._interval)
|
|
391
414
|
continue
|
|
415
|
+
|
|
416
|
+
if not self._card_setup_finished:
|
|
417
|
+
self._setup_card()
|
|
418
|
+
time.sleep(self._interval)
|
|
419
|
+
continue
|
|
420
|
+
|
|
392
421
|
_update_utilization(readings, self._card_comps["max_utilization"])
|
|
393
422
|
_update_charts(readings, self._card_comps["charts"])
|
|
394
423
|
current.card["gpu_profile"].refresh()
|
|
395
424
|
time.sleep(self._interval)
|
|
396
425
|
|
|
397
|
-
def _setup_card(self
|
|
426
|
+
def _setup_card(self):
|
|
398
427
|
from metaflow import current
|
|
399
428
|
|
|
400
429
|
results = self._make_reading()
|
|
430
|
+
if "profile" not in results:
|
|
431
|
+
if self._card_comps["error_component"] is None:
|
|
432
|
+
self._card_comps["error_component"] = Markdown(
|
|
433
|
+
"## GPU profile failed: %s" % results["error"]
|
|
434
|
+
)
|
|
435
|
+
current.card["gpu_profile"].append(self._card_comps["error_component"])
|
|
436
|
+
else:
|
|
437
|
+
self._card_comps["error_component"].update(
|
|
438
|
+
Markdown("## GPU profile failed: %s" % results["error"])
|
|
439
|
+
)
|
|
440
|
+
current.card["gpu_profile"].refresh()
|
|
441
|
+
return
|
|
442
|
+
|
|
401
443
|
els = current.card["gpu_profile"]
|
|
444
|
+
self._card_comps["error_component"] = None
|
|
445
|
+
els.clear()
|
|
446
|
+
|
|
447
|
+
current.card["gpu_profile"].append(
|
|
448
|
+
Markdown("# GPU profile for `%s`" % current.pathspec)
|
|
449
|
+
)
|
|
450
|
+
current.card["gpu_profile"].append(
|
|
451
|
+
Markdown(
|
|
452
|
+
"_Started at: %s_"
|
|
453
|
+
% datetime.now().astimezone().strftime("%Y-%m-%dT%H:%M:%S %z")
|
|
454
|
+
)
|
|
455
|
+
)
|
|
402
456
|
|
|
403
457
|
def _drivers():
|
|
404
458
|
els.append(Markdown("## Drivers"))
|
|
@@ -444,7 +498,7 @@ class GPUProfiler:
|
|
|
444
498
|
Table(data=_rows, headers=["Device ID", "Max GPU %", "Max memory"])
|
|
445
499
|
)
|
|
446
500
|
els.append(
|
|
447
|
-
Markdown(f"Detailed data saved in an artifact `{artifact_name}`")
|
|
501
|
+
Markdown(f"Detailed data saved in an artifact `{self.artifact_name}`")
|
|
448
502
|
)
|
|
449
503
|
return rows
|
|
450
504
|
|
|
@@ -479,20 +533,36 @@ class GPUProfiler:
|
|
|
479
533
|
_interconnect()
|
|
480
534
|
self._card_comps["max_utilization"] = _utilization()
|
|
481
535
|
self._card_comps["charts"] = _plots()
|
|
536
|
+
self._card_setup_finished = True
|
|
482
537
|
|
|
483
538
|
def _read_versions(self):
|
|
484
539
|
def parse(r, s):
|
|
485
540
|
return r.search(s).group(1).strip().decode("utf-8")
|
|
486
541
|
|
|
487
542
|
try:
|
|
488
|
-
|
|
543
|
+
result = subprocess.run(
|
|
544
|
+
["nvidia-smi"],
|
|
545
|
+
check=True, # This will raise a CalledProcessError if the command fails
|
|
546
|
+
stdout=subprocess.PIPE, # Capture stdout
|
|
547
|
+
stderr=subprocess.PIPE, # Capture stderr
|
|
548
|
+
)
|
|
549
|
+
# Access the standard output
|
|
550
|
+
out = result.stdout
|
|
489
551
|
return parse(DRIVER_VER, out), parse(CUDA_VER, out), None
|
|
490
552
|
except FileNotFoundError:
|
|
491
553
|
return None, None, "nvidia-smi not found"
|
|
492
554
|
except AttributeError:
|
|
493
555
|
return None, None, "nvidia-smi output is unexpected"
|
|
494
|
-
except:
|
|
495
|
-
|
|
556
|
+
except subprocess.CalledProcessError as e:
|
|
557
|
+
_error_message = "nvidia-smi error (CalledProcessError calling nvidia-smi)"
|
|
558
|
+
if e.stderr is not None:
|
|
559
|
+
_error_message = (
|
|
560
|
+
"nvidia-smi error (CalledProcessError stderr) \n %s \n %s"
|
|
561
|
+
% (e.stderr.decode("utf-8"), e.stdout.decode("utf-8"))
|
|
562
|
+
)
|
|
563
|
+
return None, None, _error_message
|
|
564
|
+
except Exception as e:
|
|
565
|
+
return None, None, "nvidia-smi error (unknown error) \n%s" % str(e)
|
|
496
566
|
|
|
497
567
|
def _read_devices(self):
|
|
498
568
|
out = check_output(
|
|
@@ -572,7 +642,9 @@ class gpu_profile:
|
|
|
572
642
|
def __call__(self, f):
|
|
573
643
|
@wraps(f)
|
|
574
644
|
def func(s):
|
|
575
|
-
prof = GPUProfiler(
|
|
645
|
+
prof = GPUProfiler(
|
|
646
|
+
interval=self.interval, artifact_name=self.artifact_prefix + "data"
|
|
647
|
+
)
|
|
576
648
|
if self.include_artifacts:
|
|
577
649
|
setattr(s, self.artifact_prefix + "num_gpus", len(prof.devices))
|
|
578
650
|
|
|
@@ -585,7 +657,7 @@ class gpu_profile:
|
|
|
585
657
|
% datetime.now().astimezone().strftime("%Y-%m-%dT%H:%M:%S %z")
|
|
586
658
|
)
|
|
587
659
|
)
|
|
588
|
-
prof._setup_card(
|
|
660
|
+
prof._setup_card()
|
|
589
661
|
current.card["gpu_profile"].refresh()
|
|
590
662
|
update_thread = threading.Thread(target=prof._update_card, daemon=True)
|
|
591
663
|
update_thread.start()
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|