ob-metaflow-extensions 1.1.100rc0__tar.gz → 1.1.102__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of ob-metaflow-extensions might be problematic. Click here for more details.
- {ob-metaflow-extensions-1.1.100rc0 → ob-metaflow-extensions-1.1.102}/PKG-INFO +1 -1
- {ob-metaflow-extensions-1.1.100rc0 → ob-metaflow-extensions-1.1.102}/metaflow_extensions/outerbounds/plugins/fast_bakery/docker_environment.py +8 -6
- {ob-metaflow-extensions-1.1.100rc0 → ob-metaflow-extensions-1.1.102}/metaflow_extensions/outerbounds/profilers/gpu.py +54 -8
- {ob-metaflow-extensions-1.1.100rc0 → ob-metaflow-extensions-1.1.102}/ob_metaflow_extensions.egg-info/PKG-INFO +1 -1
- ob-metaflow-extensions-1.1.102/ob_metaflow_extensions.egg-info/requires.txt +3 -0
- {ob-metaflow-extensions-1.1.100rc0 → ob-metaflow-extensions-1.1.102}/setup.py +2 -2
- ob-metaflow-extensions-1.1.100rc0/ob_metaflow_extensions.egg-info/requires.txt +0 -3
- {ob-metaflow-extensions-1.1.100rc0 → ob-metaflow-extensions-1.1.102}/README.md +0 -0
- {ob-metaflow-extensions-1.1.100rc0 → ob-metaflow-extensions-1.1.102}/metaflow_extensions/outerbounds/__init__.py +0 -0
- {ob-metaflow-extensions-1.1.100rc0 → ob-metaflow-extensions-1.1.102}/metaflow_extensions/outerbounds/config/__init__.py +0 -0
- {ob-metaflow-extensions-1.1.100rc0 → ob-metaflow-extensions-1.1.102}/metaflow_extensions/outerbounds/plugins/__init__.py +0 -0
- {ob-metaflow-extensions-1.1.100rc0 → ob-metaflow-extensions-1.1.102}/metaflow_extensions/outerbounds/plugins/auth_server.py +0 -0
- {ob-metaflow-extensions-1.1.100rc0 → ob-metaflow-extensions-1.1.102}/metaflow_extensions/outerbounds/plugins/fast_bakery/__init__.py +0 -0
- {ob-metaflow-extensions-1.1.100rc0 → ob-metaflow-extensions-1.1.102}/metaflow_extensions/outerbounds/plugins/fast_bakery/fast_bakery.py +0 -0
- {ob-metaflow-extensions-1.1.100rc0 → ob-metaflow-extensions-1.1.102}/metaflow_extensions/outerbounds/plugins/fast_bakery/fast_bakery_cli.py +0 -0
- {ob-metaflow-extensions-1.1.100rc0 → ob-metaflow-extensions-1.1.102}/metaflow_extensions/outerbounds/plugins/fast_bakery/fast_bakery_decorator.py +0 -0
- {ob-metaflow-extensions-1.1.100rc0 → ob-metaflow-extensions-1.1.102}/metaflow_extensions/outerbounds/plugins/kubernetes/__init__.py +0 -0
- {ob-metaflow-extensions-1.1.100rc0 → ob-metaflow-extensions-1.1.102}/metaflow_extensions/outerbounds/plugins/kubernetes/kubernetes_client.py +0 -0
- {ob-metaflow-extensions-1.1.100rc0 → ob-metaflow-extensions-1.1.102}/metaflow_extensions/outerbounds/plugins/nim/__init__.py +0 -0
- {ob-metaflow-extensions-1.1.100rc0 → ob-metaflow-extensions-1.1.102}/metaflow_extensions/outerbounds/plugins/nim/nim_manager.py +0 -0
- {ob-metaflow-extensions-1.1.100rc0 → ob-metaflow-extensions-1.1.102}/metaflow_extensions/outerbounds/plugins/nvcf/__init__.py +0 -0
- {ob-metaflow-extensions-1.1.100rc0 → ob-metaflow-extensions-1.1.102}/metaflow_extensions/outerbounds/plugins/nvcf/heartbeat_store.py +0 -0
- {ob-metaflow-extensions-1.1.100rc0 → ob-metaflow-extensions-1.1.102}/metaflow_extensions/outerbounds/plugins/nvcf/nvcf.py +0 -0
- {ob-metaflow-extensions-1.1.100rc0 → ob-metaflow-extensions-1.1.102}/metaflow_extensions/outerbounds/plugins/nvcf/nvcf_cli.py +0 -0
- {ob-metaflow-extensions-1.1.100rc0 → ob-metaflow-extensions-1.1.102}/metaflow_extensions/outerbounds/plugins/nvcf/nvcf_decorator.py +0 -0
- {ob-metaflow-extensions-1.1.100rc0 → ob-metaflow-extensions-1.1.102}/metaflow_extensions/outerbounds/plugins/perimeters.py +0 -0
- {ob-metaflow-extensions-1.1.100rc0 → ob-metaflow-extensions-1.1.102}/metaflow_extensions/outerbounds/plugins/snowpark/__init__.py +0 -0
- {ob-metaflow-extensions-1.1.100rc0 → ob-metaflow-extensions-1.1.102}/metaflow_extensions/outerbounds/plugins/snowpark/snowpark.py +0 -0
- {ob-metaflow-extensions-1.1.100rc0 → ob-metaflow-extensions-1.1.102}/metaflow_extensions/outerbounds/plugins/snowpark/snowpark_cli.py +0 -0
- {ob-metaflow-extensions-1.1.100rc0 → ob-metaflow-extensions-1.1.102}/metaflow_extensions/outerbounds/plugins/snowpark/snowpark_client.py +0 -0
- {ob-metaflow-extensions-1.1.100rc0 → ob-metaflow-extensions-1.1.102}/metaflow_extensions/outerbounds/plugins/snowpark/snowpark_decorator.py +0 -0
- {ob-metaflow-extensions-1.1.100rc0 → ob-metaflow-extensions-1.1.102}/metaflow_extensions/outerbounds/plugins/snowpark/snowpark_exceptions.py +0 -0
- {ob-metaflow-extensions-1.1.100rc0 → ob-metaflow-extensions-1.1.102}/metaflow_extensions/outerbounds/plugins/snowpark/snowpark_job.py +0 -0
- {ob-metaflow-extensions-1.1.100rc0 → ob-metaflow-extensions-1.1.102}/metaflow_extensions/outerbounds/plugins/snowpark/snowpark_service_spec.py +0 -0
- {ob-metaflow-extensions-1.1.100rc0 → ob-metaflow-extensions-1.1.102}/metaflow_extensions/outerbounds/plugins/tensorboard/__init__.py +0 -0
- {ob-metaflow-extensions-1.1.100rc0 → ob-metaflow-extensions-1.1.102}/metaflow_extensions/outerbounds/profilers/__init__.py +0 -0
- {ob-metaflow-extensions-1.1.100rc0 → ob-metaflow-extensions-1.1.102}/metaflow_extensions/outerbounds/remote_config.py +0 -0
- {ob-metaflow-extensions-1.1.100rc0 → ob-metaflow-extensions-1.1.102}/metaflow_extensions/outerbounds/toplevel/__init__.py +0 -0
- {ob-metaflow-extensions-1.1.100rc0 → ob-metaflow-extensions-1.1.102}/metaflow_extensions/outerbounds/toplevel/global_aliases_for_metaflow_package.py +0 -0
- {ob-metaflow-extensions-1.1.100rc0 → ob-metaflow-extensions-1.1.102}/metaflow_extensions/outerbounds/toplevel/plugins/azure/__init__.py +0 -0
- {ob-metaflow-extensions-1.1.100rc0 → ob-metaflow-extensions-1.1.102}/metaflow_extensions/outerbounds/toplevel/plugins/gcp/__init__.py +0 -0
- {ob-metaflow-extensions-1.1.100rc0 → ob-metaflow-extensions-1.1.102}/metaflow_extensions/outerbounds/toplevel/plugins/kubernetes/__init__.py +0 -0
- {ob-metaflow-extensions-1.1.100rc0 → ob-metaflow-extensions-1.1.102}/ob_metaflow_extensions.egg-info/SOURCES.txt +0 -0
- {ob-metaflow-extensions-1.1.100rc0 → ob-metaflow-extensions-1.1.102}/ob_metaflow_extensions.egg-info/dependency_links.txt +0 -0
- {ob-metaflow-extensions-1.1.100rc0 → ob-metaflow-extensions-1.1.102}/ob_metaflow_extensions.egg-info/top_level.txt +0 -0
- {ob-metaflow-extensions-1.1.100rc0 → ob-metaflow-extensions-1.1.102}/setup.cfg +0 -0
|
@@ -319,15 +319,17 @@ class DockerEnvironment(MetaflowEnvironment):
|
|
|
319
319
|
config.append("--disable=F0401")
|
|
320
320
|
return config
|
|
321
321
|
|
|
322
|
+
def get_package_commands(self, codepackage_url, datastore_type):
|
|
323
|
+
# we must set the skip install flag at this stage in order to skip package downloads,
|
|
324
|
+
# doing so in bootstrap_commands is too late in the lifecycle.
|
|
325
|
+
return [
|
|
326
|
+
"export METAFLOW_SKIP_INSTALL_DEPENDENCIES=$FASTBAKERY_IMAGE",
|
|
327
|
+
] + super().get_package_commands(codepackage_url, datastore_type)
|
|
328
|
+
|
|
322
329
|
def bootstrap_commands(self, step_name, datastore_type):
|
|
323
330
|
if step_name in self.skipped_steps:
|
|
324
331
|
return self.delegate.bootstrap_commands(step_name, datastore_type)
|
|
325
|
-
|
|
326
|
-
# we set the environment flag for skipping bootstrap dependencies, as these are
|
|
327
|
-
# provided in all baked images.
|
|
328
|
-
return [
|
|
329
|
-
"export METAFLOW_SKIP_INSTALL_DEPENDENCIES=$FASTBAKERY_IMAGE",
|
|
330
|
-
] + super().bootstrap_commands(step_name, datastore_type)
|
|
332
|
+
return super().bootstrap_commands(step_name, datastore_type)
|
|
331
333
|
|
|
332
334
|
|
|
333
335
|
def get_fastbakery_metafile_path(local_root, flow_name):
|
|
@@ -13,6 +13,7 @@ import json
|
|
|
13
13
|
import sys
|
|
14
14
|
from tempfile import TemporaryDirectory
|
|
15
15
|
from subprocess import check_output, Popen
|
|
16
|
+
import subprocess
|
|
16
17
|
from datetime import datetime, timedelta
|
|
17
18
|
from functools import wraps
|
|
18
19
|
from collections import namedtuple
|
|
@@ -318,8 +319,20 @@ class GPUProfiler:
|
|
|
318
319
|
def __init__(
|
|
319
320
|
self, interval=1, monitor_batch_duration=200, artifact_name="gpu_profile_data"
|
|
320
321
|
):
|
|
322
|
+
self._interval = interval
|
|
323
|
+
self._monitor_batch_duration = monitor_batch_duration
|
|
321
324
|
self.artifact_name = artifact_name
|
|
322
325
|
self._card_setup_finished = False
|
|
326
|
+
self._card_comps = {
|
|
327
|
+
"max_utilization": {},
|
|
328
|
+
"charts": {},
|
|
329
|
+
"reading_duration": {},
|
|
330
|
+
"error_component": None,
|
|
331
|
+
}
|
|
332
|
+
self._monitor_started = False
|
|
333
|
+
self._start_monitor()
|
|
334
|
+
|
|
335
|
+
def _start_monitor(self):
|
|
323
336
|
self.driver_ver, self.cuda_ver, self.error = self._read_versions()
|
|
324
337
|
(
|
|
325
338
|
self.interconnect_data,
|
|
@@ -331,16 +344,14 @@ class GPUProfiler:
|
|
|
331
344
|
else:
|
|
332
345
|
self.devices = self._read_devices()
|
|
333
346
|
self._monitor = GPUMonitor(
|
|
334
|
-
interval=
|
|
347
|
+
interval=self._interval, duration=self._monitor_batch_duration
|
|
335
348
|
)
|
|
336
349
|
self._monitor_thread = threading.Thread(
|
|
337
350
|
target=self._monitor._monitor_update_thread, daemon=True
|
|
338
351
|
)
|
|
352
|
+
self.error = None
|
|
339
353
|
self._monitor_thread.start()
|
|
340
|
-
self.
|
|
341
|
-
|
|
342
|
-
self._card_comps = {"max_utilization": {}, "charts": {}, "reading_duration": {}}
|
|
343
|
-
self._card_created = False
|
|
354
|
+
self._monitor_started = True
|
|
344
355
|
|
|
345
356
|
def finish(self):
|
|
346
357
|
ret = {
|
|
@@ -388,11 +399,20 @@ class GPUProfiler:
|
|
|
388
399
|
return
|
|
389
400
|
|
|
390
401
|
while True:
|
|
402
|
+
# There is a possibility that the `monitor` thread is not started yet
|
|
403
|
+
# because it somehow crashed at the very start.
|
|
404
|
+
if not self._monitor_started:
|
|
405
|
+
self._start_monitor()
|
|
406
|
+
time.sleep(self._interval)
|
|
407
|
+
continue
|
|
408
|
+
|
|
391
409
|
readings = self._make_reading()
|
|
410
|
+
|
|
392
411
|
if readings is None:
|
|
393
412
|
print("GPU Profiler readings are none", file=sys.stderr)
|
|
394
413
|
time.sleep(self._interval)
|
|
395
414
|
continue
|
|
415
|
+
|
|
396
416
|
if not self._card_setup_finished:
|
|
397
417
|
self._setup_card()
|
|
398
418
|
time.sleep(self._interval)
|
|
@@ -408,9 +428,20 @@ class GPUProfiler:
|
|
|
408
428
|
|
|
409
429
|
results = self._make_reading()
|
|
410
430
|
if "profile" not in results:
|
|
431
|
+
if self._card_comps["error_component"] is None:
|
|
432
|
+
self._card_comps["error_component"] = Markdown(
|
|
433
|
+
"## GPU profile failed: %s" % results["error"]
|
|
434
|
+
)
|
|
435
|
+
current.card["gpu_profile"].append(self._card_comps["error_component"])
|
|
436
|
+
else:
|
|
437
|
+
self._card_comps["error_component"].update(
|
|
438
|
+
Markdown("## GPU profile failed: %s" % results["error"])
|
|
439
|
+
)
|
|
440
|
+
current.card["gpu_profile"].refresh()
|
|
411
441
|
return
|
|
412
442
|
|
|
413
443
|
els = current.card["gpu_profile"]
|
|
444
|
+
self._card_comps["error_component"] = None
|
|
414
445
|
els.clear()
|
|
415
446
|
|
|
416
447
|
current.card["gpu_profile"].append(
|
|
@@ -509,14 +540,29 @@ class GPUProfiler:
|
|
|
509
540
|
return r.search(s).group(1).strip().decode("utf-8")
|
|
510
541
|
|
|
511
542
|
try:
|
|
512
|
-
|
|
543
|
+
result = subprocess.run(
|
|
544
|
+
["nvidia-smi"],
|
|
545
|
+
check=True, # This will raise a CalledProcessError if the command fails
|
|
546
|
+
stdout=subprocess.PIPE, # Capture stdout
|
|
547
|
+
stderr=subprocess.PIPE, # Capture stderr
|
|
548
|
+
)
|
|
549
|
+
# Access the standard output
|
|
550
|
+
out = result.stdout
|
|
513
551
|
return parse(DRIVER_VER, out), parse(CUDA_VER, out), None
|
|
514
552
|
except FileNotFoundError:
|
|
515
553
|
return None, None, "nvidia-smi not found"
|
|
516
554
|
except AttributeError:
|
|
517
555
|
return None, None, "nvidia-smi output is unexpected"
|
|
518
|
-
except:
|
|
519
|
-
|
|
556
|
+
except subprocess.CalledProcessError as e:
|
|
557
|
+
_error_message = "nvidia-smi error (CalledProcessError calling nvidia-smi)"
|
|
558
|
+
if e.stderr is not None:
|
|
559
|
+
_error_message = (
|
|
560
|
+
"nvidia-smi error (CalledProcessError stderr) \n %s \n %s"
|
|
561
|
+
% (e.stderr.decode("utf-8"), e.stdout.decode("utf-8"))
|
|
562
|
+
)
|
|
563
|
+
return None, None, _error_message
|
|
564
|
+
except Exception as e:
|
|
565
|
+
return None, None, "nvidia-smi error (unknown error) \n%s" % str(e)
|
|
520
566
|
|
|
521
567
|
def _read_devices(self):
|
|
522
568
|
out = check_output(
|
|
@@ -2,7 +2,7 @@ from setuptools import setup, find_namespace_packages
|
|
|
2
2
|
from pathlib import Path
|
|
3
3
|
|
|
4
4
|
|
|
5
|
-
version = "1.1.100rc0"
|
|
5
|
+
version = "1.1.102"
|
|
6
6
|
this_directory = Path(__file__).parent
|
|
7
7
|
long_description = (this_directory / "README.md").read_text()
|
|
8
8
|
|
|
@@ -18,6 +18,6 @@ setup(
|
|
|
18
18
|
install_requires=[
|
|
19
19
|
"boto3",
|
|
20
20
|
"kubernetes",
|
|
21
|
-
"ob-metaflow == 2.12.
|
|
21
|
+
"ob-metaflow == 2.12.27.1",
|
|
22
22
|
],
|
|
23
23
|
)
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|