ob-metaflow-extensions 1.1.100rc0__tar.gz → 1.1.101__tar.gz

This diff compares the contents of two publicly available package versions as released to one of the supported registries. It is provided for informational purposes only and reflects the changes between those versions as they appear in their respective public registries.

Potentially problematic release.


This version of ob-metaflow-extensions might be problematic. Click here for more details.

Files changed (46)
  1. {ob-metaflow-extensions-1.1.100rc0 → ob-metaflow-extensions-1.1.101}/PKG-INFO +1 -1
  2. {ob-metaflow-extensions-1.1.100rc0 → ob-metaflow-extensions-1.1.101}/metaflow_extensions/outerbounds/plugins/fast_bakery/docker_environment.py +8 -6
  3. {ob-metaflow-extensions-1.1.100rc0 → ob-metaflow-extensions-1.1.101}/metaflow_extensions/outerbounds/profilers/gpu.py +54 -8
  4. {ob-metaflow-extensions-1.1.100rc0 → ob-metaflow-extensions-1.1.101}/ob_metaflow_extensions.egg-info/PKG-INFO +1 -1
  5. ob-metaflow-extensions-1.1.101/ob_metaflow_extensions.egg-info/requires.txt +3 -0
  6. {ob-metaflow-extensions-1.1.100rc0 → ob-metaflow-extensions-1.1.101}/setup.py +2 -2
  7. ob-metaflow-extensions-1.1.100rc0/ob_metaflow_extensions.egg-info/requires.txt +0 -3
  8. {ob-metaflow-extensions-1.1.100rc0 → ob-metaflow-extensions-1.1.101}/README.md +0 -0
  9. {ob-metaflow-extensions-1.1.100rc0 → ob-metaflow-extensions-1.1.101}/metaflow_extensions/outerbounds/__init__.py +0 -0
  10. {ob-metaflow-extensions-1.1.100rc0 → ob-metaflow-extensions-1.1.101}/metaflow_extensions/outerbounds/config/__init__.py +0 -0
  11. {ob-metaflow-extensions-1.1.100rc0 → ob-metaflow-extensions-1.1.101}/metaflow_extensions/outerbounds/plugins/__init__.py +0 -0
  12. {ob-metaflow-extensions-1.1.100rc0 → ob-metaflow-extensions-1.1.101}/metaflow_extensions/outerbounds/plugins/auth_server.py +0 -0
  13. {ob-metaflow-extensions-1.1.100rc0 → ob-metaflow-extensions-1.1.101}/metaflow_extensions/outerbounds/plugins/fast_bakery/__init__.py +0 -0
  14. {ob-metaflow-extensions-1.1.100rc0 → ob-metaflow-extensions-1.1.101}/metaflow_extensions/outerbounds/plugins/fast_bakery/fast_bakery.py +0 -0
  15. {ob-metaflow-extensions-1.1.100rc0 → ob-metaflow-extensions-1.1.101}/metaflow_extensions/outerbounds/plugins/fast_bakery/fast_bakery_cli.py +0 -0
  16. {ob-metaflow-extensions-1.1.100rc0 → ob-metaflow-extensions-1.1.101}/metaflow_extensions/outerbounds/plugins/fast_bakery/fast_bakery_decorator.py +0 -0
  17. {ob-metaflow-extensions-1.1.100rc0 → ob-metaflow-extensions-1.1.101}/metaflow_extensions/outerbounds/plugins/kubernetes/__init__.py +0 -0
  18. {ob-metaflow-extensions-1.1.100rc0 → ob-metaflow-extensions-1.1.101}/metaflow_extensions/outerbounds/plugins/kubernetes/kubernetes_client.py +0 -0
  19. {ob-metaflow-extensions-1.1.100rc0 → ob-metaflow-extensions-1.1.101}/metaflow_extensions/outerbounds/plugins/nim/__init__.py +0 -0
  20. {ob-metaflow-extensions-1.1.100rc0 → ob-metaflow-extensions-1.1.101}/metaflow_extensions/outerbounds/plugins/nim/nim_manager.py +0 -0
  21. {ob-metaflow-extensions-1.1.100rc0 → ob-metaflow-extensions-1.1.101}/metaflow_extensions/outerbounds/plugins/nvcf/__init__.py +0 -0
  22. {ob-metaflow-extensions-1.1.100rc0 → ob-metaflow-extensions-1.1.101}/metaflow_extensions/outerbounds/plugins/nvcf/heartbeat_store.py +0 -0
  23. {ob-metaflow-extensions-1.1.100rc0 → ob-metaflow-extensions-1.1.101}/metaflow_extensions/outerbounds/plugins/nvcf/nvcf.py +0 -0
  24. {ob-metaflow-extensions-1.1.100rc0 → ob-metaflow-extensions-1.1.101}/metaflow_extensions/outerbounds/plugins/nvcf/nvcf_cli.py +0 -0
  25. {ob-metaflow-extensions-1.1.100rc0 → ob-metaflow-extensions-1.1.101}/metaflow_extensions/outerbounds/plugins/nvcf/nvcf_decorator.py +0 -0
  26. {ob-metaflow-extensions-1.1.100rc0 → ob-metaflow-extensions-1.1.101}/metaflow_extensions/outerbounds/plugins/perimeters.py +0 -0
  27. {ob-metaflow-extensions-1.1.100rc0 → ob-metaflow-extensions-1.1.101}/metaflow_extensions/outerbounds/plugins/snowpark/__init__.py +0 -0
  28. {ob-metaflow-extensions-1.1.100rc0 → ob-metaflow-extensions-1.1.101}/metaflow_extensions/outerbounds/plugins/snowpark/snowpark.py +0 -0
  29. {ob-metaflow-extensions-1.1.100rc0 → ob-metaflow-extensions-1.1.101}/metaflow_extensions/outerbounds/plugins/snowpark/snowpark_cli.py +0 -0
  30. {ob-metaflow-extensions-1.1.100rc0 → ob-metaflow-extensions-1.1.101}/metaflow_extensions/outerbounds/plugins/snowpark/snowpark_client.py +0 -0
  31. {ob-metaflow-extensions-1.1.100rc0 → ob-metaflow-extensions-1.1.101}/metaflow_extensions/outerbounds/plugins/snowpark/snowpark_decorator.py +0 -0
  32. {ob-metaflow-extensions-1.1.100rc0 → ob-metaflow-extensions-1.1.101}/metaflow_extensions/outerbounds/plugins/snowpark/snowpark_exceptions.py +0 -0
  33. {ob-metaflow-extensions-1.1.100rc0 → ob-metaflow-extensions-1.1.101}/metaflow_extensions/outerbounds/plugins/snowpark/snowpark_job.py +0 -0
  34. {ob-metaflow-extensions-1.1.100rc0 → ob-metaflow-extensions-1.1.101}/metaflow_extensions/outerbounds/plugins/snowpark/snowpark_service_spec.py +0 -0
  35. {ob-metaflow-extensions-1.1.100rc0 → ob-metaflow-extensions-1.1.101}/metaflow_extensions/outerbounds/plugins/tensorboard/__init__.py +0 -0
  36. {ob-metaflow-extensions-1.1.100rc0 → ob-metaflow-extensions-1.1.101}/metaflow_extensions/outerbounds/profilers/__init__.py +0 -0
  37. {ob-metaflow-extensions-1.1.100rc0 → ob-metaflow-extensions-1.1.101}/metaflow_extensions/outerbounds/remote_config.py +0 -0
  38. {ob-metaflow-extensions-1.1.100rc0 → ob-metaflow-extensions-1.1.101}/metaflow_extensions/outerbounds/toplevel/__init__.py +0 -0
  39. {ob-metaflow-extensions-1.1.100rc0 → ob-metaflow-extensions-1.1.101}/metaflow_extensions/outerbounds/toplevel/global_aliases_for_metaflow_package.py +0 -0
  40. {ob-metaflow-extensions-1.1.100rc0 → ob-metaflow-extensions-1.1.101}/metaflow_extensions/outerbounds/toplevel/plugins/azure/__init__.py +0 -0
  41. {ob-metaflow-extensions-1.1.100rc0 → ob-metaflow-extensions-1.1.101}/metaflow_extensions/outerbounds/toplevel/plugins/gcp/__init__.py +0 -0
  42. {ob-metaflow-extensions-1.1.100rc0 → ob-metaflow-extensions-1.1.101}/metaflow_extensions/outerbounds/toplevel/plugins/kubernetes/__init__.py +0 -0
  43. {ob-metaflow-extensions-1.1.100rc0 → ob-metaflow-extensions-1.1.101}/ob_metaflow_extensions.egg-info/SOURCES.txt +0 -0
  44. {ob-metaflow-extensions-1.1.100rc0 → ob-metaflow-extensions-1.1.101}/ob_metaflow_extensions.egg-info/dependency_links.txt +0 -0
  45. {ob-metaflow-extensions-1.1.100rc0 → ob-metaflow-extensions-1.1.101}/ob_metaflow_extensions.egg-info/top_level.txt +0 -0
  46. {ob-metaflow-extensions-1.1.100rc0 → ob-metaflow-extensions-1.1.101}/setup.cfg +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: ob-metaflow-extensions
3
- Version: 1.1.100rc0
3
+ Version: 1.1.101
4
4
  Summary: Outerbounds Platform Extensions for Metaflow
5
5
  Author: Outerbounds, Inc.
6
6
  License: Commercial
@@ -319,15 +319,17 @@ class DockerEnvironment(MetaflowEnvironment):
319
319
  config.append("--disable=F0401")
320
320
  return config
321
321
 
322
+ def get_package_commands(self, codepackage_url, datastore_type):
323
+ # we must set the skip install flag at this stage in order to skip package downloads,
324
+ # doing so in bootstrap_commands is too late in the lifecycle.
325
+ return [
326
+ "export METAFLOW_SKIP_INSTALL_DEPENDENCIES=$FASTBAKERY_IMAGE",
327
+ ] + super().get_package_commands(codepackage_url, datastore_type)
328
+
322
329
  def bootstrap_commands(self, step_name, datastore_type):
323
330
  if step_name in self.skipped_steps:
324
331
  return self.delegate.bootstrap_commands(step_name, datastore_type)
325
- # Bootstrap conda and execution environment for step
326
- # we set the environment flag for skipping bootstrap dependencies, as these are
327
- # provided in all baked images.
328
- return [
329
- "export METAFLOW_SKIP_INSTALL_DEPENDENCIES=$FASTBAKERY_IMAGE",
330
- ] + super().bootstrap_commands(step_name, datastore_type)
332
+ return super().bootstrap_commands(step_name, datastore_type)
331
333
 
332
334
 
333
335
  def get_fastbakery_metafile_path(local_root, flow_name):
@@ -13,6 +13,7 @@ import json
13
13
  import sys
14
14
  from tempfile import TemporaryDirectory
15
15
  from subprocess import check_output, Popen
16
+ import subprocess
16
17
  from datetime import datetime, timedelta
17
18
  from functools import wraps
18
19
  from collections import namedtuple
@@ -318,8 +319,20 @@ class GPUProfiler:
318
319
  def __init__(
319
320
  self, interval=1, monitor_batch_duration=200, artifact_name="gpu_profile_data"
320
321
  ):
322
+ self._interval = interval
323
+ self._monitor_batch_duration = monitor_batch_duration
321
324
  self.artifact_name = artifact_name
322
325
  self._card_setup_finished = False
326
+ self._card_comps = {
327
+ "max_utilization": {},
328
+ "charts": {},
329
+ "reading_duration": {},
330
+ "error_component": None,
331
+ }
332
+ self._monitor_started = False
333
+ self._start_monitor()
334
+
335
+ def _start_monitor(self):
323
336
  self.driver_ver, self.cuda_ver, self.error = self._read_versions()
324
337
  (
325
338
  self.interconnect_data,
@@ -331,16 +344,14 @@ class GPUProfiler:
331
344
  else:
332
345
  self.devices = self._read_devices()
333
346
  self._monitor = GPUMonitor(
334
- interval=interval, duration=monitor_batch_duration
347
+ interval=self._interval, duration=self._monitor_batch_duration
335
348
  )
336
349
  self._monitor_thread = threading.Thread(
337
350
  target=self._monitor._monitor_update_thread, daemon=True
338
351
  )
352
+ self.error = None
339
353
  self._monitor_thread.start()
340
- self._interval = interval
341
-
342
- self._card_comps = {"max_utilization": {}, "charts": {}, "reading_duration": {}}
343
- self._card_created = False
354
+ self._monitor_started = True
344
355
 
345
356
  def finish(self):
346
357
  ret = {
@@ -388,11 +399,20 @@ class GPUProfiler:
388
399
  return
389
400
 
390
401
  while True:
402
+ # There is a possibility that the `monitor` thread is not started yet
403
+ # because it somehow crashed at the very start.
404
+ if not self._monitor_started:
405
+ self._start_monitor()
406
+ time.sleep(self._interval)
407
+ continue
408
+
391
409
  readings = self._make_reading()
410
+
392
411
  if readings is None:
393
412
  print("GPU Profiler readings are none", file=sys.stderr)
394
413
  time.sleep(self._interval)
395
414
  continue
415
+
396
416
  if not self._card_setup_finished:
397
417
  self._setup_card()
398
418
  time.sleep(self._interval)
@@ -408,9 +428,20 @@ class GPUProfiler:
408
428
 
409
429
  results = self._make_reading()
410
430
  if "profile" not in results:
431
+ if self._card_comps["error_component"] is None:
432
+ self._card_comps["error_component"] = Markdown(
433
+ "## GPU profile failed: %s" % results["error"]
434
+ )
435
+ current.card["gpu_profile"].append(self._card_comps["error_component"])
436
+ else:
437
+ self._card_comps["error_component"].update(
438
+ Markdown("## GPU profile failed: %s" % results["error"])
439
+ )
440
+ current.card["gpu_profile"].refresh()
411
441
  return
412
442
 
413
443
  els = current.card["gpu_profile"]
444
+ self._card_comps["error_component"] = None
414
445
  els.clear()
415
446
 
416
447
  current.card["gpu_profile"].append(
@@ -509,14 +540,29 @@ class GPUProfiler:
509
540
  return r.search(s).group(1).strip().decode("utf-8")
510
541
 
511
542
  try:
512
- out = check_output(["nvidia-smi"])
543
+ result = subprocess.run(
544
+ ["nvidia-smi"],
545
+ check=True, # This will raise a CalledProcessError if the command fails
546
+ stdout=subprocess.PIPE, # Capture stdout
547
+ stderr=subprocess.PIPE, # Capture stderr
548
+ )
549
+ # Access the standard output
550
+ out = result.stdout
513
551
  return parse(DRIVER_VER, out), parse(CUDA_VER, out), None
514
552
  except FileNotFoundError:
515
553
  return None, None, "nvidia-smi not found"
516
554
  except AttributeError:
517
555
  return None, None, "nvidia-smi output is unexpected"
518
- except:
519
- return None, None, "nvidia-smi error"
556
+ except subprocess.CalledProcessError as e:
557
+ _error_message = "nvidia-smi error (CalledProcessError calling nvidia-smi)"
558
+ if e.stderr is not None:
559
+ _error_message = (
560
+ "nvidia-smi error (CalledProcessError stderr) \n %s \n %s"
561
+ % (e.stderr.decode("utf-8"), e.stdout.decode("utf-8"))
562
+ )
563
+ return None, None, _error_message
564
+ except Exception as e:
565
+ return None, None, "nvidia-smi error (unknown error) \n%s" % str(e)
520
566
 
521
567
  def _read_devices(self):
522
568
  out = check_output(
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: ob-metaflow-extensions
3
- Version: 1.1.100rc0
3
+ Version: 1.1.101
4
4
  Summary: Outerbounds Platform Extensions for Metaflow
5
5
  Author: Outerbounds, Inc.
6
6
  License: Commercial
@@ -0,0 +1,3 @@
1
+ boto3
2
+ kubernetes
3
+ ob-metaflow==2.12.26.1
@@ -2,7 +2,7 @@ from setuptools import setup, find_namespace_packages
2
2
  from pathlib import Path
3
3
 
4
4
 
5
- version = "1.1.100rc0"
5
+ version = "1.1.101"
6
6
  this_directory = Path(__file__).parent
7
7
  long_description = (this_directory / "README.md").read_text()
8
8
 
@@ -18,6 +18,6 @@ setup(
18
18
  install_requires=[
19
19
  "boto3",
20
20
  "kubernetes",
21
- "ob-metaflow == 2.12.25.2",
21
+ "ob-metaflow == 2.12.26.1",
22
22
  ],
23
23
  )
@@ -1,3 +0,0 @@
1
- boto3
2
- kubernetes
3
- ob-metaflow==2.12.25.2