ob-metaflow-extensions 1.1.45rc3__py2.py3-none-any.whl → 1.5.1__py2.py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of ob-metaflow-extensions might be problematic; the original page links to more details.

Files changed (128)
  1. metaflow_extensions/outerbounds/__init__.py +1 -7
  2. metaflow_extensions/outerbounds/config/__init__.py +35 -0
  3. metaflow_extensions/outerbounds/plugins/__init__.py +186 -57
  4. metaflow_extensions/outerbounds/plugins/apps/__init__.py +0 -0
  5. metaflow_extensions/outerbounds/plugins/apps/app_cli.py +0 -0
  6. metaflow_extensions/outerbounds/plugins/apps/app_utils.py +187 -0
  7. metaflow_extensions/outerbounds/plugins/apps/consts.py +3 -0
  8. metaflow_extensions/outerbounds/plugins/apps/core/__init__.py +15 -0
  9. metaflow_extensions/outerbounds/plugins/apps/core/_state_machine.py +506 -0
  10. metaflow_extensions/outerbounds/plugins/apps/core/_vendor/__init__.py +0 -0
  11. metaflow_extensions/outerbounds/plugins/apps/core/_vendor/spinner/__init__.py +4 -0
  12. metaflow_extensions/outerbounds/plugins/apps/core/_vendor/spinner/spinners.py +478 -0
  13. metaflow_extensions/outerbounds/plugins/apps/core/app_config.py +128 -0
  14. metaflow_extensions/outerbounds/plugins/apps/core/app_deploy_decorator.py +330 -0
  15. metaflow_extensions/outerbounds/plugins/apps/core/artifacts.py +0 -0
  16. metaflow_extensions/outerbounds/plugins/apps/core/capsule.py +958 -0
  17. metaflow_extensions/outerbounds/plugins/apps/core/click_importer.py +24 -0
  18. metaflow_extensions/outerbounds/plugins/apps/core/code_package/__init__.py +3 -0
  19. metaflow_extensions/outerbounds/plugins/apps/core/code_package/code_packager.py +618 -0
  20. metaflow_extensions/outerbounds/plugins/apps/core/code_package/examples.py +125 -0
  21. metaflow_extensions/outerbounds/plugins/apps/core/config/__init__.py +15 -0
  22. metaflow_extensions/outerbounds/plugins/apps/core/config/cli_generator.py +165 -0
  23. metaflow_extensions/outerbounds/plugins/apps/core/config/config_utils.py +966 -0
  24. metaflow_extensions/outerbounds/plugins/apps/core/config/schema_export.py +299 -0
  25. metaflow_extensions/outerbounds/plugins/apps/core/config/typed_configs.py +233 -0
  26. metaflow_extensions/outerbounds/plugins/apps/core/config/typed_init_generator.py +537 -0
  27. metaflow_extensions/outerbounds/plugins/apps/core/config/unified_config.py +1125 -0
  28. metaflow_extensions/outerbounds/plugins/apps/core/config_schema.yaml +337 -0
  29. metaflow_extensions/outerbounds/plugins/apps/core/dependencies.py +115 -0
  30. metaflow_extensions/outerbounds/plugins/apps/core/deployer.py +959 -0
  31. metaflow_extensions/outerbounds/plugins/apps/core/experimental/__init__.py +89 -0
  32. metaflow_extensions/outerbounds/plugins/apps/core/perimeters.py +87 -0
  33. metaflow_extensions/outerbounds/plugins/apps/core/secrets.py +164 -0
  34. metaflow_extensions/outerbounds/plugins/apps/core/utils.py +233 -0
  35. metaflow_extensions/outerbounds/plugins/apps/core/validations.py +17 -0
  36. metaflow_extensions/outerbounds/plugins/apps/deploy_decorator.py +201 -0
  37. metaflow_extensions/outerbounds/plugins/apps/supervisord_utils.py +243 -0
  38. metaflow_extensions/outerbounds/plugins/auth_server.py +28 -8
  39. metaflow_extensions/outerbounds/plugins/aws/__init__.py +4 -0
  40. metaflow_extensions/outerbounds/plugins/aws/assume_role.py +3 -0
  41. metaflow_extensions/outerbounds/plugins/aws/assume_role_decorator.py +118 -0
  42. metaflow_extensions/outerbounds/plugins/card_utilities/__init__.py +0 -0
  43. metaflow_extensions/outerbounds/plugins/card_utilities/async_cards.py +142 -0
  44. metaflow_extensions/outerbounds/plugins/card_utilities/extra_components.py +545 -0
  45. metaflow_extensions/outerbounds/plugins/card_utilities/injector.py +70 -0
  46. metaflow_extensions/outerbounds/plugins/checkpoint_datastores/__init__.py +2 -0
  47. metaflow_extensions/outerbounds/plugins/checkpoint_datastores/coreweave.py +71 -0
  48. metaflow_extensions/outerbounds/plugins/checkpoint_datastores/external_chckpt.py +85 -0
  49. metaflow_extensions/outerbounds/plugins/checkpoint_datastores/nebius.py +73 -0
  50. metaflow_extensions/outerbounds/plugins/fast_bakery/__init__.py +0 -0
  51. metaflow_extensions/outerbounds/plugins/fast_bakery/baker.py +110 -0
  52. metaflow_extensions/outerbounds/plugins/fast_bakery/docker_environment.py +391 -0
  53. metaflow_extensions/outerbounds/plugins/fast_bakery/fast_bakery.py +188 -0
  54. metaflow_extensions/outerbounds/plugins/fast_bakery/fast_bakery_cli.py +54 -0
  55. metaflow_extensions/outerbounds/plugins/fast_bakery/fast_bakery_decorator.py +50 -0
  56. metaflow_extensions/outerbounds/plugins/kubernetes/kubernetes_client.py +79 -0
  57. metaflow_extensions/outerbounds/plugins/kubernetes/pod_killer.py +374 -0
  58. metaflow_extensions/outerbounds/plugins/nim/card.py +140 -0
  59. metaflow_extensions/outerbounds/plugins/nim/nim_decorator.py +101 -0
  60. metaflow_extensions/outerbounds/plugins/nim/nim_manager.py +379 -0
  61. metaflow_extensions/outerbounds/plugins/nim/utils.py +36 -0
  62. metaflow_extensions/outerbounds/plugins/nvcf/__init__.py +0 -0
  63. metaflow_extensions/outerbounds/plugins/nvcf/constants.py +3 -0
  64. metaflow_extensions/outerbounds/plugins/nvcf/exceptions.py +94 -0
  65. metaflow_extensions/outerbounds/plugins/nvcf/heartbeat_store.py +178 -0
  66. metaflow_extensions/outerbounds/plugins/nvcf/nvcf.py +417 -0
  67. metaflow_extensions/outerbounds/plugins/nvcf/nvcf_cli.py +280 -0
  68. metaflow_extensions/outerbounds/plugins/nvcf/nvcf_decorator.py +242 -0
  69. metaflow_extensions/outerbounds/plugins/nvcf/utils.py +6 -0
  70. metaflow_extensions/outerbounds/plugins/nvct/__init__.py +0 -0
  71. metaflow_extensions/outerbounds/plugins/nvct/exceptions.py +71 -0
  72. metaflow_extensions/outerbounds/plugins/nvct/nvct.py +131 -0
  73. metaflow_extensions/outerbounds/plugins/nvct/nvct_cli.py +289 -0
  74. metaflow_extensions/outerbounds/plugins/nvct/nvct_decorator.py +286 -0
  75. metaflow_extensions/outerbounds/plugins/nvct/nvct_runner.py +218 -0
  76. metaflow_extensions/outerbounds/plugins/nvct/utils.py +29 -0
  77. metaflow_extensions/outerbounds/plugins/ollama/__init__.py +225 -0
  78. metaflow_extensions/outerbounds/plugins/ollama/constants.py +1 -0
  79. metaflow_extensions/outerbounds/plugins/ollama/exceptions.py +22 -0
  80. metaflow_extensions/outerbounds/plugins/ollama/ollama.py +1924 -0
  81. metaflow_extensions/outerbounds/plugins/ollama/status_card.py +292 -0
  82. metaflow_extensions/outerbounds/plugins/optuna/__init__.py +48 -0
  83. metaflow_extensions/outerbounds/plugins/perimeters.py +19 -5
  84. metaflow_extensions/outerbounds/plugins/profilers/deco_injector.py +70 -0
  85. metaflow_extensions/outerbounds/plugins/profilers/gpu_profile_decorator.py +88 -0
  86. metaflow_extensions/outerbounds/plugins/profilers/simple_card_decorator.py +96 -0
  87. metaflow_extensions/outerbounds/plugins/s3_proxy/__init__.py +7 -0
  88. metaflow_extensions/outerbounds/plugins/s3_proxy/binary_caller.py +132 -0
  89. metaflow_extensions/outerbounds/plugins/s3_proxy/constants.py +11 -0
  90. metaflow_extensions/outerbounds/plugins/s3_proxy/exceptions.py +13 -0
  91. metaflow_extensions/outerbounds/plugins/s3_proxy/proxy_bootstrap.py +59 -0
  92. metaflow_extensions/outerbounds/plugins/s3_proxy/s3_proxy_api.py +93 -0
  93. metaflow_extensions/outerbounds/plugins/s3_proxy/s3_proxy_decorator.py +250 -0
  94. metaflow_extensions/outerbounds/plugins/s3_proxy/s3_proxy_manager.py +225 -0
  95. metaflow_extensions/outerbounds/plugins/secrets/__init__.py +0 -0
  96. metaflow_extensions/outerbounds/plugins/secrets/secrets.py +204 -0
  97. metaflow_extensions/outerbounds/plugins/snowflake/__init__.py +3 -0
  98. metaflow_extensions/outerbounds/plugins/snowflake/snowflake.py +378 -0
  99. metaflow_extensions/outerbounds/plugins/snowpark/__init__.py +0 -0
  100. metaflow_extensions/outerbounds/plugins/snowpark/snowpark.py +309 -0
  101. metaflow_extensions/outerbounds/plugins/snowpark/snowpark_cli.py +277 -0
  102. metaflow_extensions/outerbounds/plugins/snowpark/snowpark_client.py +150 -0
  103. metaflow_extensions/outerbounds/plugins/snowpark/snowpark_decorator.py +273 -0
  104. metaflow_extensions/outerbounds/plugins/snowpark/snowpark_exceptions.py +13 -0
  105. metaflow_extensions/outerbounds/plugins/snowpark/snowpark_job.py +241 -0
  106. metaflow_extensions/outerbounds/plugins/snowpark/snowpark_service_spec.py +259 -0
  107. metaflow_extensions/outerbounds/plugins/tensorboard/__init__.py +50 -0
  108. metaflow_extensions/outerbounds/plugins/torchtune/__init__.py +163 -0
  109. metaflow_extensions/outerbounds/plugins/vllm/__init__.py +255 -0
  110. metaflow_extensions/outerbounds/plugins/vllm/constants.py +1 -0
  111. metaflow_extensions/outerbounds/plugins/vllm/exceptions.py +1 -0
  112. metaflow_extensions/outerbounds/plugins/vllm/status_card.py +352 -0
  113. metaflow_extensions/outerbounds/plugins/vllm/vllm_manager.py +621 -0
  114. metaflow_extensions/outerbounds/profilers/gpu.py +131 -47
  115. metaflow_extensions/outerbounds/remote_config.py +53 -16
  116. metaflow_extensions/outerbounds/toplevel/global_aliases_for_metaflow_package.py +138 -2
  117. metaflow_extensions/outerbounds/toplevel/ob_internal.py +4 -0
  118. metaflow_extensions/outerbounds/toplevel/plugins/ollama/__init__.py +1 -0
  119. metaflow_extensions/outerbounds/toplevel/plugins/optuna/__init__.py +1 -0
  120. metaflow_extensions/outerbounds/toplevel/plugins/snowflake/__init__.py +1 -0
  121. metaflow_extensions/outerbounds/toplevel/plugins/torchtune/__init__.py +1 -0
  122. metaflow_extensions/outerbounds/toplevel/plugins/vllm/__init__.py +1 -0
  123. metaflow_extensions/outerbounds/toplevel/s3_proxy.py +88 -0
  124. {ob_metaflow_extensions-1.1.45rc3.dist-info → ob_metaflow_extensions-1.5.1.dist-info}/METADATA +2 -2
  125. ob_metaflow_extensions-1.5.1.dist-info/RECORD +133 -0
  126. ob_metaflow_extensions-1.1.45rc3.dist-info/RECORD +0 -19
  127. {ob_metaflow_extensions-1.1.45rc3.dist-info → ob_metaflow_extensions-1.5.1.dist-info}/WHEEL +0 -0
  128. {ob_metaflow_extensions-1.1.45rc3.dist-info → ob_metaflow_extensions-1.5.1.dist-info}/top_level.txt +0 -0
@@ -13,6 +13,7 @@ import json
13
13
  import sys
14
14
  from tempfile import TemporaryDirectory
15
15
  from subprocess import check_output, Popen
16
+ import subprocess
16
17
  from datetime import datetime, timedelta
17
18
  from functools import wraps
18
19
  from collections import namedtuple
@@ -186,6 +187,9 @@ class GPUMonitor:
186
187
  all_readings = []
187
188
  if self._current_file is None:
188
189
  return None
190
+
191
+ if not os.path.exists(self._current_file):
192
+ return None
189
193
  # Extract everything from the CVS File and store it in a list of dictionaries
190
194
  all_fields = ["gpu_id"] + MONITOR_FIELDS
191
195
  with open(self._current_file, "r") as _monitor_out:
@@ -315,28 +319,48 @@ def _update_charts(results, md_dict):
315
319
 
316
320
  # This code is adapted from: https://github.com/outerbounds/monitorbench
317
321
  class GPUProfiler:
318
- def __init__(self, interval=1, monitor_batch_duration=200):
322
+ def __init__(
323
+ self,
324
+ interval=1,
325
+ monitor_batch_duration=200,
326
+ artifact_name="gpu_profile_data",
327
+ max_check_timeout=60,
328
+ ):
329
+ self._interval = interval
330
+ self.max_check_timeout = max_check_timeout
331
+ self._monitor_batch_duration = monitor_batch_duration
332
+ self.artifact_name = artifact_name
333
+ self._started_at = datetime.now().astimezone().strftime("%Y-%m-%dT%H:%M:%S %z")
334
+ self._card_setup_finished = False
335
+ self._card_comps = {
336
+ "max_utilization": {},
337
+ "charts": {},
338
+ "reading_duration": {},
339
+ "error_component": None,
340
+ }
341
+ self._monitor_started = False
342
+ self._start_monitor()
343
+
344
+ def _start_monitor(self):
319
345
  self.driver_ver, self.cuda_ver, self.error = self._read_versions()
320
- (
321
- self.interconnect_data,
322
- self.interconnect_legend,
323
- ) = self._read_multi_gpu_interconnect()
324
346
  if self.error:
325
347
  self.devices = []
326
348
  return
327
349
  else:
350
+ (
351
+ self.interconnect_data,
352
+ self.interconnect_legend,
353
+ ) = self._read_multi_gpu_interconnect()
328
354
  self.devices = self._read_devices()
329
355
  self._monitor = GPUMonitor(
330
- interval=interval, duration=monitor_batch_duration
356
+ interval=self._interval, duration=self._monitor_batch_duration
331
357
  )
332
358
  self._monitor_thread = threading.Thread(
333
359
  target=self._monitor._monitor_update_thread, daemon=True
334
360
  )
361
+ self.error = None
335
362
  self._monitor_thread.start()
336
- self._interval = interval
337
-
338
- self._card_comps = {"max_utilization": {}, "charts": {}, "reading_duration": {}}
339
- self._card_created = False
363
+ self._monitor_started = True
340
364
 
341
365
  def finish(self):
342
366
  ret = {
@@ -376,6 +400,12 @@ class GPUProfiler:
376
400
  def _update_card(self):
377
401
  if len(self.devices) == 0:
378
402
  current.card["gpu_profile"].clear()
403
+ current.card["gpu_profile"].append(
404
+ Markdown("# GPU profile for `%s`" % current.pathspec)
405
+ )
406
+ current.card["gpu_profile"].append(
407
+ Markdown("_Started at: %s_" % self._started_at)
408
+ )
379
409
  current.card["gpu_profile"].append(
380
410
  Markdown("## GPU profile failed: %s" % self.error)
381
411
  )
@@ -383,22 +413,84 @@ class GPUProfiler:
383
413
 
384
414
  return
385
415
 
416
+ _check_time = 0
417
+ stop_checking = False
418
+ # Before writing anything to the card, we need to make sure that:
419
+ # 1. GPU Monitor has started.
420
+ # 2. Monitor can record readings
421
+ # 3. Card is setup
386
422
  while True:
423
+
424
+ if stop_checking:
425
+ time.sleep(self._interval)
426
+ continue
427
+
428
+ # There is a possibility that the `monitor` thread is not started yet
429
+ # because it somehow crashed at the very start.
430
+ if not self._monitor_started and _check_time > self.max_check_timeout:
431
+ current.card["gpu_profile"].clear()
432
+ current.card["gpu_profile"].append(
433
+ Markdown("## GPU profile failed: %s" % self.error)
434
+ )
435
+ current.card["gpu_profile"].refresh()
436
+ stop_checking = True
437
+
438
+ # Try restarting monitor if it hasn't started yet
439
+ if not self._monitor_started:
440
+ self._start_monitor()
441
+ _check_time += self._interval
442
+ time.sleep(self._interval)
443
+ continue
444
+
445
+ # Ensure that we are getting well formatted readings
387
446
  readings = self._make_reading()
447
+
388
448
  if readings is None:
389
449
  print("GPU Profiler readings are none", file=sys.stderr)
390
450
  time.sleep(self._interval)
391
451
  continue
452
+
453
+ # ensure that the card is setup
454
+ if not self._card_setup_finished:
455
+ self._setup_card()
456
+ time.sleep(self._interval)
457
+ continue
458
+
392
459
  _update_utilization(readings, self._card_comps["max_utilization"])
393
460
  _update_charts(readings, self._card_comps["charts"])
394
461
  current.card["gpu_profile"].refresh()
395
462
  time.sleep(self._interval)
396
463
 
397
- def _setup_card(self, artifact_name):
464
+ def _setup_card(self):
398
465
  from metaflow import current
399
466
 
400
467
  results = self._make_reading()
468
+ if "profile" not in results:
469
+ if self._card_comps["error_component"] is None:
470
+ self._card_comps["error_component"] = Markdown(
471
+ "## GPU profile failed: %s" % results["error"]
472
+ )
473
+ current.card["gpu_profile"].append(self._card_comps["error_component"])
474
+ else:
475
+ self._card_comps["error_component"].update(
476
+ Markdown("## GPU profile failed: %s" % results["error"])
477
+ )
478
+ current.card["gpu_profile"].refresh()
479
+ return
480
+
401
481
  els = current.card["gpu_profile"]
482
+ self._card_comps["error_component"] = None
483
+ els.clear()
484
+
485
+ current.card["gpu_profile"].append(
486
+ Markdown("# GPU profile for `%s`" % current.pathspec)
487
+ )
488
+ current.card["gpu_profile"].append(
489
+ Markdown(
490
+ "_Started at: %s_"
491
+ % datetime.now().astimezone().strftime("%Y-%m-%dT%H:%M:%S %z")
492
+ )
493
+ )
402
494
 
403
495
  def _drivers():
404
496
  els.append(Markdown("## Drivers"))
@@ -444,7 +536,7 @@ class GPUProfiler:
444
536
  Table(data=_rows, headers=["Device ID", "Max GPU %", "Max memory"])
445
537
  )
446
538
  els.append(
447
- Markdown(f"Detailed data saved in an artifact `{artifact_name}`")
539
+ Markdown(f"Detailed data saved in an artifact `{self.artifact_name}`")
448
540
  )
449
541
  return rows
450
542
 
@@ -479,20 +571,36 @@ class GPUProfiler:
479
571
  _interconnect()
480
572
  self._card_comps["max_utilization"] = _utilization()
481
573
  self._card_comps["charts"] = _plots()
574
+ self._card_setup_finished = True
482
575
 
483
576
  def _read_versions(self):
484
577
  def parse(r, s):
485
578
  return r.search(s).group(1).strip().decode("utf-8")
486
579
 
487
580
  try:
488
- out = check_output(["nvidia-smi"])
581
+ result = subprocess.run(
582
+ ["nvidia-smi"],
583
+ check=True, # This will raise a CalledProcessError if the command fails
584
+ stdout=subprocess.PIPE, # Capture stdout
585
+ stderr=subprocess.PIPE, # Capture stderr
586
+ )
587
+ # Access the standard output
588
+ out = result.stdout
489
589
  return parse(DRIVER_VER, out), parse(CUDA_VER, out), None
490
590
  except FileNotFoundError:
491
591
  return None, None, "nvidia-smi not found"
492
592
  except AttributeError:
493
593
  return None, None, "nvidia-smi output is unexpected"
494
- except:
495
- return None, None, "nvidia-smi error"
594
+ except subprocess.CalledProcessError as e:
595
+ _error_message = "nvidia-smi error (CalledProcessError calling nvidia-smi)"
596
+ if e.stderr is not None:
597
+ _error_message = (
598
+ "nvidia-smi error (CalledProcessError stderr) \n %s \n %s"
599
+ % (e.stderr.decode("utf-8"), e.stdout.decode("utf-8"))
600
+ )
601
+ return None, None, _error_message
602
+ except Exception as e:
603
+ return None, None, "nvidia-smi error (unknown error) \n%s" % str(e)
496
604
 
497
605
  def _read_devices(self):
498
606
  out = check_output(
@@ -572,39 +680,15 @@ class gpu_profile:
572
680
  def __call__(self, f):
573
681
  @wraps(f)
574
682
  def func(s):
575
- prof = GPUProfiler(interval=self.interval)
576
- if self.include_artifacts:
577
- setattr(s, self.artifact_prefix + "num_gpus", len(prof.devices))
683
+ return f(s)
578
684
 
579
- current.card["gpu_profile"].append(
580
- Markdown("# GPU profile for `%s`" % current.pathspec)
581
- )
582
- current.card["gpu_profile"].append(
583
- Markdown(
584
- "_Started at: %s_"
585
- % datetime.now().astimezone().strftime("%Y-%m-%dT%H:%M:%S %z")
586
- )
587
- )
588
- prof._setup_card(self.artifact_prefix + "data")
589
- current.card["gpu_profile"].refresh()
590
- update_thread = threading.Thread(target=prof._update_card, daemon=True)
591
- update_thread.start()
592
-
593
- try:
594
- f(s)
595
- finally:
596
- try:
597
- results = prof.finish()
598
- except:
599
- results = {"error": "couldn't read profiler results"}
600
- if self.include_artifacts:
601
- setattr(s, self.artifact_prefix + "data", results)
602
-
603
- from metaflow import card
604
-
605
- return card(type="blank", id="gpu_profile", refresh_interval=self.interval)(
606
- func
607
- )
685
+ from metaflow import gpu_profile
686
+
687
+ return gpu_profile(
688
+ include_artifacts=self.include_artifacts,
689
+ artifact_prefix=self.artifact_prefix,
690
+ interval=self.interval,
691
+ )(func)
608
692
 
609
693
 
610
694
  def translate_to_vegalite(
@@ -8,10 +8,14 @@ import requests
8
8
  from metaflow.exception import MetaflowException
9
9
  from requests.models import HTTPError
10
10
  from metaflow_extensions.outerbounds.plugins.perimeters import (
11
- CURRENT_PERIMETER_KEY,
12
- CURRENT_PERIMETER_URL,
11
+ get_perimeter_config_url_if_set_in_ob_config,
13
12
  )
14
13
 
14
+
15
+ class OuterboundsConfigException(MetaflowException):
16
+ _OB_CONFIG_EXCEPTION = True
17
+
18
+
15
19
  OBP_REMOTE_CONFIG_KEY = "OBP_METAFLOW_CONFIG_URL"
16
20
  HOSTNAME_KEY = "OBP_API_SERVER"
17
21
  AUTH_KEY = "METAFLOW_SERVICE_AUTH_KEY"
@@ -32,7 +36,7 @@ def read_config_from_local() -> Optional[Path]:
32
36
 
33
37
  # we should error because the user wants a specific config
34
38
  if profile:
35
- raise MetaflowException(
39
+ raise OuterboundsConfigException(
36
40
  f"Unable to locate METAFLOW_PROFILE {profile} in {config_path}"
37
41
  )
38
42
 
@@ -55,11 +59,24 @@ def resolve_config_from_remote(remote_url: str, auth_token: str) -> Dict[str, st
55
59
  response.raise_for_status()
56
60
  data = response.json()
57
61
  return data["config"]
58
- except HTTPError:
59
- raise MetaflowException(
60
- "Error fetching resolving configuration. Make sure you have run \
61
- `outerbounds configure` with the correct value"
62
- )
62
+ except HTTPError as e:
63
+ if e.response.status_code >= 500:
64
+ raise OuterboundsConfigException(
65
+ "Error resolving outerbounds configuration [status:%s]. Please reach out to "
66
+ "the outerbounds support team to help resolve this issue."
67
+ % e.response.status_code
68
+ )
69
+ elif e.response.status_code == 403:
70
+ raise OuterboundsConfigException(
71
+ "Outerbounds token validity expired [status:%s]. Please re-run the `outerbounds configure` "
72
+ "command with a new magic string from the UI." % e.response.status_code
73
+ )
74
+ else:
75
+ raise OuterboundsConfigException(
76
+ "Failed to fetch the outerbounds configuration string [status:%s]. Please reach out to "
77
+ "the outerbounds support team to help resolve this issue."
78
+ % e.response.status_code
79
+ )
63
80
 
64
81
 
65
82
  def init_config() -> Dict[str, str]:
@@ -70,8 +87,8 @@ def init_config() -> Dict[str, str]:
70
87
  because steps are executed in subprocesses (local) or environments which expect environment variables to be set.
71
88
  """
72
89
  _init_debug("starting initialization")
73
-
74
- if config_json := os.environ.get(CONFIG_READ_ONCE_KEY):
90
+ config_json = os.environ.get(CONFIG_READ_ONCE_KEY)
91
+ if config_json:
75
92
  _init_debug("reading config from environment")
76
93
  return json.loads(config_json)
77
94
 
@@ -82,18 +99,19 @@ def init_config() -> Dict[str, str]:
82
99
  try:
83
100
  remote_config = json.loads(config_path.read_text())
84
101
  except ValueError:
85
- raise MetaflowException(
86
- "Error decoding your metaflow config. Please run the `outerbounds configure` \
87
- command with the string provided in the Outerbounds dashboard"
102
+ raise OuterboundsConfigException(
103
+ "Error decoding your metaflow config. Please re-run the `outerbounds configure` "
104
+ "command with a new magic string from the UI."
88
105
  )
89
106
 
107
+ perimeter_config_url = get_perimeter_config_url_if_set_in_ob_config()
108
+ if perimeter_config_url:
109
+ remote_config[OBP_REMOTE_CONFIG_KEY] = perimeter_config_url
110
+
90
111
  # users still have a legacy format and that's ok.
91
112
  if OBP_REMOTE_CONFIG_KEY not in remote_config:
92
113
  return remote_config
93
114
 
94
- if CURRENT_PERIMETER_KEY in os.environ and CURRENT_PERIMETER_URL in os.environ:
95
- remote_config[OBP_REMOTE_CONFIG_KEY] = os.environ[CURRENT_PERIMETER_URL]
96
-
97
115
  metaflow_config = resolve_config_from_remote(
98
116
  remote_url=remote_config[OBP_REMOTE_CONFIG_KEY],
99
117
  auth_token=remote_config[AUTH_KEY],
@@ -112,3 +130,22 @@ def _init_debug(*args, **kwargs):
112
130
  init_str = "ob_extension_init:"
113
131
  kwargs["file"] = sys.stderr
114
132
  print(init_str, *args, **kwargs)
133
+
134
+
135
+ def reload_config():
136
+ """
137
+ This function is used to reload the config. Currently its a best effort implementation
138
+ that will only reload auth token.
139
+ """
140
+ _init_debug("reloading config")
141
+ if CONFIG_READ_ONCE_KEY in os.environ:
142
+ del os.environ[CONFIG_READ_ONCE_KEY]
143
+ config = init_config()
144
+ import metaflow.metaflow_config
145
+
146
+ metaflow.metaflow_config.SERVICE_AUTH_KEY = config.get(AUTH_KEY)
147
+ metaflow.metaflow_config.SERVICE_HEADERS["x-api-key"] = config.get(AUTH_KEY)
148
+ if config:
149
+ _init_debug("reloaded config")
150
+ else:
151
+ _init_debug("no config to reload")
@@ -5,6 +5,142 @@
5
5
  __version__ = "v1"
6
6
  __mf_extensions__ = "ob"
7
7
 
8
- # To support "from metaflow import get_aws_client"
9
- from metaflow.plugins.aws.aws_client import get_aws_client
8
+ from metaflow_extensions.outerbounds.toplevel.s3_proxy import (
9
+ get_aws_client_with_s3_proxy,
10
+ get_S3_with_s3_proxy,
11
+ )
12
+
13
+ _S3_PROXY_CONFIG = None
14
+
15
+
16
+ def set_s3_proxy_config(config):
17
+ global _S3_PROXY_CONFIG
18
+ _S3_PROXY_CONFIG = config
19
+
20
+
21
+ def clear_s3_proxy_config():
22
+ global _S3_PROXY_CONFIG
23
+ _S3_PROXY_CONFIG = None
24
+
25
+
26
+ def get_s3_proxy_config():
27
+ global _S3_PROXY_CONFIG
28
+ if _S3_PROXY_CONFIG is None:
29
+ set_s3_proxy_config(get_s3_proxy_config_from_env())
30
+ return _S3_PROXY_CONFIG
31
+
32
+
33
+ # TODO: Refactor out the _S3_PROXY_CONFIG global variable and instead use the function that
34
+ # extracts it from the environment variables.
35
+
36
+ import os
37
+ import json
38
+
39
+
40
+ def get_s3_proxy_config_from_env():
41
+ env_conf = os.environ.get("METAFLOW_S3_PROXY_USER_CODE_CONFIG")
42
+ if env_conf:
43
+ return json.loads(env_conf)
44
+ return None
45
+
46
+
47
+ # Must match the signature of metaflow.plugins.aws.aws_client.get_aws_client
48
+ # This function is called by the "userland" code inside tasks. Metaflow internals
49
+ # will call the function in metaflow.plugins.aws.aws_client.get_aws_client directly.
50
+ #
51
+ # Unlike the original function, this wrapper will use the CSPR role if both of the following
52
+ # conditions are met:
53
+ #
54
+ # 1. CSPR is set
55
+ # 2. user didn't provide a role to assume explicitly.
56
+ #
57
+ def get_aws_client(
58
+ module, with_error=False, role_arn=None, session_vars=None, client_params=None
59
+ ):
60
+ import metaflow.plugins.aws.aws_client
61
+ import os
62
+
63
+ from metaflow_extensions.outerbounds.plugins import USE_CSPR_ROLE_ARN_IF_SET
64
+ from metaflow_extensions.outerbounds.plugins.aws.assume_role import (
65
+ OBP_ASSUME_ROLE_ARN_ENV_VAR,
66
+ )
67
+
68
+ # Check if the assume_role decorator has set a role ARN via environment variable
69
+ # This takes precedence over CSPR but not over explicitly passed role_arn
70
+ if role_arn is None:
71
+ decorator_role_arn = os.environ.get(OBP_ASSUME_ROLE_ARN_ENV_VAR)
72
+ if decorator_role_arn:
73
+ role_arn = decorator_role_arn
74
+
75
+ if module == "s3" and get_s3_proxy_config() is not None:
76
+ return get_aws_client_with_s3_proxy(
77
+ module,
78
+ with_error,
79
+ role_arn,
80
+ session_vars,
81
+ client_params,
82
+ get_s3_proxy_config(),
83
+ )
84
+
85
+ client = metaflow.plugins.aws.aws_client.get_aws_client(
86
+ module,
87
+ with_error=with_error,
88
+ role_arn=role_arn or USE_CSPR_ROLE_ARN_IF_SET,
89
+ session_vars=session_vars,
90
+ client_params=client_params,
91
+ )
92
+
93
+ return client
94
+
95
+
96
+ # This should match the signature of metaflow.plugins.datatools.s3.S3.
97
+ #
98
+ # This assumes that "userland" code inside tasks will call this, while Metaflow
99
+ # internals will call metaflow.plugins.datatools.s3.S3 directly.
100
+ #
101
+ # This wrapper will make S3() use the CSPR role if its set, and user didn't provide
102
+ # a role to assume explicitly.
103
+ def S3(*args, **kwargs):
104
+ import sys
105
+ import metaflow.plugins.datatools.s3
106
+ import os
107
+ from metaflow_extensions.outerbounds.plugins import USE_CSPR_ROLE_ARN_IF_SET
108
+ from metaflow_extensions.outerbounds.plugins.aws.assume_role import (
109
+ OBP_ASSUME_ROLE_ARN_ENV_VAR,
110
+ )
111
+
112
+ # Check if the assume_role decorator has set a role ARN via environment variable
113
+ # This takes precedence over CSPR but not over explicitly passed role
114
+ if "role" not in kwargs or kwargs["role"] is None:
115
+ decorator_role_arn = os.environ.get(OBP_ASSUME_ROLE_ARN_ENV_VAR)
116
+ if decorator_role_arn:
117
+ kwargs["role"] = decorator_role_arn
118
+ else:
119
+ kwargs["role"] = USE_CSPR_ROLE_ARN_IF_SET
120
+
121
+ # Check if S3 proxy is active using module variable (like CSPR)
122
+ if get_s3_proxy_config() is not None:
123
+ return get_S3_with_s3_proxy(get_s3_proxy_config(), *args, **kwargs)
124
+
125
+ return metaflow.plugins.datatools.s3.S3(*args, **kwargs)
126
+
127
+
128
+ # Setting the S3 client docstring in order to ensure that
129
+ # stubs get generated properly.
130
+ import metaflow.plugins.datatools.s3
131
+
132
+ S3.__doc__ = metaflow.plugins.datatools.s3.S3.__doc__
133
+
10
134
  from .. import profilers
135
+ from ..plugins.snowflake import Snowflake
136
+ from ..plugins.checkpoint_datastores import nebius_checkpoints, coreweave_checkpoints
137
+ from ..plugins.aws import assume_role
138
+ from . import ob_internal
139
+ from ..plugins.apps.core import (
140
+ AppDeployer,
141
+ DeployedApp,
142
+ bake_image,
143
+ BakedImage,
144
+ package_code,
145
+ PackagedCode,
146
+ )
@@ -0,0 +1,4 @@
1
+ from ..plugins.kubernetes.pod_killer import PodKiller
2
+ from ..plugins.fast_bakery.baker import bake_image as internal_bake_image
3
+ from ..plugins.apps import core as app_core
4
+ from ..plugins.apps.core import AppDeployer
@@ -0,0 +1 @@
1
+ __mf_promote_submodules__ = ["plugins.ollama"]
@@ -0,0 +1 @@
1
+ __mf_promote_submodules__ = ["plugins.optuna"]
@@ -0,0 +1 @@
1
+ __mf_promote_submodules__ = ["plugins.snowflake"]
@@ -0,0 +1 @@
1
+ __mf_promote_submodules__ = ["plugins.torchtune"]
@@ -0,0 +1 @@
1
+ __mf_promote_submodules__ = ["plugins.vllm"]
@@ -0,0 +1,88 @@
1
+ from metaflow_extensions.outerbounds.plugins import USE_CSPR_ROLE_ARN_IF_SET
2
+ from metaflow.metaflow_config import AWS_SECRETS_MANAGER_DEFAULT_REGION
3
+ from metaflow_extensions.outerbounds.plugins.s3_proxy.constants import (
4
+ DEFAULT_PROXY_HOST,
5
+ DEFAULT_PROXY_PORT,
6
+ )
7
+
8
+
9
+ def get_aws_client_with_s3_proxy(
10
+ module,
11
+ with_error=False,
12
+ role_arn=None,
13
+ session_vars=None,
14
+ client_params=None,
15
+ s3_config=None,
16
+ ):
17
+ if not client_params:
18
+ client_params = {}
19
+
20
+ client_params["region_name"] = client_params.get(
21
+ "region_name", s3_config.get("region")
22
+ )
23
+ client_params["endpoint_url"] = s3_config.get(
24
+ "endpoint_url", f"http://{DEFAULT_PROXY_HOST}:{DEFAULT_PROXY_PORT}"
25
+ )
26
+
27
+ import metaflow.plugins.aws.aws_client
28
+
29
+ client = metaflow.plugins.aws.aws_client.get_aws_client(
30
+ module,
31
+ with_error=with_error,
32
+ role_arn=role_arn or USE_CSPR_ROLE_ARN_IF_SET,
33
+ session_vars=session_vars,
34
+ client_params=client_params,
35
+ )
36
+
37
+ def override_s3_proxy_host_header(request, **kwargs):
38
+ region = kwargs["region_name"]
39
+ request.headers["Host"] = f"s3.{region}.amazonaws.com"
40
+ if "x-ob-write-to" not in request.headers and "write_mode" in s3_config:
41
+ request.headers["x-ob-write-to"] = s3_config.get("write_mode")
42
+
43
+ client.meta.events.register("before-sign", override_s3_proxy_host_header)
44
+
45
+ return client
46
+
47
+
48
+ def get_S3_with_s3_proxy(s3_config, *args, **kwargs):
49
+ if "region_name" not in kwargs:
50
+ kwargs["region_name"] = s3_config.get(
51
+ "region", AWS_SECRETS_MANAGER_DEFAULT_REGION
52
+ )
53
+
54
+ kwargs["endpoint_url"] = s3_config.get(
55
+ "endpoint_url", f"http://{DEFAULT_PROXY_HOST}:{DEFAULT_PROXY_PORT}"
56
+ )
57
+
58
+ import metaflow.plugins.datatools.s3
59
+
60
+ mf_s3 = metaflow.plugins.datatools.s3.S3(*args, **kwargs)
61
+
62
+ # Override reset_client to ensure proxy endpoint is preserved
63
+ original_reset_client = mf_s3._s3_client.reset_client
64
+
65
+ def proxy_reset_client():
66
+ original_reset_client()
67
+ import boto3
68
+
69
+ proxy_client = boto3.client(
70
+ "s3",
71
+ region_name=kwargs.get("region_name", s3_config.get("region")),
72
+ endpoint_url=s3_config.get("endpoint_url"),
73
+ )
74
+ mf_s3._s3_client._s3_client = proxy_client
75
+
76
+ mf_s3._s3_client.reset_client = proxy_reset_client
77
+ mf_s3._s3_client.reset_client()
78
+
79
+ def override_s3_proxy_host_header(request, **kwargs):
80
+ region = kwargs["region_name"]
81
+ request.headers["Host"] = f"s3.{region}.amazonaws.com"
82
+ if "x-ob-write-to" not in request.headers and "write_mode" in s3_config:
83
+ request.headers["x-ob-write-to"] = s3_config.get("write_mode")
84
+
85
+ mf_s3._s3_client._s3_client.meta.events.register(
86
+ "before-sign", override_s3_proxy_host_header
87
+ )
88
+ return mf_s3
@@ -1,13 +1,13 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: ob-metaflow-extensions
3
- Version: 1.1.45rc3
3
+ Version: 1.5.1
4
4
  Summary: Outerbounds Platform Extensions for Metaflow
5
5
  Author: Outerbounds, Inc.
6
6
  License: Commercial
7
7
  Description-Content-Type: text/markdown
8
8
  Requires-Dist: boto3
9
9
  Requires-Dist: kubernetes
10
- Requires-Dist: ob-metaflow (==2.11.0.4)
10
+ Requires-Dist: ob-metaflow (==2.19.15.3)
11
11
 
12
12
  # Outerbounds platform package
13
13