ob-metaflow-extensions 1.1.45rc3__py2.py3-none-any.whl → 1.5.1__py2.py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of ob-metaflow-extensions might be problematic. Click here for more details.
- metaflow_extensions/outerbounds/__init__.py +1 -7
- metaflow_extensions/outerbounds/config/__init__.py +35 -0
- metaflow_extensions/outerbounds/plugins/__init__.py +186 -57
- metaflow_extensions/outerbounds/plugins/apps/__init__.py +0 -0
- metaflow_extensions/outerbounds/plugins/apps/app_cli.py +0 -0
- metaflow_extensions/outerbounds/plugins/apps/app_utils.py +187 -0
- metaflow_extensions/outerbounds/plugins/apps/consts.py +3 -0
- metaflow_extensions/outerbounds/plugins/apps/core/__init__.py +15 -0
- metaflow_extensions/outerbounds/plugins/apps/core/_state_machine.py +506 -0
- metaflow_extensions/outerbounds/plugins/apps/core/_vendor/__init__.py +0 -0
- metaflow_extensions/outerbounds/plugins/apps/core/_vendor/spinner/__init__.py +4 -0
- metaflow_extensions/outerbounds/plugins/apps/core/_vendor/spinner/spinners.py +478 -0
- metaflow_extensions/outerbounds/plugins/apps/core/app_config.py +128 -0
- metaflow_extensions/outerbounds/plugins/apps/core/app_deploy_decorator.py +330 -0
- metaflow_extensions/outerbounds/plugins/apps/core/artifacts.py +0 -0
- metaflow_extensions/outerbounds/plugins/apps/core/capsule.py +958 -0
- metaflow_extensions/outerbounds/plugins/apps/core/click_importer.py +24 -0
- metaflow_extensions/outerbounds/plugins/apps/core/code_package/__init__.py +3 -0
- metaflow_extensions/outerbounds/plugins/apps/core/code_package/code_packager.py +618 -0
- metaflow_extensions/outerbounds/plugins/apps/core/code_package/examples.py +125 -0
- metaflow_extensions/outerbounds/plugins/apps/core/config/__init__.py +15 -0
- metaflow_extensions/outerbounds/plugins/apps/core/config/cli_generator.py +165 -0
- metaflow_extensions/outerbounds/plugins/apps/core/config/config_utils.py +966 -0
- metaflow_extensions/outerbounds/plugins/apps/core/config/schema_export.py +299 -0
- metaflow_extensions/outerbounds/plugins/apps/core/config/typed_configs.py +233 -0
- metaflow_extensions/outerbounds/plugins/apps/core/config/typed_init_generator.py +537 -0
- metaflow_extensions/outerbounds/plugins/apps/core/config/unified_config.py +1125 -0
- metaflow_extensions/outerbounds/plugins/apps/core/config_schema.yaml +337 -0
- metaflow_extensions/outerbounds/plugins/apps/core/dependencies.py +115 -0
- metaflow_extensions/outerbounds/plugins/apps/core/deployer.py +959 -0
- metaflow_extensions/outerbounds/plugins/apps/core/experimental/__init__.py +89 -0
- metaflow_extensions/outerbounds/plugins/apps/core/perimeters.py +87 -0
- metaflow_extensions/outerbounds/plugins/apps/core/secrets.py +164 -0
- metaflow_extensions/outerbounds/plugins/apps/core/utils.py +233 -0
- metaflow_extensions/outerbounds/plugins/apps/core/validations.py +17 -0
- metaflow_extensions/outerbounds/plugins/apps/deploy_decorator.py +201 -0
- metaflow_extensions/outerbounds/plugins/apps/supervisord_utils.py +243 -0
- metaflow_extensions/outerbounds/plugins/auth_server.py +28 -8
- metaflow_extensions/outerbounds/plugins/aws/__init__.py +4 -0
- metaflow_extensions/outerbounds/plugins/aws/assume_role.py +3 -0
- metaflow_extensions/outerbounds/plugins/aws/assume_role_decorator.py +118 -0
- metaflow_extensions/outerbounds/plugins/card_utilities/__init__.py +0 -0
- metaflow_extensions/outerbounds/plugins/card_utilities/async_cards.py +142 -0
- metaflow_extensions/outerbounds/plugins/card_utilities/extra_components.py +545 -0
- metaflow_extensions/outerbounds/plugins/card_utilities/injector.py +70 -0
- metaflow_extensions/outerbounds/plugins/checkpoint_datastores/__init__.py +2 -0
- metaflow_extensions/outerbounds/plugins/checkpoint_datastores/coreweave.py +71 -0
- metaflow_extensions/outerbounds/plugins/checkpoint_datastores/external_chckpt.py +85 -0
- metaflow_extensions/outerbounds/plugins/checkpoint_datastores/nebius.py +73 -0
- metaflow_extensions/outerbounds/plugins/fast_bakery/__init__.py +0 -0
- metaflow_extensions/outerbounds/plugins/fast_bakery/baker.py +110 -0
- metaflow_extensions/outerbounds/plugins/fast_bakery/docker_environment.py +391 -0
- metaflow_extensions/outerbounds/plugins/fast_bakery/fast_bakery.py +188 -0
- metaflow_extensions/outerbounds/plugins/fast_bakery/fast_bakery_cli.py +54 -0
- metaflow_extensions/outerbounds/plugins/fast_bakery/fast_bakery_decorator.py +50 -0
- metaflow_extensions/outerbounds/plugins/kubernetes/kubernetes_client.py +79 -0
- metaflow_extensions/outerbounds/plugins/kubernetes/pod_killer.py +374 -0
- metaflow_extensions/outerbounds/plugins/nim/card.py +140 -0
- metaflow_extensions/outerbounds/plugins/nim/nim_decorator.py +101 -0
- metaflow_extensions/outerbounds/plugins/nim/nim_manager.py +379 -0
- metaflow_extensions/outerbounds/plugins/nim/utils.py +36 -0
- metaflow_extensions/outerbounds/plugins/nvcf/__init__.py +0 -0
- metaflow_extensions/outerbounds/plugins/nvcf/constants.py +3 -0
- metaflow_extensions/outerbounds/plugins/nvcf/exceptions.py +94 -0
- metaflow_extensions/outerbounds/plugins/nvcf/heartbeat_store.py +178 -0
- metaflow_extensions/outerbounds/plugins/nvcf/nvcf.py +417 -0
- metaflow_extensions/outerbounds/plugins/nvcf/nvcf_cli.py +280 -0
- metaflow_extensions/outerbounds/plugins/nvcf/nvcf_decorator.py +242 -0
- metaflow_extensions/outerbounds/plugins/nvcf/utils.py +6 -0
- metaflow_extensions/outerbounds/plugins/nvct/__init__.py +0 -0
- metaflow_extensions/outerbounds/plugins/nvct/exceptions.py +71 -0
- metaflow_extensions/outerbounds/plugins/nvct/nvct.py +131 -0
- metaflow_extensions/outerbounds/plugins/nvct/nvct_cli.py +289 -0
- metaflow_extensions/outerbounds/plugins/nvct/nvct_decorator.py +286 -0
- metaflow_extensions/outerbounds/plugins/nvct/nvct_runner.py +218 -0
- metaflow_extensions/outerbounds/plugins/nvct/utils.py +29 -0
- metaflow_extensions/outerbounds/plugins/ollama/__init__.py +225 -0
- metaflow_extensions/outerbounds/plugins/ollama/constants.py +1 -0
- metaflow_extensions/outerbounds/plugins/ollama/exceptions.py +22 -0
- metaflow_extensions/outerbounds/plugins/ollama/ollama.py +1924 -0
- metaflow_extensions/outerbounds/plugins/ollama/status_card.py +292 -0
- metaflow_extensions/outerbounds/plugins/optuna/__init__.py +48 -0
- metaflow_extensions/outerbounds/plugins/perimeters.py +19 -5
- metaflow_extensions/outerbounds/plugins/profilers/deco_injector.py +70 -0
- metaflow_extensions/outerbounds/plugins/profilers/gpu_profile_decorator.py +88 -0
- metaflow_extensions/outerbounds/plugins/profilers/simple_card_decorator.py +96 -0
- metaflow_extensions/outerbounds/plugins/s3_proxy/__init__.py +7 -0
- metaflow_extensions/outerbounds/plugins/s3_proxy/binary_caller.py +132 -0
- metaflow_extensions/outerbounds/plugins/s3_proxy/constants.py +11 -0
- metaflow_extensions/outerbounds/plugins/s3_proxy/exceptions.py +13 -0
- metaflow_extensions/outerbounds/plugins/s3_proxy/proxy_bootstrap.py +59 -0
- metaflow_extensions/outerbounds/plugins/s3_proxy/s3_proxy_api.py +93 -0
- metaflow_extensions/outerbounds/plugins/s3_proxy/s3_proxy_decorator.py +250 -0
- metaflow_extensions/outerbounds/plugins/s3_proxy/s3_proxy_manager.py +225 -0
- metaflow_extensions/outerbounds/plugins/secrets/__init__.py +0 -0
- metaflow_extensions/outerbounds/plugins/secrets/secrets.py +204 -0
- metaflow_extensions/outerbounds/plugins/snowflake/__init__.py +3 -0
- metaflow_extensions/outerbounds/plugins/snowflake/snowflake.py +378 -0
- metaflow_extensions/outerbounds/plugins/snowpark/__init__.py +0 -0
- metaflow_extensions/outerbounds/plugins/snowpark/snowpark.py +309 -0
- metaflow_extensions/outerbounds/plugins/snowpark/snowpark_cli.py +277 -0
- metaflow_extensions/outerbounds/plugins/snowpark/snowpark_client.py +150 -0
- metaflow_extensions/outerbounds/plugins/snowpark/snowpark_decorator.py +273 -0
- metaflow_extensions/outerbounds/plugins/snowpark/snowpark_exceptions.py +13 -0
- metaflow_extensions/outerbounds/plugins/snowpark/snowpark_job.py +241 -0
- metaflow_extensions/outerbounds/plugins/snowpark/snowpark_service_spec.py +259 -0
- metaflow_extensions/outerbounds/plugins/tensorboard/__init__.py +50 -0
- metaflow_extensions/outerbounds/plugins/torchtune/__init__.py +163 -0
- metaflow_extensions/outerbounds/plugins/vllm/__init__.py +255 -0
- metaflow_extensions/outerbounds/plugins/vllm/constants.py +1 -0
- metaflow_extensions/outerbounds/plugins/vllm/exceptions.py +1 -0
- metaflow_extensions/outerbounds/plugins/vllm/status_card.py +352 -0
- metaflow_extensions/outerbounds/plugins/vllm/vllm_manager.py +621 -0
- metaflow_extensions/outerbounds/profilers/gpu.py +131 -47
- metaflow_extensions/outerbounds/remote_config.py +53 -16
- metaflow_extensions/outerbounds/toplevel/global_aliases_for_metaflow_package.py +138 -2
- metaflow_extensions/outerbounds/toplevel/ob_internal.py +4 -0
- metaflow_extensions/outerbounds/toplevel/plugins/ollama/__init__.py +1 -0
- metaflow_extensions/outerbounds/toplevel/plugins/optuna/__init__.py +1 -0
- metaflow_extensions/outerbounds/toplevel/plugins/snowflake/__init__.py +1 -0
- metaflow_extensions/outerbounds/toplevel/plugins/torchtune/__init__.py +1 -0
- metaflow_extensions/outerbounds/toplevel/plugins/vllm/__init__.py +1 -0
- metaflow_extensions/outerbounds/toplevel/s3_proxy.py +88 -0
- {ob_metaflow_extensions-1.1.45rc3.dist-info → ob_metaflow_extensions-1.5.1.dist-info}/METADATA +2 -2
- ob_metaflow_extensions-1.5.1.dist-info/RECORD +133 -0
- ob_metaflow_extensions-1.1.45rc3.dist-info/RECORD +0 -19
- {ob_metaflow_extensions-1.1.45rc3.dist-info → ob_metaflow_extensions-1.5.1.dist-info}/WHEEL +0 -0
- {ob_metaflow_extensions-1.1.45rc3.dist-info → ob_metaflow_extensions-1.5.1.dist-info}/top_level.txt +0 -0
|
@@ -13,6 +13,7 @@ import json
|
|
|
13
13
|
import sys
|
|
14
14
|
from tempfile import TemporaryDirectory
|
|
15
15
|
from subprocess import check_output, Popen
|
|
16
|
+
import subprocess
|
|
16
17
|
from datetime import datetime, timedelta
|
|
17
18
|
from functools import wraps
|
|
18
19
|
from collections import namedtuple
|
|
@@ -186,6 +187,9 @@ class GPUMonitor:
|
|
|
186
187
|
all_readings = []
|
|
187
188
|
if self._current_file is None:
|
|
188
189
|
return None
|
|
190
|
+
|
|
191
|
+
if not os.path.exists(self._current_file):
|
|
192
|
+
return None
|
|
189
193
|
# Extract everything from the CVS File and store it in a list of dictionaries
|
|
190
194
|
all_fields = ["gpu_id"] + MONITOR_FIELDS
|
|
191
195
|
with open(self._current_file, "r") as _monitor_out:
|
|
@@ -315,28 +319,48 @@ def _update_charts(results, md_dict):
|
|
|
315
319
|
|
|
316
320
|
# This code is adapted from: https://github.com/outerbounds/monitorbench
|
|
317
321
|
class GPUProfiler:
|
|
318
|
-
def __init__(
|
|
322
|
+
def __init__(
|
|
323
|
+
self,
|
|
324
|
+
interval=1,
|
|
325
|
+
monitor_batch_duration=200,
|
|
326
|
+
artifact_name="gpu_profile_data",
|
|
327
|
+
max_check_timeout=60,
|
|
328
|
+
):
|
|
329
|
+
self._interval = interval
|
|
330
|
+
self.max_check_timeout = max_check_timeout
|
|
331
|
+
self._monitor_batch_duration = monitor_batch_duration
|
|
332
|
+
self.artifact_name = artifact_name
|
|
333
|
+
self._started_at = datetime.now().astimezone().strftime("%Y-%m-%dT%H:%M:%S %z")
|
|
334
|
+
self._card_setup_finished = False
|
|
335
|
+
self._card_comps = {
|
|
336
|
+
"max_utilization": {},
|
|
337
|
+
"charts": {},
|
|
338
|
+
"reading_duration": {},
|
|
339
|
+
"error_component": None,
|
|
340
|
+
}
|
|
341
|
+
self._monitor_started = False
|
|
342
|
+
self._start_monitor()
|
|
343
|
+
|
|
344
|
+
def _start_monitor(self):
|
|
319
345
|
self.driver_ver, self.cuda_ver, self.error = self._read_versions()
|
|
320
|
-
(
|
|
321
|
-
self.interconnect_data,
|
|
322
|
-
self.interconnect_legend,
|
|
323
|
-
) = self._read_multi_gpu_interconnect()
|
|
324
346
|
if self.error:
|
|
325
347
|
self.devices = []
|
|
326
348
|
return
|
|
327
349
|
else:
|
|
350
|
+
(
|
|
351
|
+
self.interconnect_data,
|
|
352
|
+
self.interconnect_legend,
|
|
353
|
+
) = self._read_multi_gpu_interconnect()
|
|
328
354
|
self.devices = self._read_devices()
|
|
329
355
|
self._monitor = GPUMonitor(
|
|
330
|
-
interval=
|
|
356
|
+
interval=self._interval, duration=self._monitor_batch_duration
|
|
331
357
|
)
|
|
332
358
|
self._monitor_thread = threading.Thread(
|
|
333
359
|
target=self._monitor._monitor_update_thread, daemon=True
|
|
334
360
|
)
|
|
361
|
+
self.error = None
|
|
335
362
|
self._monitor_thread.start()
|
|
336
|
-
self.
|
|
337
|
-
|
|
338
|
-
self._card_comps = {"max_utilization": {}, "charts": {}, "reading_duration": {}}
|
|
339
|
-
self._card_created = False
|
|
363
|
+
self._monitor_started = True
|
|
340
364
|
|
|
341
365
|
def finish(self):
|
|
342
366
|
ret = {
|
|
@@ -376,6 +400,12 @@ class GPUProfiler:
|
|
|
376
400
|
def _update_card(self):
|
|
377
401
|
if len(self.devices) == 0:
|
|
378
402
|
current.card["gpu_profile"].clear()
|
|
403
|
+
current.card["gpu_profile"].append(
|
|
404
|
+
Markdown("# GPU profile for `%s`" % current.pathspec)
|
|
405
|
+
)
|
|
406
|
+
current.card["gpu_profile"].append(
|
|
407
|
+
Markdown("_Started at: %s_" % self._started_at)
|
|
408
|
+
)
|
|
379
409
|
current.card["gpu_profile"].append(
|
|
380
410
|
Markdown("## GPU profile failed: %s" % self.error)
|
|
381
411
|
)
|
|
@@ -383,22 +413,84 @@ class GPUProfiler:
|
|
|
383
413
|
|
|
384
414
|
return
|
|
385
415
|
|
|
416
|
+
_check_time = 0
|
|
417
|
+
stop_checking = False
|
|
418
|
+
# Before writing anything to the card, we need to make sure that:
|
|
419
|
+
# 1. GPU Monitor has started.
|
|
420
|
+
# 2. Monitor can record readings
|
|
421
|
+
# 3. Card is setup
|
|
386
422
|
while True:
|
|
423
|
+
|
|
424
|
+
if stop_checking:
|
|
425
|
+
time.sleep(self._interval)
|
|
426
|
+
continue
|
|
427
|
+
|
|
428
|
+
# There is a possibility that the `monitor` thread is not started yet
|
|
429
|
+
# because it somehow crashed at the very start.
|
|
430
|
+
if not self._monitor_started and _check_time > self.max_check_timeout:
|
|
431
|
+
current.card["gpu_profile"].clear()
|
|
432
|
+
current.card["gpu_profile"].append(
|
|
433
|
+
Markdown("## GPU profile failed: %s" % self.error)
|
|
434
|
+
)
|
|
435
|
+
current.card["gpu_profile"].refresh()
|
|
436
|
+
stop_checking = True
|
|
437
|
+
|
|
438
|
+
# Try restarting monitor if it hasn't started yet
|
|
439
|
+
if not self._monitor_started:
|
|
440
|
+
self._start_monitor()
|
|
441
|
+
_check_time += self._interval
|
|
442
|
+
time.sleep(self._interval)
|
|
443
|
+
continue
|
|
444
|
+
|
|
445
|
+
# Ensure that we are getting well formatted readings
|
|
387
446
|
readings = self._make_reading()
|
|
447
|
+
|
|
388
448
|
if readings is None:
|
|
389
449
|
print("GPU Profiler readings are none", file=sys.stderr)
|
|
390
450
|
time.sleep(self._interval)
|
|
391
451
|
continue
|
|
452
|
+
|
|
453
|
+
# ensure that the card is setup
|
|
454
|
+
if not self._card_setup_finished:
|
|
455
|
+
self._setup_card()
|
|
456
|
+
time.sleep(self._interval)
|
|
457
|
+
continue
|
|
458
|
+
|
|
392
459
|
_update_utilization(readings, self._card_comps["max_utilization"])
|
|
393
460
|
_update_charts(readings, self._card_comps["charts"])
|
|
394
461
|
current.card["gpu_profile"].refresh()
|
|
395
462
|
time.sleep(self._interval)
|
|
396
463
|
|
|
397
|
-
def _setup_card(self
|
|
464
|
+
def _setup_card(self):
|
|
398
465
|
from metaflow import current
|
|
399
466
|
|
|
400
467
|
results = self._make_reading()
|
|
468
|
+
if "profile" not in results:
|
|
469
|
+
if self._card_comps["error_component"] is None:
|
|
470
|
+
self._card_comps["error_component"] = Markdown(
|
|
471
|
+
"## GPU profile failed: %s" % results["error"]
|
|
472
|
+
)
|
|
473
|
+
current.card["gpu_profile"].append(self._card_comps["error_component"])
|
|
474
|
+
else:
|
|
475
|
+
self._card_comps["error_component"].update(
|
|
476
|
+
Markdown("## GPU profile failed: %s" % results["error"])
|
|
477
|
+
)
|
|
478
|
+
current.card["gpu_profile"].refresh()
|
|
479
|
+
return
|
|
480
|
+
|
|
401
481
|
els = current.card["gpu_profile"]
|
|
482
|
+
self._card_comps["error_component"] = None
|
|
483
|
+
els.clear()
|
|
484
|
+
|
|
485
|
+
current.card["gpu_profile"].append(
|
|
486
|
+
Markdown("# GPU profile for `%s`" % current.pathspec)
|
|
487
|
+
)
|
|
488
|
+
current.card["gpu_profile"].append(
|
|
489
|
+
Markdown(
|
|
490
|
+
"_Started at: %s_"
|
|
491
|
+
% datetime.now().astimezone().strftime("%Y-%m-%dT%H:%M:%S %z")
|
|
492
|
+
)
|
|
493
|
+
)
|
|
402
494
|
|
|
403
495
|
def _drivers():
|
|
404
496
|
els.append(Markdown("## Drivers"))
|
|
@@ -444,7 +536,7 @@ class GPUProfiler:
|
|
|
444
536
|
Table(data=_rows, headers=["Device ID", "Max GPU %", "Max memory"])
|
|
445
537
|
)
|
|
446
538
|
els.append(
|
|
447
|
-
Markdown(f"Detailed data saved in an artifact `{artifact_name}`")
|
|
539
|
+
Markdown(f"Detailed data saved in an artifact `{self.artifact_name}`")
|
|
448
540
|
)
|
|
449
541
|
return rows
|
|
450
542
|
|
|
@@ -479,20 +571,36 @@ class GPUProfiler:
|
|
|
479
571
|
_interconnect()
|
|
480
572
|
self._card_comps["max_utilization"] = _utilization()
|
|
481
573
|
self._card_comps["charts"] = _plots()
|
|
574
|
+
self._card_setup_finished = True
|
|
482
575
|
|
|
483
576
|
def _read_versions(self):
|
|
484
577
|
def parse(r, s):
|
|
485
578
|
return r.search(s).group(1).strip().decode("utf-8")
|
|
486
579
|
|
|
487
580
|
try:
|
|
488
|
-
|
|
581
|
+
result = subprocess.run(
|
|
582
|
+
["nvidia-smi"],
|
|
583
|
+
check=True, # This will raise a CalledProcessError if the command fails
|
|
584
|
+
stdout=subprocess.PIPE, # Capture stdout
|
|
585
|
+
stderr=subprocess.PIPE, # Capture stderr
|
|
586
|
+
)
|
|
587
|
+
# Access the standard output
|
|
588
|
+
out = result.stdout
|
|
489
589
|
return parse(DRIVER_VER, out), parse(CUDA_VER, out), None
|
|
490
590
|
except FileNotFoundError:
|
|
491
591
|
return None, None, "nvidia-smi not found"
|
|
492
592
|
except AttributeError:
|
|
493
593
|
return None, None, "nvidia-smi output is unexpected"
|
|
494
|
-
except:
|
|
495
|
-
|
|
594
|
+
except subprocess.CalledProcessError as e:
|
|
595
|
+
_error_message = "nvidia-smi error (CalledProcessError calling nvidia-smi)"
|
|
596
|
+
if e.stderr is not None:
|
|
597
|
+
_error_message = (
|
|
598
|
+
"nvidia-smi error (CalledProcessError stderr) \n %s \n %s"
|
|
599
|
+
% (e.stderr.decode("utf-8"), e.stdout.decode("utf-8"))
|
|
600
|
+
)
|
|
601
|
+
return None, None, _error_message
|
|
602
|
+
except Exception as e:
|
|
603
|
+
return None, None, "nvidia-smi error (unknown error) \n%s" % str(e)
|
|
496
604
|
|
|
497
605
|
def _read_devices(self):
|
|
498
606
|
out = check_output(
|
|
@@ -572,39 +680,15 @@ class gpu_profile:
|
|
|
572
680
|
def __call__(self, f):
|
|
573
681
|
@wraps(f)
|
|
574
682
|
def func(s):
|
|
575
|
-
|
|
576
|
-
if self.include_artifacts:
|
|
577
|
-
setattr(s, self.artifact_prefix + "num_gpus", len(prof.devices))
|
|
683
|
+
return f(s)
|
|
578
684
|
|
|
579
|
-
|
|
580
|
-
|
|
581
|
-
|
|
582
|
-
|
|
583
|
-
|
|
584
|
-
|
|
585
|
-
|
|
586
|
-
)
|
|
587
|
-
)
|
|
588
|
-
prof._setup_card(self.artifact_prefix + "data")
|
|
589
|
-
current.card["gpu_profile"].refresh()
|
|
590
|
-
update_thread = threading.Thread(target=prof._update_card, daemon=True)
|
|
591
|
-
update_thread.start()
|
|
592
|
-
|
|
593
|
-
try:
|
|
594
|
-
f(s)
|
|
595
|
-
finally:
|
|
596
|
-
try:
|
|
597
|
-
results = prof.finish()
|
|
598
|
-
except:
|
|
599
|
-
results = {"error": "couldn't read profiler results"}
|
|
600
|
-
if self.include_artifacts:
|
|
601
|
-
setattr(s, self.artifact_prefix + "data", results)
|
|
602
|
-
|
|
603
|
-
from metaflow import card
|
|
604
|
-
|
|
605
|
-
return card(type="blank", id="gpu_profile", refresh_interval=self.interval)(
|
|
606
|
-
func
|
|
607
|
-
)
|
|
685
|
+
from metaflow import gpu_profile
|
|
686
|
+
|
|
687
|
+
return gpu_profile(
|
|
688
|
+
include_artifacts=self.include_artifacts,
|
|
689
|
+
artifact_prefix=self.artifact_prefix,
|
|
690
|
+
interval=self.interval,
|
|
691
|
+
)(func)
|
|
608
692
|
|
|
609
693
|
|
|
610
694
|
def translate_to_vegalite(
|
|
@@ -8,10 +8,14 @@ import requests
|
|
|
8
8
|
from metaflow.exception import MetaflowException
|
|
9
9
|
from requests.models import HTTPError
|
|
10
10
|
from metaflow_extensions.outerbounds.plugins.perimeters import (
|
|
11
|
-
|
|
12
|
-
CURRENT_PERIMETER_URL,
|
|
11
|
+
get_perimeter_config_url_if_set_in_ob_config,
|
|
13
12
|
)
|
|
14
13
|
|
|
14
|
+
|
|
15
|
+
class OuterboundsConfigException(MetaflowException):
|
|
16
|
+
_OB_CONFIG_EXCEPTION = True
|
|
17
|
+
|
|
18
|
+
|
|
15
19
|
OBP_REMOTE_CONFIG_KEY = "OBP_METAFLOW_CONFIG_URL"
|
|
16
20
|
HOSTNAME_KEY = "OBP_API_SERVER"
|
|
17
21
|
AUTH_KEY = "METAFLOW_SERVICE_AUTH_KEY"
|
|
@@ -32,7 +36,7 @@ def read_config_from_local() -> Optional[Path]:
|
|
|
32
36
|
|
|
33
37
|
# we should error because the user wants a specific config
|
|
34
38
|
if profile:
|
|
35
|
-
raise
|
|
39
|
+
raise OuterboundsConfigException(
|
|
36
40
|
f"Unable to locate METAFLOW_PROFILE {profile} in {config_path}"
|
|
37
41
|
)
|
|
38
42
|
|
|
@@ -55,11 +59,24 @@ def resolve_config_from_remote(remote_url: str, auth_token: str) -> Dict[str, st
|
|
|
55
59
|
response.raise_for_status()
|
|
56
60
|
data = response.json()
|
|
57
61
|
return data["config"]
|
|
58
|
-
except HTTPError:
|
|
59
|
-
|
|
60
|
-
|
|
61
|
-
|
|
62
|
-
|
|
62
|
+
except HTTPError as e:
|
|
63
|
+
if e.response.status_code >= 500:
|
|
64
|
+
raise OuterboundsConfigException(
|
|
65
|
+
"Error resolving outerbounds configuration [status:%s]. Please reach out to "
|
|
66
|
+
"the outerbounds support team to help resolve this issue."
|
|
67
|
+
% e.response.status_code
|
|
68
|
+
)
|
|
69
|
+
elif e.response.status_code == 403:
|
|
70
|
+
raise OuterboundsConfigException(
|
|
71
|
+
"Outerbounds token validity expired [status:%s]. Please re-run the `outerbounds configure` "
|
|
72
|
+
"command with a new magic string from the UI." % e.response.status_code
|
|
73
|
+
)
|
|
74
|
+
else:
|
|
75
|
+
raise OuterboundsConfigException(
|
|
76
|
+
"Failed to fetch the outerbounds configuration string [status:%s]. Please reach out to "
|
|
77
|
+
"the outerbounds support team to help resolve this issue."
|
|
78
|
+
% e.response.status_code
|
|
79
|
+
)
|
|
63
80
|
|
|
64
81
|
|
|
65
82
|
def init_config() -> Dict[str, str]:
|
|
@@ -70,8 +87,8 @@ def init_config() -> Dict[str, str]:
|
|
|
70
87
|
because steps are executed in subprocesses (local) or environments which expect environment variables to be set.
|
|
71
88
|
"""
|
|
72
89
|
_init_debug("starting initialization")
|
|
73
|
-
|
|
74
|
-
if config_json
|
|
90
|
+
config_json = os.environ.get(CONFIG_READ_ONCE_KEY)
|
|
91
|
+
if config_json:
|
|
75
92
|
_init_debug("reading config from environment")
|
|
76
93
|
return json.loads(config_json)
|
|
77
94
|
|
|
@@ -82,18 +99,19 @@ def init_config() -> Dict[str, str]:
|
|
|
82
99
|
try:
|
|
83
100
|
remote_config = json.loads(config_path.read_text())
|
|
84
101
|
except ValueError:
|
|
85
|
-
raise
|
|
86
|
-
"Error decoding your metaflow config. Please run the `outerbounds configure`
|
|
87
|
-
|
|
102
|
+
raise OuterboundsConfigException(
|
|
103
|
+
"Error decoding your metaflow config. Please re-run the `outerbounds configure` "
|
|
104
|
+
"command with a new magic string from the UI."
|
|
88
105
|
)
|
|
89
106
|
|
|
107
|
+
perimeter_config_url = get_perimeter_config_url_if_set_in_ob_config()
|
|
108
|
+
if perimeter_config_url:
|
|
109
|
+
remote_config[OBP_REMOTE_CONFIG_KEY] = perimeter_config_url
|
|
110
|
+
|
|
90
111
|
# users still have a legacy format and that's ok.
|
|
91
112
|
if OBP_REMOTE_CONFIG_KEY not in remote_config:
|
|
92
113
|
return remote_config
|
|
93
114
|
|
|
94
|
-
if CURRENT_PERIMETER_KEY in os.environ and CURRENT_PERIMETER_URL in os.environ:
|
|
95
|
-
remote_config[OBP_REMOTE_CONFIG_KEY] = os.environ[CURRENT_PERIMETER_URL]
|
|
96
|
-
|
|
97
115
|
metaflow_config = resolve_config_from_remote(
|
|
98
116
|
remote_url=remote_config[OBP_REMOTE_CONFIG_KEY],
|
|
99
117
|
auth_token=remote_config[AUTH_KEY],
|
|
@@ -112,3 +130,22 @@ def _init_debug(*args, **kwargs):
|
|
|
112
130
|
init_str = "ob_extension_init:"
|
|
113
131
|
kwargs["file"] = sys.stderr
|
|
114
132
|
print(init_str, *args, **kwargs)
|
|
133
|
+
|
|
134
|
+
|
|
135
|
+
def reload_config():
|
|
136
|
+
"""
|
|
137
|
+
This function is used to reload the config. Currently its a best effort implementation
|
|
138
|
+
that will only reload auth token.
|
|
139
|
+
"""
|
|
140
|
+
_init_debug("reloading config")
|
|
141
|
+
if CONFIG_READ_ONCE_KEY in os.environ:
|
|
142
|
+
del os.environ[CONFIG_READ_ONCE_KEY]
|
|
143
|
+
config = init_config()
|
|
144
|
+
import metaflow.metaflow_config
|
|
145
|
+
|
|
146
|
+
metaflow.metaflow_config.SERVICE_AUTH_KEY = config.get(AUTH_KEY)
|
|
147
|
+
metaflow.metaflow_config.SERVICE_HEADERS["x-api-key"] = config.get(AUTH_KEY)
|
|
148
|
+
if config:
|
|
149
|
+
_init_debug("reloaded config")
|
|
150
|
+
else:
|
|
151
|
+
_init_debug("no config to reload")
|
|
@@ -5,6 +5,142 @@
|
|
|
5
5
|
__version__ = "v1"
|
|
6
6
|
__mf_extensions__ = "ob"
|
|
7
7
|
|
|
8
|
-
|
|
9
|
-
|
|
8
|
+
from metaflow_extensions.outerbounds.toplevel.s3_proxy import (
|
|
9
|
+
get_aws_client_with_s3_proxy,
|
|
10
|
+
get_S3_with_s3_proxy,
|
|
11
|
+
)
|
|
12
|
+
|
|
13
|
+
_S3_PROXY_CONFIG = None
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
def set_s3_proxy_config(config):
|
|
17
|
+
global _S3_PROXY_CONFIG
|
|
18
|
+
_S3_PROXY_CONFIG = config
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
def clear_s3_proxy_config():
|
|
22
|
+
global _S3_PROXY_CONFIG
|
|
23
|
+
_S3_PROXY_CONFIG = None
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
def get_s3_proxy_config():
|
|
27
|
+
global _S3_PROXY_CONFIG
|
|
28
|
+
if _S3_PROXY_CONFIG is None:
|
|
29
|
+
set_s3_proxy_config(get_s3_proxy_config_from_env())
|
|
30
|
+
return _S3_PROXY_CONFIG
|
|
31
|
+
|
|
32
|
+
|
|
33
|
+
# TODO: Refactor out the _S3_PROXY_CONFIG global variable and instead use the function that
|
|
34
|
+
# extracts it from the environment variables.
|
|
35
|
+
|
|
36
|
+
import os
|
|
37
|
+
import json
|
|
38
|
+
|
|
39
|
+
|
|
40
|
+
def get_s3_proxy_config_from_env():
|
|
41
|
+
env_conf = os.environ.get("METAFLOW_S3_PROXY_USER_CODE_CONFIG")
|
|
42
|
+
if env_conf:
|
|
43
|
+
return json.loads(env_conf)
|
|
44
|
+
return None
|
|
45
|
+
|
|
46
|
+
|
|
47
|
+
# Must match the signature of metaflow.plugins.aws.aws_client.get_aws_client
|
|
48
|
+
# This function is called by the "userland" code inside tasks. Metaflow internals
|
|
49
|
+
# will call the function in metaflow.plugins.aws.aws_client.get_aws_client directly.
|
|
50
|
+
#
|
|
51
|
+
# Unlike the original function, this wrapper will use the CSPR role if both of the following
|
|
52
|
+
# conditions are met:
|
|
53
|
+
#
|
|
54
|
+
# 1. CSPR is set
|
|
55
|
+
# 2. user didn't provide a role to assume explicitly.
|
|
56
|
+
#
|
|
57
|
+
def get_aws_client(
|
|
58
|
+
module, with_error=False, role_arn=None, session_vars=None, client_params=None
|
|
59
|
+
):
|
|
60
|
+
import metaflow.plugins.aws.aws_client
|
|
61
|
+
import os
|
|
62
|
+
|
|
63
|
+
from metaflow_extensions.outerbounds.plugins import USE_CSPR_ROLE_ARN_IF_SET
|
|
64
|
+
from metaflow_extensions.outerbounds.plugins.aws.assume_role import (
|
|
65
|
+
OBP_ASSUME_ROLE_ARN_ENV_VAR,
|
|
66
|
+
)
|
|
67
|
+
|
|
68
|
+
# Check if the assume_role decorator has set a role ARN via environment variable
|
|
69
|
+
# This takes precedence over CSPR but not over explicitly passed role_arn
|
|
70
|
+
if role_arn is None:
|
|
71
|
+
decorator_role_arn = os.environ.get(OBP_ASSUME_ROLE_ARN_ENV_VAR)
|
|
72
|
+
if decorator_role_arn:
|
|
73
|
+
role_arn = decorator_role_arn
|
|
74
|
+
|
|
75
|
+
if module == "s3" and get_s3_proxy_config() is not None:
|
|
76
|
+
return get_aws_client_with_s3_proxy(
|
|
77
|
+
module,
|
|
78
|
+
with_error,
|
|
79
|
+
role_arn,
|
|
80
|
+
session_vars,
|
|
81
|
+
client_params,
|
|
82
|
+
get_s3_proxy_config(),
|
|
83
|
+
)
|
|
84
|
+
|
|
85
|
+
client = metaflow.plugins.aws.aws_client.get_aws_client(
|
|
86
|
+
module,
|
|
87
|
+
with_error=with_error,
|
|
88
|
+
role_arn=role_arn or USE_CSPR_ROLE_ARN_IF_SET,
|
|
89
|
+
session_vars=session_vars,
|
|
90
|
+
client_params=client_params,
|
|
91
|
+
)
|
|
92
|
+
|
|
93
|
+
return client
|
|
94
|
+
|
|
95
|
+
|
|
96
|
+
# This should match the signature of metaflow.plugins.datatools.s3.S3.
|
|
97
|
+
#
|
|
98
|
+
# This assumes that "userland" code inside tasks will call this, while Metaflow
|
|
99
|
+
# internals will call metaflow.plugins.datatools.s3.S3 directly.
|
|
100
|
+
#
|
|
101
|
+
# This wrapper will make S3() use the CSPR role if its set, and user didn't provide
|
|
102
|
+
# a role to assume explicitly.
|
|
103
|
+
def S3(*args, **kwargs):
|
|
104
|
+
import sys
|
|
105
|
+
import metaflow.plugins.datatools.s3
|
|
106
|
+
import os
|
|
107
|
+
from metaflow_extensions.outerbounds.plugins import USE_CSPR_ROLE_ARN_IF_SET
|
|
108
|
+
from metaflow_extensions.outerbounds.plugins.aws.assume_role import (
|
|
109
|
+
OBP_ASSUME_ROLE_ARN_ENV_VAR,
|
|
110
|
+
)
|
|
111
|
+
|
|
112
|
+
# Check if the assume_role decorator has set a role ARN via environment variable
|
|
113
|
+
# This takes precedence over CSPR but not over explicitly passed role
|
|
114
|
+
if "role" not in kwargs or kwargs["role"] is None:
|
|
115
|
+
decorator_role_arn = os.environ.get(OBP_ASSUME_ROLE_ARN_ENV_VAR)
|
|
116
|
+
if decorator_role_arn:
|
|
117
|
+
kwargs["role"] = decorator_role_arn
|
|
118
|
+
else:
|
|
119
|
+
kwargs["role"] = USE_CSPR_ROLE_ARN_IF_SET
|
|
120
|
+
|
|
121
|
+
# Check if S3 proxy is active using module variable (like CSPR)
|
|
122
|
+
if get_s3_proxy_config() is not None:
|
|
123
|
+
return get_S3_with_s3_proxy(get_s3_proxy_config(), *args, **kwargs)
|
|
124
|
+
|
|
125
|
+
return metaflow.plugins.datatools.s3.S3(*args, **kwargs)
|
|
126
|
+
|
|
127
|
+
|
|
128
|
+
# Setting the S3 client docstring in order to ensure that
|
|
129
|
+
# stubs get generated properly.
|
|
130
|
+
import metaflow.plugins.datatools.s3
|
|
131
|
+
|
|
132
|
+
S3.__doc__ = metaflow.plugins.datatools.s3.S3.__doc__
|
|
133
|
+
|
|
10
134
|
from .. import profilers
|
|
135
|
+
from ..plugins.snowflake import Snowflake
|
|
136
|
+
from ..plugins.checkpoint_datastores import nebius_checkpoints, coreweave_checkpoints
|
|
137
|
+
from ..plugins.aws import assume_role
|
|
138
|
+
from . import ob_internal
|
|
139
|
+
from ..plugins.apps.core import (
|
|
140
|
+
AppDeployer,
|
|
141
|
+
DeployedApp,
|
|
142
|
+
bake_image,
|
|
143
|
+
BakedImage,
|
|
144
|
+
package_code,
|
|
145
|
+
PackagedCode,
|
|
146
|
+
)
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
__mf_promote_submodules__ = ["plugins.ollama"]
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
__mf_promote_submodules__ = ["plugins.optuna"]
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
__mf_promote_submodules__ = ["plugins.snowflake"]
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
__mf_promote_submodules__ = ["plugins.torchtune"]
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
__mf_promote_submodules__ = ["plugins.vllm"]
|
|
@@ -0,0 +1,88 @@
|
|
|
1
|
+
from metaflow_extensions.outerbounds.plugins import USE_CSPR_ROLE_ARN_IF_SET
|
|
2
|
+
from metaflow.metaflow_config import AWS_SECRETS_MANAGER_DEFAULT_REGION
|
|
3
|
+
from metaflow_extensions.outerbounds.plugins.s3_proxy.constants import (
|
|
4
|
+
DEFAULT_PROXY_HOST,
|
|
5
|
+
DEFAULT_PROXY_PORT,
|
|
6
|
+
)
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
def get_aws_client_with_s3_proxy(
|
|
10
|
+
module,
|
|
11
|
+
with_error=False,
|
|
12
|
+
role_arn=None,
|
|
13
|
+
session_vars=None,
|
|
14
|
+
client_params=None,
|
|
15
|
+
s3_config=None,
|
|
16
|
+
):
|
|
17
|
+
if not client_params:
|
|
18
|
+
client_params = {}
|
|
19
|
+
|
|
20
|
+
client_params["region_name"] = client_params.get(
|
|
21
|
+
"region_name", s3_config.get("region")
|
|
22
|
+
)
|
|
23
|
+
client_params["endpoint_url"] = s3_config.get(
|
|
24
|
+
"endpoint_url", f"http://{DEFAULT_PROXY_HOST}:{DEFAULT_PROXY_PORT}"
|
|
25
|
+
)
|
|
26
|
+
|
|
27
|
+
import metaflow.plugins.aws.aws_client
|
|
28
|
+
|
|
29
|
+
client = metaflow.plugins.aws.aws_client.get_aws_client(
|
|
30
|
+
module,
|
|
31
|
+
with_error=with_error,
|
|
32
|
+
role_arn=role_arn or USE_CSPR_ROLE_ARN_IF_SET,
|
|
33
|
+
session_vars=session_vars,
|
|
34
|
+
client_params=client_params,
|
|
35
|
+
)
|
|
36
|
+
|
|
37
|
+
def override_s3_proxy_host_header(request, **kwargs):
|
|
38
|
+
region = kwargs["region_name"]
|
|
39
|
+
request.headers["Host"] = f"s3.{region}.amazonaws.com"
|
|
40
|
+
if "x-ob-write-to" not in request.headers and "write_mode" in s3_config:
|
|
41
|
+
request.headers["x-ob-write-to"] = s3_config.get("write_mode")
|
|
42
|
+
|
|
43
|
+
client.meta.events.register("before-sign", override_s3_proxy_host_header)
|
|
44
|
+
|
|
45
|
+
return client
|
|
46
|
+
|
|
47
|
+
|
|
48
|
+
def get_S3_with_s3_proxy(s3_config, *args, **kwargs):
|
|
49
|
+
if "region_name" not in kwargs:
|
|
50
|
+
kwargs["region_name"] = s3_config.get(
|
|
51
|
+
"region", AWS_SECRETS_MANAGER_DEFAULT_REGION
|
|
52
|
+
)
|
|
53
|
+
|
|
54
|
+
kwargs["endpoint_url"] = s3_config.get(
|
|
55
|
+
"endpoint_url", f"http://{DEFAULT_PROXY_HOST}:{DEFAULT_PROXY_PORT}"
|
|
56
|
+
)
|
|
57
|
+
|
|
58
|
+
import metaflow.plugins.datatools.s3
|
|
59
|
+
|
|
60
|
+
mf_s3 = metaflow.plugins.datatools.s3.S3(*args, **kwargs)
|
|
61
|
+
|
|
62
|
+
# Override reset_client to ensure proxy endpoint is preserved
|
|
63
|
+
original_reset_client = mf_s3._s3_client.reset_client
|
|
64
|
+
|
|
65
|
+
def proxy_reset_client():
|
|
66
|
+
original_reset_client()
|
|
67
|
+
import boto3
|
|
68
|
+
|
|
69
|
+
proxy_client = boto3.client(
|
|
70
|
+
"s3",
|
|
71
|
+
region_name=kwargs.get("region_name", s3_config.get("region")),
|
|
72
|
+
endpoint_url=s3_config.get("endpoint_url"),
|
|
73
|
+
)
|
|
74
|
+
mf_s3._s3_client._s3_client = proxy_client
|
|
75
|
+
|
|
76
|
+
mf_s3._s3_client.reset_client = proxy_reset_client
|
|
77
|
+
mf_s3._s3_client.reset_client()
|
|
78
|
+
|
|
79
|
+
def override_s3_proxy_host_header(request, **kwargs):
|
|
80
|
+
region = kwargs["region_name"]
|
|
81
|
+
request.headers["Host"] = f"s3.{region}.amazonaws.com"
|
|
82
|
+
if "x-ob-write-to" not in request.headers and "write_mode" in s3_config:
|
|
83
|
+
request.headers["x-ob-write-to"] = s3_config.get("write_mode")
|
|
84
|
+
|
|
85
|
+
mf_s3._s3_client._s3_client.meta.events.register(
|
|
86
|
+
"before-sign", override_s3_proxy_host_header
|
|
87
|
+
)
|
|
88
|
+
return mf_s3
|
{ob_metaflow_extensions-1.1.45rc3.dist-info → ob_metaflow_extensions-1.5.1.dist-info}/METADATA
RENAMED
|
@@ -1,13 +1,13 @@
|
|
|
1
1
|
Metadata-Version: 2.1
|
|
2
2
|
Name: ob-metaflow-extensions
|
|
3
|
-
Version: 1.1
|
|
3
|
+
Version: 1.5.1
|
|
4
4
|
Summary: Outerbounds Platform Extensions for Metaflow
|
|
5
5
|
Author: Outerbounds, Inc.
|
|
6
6
|
License: Commercial
|
|
7
7
|
Description-Content-Type: text/markdown
|
|
8
8
|
Requires-Dist: boto3
|
|
9
9
|
Requires-Dist: kubernetes
|
|
10
|
-
Requires-Dist: ob-metaflow (==2.
|
|
10
|
+
Requires-Dist: ob-metaflow (==2.19.15.3)
|
|
11
11
|
|
|
12
12
|
# Outerbounds platform package
|
|
13
13
|
|