ob-metaflow-extensions 1.1.151__py2.py3-none-any.whl → 1.6.2__py2.py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- metaflow_extensions/outerbounds/__init__.py +1 -1
- metaflow_extensions/outerbounds/plugins/__init__.py +24 -3
- metaflow_extensions/outerbounds/plugins/apps/app_cli.py +0 -0
- metaflow_extensions/outerbounds/plugins/apps/core/__init__.py +16 -0
- metaflow_extensions/outerbounds/plugins/apps/core/_state_machine.py +506 -0
- metaflow_extensions/outerbounds/plugins/apps/core/_vendor/__init__.py +0 -0
- metaflow_extensions/outerbounds/plugins/apps/core/_vendor/spinner/__init__.py +4 -0
- metaflow_extensions/outerbounds/plugins/apps/core/_vendor/spinner/spinners.py +478 -0
- metaflow_extensions/outerbounds/plugins/apps/core/app_config.py +128 -0
- metaflow_extensions/outerbounds/plugins/apps/core/app_deploy_decorator.py +333 -0
- metaflow_extensions/outerbounds/plugins/apps/core/artifacts.py +0 -0
- metaflow_extensions/outerbounds/plugins/apps/core/capsule.py +1029 -0
- metaflow_extensions/outerbounds/plugins/apps/core/click_importer.py +24 -0
- metaflow_extensions/outerbounds/plugins/apps/core/code_package/__init__.py +3 -0
- metaflow_extensions/outerbounds/plugins/apps/core/code_package/code_packager.py +618 -0
- metaflow_extensions/outerbounds/plugins/apps/core/code_package/examples.py +125 -0
- metaflow_extensions/outerbounds/plugins/apps/core/config/__init__.py +15 -0
- metaflow_extensions/outerbounds/plugins/apps/core/config/cli_generator.py +165 -0
- metaflow_extensions/outerbounds/plugins/apps/core/config/config_utils.py +966 -0
- metaflow_extensions/outerbounds/plugins/apps/core/config/schema_export.py +299 -0
- metaflow_extensions/outerbounds/plugins/apps/core/config/typed_configs.py +233 -0
- metaflow_extensions/outerbounds/plugins/apps/core/config/typed_init_generator.py +537 -0
- metaflow_extensions/outerbounds/plugins/apps/core/config/unified_config.py +1125 -0
- metaflow_extensions/outerbounds/plugins/apps/core/config_schema.yaml +337 -0
- metaflow_extensions/outerbounds/plugins/apps/core/dependencies.py +115 -0
- metaflow_extensions/outerbounds/plugins/apps/core/deployer.py +1300 -0
- metaflow_extensions/outerbounds/plugins/apps/core/exceptions.py +341 -0
- metaflow_extensions/outerbounds/plugins/apps/core/experimental/__init__.py +89 -0
- metaflow_extensions/outerbounds/plugins/apps/core/perimeters.py +123 -0
- metaflow_extensions/outerbounds/plugins/apps/core/secrets.py +164 -0
- metaflow_extensions/outerbounds/plugins/apps/core/utils.py +233 -0
- metaflow_extensions/outerbounds/plugins/apps/core/validations.py +17 -0
- metaflow_extensions/outerbounds/plugins/aws/__init__.py +4 -0
- metaflow_extensions/outerbounds/plugins/aws/assume_role.py +3 -0
- metaflow_extensions/outerbounds/plugins/aws/assume_role_decorator.py +118 -0
- metaflow_extensions/outerbounds/plugins/checkpoint_datastores/coreweave.py +9 -77
- metaflow_extensions/outerbounds/plugins/checkpoint_datastores/external_chckpt.py +85 -0
- metaflow_extensions/outerbounds/plugins/checkpoint_datastores/nebius.py +7 -78
- metaflow_extensions/outerbounds/plugins/fast_bakery/baker.py +119 -0
- metaflow_extensions/outerbounds/plugins/fast_bakery/docker_environment.py +17 -3
- metaflow_extensions/outerbounds/plugins/fast_bakery/fast_bakery.py +1 -0
- metaflow_extensions/outerbounds/plugins/kubernetes/kubernetes_client.py +18 -44
- metaflow_extensions/outerbounds/plugins/kubernetes/pod_killer.py +374 -0
- metaflow_extensions/outerbounds/plugins/nim/card.py +1 -6
- metaflow_extensions/outerbounds/plugins/nim/{__init__.py → nim_decorator.py} +13 -49
- metaflow_extensions/outerbounds/plugins/nim/nim_manager.py +294 -233
- metaflow_extensions/outerbounds/plugins/nim/utils.py +36 -0
- metaflow_extensions/outerbounds/plugins/nvcf/constants.py +2 -2
- metaflow_extensions/outerbounds/plugins/nvct/nvct_decorator.py +32 -8
- metaflow_extensions/outerbounds/plugins/nvct/nvct_runner.py +1 -1
- metaflow_extensions/outerbounds/plugins/ollama/__init__.py +171 -16
- metaflow_extensions/outerbounds/plugins/ollama/constants.py +1 -0
- metaflow_extensions/outerbounds/plugins/ollama/exceptions.py +22 -0
- metaflow_extensions/outerbounds/plugins/ollama/ollama.py +1710 -114
- metaflow_extensions/outerbounds/plugins/ollama/status_card.py +292 -0
- metaflow_extensions/outerbounds/plugins/optuna/__init__.py +49 -0
- metaflow_extensions/outerbounds/plugins/profilers/simple_card_decorator.py +96 -0
- metaflow_extensions/outerbounds/plugins/s3_proxy/__init__.py +7 -0
- metaflow_extensions/outerbounds/plugins/s3_proxy/binary_caller.py +132 -0
- metaflow_extensions/outerbounds/plugins/s3_proxy/constants.py +11 -0
- metaflow_extensions/outerbounds/plugins/s3_proxy/exceptions.py +13 -0
- metaflow_extensions/outerbounds/plugins/s3_proxy/proxy_bootstrap.py +59 -0
- metaflow_extensions/outerbounds/plugins/s3_proxy/s3_proxy_api.py +93 -0
- metaflow_extensions/outerbounds/plugins/s3_proxy/s3_proxy_decorator.py +250 -0
- metaflow_extensions/outerbounds/plugins/s3_proxy/s3_proxy_manager.py +225 -0
- metaflow_extensions/outerbounds/plugins/snowflake/snowflake.py +37 -7
- metaflow_extensions/outerbounds/plugins/snowpark/snowpark.py +18 -8
- metaflow_extensions/outerbounds/plugins/snowpark/snowpark_cli.py +6 -0
- metaflow_extensions/outerbounds/plugins/snowpark/snowpark_client.py +45 -18
- metaflow_extensions/outerbounds/plugins/snowpark/snowpark_decorator.py +18 -9
- metaflow_extensions/outerbounds/plugins/snowpark/snowpark_job.py +10 -4
- metaflow_extensions/outerbounds/plugins/torchtune/__init__.py +163 -0
- metaflow_extensions/outerbounds/plugins/vllm/__init__.py +255 -0
- metaflow_extensions/outerbounds/plugins/vllm/constants.py +1 -0
- metaflow_extensions/outerbounds/plugins/vllm/exceptions.py +1 -0
- metaflow_extensions/outerbounds/plugins/vllm/status_card.py +352 -0
- metaflow_extensions/outerbounds/plugins/vllm/vllm_manager.py +621 -0
- metaflow_extensions/outerbounds/remote_config.py +46 -9
- metaflow_extensions/outerbounds/toplevel/apps/__init__.py +9 -0
- metaflow_extensions/outerbounds/toplevel/apps/exceptions.py +11 -0
- metaflow_extensions/outerbounds/toplevel/global_aliases_for_metaflow_package.py +86 -2
- metaflow_extensions/outerbounds/toplevel/ob_internal.py +4 -0
- metaflow_extensions/outerbounds/toplevel/plugins/optuna/__init__.py +1 -0
- metaflow_extensions/outerbounds/toplevel/plugins/torchtune/__init__.py +1 -0
- metaflow_extensions/outerbounds/toplevel/plugins/vllm/__init__.py +1 -0
- metaflow_extensions/outerbounds/toplevel/s3_proxy.py +88 -0
- {ob_metaflow_extensions-1.1.151.dist-info → ob_metaflow_extensions-1.6.2.dist-info}/METADATA +2 -2
- ob_metaflow_extensions-1.6.2.dist-info/RECORD +136 -0
- metaflow_extensions/outerbounds/plugins/nim/utilities.py +0 -5
- ob_metaflow_extensions-1.1.151.dist-info/RECORD +0 -74
- {ob_metaflow_extensions-1.1.151.dist-info → ob_metaflow_extensions-1.6.2.dist-info}/WHEEL +0 -0
- {ob_metaflow_extensions-1.1.151.dist-info → ob_metaflow_extensions-1.6.2.dist-info}/top_level.txt +0 -0
|
@@ -0,0 +1,164 @@
|
|
|
1
|
+
from typing import Dict
|
|
2
|
+
|
|
3
|
+
import base64
|
|
4
|
+
import json
|
|
5
|
+
import requests
|
|
6
|
+
import random
|
|
7
|
+
import time
|
|
8
|
+
import sys
|
|
9
|
+
|
|
10
|
+
from .utils import safe_requests_wrapper, TODOException
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
class OuterboundsSecretsException(Exception):
    """Error raised when an Outerbounds secret cannot be resolved or decoded."""
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
class SecretNotFound(OuterboundsSecretsException):
    """Specialised secrets error signalling that the requested secret is missing."""
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
class OuterboundsSecretsApiResponse:
    """Read-only view over a decoded secrets-metadata response mapping."""

    def __init__(self, response):
        # The mapping is stored untouched; individual fields are looked up on
        # access, so a missing key only fails when that field is read.
        self.response = response

    @property
    def secret_resource_id(self):
        """Value stored under the ``secret_resource_id`` key of the response."""
        payload = self.response
        return payload["secret_resource_id"]

    @property
    def secret_backend_type(self):
        """Value stored under the ``secret_backend_type`` key of the response."""
        payload = self.response
        return payload["secret_backend_type"]
|
|
32
|
+
|
|
33
|
+
|
|
34
|
+
class SecretRetriever:
    """
    Resolve an Outerbounds integration secret into a plain ``{key: value}`` dict.

    The lookup is two-staged: the integrations metadata endpoint is asked which
    cloud resource backs the secret, then the matching Metaflow secrets backend
    provider fetches it and the payload is decoded.
    """

    def get_secret_as_dict(self, secret_id, options=None, role=None):
        """
        Supports a special way of specifying secrets sources in outerbounds using the format:
        @secrets(sources=["outerbounds.<integrations_name>"])

        When invoked it makes a request to the integrations secrets metadata endpoint on the
        keywest server to get the cloud resource id for a secret. It then uses that to invoke
        secrets manager on the core oss and returns the secrets.

        Parameters
        ----------
        secret_id : str
            Integration name of the secret.
        options : dict, optional
            Accepted for interface compatibility; it is not forwarded to the
            backend provider (an empty dict is always passed instead).
        role : str, optional
            Forwarded as-is to the backend provider.

        Returns
        -------
        dict
            Decoded secret keys and values.

        Raises
        ------
        SecretNotFound
            If the metadata endpoint answers with 404.
        OuterboundsSecretsException
            On configuration, HTTP, empty-result or decoding failures.
        """
        headers = {"Content-Type": "application/json", "Connection": "keep-alive"}
        perimeter, integrations_url = self._get_secret_configs()
        request_payload = {
            "perimeter_name": perimeter,
            "integration_name": secret_id,
        }
        response = self._make_request(integrations_url, headers, request_payload)
        secret_resource_id = response.secret_resource_id
        secret_backend_type = response.secret_backend_type

        from metaflow.plugins.secrets.secrets_decorator import (
            get_secrets_backend_provider,
        )

        secrets_provider = get_secrets_backend_provider(secret_backend_type)
        secret_dict = secrets_provider.get_secret_as_dict(
            secret_resource_id, options={}, role=role
        )

        # Outerbounds stores secrets as binaries. Hence we expect the returned secret to be
        # {<cloud-secret-name>: <base64 encoded full secret>}. We decode the secret here like:
        # 1. decode the base64 encoded full secret
        # 2. load the decoded secret as a json
        # 3. decode the base64 encoded values in the dict
        # 4. return the decoded dict
        if not secret_dict:
            # Fail with a domain error instead of leaking StopIteration from next().
            raise OuterboundsSecretsException(
                f"Secrets backend returned no data for secret `{secret_id}`"
            )
        binary_secret = next(iter(secret_dict.values()))
        return self._decode_secret(binary_secret)

    def _is_base64_encoded(self, data):
        """Return True only if ``data`` is a str holding base64 of UTF-8 text."""
        if not isinstance(data, str):
            return False
        try:
            # Whitespace is dropped so line-wrapped base64 is still recognised.
            # Decoding is strict: without validate=True, b64decode silently
            # discards characters outside the alphabet, which lets some plain
            # (non-base64) strings pass the check by accident.
            compact = "".join(data.split())
            base64.b64decode(compact, validate=True).decode("utf-8")
        except ValueError:
            # binascii.Error and UnicodeDecodeError are both ValueError subclasses.
            return False
        return True

    def _decode_secret(self, secret):
        """
        Turn the raw backend payload into a dict of decoded string values.

        ``secret`` is either a JSON document or a base64-wrapped JSON document;
        every value inside the JSON object is itself base64 encoded.

        Raises
        ------
        OuterboundsSecretsException
            If any decoding step fails; the original error is chained as the cause.
        """
        try:
            secret_str = secret
            if self._is_base64_encoded(secret):
                # we check if the secret string is base64 encoded because the returned secret from
                # AWS secret manager is base64 encoded while the secret from GCP is not
                secret_str = base64.b64decode(secret).decode("utf-8")

            secret_dict = json.loads(secret_str)
            return {
                key: base64.b64decode(value).decode("utf-8")
                for key, value in secret_dict.items()
            }
        except Exception as e:
            raise OuterboundsSecretsException(f"Error decoding secret: {e}") from e

    def _get_secret_configs(self):
        """
        Return ``(perimeter, secrets_metadata_url)``.

        Each setting is read from the Outerbounds remote config when the key is
        present there, otherwise from the process environment.

        Raises
        ------
        OuterboundsSecretsException
            If the perimeter or the integrations URL is empty or missing.
        """
        from metaflow_extensions.outerbounds.remote_config import init_config  # type: ignore
        from os import environ

        conf = init_config()
        if "OBP_PERIMETER" in conf:
            perimeter = conf["OBP_PERIMETER"]
        else:
            # if the perimeter is not in metaflow config, try to get it from the environment
            perimeter = environ.get("OBP_PERIMETER", "")

        if "OBP_INTEGRATIONS_URL" in conf:
            integrations_url = conf["OBP_INTEGRATIONS_URL"]
        else:
            # if the integrations is not in metaflow config, try to get it from the environment
            integrations_url = environ.get("OBP_INTEGRATIONS_URL", "")

        if not perimeter:
            raise OuterboundsSecretsException(
                "No perimeter set. Please make sure to run `outerbounds configure <...>` command which can be found on the Outerbounds UI or reach out to your Outerbounds support team."
            )

        if not integrations_url:
            raise OuterboundsSecretsException(
                "No integrations url set. Please notify your Outerbounds support team about this issue."
            )

        integrations_secrets_metadata_url = f"{integrations_url}/secrets/metadata"
        return perimeter, integrations_secrets_metadata_url

    def _make_request(self, url, headers: Dict, payload: Dict):
        """
        GET ``url`` with ``payload`` as a JSON body and wrap the JSON response.

        Metaflow's ``SERVICE_HEADERS`` are merged over ``headers``. The call is
        made through ``safe_requests_wrapper`` with 5 retries and status 409
        treated as retryable.

        Raises
        ------
        OuterboundsSecretsException
            If ``SERVICE_HEADERS`` cannot be imported, or on a 4xx/5xx response.
        SecretNotFound
            On a 404 response.
        """
        try:
            from metaflow.metaflow_config import SERVICE_HEADERS
        except ImportError as e:
            raise OuterboundsSecretsException(
                "Failed to fetch secret metadata: No Metaflow service headers found"
            ) from e

        request_headers = {**headers, **(SERVICE_HEADERS or {})}

        response = safe_requests_wrapper(
            requests.get,
            url,
            data=json.dumps(payload),
            headers=request_headers,
            conn_error_retries=5,
            retryable_status_codes=[409],
        )
        self._handle_error_response(response)
        return OuterboundsSecretsApiResponse(response.json())

    @staticmethod
    def _handle_error_response(response: "requests.Response"):
        """
        Raise the matching secrets exception for a 4xx/5xx response.

        Returns None for any status below 400.
        """
        status_code = response.status_code
        if status_code >= 500:
            raise OuterboundsSecretsException(
                f"Server error: {response.text}. Please reach out to your Outerbounds support team."
            )
        if status_code == 404:
            raise SecretNotFound(f"Secret not found: {response.text}")

        if status_code >= 400:
            raise OuterboundsSecretsException(
                f"status_code={status_code}\t\n\t\t{response.text}"
            )
|
|
@@ -0,0 +1,233 @@
|
|
|
1
|
+
import random
|
|
2
|
+
import time
|
|
3
|
+
import sys
|
|
4
|
+
import json
|
|
5
|
+
import requests
|
|
6
|
+
from typing import Optional
|
|
7
|
+
|
|
8
|
+
# This click import is not used to construct any ob
|
|
9
|
+
# package cli. It's used only for printing stuff.
|
|
10
|
+
# So we can use the static metaflow._vendor import path
|
|
11
|
+
from metaflow._vendor import click
|
|
12
|
+
from .app_config import CAPSULE_DEBUG
|
|
13
|
+
import sys
|
|
14
|
+
import threading
|
|
15
|
+
import time
|
|
16
|
+
import logging
|
|
17
|
+
import itertools
|
|
18
|
+
from typing import Union, Callable, Any, List
|
|
19
|
+
|
|
20
|
+
from ._vendor.spinner import (
|
|
21
|
+
Spinners,
|
|
22
|
+
)
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
class MultiStepSpinner:
    """
    A spinner that supports multi-step progress and configurable alignment.

    The animation runs on a daemon thread; ``log`` pauses it so messages can be
    printed on their own lines. Usable as a context manager (start on enter,
    stop on exit).

    Parameters
    ----------
    spinner : Spinners
        Which spinner frames/interval to use.
    text : str or callable
        Text to display beside the spinner. A zero-argument callable is
        invoked every time the text is rendered.
    color : str, optional
        Click color name.
    align : {'left','right'}
        Whether to render the spinner to the left or right (default) of the text.
    file : file-like
        Stream the spinner and log messages are written to (default stdout).

    Raises
    ------
    ValueError
        If ``align`` is neither 'left' nor 'right'.
    """

    def __init__(
        self,
        spinner: Spinners = Spinners.dots,
        text: Union[str, Callable[[], str]] = "",
        color: Optional[str] = None,
        align: str = "right",
        file=sys.stdout,
    ):
        cfg = spinner.value
        self.frames = cfg["frames"]
        # The configured interval is divided by 1000 and used as sleep seconds.
        self.interval = float(cfg["interval"]) / 1000.0  # type: ignore
        self.text = text
        self.color = color
        if align not in ("left", "right"):
            raise ValueError("align must be 'left' or 'right'")
        self.align = align
        self._write_file = file
        # initial clear length: max frame width + space + text length
        self._max_frame_len = max(len(frame) for frame in self.frames)  # type: ignore
        self.clear_len = len(self.main_text) + self._max_frame_len + 1

        self._stop_evt = threading.Event()
        self._pause_evt = threading.Event()
        self._thread = None
        self._write_lock = threading.Lock()

    @property
    def main_text(self):
        """Current display text; callables are invoked on every access."""
        if callable(self.text):
            return self.text()
        return self.text

    def _spin(self):
        """Thread target: draw frames until stopped, idling while paused."""
        for frame in itertools.cycle(self.frames):
            if self._stop_evt.is_set():
                break
            if self._pause_evt.is_set():
                time.sleep(0.05)
                continue

            # ---- Core logging critical section ----
            with self._write_lock:
                symbol = click.style(frame, fg=self.color) if self.color else frame
                text = self.main_text
                if self.align == "left":
                    msg = f"{symbol} {text}"
                else:
                    msg = f"{text} {symbol}"

                # A callable text may have grown since construction; remember
                # the widest line drawn so _clear_line blanks all of it.
                self.clear_len = max(
                    self.clear_len, len(text) + self._max_frame_len + 1
                )

                click.echo(msg, nl=False, file=self._write_file)
                click.echo("\r", nl=False, file=self._write_file)
                self._write_file.flush()
            # ---- End of critical section ----
            time.sleep(self.interval)
        # clear the line when done
        self._clear_line()

    def _clear_line(self):
        """Overwrite the spinner line with spaces and return the cursor to column 0."""
        with self._write_lock:
            click.echo(" " * self.clear_len, nl=False, file=self._write_file)
            click.echo("\r", nl=False, file=self._write_file)
            self._write_file.flush()

    def start(self):
        """Start the animation thread; a no-op if it is already running."""
        if self._thread and self._thread.is_alive():
            return
        self._stop_evt.clear()
        self._pause_evt.clear()
        self._thread = threading.Thread(target=self._spin, daemon=True)
        self._thread.start()

    def stop(self):
        """Signal the animation thread to stop and wait for it to finish."""
        self._stop_evt.set()
        if self._thread:
            self._thread.join()

    def log(self, *messages: str):
        """Pause the spinner, emit each message on its own line, then resume."""
        self._pause_evt.set()
        try:
            self._clear_line()
            # ---- Core logging critical section ----
            with self._write_lock:
                self._write_file.flush()
                for message in messages:
                    click.echo(f"{message}", file=self._write_file, nl=True)
                self._write_file.flush()
            # ---- End of critical section ----
        finally:
            # Always resume, otherwise a failed write would leave the spinner
            # paused for the rest of its life.
            self._pause_evt.clear()

    def __enter__(self):
        self.start()
        return self

    def __exit__(self, exc_type, exc, tb):
        self.stop()
|
|
136
|
+
|
|
137
|
+
|
|
138
|
+
class SpinnerLogHandler(logging.Handler):
    """Logging handler that prints each formatted record through a spinner's ``log``."""

    def __init__(self, spinner: MultiStepSpinner, *args, **kwargs):
        # Remaining arguments go straight to logging.Handler.
        super().__init__(*args, **kwargs)
        self.spinner = spinner

    def emit(self, record):
        """Format ``record`` and hand the resulting text to the spinner."""
        self.spinner.log(self.format(record))
|
|
146
|
+
|
|
147
|
+
|
|
148
|
+
class MaximumRetriesExceeded(Exception):
    """
    Raised when a request still gets a retryable status after all attempts.

    Attributes
    ----------
    url : URL of the last response.
    method : name of the requests function that was used.
    status_code : status code of the last response.
    text : body text of the last response.
    """

    def __init__(self, url, method, status_code, text):
        # Populate Exception.args so the instance can be pickled/copied:
        # unpickling re-invokes the constructor with ``args``.
        super().__init__(url, method, status_code, text)
        self.url = url
        self.method = method
        self.status_code = status_code
        self.text = text

    def __str__(self):
        return f"Maximum retries exceeded for {self.url}[{self.method}] {self.status_code} {self.text}"
|
|
157
|
+
|
|
158
|
+
|
|
159
|
+
class TODOException(Exception):
    """Placeholder exception type that adds no behaviour to ``Exception``."""
|
|
161
|
+
|
|
162
|
+
|
|
163
|
+
# The only callables safe_requests_wrapper accepts as ``requests_module_fn``.
requests_funcs = [
    requests.get,
    requests.post,
    requests.put,
    requests.delete,
    requests.patch,
    requests.head,
    requests.options,
]


def safe_requests_wrapper(
    requests_module_fn,
    *args,
    conn_error_retries=2,
    retryable_status_codes=(409,),
    logger_fn=None,
    **kwargs,
):
    """
    There are two categories of errors that we need to handle when dealing with any API server.
    1. HTTP errors. These are errors that are returned from the API server.
        - How to handle retries for this case will be application specific.
    2. Errors when the API server may not be reachable (DNS resolution / network issues)
        - In this scenario, we know that something external to the API server is going wrong causing the issue.
        - Failing prematurely in the case might not be the best course of action since critical user jobs might crash on intermittent issues.
        - So in this case, we can just plainly retry the request.

    This function is a simple wrapper around the retry logic: connection errors are always retried,
    and so are responses whose status code is in `retryable_status_codes`. Waits between attempts
    grow exponentially (powers of two, in seconds, plus a small random offset); no wait happens
    after the final attempt.

    Parameters
    ----------
    requests_module_fn : one of `requests_funcs` (e.g. `requests.get`).
    *args, **kwargs : forwarded unchanged to `requests_module_fn`.
    conn_error_retries : total number of attempts; must be at least 1.
    retryable_status_codes : status codes that trigger another attempt.
    logger_fn : optional callable receiving debug lines when CAPSULE_DEBUG is set
        (stderr is used when it is not given).

    Returns
    -------
    The first response whose status code is not retryable.

    Raises
    ------
    ValueError
        If `requests_module_fn` is unsupported or `conn_error_retries` < 1.
    requests.exceptions.ConnectionError
        If the final attempt fails with a connection error.
    MaximumRetriesExceeded
        If the final attempt still returns a retryable status code.
    """
    if requests_module_fn not in requests_funcs:
        raise ValueError(
            f"safe_requests_wrapper doesn't support {requests_module_fn.__name__}. You can only use the following functions: {requests_funcs}"
        )
    if conn_error_retries < 1:
        # With zero attempts there is neither a response nor an error to report.
        raise ValueError("conn_error_retries must be at least 1")

    _num_retries = 0
    noise = random.uniform(-0.5, 0.5)
    response = None
    while _num_retries < conn_error_retries:
        try:
            response = requests_module_fn(*args, **kwargs)
        except requests.exceptions.ConnectionError:
            if _num_retries >= conn_error_retries - 1:
                # Out of attempts: surface the real network error rather than
                # falling through with no response to report.
                raise
            # Exponential backoff with 2^(_num_retries+1) seconds
            time.sleep((2 ** (_num_retries + 1)) + noise)
            _num_retries += 1
            continue

        if response.status_code not in retryable_status_codes:
            return response

        if CAPSULE_DEBUG:
            debug_line = f"[outerbounds-debug] safe_requests_wrapper: {response.url}[{requests_module_fn.__name__}] {response.status_code} {response.text}"
            if logger_fn:
                logger_fn(debug_line)
            else:
                print(debug_line, file=sys.stderr)

        _num_retries += 1
        if _num_retries < conn_error_retries:
            # Only back off when another attempt will actually follow.
            time.sleep((2 ** (_num_retries + 1)) + noise)

    # The loop only ends here after a retryable response on the last attempt,
    # so ``response`` is always set.
    raise MaximumRetriesExceeded(
        response.url,
        requests_module_fn.__name__,
        response.status_code,
        response.text,
    )
|
|
@@ -0,0 +1,17 @@
|
|
|
1
|
+
import os
|
|
2
|
+
from typing import List
|
|
3
|
+
from .app_config import AppConfig, AppConfigError
|
|
4
|
+
from .secrets import SecretRetriever, SecretNotFound
|
|
5
|
+
|
|
6
|
+
|
|
7
|
+
def secrets_validator(secrets: List[str]):
    """
    Check that every named secret can be retrieved.

    Each name is fetched through a single shared ``SecretRetriever``; the first
    one reported as ``SecretNotFound`` aborts the check with an ``Exception``
    naming it. Any other retrieval error propagates unchanged.
    """
    retriever = SecretRetriever()
    for name in secrets:
        try:
            retriever.get_secret_as_dict(name)
        except SecretNotFound:
            raise Exception(f"Secret named `{name}` not found")
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
def run_validations(app_config: AppConfig):
    """Validation hook for ``app_config``; currently performs no checks."""
    return None
|
|
@@ -0,0 +1,118 @@
|
|
|
1
|
+
from metaflow.user_decorators.mutable_flow import MutableFlow
|
|
2
|
+
from metaflow.user_decorators.mutable_step import MutableStep
|
|
3
|
+
from metaflow.user_decorators.user_flow_decorator import FlowMutator
|
|
4
|
+
from .assume_role import OBP_ASSUME_ROLE_ARN_ENV_VAR
|
|
5
|
+
|
|
6
|
+
|
|
7
|
+
class assume_role(FlowMutator):
    """
    Flow-level decorator for assuming AWS IAM roles.

    When applied to a flow, all steps in the flow will automatically use the specified IAM role-arn
    as their source principal.

    Usage:
    ------
    @assume_role(role_arn="arn:aws:iam::123456789012:role/my-iam-role")
    class MyFlow(FlowSpec):
        @step
        def start(self):
            import boto3
            client = boto3.client("dynamodb")  # Automatically uses the role in the flow decorator
            self.next(self.end)

        @step
        def end(self):
            from metaflow import get_aws_client
            client = get_aws_client("dynamodb")  # Automatically uses the role in the flow decorator

    You can also filter which steps should use the role:
    @assume_role(role_arn="arn:aws:iam::123456789012:role/my-iam-role", steps=["start", "process"])
    class MyFlow(FlowSpec):
        @step
        def start(self):
            # user code in this step will use the assumed role
            pass

        @step
        def process(self):
            # user code in this step will use the assumed role
            pass

        @step
        def end(self):
            # user code in this step will NOT use the assumed role
            pass
    """

    def init(self, *args, **kwargs):
        """
        Read and validate the decorator's keyword arguments.

        Raises
        ------
        ValueError
            If ``role_arn`` is missing, is not a string starting with
            'arn:aws:iam::', or if ``steps`` is not a list/tuple of strings.
        """
        self.role_arn = kwargs.get("role_arn", None)
        self.steps = kwargs.get("steps", None)

        if self.role_arn is None:
            raise ValueError(
                "`role_arn` keyword argument is required for the assume_role decorator"
            )

        # The isinstance guard turns a non-string role_arn into the same
        # ValueError instead of an AttributeError on ``startswith``.
        if not isinstance(self.role_arn, str) or not self.role_arn.startswith(
            "arn:aws:iam::"
        ):
            raise ValueError(
                "`role_arn` must be a valid AWS IAM role ARN starting with 'arn:aws:iam::'"
            )

        # Validate steps parameter
        if self.steps is not None:
            if not isinstance(self.steps, (list, tuple)):
                raise ValueError("`steps` must be a list or tuple of step names")
            if not all(isinstance(s, str) for s in self.steps):
                raise ValueError("All step names in `steps` must be strings")

    def pre_mutate(self, mutable_flow: MutableFlow) -> None:
        """
        This method is called by Metaflow to apply the decorator to the flow.
        It sets up environment variables that will be used by the AWS client
        to automatically assume the specified role.

        Raises
        ------
        ValueError
            If ``steps`` names a step that does not exist in the flow.
        """
        # Import environment decorator at runtime to avoid circular imports
        from metaflow import environment

        selected_steps = None if self.steps is None else set(self.steps)

        # Validate that all specified steps exist in the flow
        if selected_steps is not None:
            flow_step_names = {step_name for step_name, _ in mutable_flow.steps}
            missing_steps = selected_steps - flow_step_names

            if missing_steps:
                raise ValueError(
                    f"Step(s) {sorted(missing_steps)} specified in `steps` parameter "
                    f"do not exist in the flow. Available steps: {sorted(flow_step_names)}"
                )

        def _swap_environment_variables(step: MutableStep, role_arn: str) -> None:
            # Start from the role ARN, then layer the step's own @environment
            # vars on top; on a key collision the step's value is kept because
            # it is applied last.
            _env_kwargs = {OBP_ASSUME_ROLE_ARN_ENV_VAR: role_arn}
            for name, _, _, deco_kwargs in step.decorator_specs:
                if name == "environment":
                    # An @environment without ``vars`` contributes nothing
                    # instead of raising KeyError.
                    _env_kwargs.update((deco_kwargs or {}).get("vars") or {})

            # Removal is unconditional: it is requested even when no
            # @environment spec was seen on the step.
            step.remove_decorator("environment")

            # add the environment decorator carrying the merged variables
            step.add_decorator(
                environment,
                deco_kwargs=dict(vars=_env_kwargs),
            )

        # Apply the role assumption setup to all steps in the flow (or filtered steps).
        # The role ARN is set as an environment variable that will be picked up
        # by the get_aws_client function.
        for step_name, step in mutable_flow.steps:
            if selected_steps is None or step_name in selected_steps:
                _swap_environment_variables(step, self.role_arn)
|
|
@@ -1,12 +1,11 @@
|
|
|
1
|
-
from metaflow.
|
|
2
|
-
|
|
3
|
-
|
|
4
|
-
|
|
5
|
-
)
|
|
1
|
+
from metaflow.user_decorators.user_flow_decorator import FlowMutator
|
|
2
|
+
from metaflow.user_decorators.mutable_flow import MutableFlow
|
|
3
|
+
from metaflow.user_decorators.mutable_step import MutableStep
|
|
4
|
+
from .external_chckpt import _ExternalCheckpointFlowDeco
|
|
6
5
|
import os
|
|
7
6
|
|
|
8
7
|
|
|
9
|
-
class coreweave_checkpoints(
|
|
8
|
+
class coreweave_checkpoints(_ExternalCheckpointFlowDeco):
|
|
10
9
|
|
|
11
10
|
"""
|
|
12
11
|
|
|
@@ -46,78 +45,14 @@ class coreweave_checkpoints(CustomFlowDecorator):
|
|
|
46
45
|
super().__init__(*args, **kwargs)
|
|
47
46
|
|
|
48
47
|
def init(self, *args, **kwargs):
|
|
49
|
-
|
|
50
|
-
|
|
51
|
-
self.secrets = kwargs.get("secrets", [])
|
|
52
|
-
if self.bucket_path is None:
|
|
53
|
-
raise ValueError(
|
|
54
|
-
"`bucket_path` keyword argument is required for the coreweave_datastore"
|
|
55
|
-
)
|
|
56
|
-
if not self.bucket_path.startswith("s3://"):
|
|
57
|
-
raise ValueError(
|
|
58
|
-
"`bucket_path` must start with `s3://` for the coreweave_datastore"
|
|
59
|
-
)
|
|
60
|
-
|
|
48
|
+
super().init(*args, **kwargs)
|
|
61
49
|
self.coreweave_endpoint_url = f"https://cwobject.com"
|
|
62
|
-
if self.secrets is None:
|
|
63
|
-
raise ValueError(
|
|
64
|
-
"`secrets` keyword argument is required for the coreweave_datastore"
|
|
65
|
-
)
|
|
66
50
|
|
|
67
|
-
def
|
|
51
|
+
def pre_mutate(self, mutable_flow: MutableFlow) -> None:
|
|
68
52
|
from metaflow import (
|
|
69
|
-
checkpoint,
|
|
70
|
-
model,
|
|
71
|
-
huggingface_hub,
|
|
72
|
-
secrets,
|
|
73
53
|
with_artifact_store,
|
|
74
54
|
)
|
|
75
55
|
|
|
76
|
-
def _add_secrets(step: MutableStep) -> None:
|
|
77
|
-
decos_to_add = []
|
|
78
|
-
swapping_decos = {
|
|
79
|
-
"huggingface_hub": huggingface_hub,
|
|
80
|
-
"model": model,
|
|
81
|
-
"checkpoint": checkpoint,
|
|
82
|
-
}
|
|
83
|
-
already_has_secrets = False
|
|
84
|
-
secrets_present_in_deco = []
|
|
85
|
-
for d in step.decorators:
|
|
86
|
-
if d.name in swapping_decos:
|
|
87
|
-
decos_to_add.append((d.name, d.attributes))
|
|
88
|
-
elif d.name == "secrets":
|
|
89
|
-
already_has_secrets = True
|
|
90
|
-
secrets_present_in_deco.extend(d.attributes["sources"])
|
|
91
|
-
|
|
92
|
-
# If the step aleady has secrets then take all the sources in
|
|
93
|
-
# the secrets and add the addtional secrets to the existing secrets
|
|
94
|
-
secrets_to_add = self.secrets
|
|
95
|
-
if already_has_secrets:
|
|
96
|
-
secrets_to_add.extend(secrets_present_in_deco)
|
|
97
|
-
|
|
98
|
-
secrets_to_add = list(set(secrets_to_add))
|
|
99
|
-
|
|
100
|
-
if len(decos_to_add) == 0:
|
|
101
|
-
if already_has_secrets:
|
|
102
|
-
step.remove_decorator("secrets")
|
|
103
|
-
|
|
104
|
-
step.add_decorator(
|
|
105
|
-
secrets,
|
|
106
|
-
sources=secrets_to_add,
|
|
107
|
-
)
|
|
108
|
-
return
|
|
109
|
-
|
|
110
|
-
for d, _ in decos_to_add:
|
|
111
|
-
step.remove_decorator(d)
|
|
112
|
-
|
|
113
|
-
step.add_decorator(
|
|
114
|
-
secrets,
|
|
115
|
-
sources=secrets_to_add,
|
|
116
|
-
)
|
|
117
|
-
for d, attrs in decos_to_add:
|
|
118
|
-
_deco_to_add = swapping_decos[d]
|
|
119
|
-
step.add_decorator(_deco_to_add, **attrs)
|
|
120
|
-
|
|
121
56
|
def _coreweave_config():
|
|
122
57
|
return {
|
|
123
58
|
"root": self.bucket_path,
|
|
@@ -131,9 +66,6 @@ class coreweave_checkpoints(CustomFlowDecorator):
|
|
|
131
66
|
|
|
132
67
|
mutable_flow.add_decorator(
|
|
133
68
|
with_artifact_store,
|
|
134
|
-
type="coreweave",
|
|
135
|
-
config=_coreweave_config,
|
|
69
|
+
deco_kwargs=dict(type="coreweave", config=_coreweave_config),
|
|
136
70
|
)
|
|
137
|
-
|
|
138
|
-
for step_name, step in mutable_flow.steps:
|
|
139
|
-
_add_secrets(step)
|
|
71
|
+
self._swap_secrets(mutable_flow)
|