ob-metaflow-extensions 1.1.45rc3__py2.py3-none-any.whl → 1.5.1__py2.py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of ob-metaflow-extensions might be problematic. Click here for more details.
- metaflow_extensions/outerbounds/__init__.py +1 -7
- metaflow_extensions/outerbounds/config/__init__.py +35 -0
- metaflow_extensions/outerbounds/plugins/__init__.py +186 -57
- metaflow_extensions/outerbounds/plugins/apps/__init__.py +0 -0
- metaflow_extensions/outerbounds/plugins/apps/app_cli.py +0 -0
- metaflow_extensions/outerbounds/plugins/apps/app_utils.py +187 -0
- metaflow_extensions/outerbounds/plugins/apps/consts.py +3 -0
- metaflow_extensions/outerbounds/plugins/apps/core/__init__.py +15 -0
- metaflow_extensions/outerbounds/plugins/apps/core/_state_machine.py +506 -0
- metaflow_extensions/outerbounds/plugins/apps/core/_vendor/__init__.py +0 -0
- metaflow_extensions/outerbounds/plugins/apps/core/_vendor/spinner/__init__.py +4 -0
- metaflow_extensions/outerbounds/plugins/apps/core/_vendor/spinner/spinners.py +478 -0
- metaflow_extensions/outerbounds/plugins/apps/core/app_config.py +128 -0
- metaflow_extensions/outerbounds/plugins/apps/core/app_deploy_decorator.py +330 -0
- metaflow_extensions/outerbounds/plugins/apps/core/artifacts.py +0 -0
- metaflow_extensions/outerbounds/plugins/apps/core/capsule.py +958 -0
- metaflow_extensions/outerbounds/plugins/apps/core/click_importer.py +24 -0
- metaflow_extensions/outerbounds/plugins/apps/core/code_package/__init__.py +3 -0
- metaflow_extensions/outerbounds/plugins/apps/core/code_package/code_packager.py +618 -0
- metaflow_extensions/outerbounds/plugins/apps/core/code_package/examples.py +125 -0
- metaflow_extensions/outerbounds/plugins/apps/core/config/__init__.py +15 -0
- metaflow_extensions/outerbounds/plugins/apps/core/config/cli_generator.py +165 -0
- metaflow_extensions/outerbounds/plugins/apps/core/config/config_utils.py +966 -0
- metaflow_extensions/outerbounds/plugins/apps/core/config/schema_export.py +299 -0
- metaflow_extensions/outerbounds/plugins/apps/core/config/typed_configs.py +233 -0
- metaflow_extensions/outerbounds/plugins/apps/core/config/typed_init_generator.py +537 -0
- metaflow_extensions/outerbounds/plugins/apps/core/config/unified_config.py +1125 -0
- metaflow_extensions/outerbounds/plugins/apps/core/config_schema.yaml +337 -0
- metaflow_extensions/outerbounds/plugins/apps/core/dependencies.py +115 -0
- metaflow_extensions/outerbounds/plugins/apps/core/deployer.py +959 -0
- metaflow_extensions/outerbounds/plugins/apps/core/experimental/__init__.py +89 -0
- metaflow_extensions/outerbounds/plugins/apps/core/perimeters.py +87 -0
- metaflow_extensions/outerbounds/plugins/apps/core/secrets.py +164 -0
- metaflow_extensions/outerbounds/plugins/apps/core/utils.py +233 -0
- metaflow_extensions/outerbounds/plugins/apps/core/validations.py +17 -0
- metaflow_extensions/outerbounds/plugins/apps/deploy_decorator.py +201 -0
- metaflow_extensions/outerbounds/plugins/apps/supervisord_utils.py +243 -0
- metaflow_extensions/outerbounds/plugins/auth_server.py +28 -8
- metaflow_extensions/outerbounds/plugins/aws/__init__.py +4 -0
- metaflow_extensions/outerbounds/plugins/aws/assume_role.py +3 -0
- metaflow_extensions/outerbounds/plugins/aws/assume_role_decorator.py +118 -0
- metaflow_extensions/outerbounds/plugins/card_utilities/__init__.py +0 -0
- metaflow_extensions/outerbounds/plugins/card_utilities/async_cards.py +142 -0
- metaflow_extensions/outerbounds/plugins/card_utilities/extra_components.py +545 -0
- metaflow_extensions/outerbounds/plugins/card_utilities/injector.py +70 -0
- metaflow_extensions/outerbounds/plugins/checkpoint_datastores/__init__.py +2 -0
- metaflow_extensions/outerbounds/plugins/checkpoint_datastores/coreweave.py +71 -0
- metaflow_extensions/outerbounds/plugins/checkpoint_datastores/external_chckpt.py +85 -0
- metaflow_extensions/outerbounds/plugins/checkpoint_datastores/nebius.py +73 -0
- metaflow_extensions/outerbounds/plugins/fast_bakery/__init__.py +0 -0
- metaflow_extensions/outerbounds/plugins/fast_bakery/baker.py +110 -0
- metaflow_extensions/outerbounds/plugins/fast_bakery/docker_environment.py +391 -0
- metaflow_extensions/outerbounds/plugins/fast_bakery/fast_bakery.py +188 -0
- metaflow_extensions/outerbounds/plugins/fast_bakery/fast_bakery_cli.py +54 -0
- metaflow_extensions/outerbounds/plugins/fast_bakery/fast_bakery_decorator.py +50 -0
- metaflow_extensions/outerbounds/plugins/kubernetes/kubernetes_client.py +79 -0
- metaflow_extensions/outerbounds/plugins/kubernetes/pod_killer.py +374 -0
- metaflow_extensions/outerbounds/plugins/nim/card.py +140 -0
- metaflow_extensions/outerbounds/plugins/nim/nim_decorator.py +101 -0
- metaflow_extensions/outerbounds/plugins/nim/nim_manager.py +379 -0
- metaflow_extensions/outerbounds/plugins/nim/utils.py +36 -0
- metaflow_extensions/outerbounds/plugins/nvcf/__init__.py +0 -0
- metaflow_extensions/outerbounds/plugins/nvcf/constants.py +3 -0
- metaflow_extensions/outerbounds/plugins/nvcf/exceptions.py +94 -0
- metaflow_extensions/outerbounds/plugins/nvcf/heartbeat_store.py +178 -0
- metaflow_extensions/outerbounds/plugins/nvcf/nvcf.py +417 -0
- metaflow_extensions/outerbounds/plugins/nvcf/nvcf_cli.py +280 -0
- metaflow_extensions/outerbounds/plugins/nvcf/nvcf_decorator.py +242 -0
- metaflow_extensions/outerbounds/plugins/nvcf/utils.py +6 -0
- metaflow_extensions/outerbounds/plugins/nvct/__init__.py +0 -0
- metaflow_extensions/outerbounds/plugins/nvct/exceptions.py +71 -0
- metaflow_extensions/outerbounds/plugins/nvct/nvct.py +131 -0
- metaflow_extensions/outerbounds/plugins/nvct/nvct_cli.py +289 -0
- metaflow_extensions/outerbounds/plugins/nvct/nvct_decorator.py +286 -0
- metaflow_extensions/outerbounds/plugins/nvct/nvct_runner.py +218 -0
- metaflow_extensions/outerbounds/plugins/nvct/utils.py +29 -0
- metaflow_extensions/outerbounds/plugins/ollama/__init__.py +225 -0
- metaflow_extensions/outerbounds/plugins/ollama/constants.py +1 -0
- metaflow_extensions/outerbounds/plugins/ollama/exceptions.py +22 -0
- metaflow_extensions/outerbounds/plugins/ollama/ollama.py +1924 -0
- metaflow_extensions/outerbounds/plugins/ollama/status_card.py +292 -0
- metaflow_extensions/outerbounds/plugins/optuna/__init__.py +48 -0
- metaflow_extensions/outerbounds/plugins/perimeters.py +19 -5
- metaflow_extensions/outerbounds/plugins/profilers/deco_injector.py +70 -0
- metaflow_extensions/outerbounds/plugins/profilers/gpu_profile_decorator.py +88 -0
- metaflow_extensions/outerbounds/plugins/profilers/simple_card_decorator.py +96 -0
- metaflow_extensions/outerbounds/plugins/s3_proxy/__init__.py +7 -0
- metaflow_extensions/outerbounds/plugins/s3_proxy/binary_caller.py +132 -0
- metaflow_extensions/outerbounds/plugins/s3_proxy/constants.py +11 -0
- metaflow_extensions/outerbounds/plugins/s3_proxy/exceptions.py +13 -0
- metaflow_extensions/outerbounds/plugins/s3_proxy/proxy_bootstrap.py +59 -0
- metaflow_extensions/outerbounds/plugins/s3_proxy/s3_proxy_api.py +93 -0
- metaflow_extensions/outerbounds/plugins/s3_proxy/s3_proxy_decorator.py +250 -0
- metaflow_extensions/outerbounds/plugins/s3_proxy/s3_proxy_manager.py +225 -0
- metaflow_extensions/outerbounds/plugins/secrets/__init__.py +0 -0
- metaflow_extensions/outerbounds/plugins/secrets/secrets.py +204 -0
- metaflow_extensions/outerbounds/plugins/snowflake/__init__.py +3 -0
- metaflow_extensions/outerbounds/plugins/snowflake/snowflake.py +378 -0
- metaflow_extensions/outerbounds/plugins/snowpark/__init__.py +0 -0
- metaflow_extensions/outerbounds/plugins/snowpark/snowpark.py +309 -0
- metaflow_extensions/outerbounds/plugins/snowpark/snowpark_cli.py +277 -0
- metaflow_extensions/outerbounds/plugins/snowpark/snowpark_client.py +150 -0
- metaflow_extensions/outerbounds/plugins/snowpark/snowpark_decorator.py +273 -0
- metaflow_extensions/outerbounds/plugins/snowpark/snowpark_exceptions.py +13 -0
- metaflow_extensions/outerbounds/plugins/snowpark/snowpark_job.py +241 -0
- metaflow_extensions/outerbounds/plugins/snowpark/snowpark_service_spec.py +259 -0
- metaflow_extensions/outerbounds/plugins/tensorboard/__init__.py +50 -0
- metaflow_extensions/outerbounds/plugins/torchtune/__init__.py +163 -0
- metaflow_extensions/outerbounds/plugins/vllm/__init__.py +255 -0
- metaflow_extensions/outerbounds/plugins/vllm/constants.py +1 -0
- metaflow_extensions/outerbounds/plugins/vllm/exceptions.py +1 -0
- metaflow_extensions/outerbounds/plugins/vllm/status_card.py +352 -0
- metaflow_extensions/outerbounds/plugins/vllm/vllm_manager.py +621 -0
- metaflow_extensions/outerbounds/profilers/gpu.py +131 -47
- metaflow_extensions/outerbounds/remote_config.py +53 -16
- metaflow_extensions/outerbounds/toplevel/global_aliases_for_metaflow_package.py +138 -2
- metaflow_extensions/outerbounds/toplevel/ob_internal.py +4 -0
- metaflow_extensions/outerbounds/toplevel/plugins/ollama/__init__.py +1 -0
- metaflow_extensions/outerbounds/toplevel/plugins/optuna/__init__.py +1 -0
- metaflow_extensions/outerbounds/toplevel/plugins/snowflake/__init__.py +1 -0
- metaflow_extensions/outerbounds/toplevel/plugins/torchtune/__init__.py +1 -0
- metaflow_extensions/outerbounds/toplevel/plugins/vllm/__init__.py +1 -0
- metaflow_extensions/outerbounds/toplevel/s3_proxy.py +88 -0
- {ob_metaflow_extensions-1.1.45rc3.dist-info → ob_metaflow_extensions-1.5.1.dist-info}/METADATA +2 -2
- ob_metaflow_extensions-1.5.1.dist-info/RECORD +133 -0
- ob_metaflow_extensions-1.1.45rc3.dist-info/RECORD +0 -19
- {ob_metaflow_extensions-1.1.45rc3.dist-info → ob_metaflow_extensions-1.5.1.dist-info}/WHEEL +0 -0
- {ob_metaflow_extensions-1.1.45rc3.dist-info → ob_metaflow_extensions-1.5.1.dist-info}/top_level.txt +0 -0
|
@@ -1,12 +1,6 @@
|
|
|
1
1
|
import metaflow.metaflow_config_funcs
|
|
2
2
|
|
|
3
|
-
from metaflow_extensions.outerbounds.remote_config import init_config
|
|
4
|
-
|
|
5
|
-
from metaflow_extensions.outerbounds.plugins.perimeters import (
|
|
6
|
-
set_current_perimeter_config_url_in_environment,
|
|
7
|
-
)
|
|
8
|
-
|
|
9
|
-
set_current_perimeter_config_url_in_environment()
|
|
3
|
+
from metaflow_extensions.outerbounds.remote_config import init_config, reload_config
|
|
10
4
|
|
|
11
5
|
# we want to override OSS Metaflow's initialization behavior with our own to support remote configs
|
|
12
6
|
# we're reassigning the METAFLOW_CONFIG variable because all downstream settings rely on it and
|
|
@@ -1,5 +1,40 @@
|
|
|
1
|
+
from metaflow.metaflow_config import from_conf
|
|
2
|
+
|
|
1
3
|
DEFAULT_AWS_CLIENT_PROVIDER = "obp"
|
|
2
4
|
|
|
3
5
|
DEFAULT_AZURE_CLIENT_PROVIDER = "obp"
|
|
4
6
|
|
|
5
7
|
DEFAULT_GCP_CLIENT_PROVIDER = "obp"
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
###
|
|
11
|
+
# On Demand Docker image build configuration
|
|
12
|
+
###
|
|
13
|
+
# Image builder service url
|
|
14
|
+
FAST_BAKERY_URL = from_conf("FAST_BAKERY_URL", None)
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
###
|
|
18
|
+
# NVCF configuration
|
|
19
|
+
###
|
|
20
|
+
# Maximum number of consecutive heartbeats that can be missed.
|
|
21
|
+
NVIDIA_HEARTBEAT_THRESHOLD = from_conf("NVIDIA_HEARTBEAT_THRESHOLD", "3")
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
###
|
|
25
|
+
# Snowpark configuration
|
|
26
|
+
###
|
|
27
|
+
# Snowflake account to use with the @snowpark decorator
|
|
28
|
+
SNOWPARK_ACCOUNT = from_conf("SNOWPARK_ACCOUNT")
|
|
29
|
+
# Snowflake user to use with the @snowpark decorator
|
|
30
|
+
SNOWPARK_USER = from_conf("SNOWPARK_USER")
|
|
31
|
+
# Snowflake password to use with the @snowpark decorator
|
|
32
|
+
SNOWPARK_PASSWORD = from_conf("SNOWPARK_PASSWORD")
|
|
33
|
+
# Snowflake role to use with the @snowpark decorator
|
|
34
|
+
SNOWPARK_ROLE = from_conf("SNOWPARK_ROLE")
|
|
35
|
+
# Snowflake database to use with the @snowpark decorator
|
|
36
|
+
SNOWPARK_DATABASE = from_conf("SNOWPARK_DATABASE")
|
|
37
|
+
# Snowflake warehouse to use with the @snowpark decorator
|
|
38
|
+
SNOWPARK_WAREHOUSE = from_conf("SNOWPARK_WAREHOUSE")
|
|
39
|
+
# Snowflake schema to use with the @snowpark decorator
|
|
40
|
+
SNOWPARK_SCHEMA = from_conf("SNOWPARK_SCHEMA")
|
|
@@ -32,6 +32,134 @@ def hide_access_keys(*args, **kwds):
|
|
|
32
32
|
os.environ["AWS_SESSION_TOKEN"] = AWS_SESSION_TOKEN
|
|
33
33
|
|
|
34
34
|
|
|
35
|
+
# This is a special placeholder value that can be passed as role_arn to
|
|
36
|
+
# get_boto3_session() which makes it use the CSPR role, if it's set.
|
|
37
|
+
USE_CSPR_ROLE_ARN_IF_SET = "__cspr__"
|
|
38
|
+
|
|
39
|
+
|
|
40
|
+
def _write_file_atomically(path, content):
    """
    Write `content` to `path` without exposing a partially written file.

    The data is written to a temporary file created in the same directory as
    `path`, which is then renamed over `path`. Creating the temporary file
    next to the destination keeps the rename on one filesystem; a temp file
    in the default temp directory (e.g. a custom TMPDIR) may live on a
    different filesystem, where os.rename() can fail.
    """
    with tempfile.NamedTemporaryFile(
        "w", delete=False, dir=os.path.dirname(path)
    ) as f:
        f.write(content)
        tmp_path = f.name
    os.rename(tmp_path, path)


def get_boto3_session(role_arn=None, session_vars=None):
    """
    Build a boto3 session from the token returned by the auth server.

    Side effects: writes the token (and, when a CSPR role applies, an AWS
    config file) under /tmp, and sets AWS_* environment variables.

    Parameters
    ----------
    role_arn : str, optional
        - None: use the task role.
        - USE_CSPR_ROLE_ARN_IF_SET: use the CSPR role if one is configured,
          otherwise the task role.
        - any other value: assume that role using the task role credentials.
    session_vars : dict, optional
        Passed to botocore.session.Session(session_vars=...) when an explicit
        `role_arn` is assumed; unused otherwise.

    Returns
    -------
    boto3.session.Session
    """
    import boto3
    import botocore
    from metaflow_extensions.outerbounds.plugins.auth_server import get_token
    from metaflow_extensions.outerbounds.plugins.aws.assume_role import (
        OBP_ASSUME_ROLE_ARN_ENV_VAR,
    )

    from hashlib import sha256
    from metaflow.util import get_username

    user = get_username()

    token_info = get_token("/generate/aws")

    # File names are derived from the user name so this works with multiple
    # users on the same machine. The user name is hashed so we don't have to
    # deal with special characters in the file name; the name is not exposed
    # to the user anyway, so it doesn't matter that it's a little ugly.
    user_hash = sha256(user.encode("utf-8")).hexdigest()[:16]
    token_file = "/tmp/obp_token." + user_hash

    # Write to a temp file then rename, so nobody can read the file after it
    # was opened for writing (and truncated) but before the token was written.
    _write_file_atomically(token_file, token_info["token"])

    # A CSPR ARN set by the assume_role decorator via the environment takes
    # precedence over the CSPR role that comes from the token_info response.
    cspr_role = (
        os.environ.get(OBP_ASSUME_ROLE_ARN_ENV_VAR)
        or token_info.get("cspr_role_arn")
        or None
    )

    if cspr_role:
        # If a CSPR role is set, we make it the default role to assume for
        # the AWS SDK. We do this by writing an AWS config file with two
        # profiles: one to get credentials for the task role in exchange for
        # the OIDC token, and a second to assume the CSPR role using the
        # task role credentials.
        import configparser
        from io import StringIO

        aws_config = configparser.ConfigParser()

        # Task role profile
        aws_config["profile task"] = {
            "role_arn": token_info["role_arn"],
            "web_identity_token_file": token_file,
        }

        # CSPR role profile (selected as the default through AWS_PROFILE below)
        aws_config["profile cspr"] = {
            "role_arn": cspr_role,
            "source_profile": "task",
        }

        aws_config_string = StringIO()
        aws_config.write(aws_config_string)
        aws_config_file = "/tmp/aws_config." + user_hash
        _write_file_atomically(aws_config_file, aws_config_string.getvalue())
        os.environ["AWS_CONFIG_FILE"] = aws_config_file
        os.environ["AWS_PROFILE"] = "cspr"
    else:
        os.environ["AWS_WEB_IDENTITY_TOKEN_FILE"] = token_file
        os.environ["AWS_ROLE_ARN"] = token_info["role_arn"]

    # Enable regional STS endpoints. This is the new recommended way
    # by AWS [1] and is the more performant way.
    # [1] https://docs.aws.amazon.com/sdkref/latest/guide/feature-sts-regionalized-endpoints.html
    os.environ["AWS_STS_REGIONAL_ENDPOINTS"] = "regional"
    if token_info.get("region"):
        os.environ["AWS_DEFAULT_REGION"] = token_info["region"]

    if cspr_role:
        # The generated AWS config is used here since AWS_CONFIG_FILE was set
        # above. The CSPR profile is only selected when the caller explicitly
        # asked for it; every other caller starts from the task profile.
        if role_arn == USE_CSPR_ROLE_ARN_IF_SET:
            session = boto3.session.Session(profile_name="cspr")
        else:
            session = boto3.session.Session(profile_name="task")
    else:
        # Not using AWS config, just AWS_WEB_IDENTITY_TOKEN_FILE + AWS_ROLE_ARN
        session = boto3.session.Session()

    if role_arn and role_arn != USE_CSPR_ROLE_ARN_IF_SET:
        # The caller provided a role_arn: assume that role using the task
        # role credentials. The CSPR role is not used.
        fetcher = botocore.credentials.AssumeRoleCredentialFetcher(
            client_creator=session._session.create_client,
            source_credentials=session._session.get_credentials(),
            role_arn=role_arn,
            extra_args={},
        )
        creds = botocore.credentials.DeferredRefreshableCredentials(
            method="assume-role", refresh_using=fetcher.fetch_credentials
        )
        botocore_session = botocore.session.Session(session_vars=session_vars)
        botocore_session._credentials = creds
        return boto3.session.Session(botocore_session=botocore_session)

    # No explicit role requested: return the session selected above (CSPR
    # profile for USE_CSPR_ROLE_ARN_IF_SET when a CSPR role exists, task role
    # otherwise).
    return session
|
|
161
|
+
|
|
162
|
+
|
|
35
163
|
class ObpAuthProvider(object):
|
|
36
164
|
name = "obp"
|
|
37
165
|
|
|
@@ -42,67 +170,19 @@ class ObpAuthProvider(object):
|
|
|
42
170
|
if client_params is None:
|
|
43
171
|
client_params = {}
|
|
44
172
|
|
|
45
|
-
import boto3
|
|
46
|
-
import botocore
|
|
47
173
|
from botocore.exceptions import ClientError
|
|
48
|
-
from
|
|
49
|
-
|
|
50
|
-
from hashlib import sha256
|
|
51
|
-
from metaflow.util import get_username
|
|
52
|
-
|
|
53
|
-
user = get_username()
|
|
54
|
-
|
|
55
|
-
token_info = get_token("/generate/aws")
|
|
56
|
-
|
|
57
|
-
# Write token to a file. The file name is derived from the user name
|
|
58
|
-
# so it works with multiple users on the same machine.
|
|
59
|
-
#
|
|
60
|
-
# We hash the user name so we don't have to deal with special characters
|
|
61
|
-
# in the file name and the file name is not exposed to the user
|
|
62
|
-
# anyways, so it doesn't matter that its a little ugly.
|
|
63
|
-
token_file = "/tmp/obp_token." + sha256(user.encode("utf-8")).hexdigest()[:16]
|
|
64
|
-
|
|
65
|
-
# Write to a temp file then rename to avoid a situation when someone
|
|
66
|
-
# tries to read the file after it was open for writing (and truncated)
|
|
67
|
-
# but before the token was written to it.
|
|
68
|
-
with tempfile.NamedTemporaryFile("w", delete=False) as f:
|
|
69
|
-
f.write(token_info["token"])
|
|
70
|
-
tmp_token_file = f.name
|
|
71
|
-
os.rename(tmp_token_file, token_file)
|
|
72
|
-
|
|
73
|
-
os.environ["AWS_WEB_IDENTITY_TOKEN_FILE"] = token_file
|
|
74
|
-
os.environ["AWS_ROLE_ARN"] = token_info["role_arn"]
|
|
75
|
-
|
|
76
|
-
# Enable regional STS endpoints. This is the new recommended way
|
|
77
|
-
# by AWS [1] and is the more performant way.
|
|
78
|
-
# [1] https://docs.aws.amazon.com/sdkref/latest/guide/feature-sts-regionalized-endpoints.html
|
|
79
|
-
os.environ["AWS_STS_REGIONAL_ENDPOINTS"] = "regional"
|
|
80
|
-
if token_info.get("region"):
|
|
81
|
-
os.environ["AWS_DEFAULT_REGION"] = token_info["region"]
|
|
174
|
+
from botocore.config import Config
|
|
82
175
|
|
|
83
176
|
with hide_access_keys():
|
|
84
|
-
|
|
85
|
-
|
|
86
|
-
|
|
87
|
-
|
|
88
|
-
|
|
89
|
-
role_arn=role_arn,
|
|
90
|
-
extra_args={},
|
|
91
|
-
)
|
|
92
|
-
creds = botocore.credentials.DeferredRefreshableCredentials(
|
|
93
|
-
method="assume-role", refresh_using=fetcher.fetch_credentials
|
|
94
|
-
)
|
|
95
|
-
botocore_session = botocore.session.Session(session_vars=session_vars)
|
|
96
|
-
botocore_session._credentials = creds
|
|
97
|
-
session = boto3.session.Session(botocore_session=botocore_session)
|
|
98
|
-
if with_error:
|
|
99
|
-
return session.client(module, **client_params), ClientError
|
|
100
|
-
else:
|
|
101
|
-
return session.client(module, **client_params)
|
|
177
|
+
session = get_boto3_session(role_arn, session_vars)
|
|
178
|
+
_client_params = client_params.copy()
|
|
179
|
+
if _client_params.get("config") and type(_client_params["config"]) == dict:
|
|
180
|
+
_client_params["config"] = Config(**_client_params["config"])
|
|
181
|
+
|
|
102
182
|
if with_error:
|
|
103
|
-
return
|
|
183
|
+
return session.client(module, **_client_params), ClientError
|
|
104
184
|
else:
|
|
105
|
-
return
|
|
185
|
+
return session.client(module, **_client_params)
|
|
106
186
|
|
|
107
187
|
|
|
108
188
|
AWS_CLIENT_PROVIDERS_DESC = [("obp", ".ObpAuthProvider")]
|
|
@@ -178,7 +258,6 @@ class ObpGcpAuthProvider(object):
|
|
|
178
258
|
|
|
179
259
|
@staticmethod
|
|
180
260
|
def get_gs_storage_client(*args, **kwargs):
|
|
181
|
-
|
|
182
261
|
import sys
|
|
183
262
|
from metaflow_extensions.outerbounds.plugins.auth_server import get_token
|
|
184
263
|
|
|
@@ -240,3 +319,53 @@ class ObpGcpAuthProvider(object):
|
|
|
240
319
|
|
|
241
320
|
|
|
242
321
|
GCP_CLIENT_PROVIDERS_DESC = [("obp", ".ObpGcpAuthProvider")]
|
|
322
|
+
CLIS_DESC = [
|
|
323
|
+
("nvidia", ".nvcf.nvcf_cli.cli"),
|
|
324
|
+
("nvct", ".nvct.nvct_cli.cli"),
|
|
325
|
+
("fast-bakery", ".fast_bakery.fast_bakery_cli.cli"),
|
|
326
|
+
("snowpark", ".snowpark.snowpark_cli.cli"),
|
|
327
|
+
]
|
|
328
|
+
STEP_DECORATORS_DESC = [
|
|
329
|
+
("nvidia", ".nvcf.nvcf_decorator.NvcfDecorator"),
|
|
330
|
+
("nvct", ".nvct.nvct_decorator.NvctDecorator"),
|
|
331
|
+
(
|
|
332
|
+
"fast_bakery_internal",
|
|
333
|
+
".fast_bakery.fast_bakery_decorator.InternalFastBakeryDecorator",
|
|
334
|
+
),
|
|
335
|
+
("snowpark", ".snowpark.snowpark_decorator.SnowparkDecorator"),
|
|
336
|
+
("tensorboard", ".tensorboard.TensorboardDecorator"),
|
|
337
|
+
("gpu_profile", ".profilers.gpu_profile_decorator.GPUProfileDecorator"),
|
|
338
|
+
("test_append_card", ".profilers.simple_card_decorator.DynamicCardAppendDecorator"),
|
|
339
|
+
("nim", ".nim.nim_decorator.NimDecorator"),
|
|
340
|
+
("ollama", ".ollama.OllamaDecorator"),
|
|
341
|
+
("vllm", ".vllm.VLLMDecorator"),
|
|
342
|
+
("s3_proxy", ".s3_proxy.s3_proxy_decorator.S3ProxyDecorator"),
|
|
343
|
+
("nebius_s3_proxy", ".s3_proxy.s3_proxy_decorator.NebiusS3ProxyDecorator"),
|
|
344
|
+
("coreweave_s3_proxy", ".s3_proxy.s3_proxy_decorator.CoreWeaveS3ProxyDecorator"),
|
|
345
|
+
(
|
|
346
|
+
"app_deploy_internal",
|
|
347
|
+
".apps.core.app_deploy_decorator.AppDeployInternalDecorator",
|
|
348
|
+
),
|
|
349
|
+
]
|
|
350
|
+
|
|
351
|
+
FLOW_DECORATORS_DESC = [
|
|
352
|
+
("app_deploy", ".apps.core.app_deploy_decorator.AppDeployFlowDecorator"),
|
|
353
|
+
]
|
|
354
|
+
|
|
355
|
+
TOGGLE_STEP_DECORATOR = [
|
|
356
|
+
"-batch",
|
|
357
|
+
"-step_functions_internal",
|
|
358
|
+
"-airflow_internal",
|
|
359
|
+
]
|
|
360
|
+
|
|
361
|
+
TOGGLE_CLI = ["-batch", "-step-functions", "-airflow"]
|
|
362
|
+
|
|
363
|
+
ENVIRONMENTS_DESC = [
|
|
364
|
+
("fast-bakery", ".fast_bakery.docker_environment.DockerEnvironment")
|
|
365
|
+
]
|
|
366
|
+
|
|
367
|
+
SECRETS_PROVIDERS_DESC = [
|
|
368
|
+
("outerbounds", ".secrets.secrets.OuterboundsSecretsProvider"),
|
|
369
|
+
]
|
|
370
|
+
# Adding an override here so the library can be imported at the metaflow.plugins level
|
|
371
|
+
__mf_promote_submodules__ = ["snowflake", "ollama", "torchtune", "optuna"]
|
|
File without changes
|
|
File without changes
|
|
@@ -0,0 +1,187 @@
|
|
|
1
|
+
from metaflow.exception import MetaflowException
|
|
2
|
+
import os
|
|
3
|
+
from metaflow.metaflow_config_funcs import init_config
|
|
4
|
+
import requests
|
|
5
|
+
import time
|
|
6
|
+
import random
|
|
7
|
+
|
|
8
|
+
# IMPORTANT: Currently contents of this file are mostly duplicated from the outerbounds package.
|
|
9
|
+
# This is purely due to the time rush of having to deliver this feature. Going forward, we
|
|
10
|
+
# will reorganize things so that the amount of duplication is minimal.
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
APP_READY_POLL_TIMEOUT_SECONDS = 300
|
|
14
|
+
# Even after our backend validates that the app routes are ready, it takes a few seconds for
|
|
15
|
+
# the app to be accessible via the browser. Till we hunt down this delay, add an extra buffer.
|
|
16
|
+
APP_READY_EXTRA_BUFFER_SECONDS = 30
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
def start_app(port=-1, name=""):
    """
    Starts an app on the workstation.

    Lists workstations, finds the one whose instance id matches the
    WORKSTATION_ID environment variable, and looks at its named ports. If the
    requested port is already enabled under `name`, the app URLs are printed
    and nothing else happens. Otherwise an update call is made for the named
    port and the function waits for the app to become ready.

    Args:
        port: Port number the app listens on.
        name: App name; non-empty, fewer than 20 characters, lowercase
            alphanumeric only.

    Raises:
        MetaflowException: if the name is invalid, the command is not run
            from a workstation, the workstations cannot be listed, the
            request fails validation, or the app does not start in time.
    """
    if not name:
        raise MetaflowException("App name cannot be empty.")
    if len(name) >= 20:
        raise MetaflowException("App name should be less than 20 characters long.")
    if not name.isalnum() or not name.islower():
        raise MetaflowException(
            "App name can only contain lowercase alphanumeric characters."
        )

    if "WORKSTATION_ID" not in os.environ:
        raise MetaflowException(
            "All outerbounds app commands can only be run from a workstation."
        )

    # Every workstation has this environment variable set.
    workstation_id = os.environ["WORKSTATION_ID"]

    def _print_app_urls(api_url):
        # NOTE(review): replace() rewrites every "api" substring of the host,
        # not just a leading label — confirm the host format.
        print(f"App {name} started on port {port}!")
        print(
            f"Browser URL: https://{api_url.replace('api', 'ui')}/apps/{workstation_id}/{name}/"
        )
        print(f"API URL: https://{api_url}/apps/{workstation_id}/{name}/")

    try:
        try:
            conf = init_config()
            metaflow_token = conf["METAFLOW_SERVICE_AUTH_KEY"]
            api_url = conf["OBP_API_SERVER"]

            workstations_response = requests.get(
                f"https://{api_url}/v1/workstations",
                headers={"x-api-key": metaflow_token},
            )
            workstations_response.raise_for_status()
        except Exception as e:
            # `except Exception` rather than a bare except, so that
            # KeyboardInterrupt / SystemExit are not swallowed.
            raise MetaflowException("Failed to list workstations!") from e

        for workstation in workstations_response.json()["workstations"]:
            if workstation["instance_id"] != workstation_id:
                continue
            if "named_ports" not in workstation["spec"]:
                continue

            named_ports = workstation["spec"]["named_ports"]
            try:
                ensure_app_start_request_is_valid(named_ports, port, name)
            except ValueError as e:
                raise MetaflowException(str(e)) from e

            for named_port in named_ports:
                if int(named_port["port"]) != port:
                    continue

                if named_port["enabled"] and named_port["name"] == name:
                    # Already running under this name; nothing to update.
                    _print_app_urls(api_url)
                    return

                response = requests.put(
                    f"https://{api_url}/v1/workstations/update/{workstation_id}/namedports",
                    headers={"x-api-key": metaflow_token},
                    json={
                        "port": port,
                        "name": name,
                        "enabled": True,
                    },
                )
                response.raise_for_status()

                poll_success = wait_for_app_port_to_be_accessible(
                    api_url,
                    metaflow_token,
                    workstation_id,
                    name,
                    APP_READY_POLL_TIMEOUT_SECONDS,
                )
                if not poll_success:
                    raise MetaflowException(
                        f"The app could not be deployed in {APP_READY_POLL_TIMEOUT_SECONDS / 60} minutes. Please try again later."
                    )
                _print_app_urls(api_url)
        # NOTE(review): if no workstation / named port matches, the function
        # returns silently without starting anything — confirm this is intended.
    except MetaflowException:
        # Keep the specific message (list failure, validation, timeout)
        # instead of replacing it with the generic one below.
        raise
    except Exception as e:
        raise MetaflowException(f"Failed to start app {name} on port {port}!") from e
|
|
113
|
+
|
|
114
|
+
|
|
115
|
+
def ensure_app_start_request_is_valid(existing_named_ports, port: int, name: str):
    """
    Ensures that the port number is available on the workstation and that an app of
    the same name is not already opened on a different port.

    Port values are compared after a best-effort conversion to int, so a
    named port reported as "8000" matches a requested port of 8000.

    Args:
        existing_named_ports: A list of named ports on the workstation. Each
            entry is read for its "port", "name" and "enabled" keys.
        port: The port number to check.
        name: The name of the app to check.

    Raises:
        MetaflowException: if `port` is not among the named ports, or an
            enabled named port with the same name exists on another port.
    """

    def _normalize(value):
        # Fall back to the raw value when it is not int-like, so values that
        # compared equal before normalization still compare equal.
        try:
            return int(value)
        except (TypeError, ValueError):
            return value

    requested_port = _normalize(port)
    known_ports = {
        _normalize(named_port["port"]) for named_port in existing_named_ports
    }

    if requested_port not in known_ports:
        raise MetaflowException(f"Port {port} not found on workstation")

    for existing_named_port in existing_named_ports:
        if (
            name == existing_named_port["name"]
            and _normalize(existing_named_port["port"]) != requested_port
            and existing_named_port["enabled"]
        ):
            raise MetaflowException(
                f"App with name '{name}' is already deployed on port {existing_named_port['port']}"
            )
|
|
139
|
+
|
|
140
|
+
|
|
141
|
+
def wait_for_app_port_to_be_accessible(
    api_url, metaflow_token, workstation_id, app_name, poll_timeout_seconds
) -> bool:
    """
    Waits for the app to be ready by polling the workstation status.

    Args:
        api_url: Host of the API server; queried at https://{api_url}/v1/workstations.
        metaflow_token: Value sent in the x-api-key header.
        workstation_id: Instance id of the workstation hosting the app.
        app_name: Name of the app to wait for.
        poll_timeout_seconds: How long to keep polling before giving up.

    Returns:
        True once the app is reported ready (after an extra
        APP_READY_EXTRA_BUFFER_SECONDS sleep), False if it is not ready
        before the timeout elapses.

    Raises:
        requests.exceptions.HTTPError: error status codes are not retried
            and propagate to the caller.
    """
    # Without a per-request timeout a stalled connection would block forever,
    # so poll_timeout_seconds could never be enforced and the ReadTimeout
    # handler below could never run.
    request_timeout_seconds = 30
    poll_interval = 10
    max_retry_delay = 10
    retry_delay = 1.0
    wait_message = f"App {app_name} is currently being deployed..."

    deadline = time.monotonic() + poll_timeout_seconds
    while time.monotonic() < deadline:
        try:
            workstations_response = requests.get(
                f"https://{api_url}/v1/workstations",
                headers={"x-api-key": metaflow_token},
                timeout=request_timeout_seconds,
            )
            workstations_response.raise_for_status()
        except (
            requests.exceptions.ConnectionError,
            requests.exceptions.ReadTimeout,
        ):
            # Transient network failure: back off, doubling the delay and
            # adding jitter, capped at max_retry_delay.
            time.sleep(retry_delay)
            retry_delay = min(
                retry_delay * 2 + random.uniform(0, 1), max_retry_delay
            )
            continue

        print(wait_message)
        if is_app_ready(workstations_response.json(), workstation_id, app_name):
            time.sleep(APP_READY_EXTRA_BUFFER_SECONDS)
            return True
        time.sleep(poll_interval)
    return False
|
|
176
|
+
|
|
177
|
+
|
|
178
|
+
def is_app_ready(response_json: dict, workstation_id: str, app_name: str) -> bool:
    """
    Checks if the app is ready in the given workstation's response.

    Args:
        response_json: Decoded workstations listing; read for a
            "workstations" list whose entries carry "instance_id" and
            "status" -> "hosted_apps".
        workstation_id: Instance id of the workstation to inspect.
        app_name: Name of the hosted app to look for.

    Returns:
        The truthiness of the app's "ready" field, or False when the
        workstation or the app is not present in the response.
    """
    # `or` fallbacks (rather than .get defaults) also cover keys that are
    # present with a null value, which would otherwise raise on iteration
    # or attribute access.
    for workstation in response_json.get("workstations") or []:
        if workstation.get("instance_id") != workstation_id:
            continue
        status = workstation.get("status") or {}
        for hosted_app in status.get("hosted_apps") or []:
            if hosted_app.get("name") == app_name:
                return bool(hosted_app.get("ready"))
    return False
|
|
@@ -0,0 +1,15 @@
|
|
|
1
|
+
from . import config
|
|
2
|
+
from . import dependencies
|
|
3
|
+
from . import capsule
|
|
4
|
+
from . import utils
|
|
5
|
+
from . import app_config
|
|
6
|
+
from . import code_package
|
|
7
|
+
from .deployer import AppDeployer, bake_image, package_code, DeployedApp
|
|
8
|
+
from .config import BakedImage, PackagedCode
|
|
9
|
+
from .config.typed_configs import (
|
|
10
|
+
ReplicaConfigDict,
|
|
11
|
+
ResourceConfigDict,
|
|
12
|
+
AuthConfigDict,
|
|
13
|
+
DependencyConfigDict,
|
|
14
|
+
PackageConfigDict,
|
|
15
|
+
)
|