ob-metaflow-extensions 1.1.45rc3__py2.py3-none-any.whl → 1.5.1__py2.py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of ob-metaflow-extensions might be problematic. Click here for more details.

Files changed (128) hide show
  1. metaflow_extensions/outerbounds/__init__.py +1 -7
  2. metaflow_extensions/outerbounds/config/__init__.py +35 -0
  3. metaflow_extensions/outerbounds/plugins/__init__.py +186 -57
  4. metaflow_extensions/outerbounds/plugins/apps/__init__.py +0 -0
  5. metaflow_extensions/outerbounds/plugins/apps/app_cli.py +0 -0
  6. metaflow_extensions/outerbounds/plugins/apps/app_utils.py +187 -0
  7. metaflow_extensions/outerbounds/plugins/apps/consts.py +3 -0
  8. metaflow_extensions/outerbounds/plugins/apps/core/__init__.py +15 -0
  9. metaflow_extensions/outerbounds/plugins/apps/core/_state_machine.py +506 -0
  10. metaflow_extensions/outerbounds/plugins/apps/core/_vendor/__init__.py +0 -0
  11. metaflow_extensions/outerbounds/plugins/apps/core/_vendor/spinner/__init__.py +4 -0
  12. metaflow_extensions/outerbounds/plugins/apps/core/_vendor/spinner/spinners.py +478 -0
  13. metaflow_extensions/outerbounds/plugins/apps/core/app_config.py +128 -0
  14. metaflow_extensions/outerbounds/plugins/apps/core/app_deploy_decorator.py +330 -0
  15. metaflow_extensions/outerbounds/plugins/apps/core/artifacts.py +0 -0
  16. metaflow_extensions/outerbounds/plugins/apps/core/capsule.py +958 -0
  17. metaflow_extensions/outerbounds/plugins/apps/core/click_importer.py +24 -0
  18. metaflow_extensions/outerbounds/plugins/apps/core/code_package/__init__.py +3 -0
  19. metaflow_extensions/outerbounds/plugins/apps/core/code_package/code_packager.py +618 -0
  20. metaflow_extensions/outerbounds/plugins/apps/core/code_package/examples.py +125 -0
  21. metaflow_extensions/outerbounds/plugins/apps/core/config/__init__.py +15 -0
  22. metaflow_extensions/outerbounds/plugins/apps/core/config/cli_generator.py +165 -0
  23. metaflow_extensions/outerbounds/plugins/apps/core/config/config_utils.py +966 -0
  24. metaflow_extensions/outerbounds/plugins/apps/core/config/schema_export.py +299 -0
  25. metaflow_extensions/outerbounds/plugins/apps/core/config/typed_configs.py +233 -0
  26. metaflow_extensions/outerbounds/plugins/apps/core/config/typed_init_generator.py +537 -0
  27. metaflow_extensions/outerbounds/plugins/apps/core/config/unified_config.py +1125 -0
  28. metaflow_extensions/outerbounds/plugins/apps/core/config_schema.yaml +337 -0
  29. metaflow_extensions/outerbounds/plugins/apps/core/dependencies.py +115 -0
  30. metaflow_extensions/outerbounds/plugins/apps/core/deployer.py +959 -0
  31. metaflow_extensions/outerbounds/plugins/apps/core/experimental/__init__.py +89 -0
  32. metaflow_extensions/outerbounds/plugins/apps/core/perimeters.py +87 -0
  33. metaflow_extensions/outerbounds/plugins/apps/core/secrets.py +164 -0
  34. metaflow_extensions/outerbounds/plugins/apps/core/utils.py +233 -0
  35. metaflow_extensions/outerbounds/plugins/apps/core/validations.py +17 -0
  36. metaflow_extensions/outerbounds/plugins/apps/deploy_decorator.py +201 -0
  37. metaflow_extensions/outerbounds/plugins/apps/supervisord_utils.py +243 -0
  38. metaflow_extensions/outerbounds/plugins/auth_server.py +28 -8
  39. metaflow_extensions/outerbounds/plugins/aws/__init__.py +4 -0
  40. metaflow_extensions/outerbounds/plugins/aws/assume_role.py +3 -0
  41. metaflow_extensions/outerbounds/plugins/aws/assume_role_decorator.py +118 -0
  42. metaflow_extensions/outerbounds/plugins/card_utilities/__init__.py +0 -0
  43. metaflow_extensions/outerbounds/plugins/card_utilities/async_cards.py +142 -0
  44. metaflow_extensions/outerbounds/plugins/card_utilities/extra_components.py +545 -0
  45. metaflow_extensions/outerbounds/plugins/card_utilities/injector.py +70 -0
  46. metaflow_extensions/outerbounds/plugins/checkpoint_datastores/__init__.py +2 -0
  47. metaflow_extensions/outerbounds/plugins/checkpoint_datastores/coreweave.py +71 -0
  48. metaflow_extensions/outerbounds/plugins/checkpoint_datastores/external_chckpt.py +85 -0
  49. metaflow_extensions/outerbounds/plugins/checkpoint_datastores/nebius.py +73 -0
  50. metaflow_extensions/outerbounds/plugins/fast_bakery/__init__.py +0 -0
  51. metaflow_extensions/outerbounds/plugins/fast_bakery/baker.py +110 -0
  52. metaflow_extensions/outerbounds/plugins/fast_bakery/docker_environment.py +391 -0
  53. metaflow_extensions/outerbounds/plugins/fast_bakery/fast_bakery.py +188 -0
  54. metaflow_extensions/outerbounds/plugins/fast_bakery/fast_bakery_cli.py +54 -0
  55. metaflow_extensions/outerbounds/plugins/fast_bakery/fast_bakery_decorator.py +50 -0
  56. metaflow_extensions/outerbounds/plugins/kubernetes/kubernetes_client.py +79 -0
  57. metaflow_extensions/outerbounds/plugins/kubernetes/pod_killer.py +374 -0
  58. metaflow_extensions/outerbounds/plugins/nim/card.py +140 -0
  59. metaflow_extensions/outerbounds/plugins/nim/nim_decorator.py +101 -0
  60. metaflow_extensions/outerbounds/plugins/nim/nim_manager.py +379 -0
  61. metaflow_extensions/outerbounds/plugins/nim/utils.py +36 -0
  62. metaflow_extensions/outerbounds/plugins/nvcf/__init__.py +0 -0
  63. metaflow_extensions/outerbounds/plugins/nvcf/constants.py +3 -0
  64. metaflow_extensions/outerbounds/plugins/nvcf/exceptions.py +94 -0
  65. metaflow_extensions/outerbounds/plugins/nvcf/heartbeat_store.py +178 -0
  66. metaflow_extensions/outerbounds/plugins/nvcf/nvcf.py +417 -0
  67. metaflow_extensions/outerbounds/plugins/nvcf/nvcf_cli.py +280 -0
  68. metaflow_extensions/outerbounds/plugins/nvcf/nvcf_decorator.py +242 -0
  69. metaflow_extensions/outerbounds/plugins/nvcf/utils.py +6 -0
  70. metaflow_extensions/outerbounds/plugins/nvct/__init__.py +0 -0
  71. metaflow_extensions/outerbounds/plugins/nvct/exceptions.py +71 -0
  72. metaflow_extensions/outerbounds/plugins/nvct/nvct.py +131 -0
  73. metaflow_extensions/outerbounds/plugins/nvct/nvct_cli.py +289 -0
  74. metaflow_extensions/outerbounds/plugins/nvct/nvct_decorator.py +286 -0
  75. metaflow_extensions/outerbounds/plugins/nvct/nvct_runner.py +218 -0
  76. metaflow_extensions/outerbounds/plugins/nvct/utils.py +29 -0
  77. metaflow_extensions/outerbounds/plugins/ollama/__init__.py +225 -0
  78. metaflow_extensions/outerbounds/plugins/ollama/constants.py +1 -0
  79. metaflow_extensions/outerbounds/plugins/ollama/exceptions.py +22 -0
  80. metaflow_extensions/outerbounds/plugins/ollama/ollama.py +1924 -0
  81. metaflow_extensions/outerbounds/plugins/ollama/status_card.py +292 -0
  82. metaflow_extensions/outerbounds/plugins/optuna/__init__.py +48 -0
  83. metaflow_extensions/outerbounds/plugins/perimeters.py +19 -5
  84. metaflow_extensions/outerbounds/plugins/profilers/deco_injector.py +70 -0
  85. metaflow_extensions/outerbounds/plugins/profilers/gpu_profile_decorator.py +88 -0
  86. metaflow_extensions/outerbounds/plugins/profilers/simple_card_decorator.py +96 -0
  87. metaflow_extensions/outerbounds/plugins/s3_proxy/__init__.py +7 -0
  88. metaflow_extensions/outerbounds/plugins/s3_proxy/binary_caller.py +132 -0
  89. metaflow_extensions/outerbounds/plugins/s3_proxy/constants.py +11 -0
  90. metaflow_extensions/outerbounds/plugins/s3_proxy/exceptions.py +13 -0
  91. metaflow_extensions/outerbounds/plugins/s3_proxy/proxy_bootstrap.py +59 -0
  92. metaflow_extensions/outerbounds/plugins/s3_proxy/s3_proxy_api.py +93 -0
  93. metaflow_extensions/outerbounds/plugins/s3_proxy/s3_proxy_decorator.py +250 -0
  94. metaflow_extensions/outerbounds/plugins/s3_proxy/s3_proxy_manager.py +225 -0
  95. metaflow_extensions/outerbounds/plugins/secrets/__init__.py +0 -0
  96. metaflow_extensions/outerbounds/plugins/secrets/secrets.py +204 -0
  97. metaflow_extensions/outerbounds/plugins/snowflake/__init__.py +3 -0
  98. metaflow_extensions/outerbounds/plugins/snowflake/snowflake.py +378 -0
  99. metaflow_extensions/outerbounds/plugins/snowpark/__init__.py +0 -0
  100. metaflow_extensions/outerbounds/plugins/snowpark/snowpark.py +309 -0
  101. metaflow_extensions/outerbounds/plugins/snowpark/snowpark_cli.py +277 -0
  102. metaflow_extensions/outerbounds/plugins/snowpark/snowpark_client.py +150 -0
  103. metaflow_extensions/outerbounds/plugins/snowpark/snowpark_decorator.py +273 -0
  104. metaflow_extensions/outerbounds/plugins/snowpark/snowpark_exceptions.py +13 -0
  105. metaflow_extensions/outerbounds/plugins/snowpark/snowpark_job.py +241 -0
  106. metaflow_extensions/outerbounds/plugins/snowpark/snowpark_service_spec.py +259 -0
  107. metaflow_extensions/outerbounds/plugins/tensorboard/__init__.py +50 -0
  108. metaflow_extensions/outerbounds/plugins/torchtune/__init__.py +163 -0
  109. metaflow_extensions/outerbounds/plugins/vllm/__init__.py +255 -0
  110. metaflow_extensions/outerbounds/plugins/vllm/constants.py +1 -0
  111. metaflow_extensions/outerbounds/plugins/vllm/exceptions.py +1 -0
  112. metaflow_extensions/outerbounds/plugins/vllm/status_card.py +352 -0
  113. metaflow_extensions/outerbounds/plugins/vllm/vllm_manager.py +621 -0
  114. metaflow_extensions/outerbounds/profilers/gpu.py +131 -47
  115. metaflow_extensions/outerbounds/remote_config.py +53 -16
  116. metaflow_extensions/outerbounds/toplevel/global_aliases_for_metaflow_package.py +138 -2
  117. metaflow_extensions/outerbounds/toplevel/ob_internal.py +4 -0
  118. metaflow_extensions/outerbounds/toplevel/plugins/ollama/__init__.py +1 -0
  119. metaflow_extensions/outerbounds/toplevel/plugins/optuna/__init__.py +1 -0
  120. metaflow_extensions/outerbounds/toplevel/plugins/snowflake/__init__.py +1 -0
  121. metaflow_extensions/outerbounds/toplevel/plugins/torchtune/__init__.py +1 -0
  122. metaflow_extensions/outerbounds/toplevel/plugins/vllm/__init__.py +1 -0
  123. metaflow_extensions/outerbounds/toplevel/s3_proxy.py +88 -0
  124. {ob_metaflow_extensions-1.1.45rc3.dist-info → ob_metaflow_extensions-1.5.1.dist-info}/METADATA +2 -2
  125. ob_metaflow_extensions-1.5.1.dist-info/RECORD +133 -0
  126. ob_metaflow_extensions-1.1.45rc3.dist-info/RECORD +0 -19
  127. {ob_metaflow_extensions-1.1.45rc3.dist-info → ob_metaflow_extensions-1.5.1.dist-info}/WHEEL +0 -0
  128. {ob_metaflow_extensions-1.1.45rc3.dist-info → ob_metaflow_extensions-1.5.1.dist-info}/top_level.txt +0 -0
@@ -1,12 +1,6 @@
1
1
  import metaflow.metaflow_config_funcs
2
2
 
3
- from metaflow_extensions.outerbounds.remote_config import init_config
4
-
5
- from metaflow_extensions.outerbounds.plugins.perimeters import (
6
- set_current_perimeter_config_url_in_environment,
7
- )
8
-
9
- set_current_perimeter_config_url_in_environment()
3
+ from metaflow_extensions.outerbounds.remote_config import init_config, reload_config
10
4
 
11
5
  # we want to override OSS Metaflow's initialization behavior with our own to support remote configs
12
6
  # we're reassigning the METAFLOW_CONFIG variable because all downstream settings rely on it and
@@ -1,5 +1,40 @@
1
+ from metaflow.metaflow_config import from_conf
2
+
1
3
  DEFAULT_AWS_CLIENT_PROVIDER = "obp"
2
4
 
3
5
  DEFAULT_AZURE_CLIENT_PROVIDER = "obp"
4
6
 
5
7
  DEFAULT_GCP_CLIENT_PROVIDER = "obp"
8
+
9
+
10
+ ###
11
+ # On Demand Docker image build configuration
12
+ ###
13
+ # Image builder service url
14
+ FAST_BAKERY_URL = from_conf("FAST_BAKERY_URL", None)
15
+
16
+
17
+ ###
18
+ # NVCF configuration
19
+ ###
20
+ # Maximum number of consecutive heartbeats that can be missed.
21
+ NVIDIA_HEARTBEAT_THRESHOLD = from_conf("NVIDIA_HEARTBEAT_THRESHOLD", "3")
22
+
23
+
24
+ ###
25
+ # Snowpark configuration
26
+ ###
27
+ # Snowflake account to use with the @snowpark decorator
28
+ SNOWPARK_ACCOUNT = from_conf("SNOWPARK_ACCOUNT")
29
+ # Snowflake user to use with the @snowpark decorator
30
+ SNOWPARK_USER = from_conf("SNOWPARK_USER")
31
+ # Snowflake password to use with the @snowpark decorator
32
+ SNOWPARK_PASSWORD = from_conf("SNOWPARK_PASSWORD")
33
+ # Snowflake role to use with the @snowpark decorator
34
+ SNOWPARK_ROLE = from_conf("SNOWPARK_ROLE")
35
+ # Snowflake database to use with the @snowpark decorator
36
+ SNOWPARK_DATABASE = from_conf("SNOWPARK_DATABASE")
37
+ # Snowflake warehouse to use with the @snowpark decorator
38
+ SNOWPARK_WAREHOUSE = from_conf("SNOWPARK_WAREHOUSE")
39
+ # Snowflake schema to use with the @snowpark decorator
40
+ SNOWPARK_SCHEMA = from_conf("SNOWPARK_SCHEMA")
@@ -32,6 +32,134 @@ def hide_access_keys(*args, **kwds):
32
32
  os.environ["AWS_SESSION_TOKEN"] = AWS_SESSION_TOKEN
33
33
 
34
34
 
35
+ # This is a special placeholder value that can be passed as role_arn to
36
+ # get_boto3_session() which makes it use the CSPR role, if it's set.
37
+ USE_CSPR_ROLE_ARN_IF_SET = "__cspr__"
38
+
39
+
40
def _write_file_atomically(path, content):
    """Write `content` to `path` without exposing a half-written file.

    The data goes to a temporary file created in the SAME directory as `path`
    and is then renamed over `path`. Keeping both on one filesystem matters:
    `os.rename` fails across filesystems, which is what would happen if the
    temporary file were created in a TMPDIR that is not the target directory.
    """
    with tempfile.NamedTemporaryFile(
        "w", delete=False, dir=os.path.dirname(path)
    ) as f:
        f.write(content)
        tmp_path = f.name
    os.rename(tmp_path, path)


def get_boto3_session(role_arn=None, session_vars=None):
    """Build a boto3 session from a token issued by the auth server.

    The token returned by `get_token("/generate/aws")` is written to a per-user
    file under /tmp and the process environment is updated so the AWS SDK
    exchanges it for credentials. If a CSPR role is known (from the token
    response, or from the environment variable named by
    OBP_ASSUME_ROLE_ARN_ENV_VAR, which takes precedence), an AWS config file
    with a "task" and a "cspr" profile is generated as well.

    Parameters
    ----------
    role_arn : str, optional
        - None: return the base session (the "task" profile when a CSPR role is
          configured, otherwise the plain web-identity session).
        - USE_CSPR_ROLE_ARN_IF_SET: return a session on the "cspr" profile when
          a CSPR role is configured, otherwise the plain web-identity session.
        - any other value: assume that role using the base session's
          credentials.
    session_vars : dict, optional
        Passed to `botocore.session.Session` when an explicit role is assumed.

    Returns
    -------
    boto3.session.Session

    Notes
    -----
    This function mutates `os.environ` (AWS_CONFIG_FILE / AWS_PROFILE, or
    AWS_WEB_IDENTITY_TOKEN_FILE / AWS_ROLE_ARN, plus AWS_STS_REGIONAL_ENDPOINTS
    and optionally AWS_DEFAULT_REGION).
    """
    import boto3
    import botocore
    from metaflow_extensions.outerbounds.plugins.auth_server import get_token
    from metaflow_extensions.outerbounds.plugins.aws.assume_role import (
        OBP_ASSUME_ROLE_ARN_ENV_VAR,
    )

    from hashlib import sha256
    from metaflow.util import get_username

    user = get_username()

    token_info = get_token("/generate/aws")

    # File names are derived from the user name so this works with multiple
    # users on the same machine. The user name is hashed so we don't have to
    # deal with special characters; the name is never shown to the user.
    user_suffix = sha256(user.encode("utf-8")).hexdigest()[:16]
    token_file = "/tmp/obp_token." + user_suffix

    # Write-then-rename so that nobody can read the file after it was opened
    # for writing (and truncated) but before the token was written to it.
    _write_file_atomically(token_file, token_info["token"])

    cspr_role = None
    if token_info.get("cspr_role_arn"):
        cspr_role = token_info["cspr_role_arn"]

    # A role ARN set through the environment (by the assume_role decorator)
    # takes precedence over the CSPR role that comes from the token response.
    decorator_role_arn = os.environ.get(OBP_ASSUME_ROLE_ARN_ENV_VAR)
    if decorator_role_arn:
        cspr_role = decorator_role_arn

    if cspr_role:
        # Make the CSPR role the default role for the AWS SDK by writing an AWS
        # config file with two profiles: one that exchanges the OIDC token for
        # task role credentials, and one that assumes the CSPR role using the
        # task role credentials.
        import configparser
        from io import StringIO

        aws_config = configparser.ConfigParser()

        # Task role profile
        aws_config["profile task"] = {
            "role_arn": token_info["role_arn"],
            "web_identity_token_file": token_file,
        }

        # CSPR role profile (selected through AWS_PROFILE below)
        aws_config["profile cspr"] = {
            "role_arn": cspr_role,
            "source_profile": "task",
        }

        aws_config_string = StringIO()
        aws_config.write(aws_config_string)
        aws_config_file = "/tmp/aws_config." + user_suffix
        _write_file_atomically(aws_config_file, aws_config_string.getvalue())
        os.environ["AWS_CONFIG_FILE"] = aws_config_file
        os.environ["AWS_PROFILE"] = "cspr"
    else:
        os.environ["AWS_WEB_IDENTITY_TOKEN_FILE"] = token_file
        os.environ["AWS_ROLE_ARN"] = token_info["role_arn"]

    # Enable regional STS endpoints. This is the new recommended way
    # by AWS [1] and is the more performant way.
    # [1] https://docs.aws.amazon.com/sdkref/latest/guide/feature-sts-regionalized-endpoints.html
    os.environ["AWS_STS_REGIONAL_ENDPOINTS"] = "regional"
    if token_info.get("region"):
        os.environ["AWS_DEFAULT_REGION"] = token_info["region"]

    if cspr_role:
        # The generated AWS config is picked up here because AWS_CONFIG_FILE
        # was set above.
        if role_arn == USE_CSPR_ROLE_ARN_IF_SET:
            # Caller explicitly asked for the CSPR role.
            session = boto3.session.Session(profile_name="cspr")
        else:
            # No role, or an explicit role: start from the task role.
            session = boto3.session.Session(profile_name="task")
    else:
        # Not using AWS config, just AWS_WEB_IDENTITY_TOKEN_FILE + AWS_ROLE_ARN
        session = boto3.session.Session()

    if role_arn and role_arn != USE_CSPR_ROLE_ARN_IF_SET:
        # The caller provided a role_arn: assume that role using the task role
        # credentials. The CSPR role is not used.
        fetcher = botocore.credentials.AssumeRoleCredentialFetcher(
            client_creator=session._session.create_client,
            source_credentials=session._session.get_credentials(),
            role_arn=role_arn,
            extra_args={},
        )
        creds = botocore.credentials.DeferredRefreshableCredentials(
            method="assume-role", refresh_using=fetcher.fetch_credentials
        )
        botocore_session = botocore.session.Session(session_vars=session_vars)
        botocore_session._credentials = creds
        return boto3.session.Session(botocore_session=botocore_session)

    # No explicit role to assume: return the session selected above.
    # NOTE(review): with role_arn=None and a CSPR role configured this is the
    # "task" profile session, not the CSPR one (only USE_CSPR_ROLE_ARN_IF_SET
    # selects "cspr") -- confirm this is the intended default.
    return session
161
+
162
+
35
163
  class ObpAuthProvider(object):
36
164
  name = "obp"
37
165
 
@@ -42,67 +170,19 @@ class ObpAuthProvider(object):
42
170
  if client_params is None:
43
171
  client_params = {}
44
172
 
45
- import boto3
46
- import botocore
47
173
  from botocore.exceptions import ClientError
48
- from metaflow_extensions.outerbounds.plugins.auth_server import get_token
49
-
50
- from hashlib import sha256
51
- from metaflow.util import get_username
52
-
53
- user = get_username()
54
-
55
- token_info = get_token("/generate/aws")
56
-
57
- # Write token to a file. The file name is derived from the user name
58
- # so it works with multiple users on the same machine.
59
- #
60
- # We hash the user name so we don't have to deal with special characters
61
- # in the file name and the file name is not exposed to the user
62
- # anyways, so it doesn't matter that its a little ugly.
63
- token_file = "/tmp/obp_token." + sha256(user.encode("utf-8")).hexdigest()[:16]
64
-
65
- # Write to a temp file then rename to avoid a situation when someone
66
- # tries to read the file after it was open for writing (and truncated)
67
- # but before the token was written to it.
68
- with tempfile.NamedTemporaryFile("w", delete=False) as f:
69
- f.write(token_info["token"])
70
- tmp_token_file = f.name
71
- os.rename(tmp_token_file, token_file)
72
-
73
- os.environ["AWS_WEB_IDENTITY_TOKEN_FILE"] = token_file
74
- os.environ["AWS_ROLE_ARN"] = token_info["role_arn"]
75
-
76
- # Enable regional STS endpoints. This is the new recommended way
77
- # by AWS [1] and is the more performant way.
78
- # [1] https://docs.aws.amazon.com/sdkref/latest/guide/feature-sts-regionalized-endpoints.html
79
- os.environ["AWS_STS_REGIONAL_ENDPOINTS"] = "regional"
80
- if token_info.get("region"):
81
- os.environ["AWS_DEFAULT_REGION"] = token_info["region"]
174
+ from botocore.config import Config
82
175
 
83
176
  with hide_access_keys():
84
- if role_arn:
85
- session = boto3.session.Session()
86
- fetcher = botocore.credentials.AssumeRoleCredentialFetcher(
87
- client_creator=session._session.create_client,
88
- source_credentials=session._session.get_credentials(),
89
- role_arn=role_arn,
90
- extra_args={},
91
- )
92
- creds = botocore.credentials.DeferredRefreshableCredentials(
93
- method="assume-role", refresh_using=fetcher.fetch_credentials
94
- )
95
- botocore_session = botocore.session.Session(session_vars=session_vars)
96
- botocore_session._credentials = creds
97
- session = boto3.session.Session(botocore_session=botocore_session)
98
- if with_error:
99
- return session.client(module, **client_params), ClientError
100
- else:
101
- return session.client(module, **client_params)
177
+ session = get_boto3_session(role_arn, session_vars)
178
+ _client_params = client_params.copy()
179
+ if _client_params.get("config") and type(_client_params["config"]) == dict:
180
+ _client_params["config"] = Config(**_client_params["config"])
181
+
102
182
  if with_error:
103
- return boto3.client(module, **client_params), ClientError
183
+ return session.client(module, **_client_params), ClientError
104
184
  else:
105
- return boto3.client(module, **client_params)
185
+ return session.client(module, **_client_params)
106
186
 
107
187
 
108
188
  AWS_CLIENT_PROVIDERS_DESC = [("obp", ".ObpAuthProvider")]
@@ -178,7 +258,6 @@ class ObpGcpAuthProvider(object):
178
258
 
179
259
  @staticmethod
180
260
  def get_gs_storage_client(*args, **kwargs):
181
-
182
261
  import sys
183
262
  from metaflow_extensions.outerbounds.plugins.auth_server import get_token
184
263
 
@@ -240,3 +319,53 @@ class ObpGcpAuthProvider(object):
240
319
 
241
320
 
242
321
  GCP_CLIENT_PROVIDERS_DESC = [("obp", ".ObpGcpAuthProvider")]
322
+ CLIS_DESC = [
323
+ ("nvidia", ".nvcf.nvcf_cli.cli"),
324
+ ("nvct", ".nvct.nvct_cli.cli"),
325
+ ("fast-bakery", ".fast_bakery.fast_bakery_cli.cli"),
326
+ ("snowpark", ".snowpark.snowpark_cli.cli"),
327
+ ]
328
+ STEP_DECORATORS_DESC = [
329
+ ("nvidia", ".nvcf.nvcf_decorator.NvcfDecorator"),
330
+ ("nvct", ".nvct.nvct_decorator.NvctDecorator"),
331
+ (
332
+ "fast_bakery_internal",
333
+ ".fast_bakery.fast_bakery_decorator.InternalFastBakeryDecorator",
334
+ ),
335
+ ("snowpark", ".snowpark.snowpark_decorator.SnowparkDecorator"),
336
+ ("tensorboard", ".tensorboard.TensorboardDecorator"),
337
+ ("gpu_profile", ".profilers.gpu_profile_decorator.GPUProfileDecorator"),
338
+ ("test_append_card", ".profilers.simple_card_decorator.DynamicCardAppendDecorator"),
339
+ ("nim", ".nim.nim_decorator.NimDecorator"),
340
+ ("ollama", ".ollama.OllamaDecorator"),
341
+ ("vllm", ".vllm.VLLMDecorator"),
342
+ ("s3_proxy", ".s3_proxy.s3_proxy_decorator.S3ProxyDecorator"),
343
+ ("nebius_s3_proxy", ".s3_proxy.s3_proxy_decorator.NebiusS3ProxyDecorator"),
344
+ ("coreweave_s3_proxy", ".s3_proxy.s3_proxy_decorator.CoreWeaveS3ProxyDecorator"),
345
+ (
346
+ "app_deploy_internal",
347
+ ".apps.core.app_deploy_decorator.AppDeployInternalDecorator",
348
+ ),
349
+ ]
350
+
351
+ FLOW_DECORATORS_DESC = [
352
+ ("app_deploy", ".apps.core.app_deploy_decorator.AppDeployFlowDecorator"),
353
+ ]
354
+
355
+ TOGGLE_STEP_DECORATOR = [
356
+ "-batch",
357
+ "-step_functions_internal",
358
+ "-airflow_internal",
359
+ ]
360
+
361
+ TOGGLE_CLI = ["-batch", "-step-functions", "-airflow"]
362
+
363
+ ENVIRONMENTS_DESC = [
364
+ ("fast-bakery", ".fast_bakery.docker_environment.DockerEnvironment")
365
+ ]
366
+
367
+ SECRETS_PROVIDERS_DESC = [
368
+ ("outerbounds", ".secrets.secrets.OuterboundsSecretsProvider"),
369
+ ]
370
+ # Adding an override here so the library can be imported at the metaflow.plugins level
371
+ __mf_promote_submodules__ = ["snowflake", "ollama", "torchtune", "optuna"]
@@ -0,0 +1,187 @@
1
+ from metaflow.exception import MetaflowException
2
+ import os
3
+ from metaflow.metaflow_config_funcs import init_config
4
+ import requests
5
+ import time
6
+ import random
7
+
8
+ # IMPORTANT: Currently contents of this file are mostly duplicated from the outerbounds package.
9
+ # This is purely due to the time rush of having to deliver this feature. As a fast forward, we
10
+ # will reorganize things so that the amount of duplication is minimal.
11
+
12
+
13
+ APP_READY_POLL_TIMEOUT_SECONDS = 300
14
+ # Even after our backend validates that the app routes are ready, it takes a few seconds for
15
+ # the app to be accessible via the browser. Till we hunt down this delay, add an extra buffer.
16
+ APP_READY_EXTRA_BUFFER_SECONDS = 30
17
+
18
+
19
def _print_app_urls(api_url, name, port):
    """Print the success banner plus the browser and API URLs for an app."""
    workstation_id = os.environ["WORKSTATION_ID"]
    print(f"App {name} started on port {port}!")
    print(
        f"Browser URL: https://{api_url.replace('api', 'ui')}/apps/{workstation_id}/{name}/"
    )
    print(f"API URL: https://{api_url}/apps/{workstation_id}/{name}/")


def start_app(port=-1, name=""):
    """
    Starts an app on the workstation.
    List workstations, looks for "NamedPorts", then makes an update call to the NamedPorts for the workstation.

    Args:
        port: The workstation port the app listens on.
        name: The app name; 1 to 19 lowercase alphanumeric characters.

    Raises:
        MetaflowException: if the name is invalid, if not running on a
            workstation (WORKSTATION_ID unset), if the workstation list cannot
            be fetched, if the request is rejected by
            ensure_app_start_request_is_valid, or if enabling the port fails or
            does not become ready within APP_READY_POLL_TIMEOUT_SECONDS.
    """
    # The accepted range is unchanged (1..19 characters); only the message was
    # corrected so that it matches what the check actually enforces.
    if len(name) == 0 or len(name) >= 20:
        raise MetaflowException("App name must be between 1 and 19 characters long.")
    elif not name.isalnum() or not name.islower():
        raise MetaflowException(
            "App name can only contain lowercase alphanumeric characters."
        )

    if "WORKSTATION_ID" not in os.environ:
        raise MetaflowException(
            "All outerbounds app commands can only be run from a workstation."
        )

    # Every workstation has this environment variable set.
    workstation_id = os.environ["WORKSTATION_ID"]

    try:
        conf = init_config()
        metaflow_token = conf["METAFLOW_SERVICE_AUTH_KEY"]
        api_url = conf["OBP_API_SERVER"]

        workstations_response = requests.get(
            f"https://{api_url}/v1/workstations",
            headers={"x-api-key": metaflow_token},
        )
        workstations_response.raise_for_status()
    except Exception as e:
        raise MetaflowException("Failed to list workstations!") from e

    try:
        workstations_json = workstations_response.json()["workstations"]
        for workstation in workstations_json:
            if workstation["instance_id"] != workstation_id:
                continue
            if "named_ports" not in workstation["spec"]:
                continue

            named_ports = workstation["spec"]["named_ports"]
            try:
                ensure_app_start_request_is_valid(named_ports, port, name)
            except ValueError as e:
                raise MetaflowException(str(e)) from e

            for named_port in named_ports:
                if int(named_port["port"]) != port:
                    continue

                if named_port["enabled"] and named_port["name"] == name:
                    # Already enabled under this name: nothing to update.
                    _print_app_urls(api_url, name, port)
                    return

                response = requests.put(
                    f"https://{api_url}/v1/workstations/update/{workstation_id}/namedports",
                    headers={"x-api-key": metaflow_token},
                    json={
                        "port": port,
                        "name": name,
                        "enabled": True,
                    },
                )
                response.raise_for_status()

                poll_success = wait_for_app_port_to_be_accessible(
                    api_url,
                    metaflow_token,
                    workstation_id,
                    name,
                    APP_READY_POLL_TIMEOUT_SECONDS,
                )
                if not poll_success:
                    raise MetaflowException(
                        f"The app could not be deployed in {APP_READY_POLL_TIMEOUT_SECONDS / 60} minutes. Please try again later."
                    )
                _print_app_urls(api_url, name, port)
    except MetaflowException:
        # Already carries a specific, user-facing message; do not bury it
        # under the generic one below.
        raise
    except Exception as e:
        raise MetaflowException(f"Failed to start app {name} on port {port}!") from e
113
+
114
+
115
def ensure_app_start_request_is_valid(existing_named_ports, port: int, name: str):
    """
    Ensures that the port number is available on the workstation and that an app of
    the same name is not already opened on a different port.

    Port values are normalized with int() before comparison, so a named port
    reported as "8080" matches a requested port of 8080.

    Args:
        existing_named_ports: A list of named ports on the workstation. Each
            entry is a mapping with "port", "name" and "enabled" keys.
        port: The port number to check.
        name: The name of the app to check.

    Raises:
        MetaflowException: if the port is not among the named ports, or if an
            enabled named port with the same name exists on another port.
        ValueError: if a port value cannot be converted to an integer.
    """
    requested_port = int(port)
    known_ports = {int(entry["port"]) for entry in existing_named_ports}

    if requested_port not in known_ports:
        raise MetaflowException(f"Port {port} not found on workstation")

    for existing_named_port in existing_named_ports:
        if (
            name == existing_named_port["name"]
            and int(existing_named_port["port"]) != requested_port
            and existing_named_port["enabled"]
        ):
            raise MetaflowException(
                f"App with name '{name}' is already deployed on port {existing_named_port['port']}"
            )
139
+
140
+
141
def wait_for_app_port_to_be_accessible(
    api_url, metaflow_token, workstation_id, app_name, poll_timeout_seconds
) -> bool:
    """
    Waits for the app to be ready by polling the workstation status.

    Args:
        api_url: Host of the API server (no scheme).
        metaflow_token: Value sent in the "x-api-key" header.
        workstation_id: Instance id of the workstation hosting the app.
        app_name: Name of the app to wait for.
        poll_timeout_seconds: Give up once this many seconds have elapsed.

    Returns:
        True once the app is reported ready (after sleeping an extra
        APP_READY_EXTRA_BUFFER_SECONDS), False if the timeout elapses first.

    Raises:
        requests.exceptions.HTTPError: if the API answers with an error status.
    """
    # Without a timeout a stalled connection would block forever and the
    # overall poll deadline could never be enforced.
    request_timeout_seconds = 30
    poll_interval = 10
    initial_retry_delay = 1.0
    max_retry_delay = 10
    retry_delay = initial_retry_delay
    wait_message = f"App {app_name} is currently being deployed..."

    start_time = time.monotonic()
    # The deadline is re-checked before every single request.
    while time.monotonic() - start_time < poll_timeout_seconds:
        try:
            workstations_response = requests.get(
                f"https://{api_url}/v1/workstations",
                headers={"x-api-key": metaflow_token},
                timeout=request_timeout_seconds,
            )
            workstations_response.raise_for_status()
            ready = is_app_ready(
                workstations_response.json(), workstation_id, app_name
            )
        except (
            requests.exceptions.ConnectionError,
            requests.exceptions.ReadTimeout,
        ):
            # Transient network failure: back off exponentially with jitter.
            time.sleep(retry_delay)
            retry_delay = min(
                retry_delay * 2 + random.uniform(0, 1), max_retry_delay
            )
            continue

        # A request went through, so the next failure starts backing off anew.
        retry_delay = initial_retry_delay
        print(wait_message)
        if ready:
            # The routes are ready but the app needs a little longer to become
            # reachable, hence the extra buffer before reporting success.
            time.sleep(APP_READY_EXTRA_BUFFER_SECONDS)
            return True
        time.sleep(poll_interval)
    return False
176
+
177
+
178
def is_app_ready(response_json: dict, workstation_id: str, app_name: str) -> bool:
    """Checks if the app is ready in the given workstation's response.

    Args:
        response_json: Decoded body of the workstations listing; the
            "workstations" list is read from it.
        workstation_id: The "instance_id" of the workstation to inspect.
        app_name: The "name" of the hosted app to look for.

    Returns:
        The truthiness of the app's "ready" field, or False when the
        workstation or the app is not present. Missing or null
        "workstations", "status" and "hosted_apps" values count as empty.
    """
    # `or` (rather than a .get default) also covers keys that are present but
    # hold JSON null.
    for workstation in response_json.get("workstations") or []:
        if workstation.get("instance_id") != workstation_id:
            continue
        status = workstation.get("status") or {}
        for hosted_app in status.get("hosted_apps") or []:
            if hosted_app.get("name") == app_name:
                return bool(hosted_app.get("ready"))
    return False
@@ -0,0 +1,3 @@
1
+ DEFAULT_WAIT_TIME_SECONDS_FOR_PROCESS_TO_START = 10
2
+ BASE_DIR_FOR_APP_ASSETS = "/home/ob-workspace/.appdaemon/apps/"
3
+ APP_DAEMON_WORKSTAION_PATH = "/home/ob-workspace/.appdaemon"
@@ -0,0 +1,15 @@
1
+ from . import config
2
+ from . import dependencies
3
+ from . import capsule
4
+ from . import utils
5
+ from . import app_config
6
+ from . import code_package
7
+ from .deployer import AppDeployer, bake_image, package_code, DeployedApp
8
+ from .config import BakedImage, PackagedCode
9
+ from .config.typed_configs import (
10
+ ReplicaConfigDict,
11
+ ResourceConfigDict,
12
+ AuthConfigDict,
13
+ DependencyConfigDict,
14
+ PackageConfigDict,
15
+ )