ob-metaflow-extensions 1.1.130__py2.py3-none-any.whl → 1.5.1__py2.py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of ob-metaflow-extensions might be problematic. Click here for more details.

Files changed (105) hide show
  1. metaflow_extensions/outerbounds/__init__.py +1 -1
  2. metaflow_extensions/outerbounds/plugins/__init__.py +34 -4
  3. metaflow_extensions/outerbounds/plugins/apps/__init__.py +0 -0
  4. metaflow_extensions/outerbounds/plugins/apps/app_cli.py +0 -0
  5. metaflow_extensions/outerbounds/plugins/apps/app_utils.py +187 -0
  6. metaflow_extensions/outerbounds/plugins/apps/consts.py +3 -0
  7. metaflow_extensions/outerbounds/plugins/apps/core/__init__.py +15 -0
  8. metaflow_extensions/outerbounds/plugins/apps/core/_state_machine.py +506 -0
  9. metaflow_extensions/outerbounds/plugins/apps/core/_vendor/__init__.py +0 -0
  10. metaflow_extensions/outerbounds/plugins/apps/core/_vendor/spinner/__init__.py +4 -0
  11. metaflow_extensions/outerbounds/plugins/apps/core/_vendor/spinner/spinners.py +478 -0
  12. metaflow_extensions/outerbounds/plugins/apps/core/app_config.py +128 -0
  13. metaflow_extensions/outerbounds/plugins/apps/core/app_deploy_decorator.py +330 -0
  14. metaflow_extensions/outerbounds/plugins/apps/core/artifacts.py +0 -0
  15. metaflow_extensions/outerbounds/plugins/apps/core/capsule.py +958 -0
  16. metaflow_extensions/outerbounds/plugins/apps/core/click_importer.py +24 -0
  17. metaflow_extensions/outerbounds/plugins/apps/core/code_package/__init__.py +3 -0
  18. metaflow_extensions/outerbounds/plugins/apps/core/code_package/code_packager.py +618 -0
  19. metaflow_extensions/outerbounds/plugins/apps/core/code_package/examples.py +125 -0
  20. metaflow_extensions/outerbounds/plugins/apps/core/config/__init__.py +15 -0
  21. metaflow_extensions/outerbounds/plugins/apps/core/config/cli_generator.py +165 -0
  22. metaflow_extensions/outerbounds/plugins/apps/core/config/config_utils.py +966 -0
  23. metaflow_extensions/outerbounds/plugins/apps/core/config/schema_export.py +299 -0
  24. metaflow_extensions/outerbounds/plugins/apps/core/config/typed_configs.py +233 -0
  25. metaflow_extensions/outerbounds/plugins/apps/core/config/typed_init_generator.py +537 -0
  26. metaflow_extensions/outerbounds/plugins/apps/core/config/unified_config.py +1125 -0
  27. metaflow_extensions/outerbounds/plugins/apps/core/config_schema.yaml +337 -0
  28. metaflow_extensions/outerbounds/plugins/apps/core/dependencies.py +115 -0
  29. metaflow_extensions/outerbounds/plugins/apps/core/deployer.py +959 -0
  30. metaflow_extensions/outerbounds/plugins/apps/core/experimental/__init__.py +89 -0
  31. metaflow_extensions/outerbounds/plugins/apps/core/perimeters.py +87 -0
  32. metaflow_extensions/outerbounds/plugins/apps/core/secrets.py +164 -0
  33. metaflow_extensions/outerbounds/plugins/apps/core/utils.py +233 -0
  34. metaflow_extensions/outerbounds/plugins/apps/core/validations.py +17 -0
  35. metaflow_extensions/outerbounds/plugins/apps/deploy_decorator.py +201 -0
  36. metaflow_extensions/outerbounds/plugins/apps/supervisord_utils.py +243 -0
  37. metaflow_extensions/outerbounds/plugins/aws/__init__.py +4 -0
  38. metaflow_extensions/outerbounds/plugins/aws/assume_role.py +3 -0
  39. metaflow_extensions/outerbounds/plugins/aws/assume_role_decorator.py +118 -0
  40. metaflow_extensions/outerbounds/plugins/card_utilities/injector.py +1 -1
  41. metaflow_extensions/outerbounds/plugins/checkpoint_datastores/__init__.py +2 -0
  42. metaflow_extensions/outerbounds/plugins/checkpoint_datastores/coreweave.py +71 -0
  43. metaflow_extensions/outerbounds/plugins/checkpoint_datastores/external_chckpt.py +85 -0
  44. metaflow_extensions/outerbounds/plugins/checkpoint_datastores/nebius.py +73 -0
  45. metaflow_extensions/outerbounds/plugins/fast_bakery/baker.py +110 -0
  46. metaflow_extensions/outerbounds/plugins/fast_bakery/docker_environment.py +43 -9
  47. metaflow_extensions/outerbounds/plugins/fast_bakery/fast_bakery.py +12 -0
  48. metaflow_extensions/outerbounds/plugins/kubernetes/kubernetes_client.py +18 -44
  49. metaflow_extensions/outerbounds/plugins/kubernetes/pod_killer.py +374 -0
  50. metaflow_extensions/outerbounds/plugins/nim/card.py +2 -16
  51. metaflow_extensions/outerbounds/plugins/nim/{__init__.py → nim_decorator.py} +13 -49
  52. metaflow_extensions/outerbounds/plugins/nim/nim_manager.py +294 -233
  53. metaflow_extensions/outerbounds/plugins/nim/utils.py +36 -0
  54. metaflow_extensions/outerbounds/plugins/nvcf/constants.py +2 -2
  55. metaflow_extensions/outerbounds/plugins/nvcf/nvcf.py +100 -19
  56. metaflow_extensions/outerbounds/plugins/nvcf/nvcf_decorator.py +6 -1
  57. metaflow_extensions/outerbounds/plugins/nvct/__init__.py +0 -0
  58. metaflow_extensions/outerbounds/plugins/nvct/exceptions.py +71 -0
  59. metaflow_extensions/outerbounds/plugins/nvct/nvct.py +131 -0
  60. metaflow_extensions/outerbounds/plugins/nvct/nvct_cli.py +289 -0
  61. metaflow_extensions/outerbounds/plugins/nvct/nvct_decorator.py +286 -0
  62. metaflow_extensions/outerbounds/plugins/nvct/nvct_runner.py +218 -0
  63. metaflow_extensions/outerbounds/plugins/nvct/utils.py +29 -0
  64. metaflow_extensions/outerbounds/plugins/ollama/__init__.py +225 -0
  65. metaflow_extensions/outerbounds/plugins/ollama/constants.py +1 -0
  66. metaflow_extensions/outerbounds/plugins/ollama/exceptions.py +22 -0
  67. metaflow_extensions/outerbounds/plugins/ollama/ollama.py +1924 -0
  68. metaflow_extensions/outerbounds/plugins/ollama/status_card.py +292 -0
  69. metaflow_extensions/outerbounds/plugins/optuna/__init__.py +48 -0
  70. metaflow_extensions/outerbounds/plugins/profilers/simple_card_decorator.py +96 -0
  71. metaflow_extensions/outerbounds/plugins/s3_proxy/__init__.py +7 -0
  72. metaflow_extensions/outerbounds/plugins/s3_proxy/binary_caller.py +132 -0
  73. metaflow_extensions/outerbounds/plugins/s3_proxy/constants.py +11 -0
  74. metaflow_extensions/outerbounds/plugins/s3_proxy/exceptions.py +13 -0
  75. metaflow_extensions/outerbounds/plugins/s3_proxy/proxy_bootstrap.py +59 -0
  76. metaflow_extensions/outerbounds/plugins/s3_proxy/s3_proxy_api.py +93 -0
  77. metaflow_extensions/outerbounds/plugins/s3_proxy/s3_proxy_decorator.py +250 -0
  78. metaflow_extensions/outerbounds/plugins/s3_proxy/s3_proxy_manager.py +225 -0
  79. metaflow_extensions/outerbounds/plugins/secrets/secrets.py +38 -2
  80. metaflow_extensions/outerbounds/plugins/snowflake/snowflake.py +81 -11
  81. metaflow_extensions/outerbounds/plugins/snowpark/snowpark.py +18 -8
  82. metaflow_extensions/outerbounds/plugins/snowpark/snowpark_cli.py +6 -0
  83. metaflow_extensions/outerbounds/plugins/snowpark/snowpark_client.py +45 -18
  84. metaflow_extensions/outerbounds/plugins/snowpark/snowpark_decorator.py +18 -9
  85. metaflow_extensions/outerbounds/plugins/snowpark/snowpark_job.py +10 -4
  86. metaflow_extensions/outerbounds/plugins/torchtune/__init__.py +163 -0
  87. metaflow_extensions/outerbounds/plugins/vllm/__init__.py +255 -0
  88. metaflow_extensions/outerbounds/plugins/vllm/constants.py +1 -0
  89. metaflow_extensions/outerbounds/plugins/vllm/exceptions.py +1 -0
  90. metaflow_extensions/outerbounds/plugins/vllm/status_card.py +352 -0
  91. metaflow_extensions/outerbounds/plugins/vllm/vllm_manager.py +621 -0
  92. metaflow_extensions/outerbounds/remote_config.py +46 -9
  93. metaflow_extensions/outerbounds/toplevel/global_aliases_for_metaflow_package.py +94 -2
  94. metaflow_extensions/outerbounds/toplevel/ob_internal.py +4 -0
  95. metaflow_extensions/outerbounds/toplevel/plugins/ollama/__init__.py +1 -0
  96. metaflow_extensions/outerbounds/toplevel/plugins/optuna/__init__.py +1 -0
  97. metaflow_extensions/outerbounds/toplevel/plugins/torchtune/__init__.py +1 -0
  98. metaflow_extensions/outerbounds/toplevel/plugins/vllm/__init__.py +1 -0
  99. metaflow_extensions/outerbounds/toplevel/s3_proxy.py +88 -0
  100. {ob_metaflow_extensions-1.1.130.dist-info → ob_metaflow_extensions-1.5.1.dist-info}/METADATA +2 -2
  101. ob_metaflow_extensions-1.5.1.dist-info/RECORD +133 -0
  102. metaflow_extensions/outerbounds/plugins/nim/utilities.py +0 -5
  103. ob_metaflow_extensions-1.1.130.dist-info/RECORD +0 -56
  104. {ob_metaflow_extensions-1.1.130.dist-info → ob_metaflow_extensions-1.5.1.dist-info}/WHEEL +0 -0
  105. {ob_metaflow_extensions-1.1.130.dist-info → ob_metaflow_extensions-1.5.1.dist-info}/top_level.txt +0 -0
@@ -1,6 +1,6 @@
1
1
  import metaflow.metaflow_config_funcs
2
2
 
3
- from metaflow_extensions.outerbounds.remote_config import init_config
3
+ from metaflow_extensions.outerbounds.remote_config import init_config, reload_config
4
4
 
5
5
  # we want to override OSS Metaflow's initialization behavior with our own to support remote configs
6
6
  # we're reassigning the METAFLOW_CONFIG variable because all downstream settings rely on it and
@@ -41,6 +41,9 @@ def get_boto3_session(role_arn=None, session_vars=None):
41
41
  import boto3
42
42
  import botocore
43
43
  from metaflow_extensions.outerbounds.plugins.auth_server import get_token
44
+ from metaflow_extensions.outerbounds.plugins.aws.assume_role import (
45
+ OBP_ASSUME_ROLE_ARN_ENV_VAR,
46
+ )
44
47
 
45
48
  from hashlib import sha256
46
49
  from metaflow.util import get_username
@@ -69,6 +72,12 @@ def get_boto3_session(role_arn=None, session_vars=None):
69
72
  if token_info.get("cspr_role_arn"):
70
73
  cspr_role = token_info["cspr_role_arn"]
71
74
 
75
+ # Check if the assume_role decorator has set a CSPR ARN via environment variable
76
+ # This takes precedence over CSPR role that comes from the token_info response
77
+ decorator_role_arn = os.environ.get(OBP_ASSUME_ROLE_ARN_ENV_VAR)
78
+ if decorator_role_arn:
79
+ cspr_role = decorator_role_arn
80
+
72
81
  if cspr_role:
73
82
  # If CSPR role is set, we set it as the default role to assume
74
83
  # for the AWS SDK. We do this by writing an AWS config file
@@ -162,13 +171,18 @@ class ObpAuthProvider(object):
162
171
  client_params = {}
163
172
 
164
173
  from botocore.exceptions import ClientError
174
+ from botocore.config import Config
165
175
 
166
176
  with hide_access_keys():
167
177
  session = get_boto3_session(role_arn, session_vars)
178
+ _client_params = client_params.copy()
179
+ if _client_params.get("config") and type(_client_params["config"]) == dict:
180
+ _client_params["config"] = Config(**_client_params["config"])
181
+
168
182
  if with_error:
169
- return session.client(module, **client_params), ClientError
183
+ return session.client(module, **_client_params), ClientError
170
184
  else:
171
- return session.client(module, **client_params)
185
+ return session.client(module, **_client_params)
172
186
 
173
187
 
174
188
  AWS_CLIENT_PROVIDERS_DESC = [("obp", ".ObpAuthProvider")]
@@ -307,11 +321,13 @@ class ObpGcpAuthProvider(object):
307
321
  GCP_CLIENT_PROVIDERS_DESC = [("obp", ".ObpGcpAuthProvider")]
308
322
  CLIS_DESC = [
309
323
  ("nvidia", ".nvcf.nvcf_cli.cli"),
324
+ ("nvct", ".nvct.nvct_cli.cli"),
310
325
  ("fast-bakery", ".fast_bakery.fast_bakery_cli.cli"),
311
326
  ("snowpark", ".snowpark.snowpark_cli.cli"),
312
327
  ]
313
328
  STEP_DECORATORS_DESC = [
314
329
  ("nvidia", ".nvcf.nvcf_decorator.NvcfDecorator"),
330
+ ("nvct", ".nvct.nvct_decorator.NvctDecorator"),
315
331
  (
316
332
  "fast_bakery_internal",
317
333
  ".fast_bakery.fast_bakery_decorator.InternalFastBakeryDecorator",
@@ -319,7 +335,21 @@ STEP_DECORATORS_DESC = [
319
335
  ("snowpark", ".snowpark.snowpark_decorator.SnowparkDecorator"),
320
336
  ("tensorboard", ".tensorboard.TensorboardDecorator"),
321
337
  ("gpu_profile", ".profilers.gpu_profile_decorator.GPUProfileDecorator"),
322
- ("nim", ".nim.NimDecorator"),
338
+ ("test_append_card", ".profilers.simple_card_decorator.DynamicCardAppendDecorator"),
339
+ ("nim", ".nim.nim_decorator.NimDecorator"),
340
+ ("ollama", ".ollama.OllamaDecorator"),
341
+ ("vllm", ".vllm.VLLMDecorator"),
342
+ ("s3_proxy", ".s3_proxy.s3_proxy_decorator.S3ProxyDecorator"),
343
+ ("nebius_s3_proxy", ".s3_proxy.s3_proxy_decorator.NebiusS3ProxyDecorator"),
344
+ ("coreweave_s3_proxy", ".s3_proxy.s3_proxy_decorator.CoreWeaveS3ProxyDecorator"),
345
+ (
346
+ "app_deploy_internal",
347
+ ".apps.core.app_deploy_decorator.AppDeployInternalDecorator",
348
+ ),
349
+ ]
350
+
351
+ FLOW_DECORATORS_DESC = [
352
+ ("app_deploy", ".apps.core.app_deploy_decorator.AppDeployFlowDecorator"),
323
353
  ]
324
354
 
325
355
  TOGGLE_STEP_DECORATOR = [
@@ -338,4 +368,4 @@ SECRETS_PROVIDERS_DESC = [
338
368
  ("outerbounds", ".secrets.secrets.OuterboundsSecretsProvider"),
339
369
  ]
340
370
  # Adding an override here so the library can be imported at the metaflow.plugins level
341
- __mf_promote_submodules__ = ["snowflake"]
371
+ __mf_promote_submodules__ = ["snowflake", "ollama", "torchtune", "optuna"]
@@ -0,0 +1,187 @@
1
+ from metaflow.exception import MetaflowException
2
+ import os
3
+ from metaflow.metaflow_config_funcs import init_config
4
+ import requests
5
+ import time
6
+ import random
7
+
8
+ # IMPORTANT: Currently contents of this file are mostly duplicated from the outerbounds package.
9
+ # This is purely due to the time pressure of delivering this feature. Going forward, we
10
+ # will reorganize things in a way that the amount of duplication is minimal.
11
+
12
+
13
+ APP_READY_POLL_TIMEOUT_SECONDS = 300
14
+ # Even after our backend validates that the app routes are ready, it takes a few seconds for
15
+ # the app to be accessible via the browser. Till we hunt down this delay, add an extra buffer.
16
+ APP_READY_EXTRA_BUFFER_SECONDS = 30
17
+
18
+
19
def _print_app_urls(api_url, workstation_id, name, port):
    """Print the browser and API URLs for app `name` served on `port`."""
    print(f"App {name} started on port {port}!")
    print(
        f"Browser URL: https://{api_url.replace('api', 'ui')}/apps/{workstation_id}/{name}/"
    )
    print(f"API URL: https://{api_url}/apps/{workstation_id}/{name}/")


def start_app(port=-1, name=""):
    """
    Starts an app on the workstation.

    Lists the workstations, finds the one this process runs on (via the
    WORKSTATION_ID environment variable), validates the request against its
    named ports, and then enables the named port for `name` through an update
    call. Once the update is accepted, waits until the app is reported ready.

    Args:
        port: The named port the app should be served on.
        name: The app name; 1-19 lowercase alphanumeric characters.

    Raises:
        MetaflowException: If the name is invalid, the command is not run on a
            workstation, the workstations cannot be listed, the request
            conflicts with the existing named ports, or the app does not
            become ready in time.
    """
    if len(name) == 0 or len(name) >= 20:
        # The message now matches the check: empty names and names of 20 or
        # more characters are both rejected.
        raise MetaflowException("App name must be between 1 and 19 characters long.")
    elif not name.isalnum() or not name.islower():
        raise MetaflowException(
            "App name can only contain lowercase alphanumeric characters."
        )

    if "WORKSTATION_ID" not in os.environ:
        raise MetaflowException(
            "All outerbounds app commands can only be run from a workstation."
        )

    # Every workstation has this environment variable set.
    workstation_id = os.environ["WORKSTATION_ID"]

    try:
        try:
            conf = init_config()
            metaflow_token = conf["METAFLOW_SERVICE_AUTH_KEY"]
            api_url = conf["OBP_API_SERVER"]

            workstations_response = requests.get(
                f"https://{api_url}/v1/workstations",
                headers={"x-api-key": metaflow_token},
            )
            workstations_response.raise_for_status()
        except Exception as e:
            # `Exception` rather than a bare except, so Ctrl-C is not masked.
            raise MetaflowException("Failed to list workstations!") from e

        for workstation in workstations_response.json()["workstations"]:
            if workstation["instance_id"] != workstation_id:
                continue
            if "named_ports" not in workstation["spec"]:
                continue

            named_ports = workstation["spec"]["named_ports"]
            try:
                ensure_app_start_request_is_valid(named_ports, port, name)
            except ValueError as e:
                raise MetaflowException(str(e))

            for named_port in named_ports:
                if int(named_port["port"]) != port:
                    continue

                if named_port["enabled"] and named_port["name"] == name:
                    # The app is already being served on this port.
                    _print_app_urls(api_url, workstation_id, name, port)
                    return

                response = requests.put(
                    f"https://{api_url}/v1/workstations/update/{workstation_id}/namedports",
                    headers={"x-api-key": metaflow_token},
                    json={
                        "port": port,
                        "name": name,
                        "enabled": True,
                    },
                )
                response.raise_for_status()

                poll_success = wait_for_app_port_to_be_accessible(
                    api_url,
                    metaflow_token,
                    workstation_id,
                    name,
                    APP_READY_POLL_TIMEOUT_SECONDS,
                )
                if not poll_success:
                    raise MetaflowException(
                        f"The app could not be deployed in {APP_READY_POLL_TIMEOUT_SECONDS // 60} minutes. Please try again later."
                    )
                _print_app_urls(api_url, workstation_id, name, port)
        # NOTE(review): if no workstation matches WORKSTATION_ID, or it has no
        # named ports, the function returns without doing or printing
        # anything. Confirm whether that should be reported as an error.
    except MetaflowException:
        # Already carries a specific, user-facing message; do not replace it
        # with the generic one below.
        raise
    except Exception as e:
        raise MetaflowException(f"Failed to start app {name} on port {port}!") from e
113
+
114
+
115
def ensure_app_start_request_is_valid(existing_named_ports, port: int, name: str):
    """
    Ensures that the port number is available on the workstation and that an app of
    the same name is not already opened on a different port.

    Port values of the existing named ports are normalized with `int()` before
    comparison, so a port reported as the string "8080" matches the integer 8080.

    Args:
        existing_named_ports: A list of named ports on the workstation; each entry
            is read through its "port", "name" and "enabled" keys.
        port: The port number to check.
        name: The name of the app to check.

    Raises:
        MetaflowException: If `port` is not one of the existing named ports, or an
            enabled named port on a different port already uses `name`.
        ValueError: If an existing named port has a non-numeric "port" value.
    """
    existing_ports = [int(named_port["port"]) for named_port in existing_named_ports]

    if port not in existing_ports:
        raise MetaflowException(f"Port {port} not found on workstation")

    for named_port, existing_port in zip(existing_named_ports, existing_ports):
        if (
            named_port["name"] == name
            and existing_port != port
            and named_port["enabled"]
        ):
            raise MetaflowException(
                f"App with name '{name}' is already deployed on port {named_port['port']}"
            )
139
+
140
+
141
def wait_for_app_port_to_be_accessible(
    api_url, metaflow_token, workstation_id, app_name, poll_timeout_seconds
) -> bool:
    """
    Waits for the app to be ready by polling the workstation status.

    Each poll lists the workstations and checks readiness with `is_app_ready`.
    Connection errors and read timeouts are retried with a capped, jittered
    exponential backoff (up to `num_retries_per_request` attempts per poll).

    Args:
        api_url: Host of the API server (no scheme).
        metaflow_token: Value sent in the "x-api-key" header.
        workstation_id: Instance id of the workstation hosting the app.
        app_name: Name of the app to wait for.
        poll_timeout_seconds: How long to keep polling before giving up.

    Returns:
        True once the app is reported ready (after an extra
        APP_READY_EXTRA_BUFFER_SECONDS sleep), False if the timeout elapses first.

    Raises:
        requests.exceptions.HTTPError: If a poll gets an error status code.
    """
    num_retries_per_request = 3
    # Without a timeout `requests` waits indefinitely, which would make the
    # ReadTimeout handling below unreachable and could hang the poll forever.
    request_timeout_seconds = 30
    retry_delay = 1.0
    poll_interval = 10
    wait_message = f"App {app_name} is currently being deployed..."

    # A monotonic clock keeps the deadline immune to wall-clock adjustments.
    deadline = time.monotonic() + poll_timeout_seconds
    while time.monotonic() < deadline:
        for _ in range(num_retries_per_request):
            try:
                workstations_response = requests.get(
                    f"https://{api_url}/v1/workstations",
                    headers={"x-api-key": metaflow_token},
                    timeout=request_timeout_seconds,
                )
                workstations_response.raise_for_status()
            except (
                requests.exceptions.ConnectionError,
                requests.exceptions.ReadTimeout,
            ):
                time.sleep(retry_delay)
                retry_delay *= 2  # Double the delay for the next attempt
                retry_delay += random.uniform(0, 1)  # Add jitter
                retry_delay = min(retry_delay, 10)
                continue

            print(wait_message)
            if is_app_ready(workstations_response.json(), workstation_id, app_name):
                time.sleep(APP_READY_EXTRA_BUFFER_SECONDS)
                return True

            time.sleep(poll_interval)
            # The request succeeded, so stop retrying and re-check the deadline
            # before the next poll instead of polling again unconditionally.
            break
    return False
176
+
177
+
178
def is_app_ready(response_json: dict, workstation_id: str, app_name: str) -> bool:
    """
    Checks if the app is ready in the given workstation's response.

    Args:
        response_json: Decoded body of the workstations listing; read through
            its "workstations" key.
        workstation_id: Value to match against each workstation's "instance_id".
        app_name: Value to match against each hosted app's "name".

    Returns:
        The truthiness of the first matching hosted app's "ready" field, or
        False when the workstation or the app is not present in the response.
    """
    # `or` fallbacks (rather than `.get(key, default)`) also cover keys that
    # are present but null, which would otherwise raise on iteration or `.get`.
    for workstation in response_json.get("workstations") or []:
        if workstation.get("instance_id") != workstation_id:
            continue
        status = workstation.get("status") or {}
        for hosted_app in status.get("hosted_apps") or []:
            if hosted_app.get("name") == app_name:
                return bool(hosted_app.get("ready"))
    return False
@@ -0,0 +1,3 @@
1
+ DEFAULT_WAIT_TIME_SECONDS_FOR_PROCESS_TO_START = 10
2
+ BASE_DIR_FOR_APP_ASSETS = "/home/ob-workspace/.appdaemon/apps/"
3
+ APP_DAEMON_WORKSTAION_PATH = "/home/ob-workspace/.appdaemon"
@@ -0,0 +1,15 @@
1
+ from . import config
2
+ from . import dependencies
3
+ from . import capsule
4
+ from . import utils
5
+ from . import app_config
6
+ from . import code_package
7
+ from .deployer import AppDeployer, bake_image, package_code, DeployedApp
8
+ from .config import BakedImage, PackagedCode
9
+ from .config.typed_configs import (
10
+ ReplicaConfigDict,
11
+ ResourceConfigDict,
12
+ AuthConfigDict,
13
+ DependencyConfigDict,
14
+ PackageConfigDict,
15
+ )