ob-metaflow-extensions 1.1.45rc3__py2.py3-none-any.whl → 1.5.1__py2.py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of ob-metaflow-extensions might be problematic. Click here for more details.
- metaflow_extensions/outerbounds/__init__.py +1 -7
- metaflow_extensions/outerbounds/config/__init__.py +35 -0
- metaflow_extensions/outerbounds/plugins/__init__.py +186 -57
- metaflow_extensions/outerbounds/plugins/apps/__init__.py +0 -0
- metaflow_extensions/outerbounds/plugins/apps/app_cli.py +0 -0
- metaflow_extensions/outerbounds/plugins/apps/app_utils.py +187 -0
- metaflow_extensions/outerbounds/plugins/apps/consts.py +3 -0
- metaflow_extensions/outerbounds/plugins/apps/core/__init__.py +15 -0
- metaflow_extensions/outerbounds/plugins/apps/core/_state_machine.py +506 -0
- metaflow_extensions/outerbounds/plugins/apps/core/_vendor/__init__.py +0 -0
- metaflow_extensions/outerbounds/plugins/apps/core/_vendor/spinner/__init__.py +4 -0
- metaflow_extensions/outerbounds/plugins/apps/core/_vendor/spinner/spinners.py +478 -0
- metaflow_extensions/outerbounds/plugins/apps/core/app_config.py +128 -0
- metaflow_extensions/outerbounds/plugins/apps/core/app_deploy_decorator.py +330 -0
- metaflow_extensions/outerbounds/plugins/apps/core/artifacts.py +0 -0
- metaflow_extensions/outerbounds/plugins/apps/core/capsule.py +958 -0
- metaflow_extensions/outerbounds/plugins/apps/core/click_importer.py +24 -0
- metaflow_extensions/outerbounds/plugins/apps/core/code_package/__init__.py +3 -0
- metaflow_extensions/outerbounds/plugins/apps/core/code_package/code_packager.py +618 -0
- metaflow_extensions/outerbounds/plugins/apps/core/code_package/examples.py +125 -0
- metaflow_extensions/outerbounds/plugins/apps/core/config/__init__.py +15 -0
- metaflow_extensions/outerbounds/plugins/apps/core/config/cli_generator.py +165 -0
- metaflow_extensions/outerbounds/plugins/apps/core/config/config_utils.py +966 -0
- metaflow_extensions/outerbounds/plugins/apps/core/config/schema_export.py +299 -0
- metaflow_extensions/outerbounds/plugins/apps/core/config/typed_configs.py +233 -0
- metaflow_extensions/outerbounds/plugins/apps/core/config/typed_init_generator.py +537 -0
- metaflow_extensions/outerbounds/plugins/apps/core/config/unified_config.py +1125 -0
- metaflow_extensions/outerbounds/plugins/apps/core/config_schema.yaml +337 -0
- metaflow_extensions/outerbounds/plugins/apps/core/dependencies.py +115 -0
- metaflow_extensions/outerbounds/plugins/apps/core/deployer.py +959 -0
- metaflow_extensions/outerbounds/plugins/apps/core/experimental/__init__.py +89 -0
- metaflow_extensions/outerbounds/plugins/apps/core/perimeters.py +87 -0
- metaflow_extensions/outerbounds/plugins/apps/core/secrets.py +164 -0
- metaflow_extensions/outerbounds/plugins/apps/core/utils.py +233 -0
- metaflow_extensions/outerbounds/plugins/apps/core/validations.py +17 -0
- metaflow_extensions/outerbounds/plugins/apps/deploy_decorator.py +201 -0
- metaflow_extensions/outerbounds/plugins/apps/supervisord_utils.py +243 -0
- metaflow_extensions/outerbounds/plugins/auth_server.py +28 -8
- metaflow_extensions/outerbounds/plugins/aws/__init__.py +4 -0
- metaflow_extensions/outerbounds/plugins/aws/assume_role.py +3 -0
- metaflow_extensions/outerbounds/plugins/aws/assume_role_decorator.py +118 -0
- metaflow_extensions/outerbounds/plugins/card_utilities/__init__.py +0 -0
- metaflow_extensions/outerbounds/plugins/card_utilities/async_cards.py +142 -0
- metaflow_extensions/outerbounds/plugins/card_utilities/extra_components.py +545 -0
- metaflow_extensions/outerbounds/plugins/card_utilities/injector.py +70 -0
- metaflow_extensions/outerbounds/plugins/checkpoint_datastores/__init__.py +2 -0
- metaflow_extensions/outerbounds/plugins/checkpoint_datastores/coreweave.py +71 -0
- metaflow_extensions/outerbounds/plugins/checkpoint_datastores/external_chckpt.py +85 -0
- metaflow_extensions/outerbounds/plugins/checkpoint_datastores/nebius.py +73 -0
- metaflow_extensions/outerbounds/plugins/fast_bakery/__init__.py +0 -0
- metaflow_extensions/outerbounds/plugins/fast_bakery/baker.py +110 -0
- metaflow_extensions/outerbounds/plugins/fast_bakery/docker_environment.py +391 -0
- metaflow_extensions/outerbounds/plugins/fast_bakery/fast_bakery.py +188 -0
- metaflow_extensions/outerbounds/plugins/fast_bakery/fast_bakery_cli.py +54 -0
- metaflow_extensions/outerbounds/plugins/fast_bakery/fast_bakery_decorator.py +50 -0
- metaflow_extensions/outerbounds/plugins/kubernetes/kubernetes_client.py +79 -0
- metaflow_extensions/outerbounds/plugins/kubernetes/pod_killer.py +374 -0
- metaflow_extensions/outerbounds/plugins/nim/card.py +140 -0
- metaflow_extensions/outerbounds/plugins/nim/nim_decorator.py +101 -0
- metaflow_extensions/outerbounds/plugins/nim/nim_manager.py +379 -0
- metaflow_extensions/outerbounds/plugins/nim/utils.py +36 -0
- metaflow_extensions/outerbounds/plugins/nvcf/__init__.py +0 -0
- metaflow_extensions/outerbounds/plugins/nvcf/constants.py +3 -0
- metaflow_extensions/outerbounds/plugins/nvcf/exceptions.py +94 -0
- metaflow_extensions/outerbounds/plugins/nvcf/heartbeat_store.py +178 -0
- metaflow_extensions/outerbounds/plugins/nvcf/nvcf.py +417 -0
- metaflow_extensions/outerbounds/plugins/nvcf/nvcf_cli.py +280 -0
- metaflow_extensions/outerbounds/plugins/nvcf/nvcf_decorator.py +242 -0
- metaflow_extensions/outerbounds/plugins/nvcf/utils.py +6 -0
- metaflow_extensions/outerbounds/plugins/nvct/__init__.py +0 -0
- metaflow_extensions/outerbounds/plugins/nvct/exceptions.py +71 -0
- metaflow_extensions/outerbounds/plugins/nvct/nvct.py +131 -0
- metaflow_extensions/outerbounds/plugins/nvct/nvct_cli.py +289 -0
- metaflow_extensions/outerbounds/plugins/nvct/nvct_decorator.py +286 -0
- metaflow_extensions/outerbounds/plugins/nvct/nvct_runner.py +218 -0
- metaflow_extensions/outerbounds/plugins/nvct/utils.py +29 -0
- metaflow_extensions/outerbounds/plugins/ollama/__init__.py +225 -0
- metaflow_extensions/outerbounds/plugins/ollama/constants.py +1 -0
- metaflow_extensions/outerbounds/plugins/ollama/exceptions.py +22 -0
- metaflow_extensions/outerbounds/plugins/ollama/ollama.py +1924 -0
- metaflow_extensions/outerbounds/plugins/ollama/status_card.py +292 -0
- metaflow_extensions/outerbounds/plugins/optuna/__init__.py +48 -0
- metaflow_extensions/outerbounds/plugins/perimeters.py +19 -5
- metaflow_extensions/outerbounds/plugins/profilers/deco_injector.py +70 -0
- metaflow_extensions/outerbounds/plugins/profilers/gpu_profile_decorator.py +88 -0
- metaflow_extensions/outerbounds/plugins/profilers/simple_card_decorator.py +96 -0
- metaflow_extensions/outerbounds/plugins/s3_proxy/__init__.py +7 -0
- metaflow_extensions/outerbounds/plugins/s3_proxy/binary_caller.py +132 -0
- metaflow_extensions/outerbounds/plugins/s3_proxy/constants.py +11 -0
- metaflow_extensions/outerbounds/plugins/s3_proxy/exceptions.py +13 -0
- metaflow_extensions/outerbounds/plugins/s3_proxy/proxy_bootstrap.py +59 -0
- metaflow_extensions/outerbounds/plugins/s3_proxy/s3_proxy_api.py +93 -0
- metaflow_extensions/outerbounds/plugins/s3_proxy/s3_proxy_decorator.py +250 -0
- metaflow_extensions/outerbounds/plugins/s3_proxy/s3_proxy_manager.py +225 -0
- metaflow_extensions/outerbounds/plugins/secrets/__init__.py +0 -0
- metaflow_extensions/outerbounds/plugins/secrets/secrets.py +204 -0
- metaflow_extensions/outerbounds/plugins/snowflake/__init__.py +3 -0
- metaflow_extensions/outerbounds/plugins/snowflake/snowflake.py +378 -0
- metaflow_extensions/outerbounds/plugins/snowpark/__init__.py +0 -0
- metaflow_extensions/outerbounds/plugins/snowpark/snowpark.py +309 -0
- metaflow_extensions/outerbounds/plugins/snowpark/snowpark_cli.py +277 -0
- metaflow_extensions/outerbounds/plugins/snowpark/snowpark_client.py +150 -0
- metaflow_extensions/outerbounds/plugins/snowpark/snowpark_decorator.py +273 -0
- metaflow_extensions/outerbounds/plugins/snowpark/snowpark_exceptions.py +13 -0
- metaflow_extensions/outerbounds/plugins/snowpark/snowpark_job.py +241 -0
- metaflow_extensions/outerbounds/plugins/snowpark/snowpark_service_spec.py +259 -0
- metaflow_extensions/outerbounds/plugins/tensorboard/__init__.py +50 -0
- metaflow_extensions/outerbounds/plugins/torchtune/__init__.py +163 -0
- metaflow_extensions/outerbounds/plugins/vllm/__init__.py +255 -0
- metaflow_extensions/outerbounds/plugins/vllm/constants.py +1 -0
- metaflow_extensions/outerbounds/plugins/vllm/exceptions.py +1 -0
- metaflow_extensions/outerbounds/plugins/vllm/status_card.py +352 -0
- metaflow_extensions/outerbounds/plugins/vllm/vllm_manager.py +621 -0
- metaflow_extensions/outerbounds/profilers/gpu.py +131 -47
- metaflow_extensions/outerbounds/remote_config.py +53 -16
- metaflow_extensions/outerbounds/toplevel/global_aliases_for_metaflow_package.py +138 -2
- metaflow_extensions/outerbounds/toplevel/ob_internal.py +4 -0
- metaflow_extensions/outerbounds/toplevel/plugins/ollama/__init__.py +1 -0
- metaflow_extensions/outerbounds/toplevel/plugins/optuna/__init__.py +1 -0
- metaflow_extensions/outerbounds/toplevel/plugins/snowflake/__init__.py +1 -0
- metaflow_extensions/outerbounds/toplevel/plugins/torchtune/__init__.py +1 -0
- metaflow_extensions/outerbounds/toplevel/plugins/vllm/__init__.py +1 -0
- metaflow_extensions/outerbounds/toplevel/s3_proxy.py +88 -0
- {ob_metaflow_extensions-1.1.45rc3.dist-info → ob_metaflow_extensions-1.5.1.dist-info}/METADATA +2 -2
- ob_metaflow_extensions-1.5.1.dist-info/RECORD +133 -0
- ob_metaflow_extensions-1.1.45rc3.dist-info/RECORD +0 -19
- {ob_metaflow_extensions-1.1.45rc3.dist-info → ob_metaflow_extensions-1.5.1.dist-info}/WHEEL +0 -0
- {ob_metaflow_extensions-1.1.45rc3.dist-info → ob_metaflow_extensions-1.5.1.dist-info}/top_level.txt +0 -0
|
@@ -0,0 +1,201 @@
|
|
|
1
|
+
from metaflow.exception import MetaflowException
|
|
2
|
+
from metaflow.decorators import StepDecorator
|
|
3
|
+
from metaflow import current
|
|
4
|
+
from .app_utils import start_app
|
|
5
|
+
from .supervisord_utils import SupervisorClient, SupervisorClientException
|
|
6
|
+
import os
|
|
7
|
+
import random
|
|
8
|
+
import string
|
|
9
|
+
import tempfile
|
|
10
|
+
import sys
|
|
11
|
+
from .consts import (
|
|
12
|
+
BASE_DIR_FOR_APP_ASSETS,
|
|
13
|
+
DEFAULT_WAIT_TIME_SECONDS_FOR_PROCESS_TO_START,
|
|
14
|
+
)
|
|
15
|
+
|
|
16
|
+
"""
|
|
17
|
+
There are 3 variants of starting apps that we support through this function. Which variant is applied is
|
|
18
|
+
a result of the user setting (or not setting) self.entrypoint and self.app_dir.
|
|
19
|
+
|
|
20
|
+
The chosen variant determines whether or not the user will have an easy (auto-magical) way to write their
|
|
21
|
+
Metaflow artifacts the the directory where their app will be run from. This will simplify managing models
|
|
22
|
+
from inside the code.
|
|
23
|
+
|
|
24
|
+
Case 1:
|
|
25
|
+
Desired Behavior:
|
|
26
|
+
The user doesn't care about auto-magical artifact management, they just have a server.py somewhere and
|
|
27
|
+
they want to run it.
|
|
28
|
+
How:
|
|
29
|
+
The user sets self.entrypoint to the name of their file (optionally with any args).
|
|
30
|
+
The value of self.app_dir is irrelevant.
|
|
31
|
+
Example: self.entrypoint = "/home/ob-workspace/my_random_directory/my_subfolder/server.py --my_arg 764"
|
|
32
|
+
Expected Behavior:
|
|
33
|
+
The users app will be started using a conda environment built by Metaflow, that's it.
|
|
34
|
+
|
|
35
|
+
Case 2:
|
|
36
|
+
Desired Behavior:
|
|
37
|
+
The user has defined a clean package/module structure with a __main__.py in a folder somewhere. They would like
|
|
38
|
+
to run this module as an app. They would like to access models in the same top level directory as their app.
|
|
39
|
+
How:
|
|
40
|
+
The user sets self.entrypoint to None.
|
|
41
|
+
The user sets self.app_dir to the top level directory of their app.
|
|
42
|
+
Example: self.app_dir = "/home/ob-workspace/my_random_directory/my_subfolder"
|
|
43
|
+
(my_subfolder HAS to contain a __main__.py file.)
|
|
44
|
+
Expected Behavior:
|
|
45
|
+
The users app will be started using a conda environment built by Metaflow. The user gets access to self.deploy_dir where they
|
|
46
|
+
write their artifacts. After the user writes their artifacts to self.deploy_dir, the artifacts and the app is copied over
|
|
47
|
+
to an internal directory where we will deploy the app from.
|
|
48
|
+
The internal directory is: /home/ob-workspace/.appdaemon/apps/<app_port>.
|
|
49
|
+
<app_port> will contain user's __main__.py and other files (copied recursively), as well as their artifacts that they wrote to
|
|
50
|
+
self.deploy_dir. self.deploy_dir/my_model becomes <app_port>/my_model. my_subfolder/__main__.py becomes <app_port>/__main__.py.
|
|
51
|
+
|
|
52
|
+
Case 3:
|
|
53
|
+
Desired Behavior:
|
|
54
|
+
The user follows the Outerbounds convention, which is the same as Case 2, except the app is actually in the same top level folder
|
|
55
|
+
as the Deployer flow. They would like to access models as usual.
|
|
56
|
+
How:
|
|
57
|
+
The user sets self.entrypoint to None (or doesn't set it at all).
|
|
58
|
+
The user sets self.app_dir to None (or doesn't set it at all).
|
|
59
|
+
Expected Behavior:
|
|
60
|
+
The users app will be started using a conda environment built by Metaflow. Everything else is exactly the same as Case 2.
|
|
61
|
+
"""
|
|
62
|
+
|
|
63
|
+
|
|
64
|
+
class WorkstationAppDeployDecorator(StepDecorator):
|
|
65
|
+
"""
|
|
66
|
+
Specifies that this step is used to deploy an instance of the app.
|
|
67
|
+
Requires that self.app_name, self.app_port, self.entrypoint and self.deployDir is set.
|
|
68
|
+
|
|
69
|
+
Parameters
|
|
70
|
+
----------
|
|
71
|
+
app_port : int
|
|
72
|
+
Number of GPUs to use.
|
|
73
|
+
app_name : str
|
|
74
|
+
Name of the app to deploy.
|
|
75
|
+
"""
|
|
76
|
+
|
|
77
|
+
name = "app_deploy"
|
|
78
|
+
defaults = {"app_port": 8080, "app_name": "app"}
|
|
79
|
+
|
|
80
|
+
def step_init(self, flow, graph, step, decos, environment, flow_datastore, logger):
|
|
81
|
+
if any([deco.name == "kubernetes" for deco in decos]):
|
|
82
|
+
raise MetaflowException(
|
|
83
|
+
"@app_deploy decorator is only supported locally and does not work with remote execution environments like @kubernetes, @nvidia."
|
|
84
|
+
)
|
|
85
|
+
|
|
86
|
+
# We always need to have some environment defined through the flow to deploy and app.
|
|
87
|
+
# Which means either step decorators like @pypi / @conda must be defined.
|
|
88
|
+
# or flow level decorators like @conda_base / @pypi_base.
|
|
89
|
+
if not any([deco.name == "pypi" or deco.name == "conda" for deco in decos]):
|
|
90
|
+
flow_decorators = flow._flow_decorators.keys()
|
|
91
|
+
if (
|
|
92
|
+
"conda_base" not in flow_decorators
|
|
93
|
+
and "pypi_base" not in flow_decorators
|
|
94
|
+
):
|
|
95
|
+
raise MetaflowException(
|
|
96
|
+
"@app_deploy requires either step decorators like @pypi / @conda or flow level decorators like @conda_base / @pypi_base to be defined."
|
|
97
|
+
)
|
|
98
|
+
|
|
99
|
+
app_port = self.attributes["app_port"]
|
|
100
|
+
app_name = self.attributes["app_name"]
|
|
101
|
+
|
|
102
|
+
# Currently this decorator is expected to only execute on workstation.
|
|
103
|
+
if app_port is None or app_port < 6000 or app_port > 6002:
|
|
104
|
+
raise MetaflowException(
|
|
105
|
+
"AppDeployDecorator requires app_port to be between 6000 and 6002."
|
|
106
|
+
)
|
|
107
|
+
|
|
108
|
+
if app_name is None:
|
|
109
|
+
raise MetaflowException("AppDeployDecorator requires app_name to be set.")
|
|
110
|
+
|
|
111
|
+
def task_pre_step(
|
|
112
|
+
self,
|
|
113
|
+
step_name,
|
|
114
|
+
task_datastore,
|
|
115
|
+
metadata,
|
|
116
|
+
run_id,
|
|
117
|
+
task_id,
|
|
118
|
+
flow,
|
|
119
|
+
graph,
|
|
120
|
+
retry_count,
|
|
121
|
+
max_user_code_retries,
|
|
122
|
+
ubf_context,
|
|
123
|
+
inputs,
|
|
124
|
+
):
|
|
125
|
+
"""
|
|
126
|
+
Runs before the step decorated with @app_deploy has started.
|
|
127
|
+
We create a directory where the user can write their artifacts and expose it via self.deploy_dir.
|
|
128
|
+
"""
|
|
129
|
+
os.makedirs(BASE_DIR_FOR_APP_ASSETS, exist_ok=True)
|
|
130
|
+
# First we want to create a directory where the user's app directory and artifacts can be stored.
|
|
131
|
+
with tempfile.TemporaryDirectory(
|
|
132
|
+
prefix=BASE_DIR_FOR_APP_ASSETS, delete=False
|
|
133
|
+
) as temp_dir:
|
|
134
|
+
launch_temp_dir = temp_dir
|
|
135
|
+
|
|
136
|
+
# Expose this to the user, so that they can use it write their artifacts.
|
|
137
|
+
setattr(flow, "deploy_dir", launch_temp_dir)
|
|
138
|
+
|
|
139
|
+
# Make sure to record deploy_dir so that the user cannot accidentally override it.
|
|
140
|
+
self._deploy_dir = launch_temp_dir
|
|
141
|
+
|
|
142
|
+
def task_post_step(
|
|
143
|
+
self, step_name, flow, graph, retry_count, max_user_code_retries
|
|
144
|
+
):
|
|
145
|
+
"""
|
|
146
|
+
Runs after the step decorated with @app_deploy has finished.
|
|
147
|
+
Based on the cases above, things that we care about are self.entrypoint and self.app_dir.
|
|
148
|
+
"""
|
|
149
|
+
|
|
150
|
+
deploy_dir = self._deploy_dir
|
|
151
|
+
|
|
152
|
+
# By default we assume that the user has a __main__.py file in their app directory.
|
|
153
|
+
# They can always override this behavior.
|
|
154
|
+
user_provided_entrypoint = getattr(flow, "entrypoint", None)
|
|
155
|
+
|
|
156
|
+
if user_provided_entrypoint is not None and not isinstance(
|
|
157
|
+
user_provided_entrypoint, str
|
|
158
|
+
):
|
|
159
|
+
raise MetaflowException(
|
|
160
|
+
f"@app_deploy requires entrypoint to be set to a string. The current value of entrypoint {user_provided_entrypoint} is not valid."
|
|
161
|
+
)
|
|
162
|
+
|
|
163
|
+
flow_directory = os.path.dirname(os.path.abspath(sys.argv[0]))
|
|
164
|
+
|
|
165
|
+
app_location = getattr(
|
|
166
|
+
flow, "app_dir", os.path.join(flow_directory, self.attributes["app_name"])
|
|
167
|
+
)
|
|
168
|
+
|
|
169
|
+
if user_provided_entrypoint is None and not os.path.exists(app_location):
|
|
170
|
+
raise MetaflowException(f"App directory {app_location} does not exist.")
|
|
171
|
+
|
|
172
|
+
wait_time_for_app_start = getattr(
|
|
173
|
+
flow,
|
|
174
|
+
"wait_time_for_app_start",
|
|
175
|
+
DEFAULT_WAIT_TIME_SECONDS_FOR_PROCESS_TO_START,
|
|
176
|
+
)
|
|
177
|
+
|
|
178
|
+
try:
|
|
179
|
+
supervisor_client = SupervisorClient(
|
|
180
|
+
wait_time_seconds_for_app_start=wait_time_for_app_start
|
|
181
|
+
)
|
|
182
|
+
|
|
183
|
+
# First, let's deploy the app.
|
|
184
|
+
start_app(
|
|
185
|
+
port=self.attributes["app_port"], name=self.attributes["app_name"]
|
|
186
|
+
)
|
|
187
|
+
|
|
188
|
+
# Now, let's add the app to supervisor.
|
|
189
|
+
supervisor_client.start_process_with_supervisord(
|
|
190
|
+
self.attributes["app_name"],
|
|
191
|
+
self.attributes["app_port"],
|
|
192
|
+
user_provided_entrypoint,
|
|
193
|
+
deploy_dir,
|
|
194
|
+
app_location,
|
|
195
|
+
)
|
|
196
|
+
except SupervisorClientException as e:
|
|
197
|
+
raise MetaflowException(str(e))
|
|
198
|
+
except Exception as e:
|
|
199
|
+
raise MetaflowException(
|
|
200
|
+
f"Failed to start {self.attributes['app_name']}! Cause: {str(e)}"
|
|
201
|
+
) from e
|
|
@@ -0,0 +1,243 @@
|
|
|
1
|
+
import argparse
|
|
2
|
+
import os
|
|
3
|
+
import configparser
|
|
4
|
+
import tempfile
|
|
5
|
+
import sys
|
|
6
|
+
import subprocess
|
|
7
|
+
from pathlib import Path
|
|
8
|
+
import shutil
|
|
9
|
+
from enum import Enum
|
|
10
|
+
import time
|
|
11
|
+
from .consts import BASE_DIR_FOR_APP_ASSETS, APP_DAEMON_WORKSTAION_PATH
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
class SupervisorClientException(Exception):
|
|
15
|
+
pass
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
class SupervisorClient:
|
|
19
|
+
"""
|
|
20
|
+
A client for starting and stopping apps using supervisor.
|
|
21
|
+
"""
|
|
22
|
+
|
|
23
|
+
def __init__(self, wait_time_seconds_for_app_start: int):
|
|
24
|
+
self.supervisor_conf_loc = os.environ.get("SUPERVISOR_CONF_PATH")
|
|
25
|
+
|
|
26
|
+
self.wait_time_seconds_for_app_start = wait_time_seconds_for_app_start
|
|
27
|
+
if self.supervisor_conf_loc is None or not os.path.exists(
|
|
28
|
+
self.supervisor_conf_loc
|
|
29
|
+
):
|
|
30
|
+
raise SupervisorClientException(
|
|
31
|
+
"This workstation does not support deploying apps! Please reach out to Outerbounds for support."
|
|
32
|
+
)
|
|
33
|
+
|
|
34
|
+
self.metaflow_envs_persistent_path = os.environ.get(
|
|
35
|
+
"SUPERVISOR_PYTHON_ENVS_PATH"
|
|
36
|
+
)
|
|
37
|
+
if self.metaflow_envs_persistent_path is None:
|
|
38
|
+
raise SupervisorClientException(
|
|
39
|
+
"This workstation does not support deploying apps! Please reach out to Outerbounds for support."
|
|
40
|
+
)
|
|
41
|
+
|
|
42
|
+
# Check if supervisorctl is installed
|
|
43
|
+
if not shutil.which("supervisorctl"):
|
|
44
|
+
raise SupervisorClientException(
|
|
45
|
+
"This workstation does not support deploying apps! Please reach out to Outerbounds for support."
|
|
46
|
+
)
|
|
47
|
+
|
|
48
|
+
def _stop_existing_app_at_port(self, app_port):
|
|
49
|
+
supervisor_config = configparser.ConfigParser()
|
|
50
|
+
supervisor_config.read(self.supervisor_conf_loc)
|
|
51
|
+
|
|
52
|
+
for program in supervisor_config.sections():
|
|
53
|
+
if "obp_app_port" in supervisor_config[program]:
|
|
54
|
+
if supervisor_config[program]["obp_app_port"].strip() == str(app_port):
|
|
55
|
+
res = subprocess.run(
|
|
56
|
+
["supervisorctl", "stop", program],
|
|
57
|
+
stdout=subprocess.DEVNULL,
|
|
58
|
+
stderr=subprocess.DEVNULL,
|
|
59
|
+
)
|
|
60
|
+
|
|
61
|
+
del supervisor_config[program]
|
|
62
|
+
|
|
63
|
+
with tempfile.NamedTemporaryFile(
|
|
64
|
+
"w", dir=os.path.dirname(self.supervisor_conf_loc), delete=False
|
|
65
|
+
) as f:
|
|
66
|
+
supervisor_config.write(f)
|
|
67
|
+
tmp_file = f.name
|
|
68
|
+
|
|
69
|
+
os.rename(tmp_file, self.supervisor_conf_loc)
|
|
70
|
+
|
|
71
|
+
def _create_supervisor_conf_entry(
|
|
72
|
+
self, command, launch_directory, app_port, app_name
|
|
73
|
+
):
|
|
74
|
+
entry = {
|
|
75
|
+
"command": command,
|
|
76
|
+
"directory": launch_directory,
|
|
77
|
+
"autostart": "true",
|
|
78
|
+
"autorestart": "true",
|
|
79
|
+
"obp_app_port": app_port, # Record the app port for internal reference. This is not used by supervisor.
|
|
80
|
+
}
|
|
81
|
+
|
|
82
|
+
supervisor_config = configparser.ConfigParser()
|
|
83
|
+
supervisor_config.read(self.supervisor_conf_loc)
|
|
84
|
+
|
|
85
|
+
supervisor_config[f"program:{app_name}"] = entry
|
|
86
|
+
|
|
87
|
+
with tempfile.NamedTemporaryFile(
|
|
88
|
+
"w", dir=os.path.dirname(self.supervisor_conf_loc), delete=False
|
|
89
|
+
) as f:
|
|
90
|
+
supervisor_config.write(f)
|
|
91
|
+
tmp_file = f.name
|
|
92
|
+
|
|
93
|
+
os.rename(tmp_file, self.supervisor_conf_loc)
|
|
94
|
+
|
|
95
|
+
def start_process_with_supervisord(
|
|
96
|
+
self,
|
|
97
|
+
app_name,
|
|
98
|
+
app_port,
|
|
99
|
+
user_provided_entrypoint,
|
|
100
|
+
deploy_dir=None,
|
|
101
|
+
app_dir=None,
|
|
102
|
+
):
|
|
103
|
+
"""
|
|
104
|
+
Starts the app using supervisor.
|
|
105
|
+
|
|
106
|
+
Args:
|
|
107
|
+
app_name: The name of the app to start.
|
|
108
|
+
app_port: The port to start the app on.
|
|
109
|
+
user_provided_entrypoint: The entrypoint to start the app with.
|
|
110
|
+
deploy_dir: The directory to copy the app to and deploy from.
|
|
111
|
+
app_dir: The directory to copy the app from.
|
|
112
|
+
"""
|
|
113
|
+
|
|
114
|
+
entrypoint = user_provided_entrypoint
|
|
115
|
+
deploy_dir_for_port = os.path.join(BASE_DIR_FOR_APP_ASSETS, str(app_port))
|
|
116
|
+
launch_directory = (
|
|
117
|
+
BASE_DIR_FOR_APP_ASSETS
|
|
118
|
+
if entrypoint is None
|
|
119
|
+
else APP_DAEMON_WORKSTAION_PATH
|
|
120
|
+
)
|
|
121
|
+
|
|
122
|
+
# Stop any existing apps that are running on the same port.
|
|
123
|
+
self._stop_existing_app_at_port(app_port)
|
|
124
|
+
|
|
125
|
+
# This means the user has opted for either case 2 or case 3.
|
|
126
|
+
# Cases 2 and 3 are handled the same way, the only thing that differs is self.app_dir.
|
|
127
|
+
if user_provided_entrypoint is None:
|
|
128
|
+
# Copy the app_dir to the deploy_dir.
|
|
129
|
+
# This is also where all (if any) artifacts are written.
|
|
130
|
+
recursive_copy(app_dir, deploy_dir)
|
|
131
|
+
|
|
132
|
+
# Copy the entire deploy_dir to the port specific directory.
|
|
133
|
+
# Clear out anything that was there before (maybe a different app's assets)
|
|
134
|
+
if os.path.exists(deploy_dir_for_port):
|
|
135
|
+
shutil.rmtree(deploy_dir_for_port)
|
|
136
|
+
|
|
137
|
+
os.makedirs(deploy_dir_for_port)
|
|
138
|
+
recursive_copy(deploy_dir, deploy_dir_for_port)
|
|
139
|
+
|
|
140
|
+
# Apply default value
|
|
141
|
+
# We launch the module from BASE_DIR_FOR_APP_ASSETS, so when we provide the -m flag we don't need (and can't use) the full path.
|
|
142
|
+
# We just need to provide the port number (which is also the name of the folder where all app assets are stored)
|
|
143
|
+
entrypoint = f"-m {str(app_port)}"
|
|
144
|
+
|
|
145
|
+
# deploy_dir is meant to be temporary. No need to keep it around after everything has been copied over.
|
|
146
|
+
shutil.rmtree(deploy_dir)
|
|
147
|
+
|
|
148
|
+
# Metaflow by default generates the environment in /root/... (which is not persisted on workstations).
|
|
149
|
+
# Since the environment is fully self contained, we can copy it to a persistent location.
|
|
150
|
+
persistent_path_for_executable = (
|
|
151
|
+
self._persist_metaflow_generated_python_environment()
|
|
152
|
+
)
|
|
153
|
+
|
|
154
|
+
# This is the command used by supervisord to launch the app.
|
|
155
|
+
command = f"{persistent_path_for_executable} {entrypoint}"
|
|
156
|
+
|
|
157
|
+
self._create_supervisor_conf_entry(
|
|
158
|
+
command, launch_directory, app_port, app_name
|
|
159
|
+
)
|
|
160
|
+
|
|
161
|
+
# Execute supervisorctl reload
|
|
162
|
+
# Capture the exit code
|
|
163
|
+
exit_code = subprocess.run(
|
|
164
|
+
["supervisorctl", "reload"],
|
|
165
|
+
stdout=subprocess.DEVNULL,
|
|
166
|
+
stderr=subprocess.DEVNULL,
|
|
167
|
+
).returncode
|
|
168
|
+
if exit_code != 0:
|
|
169
|
+
raise SupervisorClientException(
|
|
170
|
+
"Failed to start app! Contact Outerbounds for support."
|
|
171
|
+
)
|
|
172
|
+
|
|
173
|
+
print(
|
|
174
|
+
f"Waiting for {self.wait_time_seconds_for_app_start} seconds for {app_name} to start..."
|
|
175
|
+
)
|
|
176
|
+
time.sleep(self.wait_time_seconds_for_app_start)
|
|
177
|
+
|
|
178
|
+
self._raise_on_bad_status(app_name, command)
|
|
179
|
+
|
|
180
|
+
def _get_launched_prcoess_status(self, app_name, debug_command):
|
|
181
|
+
"""
|
|
182
|
+
Checks the status of the launched process. If the status is not RUNNING or STARTING, it raises an exception.
|
|
183
|
+
Possible statuses: RUNNING, STARTING, STOPPED, BACKOFF, STOPPING, EXITED, FATAL, UNKNOWN
|
|
184
|
+
|
|
185
|
+
Args:
|
|
186
|
+
app_name: The name of the app to check the status of.
|
|
187
|
+
debug_command: The command to run to debug the app.
|
|
188
|
+
"""
|
|
189
|
+
status_cmd_output = subprocess.run(
|
|
190
|
+
["supervisorctl", "status", app_name],
|
|
191
|
+
stdout=subprocess.PIPE,
|
|
192
|
+
stderr=subprocess.PIPE,
|
|
193
|
+
).stdout.decode("utf-8")
|
|
194
|
+
|
|
195
|
+
status_cmd_output_parts = [
|
|
196
|
+
x.strip() for x in status_cmd_output.split(" ") if x.strip()
|
|
197
|
+
]
|
|
198
|
+
|
|
199
|
+
status_str = status_cmd_output_parts[1]
|
|
200
|
+
|
|
201
|
+
if not status_str == "RUNNING" and not status_str == "STARTING":
|
|
202
|
+
raise SupervisorClientException(
|
|
203
|
+
f"Failed to start {app_name}! Try running {debug_command} manually to debug."
|
|
204
|
+
)
|
|
205
|
+
|
|
206
|
+
def _persist_metaflow_generated_python_environment(self):
|
|
207
|
+
"""
|
|
208
|
+
Persists the metaflow generated python environment to a persistent location.
|
|
209
|
+
The step already runs in the environment generated by Metaflow.
|
|
210
|
+
"""
|
|
211
|
+
current_executable = sys.executable
|
|
212
|
+
environment_path = Path(current_executable).parent.parent
|
|
213
|
+
|
|
214
|
+
persistent_path_for_this_environment = os.path.join(
|
|
215
|
+
self.metaflow_envs_persistent_path,
|
|
216
|
+
environment_path.parent.name,
|
|
217
|
+
environment_path.name,
|
|
218
|
+
)
|
|
219
|
+
|
|
220
|
+
final_executable_path = os.path.join(
|
|
221
|
+
persistent_path_for_this_environment,
|
|
222
|
+
Path(current_executable).parent.name,
|
|
223
|
+
Path(current_executable).name,
|
|
224
|
+
)
|
|
225
|
+
|
|
226
|
+
if os.path.exists(final_executable_path):
|
|
227
|
+
return final_executable_path
|
|
228
|
+
|
|
229
|
+
os.makedirs(persistent_path_for_this_environment, exist_ok=True)
|
|
230
|
+
|
|
231
|
+
recursive_copy(environment_path, persistent_path_for_this_environment)
|
|
232
|
+
|
|
233
|
+
return final_executable_path
|
|
234
|
+
|
|
235
|
+
|
|
236
|
+
def recursive_copy(src, dst):
|
|
237
|
+
for item in os.listdir(src):
|
|
238
|
+
s = os.path.join(src, item)
|
|
239
|
+
d = os.path.join(dst, item)
|
|
240
|
+
if os.path.isdir(s):
|
|
241
|
+
shutil.copytree(s, d, dirs_exist_ok=True)
|
|
242
|
+
else:
|
|
243
|
+
shutil.copy2(s, d)
|
|
@@ -1,6 +1,10 @@
|
|
|
1
1
|
from urllib.parse import urlparse
|
|
2
2
|
|
|
3
3
|
import requests
|
|
4
|
+
import time
|
|
5
|
+
import random
|
|
6
|
+
import os
|
|
7
|
+
import sys
|
|
4
8
|
|
|
5
9
|
|
|
6
10
|
def read_mf_config():
|
|
@@ -35,12 +39,28 @@ def get_token_url_and_headers(url_path):
|
|
|
35
39
|
def get_token(url_path):
|
|
36
40
|
from metaflow.exception import MetaflowException
|
|
37
41
|
|
|
38
|
-
|
|
39
|
-
|
|
40
|
-
|
|
41
|
-
|
|
42
|
-
|
|
43
|
-
|
|
42
|
+
retry_delay = 1
|
|
43
|
+
max_retries = os.environ.get("METAFLOW_EXT_HTTP_MAX_RETRIES", "10")
|
|
44
|
+
attempt = 0
|
|
45
|
+
while attempt < int(max_retries):
|
|
46
|
+
attempt = attempt + 1
|
|
47
|
+
try:
|
|
48
|
+
url, headers = get_token_url_and_headers(url_path)
|
|
49
|
+
r = requests.get(url, headers=headers)
|
|
50
|
+
r.raise_for_status()
|
|
51
|
+
token_info = r.json()
|
|
52
|
+
return token_info
|
|
53
|
+
except requests.exceptions.HTTPError as e:
|
|
54
|
+
raise MetaflowException(repr(e))
|
|
55
|
+
except (
|
|
56
|
+
requests.exceptions.ConnectionError,
|
|
57
|
+
requests.exceptions.ReadTimeout,
|
|
58
|
+
) as e:
|
|
59
|
+
# ConnectionErrors are generally temporary errors like DNS resolution failures,
|
|
60
|
+
# timeouts etc.
|
|
61
|
+
time.sleep(retry_delay)
|
|
62
|
+
retry_delay *= 2 # Double the delay for the next attempt
|
|
63
|
+
retry_delay += random.uniform(0, 1) # Add jitter
|
|
64
|
+
retry_delay = min(retry_delay, 10)
|
|
44
65
|
|
|
45
|
-
|
|
46
|
-
raise MetaflowException(repr(e))
|
|
66
|
+
raise MetaflowException("error connecting to Outerbounds")
|
|
@@ -0,0 +1,118 @@
|
|
|
1
|
+
from metaflow.user_decorators.mutable_flow import MutableFlow
|
|
2
|
+
from metaflow.user_decorators.mutable_step import MutableStep
|
|
3
|
+
from metaflow.user_decorators.user_flow_decorator import FlowMutator
|
|
4
|
+
from .assume_role import OBP_ASSUME_ROLE_ARN_ENV_VAR
|
|
5
|
+
|
|
6
|
+
|
|
7
|
+
class assume_role(FlowMutator):
|
|
8
|
+
"""
|
|
9
|
+
Flow-level decorator for assuming AWS IAM roles.
|
|
10
|
+
|
|
11
|
+
When applied to a flow, all steps in the flow will automatically use the specified IAM role-arn
|
|
12
|
+
as their source principal.
|
|
13
|
+
|
|
14
|
+
Usage:
|
|
15
|
+
------
|
|
16
|
+
@assume_role(role_arn="arn:aws:iam::123456789012:role/my-iam-role")
|
|
17
|
+
class MyFlow(FlowSpec):
|
|
18
|
+
@step
|
|
19
|
+
def start(self):
|
|
20
|
+
import boto3
|
|
21
|
+
client = boto3.client("dynamodb") # Automatically uses the role in the flow decorator
|
|
22
|
+
self.next(self.end)
|
|
23
|
+
|
|
24
|
+
@step
|
|
25
|
+
def end(self):
|
|
26
|
+
from metaflow import get_aws_client
|
|
27
|
+
client = get_aws_client("dynamodb") # Automatically uses the role in the flow decorator
|
|
28
|
+
|
|
29
|
+
You can also filter which steps should use the role:
|
|
30
|
+
@assume_role(role_arn="arn:aws:iam::123456789012:role/my-iam-role", steps=["start", "process"])
|
|
31
|
+
class MyFlow(FlowSpec):
|
|
32
|
+
@step
|
|
33
|
+
def start(self):
|
|
34
|
+
# user code in this step will use the assumed role
|
|
35
|
+
pass
|
|
36
|
+
|
|
37
|
+
@step
|
|
38
|
+
def process(self):
|
|
39
|
+
# user code in this step will use the assumed role
|
|
40
|
+
pass
|
|
41
|
+
|
|
42
|
+
@step
|
|
43
|
+
def end(self):
|
|
44
|
+
# user code in this step will NOT use the assumed role
|
|
45
|
+
pass
|
|
46
|
+
"""
|
|
47
|
+
|
|
48
|
+
def init(self, *args, **kwargs):
|
|
49
|
+
self.role_arn = kwargs.get("role_arn", None)
|
|
50
|
+
self.steps = kwargs.get("steps", None)
|
|
51
|
+
|
|
52
|
+
if self.role_arn is None:
|
|
53
|
+
raise ValueError(
|
|
54
|
+
"`role_arn` keyword argument is required for the assume_role decorator"
|
|
55
|
+
)
|
|
56
|
+
|
|
57
|
+
if not self.role_arn.startswith("arn:aws:iam::"):
|
|
58
|
+
raise ValueError(
|
|
59
|
+
"`role_arn` must be a valid AWS IAM role ARN starting with 'arn:aws:iam::'"
|
|
60
|
+
)
|
|
61
|
+
|
|
62
|
+
# Validate steps parameter
|
|
63
|
+
if self.steps is not None:
|
|
64
|
+
if not isinstance(self.steps, (list, tuple)):
|
|
65
|
+
raise ValueError("`steps` must be a list or tuple of step names")
|
|
66
|
+
if not all(isinstance(s, str) for s in self.steps):
|
|
67
|
+
raise ValueError("All step names in `steps` must be strings")
|
|
68
|
+
|
|
69
|
+
def pre_mutate(self, mutable_flow: MutableFlow) -> None:
|
|
70
|
+
"""
|
|
71
|
+
This method is called by Metaflow to apply the decorator to the flow.
|
|
72
|
+
It sets up environment variables that will be used by the AWS client
|
|
73
|
+
to automatically assume the specified role.
|
|
74
|
+
"""
|
|
75
|
+
# Import environment decorator at runtime to avoid circular imports
|
|
76
|
+
from metaflow import environment
|
|
77
|
+
|
|
78
|
+
# Validate that all specified steps exist in the flow
|
|
79
|
+
if self.steps is not None:
|
|
80
|
+
flow_step_names = {step_name for step_name, _ in mutable_flow.steps}
|
|
81
|
+
specified_steps = set(self.steps)
|
|
82
|
+
missing_steps = specified_steps - flow_step_names
|
|
83
|
+
|
|
84
|
+
if missing_steps:
|
|
85
|
+
raise ValueError(
|
|
86
|
+
f"Step(s) {sorted(missing_steps)} specified in `steps` parameter "
|
|
87
|
+
f"do not exist in the flow. Available steps: {sorted(flow_step_names)}"
|
|
88
|
+
)
|
|
89
|
+
|
|
90
|
+
def _swap_environment_variables(step: MutableStep, role_arn: str) -> None:
|
|
91
|
+
_step_has_env_set = True
|
|
92
|
+
_env_kwargs = {OBP_ASSUME_ROLE_ARN_ENV_VAR: role_arn}
|
|
93
|
+
for d in step.decorator_specs:
|
|
94
|
+
name, _, _, deco_kwargs = d
|
|
95
|
+
if name == "environment":
|
|
96
|
+
_env_kwargs.update(deco_kwargs["vars"])
|
|
97
|
+
_step_has_env_set = True
|
|
98
|
+
|
|
99
|
+
if _step_has_env_set:
|
|
100
|
+
# remove the environment decorator
|
|
101
|
+
step.remove_decorator("environment")
|
|
102
|
+
|
|
103
|
+
# add the environment decorator
|
|
104
|
+
step.add_decorator(
|
|
105
|
+
environment,
|
|
106
|
+
deco_kwargs=dict(vars=_env_kwargs),
|
|
107
|
+
)
|
|
108
|
+
|
|
109
|
+
# Set the role ARN as an environment variable that will be picked up
|
|
110
|
+
# by the get_aws_client function
|
|
111
|
+
def _setup_role_assumption(step: MutableStep) -> None:
|
|
112
|
+
_swap_environment_variables(step, self.role_arn)
|
|
113
|
+
|
|
114
|
+
# Apply the role assumption setup to all steps in the flow (or filtered steps)
|
|
115
|
+
for step_name, step in mutable_flow.steps:
|
|
116
|
+
# If steps filter is specified, only apply to those steps
|
|
117
|
+
if self.steps is None or step_name in self.steps:
|
|
118
|
+
_setup_role_assumption(step)
|
|
File without changes
|