ob-metaflow-extensions 1.1.123rc0__py2.py3-none-any.whl → 1.1.123rc2__py2.py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of ob-metaflow-extensions might be problematic. Click here for more details.

@@ -4,8 +4,13 @@ from metaflow import current
4
4
  from .app_utils import start_app
5
5
  from .supervisord_utils import SupervisorClient, SupervisorClientException
6
6
  import os
7
+ import random
8
+ import string
9
+ import tempfile
10
+ import sys
7
11
 
8
12
  DEFAULT_WAIT_TIME_SECONDS_FOR_PROCESS_TO_START = 10
13
+ BASE_DIR_FOR_APP_ASSETS = "/home/ob-workspace/.appdaemon/apps/"
9
14
 
10
15
 
11
16
  class WorkstationAppDeployDecorator(StepDecorator):
@@ -31,6 +36,19 @@ class WorkstationAppDeployDecorator(StepDecorator):
31
36
  "Nvidia. Please use one or the other.".format(step=step)
32
37
  )
33
38
 
39
+ # We always need to have some environment defined through the flow to deploy an app.
40
+ # Which means either step decorators like @pypi / @conda must be defined.
41
+ # or flow level decorators like @conda_base / @pypi_base.
42
+ if not any([deco.name == "pypi" or deco.name == "conda" for deco in decos]):
43
+ flow_decorators = flow._flow_decorators.keys()
44
+ if (
45
+ "conda_base" not in flow_decorators
46
+ and "pypi_base" not in flow_decorators
47
+ ):
48
+ raise MetaflowException(
49
+ "@app_deploy requires either step decorators like @pypi / @conda or flow level decorators like @conda_base / @pypi_base to be defined."
50
+ )
51
+
34
52
  app_port = self.attributes["app_port"]
35
53
  app_name = self.attributes["app_name"]
36
54
 
@@ -43,29 +61,74 @@ class WorkstationAppDeployDecorator(StepDecorator):
43
61
  if app_name is None:
44
62
  raise MetaflowException("AppDeployDecorator requires app_name to be set.")
45
63
 
64
def task_pre_step(
    self,
    step_name,
    task_datastore,
    metadata,
    run_id,
    task_id,
    flow,
    graph,
    retry_count,
    max_user_code_retries,
    ubf_context,
    inputs,
):
    """Create a staging directory for the app and expose it on the flow.

    A fresh directory is created under ``BASE_DIR_FOR_APP_ASSETS`` before the
    user's step code runs. Its path is published as ``flow.deploy_dir`` so the
    step can write app artifacts into it, and it is also stored on the
    decorator as ``self._deploy_dir``.

    Only ``flow`` is used; the remaining parameters are accepted and ignored.
    """
    os.makedirs(BASE_DIR_FOR_APP_ASSETS, exist_ok=True)

    # First we want to create a directory where the user's app directory and
    # artifacts can be stored. mkdtemp() never removes what it creates, which
    # is the intent here; TemporaryDirectory(delete=False) would express the
    # same thing but its `delete` argument only exists on Python 3.12+.
    launch_temp_dir = tempfile.mkdtemp(dir=BASE_DIR_FOR_APP_ASSETS)

    # Expose this to the user, so that they can use it to write their artifacts.
    setattr(flow, "deploy_dir", launch_temp_dir)

    # Make sure to record deploy_dir so that the user cannot accidentally override it.
    self._deploy_dir = launch_temp_dir
90
+
46
91
  def task_post_step(
47
92
  self, step_name, flow, graph, retry_count, max_user_code_retries
48
93
  ):
49
- entrypoint = getattr(flow, "entrypoint", None)
94
+ deploy_dir = self._deploy_dir
95
+
96
+ # By default we assume that the user has a __main__.py file in their app directory.
97
+ # They can always override this behavior.
98
+ user_provided_entrypoint = getattr(flow, "entrypoint", None)
99
+
100
+ if user_provided_entrypoint is not None and not isinstance(
101
+ user_provided_entrypoint, str
102
+ ):
103
+ raise MetaflowException(
104
+ f"@app_deploy requires entrypoint to be set to a string. The current value of entrypoint {user_provided_entrypoint} is not valid."
105
+ )
106
+
107
+ flow_directory = os.path.dirname(os.path.abspath(sys.argv[0]))
108
+
109
+ # By default, we assume that the layout of the flow directory is:
110
+ # flow_dir/
111
+ # - deployer_flow.py
112
+ # - my_custom_app/
113
+ # - __main__.py
114
+ # - other_files
115
+ # - other_dirs/
116
+ # This can be overridden by the user by setting the app_dir attribute.
117
+ # None of this matters if the user provides a custom entrypoint, since in that case we don't copy
118
+ # anything anywhere.
119
+ app_location = getattr(
120
+ flow, "app_dir", os.path.join(flow_directory, self.attributes["app_name"])
121
+ )
122
+
123
+ if user_provided_entrypoint is None and not os.path.exists(app_location):
124
+ raise MetaflowException(f"App directory {app_location} does not exist.")
125
+
50
126
  wait_time_for_app_start = getattr(
51
127
  flow,
52
128
  "wait_time_for_app_start",
53
129
  DEFAULT_WAIT_TIME_SECONDS_FOR_PROCESS_TO_START,
54
130
  )
55
131
 
56
- if entrypoint is None or not isinstance(entrypoint, str):
57
- raise MetaflowException(
58
- f"@app_deploy requires entrypoint to be set to a string. The current value of entrypoint {entrypoint} is not valid."
59
- )
60
-
61
- launch_dir = getattr(flow, "launch_dir", None)
62
- if launch_dir is None or launch_dir == "":
63
- raise MetaflowException("@app_deploy requires launch_dir to be set.")
64
- elif not isinstance(launch_dir, str) or not os.path.exists(launch_dir):
65
- raise MetaflowException(
66
- f"@app_deploy requires launch_dir to be set to a valid directory. The current value of launch_dir {launch_dir} is not valid."
67
- )
68
-
69
132
  try:
70
133
  supervisor_client = SupervisorClient(
71
134
  wait_time_seconds_for_app_start=wait_time_for_app_start
@@ -79,8 +142,10 @@ class WorkstationAppDeployDecorator(StepDecorator):
79
142
  # Now, let's add the app to supervisor.
80
143
  supervisor_client.start_process_with_supervisord(
81
144
  self.attributes["app_name"],
82
- entrypoint,
83
- launch_dir,
145
+ self.attributes["app_port"],
146
+ user_provided_entrypoint,
147
+ deploy_dir,
148
+ app_location,
84
149
  )
85
150
  except SupervisorClientException as e:
86
151
  raise MetaflowException(str(e))
@@ -50,8 +50,74 @@ class SupervisorClient:
50
50
  "This workstation does not support deploying apps! Please reach out to Outerbounds for support."
51
51
  )
52
52
 
53
- def start_process_with_supervisord(self, app_name, entrypoint, directory=None):
54
- """Add a new program entry to supervisor configuration."""
53
def _stop_existing_app_at_port(self, app_port):
    """Stop and unregister every supervisor program recorded on ``app_port``.

    The supervisor config at ``self.supervisor_conf_loc`` is parsed, and each
    section whose ``obp_app_port`` option equals ``str(app_port)`` is stopped
    through ``supervisorctl stop`` and removed from the parsed config. The
    config is then written to a temporary file in the same directory and
    renamed over the original, whether or not any section matched.

    The outcome of ``supervisorctl stop`` is not inspected and its output is
    discarded.
    """
    config = configparser.ConfigParser()
    config.read(self.supervisor_conf_loc)

    wanted_port = str(app_port)
    # Matching is decided per section, so the sections to drop can be
    # collected up front and processed afterwards.
    programs_on_port = [
        section
        for section in config.sections()
        if "obp_app_port" in config[section]
        and config[section]["obp_app_port"].strip() == wanted_port
    ]

    for section in programs_on_port:
        subprocess.run(
            ["supervisorctl", "stop", section],
            stdout=subprocess.DEVNULL,
            stderr=subprocess.DEVNULL,
        )
        del config[section]

    # Write next to the real config so the rename stays on one filesystem.
    conf_dir = os.path.dirname(self.supervisor_conf_loc)
    with tempfile.NamedTemporaryFile("w", dir=conf_dir, delete=False) as handle:
        config.write(handle)
        staged_path = handle.name

    os.rename(staged_path, self.supervisor_conf_loc)
75
+
76
+ def start_process_with_supervisord(
77
+ self,
78
+ app_name,
79
+ app_port,
80
+ user_provided_entrypoint,
81
+ deploy_dir=None,
82
+ app_dir=None,
83
+ ):
84
+ """
85
+ Add a new program entry to supervisor configuration.
86
+
87
+ Args:
88
+ app_name: The name of the app to start.
89
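+ app_port: The port of the app; any existing app recorded on this port is stopped first.
90
+ user_provided_entrypoint: The entrypoint to start the app with, or None to copy app_dir and use the default entrypoint.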
91
+ deploy_dir: The directory to copy the app to and deploy from.
92
+ app_dir: The directory to copy the app from.
93
+ """
94
+
95
+ entrypoint = user_provided_entrypoint
96
+ deploy_dir_for_port = "/home/ob-workspace/.appdaemon/apps/6000"
97
+ launch_directory = (
98
+ "/home/ob-workspace/.appdaemon/apps"
99
+ if entrypoint is None
100
+ else "/home/ob-workspace/.appdaemon"
101
+ )
102
+
103
+ # Step 1: Stop any existing apps that are running on the same port.
104
+ self._stop_existing_app_at_port(app_port)
105
+
106
+ if user_provided_entrypoint is None:
107
+ # Step 2: Copy the app_dir to the deploy_dir.
108
+ recursive_copy(app_dir, deploy_dir)
109
+
110
+ # Step 3: Copy the entire deploy_dir to the port specific directory.
111
+ if os.path.exists(deploy_dir_for_port):
112
+ shutil.rmtree(deploy_dir_for_port)
113
+
114
+ os.makedirs(deploy_dir_for_port)
115
+ recursive_copy(deploy_dir, deploy_dir_for_port)
116
+
117
+ # Apply default value
118
+ entrypoint = f"-m {str(app_port)}"
119
+
120
+ shutil.rmtree(deploy_dir)
55
121
 
56
122
  persistent_path_for_executable = (
57
123
  self.persist_metaflow_generated_python_environment()
@@ -61,9 +127,10 @@ class SupervisorClient:
61
127
 
62
128
  entry = {
63
129
  "command": command,
64
- "directory": directory or os.getcwd(),
130
+ "directory": launch_directory,
65
131
  "autostart": "true",
66
132
  "autorestart": "true",
133
+ "obp_app_port": app_port, # Record the app port for internal reference. This is not used by supervisor.
67
134
  }
68
135
 
69
136
  supervisor_config = configparser.ConfigParser()
@@ -76,6 +143,7 @@ class SupervisorClient:
76
143
  ) as f:
77
144
  supervisor_config.write(f)
78
145
  tmp_file = f.name
146
+
79
147
  os.rename(tmp_file, self.supervisor_conf_loc)
80
148
 
81
149
  # Execute supervisorctl reload
@@ -156,12 +224,16 @@ class SupervisorClient:
156
224
 
157
225
  os.makedirs(persistent_path_for_this_environment, exist_ok=True)
158
226
 
159
- for item in os.listdir(environment_path):
160
- src_path = os.path.join(environment_path, item)
161
- dst_path = os.path.join(persistent_path_for_this_environment, item)
162
- if os.path.isdir(src_path):
163
- shutil.copytree(src_path, dst_path, dirs_exist_ok=True)
164
- else:
165
- shutil.copy2(src_path, dst_path)
227
+ recursive_copy(environment_path, persistent_path_for_this_environment)
166
228
 
167
229
  return final_executable_path
230
+
231
+
232
def recursive_copy(src, dst):
    """Copy the contents of directory ``src`` into directory ``dst``.

    Each top-level entry of ``src`` is copied to the same name under ``dst``:
    directories with ``shutil.copytree`` (merging into an existing directory),
    everything else with ``shutil.copy2``. Existing files in ``dst`` with the
    same name are overwritten.

    Args:
        src: Path of the directory whose contents are copied.
        dst: Path of the destination directory; created if it does not exist.
    """
    # copy2() does not create parent directories, so a missing destination
    # would fail as soon as src contains a top-level file.
    os.makedirs(dst, exist_ok=True)

    for item in os.listdir(src):
        s = os.path.join(src, item)
        d = os.path.join(dst, item)
        if os.path.isdir(s):
            shutil.copytree(s, d, dirs_exist_ok=True)
        else:
            shutil.copy2(s, d)
@@ -1,8 +1,9 @@
1
1
  import json
2
2
  import os
3
+ import time
3
4
  import threading
4
- from urllib.parse import urlparse
5
5
  from urllib.request import HTTPError, Request, URLError, urlopen
6
+ from functools import wraps
6
7
 
7
8
  from metaflow import util
8
9
  from metaflow.mflog import (
@@ -12,7 +13,6 @@ from metaflow.mflog import (
12
13
  tail_logs,
13
14
  get_log_tailer,
14
15
  )
15
- import requests
16
16
  from .exceptions import NvcfJobFailedException, NvcfPollingConnectionError
17
17
 
18
18
  # Redirect structured logs to $PWD/.logs/
@@ -23,6 +23,35 @@ STDOUT_PATH = os.path.join(LOGS_DIR, STDOUT_FILE)
23
23
  STDERR_PATH = os.path.join(LOGS_DIR, STDERR_FILE)
24
24
 
25
25
 
26
RETRIABLE_STATUS_CODES = [500]


def retry_on_status(status_codes=RETRIABLE_STATUS_CODES, max_retries=3, delay=1):
    """Build a decorator that retries a call when it raises a retriable HTTPError.

    The wrapped callable is invoked up to ``max_retries + 1`` times. When it
    raises ``HTTPError`` with a ``code`` listed in ``status_codes`` and retries
    remain, a message is printed, the wrapper sleeps ``delay`` seconds and
    tries again. Any other exception, an ``HTTPError`` with a different code,
    or a failure on the last attempt propagates to the caller unchanged.

    Args:
        status_codes: HTTP status codes that should trigger a retry.
        max_retries: Number of retries allowed after the first attempt.
        delay: Seconds to sleep between attempts.
    """

    def decorator(func):
        @wraps(func)
        def wrapper(*args, **kwargs):
            # Every pass through this loop is an attempt that may still be
            # followed by another one.
            for attempt in range(1, max_retries + 1):
                try:
                    return func(*args, **kwargs)
                except HTTPError as err:
                    if err.code not in status_codes:
                        raise
                    print(
                        f"[@nvidia] Received {err.code} error, retrying ({attempt}/{max_retries})..."
                    )
                    time.sleep(delay)
            # Last attempt: nothing is left to retry, so errors propagate as-is.
            return func(*args, **kwargs)

        return wrapper

    return decorator
53
+
54
+
26
55
  class Nvcf(object):
27
56
  def __init__(self, metadata, datastore, environment, function_id, ngc_api_key):
28
57
  self.metadata = metadata
@@ -220,7 +249,11 @@ class Job(object):
220
249
  @property
221
250
  def status(self):
222
251
  if self._status not in [JobStatus.SUCCESSFUL, JobStatus.FAILED]:
223
- self._poll()
252
+ try:
253
+ self._poll()
254
+ except (HTTPError, URLError) as e:
255
+ self._status = JobStatus.FAILED
256
+ raise NvcfPollingConnectionError(e)
224
257
  return self._status
225
258
 
226
259
  @property
@@ -239,6 +272,7 @@ class Job(object):
239
272
  def result(self):
240
273
  return self._result
241
274
 
275
+ @retry_on_status(status_codes=RETRIABLE_STATUS_CODES, max_retries=3, delay=5)
242
276
  def _poll(self):
243
277
  try:
244
278
  headers = {
@@ -257,11 +291,16 @@ class Job(object):
257
291
  else:
258
292
  self._status = JobStatus.FAILED
259
293
  self._result = data
260
- elif response.getcode() in [400, 500]:
261
- self._status = JobStatus.FAILED
262
294
  elif response.getcode() != 202:
263
295
  print(
264
296
  f"[@nvidia] Unexpected response code: {response.getcode()}. Please notify an Outerbounds support engineer if this error persists."
265
297
  )
266
- except (HTTPError, URLError) as e:
267
- raise NvcfPollingConnectionError(e)
298
+ self._status = JobStatus.FAILED
299
+ # 4xx and 5xx responses go in 'except' block
300
+ except HTTPError as e:
301
+ if e.code not in RETRIABLE_STATUS_CODES:
302
+ self._status = JobStatus.FAILED
303
+ raise
304
+ except URLError:
305
+ self._status = JobStatus.FAILED
306
+ raise
@@ -1,13 +1,13 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: ob-metaflow-extensions
3
- Version: 1.1.123rc0
3
+ Version: 1.1.123rc2
4
4
  Summary: Outerbounds Platform Extensions for Metaflow
5
5
  Author: Outerbounds, Inc.
6
6
  License: Commercial
7
7
  Description-Content-Type: text/markdown
8
8
  Requires-Dist: boto3
9
9
  Requires-Dist: kubernetes
10
- Requires-Dist: ob-metaflow (==2.13.7.1)
10
+ Requires-Dist: ob-metaflow (==2.13.8.1)
11
11
 
12
12
  # Outerbounds platform package
13
13
 
@@ -6,8 +6,8 @@ metaflow_extensions/outerbounds/plugins/auth_server.py,sha256=_Q9_2EL0Xy77bCRphk
6
6
  metaflow_extensions/outerbounds/plugins/perimeters.py,sha256=QXh3SFP7GQbS-RAIxUOPbhPzQ7KDFVxZkTdKqFKgXjI,2697
7
7
  metaflow_extensions/outerbounds/plugins/apps/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
8
8
  metaflow_extensions/outerbounds/plugins/apps/app_utils.py,sha256=JrVKlbRx8-nSmI4cRrB7F8BQGDHleABIYZudK4P-XFE,7905
9
- metaflow_extensions/outerbounds/plugins/apps/deploy_decorator.py,sha256=goD50XtFOrTl8jsHazdYbu6owVzReOL_1YeukOgX75E,3439
10
- metaflow_extensions/outerbounds/plugins/apps/supervisord_utils.py,sha256=1ImWHiQ4GNIyognIg-IT-cpInS1-CKWCEy2h9upZufo,5975
9
+ metaflow_extensions/outerbounds/plugins/apps/deploy_decorator.py,sha256=oHCkcXHYIoCi9LujhBsJsktZM44Zkf4d_g4RHLsiW18,5858
10
+ metaflow_extensions/outerbounds/plugins/apps/supervisord_utils.py,sha256=QDM7s-iVKnnmE7fM8K-nFoLojQvL_cT8hUj1LF1JOBs,8372
11
11
  metaflow_extensions/outerbounds/plugins/fast_bakery/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
12
12
  metaflow_extensions/outerbounds/plugins/fast_bakery/docker_environment.py,sha256=i6F3FXwvEhkmUCTHDJ4VmSoL6vKyQhC_YRCtY6F4EkA,14209
13
13
  metaflow_extensions/outerbounds/plugins/fast_bakery/fast_bakery.py,sha256=cj63FrdioggipQFP8GwgxU3FYe6IyzjGSUGYxLQZ4nQ,5189
@@ -21,7 +21,7 @@ metaflow_extensions/outerbounds/plugins/nvcf/__init__.py,sha256=47DEQpj8HBSa-_TI
21
21
  metaflow_extensions/outerbounds/plugins/nvcf/constants.py,sha256=aGHdNw_hqBu8i0zWXcatQM6e769wUXox0l8g0f6fNZ8,146
22
22
  metaflow_extensions/outerbounds/plugins/nvcf/exceptions.py,sha256=Wn5WvE_sY-L2jEz-iObMLii5Ds_HQJuE437ufadPFLk,3258
23
23
  metaflow_extensions/outerbounds/plugins/nvcf/heartbeat_store.py,sha256=pOWwm8LFQBbtku0zNBBwCyXxLK8U-hhC4naQcmU69nE,6217
24
- metaflow_extensions/outerbounds/plugins/nvcf/nvcf.py,sha256=zOIDbN4PhRM2VMHczfoAHUeo1df2UrqWMgTwcppsTwc,8990
24
+ metaflow_extensions/outerbounds/plugins/nvcf/nvcf.py,sha256=sfWaKZyKuM02v5DujPdfLbm-WoecxHfGn8g432Roct4,10273
25
25
  metaflow_extensions/outerbounds/plugins/nvcf/nvcf_cli.py,sha256=5pLEekiw3krlwpcjfjjfUL-URep6soZgmfTqtzLz4Vo,9362
26
26
  metaflow_extensions/outerbounds/plugins/nvcf/nvcf_decorator.py,sha256=yGv_6EmrBZNiQQP0rEWWE3akAL-KfI3Wd4ZFrcgl3VQ,8663
27
27
  metaflow_extensions/outerbounds/plugins/nvcf/utils.py,sha256=DxWSCayfa95e0HJkWacey1s1nxoTpaunGhrb_0Ayv28,133
@@ -48,7 +48,7 @@ metaflow_extensions/outerbounds/toplevel/plugins/azure/__init__.py,sha256=WUuhz2
48
48
  metaflow_extensions/outerbounds/toplevel/plugins/gcp/__init__.py,sha256=BbZiaH3uILlEZ6ntBLKeNyqn3If8nIXZFq_Apd7Dhco,70
49
49
  metaflow_extensions/outerbounds/toplevel/plugins/kubernetes/__init__.py,sha256=5zG8gShSj8m7rgF4xgWBZFuY3GDP5n1T0ktjRpGJLHA,69
50
50
  metaflow_extensions/outerbounds/toplevel/plugins/snowflake/__init__.py,sha256=LptpH-ziXHrednMYUjIaosS1SXD3sOtF_9_eRqd8SJw,50
51
- ob_metaflow_extensions-1.1.123rc0.dist-info/METADATA,sha256=jVJeDkUJvaWZ3B2Xkor72wjgNKwd1u3LThYzdotjQYs,523
52
- ob_metaflow_extensions-1.1.123rc0.dist-info/WHEEL,sha256=bb2Ot9scclHKMOLDEHY6B2sicWOgugjFKaJsT7vwMQo,110
53
- ob_metaflow_extensions-1.1.123rc0.dist-info/top_level.txt,sha256=NwG0ukwjygtanDETyp_BUdtYtqIA_lOjzFFh1TsnxvI,20
54
- ob_metaflow_extensions-1.1.123rc0.dist-info/RECORD,,
51
+ ob_metaflow_extensions-1.1.123rc2.dist-info/METADATA,sha256=JYsmhOhSHRHULTOGp9b7wA3UE-KKMg6b4x-9BMcfHGc,523
52
+ ob_metaflow_extensions-1.1.123rc2.dist-info/WHEEL,sha256=bb2Ot9scclHKMOLDEHY6B2sicWOgugjFKaJsT7vwMQo,110
53
+ ob_metaflow_extensions-1.1.123rc2.dist-info/top_level.txt,sha256=NwG0ukwjygtanDETyp_BUdtYtqIA_lOjzFFh1TsnxvI,20
54
+ ob_metaflow_extensions-1.1.123rc2.dist-info/RECORD,,