ob-metaflow-extensions 1.1.89__py2.py3-none-any.whl → 1.1.91__py2.py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of ob-metaflow-extensions might be problematic. Click here for more details.

@@ -116,40 +116,39 @@ def get_boto3_session(role_arn=None, session_vars=None):
116
116
  if token_info.get("region"):
117
117
  os.environ["AWS_DEFAULT_REGION"] = token_info["region"]
118
118
 
119
- with hide_access_keys():
120
- if cspr_role:
121
- # The generated AWS config will be used here since we set the
122
- # AWS_CONFIG_FILE environment variable above.
123
- if role_arn == USE_CSPR_ROLE_ARN_IF_SET:
124
- # Otherwise start from the default profile, assuming CSPR role
125
- session = boto3.session.Session(profile_name="cspr")
126
- else:
127
- session = boto3.session.Session(profile_name="task")
128
- else:
129
- # Not using AWS config, just AWS_WEB_IDENTITY_TOKEN_FILE + AWS_ROLE_ARN
130
- session = boto3.session.Session()
131
-
132
- if role_arn and role_arn != USE_CSPR_ROLE_ARN_IF_SET:
133
- # If the user provided a role_arn, we assume that role
134
- # using the task role credentials. CSPR role is not used.
135
- fetcher = botocore.credentials.AssumeRoleCredentialFetcher(
136
- client_creator=session._session.create_client,
137
- source_credentials=session._session.get_credentials(),
138
- role_arn=role_arn,
139
- extra_args={},
140
- )
141
- creds = botocore.credentials.DeferredRefreshableCredentials(
142
- method="assume-role", refresh_using=fetcher.fetch_credentials
143
- )
144
- botocore_session = botocore.session.Session(session_vars=session_vars)
145
- botocore_session._credentials = creds
146
- return boto3.session.Session(botocore_session=botocore_session)
119
+ if cspr_role:
120
+ # The generated AWS config will be used here since we set the
121
+ # AWS_CONFIG_FILE environment variable above.
122
+ if role_arn == USE_CSPR_ROLE_ARN_IF_SET:
123
+ # Otherwise start from the default profile, assuming CSPR role
124
+ session = boto3.session.Session(profile_name="cspr")
147
125
  else:
148
- # If the user didn't provide a role_arn, or if the role_arn
149
- # is set to USE_CSPR_ROLE_ARN_IF_SET, we return the default
150
- # session which would use the CSPR role if it is set on the
151
- # server, and the task role otherwise.
152
- return session
126
+ session = boto3.session.Session(profile_name="task")
127
+ else:
128
+ # Not using AWS config, just AWS_WEB_IDENTITY_TOKEN_FILE + AWS_ROLE_ARN
129
+ session = boto3.session.Session()
130
+
131
+ if role_arn and role_arn != USE_CSPR_ROLE_ARN_IF_SET:
132
+ # If the user provided a role_arn, we assume that role
133
+ # using the task role credentials. CSPR role is not used.
134
+ fetcher = botocore.credentials.AssumeRoleCredentialFetcher(
135
+ client_creator=session._session.create_client,
136
+ source_credentials=session._session.get_credentials(),
137
+ role_arn=role_arn,
138
+ extra_args={},
139
+ )
140
+ creds = botocore.credentials.DeferredRefreshableCredentials(
141
+ method="assume-role", refresh_using=fetcher.fetch_credentials
142
+ )
143
+ botocore_session = botocore.session.Session(session_vars=session_vars)
144
+ botocore_session._credentials = creds
145
+ return boto3.session.Session(botocore_session=botocore_session)
146
+ else:
147
+ # If the user didn't provide a role_arn, or if the role_arn
148
+ # is set to USE_CSPR_ROLE_ARN_IF_SET, we return the default
149
+ # session which would use the CSPR role if it is set on the
150
+ # server, and the task role otherwise.
151
+ return session
153
152
 
154
153
 
155
154
  class ObpAuthProvider(object):
@@ -164,11 +163,12 @@ class ObpAuthProvider(object):
164
163
 
165
164
  from botocore.exceptions import ClientError
166
165
 
167
- session = get_boto3_session(role_arn, session_vars)
168
- if with_error:
169
- return session.client(module, **client_params), ClientError
170
- else:
171
- return session.client(module, **client_params)
166
+ with hide_access_keys():
167
+ session = get_boto3_session(role_arn, session_vars)
168
+ if with_error:
169
+ return session.client(module, **client_params), ClientError
170
+ else:
171
+ return session.client(module, **client_params)
172
172
 
173
173
 
174
174
  AWS_CLIENT_PROVIDERS_DESC = [("obp", ".ObpAuthProvider")]
@@ -8,26 +8,27 @@ from datetime import datetime, timezone
8
8
  from metaflow.exception import MetaflowException
9
9
 
10
10
 
11
- def kill_process_and_descendants(pid, termination_timeout=5):
12
- try:
13
- subprocess.check_call(["pkill", "-TERM", "-P", str(pid)])
14
- subprocess.check_call(["kill", "-TERM", str(pid)])
15
- except subprocess.CalledProcessError:
16
- pass
11
+ def kill_process_and_descendants(pid, termination_timeout=1, iterations=20, delay=0.5):
12
+ for i in range(iterations):
13
+ try:
14
+ subprocess.check_call(["pkill", "-TERM", "-P", str(pid)])
15
+ subprocess.check_call(["kill", "-TERM", str(pid)])
16
+
17
+ time.sleep(termination_timeout)
17
18
 
18
- time.sleep(termination_timeout)
19
+ subprocess.check_call(["pkill", "-KILL", "-P", str(pid)])
20
+ subprocess.check_call(["kill", "-KILL", str(pid)])
21
+ except subprocess.CalledProcessError:
22
+ pass
19
23
 
20
- try:
21
- subprocess.check_call(["pkill", "-KILL", "-P", str(pid)])
22
- subprocess.check_call(["kill", "-KILL", str(pid)])
23
- except subprocess.CalledProcessError:
24
- pass
24
+ # Don't delay after the last iteration
25
+ if i < iterations - 1:
26
+ time.sleep(delay)
25
27
 
26
28
 
27
29
  class HeartbeatStore(object):
28
30
  def __init__(
29
31
  self,
30
- heartbeat_prefix,
31
32
  main_pid=None,
32
33
  storage_backend=None,
33
34
  emit_frequency=30,
@@ -35,7 +36,6 @@ class HeartbeatStore(object):
35
36
  monitor_frequency=15,
36
37
  max_missed_heartbeats=3,
37
38
  ) -> None:
38
- self.heartbeat_prefix = heartbeat_prefix
39
39
  self.main_pid = main_pid
40
40
  self.storage_backend = storage_backend
41
41
  self.emit_frequency = emit_frequency
@@ -44,8 +44,8 @@ class HeartbeatStore(object):
44
44
  self.max_missed_heartbeats = max_missed_heartbeats
45
45
  self.missed_heartbeats = 0
46
46
 
47
- def emit_heartbeat(self, folder_name=None):
48
- heartbeat_key = f"{self.heartbeat_prefix}/heartbeat"
47
+ def emit_heartbeat(self, heartbeat_prefix: str, folder_name=None):
48
+ heartbeat_key = f"{heartbeat_prefix}/heartbeat"
49
49
  if folder_name:
50
50
  heartbeat_key = f"{folder_name}/{heartbeat_key}"
51
51
 
@@ -63,8 +63,8 @@ class HeartbeatStore(object):
63
63
 
64
64
  time.sleep(self.emit_frequency)
65
65
 
66
- def emit_tombstone(self, folder_name=None):
67
- tombstone_key = f"{self.heartbeat_prefix}/tombstone"
66
+ def emit_tombstone(self, tombstone_prefix: str, folder_name=None):
67
+ tombstone_key = f"{tombstone_prefix}/tombstone"
68
68
  if folder_name:
69
69
  tombstone_key = f"{folder_name}/{tombstone_key}"
70
70
 
@@ -113,12 +113,12 @@ class HeartbeatStore(object):
113
113
  return False
114
114
  return True
115
115
 
116
- def monitor(self, folder_name=None):
117
- heartbeat_key = f"{self.heartbeat_prefix}/heartbeat"
116
+ def monitor(self, heartbeat_prefix: str, tombstone_prefix: str, folder_name=None):
117
+ heartbeat_key = f"{heartbeat_prefix}/heartbeat"
118
118
  if folder_name:
119
119
  heartbeat_key = f"{folder_name}/{heartbeat_key}"
120
120
 
121
- tombstone_key = f"{self.heartbeat_prefix}/tombstone"
121
+ tombstone_key = f"{tombstone_prefix}/tombstone"
122
122
  if folder_name:
123
123
  tombstone_key = f"{folder_name}/{tombstone_key}"
124
124
 
@@ -162,12 +162,17 @@ if __name__ == "__main__":
162
162
  storage = datastores[0](datastore_sysroot)
163
163
 
164
164
  heartbeat_prefix = f"{os.getenv('MF_PATHSPEC')}/{os.getenv('MF_ATTEMPT')}"
165
+ flow_name, run_id, _, _ = os.getenv("MF_PATHSPEC").split("/")
166
+ tombstone_prefix = f"{flow_name}/{run_id}"
165
167
 
166
168
  store = HeartbeatStore(
167
- heartbeat_prefix=heartbeat_prefix,
168
169
  main_pid=int(main_pid),
169
170
  storage_backend=storage,
170
171
  max_missed_heartbeats=int(NVIDIA_HEARTBEAT_THRESHOLD),
171
172
  )
172
173
 
173
- store.monitor(folder_name=folder_name)
174
+ store.monitor(
175
+ heartbeat_prefix=heartbeat_prefix,
176
+ tombstone_prefix=tombstone_prefix,
177
+ folder_name=folder_name,
178
+ )
@@ -185,13 +185,17 @@ class Job(object):
185
185
  )
186
186
 
187
187
  store = HeartbeatStore(
188
- heartbeat_prefix=heartbeat_prefix,
189
188
  main_pid=None,
190
189
  storage_backend=backend,
191
190
  )
192
191
 
193
192
  self.heartbeat_thread = threading.Thread(
194
- target=store.emit_heartbeat, args=("nvcf_heartbeats",), daemon=True
193
+ target=store.emit_heartbeat,
194
+ args=(
195
+ heartbeat_prefix,
196
+ "nvcf_heartbeats",
197
+ ),
198
+ daemon=True,
195
199
  )
196
200
  self.heartbeat_thread.start()
197
201
 
@@ -56,23 +56,21 @@ def list(ctx, run_id):
56
56
  flow_name = ctx.obj.flow.name
57
57
  run_obj = Run(pathspec=f"{flow_name}/{run_id}", _namespace_check=False)
58
58
  running_invocations = []
59
+
59
60
  for each_step in run_obj:
60
- if (
61
- not each_step.task.finished
62
- and "nvcf-function-id" in each_step.task.metadata_dict
63
- ):
64
- task_pathspec = each_step.task.pathspec
65
- attempt = each_step.task.metadata_dict.get("attempt")
66
- flow_name, run_id, step_name, task_id = task_pathspec.split("/")
67
- running_invocations.append(
68
- f"Flow Name: {flow_name}, Run ID: {run_id}, Step Name: {step_name}, Task ID: {task_id}, Retry Count: {attempt}"
69
- )
61
+ for each_task in each_step:
62
+ if not each_task.finished and "nvcf-function-id" in each_task.metadata_dict:
63
+
64
+ task_pathspec = each_task.pathspec
65
+ attempt = each_task.metadata_dict.get("attempt")
66
+ flow_name, run_id, step_name, task_id = task_pathspec.split("/")
67
+ running_invocations.append(
68
+ f"Flow Name: {flow_name}, Run ID: {run_id}, Step Name: {step_name}, Task ID: {task_id}, Retry Count: {attempt}"
69
+ )
70
70
 
71
71
  if running_invocations:
72
72
  for each_invocation in running_invocations:
73
73
  ctx.obj.echo(each_invocation)
74
- else:
75
- ctx.obj.echo("No running @nvidia invocations for Run ID: %s" % run_id)
76
74
 
77
75
 
78
76
  @nvidia.command(help="Kill steps / tasks running as an nvidia job.")
@@ -88,29 +86,17 @@ def kill(ctx, run_id):
88
86
  HeartbeatStore,
89
87
  )
90
88
 
91
- flow_name = ctx.obj.flow.name
92
- run_obj = Run(pathspec=f"{flow_name}/{run_id}", _namespace_check=False)
93
-
94
- for each_step in run_obj:
95
- if (
96
- not each_step.task.finished
97
- and "nvcf-function-id" in each_step.task.metadata_dict
98
- ):
99
- task_pathspec = each_step.task.pathspec
100
- attempt = each_step.task.metadata_dict.get("attempt")
101
- heartbeat_prefix = "{task_pathspec}/{attempt}".format(
102
- task_pathspec=task_pathspec, attempt=attempt
103
- )
89
+ datastore_root = ctx.obj.datastore_impl.datastore_root
90
+ store = HeartbeatStore(
91
+ main_pid=None,
92
+ storage_backend=ctx.obj.datastore_impl(datastore_root),
93
+ )
104
94
 
105
- datastore_root = ctx.obj.datastore_impl.datastore_root
106
- store = HeartbeatStore(
107
- heartbeat_prefix=heartbeat_prefix,
108
- main_pid=None,
109
- storage_backend=ctx.obj.datastore_impl(datastore_root),
110
- )
111
- store.emit_tombstone(folder_name="nvcf_heartbeats")
112
- else:
113
- ctx.obj.echo("No running @nvidia invocations for Run ID: %s" % run_id)
95
+ flow_name = ctx.obj.flow.name
96
+ tombstone_prefix = f"{flow_name}/{run_id}"
97
+ store.emit_tombstone(
98
+ tombstone_prefix=tombstone_prefix, folder_name="nvcf_heartbeats"
99
+ )
114
100
 
115
101
 
116
102
  @nvidia.command(
@@ -31,7 +31,7 @@ from metaflow.mflog import (
31
31
  )
32
32
 
33
33
  from .snowpark_client import SnowparkClient
34
- from .snowpark_exceptions import SnowparkException
34
+ from .snowpark_exceptions import SnowparkException, SnowparkKilledException
35
35
  from .snowpark_job import SnowparkJob
36
36
 
37
37
  # Redirect structured logs to $PWD/.logs/
@@ -291,7 +291,7 @@ class Snowpark(object):
291
291
  else:
292
292
  if self.job.is_running:
293
293
  # Kill the job if it is still running by throwing an exception.
294
- raise SnowparkException("Task failed!")
294
+ raise SnowparkKilledException("Task failed!")
295
295
  echo(
296
296
  "Task finished with message '%s'." % self.job.message,
297
297
  "stderr",
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: ob-metaflow-extensions
3
- Version: 1.1.89
3
+ Version: 1.1.91
4
4
  Summary: Outerbounds Platform Extensions for Metaflow
5
5
  Author: Outerbounds, Inc.
6
6
  License: Commercial
@@ -1,7 +1,7 @@
1
1
  metaflow_extensions/outerbounds/__init__.py,sha256=TRGvIUMjkfneWtYUFSWoubu_Kf2ekAL4WLbV3IxOj9k,499
2
2
  metaflow_extensions/outerbounds/remote_config.py,sha256=Zpfpjgz68_ZgxlXezjzlsDLo4840rkWuZgwDB_5H57U,4059
3
3
  metaflow_extensions/outerbounds/config/__init__.py,sha256=JsQGRuGFz28fQWjUvxUgR8EKBLGRdLUIk_buPLJplJY,1225
4
- metaflow_extensions/outerbounds/plugins/__init__.py,sha256=9DCqKsb2bPfw1f7x-3EuB2Mqc9uKlLtGEG6yLJI8Xx0,12510
4
+ metaflow_extensions/outerbounds/plugins/__init__.py,sha256=WwvFcN5kserbPwhrE4hXprnXzJxonEPT0Mlik2kmGMA,12406
5
5
  metaflow_extensions/outerbounds/plugins/auth_server.py,sha256=1v2GBqoMBxp5E7Lejz139w-jxJtPnLDvvHXP0HhEIHI,2361
6
6
  metaflow_extensions/outerbounds/plugins/perimeters.py,sha256=QXh3SFP7GQbS-RAIxUOPbhPzQ7KDFVxZkTdKqFKgXjI,2697
7
7
  metaflow_extensions/outerbounds/plugins/fast_bakery/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -14,12 +14,12 @@ metaflow_extensions/outerbounds/plugins/kubernetes/kubernetes_client.py,sha256=g
14
14
  metaflow_extensions/outerbounds/plugins/nim/__init__.py,sha256=GVnvSTjqYVj5oG2yh8KJFt7iZ33cEadDD5HbdmC9hJ0,1457
15
15
  metaflow_extensions/outerbounds/plugins/nim/nim_manager.py,sha256=SWieODDxtIaeZwdMYtObDi57Kjyfw2DUuE6pJtU750w,9206
16
16
  metaflow_extensions/outerbounds/plugins/nvcf/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
17
- metaflow_extensions/outerbounds/plugins/nvcf/heartbeat_store.py,sha256=4aQZ0kpW2LlJbHx6AG4A9eaFLH9rWC_ENWnnfYNq1qk,5910
18
- metaflow_extensions/outerbounds/plugins/nvcf/nvcf.py,sha256=NIt1kJHuYpnCF7n73A90ZITWsk5QWtsbiHfzvdVjgqk,8997
19
- metaflow_extensions/outerbounds/plugins/nvcf/nvcf_cli.py,sha256=9nQBwm6AYtaKIAxdb937MOnsut3INEXN3v5eSnXy4cg,9811
17
+ metaflow_extensions/outerbounds/plugins/nvcf/heartbeat_store.py,sha256=pOWwm8LFQBbtku0zNBBwCyXxLK8U-hhC4naQcmU69nE,6217
18
+ metaflow_extensions/outerbounds/plugins/nvcf/nvcf.py,sha256=s9S_Ntm8Y23fgzQtN_hLMrQaSfOvKqNBUyL1K8Xo9vU,9039
19
+ metaflow_extensions/outerbounds/plugins/nvcf/nvcf_cli.py,sha256=YZb5AvbVgUwUJVxRxQ4JqqP8e1RMJr6dZ9U4KkHE-M8,9134
20
20
  metaflow_extensions/outerbounds/plugins/nvcf/nvcf_decorator.py,sha256=E7h94ni8yW9BQkKSBUptPdGAaVmXpR9FlXkPWpLyPd0,6054
21
21
  metaflow_extensions/outerbounds/plugins/snowpark/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
22
- metaflow_extensions/outerbounds/plugins/snowpark/snowpark.py,sha256=vzgpVLCKvHjzHNfJvmH0jcxefYNsVggw_vof_y_U_a8,10643
22
+ metaflow_extensions/outerbounds/plugins/snowpark/snowpark.py,sha256=0R8aFN9MpgWraqiaI6ZF82YpLdFJ1f-3z_-BPRpZfxM,10674
23
23
  metaflow_extensions/outerbounds/plugins/snowpark/snowpark_cli.py,sha256=ErsVoCQLa33byiykOQzDEeEkRKk0mgffZme43f3jxn4,8747
24
24
  metaflow_extensions/outerbounds/plugins/snowpark/snowpark_client.py,sha256=JEW0EUxj_mNZXo9OFkJFmWfg-P7_CEgvNbgsMTCBTAE,4273
25
25
  metaflow_extensions/outerbounds/plugins/snowpark/snowpark_decorator.py,sha256=3A9LKg7EarWM8WQ0PTGLUetjxzemQeUiJivvv_4uzr0,9886
@@ -33,7 +33,7 @@ metaflow_extensions/outerbounds/toplevel/global_aliases_for_metaflow_package.py,
33
33
  metaflow_extensions/outerbounds/toplevel/plugins/azure/__init__.py,sha256=WUuhz2YQfI4fz7nIcipwwWq781eaoHEk7n4GAn1npDg,63
34
34
  metaflow_extensions/outerbounds/toplevel/plugins/gcp/__init__.py,sha256=BbZiaH3uILlEZ6ntBLKeNyqn3If8nIXZFq_Apd7Dhco,70
35
35
  metaflow_extensions/outerbounds/toplevel/plugins/kubernetes/__init__.py,sha256=5zG8gShSj8m7rgF4xgWBZFuY3GDP5n1T0ktjRpGJLHA,69
36
- ob_metaflow_extensions-1.1.89.dist-info/METADATA,sha256=48w-ipRZMJOxQ7yZx8uqyDmiBQIirZ3MWYtygEXscNI,520
37
- ob_metaflow_extensions-1.1.89.dist-info/WHEEL,sha256=bb2Ot9scclHKMOLDEHY6B2sicWOgugjFKaJsT7vwMQo,110
38
- ob_metaflow_extensions-1.1.89.dist-info/top_level.txt,sha256=NwG0ukwjygtanDETyp_BUdtYtqIA_lOjzFFh1TsnxvI,20
39
- ob_metaflow_extensions-1.1.89.dist-info/RECORD,,
36
+ ob_metaflow_extensions-1.1.91.dist-info/METADATA,sha256=rTZDEBQQ5bWEZnqLoralSTjginYiJ-Q81f054qg3vLs,520
37
+ ob_metaflow_extensions-1.1.91.dist-info/WHEEL,sha256=bb2Ot9scclHKMOLDEHY6B2sicWOgugjFKaJsT7vwMQo,110
38
+ ob_metaflow_extensions-1.1.91.dist-info/top_level.txt,sha256=NwG0ukwjygtanDETyp_BUdtYtqIA_lOjzFFh1TsnxvI,20
39
+ ob_metaflow_extensions-1.1.91.dist-info/RECORD,,