ob-metaflow-extensions 1.1.90__tar.gz → 1.1.92__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of ob-metaflow-extensions might be problematic. Click here for more details.
- {ob-metaflow-extensions-1.1.90 → ob-metaflow-extensions-1.1.92}/PKG-INFO +1 -1
- {ob-metaflow-extensions-1.1.90 → ob-metaflow-extensions-1.1.92}/metaflow_extensions/outerbounds/plugins/fast_bakery/docker_environment.py +30 -29
- ob-metaflow-extensions-1.1.92/metaflow_extensions/outerbounds/plugins/kubernetes/kubernetes_client.py +157 -0
- {ob-metaflow-extensions-1.1.90 → ob-metaflow-extensions-1.1.92}/metaflow_extensions/outerbounds/plugins/nvcf/heartbeat_store.py +28 -23
- {ob-metaflow-extensions-1.1.90 → ob-metaflow-extensions-1.1.92}/metaflow_extensions/outerbounds/plugins/nvcf/nvcf.py +6 -2
- {ob-metaflow-extensions-1.1.90 → ob-metaflow-extensions-1.1.92}/metaflow_extensions/outerbounds/plugins/nvcf/nvcf_cli.py +20 -34
- {ob-metaflow-extensions-1.1.90 → ob-metaflow-extensions-1.1.92}/metaflow_extensions/outerbounds/plugins/snowpark/snowpark.py +2 -2
- {ob-metaflow-extensions-1.1.90 → ob-metaflow-extensions-1.1.92}/ob_metaflow_extensions.egg-info/PKG-INFO +1 -1
- ob-metaflow-extensions-1.1.92/ob_metaflow_extensions.egg-info/requires.txt +3 -0
- {ob-metaflow-extensions-1.1.90 → ob-metaflow-extensions-1.1.92}/setup.py +2 -2
- ob-metaflow-extensions-1.1.90/metaflow_extensions/outerbounds/plugins/kubernetes/kubernetes_client.py +0 -61
- ob-metaflow-extensions-1.1.90/ob_metaflow_extensions.egg-info/requires.txt +0 -3
- {ob-metaflow-extensions-1.1.90 → ob-metaflow-extensions-1.1.92}/README.md +0 -0
- {ob-metaflow-extensions-1.1.90 → ob-metaflow-extensions-1.1.92}/metaflow_extensions/outerbounds/__init__.py +0 -0
- {ob-metaflow-extensions-1.1.90 → ob-metaflow-extensions-1.1.92}/metaflow_extensions/outerbounds/config/__init__.py +0 -0
- {ob-metaflow-extensions-1.1.90 → ob-metaflow-extensions-1.1.92}/metaflow_extensions/outerbounds/plugins/__init__.py +0 -0
- {ob-metaflow-extensions-1.1.90 → ob-metaflow-extensions-1.1.92}/metaflow_extensions/outerbounds/plugins/auth_server.py +0 -0
- {ob-metaflow-extensions-1.1.90 → ob-metaflow-extensions-1.1.92}/metaflow_extensions/outerbounds/plugins/fast_bakery/__init__.py +0 -0
- {ob-metaflow-extensions-1.1.90 → ob-metaflow-extensions-1.1.92}/metaflow_extensions/outerbounds/plugins/fast_bakery/fast_bakery.py +0 -0
- {ob-metaflow-extensions-1.1.90 → ob-metaflow-extensions-1.1.92}/metaflow_extensions/outerbounds/plugins/fast_bakery/fast_bakery_cli.py +0 -0
- {ob-metaflow-extensions-1.1.90 → ob-metaflow-extensions-1.1.92}/metaflow_extensions/outerbounds/plugins/fast_bakery/fast_bakery_decorator.py +0 -0
- {ob-metaflow-extensions-1.1.90 → ob-metaflow-extensions-1.1.92}/metaflow_extensions/outerbounds/plugins/kubernetes/__init__.py +0 -0
- {ob-metaflow-extensions-1.1.90 → ob-metaflow-extensions-1.1.92}/metaflow_extensions/outerbounds/plugins/nim/__init__.py +0 -0
- {ob-metaflow-extensions-1.1.90 → ob-metaflow-extensions-1.1.92}/metaflow_extensions/outerbounds/plugins/nim/nim_manager.py +0 -0
- {ob-metaflow-extensions-1.1.90 → ob-metaflow-extensions-1.1.92}/metaflow_extensions/outerbounds/plugins/nvcf/__init__.py +0 -0
- {ob-metaflow-extensions-1.1.90 → ob-metaflow-extensions-1.1.92}/metaflow_extensions/outerbounds/plugins/nvcf/nvcf_decorator.py +0 -0
- {ob-metaflow-extensions-1.1.90 → ob-metaflow-extensions-1.1.92}/metaflow_extensions/outerbounds/plugins/perimeters.py +0 -0
- {ob-metaflow-extensions-1.1.90 → ob-metaflow-extensions-1.1.92}/metaflow_extensions/outerbounds/plugins/snowpark/__init__.py +0 -0
- {ob-metaflow-extensions-1.1.90 → ob-metaflow-extensions-1.1.92}/metaflow_extensions/outerbounds/plugins/snowpark/snowpark_cli.py +0 -0
- {ob-metaflow-extensions-1.1.90 → ob-metaflow-extensions-1.1.92}/metaflow_extensions/outerbounds/plugins/snowpark/snowpark_client.py +0 -0
- {ob-metaflow-extensions-1.1.90 → ob-metaflow-extensions-1.1.92}/metaflow_extensions/outerbounds/plugins/snowpark/snowpark_decorator.py +0 -0
- {ob-metaflow-extensions-1.1.90 → ob-metaflow-extensions-1.1.92}/metaflow_extensions/outerbounds/plugins/snowpark/snowpark_exceptions.py +0 -0
- {ob-metaflow-extensions-1.1.90 → ob-metaflow-extensions-1.1.92}/metaflow_extensions/outerbounds/plugins/snowpark/snowpark_job.py +0 -0
- {ob-metaflow-extensions-1.1.90 → ob-metaflow-extensions-1.1.92}/metaflow_extensions/outerbounds/plugins/snowpark/snowpark_service_spec.py +0 -0
- {ob-metaflow-extensions-1.1.90 → ob-metaflow-extensions-1.1.92}/metaflow_extensions/outerbounds/profilers/__init__.py +0 -0
- {ob-metaflow-extensions-1.1.90 → ob-metaflow-extensions-1.1.92}/metaflow_extensions/outerbounds/profilers/gpu.py +0 -0
- {ob-metaflow-extensions-1.1.90 → ob-metaflow-extensions-1.1.92}/metaflow_extensions/outerbounds/remote_config.py +0 -0
- {ob-metaflow-extensions-1.1.90 → ob-metaflow-extensions-1.1.92}/metaflow_extensions/outerbounds/toplevel/__init__.py +0 -0
- {ob-metaflow-extensions-1.1.90 → ob-metaflow-extensions-1.1.92}/metaflow_extensions/outerbounds/toplevel/global_aliases_for_metaflow_package.py +0 -0
- {ob-metaflow-extensions-1.1.90 → ob-metaflow-extensions-1.1.92}/metaflow_extensions/outerbounds/toplevel/plugins/azure/__init__.py +0 -0
- {ob-metaflow-extensions-1.1.90 → ob-metaflow-extensions-1.1.92}/metaflow_extensions/outerbounds/toplevel/plugins/gcp/__init__.py +0 -0
- {ob-metaflow-extensions-1.1.90 → ob-metaflow-extensions-1.1.92}/metaflow_extensions/outerbounds/toplevel/plugins/kubernetes/__init__.py +0 -0
- {ob-metaflow-extensions-1.1.90 → ob-metaflow-extensions-1.1.92}/ob_metaflow_extensions.egg-info/SOURCES.txt +0 -0
- {ob-metaflow-extensions-1.1.90 → ob-metaflow-extensions-1.1.92}/ob_metaflow_extensions.egg-info/dependency_links.txt +0 -0
- {ob-metaflow-extensions-1.1.90 → ob-metaflow-extensions-1.1.92}/ob_metaflow_extensions.egg-info/top_level.txt +0 -0
- {ob-metaflow-extensions-1.1.90 → ob-metaflow-extensions-1.1.92}/setup.cfg +0 -0
|
@@ -116,15 +116,6 @@ class DockerEnvironment(MetaflowEnvironment):
|
|
|
116
116
|
|
|
117
117
|
self.datastore = [d for d in DATASTORES if d.TYPE == self.datastore_type][0]
|
|
118
118
|
|
|
119
|
-
# Mixing @pypi/@conda in a single step is not supported yet
|
|
120
|
-
for step in self.flow:
|
|
121
|
-
if sum(1 for deco in step.decorators if _is_env_deco(deco)) > 1:
|
|
122
|
-
raise MetaflowException(
|
|
123
|
-
"Mixing and matching PyPI packages and Conda packages within a\n"
|
|
124
|
-
"step is not yet supported. Use one of @pypi or @conda only for the *%s* step."
|
|
125
|
-
% step.name
|
|
126
|
-
)
|
|
127
|
-
|
|
128
119
|
def init_environment(self, echo):
|
|
129
120
|
self.skipped_steps = {
|
|
130
121
|
step.name for step in self.flow if not _step_executes_remotely(step)
|
|
@@ -132,35 +123,45 @@ class DockerEnvironment(MetaflowEnvironment):
|
|
|
132
123
|
# Attach environment decorator as needed. This is done on a step-by-step basis
|
|
133
124
|
# as we require a conda decorator for fallback steps, but prefer pypi for the baked ones.
|
|
134
125
|
for step in self.flow:
|
|
135
|
-
if
|
|
136
|
-
|
|
137
|
-
|
|
138
|
-
|
|
139
|
-
|
|
126
|
+
if step.name in self.skipped_steps:
|
|
127
|
+
# Conda fallback requires a conda decorator as the default for a step
|
|
128
|
+
decorators._attach_decorators_to_step(step, ["conda"])
|
|
129
|
+
else:
|
|
130
|
+
if not _step_has_environment_deco(step):
|
|
140
131
|
# We default to PyPI for steps that are going to be baked.
|
|
141
132
|
decorators._attach_decorators_to_step(step, ["pypi"])
|
|
142
|
-
|
|
143
|
-
|
|
144
|
-
|
|
145
|
-
|
|
146
|
-
|
|
147
|
-
|
|
148
|
-
|
|
149
|
-
|
|
150
|
-
|
|
151
|
-
|
|
152
|
-
|
|
153
|
-
|
|
154
|
-
|
|
133
|
+
# Initialize the decorator we attached.
|
|
134
|
+
# This is crucial for the conda decorator to work properly in the fallback environment
|
|
135
|
+
for deco in step.decorators:
|
|
136
|
+
if _is_env_deco(deco):
|
|
137
|
+
deco.step_init(
|
|
138
|
+
self.flow,
|
|
139
|
+
None, # not passing graph as it is not available, and not required by conda/pypi decorators
|
|
140
|
+
step.name,
|
|
141
|
+
step.decorators,
|
|
142
|
+
self,
|
|
143
|
+
self.datastore,
|
|
144
|
+
echo,
|
|
145
|
+
)
|
|
146
|
+
|
|
147
|
+
# Mixing @pypi/@conda in a single step is not supported yet
|
|
148
|
+
if sum(1 for deco in step.decorators if _is_env_deco(deco)) > 1:
|
|
149
|
+
raise MetaflowException(
|
|
150
|
+
"Mixing and matching PyPI packages and Conda packages within a\n"
|
|
151
|
+
"step is not yet supported. Use one of @pypi or @conda only for the *%s* step."
|
|
152
|
+
% step.name
|
|
153
|
+
)
|
|
155
154
|
|
|
156
155
|
steps_to_bake = [
|
|
157
|
-
step
|
|
156
|
+
step
|
|
157
|
+
for step in self.flow
|
|
158
|
+
if step.name not in self.skipped_steps and not self.is_disabled(step)
|
|
158
159
|
]
|
|
159
160
|
if steps_to_bake:
|
|
160
161
|
self.logger("🚀 Baking container image(s) ...")
|
|
161
162
|
start_time = time.time()
|
|
162
163
|
self.results = self._bake(steps_to_bake)
|
|
163
|
-
for step in
|
|
164
|
+
for step in steps_to_bake:
|
|
164
165
|
for d in step.decorators:
|
|
165
166
|
if _is_remote_deco(d):
|
|
166
167
|
d.attributes["image"] = self.results[step.name].container_image
|
|
@@ -0,0 +1,157 @@
|
|
|
1
|
+
from concurrent.futures import ThreadPoolExecutor
|
|
2
|
+
import os
|
|
3
|
+
import sys
|
|
4
|
+
import time
|
|
5
|
+
|
|
6
|
+
from metaflow.exception import MetaflowException
|
|
7
|
+
from metaflow.metaflow_config import KUBERNETES_NAMESPACE
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
CLIENT_REFRESH_INTERVAL_SECONDS = 300
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
class KubernetesClientException(MetaflowException):
|
|
14
|
+
headline = "Kubernetes client error"
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
class KubernetesClient(object):
|
|
18
|
+
def __init__(self):
|
|
19
|
+
try:
|
|
20
|
+
# Kubernetes is a soft dependency.
|
|
21
|
+
from kubernetes import client, config
|
|
22
|
+
except (NameError, ImportError):
|
|
23
|
+
raise KubernetesClientException(
|
|
24
|
+
"Could not import module 'kubernetes'.\n\nInstall Kubernetes "
|
|
25
|
+
"Python package (https://pypi.org/project/kubernetes/) first.\n"
|
|
26
|
+
"You can install the module by executing - "
|
|
27
|
+
"%s -m pip install kubernetes\n"
|
|
28
|
+
"or equivalent through your favorite Python package manager."
|
|
29
|
+
% sys.executable
|
|
30
|
+
)
|
|
31
|
+
self._refresh_client()
|
|
32
|
+
self._namespace = KUBERNETES_NAMESPACE
|
|
33
|
+
|
|
34
|
+
def _refresh_client(self):
|
|
35
|
+
from metaflow_extensions.outerbounds.plugins.auth_server import get_token
|
|
36
|
+
from kubernetes import client
|
|
37
|
+
|
|
38
|
+
config = client.Configuration()
|
|
39
|
+
token_info = get_token("/generate/k8s")
|
|
40
|
+
config.host = token_info["endpoint"]
|
|
41
|
+
config.api_key["authorization"] = "Bearer " + token_info["token"]
|
|
42
|
+
config.verify_ssl = False # TODO: FIX THIS
|
|
43
|
+
client.Configuration.set_default(config)
|
|
44
|
+
self._client = client
|
|
45
|
+
self._client_refresh_timestamp = time.time()
|
|
46
|
+
|
|
47
|
+
def get(self):
|
|
48
|
+
if (
|
|
49
|
+
time.time() - self._client_refresh_timestamp
|
|
50
|
+
> CLIENT_REFRESH_INTERVAL_SECONDS
|
|
51
|
+
):
|
|
52
|
+
self._refresh_client()
|
|
53
|
+
|
|
54
|
+
return self._client
|
|
55
|
+
|
|
56
|
+
def _find_active_pods(self, flow_name, run_id=None, user=None):
|
|
57
|
+
def _request(_continue=None):
|
|
58
|
+
# handle paginated responses
|
|
59
|
+
return self._client.CoreV1Api().list_namespaced_pod(
|
|
60
|
+
namespace=self._namespace,
|
|
61
|
+
# limited selector support for K8S api. We want to cover multiple statuses: Running / Pending / Unknown
|
|
62
|
+
field_selector="status.phase!=Succeeded,status.phase!=Failed",
|
|
63
|
+
limit=1000,
|
|
64
|
+
_continue=_continue,
|
|
65
|
+
)
|
|
66
|
+
|
|
67
|
+
results = _request()
|
|
68
|
+
|
|
69
|
+
if run_id is not None:
|
|
70
|
+
# handle argo prefixes in run_id
|
|
71
|
+
run_id = run_id[run_id.startswith("argo-") and len("argo-") :]
|
|
72
|
+
|
|
73
|
+
while results.metadata._continue or results.items:
|
|
74
|
+
for pod in results.items:
|
|
75
|
+
match = (
|
|
76
|
+
# arbitrary pods might have no annotations at all.
|
|
77
|
+
pod.metadata.annotations
|
|
78
|
+
and pod.metadata.labels
|
|
79
|
+
and (
|
|
80
|
+
run_id is None
|
|
81
|
+
or (pod.metadata.annotations.get("metaflow/run_id") == run_id)
|
|
82
|
+
# we want to also match pods launched by argo-workflows
|
|
83
|
+
or (
|
|
84
|
+
pod.metadata.labels.get("workflows.argoproj.io/workflow")
|
|
85
|
+
== run_id
|
|
86
|
+
)
|
|
87
|
+
)
|
|
88
|
+
and (
|
|
89
|
+
user is None
|
|
90
|
+
or pod.metadata.annotations.get("metaflow/user") == user
|
|
91
|
+
)
|
|
92
|
+
and (
|
|
93
|
+
pod.metadata.annotations.get("metaflow/flow_name") == flow_name
|
|
94
|
+
)
|
|
95
|
+
)
|
|
96
|
+
if match:
|
|
97
|
+
yield pod
|
|
98
|
+
if not results.metadata._continue:
|
|
99
|
+
break
|
|
100
|
+
results = _request(results.metadata._continue)
|
|
101
|
+
|
|
102
|
+
def list(self, flow_name, run_id, user):
|
|
103
|
+
results = self._find_active_pods(flow_name, run_id, user)
|
|
104
|
+
|
|
105
|
+
return list(results)
|
|
106
|
+
|
|
107
|
+
def kill_pods(self, flow_name, run_id, user, echo):
|
|
108
|
+
from kubernetes.stream import stream
|
|
109
|
+
|
|
110
|
+
api_instance = self._client.CoreV1Api()
|
|
111
|
+
job_api = self._client.BatchV1Api()
|
|
112
|
+
pods = self._find_active_pods(flow_name, run_id, user)
|
|
113
|
+
|
|
114
|
+
def _kill_pod(pod):
|
|
115
|
+
echo("Killing Kubernetes pod %s\n" % pod.metadata.name)
|
|
116
|
+
try:
|
|
117
|
+
stream(
|
|
118
|
+
api_instance.connect_get_namespaced_pod_exec,
|
|
119
|
+
name=pod.metadata.name,
|
|
120
|
+
namespace=pod.metadata.namespace,
|
|
121
|
+
command=[
|
|
122
|
+
"/bin/sh",
|
|
123
|
+
"-c",
|
|
124
|
+
"/sbin/killall5",
|
|
125
|
+
],
|
|
126
|
+
stderr=True,
|
|
127
|
+
stdin=False,
|
|
128
|
+
stdout=True,
|
|
129
|
+
tty=False,
|
|
130
|
+
)
|
|
131
|
+
except Exception:
|
|
132
|
+
# best effort kill for pod can fail.
|
|
133
|
+
try:
|
|
134
|
+
job_name = pod.metadata.labels.get("job-name", None)
|
|
135
|
+
if job_name is None:
|
|
136
|
+
raise Exception("Could not determine job name")
|
|
137
|
+
job_api.patch_namespaced_job(
|
|
138
|
+
name=job_name,
|
|
139
|
+
namespace=pod.metadata.namespace,
|
|
140
|
+
field_manager="metaflow",
|
|
141
|
+
body={"spec": {"parallelism": 0}},
|
|
142
|
+
)
|
|
143
|
+
except Exception as e:
|
|
144
|
+
echo("failed to kill pod %s - %s" % (pod.metadata.name, str(e)))
|
|
145
|
+
|
|
146
|
+
with ThreadPoolExecutor() as executor:
|
|
147
|
+
executor.map(_kill_pod, list(pods))
|
|
148
|
+
|
|
149
|
+
def job(self, **kwargs):
|
|
150
|
+
from metaflow.plugins.kubernetes.kubernetes_job import KubernetesJob
|
|
151
|
+
|
|
152
|
+
return KubernetesJob(self, **kwargs)
|
|
153
|
+
|
|
154
|
+
def jobset(self, **kwargs):
|
|
155
|
+
from metaflow.plugins.kubernetes.kubernetes_job import KubernetesJobSet
|
|
156
|
+
|
|
157
|
+
return KubernetesJobSet(self, **kwargs)
|
|
@@ -8,26 +8,27 @@ from datetime import datetime, timezone
|
|
|
8
8
|
from metaflow.exception import MetaflowException
|
|
9
9
|
|
|
10
10
|
|
|
11
|
-
def kill_process_and_descendants(pid, termination_timeout=5):
|
|
12
|
-
|
|
13
|
-
|
|
14
|
-
|
|
15
|
-
|
|
16
|
-
|
|
11
|
+
def kill_process_and_descendants(pid, termination_timeout=1, iterations=20, delay=0.5):
|
|
12
|
+
for i in range(iterations):
|
|
13
|
+
try:
|
|
14
|
+
subprocess.check_call(["pkill", "-TERM", "-P", str(pid)])
|
|
15
|
+
subprocess.check_call(["kill", "-TERM", str(pid)])
|
|
16
|
+
|
|
17
|
+
time.sleep(termination_timeout)
|
|
17
18
|
|
|
18
|
-
|
|
19
|
+
subprocess.check_call(["pkill", "-KILL", "-P", str(pid)])
|
|
20
|
+
subprocess.check_call(["kill", "-KILL", str(pid)])
|
|
21
|
+
except subprocess.CalledProcessError:
|
|
22
|
+
pass
|
|
19
23
|
|
|
20
|
-
|
|
21
|
-
|
|
22
|
-
|
|
23
|
-
except subprocess.CalledProcessError:
|
|
24
|
-
pass
|
|
24
|
+
# Don't delay after the last iteration
|
|
25
|
+
if i < iterations - 1:
|
|
26
|
+
time.sleep(delay)
|
|
25
27
|
|
|
26
28
|
|
|
27
29
|
class HeartbeatStore(object):
|
|
28
30
|
def __init__(
|
|
29
31
|
self,
|
|
30
|
-
heartbeat_prefix,
|
|
31
32
|
main_pid=None,
|
|
32
33
|
storage_backend=None,
|
|
33
34
|
emit_frequency=30,
|
|
@@ -35,7 +36,6 @@ class HeartbeatStore(object):
|
|
|
35
36
|
monitor_frequency=15,
|
|
36
37
|
max_missed_heartbeats=3,
|
|
37
38
|
) -> None:
|
|
38
|
-
self.heartbeat_prefix = heartbeat_prefix
|
|
39
39
|
self.main_pid = main_pid
|
|
40
40
|
self.storage_backend = storage_backend
|
|
41
41
|
self.emit_frequency = emit_frequency
|
|
@@ -44,8 +44,8 @@ class HeartbeatStore(object):
|
|
|
44
44
|
self.max_missed_heartbeats = max_missed_heartbeats
|
|
45
45
|
self.missed_heartbeats = 0
|
|
46
46
|
|
|
47
|
-
def emit_heartbeat(self, folder_name=None):
|
|
48
|
-
heartbeat_key = f"{
|
|
47
|
+
def emit_heartbeat(self, heartbeat_prefix: str, folder_name=None):
|
|
48
|
+
heartbeat_key = f"{heartbeat_prefix}/heartbeat"
|
|
49
49
|
if folder_name:
|
|
50
50
|
heartbeat_key = f"{folder_name}/{heartbeat_key}"
|
|
51
51
|
|
|
@@ -63,8 +63,8 @@ class HeartbeatStore(object):
|
|
|
63
63
|
|
|
64
64
|
time.sleep(self.emit_frequency)
|
|
65
65
|
|
|
66
|
-
def emit_tombstone(self, folder_name=None):
|
|
67
|
-
tombstone_key = f"{
|
|
66
|
+
def emit_tombstone(self, tombstone_prefix: str, folder_name=None):
|
|
67
|
+
tombstone_key = f"{tombstone_prefix}/tombstone"
|
|
68
68
|
if folder_name:
|
|
69
69
|
tombstone_key = f"{folder_name}/{tombstone_key}"
|
|
70
70
|
|
|
@@ -113,12 +113,12 @@ class HeartbeatStore(object):
|
|
|
113
113
|
return False
|
|
114
114
|
return True
|
|
115
115
|
|
|
116
|
-
def monitor(self, folder_name=None):
|
|
117
|
-
heartbeat_key = f"{
|
|
116
|
+
def monitor(self, heartbeat_prefix: str, tombstone_prefix: str, folder_name=None):
|
|
117
|
+
heartbeat_key = f"{heartbeat_prefix}/heartbeat"
|
|
118
118
|
if folder_name:
|
|
119
119
|
heartbeat_key = f"{folder_name}/{heartbeat_key}"
|
|
120
120
|
|
|
121
|
-
tombstone_key = f"{
|
|
121
|
+
tombstone_key = f"{tombstone_prefix}/tombstone"
|
|
122
122
|
if folder_name:
|
|
123
123
|
tombstone_key = f"{folder_name}/{tombstone_key}"
|
|
124
124
|
|
|
@@ -162,12 +162,17 @@ if __name__ == "__main__":
|
|
|
162
162
|
storage = datastores[0](datastore_sysroot)
|
|
163
163
|
|
|
164
164
|
heartbeat_prefix = f"{os.getenv('MF_PATHSPEC')}/{os.getenv('MF_ATTEMPT')}"
|
|
165
|
+
flow_name, run_id, _, _ = os.getenv("MF_PATHSPEC").split("/")
|
|
166
|
+
tombstone_prefix = f"{flow_name}/{run_id}"
|
|
165
167
|
|
|
166
168
|
store = HeartbeatStore(
|
|
167
|
-
heartbeat_prefix=heartbeat_prefix,
|
|
168
169
|
main_pid=int(main_pid),
|
|
169
170
|
storage_backend=storage,
|
|
170
171
|
max_missed_heartbeats=int(NVIDIA_HEARTBEAT_THRESHOLD),
|
|
171
172
|
)
|
|
172
173
|
|
|
173
|
-
store.monitor(
|
|
174
|
+
store.monitor(
|
|
175
|
+
heartbeat_prefix=heartbeat_prefix,
|
|
176
|
+
tombstone_prefix=tombstone_prefix,
|
|
177
|
+
folder_name=folder_name,
|
|
178
|
+
)
|
|
@@ -185,13 +185,17 @@ class Job(object):
|
|
|
185
185
|
)
|
|
186
186
|
|
|
187
187
|
store = HeartbeatStore(
|
|
188
|
-
heartbeat_prefix=heartbeat_prefix,
|
|
189
188
|
main_pid=None,
|
|
190
189
|
storage_backend=backend,
|
|
191
190
|
)
|
|
192
191
|
|
|
193
192
|
self.heartbeat_thread = threading.Thread(
|
|
194
|
-
target=store.emit_heartbeat,
|
|
193
|
+
target=store.emit_heartbeat,
|
|
194
|
+
args=(
|
|
195
|
+
heartbeat_prefix,
|
|
196
|
+
"nvcf_heartbeats",
|
|
197
|
+
),
|
|
198
|
+
daemon=True,
|
|
195
199
|
)
|
|
196
200
|
self.heartbeat_thread.start()
|
|
197
201
|
|
|
@@ -56,23 +56,21 @@ def list(ctx, run_id):
|
|
|
56
56
|
flow_name = ctx.obj.flow.name
|
|
57
57
|
run_obj = Run(pathspec=f"{flow_name}/{run_id}", _namespace_check=False)
|
|
58
58
|
running_invocations = []
|
|
59
|
+
|
|
59
60
|
for each_step in run_obj:
|
|
60
|
-
|
|
61
|
-
not
|
|
62
|
-
|
|
63
|
-
|
|
64
|
-
|
|
65
|
-
|
|
66
|
-
|
|
67
|
-
|
|
68
|
-
|
|
69
|
-
)
|
|
61
|
+
for each_task in each_step:
|
|
62
|
+
if not each_task.finished and "nvcf-function-id" in each_task.metadata_dict:
|
|
63
|
+
|
|
64
|
+
task_pathspec = each_task.pathspec
|
|
65
|
+
attempt = each_task.metadata_dict.get("attempt")
|
|
66
|
+
flow_name, run_id, step_name, task_id = task_pathspec.split("/")
|
|
67
|
+
running_invocations.append(
|
|
68
|
+
f"Flow Name: {flow_name}, Run ID: {run_id}, Step Name: {step_name}, Task ID: {task_id}, Retry Count: {attempt}"
|
|
69
|
+
)
|
|
70
70
|
|
|
71
71
|
if running_invocations:
|
|
72
72
|
for each_invocation in running_invocations:
|
|
73
73
|
ctx.obj.echo(each_invocation)
|
|
74
|
-
else:
|
|
75
|
-
ctx.obj.echo("No running @nvidia invocations for Run ID: %s" % run_id)
|
|
76
74
|
|
|
77
75
|
|
|
78
76
|
@nvidia.command(help="Kill steps / tasks running as an nvidia job.")
|
|
@@ -88,29 +86,17 @@ def kill(ctx, run_id):
|
|
|
88
86
|
HeartbeatStore,
|
|
89
87
|
)
|
|
90
88
|
|
|
91
|
-
|
|
92
|
-
|
|
93
|
-
|
|
94
|
-
|
|
95
|
-
|
|
96
|
-
not each_step.task.finished
|
|
97
|
-
and "nvcf-function-id" in each_step.task.metadata_dict
|
|
98
|
-
):
|
|
99
|
-
task_pathspec = each_step.task.pathspec
|
|
100
|
-
attempt = each_step.task.metadata_dict.get("attempt")
|
|
101
|
-
heartbeat_prefix = "{task_pathspec}/{attempt}".format(
|
|
102
|
-
task_pathspec=task_pathspec, attempt=attempt
|
|
103
|
-
)
|
|
89
|
+
datastore_root = ctx.obj.datastore_impl.datastore_root
|
|
90
|
+
store = HeartbeatStore(
|
|
91
|
+
main_pid=None,
|
|
92
|
+
storage_backend=ctx.obj.datastore_impl(datastore_root),
|
|
93
|
+
)
|
|
104
94
|
|
|
105
|
-
|
|
106
|
-
|
|
107
|
-
|
|
108
|
-
|
|
109
|
-
|
|
110
|
-
)
|
|
111
|
-
store.emit_tombstone(folder_name="nvcf_heartbeats")
|
|
112
|
-
else:
|
|
113
|
-
ctx.obj.echo("No running @nvidia invocations for Run ID: %s" % run_id)
|
|
95
|
+
flow_name = ctx.obj.flow.name
|
|
96
|
+
tombstone_prefix = f"{flow_name}/{run_id}"
|
|
97
|
+
store.emit_tombstone(
|
|
98
|
+
tombstone_prefix=tombstone_prefix, folder_name="nvcf_heartbeats"
|
|
99
|
+
)
|
|
114
100
|
|
|
115
101
|
|
|
116
102
|
@nvidia.command(
|
|
@@ -31,7 +31,7 @@ from metaflow.mflog import (
|
|
|
31
31
|
)
|
|
32
32
|
|
|
33
33
|
from .snowpark_client import SnowparkClient
|
|
34
|
-
from .snowpark_exceptions import SnowparkException
|
|
34
|
+
from .snowpark_exceptions import SnowparkException, SnowparkKilledException
|
|
35
35
|
from .snowpark_job import SnowparkJob
|
|
36
36
|
|
|
37
37
|
# Redirect structured logs to $PWD/.logs/
|
|
@@ -291,7 +291,7 @@ class Snowpark(object):
|
|
|
291
291
|
else:
|
|
292
292
|
if self.job.is_running:
|
|
293
293
|
# Kill the job if it is still running by throwing an exception.
|
|
294
|
-
raise
|
|
294
|
+
raise SnowparkKilledException("Task failed!")
|
|
295
295
|
echo(
|
|
296
296
|
"Task finished with message '%s'." % self.job.message,
|
|
297
297
|
"stderr",
|
|
@@ -2,7 +2,7 @@ from setuptools import setup, find_namespace_packages
|
|
|
2
2
|
from pathlib import Path
|
|
3
3
|
|
|
4
4
|
|
|
5
|
-
version = "1.1.
|
|
5
|
+
version = "1.1.92"
|
|
6
6
|
this_directory = Path(__file__).parent
|
|
7
7
|
long_description = (this_directory / "README.md").read_text()
|
|
8
8
|
|
|
@@ -18,6 +18,6 @@ setup(
|
|
|
18
18
|
install_requires=[
|
|
19
19
|
"boto3",
|
|
20
20
|
"kubernetes",
|
|
21
|
-
"ob-metaflow == 2.12.
|
|
21
|
+
"ob-metaflow == 2.12.20.1",
|
|
22
22
|
],
|
|
23
23
|
)
|
|
@@ -1,61 +0,0 @@
|
|
|
1
|
-
import os
|
|
2
|
-
import sys
|
|
3
|
-
import time
|
|
4
|
-
|
|
5
|
-
from metaflow.exception import MetaflowException
|
|
6
|
-
|
|
7
|
-
|
|
8
|
-
CLIENT_REFRESH_INTERVAL_SECONDS = 300
|
|
9
|
-
|
|
10
|
-
|
|
11
|
-
class KubernetesClientException(MetaflowException):
|
|
12
|
-
headline = "Kubernetes client error"
|
|
13
|
-
|
|
14
|
-
|
|
15
|
-
class KubernetesClient(object):
|
|
16
|
-
def __init__(self):
|
|
17
|
-
try:
|
|
18
|
-
# Kubernetes is a soft dependency.
|
|
19
|
-
from kubernetes import client, config
|
|
20
|
-
except (NameError, ImportError):
|
|
21
|
-
raise KubernetesClientException(
|
|
22
|
-
"Could not import module 'kubernetes'.\n\nInstall Kubernetes "
|
|
23
|
-
"Python package (https://pypi.org/project/kubernetes/) first.\n"
|
|
24
|
-
"You can install the module by executing - "
|
|
25
|
-
"%s -m pip install kubernetes\n"
|
|
26
|
-
"or equivalent through your favorite Python package manager."
|
|
27
|
-
% sys.executable
|
|
28
|
-
)
|
|
29
|
-
self._refresh_client()
|
|
30
|
-
|
|
31
|
-
def _refresh_client(self):
|
|
32
|
-
from metaflow_extensions.outerbounds.plugins.auth_server import get_token
|
|
33
|
-
from kubernetes import client
|
|
34
|
-
|
|
35
|
-
config = client.Configuration()
|
|
36
|
-
token_info = get_token("/generate/k8s")
|
|
37
|
-
config.host = token_info["endpoint"]
|
|
38
|
-
config.api_key["authorization"] = "Bearer " + token_info["token"]
|
|
39
|
-
config.verify_ssl = False # TODO: FIX THIS
|
|
40
|
-
client.Configuration.set_default(config)
|
|
41
|
-
self._client = client
|
|
42
|
-
self._client_refresh_timestamp = time.time()
|
|
43
|
-
|
|
44
|
-
def get(self):
|
|
45
|
-
if (
|
|
46
|
-
time.time() - self._client_refresh_timestamp
|
|
47
|
-
> CLIENT_REFRESH_INTERVAL_SECONDS
|
|
48
|
-
):
|
|
49
|
-
self._refresh_client()
|
|
50
|
-
|
|
51
|
-
return self._client
|
|
52
|
-
|
|
53
|
-
def job(self, **kwargs):
|
|
54
|
-
from metaflow.plugins.kubernetes.kubernetes_job import KubernetesJob
|
|
55
|
-
|
|
56
|
-
return KubernetesJob(self, **kwargs)
|
|
57
|
-
|
|
58
|
-
def jobset(self, **kwargs):
|
|
59
|
-
from metaflow.plugins.kubernetes.kubernetes_job import KubernetesJobSet
|
|
60
|
-
|
|
61
|
-
return KubernetesJobSet(self, **kwargs)
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|