ob-metaflow 2.11.4.9__py2.py3-none-any.whl → 2.11.8.1__py2.py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of ob-metaflow might be problematic.
- metaflow/cli.py +15 -10
- metaflow/clone_util.py +71 -0
- metaflow/cmd/develop/stub_generator.py +2 -0
- metaflow/cmd/develop/stubs.py +17 -8
- metaflow/metaflow_config.py +3 -0
- metaflow/package.py +4 -3
- metaflow/parameters.py +2 -2
- metaflow/plugins/aws/batch/batch.py +12 -0
- metaflow/plugins/aws/batch/batch_cli.py +25 -0
- metaflow/plugins/aws/batch/batch_client.py +40 -0
- metaflow/plugins/aws/batch/batch_decorator.py +32 -1
- metaflow/plugins/aws/step_functions/step_functions.py +3 -0
- metaflow/plugins/datatools/s3/s3op.py +4 -3
- metaflow/plugins/env_escape/client.py +154 -27
- metaflow/plugins/env_escape/client_modules.py +15 -47
- metaflow/plugins/env_escape/configurations/emulate_test_lib/overrides.py +31 -42
- metaflow/plugins/env_escape/configurations/emulate_test_lib/server_mappings.py +8 -3
- metaflow/plugins/env_escape/configurations/test_lib_impl/test_lib.py +74 -22
- metaflow/plugins/env_escape/consts.py +1 -0
- metaflow/plugins/env_escape/exception_transferer.py +46 -112
- metaflow/plugins/env_escape/override_decorators.py +8 -8
- metaflow/plugins/env_escape/server.py +42 -5
- metaflow/plugins/env_escape/stub.py +168 -23
- metaflow/plugins/env_escape/utils.py +3 -3
- metaflow/plugins/gcp/gcp_secret_manager_secrets_provider.py +3 -2
- metaflow/plugins/pypi/conda_environment.py +9 -0
- metaflow/plugins/pypi/pip.py +17 -2
- metaflow/runtime.py +252 -61
- metaflow/sidecar/sidecar.py +11 -1
- metaflow/sidecar/sidecar_subprocess.py +34 -18
- metaflow/task.py +28 -54
- metaflow/version.py +1 -1
- {ob_metaflow-2.11.4.9.dist-info → ob_metaflow-2.11.8.1.dist-info}/METADATA +2 -2
- {ob_metaflow-2.11.4.9.dist-info → ob_metaflow-2.11.8.1.dist-info}/RECORD +38 -37
- {ob_metaflow-2.11.4.9.dist-info → ob_metaflow-2.11.8.1.dist-info}/WHEEL +1 -1
- {ob_metaflow-2.11.4.9.dist-info → ob_metaflow-2.11.8.1.dist-info}/LICENSE +0 -0
- {ob_metaflow-2.11.4.9.dist-info → ob_metaflow-2.11.8.1.dist-info}/entry_points.txt +0 -0
- {ob_metaflow-2.11.4.9.dist-info → ob_metaflow-2.11.8.1.dist-info}/top_level.txt +0 -0
metaflow/plugins/env_escape/utils.py
CHANGED

@@ -13,12 +13,12 @@ def get_methods(class_object):
     for base_class in mros:
         all_attributes.update(base_class.__dict__)
     for name, attribute in all_attributes.items():
-        if ...
-            all_methods[name] = inspect.getdoc(attribute)
-        elif isinstance(attribute, staticmethod):
+        if isinstance(attribute, staticmethod):
            all_methods["___s___%s" % name] = inspect.getdoc(attribute)
         elif isinstance(attribute, classmethod):
             all_methods["___c___%s" % name] = inspect.getdoc(attribute)
+        elif hasattr(attribute, "__call__"):
+            all_methods[name] = inspect.getdoc(attribute)
     return all_methods
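Note on the get_methods change above: staticmethod and classmethod objects wrap their underlying function (and since Python 3.10 staticmethod objects are themselves callable), so they must be recognized before the generic `hasattr(attribute, "__call__")` branch, which in turn now also picks up callables that are not plain functions. A standalone sketch of the dispatch; the `Example` class is illustrative, not from the package:

import inspect

class Example:
    def method(self):
        "an instance method"

    @staticmethod
    def smethod():
        "a static method"

    @classmethod
    def cmethod(cls):
        "a class method"

for name, attr in Example.__dict__.items():
    if isinstance(attr, staticmethod):
        print("___s___%s" % name, "->", inspect.getdoc(attr))
    elif isinstance(attr, classmethod):
        print("___c___%s" % name, "->", inspect.getdoc(attr))
    elif hasattr(attr, "__call__"):
        # plain functions, but also partials and other callable objects
        print(name, "->", inspect.getdoc(attr))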
metaflow/plugins/gcp/gcp_secret_manager_secrets_provider.py
CHANGED

@@ -90,7 +90,7 @@ class GcpSecretManagerSecretsProvider(SecretsProvider):
         # The latter two forms require METAFLOW_GCP_SECRET_MANAGER_PREFIX to be set.

         match_full = re.match(r"^projects/\d+/secrets/([\w\-]+)(/versions/([\w\-]+))?$", secret_id)
-        match_partial = re.match(r"^[\w\-]+(/versions/[\w\-]+)?$", secret_id)
+        match_partial = re.match(r"^([\w\-]+)(/versions/[\w\-]+)?$", secret_id)
         if match_full:
             # Full path
             env_var_name = match_full.group(1)

@@ -107,9 +107,10 @@ class GcpSecretManagerSecretsProvider(SecretsProvider):
                 raise ValueError(
                     f"Cannot use simple secret_id without setting METAFLOW_GCP_SECRET_MANAGER_PREFIX. {GCP_SECRET_MANAGER_PREFIX}"
                 )
-            if match_partial.group(1):
+            if match_partial.group(2):
                 # With version specified
                 full_secret_name = f"{GCP_SECRET_MANAGER_PREFIX}{secret_id}"
+                env_var_name = match_partial.group(1)
             else:
                 # No version specified, use latest
                 full_secret_name = (
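Note on the match_partial change above: the old pattern wrapped only the optional version suffix in a group, so the secret name itself was never captured; the new pattern exposes the name as group(1) and the version suffix as group(2), which the second hunk uses to set env_var_name. A standalone check (the secret ids are made up):

import re

pattern = r"^([\w\-]+)(/versions/[\w\-]+)?$"

m = re.match(pattern, "my-secret/versions/3")
assert m.group(1) == "my-secret" and m.group(2) == "/versions/3"

m = re.match(pattern, "my-secret")
assert m.group(1) == "my-secret" and m.group(2) is None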
metaflow/plugins/pypi/conda_environment.py
CHANGED

@@ -282,6 +282,15 @@ class CondaEnvironment(MetaflowEnvironment):
         # Match PyPI and Conda python versions with the resolved environment Python.
         environment["pypi"]["python"] = environment["conda"]["python"] = env_python

+        # When using `Application Default Credentials` for private GCP
+        # PyPI registries, the usage of environment variable `GOOGLE_APPLICATION_CREDENTIALS`
+        # demands that `keyrings.google-artifactregistry-auth` has to be installed
+        # and available in the underlying python environment.
+        if os.getenv("GOOGLE_APPLICATION_CREDENTIALS"):
+            environment["conda"]["packages"][
+                "keyrings.google-artifactregistry-auth"
+            ] = ">=1.1.1"
+
         # Z combinator for a recursive lambda
         deep_sort = (lambda f: f(f))(
             lambda f: lambda obj: (
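Note on the conda_environment.py hunk above: `keyrings.google-artifactregistry-auth` is the keyring plugin that lets pip resolve Artifact Registry credentials from Application Default Credentials. A minimal sketch of the gating, with a stand-in for the environment spec that the real method assembles elsewhere:

import os

# stand-in for the spec dict built earlier in conda_environment.py
environment = {"conda": {"packages": {}}, "pypi": {"packages": {}}}

if os.getenv("GOOGLE_APPLICATION_CREDENTIALS"):
    # pip can then obtain Artifact Registry tokens via the keyring plugin
    environment["conda"]["packages"]["keyrings.google-artifactregistry-auth"] = ">=1.1.1"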
metaflow/plugins/pypi/pip.py
CHANGED

@@ -92,7 +92,14 @@ class Pip(object):
         # so using @branch as a version acts as expected.
         vcs_info = dl_info.get("vcs_info")
         if vcs_info:
-            ...
+            subdirectory = dl_info.get("subdirectory")
+            res["url"] = "{vcs}+{url}@{commit_id}{subdir_str}".format(
+                **vcs_info,
+                **res,
+                subdir_str="#subdirectory=%s" % subdirectory
+                if subdirectory
+                else ""
+            )
             # used to deduplicate the storage location in case wheel does not
             # build with enough unique identifiers.
             res["hash"] = vcs_info["commit_id"]
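Note on the hunk above: the added code rebuilds a pip-style direct reference for a VCS dependency, appending a `#subdirectory=` fragment when the package lives in a repository subdirectory. A worked example with made-up values:

vcs_info = {"vcs": "git", "commit_id": "abc123"}
res = {"url": "https://github.com/org/repo"}
subdirectory = "python/pkg"

url = "{vcs}+{url}@{commit_id}{subdir_str}".format(
    **vcs_info,
    **res,
    subdir_str="#subdirectory=%s" % subdirectory if subdirectory else ""
)
assert url == "git+https://github.com/org/repo@abc123#subdirectory=python/pkg"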
@@ -270,9 +277,17 @@ class Pip(object):
                 prefix,
                 "pip3",
                 "--disable-pip-version-check",
-                "--no-input",
                 "--no-color",
             ]
+            # credentials are being determined from the JSON file referenced by
+            # the GOOGLE_APPLICATION_CREDENTIALS environment variable and are
+            # probably injected dynamically via `keyrings.google-artifactregistry-auth`
+            # Thus, we avoid passing `--no-input` in this case.
+            + (
+                ["--no-input"]
+                if os.getenv("GOOGLE_APPLICATION_CREDENTIALS") is None
+                else []
+            )
             + (["--isolated"] if isolated else [])
             + args,
             stderr=subprocess.PIPE,
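Note on the hunk above: per the diff's own comment, credentials may be injected dynamically through the keyring plugin, so `--no-input` is now passed only when `GOOGLE_APPLICATION_CREDENTIALS` is unset. A sketch of how the final argument list comes together; `prefix`, `isolated`, and `args` are illustrative stand-ins for the surrounding method's values:

import os

prefix, isolated, args = "/tmp/env", False, ["install", "-r", "requirements.txt"]

cmd = [
    prefix,
    "pip3",
    "--disable-pip-version-check",
    "--no-color",
] + (
    # suppress prompts only when no keyring-based auth may be in play
    ["--no-input"] if os.getenv("GOOGLE_APPLICATION_CREDENTIALS") is None else []
) + (["--isolated"] if isolated else []) + args

print(cmd)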
metaflow/runtime.py
CHANGED

@@ -13,6 +13,7 @@ import subprocess
 from datetime import datetime
 from io import BytesIO
 from functools import partial
+from concurrent import futures

 from metaflow.datastore.exceptions import DataException

@@ -30,6 +31,7 @@ from .debug import debug
 from .decorators import flow_decorators
 from .mflog import mflog, RUNTIME_LOG_SOURCE
 from .util import to_unicode, compress_list, unicode_type
+from .clone_util import clone_task_helper
 from .unbounded_foreach import (
     CONTROL_TASK_TAG,
     UBF_CONTROL,

@@ -46,6 +48,7 @@ PROGRESS_INTERVAL = 300  # s
 # executing a flow. These are prefetched during the resume operation by
 # leveraging the TaskDataStoreSet.
 PREFETCH_DATA_ARTIFACTS = ["_foreach_stack", "_task_ok", "_transition"]
+RESUME_POLL_SECONDS = 60

 # Runtime must use logsource=RUNTIME_LOG_SOURCE for all loglines that it
 # formats according to mflog. See a comment in mflog.__init__

@@ -75,8 +78,8 @@ class NativeRuntime(object):
         max_workers=MAX_WORKERS,
         max_num_splits=MAX_NUM_SPLITS,
         max_log_size=MAX_LOG_SIZE,
+        resume_identifier=None,
     ):
-
         if run_id is None:
             self._run_id = metadata.new_run_id()
         else:

@@ -101,11 +104,13 @@ class NativeRuntime(object):
         self._entrypoint = entrypoint
         self.event_logger = event_logger
         self._monitor = monitor
+        self._resume_identifier = resume_identifier

         self._clone_run_id = clone_run_id
         self._clone_only = clone_only
         self._clone_steps = {} if clone_steps is None else clone_steps
         self._reentrant = reentrant
+        self._run_url = None

         self._origin_ds_set = None
         if clone_run_id:

@@ -184,7 +189,8 @@ class NativeRuntime(object):
             origin_ds_set=self._origin_ds_set,
             decos=decos,
             logger=self._logger,
-            ...
+            resume_identifier=self._resume_identifier,
+            **kwargs,
         )

     @property

@@ -192,25 +198,25 @@ class NativeRuntime(object):
         return self._run_id

     def persist_constants(self, task_id=None):
-        task = self._new_task("_parameters", task_id=task_id)
-        if not task.is_cloned:
-            task.persist(self._flow)
-        self._params_task = task.path
-        self._is_cloned[task.path] = task.is_cloned
+        self._params_task = self._new_task("_parameters", task_id=task_id)
+        if not self._params_task.is_cloned:
+            self._params_task.persist(self._flow)

-        ...
-        ...
+        self._is_cloned[self._params_task.path] = self._params_task.is_cloned
+
+    def print_workflow_info(self):
+        self._run_url = (
             "%s/%s/%s" % (UI_URL.rstrip("/"), self._flow.name, self._run_id)
             if UI_URL
             else None
         )

-        if ...
+        if self._run_url:
             self._logger(
                 "Workflow starting (run-id %s), see it in the UI at %s"
                 % (
                     self._run_id,
-                    ...
+                    self._run_url,
                 ),
                 system_msg=True,
             )

@@ -219,10 +225,88 @@ class NativeRuntime(object):
             "Workflow starting (run-id %s):" % self._run_id, system_msg=True
         )

+    def _should_skip_clone_only_execution(self):
+        if self._clone_only and self._params_task:
+            if self._params_task.resume_done():
+                return True, "Resume already complete. Skip clone-only execution."
+            if not self._params_task.is_resume_leader():
+                return (
+                    True,
+                    "Not resume leader under resume execution. Skip clone-only execution.",
+                )
+        return False, None
+
+    def clone_task(self, step_name, task_id):
+        self._logger(
+            "Cloning task from {}/{}/{}/{} to {}/{}/{}/{}".format(
+                self._flow.name,
+                self._clone_run_id,
+                step_name,
+                task_id,
+                self._flow.name,
+                self._run_id,
+                step_name,
+                task_id,
+            ),
+            system_msg=True,
+        )
+        clone_task_helper(
+            self._flow.name,
+            self._clone_run_id,
+            self._run_id,
+            step_name,
+            task_id,  # origin_task_id
+            task_id,
+            self._flow_datastore,
+            self._metadata,
+            origin_ds_set=self._origin_ds_set,
+        )
+
+    def clone_original_run(self):
+        (
+            should_skip_clone_only_execution,
+            skip_reason,
+        ) = self._should_skip_clone_only_execution()
+        if should_skip_clone_only_execution:
+            self._logger(skip_reason, system_msg=True)
+            return
+        self._metadata.start_run_heartbeat(self._flow.name, self._run_id)
+        self._logger(
+            "Start cloning original run: {}/{}".format(
+                self._flow.name, self._clone_run_id
+            ),
+            system_msg=True,
+        )
+
+        inputs = []
+
+        for task_ds in self._origin_ds_set:
+            _, step_name, task_id = task_ds.pathspec.split("/")
+            if task_ds["_task_ok"] and step_name != "_parameters":
+                inputs.append((step_name, task_id))
+
+        with futures.ThreadPoolExecutor(max_workers=self._max_workers) as executor:
+            all_tasks = [
+                executor.submit(self.clone_task, step_name, task_id)
+                for (step_name, task_id) in inputs
+            ]
+            _, _ = futures.wait(all_tasks)
+        self._logger("Cloning original run is done", system_msg=True)
+        self._params_task.mark_resume_done()
+        self._metadata.stop_heartbeat()
+
+    def execute(self):
+        (
+            should_skip_clone_only_execution,
+            skip_reason,
+        ) = self._should_skip_clone_only_execution()
+        if should_skip_clone_only_execution:
+            self._logger(skip_reason, system_msg=True)
+            return
         self._metadata.start_run_heartbeat(self._flow.name, self._run_id)

         if self._params_task:
-            self._queue_push("start", {"input_paths": [self._params_task]})
+            self._queue_push("start", {"input_paths": [self._params_task.path]})
         else:
             self._queue_push("start", {})
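Note on clone_original_run above: every successfully finished (step, task) pair of the original run is cloned in parallel, and the runtime blocks until the whole batch completes before marking the resume done. A self-contained sketch of that fan-out pattern; the clone_task body and the inputs are stand-ins:

from concurrent import futures

def clone_task(step_name, task_id):
    # stand-in for NativeRuntime.clone_task, which delegates to clone_task_helper
    print("cloning %s/%s" % (step_name, task_id))

inputs = [("start", "1"), ("train", "2"), ("end", "3")]  # illustrative pairs

with futures.ThreadPoolExecutor(max_workers=4) as executor:
    all_tasks = [
        executor.submit(clone_task, step_name, task_id)
        for step_name, task_id in inputs
    ]
    futures.wait(all_tasks)  # block until the whole batch is cloned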
@@ -303,9 +387,9 @@ class NativeRuntime(object):

         # assert that end was executed and it was successful
         if ("end", ()) in self._finished:
-            if ...
+            if self._run_url:
                 self._logger(
-                    "Done! See the run in the UI at %s" % ...
+                    "Done! See the run in the UI at %s" % self._run_url,
                     system_msg=True,
                 )
             else:

@@ -316,6 +400,7 @@ class NativeRuntime(object):
                 "cloned; no new tasks executed!",
                 system_msg=True,
             )
+            self._params_task.mark_resume_done()
         else:
             raise MetaflowInternalError(
                 "The *end* step was not successful by the end of flow."

@@ -711,6 +796,7 @@ class Task(object):
         ubf_iter=None,
         join_type=None,
         task_id=None,
+        resume_identifier=None,
     ):

         self.step = step
@@ -749,32 +835,38 @@ class Task(object):
         self.datastore_sysroot = flow_datastore.datastore_root
         self._results_ds = None

+        # Only used in clone-only resume.
+        self._is_resume_leader = None
+        self._resume_done = None
+        self._resume_identifier = resume_identifier
+
         origin = None
         if clone_run_id and may_clone:
             origin = self._find_origin_task(clone_run_id, join_type)
         if origin and origin["_task_ok"]:
             # At this point, we know we are going to clone
             self._is_cloned = True
+
+            task_id_exists_already = False
+            task_completed = False
             if reentrant:
                 # A re-entrant clone basically allows multiple concurrent processes
                 # to perform the clone at the same time to the same new run id. Let's
                 # assume two processes A and B both simultaneously calling
                 # `resume --reentrant --run-id XX`.
-                # ...
-                # - ...
-                # ...
-                # ...
-                # ...
-                # and also guarantees that we only write once to the datastore and
-                # metadata.
+                # We want to guarantee that:
+                # - All incomplete tasks are cloned exactly once.
+                # To achieve this, we will select a resume leader and let it clone the
+                # entire execution graph. This ensures that we only write once to the
+                # datastore and metadata.
                 #
-                # ...
-                # ...
-                # ...
-                # ...
-                # ...
+                # We use the cloned _parameter task's task-id as the "key" to synchronize
+                # on. We try to "register" this new task-id (or rather the full pathspec
+                # <run>/<step>/<taskid>) with the metadata service which will indicate
+                # if we actually registered it or if it existed already. If we did manage
+                # to register it (_parameter task), we are the "elected resume leader"
                 # in essence and proceed to clone. If we didn't, we just wait to make
-                # sure the ...
+                # sure the entire clone execution is fully done (ie: the clone is finished).
                 if task_id is not None:
                     # Sanity check -- this should never happen. We cannot allow
                     # for explicit task-ids because in the reentrant case, we use the

@@ -783,6 +875,15 @@ class Task(object):
                         "Reentrant clone-only resume does not allow for explicit task-id"
                     )

+                if resume_identifier:
+                    self.log(
+                        "Resume identifier is %s." % resume_identifier,
+                        system_msg=True,
+                    )
+                else:
+                    raise MetaflowInternalError(
+                        "Reentrant clone-only resume needs a resume identifier."
+                    )
                 # We will use the same task_id as the original task
                 # to use it effectively as a synchronization key
                 clone_task_id = origin.task_id

@@ -798,56 +899,124 @@ class Task(object):

                 # If _get_task_id returns True it means the task already existed, so
                 # we wait for it.
-                ...
+                task_id_exists_already = self._get_task_id(clone_task_id)
+
+                # We may not have access to task datastore on first resume attempt, but
+                # on later resume attempt, we should check if the resume task is complete
+                # or not. This is to fix the issue where the resume leader was killed
+                # unexpectedly during cloning and never mark task complete.
+                try:
+                    task_completed = self.results["_task_ok"]
+                except DataException as e:
+                    pass
             else:
-                self._wait_for_clone = False
                 self._get_task_id(task_id)

             # Store the mapping from current_pathspec -> origin_pathspec which
             # will be useful for looking up origin_ds_set in find_origin_task.
             self.clone_pathspec_mapping[self._path] = origin.pathspec
             if self.step == "_parameters":
-                # ...
-                ...
-                ...
+                # In the _parameters task, we need to resolve who is the resume leader.
+                self._is_resume_leader = False
+                resume_leader = None
+
+                if task_id_exists_already:
+                    # If the task id already exists, we need to check if current task is the resume leader in previous attempt.
+                    ds = self._flow_datastore.get_task_datastore(
+                        self.run_id, self.step, self.task_id
+                    )
+                    if not ds["_task_ok"]:
+                        raise MetaflowInternalError(
+                            "Externally cloned _parameters task did not succeed"
+                        )
+
+                    # Check if we should be the resume leader (maybe from previous attempt).
+                    # To avoid the edge case where the resume leader is selected but has not
+                    # yet written the _resume_leader metadata, we will wait for a few seconds.
+                    # We will wait for resume leader for at most 3 times.
+                    for resume_leader_wait_retry in range(3):
+                        if ds.has_metadata("_resume_leader", add_attempt=False):
+                            resume_leader = ds.load_metadata(
+                                ["_resume_leader"], add_attempt=False
+                            )["_resume_leader"]
+                            self._is_resume_leader = resume_leader == resume_identifier
+                        else:
+                            self.log(
+                                "Waiting for resume leader to be selected. Sleeping ...",
+                                system_msg=True,
+                            )
+                            time.sleep(3)
+                else:
+                    # If the task id does not exist, current task is the resume leader.
+                    resume_leader = resume_identifier
+                    self._is_resume_leader = True
+
+                if reentrant:
+                    if resume_leader:
+                        self.log(
+                            "Resume leader is %s." % resume_leader,
+                            system_msg=True,
+                        )
+                    else:
+                        raise MetaflowInternalError(
+                            "Can not determine the resume leader in distributed resume mode."
+                        )
+
+                if self._is_resume_leader:
+                    self.log(
+                        "Selected as the reentrant clone leader.",
+                        system_msg=True,
+                    )
                     # Clone in place without relying on run_queue.
                     self.new_attempt()
                     self._ds.clone(origin)
+                    # Set the resume leader be the task that calls the resume (first task to clone _parameters task).
+                    # We will always set resume leader regardless whether we are in distributed resume case or not.
+                    if resume_identifier:
+                        self._ds.save_metadata(
+                            {"_resume_leader": resume_identifier}, add_attempt=False
+                        )
+
                     self._ds.done()
                 else:
-                    # ...
-                    # clone_only function here
-                    self.log(
-                        "Waiting for clone of _parameters step to occur...",
-                        system_msg=True,
-                    )
+                    # Wait for the resume leader to complete
                     while True:
-                        ...
+                        ds = self._flow_datastore.get_task_datastore(
+                            self.run_id, self.step, self.task_id
+                        )
+
+                        # Check if resume is complete. Resume leader will write the done file.
+                        self._resume_done = ds.has_metadata(
+                            "_resume_done", add_attempt=False
+                        )
+
+                        if self._resume_done:
                             break
-                        ...
+
+                        self.log(
+                            "Waiting for resume leader to complete. Sleeping for %ds..."
+                            % RESUME_POLL_SECONDS,
+                            system_msg=True,
+                        )
+                        time.sleep(RESUME_POLL_SECONDS)
+                    self.log(
+                        "_parameters clone completed by resume leader", system_msg=True
+                    )
             else:
-                # ...
+                # Only leader can reach non-parameter steps in resume.
+
                 # Store the origin pathspec in clone_origin so this can be run
                 # as a task by the runtime.
                 self.clone_origin = origin.pathspec
                 # Save a call to creating the results_ds since its same as origin.
                 self._results_ds = origin
-
+
+                # If the task is already completed in new run, we don't need to clone it.
+                self._should_skip_cloning = task_completed
+                if self._should_skip_cloning:
                     self.log(
-                        "..."
-                        "of a previously run task %s (this may take some time)"
-                        % self.clone_origin,
+                        "Skip cloning of previously run task %s" % self.clone_origin,
                         system_msg=True,
                     )
                 else:
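Note on the reentrant-resume hunk above: leadership is decided by whoever registers the cloned _parameters task-id first, and everyone else learns the leader's identity from the `_resume_leader` metadata entry, which may lag slightly behind the registration. A schematic of that election, with the datastore and metadata calls stubbed out as plain callables:

import time

def elect_leader(register_task_id, read_leader, resume_identifier, retries=3):
    # register_task_id() returns True if we were first to register the cloned
    # _parameters task-id, False if another process registered it already.
    if register_task_id():
        return True  # we registered first, so we lead the clone
    for _ in range(retries):
        leader = read_leader()  # the "_resume_leader" metadata entry, or None
        if leader is not None:
            return leader == resume_identifier
        time.sleep(3)  # the leader may not have written its identity yet
    raise RuntimeError("resume leader never appeared")

# first caller wins; a later caller with a different identifier is a follower
assert elect_leader(lambda: True, lambda: None, "proc-A") is True
assert elect_leader(lambda: False, lambda: "proc-A", "proc-B") is False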
@@ -916,6 +1085,31 @@ class Task(object):
         self._logger(msg, head=prefix, system_msg=system_msg, timestamp=timestamp)
         sys.stdout.flush()

+    def is_resume_leader(self):
+        assert (
+            self.step == "_parameters"
+        ), "Only _parameters step can check resume leader."
+        return self._is_resume_leader
+
+    def resume_done(self):
+        assert (
+            self.step == "_parameters"
+        ), "Only _parameters step can check wheather resume is complete."
+        return self._resume_done
+
+    def mark_resume_done(self):
+        assert (
+            self.step == "_parameters"
+        ), "Only _parameters step can mark resume as done."
+        assert self.is_resume_leader(), "Only resume leader can mark resume as done."
+
+        # Mark the resume as done. This is called at the end of the resume flow and after
+        # the _parameters step was successfully cloned, so we need to 'dangerously' save
+        # this done file, but the risk should be minimal.
+        self._ds._dangerous_save_metadata_post_done(
+            {"_resume_done": True}, add_attempt=False
+        )
+
     def _get_task_id(self, task_id):
         already_existed = True
         if self.ubf_context == UBF_CONTROL:
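Note on mark_resume_done above: `_resume_done` is the terminal marker of the handshake; non-leader processes simply poll for it, as in the while-loop of the earlier hunk. The follower side reduces to this sketch, where `has_metadata` stands in for `ds.has_metadata(..., add_attempt=False)`:

import time

RESUME_POLL_SECONDS = 60  # mirrors the constant added at the top of runtime.py

def wait_for_resume_done(has_metadata, poll_seconds=RESUME_POLL_SECONDS):
    # flips to True once the resume leader calls mark_resume_done()
    while not has_metadata("_resume_done"):
        time.sleep(poll_seconds)

wait_for_resume_done(lambda key: True)  # returns immediately in this stub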
@@ -1040,8 +1234,8 @@ class Task(object):
         return self._is_cloned

     @property
-    def wait_for_clone(self):
-        return self._wait_for_clone
+    def should_skip_cloning(self):
+        return self._should_skip_cloning

     def persist(self, flow):
         # this is used to persist parameters before the start step

@@ -1185,7 +1379,6 @@ class CLIArgs(object):

 class Worker(object):
     def __init__(self, task, max_logs_size):
-
         self.task = task
         self._proc = self._launch()

@@ -1229,8 +1422,6 @@ class Worker(object):
             # disabling sidecars for cloned tasks due to perf reasons
             args.top_level_options["event-logger"] = "nullSidecarLogger"
             args.top_level_options["monitor"] = "nullSidecarMonitor"
-            if self.task.wait_for_clone:
-                args.command_options["clone-wait-only"] = True
         else:
             # decorators may modify the CLIArgs object in-place
             for deco in self.task.decos:
metaflow/sidecar/sidecar.py
CHANGED

@@ -13,14 +13,24 @@ class Sidecar(object):
         if t is not None and t.get_worker() is not None:
             self._has_valid_worker = True
         self.sidecar_process = None
+        # Whether to send msg in a thread-safe fashion.
+        self._threadsafe_send_enabled = False

     def start(self):
         if not self.is_active and self._has_valid_worker:
             self.sidecar_process = SidecarSubProcess(self._sidecar_type)

+    def enable_threadsafe_send(self):
+        self._threadsafe_send_enabled = True
+
+    def disable_threadsafe_send(self):
+        self._threadsafe_send_enabled = False
+
     def send(self, msg):
         if self.is_active:
-            self.sidecar_process.send(msg)
+            self.sidecar_process.send(
+                msg, thread_safe_send=self._threadsafe_send_enabled
+            )

     def terminate(self):
         if self.is_active: