ob-metaflow 2.11.4.9__py2.py3-none-any.whl → 2.11.8.1__py2.py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of ob-metaflow might be problematic.

Files changed (38)
  1. metaflow/cli.py +15 -10
  2. metaflow/clone_util.py +71 -0
  3. metaflow/cmd/develop/stub_generator.py +2 -0
  4. metaflow/cmd/develop/stubs.py +17 -8
  5. metaflow/metaflow_config.py +3 -0
  6. metaflow/package.py +4 -3
  7. metaflow/parameters.py +2 -2
  8. metaflow/plugins/aws/batch/batch.py +12 -0
  9. metaflow/plugins/aws/batch/batch_cli.py +25 -0
  10. metaflow/plugins/aws/batch/batch_client.py +40 -0
  11. metaflow/plugins/aws/batch/batch_decorator.py +32 -1
  12. metaflow/plugins/aws/step_functions/step_functions.py +3 -0
  13. metaflow/plugins/datatools/s3/s3op.py +4 -3
  14. metaflow/plugins/env_escape/client.py +154 -27
  15. metaflow/plugins/env_escape/client_modules.py +15 -47
  16. metaflow/plugins/env_escape/configurations/emulate_test_lib/overrides.py +31 -42
  17. metaflow/plugins/env_escape/configurations/emulate_test_lib/server_mappings.py +8 -3
  18. metaflow/plugins/env_escape/configurations/test_lib_impl/test_lib.py +74 -22
  19. metaflow/plugins/env_escape/consts.py +1 -0
  20. metaflow/plugins/env_escape/exception_transferer.py +46 -112
  21. metaflow/plugins/env_escape/override_decorators.py +8 -8
  22. metaflow/plugins/env_escape/server.py +42 -5
  23. metaflow/plugins/env_escape/stub.py +168 -23
  24. metaflow/plugins/env_escape/utils.py +3 -3
  25. metaflow/plugins/gcp/gcp_secret_manager_secrets_provider.py +3 -2
  26. metaflow/plugins/pypi/conda_environment.py +9 -0
  27. metaflow/plugins/pypi/pip.py +17 -2
  28. metaflow/runtime.py +252 -61
  29. metaflow/sidecar/sidecar.py +11 -1
  30. metaflow/sidecar/sidecar_subprocess.py +34 -18
  31. metaflow/task.py +28 -54
  32. metaflow/version.py +1 -1
  33. {ob_metaflow-2.11.4.9.dist-info → ob_metaflow-2.11.8.1.dist-info}/METADATA +2 -2
  34. {ob_metaflow-2.11.4.9.dist-info → ob_metaflow-2.11.8.1.dist-info}/RECORD +38 -37
  35. {ob_metaflow-2.11.4.9.dist-info → ob_metaflow-2.11.8.1.dist-info}/WHEEL +1 -1
  36. {ob_metaflow-2.11.4.9.dist-info → ob_metaflow-2.11.8.1.dist-info}/LICENSE +0 -0
  37. {ob_metaflow-2.11.4.9.dist-info → ob_metaflow-2.11.8.1.dist-info}/entry_points.txt +0 -0
  38. {ob_metaflow-2.11.4.9.dist-info → ob_metaflow-2.11.8.1.dist-info}/top_level.txt +0 -0

metaflow/plugins/env_escape/utils.py CHANGED
@@ -13,12 +13,12 @@ def get_methods(class_object):
     for base_class in mros:
         all_attributes.update(base_class.__dict__)
     for name, attribute in all_attributes.items():
-        if hasattr(attribute, "__call__"):
-            all_methods[name] = inspect.getdoc(attribute)
-        elif isinstance(attribute, staticmethod):
+        if isinstance(attribute, staticmethod):
             all_methods["___s___%s" % name] = inspect.getdoc(attribute)
         elif isinstance(attribute, classmethod):
             all_methods["___c___%s" % name] = inspect.getdoc(attribute)
+        elif hasattr(attribute, "__call__"):
+            all_methods[name] = inspect.getdoc(attribute)
     return all_methods

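Why the branch order matters: starting with Python 3.10, staticmethod objects are themselves callable, so the old hasattr(attribute, "__call__") branch matched first and misclassified static methods as plain methods. A minimal sketch of the distinction (class and method names are illustrative):

import inspect

class Example:
    @staticmethod
    def s():
        "a static method"

# Raw class __dict__ entries are staticmethod objects, not plain functions.
attr = Example.__dict__["s"]

print(isinstance(attr, staticmethod))  # True on all Python versions
print(hasattr(attr, "__call__"))       # True on Python >= 3.10 only
print(inspect.getdoc(attr))            # 'a static method'
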
metaflow/plugins/gcp/gcp_secret_manager_secrets_provider.py CHANGED
@@ -90,7 +90,7 @@ class GcpSecretManagerSecretsProvider(SecretsProvider):
         # The latter two forms require METAFLOW_GCP_SECRET_MANAGER_PREFIX to be set.

         match_full = re.match(r"^projects/\d+/secrets/([\w\-]+)(/versions/([\w\-]+))?$", secret_id)
-        match_partial = re.match(r"^[\w\-]+(/versions/[\w\-]+)?$", secret_id)
+        match_partial = re.match(r"^([\w\-]+)(/versions/[\w\-]+)?$", secret_id)
         if match_full:
             # Full path
             env_var_name = match_full.group(1)
@@ -107,9 +107,10 @@ class GcpSecretManagerSecretsProvider(SecretsProvider):
             raise ValueError(
                 f"Cannot use simple secret_id without setting METAFLOW_GCP_SECRET_MANAGER_PREFIX. {GCP_SECRET_MANAGER_PREFIX}"
             )
-        if match_partial.group(1):
+        if match_partial.group(2):
            # With version specified
            full_secret_name = f"{GCP_SECRET_MANAGER_PREFIX}{secret_id}"
+           env_var_name = match_partial.group(1)
        else:
            # No version specified, use latest
            full_secret_name = (
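For context on the two fixes above: the old partial pattern had no capture group around the secret name, so group(1) referred to the optional /versions/... suffix and the name itself could not be recovered from a group. The new grouping frees group(1) for the name and group(2) for the suffix. A quick sketch of the fixed behavior (the secret name is made up):

import re

pattern = r"^([\w\-]+)(/versions/[\w\-]+)?$"

m = re.match(pattern, "my-secret/versions/3")
print(m.group(1))  # 'my-secret'    -> used as the env var name
print(m.group(2))  # '/versions/3'  -> a version is pinned

m = re.match(pattern, "my-secret")
print(m.group(2))  # None -> fall through to the 'latest' branch
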
metaflow/plugins/pypi/conda_environment.py CHANGED
@@ -282,6 +282,15 @@ class CondaEnvironment(MetaflowEnvironment):
         # Match PyPI and Conda python versions with the resolved environment Python.
         environment["pypi"]["python"] = environment["conda"]["python"] = env_python

+        # When using `Application Default Credentials` for private GCP PyPI
+        # registries, use of the environment variable GOOGLE_APPLICATION_CREDENTIALS
+        # demands that `keyrings.google-artifactregistry-auth` be installed and
+        # available in the underlying Python environment.
+        if os.getenv("GOOGLE_APPLICATION_CREDENTIALS"):
+            environment["conda"]["packages"][
+                "keyrings.google-artifactregistry-auth"
+            ] = ">=1.1.1"
+
         # Z combinator for a recursive lambda
         deep_sort = (lambda f: f(f))(
             lambda f: lambda obj: (
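As a hedged illustration of the mechanism described in the comment above (the index URL is made up): with the keyring plugin installed, a credential lookup against an Artifact Registry index resolves to a short-lived token derived from Application Default Credentials; pip performs the same lookup when an index answers with 401:

import os
import keyring  # keyrings.google-artifactregistry-auth registers a backend here

index_url = "https://us-python.pkg.dev/my-project/my-repo/simple/"  # hypothetical

if os.getenv("GOOGLE_APPLICATION_CREDENTIALS"):
    cred = keyring.get_credential(index_url, None)
    if cred is not None:
        # By convention the plugin returns the fixed username
        # "oauth2accesstoken" plus a short-lived ADC-derived token.
        print(cred.username)
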
metaflow/plugins/pypi/pip.py CHANGED
@@ -92,7 +92,14 @@ class Pip(object):
             # so using @branch as a version acts as expected.
             vcs_info = dl_info.get("vcs_info")
             if vcs_info:
-                res["url"] = "{vcs}+{url}@{commit_id}".format(**vcs_info, **res)
+                subdirectory = dl_info.get("subdirectory")
+                res["url"] = "{vcs}+{url}@{commit_id}{subdir_str}".format(
+                    **vcs_info,
+                    **res,
+                    subdir_str="#subdirectory=%s" % subdirectory
+                    if subdirectory
+                    else ""
+                )
                 # used to deduplicate the storage location in case wheel does not
                 # build with enough unique identifiers.
                 res["hash"] = vcs_info["commit_id"]
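The net effect is a pip-style VCS requirement URL carrying an optional #subdirectory= fragment, which pip uses to locate a package that does not live at the repository root. A small sketch with made-up values:

vcs_info = {"vcs": "git", "url": "https://github.com/org/repo", "commit_id": "abc123"}
subdirectory = "packages/core"  # illustrative; taken from dl_info in the real code

subdir_str = "#subdirectory=%s" % subdirectory if subdirectory else ""
url = "{vcs}+{url}@{commit_id}{subdir_str}".format(**vcs_info, subdir_str=subdir_str)
print(url)  # git+https://github.com/org/repo@abc123#subdirectory=packages/core
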
@@ -270,9 +277,17 @@ class Pip(object):
                 prefix,
                 "pip3",
                 "--disable-pip-version-check",
-                "--no-input",
                 "--no-color",
             ]
+            # Credentials are determined from the JSON file referenced by
+            # the GOOGLE_APPLICATION_CREDENTIALS environment variable and are
+            # probably injected dynamically via `keyrings.google-artifactregistry-auth`.
+            # Thus, we avoid passing `--no-input` in this case.
+            + (
+                ["--no-input"]
+                if os.getenv("GOOGLE_APPLICATION_CREDENTIALS") is None
+                else []
+            )
             + (["--isolated"] if isolated else [])
             + args,
             stderr=subprocess.PIPE,
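The rationale, as I read it: pip consults keyring backends on the code path where it would otherwise prompt for credentials, and --no-input suppresses that path, which would break Artifact Registry auth. A condensed sketch of the resulting flag assembly (the helper name is hypothetical):

import os

def build_pip_flags(isolated=False):
    flags = ["--disable-pip-version-check", "--no-color"]
    # Keep prompting enabled when ADC is in play so pip can fall through to
    # the keyring lookup instead of failing the 401 outright.
    if os.getenv("GOOGLE_APPLICATION_CREDENTIALS") is None:
        flags.append("--no-input")
    if isolated:
        flags.append("--isolated")
    return flags

print(build_pip_flags())
# ['--disable-pip-version-check', '--no-color', '--no-input']
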
metaflow/runtime.py CHANGED
@@ -13,6 +13,7 @@ import subprocess
 from datetime import datetime
 from io import BytesIO
 from functools import partial
+from concurrent import futures

 from metaflow.datastore.exceptions import DataException
@@ -30,6 +31,7 @@ from .debug import debug
 from .decorators import flow_decorators
 from .mflog import mflog, RUNTIME_LOG_SOURCE
 from .util import to_unicode, compress_list, unicode_type
+from .clone_util import clone_task_helper
 from .unbounded_foreach import (
     CONTROL_TASK_TAG,
     UBF_CONTROL,
@@ -46,6 +48,7 @@ PROGRESS_INTERVAL = 300  # s
 # executing a flow. These are prefetched during the resume operation by
 # leveraging the TaskDataStoreSet.
 PREFETCH_DATA_ARTIFACTS = ["_foreach_stack", "_task_ok", "_transition"]
+RESUME_POLL_SECONDS = 60

 # Runtime must use logsource=RUNTIME_LOG_SOURCE for all loglines that it
 # formats according to mflog. See a comment in mflog.__init__
@@ -75,8 +78,8 @@ class NativeRuntime(object):
         max_workers=MAX_WORKERS,
         max_num_splits=MAX_NUM_SPLITS,
         max_log_size=MAX_LOG_SIZE,
+        resume_identifier=None,
     ):
-
         if run_id is None:
             self._run_id = metadata.new_run_id()
         else:
@@ -101,11 +104,13 @@ class NativeRuntime(object):
         self._entrypoint = entrypoint
         self.event_logger = event_logger
         self._monitor = monitor
+        self._resume_identifier = resume_identifier

         self._clone_run_id = clone_run_id
         self._clone_only = clone_only
         self._clone_steps = {} if clone_steps is None else clone_steps
         self._reentrant = reentrant
+        self._run_url = None

         self._origin_ds_set = None
         if clone_run_id:
@@ -184,7 +189,8 @@ class NativeRuntime(object):
             origin_ds_set=self._origin_ds_set,
             decos=decos,
             logger=self._logger,
-            **kwargs
+            resume_identifier=self._resume_identifier,
+            **kwargs,
         )

     @property
@@ -192,25 +198,25 @@ class NativeRuntime(object):
         return self._run_id

     def persist_constants(self, task_id=None):
-        task = self._new_task("_parameters", task_id=task_id)
-        if not task.is_cloned:
-            task.persist(self._flow)
-        self._params_task = task.path
-        self._is_cloned[task.path] = task.is_cloned
+        self._params_task = self._new_task("_parameters", task_id=task_id)
+        if not self._params_task.is_cloned:
+            self._params_task.persist(self._flow)

-    def execute(self):
-        run_url = (
+        self._is_cloned[self._params_task.path] = self._params_task.is_cloned
+
+    def print_workflow_info(self):
+        self._run_url = (
             "%s/%s/%s" % (UI_URL.rstrip("/"), self._flow.name, self._run_id)
             if UI_URL
             else None
         )

-        if run_url:
+        if self._run_url:
             self._logger(
                 "Workflow starting (run-id %s), see it in the UI at %s"
                 % (
                     self._run_id,
-                    run_url,
+                    self._run_url,
                 ),
                 system_msg=True,
             )
@@ -219,10 +225,88 @@ class NativeRuntime(object):
                 "Workflow starting (run-id %s):" % self._run_id, system_msg=True
             )

+    def _should_skip_clone_only_execution(self):
+        if self._clone_only and self._params_task:
+            if self._params_task.resume_done():
+                return True, "Resume already complete. Skip clone-only execution."
+            if not self._params_task.is_resume_leader():
+                return (
+                    True,
+                    "Not resume leader under resume execution. Skip clone-only execution.",
+                )
+        return False, None
+
+    def clone_task(self, step_name, task_id):
+        self._logger(
+            "Cloning task from {}/{}/{}/{} to {}/{}/{}/{}".format(
+                self._flow.name,
+                self._clone_run_id,
+                step_name,
+                task_id,
+                self._flow.name,
+                self._run_id,
+                step_name,
+                task_id,
+            ),
+            system_msg=True,
+        )
+        clone_task_helper(
+            self._flow.name,
+            self._clone_run_id,
+            self._run_id,
+            step_name,
+            task_id,  # origin_task_id
+            task_id,
+            self._flow_datastore,
+            self._metadata,
+            origin_ds_set=self._origin_ds_set,
+        )
+
+    def clone_original_run(self):
+        (
+            should_skip_clone_only_execution,
+            skip_reason,
+        ) = self._should_skip_clone_only_execution()
+        if should_skip_clone_only_execution:
+            self._logger(skip_reason, system_msg=True)
+            return
+        self._metadata.start_run_heartbeat(self._flow.name, self._run_id)
+        self._logger(
+            "Start cloning original run: {}/{}".format(
+                self._flow.name, self._clone_run_id
+            ),
+            system_msg=True,
+        )
+
+        inputs = []
+
+        for task_ds in self._origin_ds_set:
+            _, step_name, task_id = task_ds.pathspec.split("/")
+            if task_ds["_task_ok"] and step_name != "_parameters":
+                inputs.append((step_name, task_id))
+
+        with futures.ThreadPoolExecutor(max_workers=self._max_workers) as executor:
+            all_tasks = [
+                executor.submit(self.clone_task, step_name, task_id)
+                for (step_name, task_id) in inputs
+            ]
+            _, _ = futures.wait(all_tasks)
+        self._logger("Cloning original run is done", system_msg=True)
+        self._params_task.mark_resume_done()
+        self._metadata.stop_heartbeat()
+
+    def execute(self):
+        (
+            should_skip_clone_only_execution,
+            skip_reason,
+        ) = self._should_skip_clone_only_execution()
+        if should_skip_clone_only_execution:
+            self._logger(skip_reason, system_msg=True)
+            return
         self._metadata.start_run_heartbeat(self._flow.name, self._run_id)

         if self._params_task:
-            self._queue_push("start", {"input_paths": [self._params_task]})
+            self._queue_push("start", {"input_paths": [self._params_task.path]})
         else:
             self._queue_push("start", {})

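The fan-out in clone_original_run is plain concurrent.futures: every successful non-parameter task of the origin run is cloned on a thread pool, and the leader blocks until all clones finish before writing the done marker. A self-contained sketch of the same pattern (the clone body is a stand-in):

from concurrent import futures

def clone_task(step_name, task_id):
    # Stand-in for NativeRuntime.clone_task: the real body copies datastore
    # artifacts and registers metadata for <new-run>/<step>/<task_id>.
    print("cloning %s/%s" % (step_name, task_id))

inputs = [("start", "1"), ("train", "2"), ("end", "3")]  # illustrative tasks

with futures.ThreadPoolExecutor(max_workers=4) as executor:
    tasks = [executor.submit(clone_task, s, t) for s, t in inputs]
    futures.wait(tasks)  # block until every clone has finished
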
@@ -303,9 +387,9 @@ class NativeRuntime(object):

         # assert that end was executed and it was successful
         if ("end", ()) in self._finished:
-            if run_url:
+            if self._run_url:
                 self._logger(
-                    "Done! See the run in the UI at %s" % run_url,
+                    "Done! See the run in the UI at %s" % self._run_url,
                     system_msg=True,
                 )
             else:
@@ -316,6 +400,7 @@ class NativeRuntime(object):
                 "cloned; no new tasks executed!",
                 system_msg=True,
             )
+            self._params_task.mark_resume_done()
         else:
             raise MetaflowInternalError(
                 "The *end* step was not successful by the end of flow."
@@ -711,6 +796,7 @@ class Task(object):
         ubf_iter=None,
         join_type=None,
         task_id=None,
+        resume_identifier=None,
     ):

         self.step = step
@@ -749,32 +835,38 @@ class Task(object):
         self.datastore_sysroot = flow_datastore.datastore_root
         self._results_ds = None

+        # Only used in clone-only resume.
+        self._is_resume_leader = None
+        self._resume_done = None
+        self._resume_identifier = resume_identifier
+
         origin = None
         if clone_run_id and may_clone:
             origin = self._find_origin_task(clone_run_id, join_type)
             if origin and origin["_task_ok"]:
                 # At this point, we know we are going to clone
                 self._is_cloned = True
+
+                task_id_exists_already = False
+                task_completed = False
                 if reentrant:
                     # A re-entrant clone basically allows multiple concurrent processes
                     # to perform the clone at the same time to the same new run id. Let's
                     # assume two processes A and B both simultaneously calling
                     # `resume --reentrant --run-id XX`.
-                    # For each task that is cloned, we want to guarantee that:
-                    #  - one and only one of A or B will do the actual cloning
-                    #  - the other process (or other processes) will block until the cloning
-                    #    is complete.
-                    # This ensures that the rest of the clone algorithm can proceed as normal
-                    # and also guarantees that we only write once to the datastore and
-                    # metadata.
+                    # We want to guarantee that:
+                    #  - All incomplete tasks are cloned exactly once.
+                    # To achieve this, we will select a resume leader and let it clone the
+                    # entire execution graph. This ensures that we only write once to the
+                    # datastore and metadata.
                     #
-                    # To accomplish this, we use the cloned task's task-id as the "key" to
-                    # synchronize on. We then try to "register" this new task-id (or rather
-                    # the full pathspec <run>/<step>/<taskid>) with the metadata service
-                    # which will indicate if we actually registered it or if it existed
-                    # already. If we did manage to register it, we are the "elected cloner"
+                    # We use the cloned _parameters task's task-id as the "key" to synchronize
+                    # on. We try to "register" this new task-id (or rather the full pathspec
+                    # <run>/<step>/<taskid>) with the metadata service which will indicate
+                    # if we actually registered it or if it existed already. If we did manage
+                    # to register it (_parameters task), we are the "elected resume leader"
                     # in essence and proceed to clone. If we didn't, we just wait to make
-                    # sure the task is fully done (ie: the clone is finished).
+                    # sure the entire clone execution is fully done (ie: the clone is finished).
                     if task_id is not None:
                         # Sanity check -- this should never happen. We cannot allow
                         # for explicit task-ids because in the reentrant case, we use the
@@ -783,6 +875,15 @@ class Task(object):
                         "Reentrant clone-only resume does not allow for explicit task-id"
                     )

+                    if resume_identifier:
+                        self.log(
+                            "Resume identifier is %s." % resume_identifier,
+                            system_msg=True,
+                        )
+                    else:
+                        raise MetaflowInternalError(
+                            "Reentrant clone-only resume needs a resume identifier."
+                        )
                     # We will use the same task_id as the original task
                     # to use it effectively as a synchronization key
                     clone_task_id = origin.task_id
@@ -798,56 +899,124 @@ class Task(object):

                     # If _get_task_id returns True it means the task already existed, so
                     # we wait for it.
-                    self._wait_for_clone = self._get_task_id(clone_task_id)
+                    task_id_exists_already = self._get_task_id(clone_task_id)
+
+                    # We may not have access to the task datastore on the first resume
+                    # attempt, but on later attempts we should check whether the resume
+                    # task is complete. This fixes the issue where the resume leader was
+                    # killed unexpectedly during cloning and never marked the task
+                    # complete.
+                    try:
+                        task_completed = self.results["_task_ok"]
+                    except DataException:
+                        pass
                 else:
-                    self._wait_for_clone = False
                     self._get_task_id(task_id)

                 # Store the mapping from current_pathspec -> origin_pathspec which
                 # will be useful for looking up origin_ds_set in find_origin_task.
                 self.clone_pathspec_mapping[self._path] = origin.pathspec
                 if self.step == "_parameters":
-                    # We don't put _parameters on the queue so we either clone it or wait
-                    # for it.
-                    if not self._wait_for_clone:
+                    # In the _parameters task, we need to resolve who the resume leader is.
+                    self._is_resume_leader = False
+                    resume_leader = None
+
+                    if task_id_exists_already:
+                        # If the task id already exists, check whether the current task
+                        # was the resume leader in a previous attempt.
+                        ds = self._flow_datastore.get_task_datastore(
+                            self.run_id, self.step, self.task_id
+                        )
+                        if not ds["_task_ok"]:
+                            raise MetaflowInternalError(
+                                "Externally cloned _parameters task did not succeed"
+                            )
+
+                        # Check if we should be the resume leader (maybe from a previous
+                        # attempt). To avoid the edge case where the resume leader is
+                        # selected but has not yet written the _resume_leader metadata,
+                        # we wait and retry, at most 3 times.
+                        for resume_leader_wait_retry in range(3):
+                            if ds.has_metadata("_resume_leader", add_attempt=False):
+                                resume_leader = ds.load_metadata(
+                                    ["_resume_leader"], add_attempt=False
+                                )["_resume_leader"]
+                                self._is_resume_leader = resume_leader == resume_identifier
+                                break
+                            else:
+                                self.log(
+                                    "Waiting for resume leader to be selected. Sleeping ...",
+                                    system_msg=True,
+                                )
+                                time.sleep(3)
+                    else:
+                        # If the task id does not exist, the current task is the resume leader.
+                        resume_leader = resume_identifier
+                        self._is_resume_leader = True
+
+                    if reentrant:
+                        if resume_leader:
+                            self.log(
+                                "Resume leader is %s." % resume_leader,
+                                system_msg=True,
+                            )
+                        else:
+                            raise MetaflowInternalError(
+                                "Cannot determine the resume leader in distributed resume mode."
+                            )
+
+                    if self._is_resume_leader:
+                        self.log(
+                            "Selected as the reentrant clone leader.",
+                            system_msg=True,
+                        )
                         # Clone in place without relying on run_queue.
                         self.new_attempt()
                         self._ds.clone(origin)
+                        # Set the resume leader to be the task that called resume (the
+                        # first task to clone the _parameters task). We always set the
+                        # resume leader, regardless of whether we are in the distributed
+                        # resume case or not.
+                        if resume_identifier:
+                            self._ds.save_metadata(
+                                {"_resume_leader": resume_identifier}, add_attempt=False
+                            )
+
                         self._ds.done()
                     else:
-                        # TODO: There is a bit of a duplication with the task.py
-                        # clone_only function here
-                        self.log(
-                            "Waiting for clone of _parameters step to occur...",
-                            system_msg=True,
-                        )
+                        # Wait for the resume leader to complete
                         while True:
-                            try:
-                                ds = self._flow_datastore.get_task_datastore(
-                                    self.run_id, self.step, self.task_id
-                                )
-                                if not ds["_task_ok"]:
-                                    raise MetaflowInternalError(
-                                        "Externally cloned _parameters task did not succeed"
-                                    )
+                            ds = self._flow_datastore.get_task_datastore(
+                                self.run_id, self.step, self.task_id
+                            )
+
+                            # Check if resume is complete. The resume leader writes the
+                            # done marker.
+                            self._resume_done = ds.has_metadata(
+                                "_resume_done", add_attempt=False
+                            )
+
+                            if self._resume_done:
                                 break
-                            except DataException:
-                                self.log("Sleeping for 5s...", system_msg=True)
-                                # No need to get fancy with the sleep here.
-                                time.sleep(5)
-                        self.log("_parameters clone successful", system_msg=True)
+
+                            self.log(
+                                "Waiting for resume leader to complete. Sleeping for %ds..."
+                                % RESUME_POLL_SECONDS,
+                                system_msg=True,
+                            )
+                            time.sleep(RESUME_POLL_SECONDS)
+                        self.log(
+                            "_parameters clone completed by resume leader", system_msg=True
+                        )
                 else:
-                    # For non parameter steps
+                    # Only the leader can reach non-parameter steps in resume.
+
                     # Store the origin pathspec in clone_origin so this can be run
                     # as a task by the runtime.
                     self.clone_origin = origin.pathspec
                     # Save a call to creating the results_ds since its same as origin.
                     self._results_ds = origin
-                    if self._wait_for_clone:
+
+                    # If the task is already completed in the new run, we don't need to
+                    # clone it.
+                    self._should_skip_cloning = task_completed
+                    if self._should_skip_cloning:
                         self.log(
-                            "Waiting for the successful cloning of results "
-                            "of a previously run task %s (this may take some time)"
-                            % self.clone_origin,
+                            "Skip cloning of previously run task %s" % self.clone_origin,
                             system_msg=True,
                         )
                     else:
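Stripped of datastore details, the election above works like this: whoever registers the _parameters task id first becomes the resume leader; everyone else reads back the recorded leader, retrying briefly because the winner may not have written the _resume_leader marker yet, and then polls for _resume_done. A minimal sketch with hypothetical callables standing in for the metadata and datastore calls:

import time

def resolve_resume_leader(register_task_id, load_leader, my_identifier):
    # register_task_id() returns True if the id already existed, i.e. some
    # other process won the registration race.
    if not register_task_id():
        return True  # we registered first: we are the resume leader
    # Losers read back the recorded leader; retry a few times in case the
    # winner has not yet saved the _resume_leader metadata.
    for _ in range(3):
        leader = load_leader()  # returns None until the leader writes it
        if leader is not None:
            return leader == my_identifier
        time.sleep(3)
    raise RuntimeError("Cannot determine the resume leader")
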
@@ -916,6 +1085,31 @@ class Task(object):
         self._logger(msg, head=prefix, system_msg=system_msg, timestamp=timestamp)
         sys.stdout.flush()

+    def is_resume_leader(self):
+        assert (
+            self.step == "_parameters"
+        ), "Only _parameters step can check resume leader."
+        return self._is_resume_leader
+
+    def resume_done(self):
+        assert (
+            self.step == "_parameters"
+        ), "Only _parameters step can check whether resume is complete."
+        return self._resume_done
+
+    def mark_resume_done(self):
+        assert (
+            self.step == "_parameters"
+        ), "Only _parameters step can mark resume as done."
+        assert self.is_resume_leader(), "Only resume leader can mark resume as done."
+
+        # Mark the resume as done. This is called at the end of the resume flow and
+        # after the _parameters step was successfully cloned, so we need to
+        # 'dangerously' save this done file, but the risk should be minimal.
+        self._ds._dangerous_save_metadata_post_done(
+            {"_resume_done": True}, add_attempt=False
+        )
+
     def _get_task_id(self, task_id):
         already_existed = True
         if self.ubf_context == UBF_CONTROL:
@@ -1040,8 +1234,8 @@ class Task(object):
         return self._is_cloned

     @property
-    def wait_for_clone(self):
-        return self._wait_for_clone
+    def should_skip_cloning(self):
+        return self._should_skip_cloning

     def persist(self, flow):
         # this is used to persist parameters before the start step
@@ -1185,7 +1379,6 @@ class CLIArgs(object):

 class Worker(object):
     def __init__(self, task, max_logs_size):
-
         self.task = task
         self._proc = self._launch()
@@ -1229,8 +1422,6 @@ class Worker(object):
             # disabling sidecars for cloned tasks due to perf reasons
             args.top_level_options["event-logger"] = "nullSidecarLogger"
             args.top_level_options["monitor"] = "nullSidecarMonitor"
-            if self.task.wait_for_clone:
-                args.command_options["clone-wait-only"] = True
         else:
             # decorators may modify the CLIArgs object in-place
             for deco in self.task.decos:
metaflow/sidecar/sidecar.py CHANGED
@@ -13,14 +13,24 @@ class Sidecar(object):
         if t is not None and t.get_worker() is not None:
             self._has_valid_worker = True
         self.sidecar_process = None
+        # Whether to send msg in a thread-safe fashion.
+        self._threadsafe_send_enabled = False

     def start(self):
         if not self.is_active and self._has_valid_worker:
             self.sidecar_process = SidecarSubProcess(self._sidecar_type)

+    def enable_threadsafe_send(self):
+        self._threadsafe_send_enabled = True
+
+    def disable_threadsafe_send(self):
+        self._threadsafe_send_enabled = False
+
     def send(self, msg):
         if self.is_active:
-            self.sidecar_process.send(msg)
+            self.sidecar_process.send(
+                msg, thread_safe_send=self._threadsafe_send_enabled
+            )

     def terminate(self):
         if self.is_active:
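
A hedged usage sketch of the new toggle (the sidecar type string is illustrative): callers that may emit messages from multiple threads, such as the thread-pooled cloning above, opt in explicitly, after which send() forwards thread_safe_send=True to the subprocess wrapper:

from metaflow.sidecar import Sidecar

sidecar = Sidecar("nullSidecarMonitor")  # type name is illustrative
sidecar.start()
sidecar.enable_threadsafe_send()
try:
    pass  # sidecar.send(msg) calls here are serialized against other threads
finally:
    sidecar.disable_threadsafe_send()
    sidecar.terminate()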