metaflow 2.12.34__py2.py3-none-any.whl → 2.12.36__py2.py3-none-any.whl

This diff compares the contents of two publicly released versions of the package, as published to one of the supported registries. It is provided for informational purposes only and reflects the changes between the package versions as they appear in their respective public registries.
metaflow/metaflow_config.py CHANGED
@@ -377,6 +377,8 @@ KUBERNETES_PORT = from_conf("KUBERNETES_PORT", None)
  KUBERNETES_CPU = from_conf("KUBERNETES_CPU", None)
  KUBERNETES_MEMORY = from_conf("KUBERNETES_MEMORY", None)
  KUBERNETES_DISK = from_conf("KUBERNETES_DISK", None)
+ # Default kubernetes QoS class
+ KUBERNETES_QOS = from_conf("KUBERNETES_QOS", "burstable")

  ARGO_WORKFLOWS_KUBERNETES_SECRETS = from_conf("ARGO_WORKFLOWS_KUBERNETES_SECRETS", "")
  ARGO_WORKFLOWS_ENV_VARS_TO_SKIP = from_conf("ARGO_WORKFLOWS_ENV_VARS_TO_SKIP", "")
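The new KUBERNETES_QOS knob follows the usual from_conf resolution order: a METAFLOW_-prefixed environment variable takes precedence, then the Metaflow config file, then the literal default ("burstable"). A minimal sketch of that lookup, assuming the standard METAFLOW_<NAME> environment-variable convention:

    import os

    # Assumption: from_conf consults METAFLOW_<NAME> in the environment
    # before the config file and the hard-coded default.
    os.environ["METAFLOW_KUBERNETES_QOS"] = "guaranteed"

    from metaflow.metaflow_config import from_conf

    print(from_conf("KUBERNETES_QOS", "burstable"))  # -> "guaranteed"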
metaflow/plugins/airflow/airflow.py CHANGED
@@ -46,6 +46,7 @@ from metaflow.parameters import (
  # TODO: Move chevron to _vendor
  from metaflow.plugins.cards.card_modules import chevron
  from metaflow.plugins.kubernetes.kubernetes import Kubernetes
+ from metaflow.plugins.kubernetes.kube_utils import qos_requests_and_limits
  from metaflow.plugins.timeout_decorator import get_run_time_limit_for_task
  from metaflow.util import compress_list, dict_to_cli_options, get_username

@@ -428,25 +429,25 @@ class Airflow(object):
      if k8s_deco.attributes["namespace"] is not None
      else "default"
  )
-
+ qos_requests, qos_limits = qos_requests_and_limits(
+     k8s_deco.attributes["qos"],
+     k8s_deco.attributes["cpu"],
+     k8s_deco.attributes["memory"],
+     k8s_deco.attributes["disk"],
+ )
  resources = dict(
-     requests={
-         "cpu": k8s_deco.attributes["cpu"],
-         "memory": "%sM" % str(k8s_deco.attributes["memory"]),
-         "ephemeral-storage": str(k8s_deco.attributes["disk"]),
-     }
+     requests=qos_requests,
+     limits={
+         **qos_limits,
+         **{
+             "%s.com/gpu".lower()
+             % k8s_deco.attributes["gpu_vendor"]: str(k8s_deco.attributes["gpu"])
+             for k in [0]
+             # Don't set GPU limits if gpu isn't specified.
+             if k8s_deco.attributes["gpu"] is not None
+         },
+     },
  )
- if k8s_deco.attributes["gpu"] is not None:
-     resources.update(
-         dict(
-             limits={
-                 "%s.com/gpu".lower()
-                 % k8s_deco.attributes["gpu_vendor"]: str(
-                     k8s_deco.attributes["gpu"]
-                 )
-             }
-         )
-     )

  annotations = {
      "metaflow/production_token": self.production_token,
metaflow/plugins/argo/argo_workflows.py CHANGED
@@ -53,6 +53,7 @@ from metaflow.metaflow_config import (
  from metaflow.metaflow_config_funcs import config_values
  from metaflow.mflog import BASH_SAVE_LOGS, bash_capture_logs, export_mflog_env_vars
  from metaflow.parameters import deploy_time_eval
+ from metaflow.plugins.kubernetes.kube_utils import qos_requests_and_limits
  from metaflow.plugins.kubernetes.kubernetes import (
      parse_kube_keyvalue_list,
      validate_kube_labels,
@@ -1842,6 +1843,13 @@ class ArgoWorkflows(object):
  if tmpfs_enabled and tmpfs_tempdir:
      env["METAFLOW_TEMPDIR"] = tmpfs_path

+ qos_requests, qos_limits = qos_requests_and_limits(
+     resources["qos"],
+     resources["cpu"],
+     resources["memory"],
+     resources["disk"],
+ )
+
  # Create a ContainerTemplate for this node. Ideally, we would have
  # liked to inline this ContainerTemplate and avoid scanning the workflow
  # twice, but due to issues with variable substitution, we will have to
@@ -1905,6 +1913,7 @@ class ArgoWorkflows(object):
  persistent_volume_claims=resources["persistent_volume_claims"],
  shared_memory=shared_memory,
  port=port,
+ qos=resources["qos"],
  )

  for k, v in env.items():
@@ -2090,17 +2099,17 @@ class ArgoWorkflows(object):
  image=resources["image"],
  image_pull_policy=resources["image_pull_policy"],
  resources=kubernetes_sdk.V1ResourceRequirements(
-     requests={
-         "cpu": str(resources["cpu"]),
-         "memory": "%sM" % str(resources["memory"]),
-         "ephemeral-storage": "%sM"
-         % str(resources["disk"]),
-     },
+     requests=qos_requests,
      limits={
-         "%s.com/gpu".lower()
-         % resources["gpu_vendor"]: str(resources["gpu"])
-         for k in [0]
-         if resources["gpu"] is not None
+         **qos_limits,
+         **{
+             "%s.com/gpu".lower()
+             % resources["gpu_vendor"]: str(
+                 resources["gpu"]
+             )
+             for k in [0]
+             if resources["gpu"] is not None
+         },
      },
  ),
  # Configure secrets
@@ -2337,7 +2346,7 @@ class ArgoWorkflows(object):
      "memory": "500Mi",
  },
  ),
- )
+ ).to_dict()
  )
  ),
  Template("capture-error-hook-fn-preflight").steps(
@@ -2688,7 +2697,7 @@ class ArgoWorkflows(object):
  },
  ),
  )
- )
+ ).to_dict()
  )
  )

@@ -2858,7 +2867,7 @@ class ArgoWorkflows(object):
  "memory": "250Mi",
  },
  ),
- )
+ ).to_dict()
  )
  )
  .service_account_name(ARGO_EVENTS_SERVICE_ACCOUNT)
metaflow/plugins/kubernetes/kube_utils.py CHANGED
@@ -23,3 +23,32 @@ def parse_cli_options(flow_name, run_id, user, my_runs, echo):
          raise CommandException("A previous run id was not found. Specify --run-id.")

      return flow_name, run_id, user
+
+
+ def qos_requests_and_limits(qos: str, cpu: int, memory: int, storage: int):
+     "return resource requests and limits for the kubernetes pod based on the given QoS Class"
+     # case insensitive matching for QoS class
+     qos = qos.lower()
+     # Determine the requests and limits to define chosen QoS class
+     qos_limits = {}
+     qos_requests = {}
+     if qos == "guaranteed":
+         # Guaranteed - has both cpu/memory limits. requests not required, as these will be inferred.
+         qos_limits = {
+             "cpu": str(cpu),
+             "memory": "%sM" % str(memory),
+             "ephemeral-storage": "%sM" % str(storage),
+         }
+         # NOTE: Even though Kubernetes will produce matching requests for the specified limits, this happens late in the lifecycle.
+         # We specify them explicitly here to make some K8S tooling happy, in case they rely on .resources.requests being present at time of submitting the job.
+         qos_requests = qos_limits
+     else:
+         # Burstable - not Guaranteed, and has a memory/cpu limit or request
+         qos_requests = {
+             "cpu": str(cpu),
+             "memory": "%sM" % str(memory),
+             "ephemeral-storage": "%sM" % str(storage),
+         }
+     # TODO: Add support for BestEffort once there is a use case for it.
+     # BestEffort - no limit or requests for cpu/memory
+     return qos_requests, qos_limits
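Based on the helper as diffed above, its behavior can be summarized with a short sketch (values are stringified, with memory and disk rendered in megabytes):

    from metaflow.plugins.kubernetes.kube_utils import qos_requests_and_limits

    # Guaranteed: limits are set and requests mirror them exactly.
    requests, limits = qos_requests_and_limits("Guaranteed", 2, 4096, 10240)
    # requests == limits == {"cpu": "2", "memory": "4096M", "ephemeral-storage": "10240M"}

    # Burstable (the default): only requests are set; limits stay empty
    # unless the callers merge GPU limits on top.
    requests, limits = qos_requests_and_limits("burstable", 2, 4096, 10240)
    # requests as above, limits == {}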
metaflow/plugins/kubernetes/kubernetes.py CHANGED
@@ -196,6 +196,7 @@ class Kubernetes(object):
      shared_memory=None,
      port=None,
      num_parallel=None,
+     qos=None,
  ):
      name = "js-%s" % str(uuid4())[:6]
      jobset = (
@@ -228,6 +229,7 @@
      shared_memory=shared_memory,
      port=port,
      num_parallel=num_parallel,
+     qos=qos,
  )
  .environment_variable("METAFLOW_CODE_SHA", code_package_sha)
  .environment_variable("METAFLOW_CODE_URL", code_package_url)
@@ -488,6 +490,7 @@
      shared_memory=None,
      port=None,
      name_pattern=None,
+     qos=None,
  ):
      if env is None:
          env = {}
@@ -528,6 +531,7 @@
      persistent_volume_claims=persistent_volume_claims,
      shared_memory=shared_memory,
      port=port,
+     qos=qos,
  )
  .environment_variable("METAFLOW_CODE_SHA", code_package_sha)
  .environment_variable("METAFLOW_CODE_URL", code_package_url)
metaflow/plugins/kubernetes/kubernetes_cli.py CHANGED
@@ -126,6 +126,12 @@ def kubernetes():
      type=int,
      help="Number of parallel nodes to run as a multi-node job.",
  )
+ @click.option(
+     "--qos",
+     default=None,
+     type=str,
+     help="Quality of Service class for the Kubernetes pod",
+ )
  @click.pass_context
  def step(
      ctx,
@@ -154,6 +160,7 @@ def step(
      shared_memory=None,
      port=None,
      num_parallel=None,
+     qos=None,
      **kwargs
  ):
      def echo(msg, stream="stderr", job_id=None, **kwargs):
@@ -294,6 +301,7 @@ def step(
      shared_memory=shared_memory,
      port=port,
      num_parallel=num_parallel,
+     qos=qos,
  )
  except Exception as e:
      traceback.print_exc(chain=False)
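For reference, this option lands on the internally generated step command; a hypothetical hand-typed invocation would look like `python flow.py kubernetes step start --qos Burstable ...`, though in practice the value is injected from the @kubernetes decorator attributes rather than passed manually.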
metaflow/plugins/kubernetes/kubernetes_decorator.py CHANGED
@@ -26,6 +26,7 @@ from metaflow.metaflow_config import (
      KUBERNETES_SERVICE_ACCOUNT,
      KUBERNETES_SHARED_MEMORY,
      KUBERNETES_TOLERATIONS,
+     KUBERNETES_QOS,
  )
  from metaflow.plugins.resources_decorator import ResourcesDecorator
  from metaflow.plugins.timeout_decorator import get_run_time_limit_for_task
@@ -41,6 +42,8 @@ except NameError:
      unicode = str
      basestring = str

+ SUPPORTED_KUBERNETES_QOS_CLASSES = ["Guaranteed", "Burstable"]
+

  class KubernetesDecorator(StepDecorator):
      """
@@ -109,6 +112,8 @@ class KubernetesDecorator(StepDecorator):
      hostname_resolution_timeout: int, default 10 * 60
          Timeout in seconds for the workers tasks in the gang scheduled cluster to resolve the hostname of control task.
          Only applicable when @parallel is used.
+     qos: str, default: Burstable
+         Quality of Service class to assign to the pod. Supported values are: Guaranteed, Burstable, BestEffort
      """

      name = "kubernetes"
@@ -136,6 +141,7 @@ class KubernetesDecorator(StepDecorator):
      "compute_pool": None,
      "executable": None,
      "hostname_resolution_timeout": 10 * 60,
+     "qos": KUBERNETES_QOS,
  }
  package_url = None
  package_sha = None
@@ -259,6 +265,17 @@ class KubernetesDecorator(StepDecorator):
      self.step = step
      self.flow_datastore = flow_datastore

+     if (
+         self.attributes["qos"] is not None
+         # case insensitive matching.
+         and self.attributes["qos"].lower()
+         not in [c.lower() for c in SUPPORTED_KUBERNETES_QOS_CLASSES]
+     ):
+         raise MetaflowException(
+             "*%s* is not a valid Kubernetes QoS class. Choose one of the following: %s"
+             % (self.attributes["qos"], ", ".join(SUPPORTED_KUBERNETES_QOS_CLASSES))
+         )
+
      if any([deco.name == "batch" for deco in decos]):
          raise MetaflowException(
              "Step *{step}* is marked for execution both on AWS Batch and "
metaflow/plugins/kubernetes/kubernetes_job.py CHANGED
@@ -15,6 +15,8 @@ from .kubernetes_jobsets import (
      KubernetesJobSet,
  ) # We need this import for Kubernetes Client.

+ from .kube_utils import qos_requests_and_limits
+

  class KubernetesJobException(MetaflowException):
      headline = "Kubernetes job error"
@@ -74,6 +76,13 @@ class KubernetesJob(object):
          if self._kwargs["shared_memory"]
          else None
      )
+     qos_requests, qos_limits = qos_requests_and_limits(
+         self._kwargs["qos"],
+         self._kwargs["cpu"],
+         self._kwargs["memory"],
+         self._kwargs["disk"],
+     )
+
      return client.V1JobSpec(
          # Retries are handled by Metaflow when it is responsible for
          # executing the flow. The responsibility is moved to Kubernetes
@@ -154,20 +163,18 @@ class KubernetesJob(object):
  image_pull_policy=self._kwargs["image_pull_policy"],
  name=self._kwargs["step_name"].replace("_", "-"),
  resources=client.V1ResourceRequirements(
-     requests={
-         "cpu": str(self._kwargs["cpu"]),
-         "memory": "%sM" % str(self._kwargs["memory"]),
-         "ephemeral-storage": "%sM"
-         % str(self._kwargs["disk"]),
-     },
+     requests=qos_requests,
      limits={
-         "%s.com/gpu".lower()
-         % self._kwargs["gpu_vendor"]: str(
-             self._kwargs["gpu"]
-         )
-         for k in [0]
-         # Don't set GPU limits if gpu isn't specified.
-         if self._kwargs["gpu"] is not None
+         **qos_limits,
+         **{
+             "%s.com/gpu".lower()
+             % self._kwargs["gpu_vendor"]: str(
+                 self._kwargs["gpu"]
+             )
+             for k in [0]
+             # Don't set GPU limits if gpu isn't specified.
+             if self._kwargs["gpu"] is not None
+         },
      },
  ),
  volume_mounts=(
metaflow/plugins/kubernetes/kubernetes_jobsets.py CHANGED
@@ -9,6 +9,8 @@ from metaflow.metaflow_config import KUBERNETES_JOBSET_GROUP, KUBERNETES_JOBSET_
  from metaflow.tracing import inject_tracing_vars
  from metaflow.metaflow_config import KUBERNETES_SECRETS

+ from .kube_utils import qos_requests_and_limits
+

  class KubernetesJobsetException(MetaflowException):
      headline = "Kubernetes jobset error"
@@ -554,7 +556,12 @@ class JobSetSpec(object):
          if self._kwargs["shared_memory"]
          else None
      )
-
+     qos_requests, qos_limits = qos_requests_and_limits(
+         self._kwargs["qos"],
+         self._kwargs["cpu"],
+         self._kwargs["memory"],
+         self._kwargs["disk"],
+     )
      return dict(
          name=self.name,
          template=client.api_client.ApiClient().sanitize_for_serialization(
@@ -653,21 +660,18 @@ class JobSetSpec(object):
      "_", "-"
  ),
  resources=client.V1ResourceRequirements(
-     requests={
-         "cpu": str(self._kwargs["cpu"]),
-         "memory": "%sM"
-         % str(self._kwargs["memory"]),
-         "ephemeral-storage": "%sM"
-         % str(self._kwargs["disk"]),
-     },
+     requests=qos_requests,
      limits={
-         "%s.com/gpu".lower()
-         % self._kwargs["gpu_vendor"]: str(
-             self._kwargs["gpu"]
-         )
-         for k in [0]
-         # Don't set GPU limits if gpu isn't specified.
-         if self._kwargs["gpu"] is not None
+         **qos_limits,
+         **{
+             "%s.com/gpu".lower()
+             % self._kwargs["gpu_vendor"]: str(
+                 self._kwargs["gpu"]
+             )
+             for k in [0]
+             # Don't set GPU limits if gpu isn't specified.
+             if self._kwargs["gpu"] is not None
+         },
      },
  ),
  volume_mounts=(
metaflow/plugins/parallel_decorator.py CHANGED
@@ -45,6 +45,8 @@ class ParallelDecorator(StepDecorator):
      if ubf_context == UBF_CONTROL:
          num_parallel = cli_args.task.ubf_iter.num_parallel
          cli_args.command_options["num-parallel"] = str(num_parallel)
+         if os.environ.get("METAFLOW_RUNTIME_ENVIRONMENT", "local") == "local":
+             cli_args.command_options["split_index"] = "0"

  def step_init(
      self, flow, graph, step_name, decorators, environment, flow_datastore, logger
@@ -126,6 +128,8 @@ class ParallelDecorator(StepDecorator):
              tags=["attempt_id:{0}".format(0)],
          )
      ]
+     flow._control_task_is_mapper_zero = True
+
      metadata.register_metadata(run_id, step_name, task_id, task_metadata_list)

  def task_decorate(
@@ -221,7 +225,6 @@ def _local_multinode_control_task_step_func(
      "%s/%s/%s" % (run_id, step_name, mapper_task_id)
      for mapper_task_id in mapper_task_ids
  ]
- flow._control_task_is_mapper_zero = True

  # run the step function ourselves
  os.environ["MF_PARALLEL_NODE_INDEX"] = "0"
metaflow/runtime.py CHANGED
@@ -9,6 +9,7 @@ from __future__ import print_function
  import os
  import sys
  import fcntl
+ import re
  import time
  import subprocess
  from datetime import datetime
@@ -49,7 +50,13 @@ PROGRESS_INTERVAL = 300 # s
  # The following is a list of the (data) artifacts used by the runtime while
  # executing a flow. These are prefetched during the resume operation by
  # leveraging the TaskDataStoreSet.
- PREFETCH_DATA_ARTIFACTS = ["_foreach_stack", "_task_ok", "_transition"]
+ PREFETCH_DATA_ARTIFACTS = [
+     "_foreach_stack",
+     "_task_ok",
+     "_transition",
+     "_control_mapper_tasks",
+     "_control_task_is_mapper_zero",
+ ]
  RESUME_POLL_SECONDS = 60

  # Runtime must use logsource=RUNTIME_LOG_SOURCE for all loglines that it
@@ -269,6 +276,8 @@ class NativeRuntime(object):
      step_name,
      task_id,
      pathspec_index,
+     cloned_task_pathspec_index,
+     finished_tuple,
      ubf_context,
      generate_task_obj,
      verbose=False,
@@ -281,8 +290,13 @@ class NativeRuntime(object):
      task.ubf_context = ubf_context
      new_task_id = task.task_id
      self._cloned_tasks.append(task)
-     self._cloned_task_index.add(task.task_index)
-
+     self._cloned_task_index.add(cloned_task_pathspec_index)
+     task_pathspec = "{}/{}/{}".format(self._run_id, step_name, new_task_id)
+ else:
+     task_pathspec = "{}/{}/{}".format(self._run_id, step_name, new_task_id)
+ Task.clone_pathspec_mapping[task_pathspec] = "{}/{}/{}".format(
+     self._clone_run_id, step_name, task_id
+ )
  if verbose:
      self._logger(
          "Cloning task from {}/{}/{}/{} to {}/{}/{}/{}".format(
@@ -308,6 +322,8 @@ class NativeRuntime(object):
          self._metadata,
          origin_ds_set=self._origin_ds_set,
      )
+     self._finished[(step_name, finished_tuple)] = task_pathspec
+     self._is_cloned[task_pathspec] = True
  except Exception as e:
      self._logger(
          "Cloning {}/{}/{}/{} failed with error: {}".format(
@@ -323,7 +339,8 @@ class NativeRuntime(object):

  inputs = []

- ubf_mapper_tasks_to_clone = []
+ ubf_mapper_tasks_to_clone = set()
+ ubf_control_tasks = set()
  # We only clone ubf mapper tasks if the control task is complete.
  # Here we need to check which control tasks are complete, and then get the corresponding
  # mapper tasks.
@@ -331,13 +348,25 @@ class NativeRuntime(object):
      _, step_name, task_id = task_ds.pathspec.split("/")
      pathspec_index = task_ds.pathspec_index
      if task_ds["_task_ok"] and step_name != "_parameters":
-         # Only control task can have _control_mapper_tasks. We then store the corresponding mapepr task pathspecs.
+         # Control task contains "_control_mapper_tasks" but, in the case of
+         # @parallel decorator, the control task is also a mapper task so we
+         # need to distinguish this using _control_task_is_mapper_zero
          control_mapper_tasks = (
              []
              if "_control_mapper_tasks" not in task_ds
              else task_ds["_control_mapper_tasks"]
          )
-         ubf_mapper_tasks_to_clone.extend(control_mapper_tasks)
+         if control_mapper_tasks:
+             if task_ds.get("_control_task_is_mapper_zero", False):
+                 # Strip out the control task of list of mapper tasks
+                 ubf_control_tasks.add(control_mapper_tasks[0])
+                 ubf_mapper_tasks_to_clone.update(control_mapper_tasks[1:])
+             else:
+                 ubf_mapper_tasks_to_clone.update(control_mapper_tasks)
+                 # Since we only add mapper tasks here, if we are not in the list
+                 # we are a control task
+                 if task_ds.pathspec not in ubf_mapper_tasks_to_clone:
+                     ubf_control_tasks.add(task_ds.pathspec)

  for task_ds in self._origin_ds_set:
      _, step_name, task_id = task_ds.pathspec.split("/")
@@ -350,33 +379,54 @@ class NativeRuntime(object):
  ):
      # "_unbounded_foreach" is a special flag to indicate that the transition is an unbounded foreach.
      # Both parent and splitted children tasks will have this flag set. The splitted control/mapper tasks
-     # have no "foreach_param" because UBF is always followed by a join step.
+     # are not foreach types because UBF is always followed by a join step.
      is_ubf_task = (
          "_unbounded_foreach" in task_ds and task_ds["_unbounded_foreach"]
-     ) and (self._graph[step_name].foreach_param is None)
+     ) and (self._graph[step_name].type != "foreach")

-     # Only the control task has "_control_mapper_tasks" artifact.
-     is_ubf_control_task = (
-         is_ubf_task
-         and ("_control_mapper_tasks" in task_ds)
-         and task_ds["_control_mapper_tasks"]
-     )
-     is_ubf_mapper_tasks = is_ubf_task and (not is_ubf_control_task)
-     if is_ubf_mapper_tasks and (
+     is_ubf_control_task = task_ds.pathspec in ubf_control_tasks
+
+     is_ubf_mapper_task = is_ubf_task and (not is_ubf_control_task)
+
+     if is_ubf_mapper_task and (
          task_ds.pathspec not in ubf_mapper_tasks_to_clone
      ):
-         # Skip copying UBF mapper tasks if control tasks is incomplete.
+         # Skip copying UBF mapper tasks if control task is incomplete.
          continue

      ubf_context = None
      if is_ubf_task:
-         ubf_context = "ubf_test" if is_ubf_mapper_tasks else "ubf_control"
+         ubf_context = "ubf_test" if is_ubf_mapper_task else "ubf_control"
+
+     finished_tuple = tuple(
+         [s._replace(value=0) for s in task_ds.get("_foreach_stack", ())]
+     )
+     cloned_task_pathspec_index = pathspec_index.split("/")[1]
+     if task_ds.get("_control_task_is_mapper_zero", False):
+         # Replace None with index 0 for control task as it is part of the
+         # UBF (as a mapper as well)
+         finished_tuple = finished_tuple[:-1] + (
+             finished_tuple[-1]._replace(index=0),
+         )
+         # We need this reverse override though because when we check
+         # if a task has been cloned in _queue_push, the index will be None
+         # because the _control_task_is_mapper_zero is set in the control
+         # task *itself* and *not* in the one that is launching the UBF nest.
+         # This means that _translate_index will use None.
+         cloned_task_pathspec_index = re.sub(
+             r"(\[(?:\d+, ?)*)0\]",
+             lambda m: (m.group(1) or "[") + "None]",
+             cloned_task_pathspec_index,
+         )
+
      inputs.append(
          (
              step_name,
              task_id,
              pathspec_index,
-             is_ubf_mapper_tasks,
+             cloned_task_pathspec_index,
+             finished_tuple,
+             is_ubf_mapper_task,
              ubf_context,
          )
      )
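The re.sub in the hunk above rewrites a trailing index 0 back to None, so that the membership check in _queue_push, which goes through _translate_index and sees None for the control task, still matches the cloned index. A small illustration of just the regex (the index strings are hypothetical):

    import re

    def rewrite(s):
        return re.sub(
            r"(\[(?:\d+, ?)*)0\]",
            lambda m: (m.group(1) or "[") + "None]",
            s,
        )

    print(rewrite("train[0]"))     # -> train[None]
    print(rewrite("train[2, 0]"))  # -> train[2, None]
    print(rewrite("train[2, 1]"))  # -> train[2, 1] (unchanged; no trailing 0)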
@@ -388,15 +438,19 @@ class NativeRuntime(object):
      step_name,
      task_id,
      pathspec_index,
+     cloned_task_pathspec_index,
+     finished_tuple,
      ubf_context=ubf_context,
-     generate_task_obj=generate_task_obj and (not is_ubf_mapper_tasks),
+     generate_task_obj=generate_task_obj and (not is_ubf_mapper_task),
      verbose=verbose,
  )
  for (
      step_name,
      task_id,
      pathspec_index,
-     is_ubf_mapper_tasks,
+     cloned_task_pathspec_index,
+     finished_tuple,
+     is_ubf_mapper_task,
      ubf_context,
  ) in inputs
  ]
@@ -546,7 +600,6 @@
  # Given the current task information (task_index), the type of transition,
  # and the split index, return the new task index.
  def _translate_index(self, task, next_step, type, split_index=None):
-     import re

      match = re.match(r"^(.+)\[(.*)\]$", task.task_index)
      if match:
@@ -640,15 +693,18 @@
  # If the control task is cloned, all mapper tasks should have been cloned
  # as well, so we no longer need to handle cloning of mapper tasks in runtime.

- # Update _finished since these tasks were successfully
- # run elsewhere so that join will be unblocked.
- _, foreach_stack = task.finished_id
- top = foreach_stack[-1]
- bottom = list(foreach_stack[:-1])
- for i in range(num_splits):
-     s = tuple(bottom + [top._replace(index=i)])
-     self._finished[(task.step, s)] = mapper_tasks[i]
-     self._is_cloned[mapper_tasks[i]] = False
+ # Update _finished if we are not cloned. If we were cloned, we already
+ # updated _finished with the new tasks. Note that the *value* of mapper
+ # tasks is incorrect and contains the pathspec of the *cloned* run
+ # but we don't use it for anything. We could look to clean it up though
+ if not task.is_cloned:
+     _, foreach_stack = task.finished_id
+     top = foreach_stack[-1]
+     bottom = list(foreach_stack[:-1])
+     for i in range(num_splits):
+         s = tuple(bottom + [top._replace(index=i)])
+         self._finished[(task.step, s)] = mapper_tasks[i]
+         self._is_cloned[mapper_tasks[i]] = False

  # Find and check status of control task and retrieve its pathspec
  # for retrieving unbounded foreach cardinality.
@@ -1080,7 +1136,7 @@ class Task(object):
  # To avoid the edge case where the resume leader is selected but has not
  # yet written the _resume_leader metadata, we will wait for a few seconds.
  # We will wait for resume leader for at most 3 times.
- for resume_leader_wait_retry in range(3):
+ for _ in range(3):
      if ds.has_metadata("_resume_leader", add_attempt=False):
          resume_leader = ds.load_metadata(
              ["_resume_leader"], add_attempt=False
metaflow/version.py CHANGED
@@ -1 +1 @@
- metaflow_version = "2.12.34"
+ metaflow_version = "2.12.36"
metaflow-2.12.36.dist-info/METADATA CHANGED
@@ -1,6 +1,6 @@
  Metadata-Version: 2.1
  Name: metaflow
- Version: 2.12.34
+ Version: 2.12.36
  Summary: Metaflow: More Data Science, Less Engineering
  Author: Metaflow Developers
  Author-email: help@metaflow.org
@@ -26,7 +26,7 @@ License-File: LICENSE
  Requires-Dist: requests
  Requires-Dist: boto3
  Provides-Extra: stubs
- Requires-Dist: metaflow-stubs==2.12.34; extra == "stubs"
+ Requires-Dist: metaflow-stubs==2.12.36; extra == "stubs"

  ![Metaflow_Logo_Horizontal_FullColor_Ribbon_Dark_RGB](https://user-images.githubusercontent.com/763451/89453116-96a57e00-d713-11ea-9fa6-82b29d4d6eff.png)

metaflow-2.12.36.dist-info/RECORD CHANGED
@@ -16,7 +16,7 @@ metaflow/includefile.py,sha256=rDJnxF0U7vD3cz9hhPkKlW_KS3ToaXnlOjhjNZ__Rx4,19628
  metaflow/info_file.py,sha256=wtf2_F0M6dgiUu74AFImM8lfy5RrUw5Yj7Rgs2swKRY,686
  metaflow/integrations.py,sha256=LlsaoePRg03DjENnmLxZDYto3NwWc9z_PtU6nJxLldg,1480
  metaflow/lint.py,sha256=5rj1MlpluxyPTSINjtMoJ7viotyNzfjtBJSAihlAwMU,10870
- metaflow/metaflow_config.py,sha256=gsE6LxvZUc68HXw3Uf7Olv8Oaq5AY17nwhKvF0oXF_8,22950
+ metaflow/metaflow_config.py,sha256=bA6myygTf5WAEhtPiCUPQIYKP136Ozieg7mTHLkTfps,23039
  metaflow/metaflow_config_funcs.py,sha256=5GlvoafV6SxykwfL8D12WXSfwjBN_NsyuKE_Q3gjGVE,6738
  metaflow/metaflow_current.py,sha256=pfkXmkyHeMJhxIs6HBJNBEaBDpcl5kz9Wx5mW6F_3qo,7164
  metaflow/metaflow_environment.py,sha256=rojFyGdyY56sN1HaEb1-0XX53Q3XPNnl0SaH-8xXZ8w,7987
@@ -29,14 +29,14 @@ metaflow/parameters.py,sha256=pzjG0ssuVHPyYQqWE86dS3yYChEqbT90rOUcRD4wNog,16079
  metaflow/procpoll.py,sha256=U2tE4iK_Mwj2WDyVTx_Uglh6xZ-jixQOo4wrM9OOhxg,2859
  metaflow/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
  metaflow/pylint_wrapper.py,sha256=zzBY9YaSUZOGH-ypDKAv2B_7XcoyMZj-zCoCrmYqNRc,2865
- metaflow/runtime.py,sha256=RXA8-YOeGnBl7IJtoMIl4T1zEOmvFsTKv5rBxnnnDEY,68884
+ metaflow/runtime.py,sha256=wWoJILHG2JGEPnQqNI07vHT6bfJuu-pxcu5kH07787M,71841
  metaflow/tagging_util.py,sha256=ctyf0Q1gBi0RyZX6J0e9DQGNkNHblV_CITfy66axXB4,2346
  metaflow/task.py,sha256=xVVLWy8NH16OlLu2VoOb1OfiFzcOVVCdQldlmb1Zb_w,29691
  metaflow/tuple_util.py,sha256=_G5YIEhuugwJ_f6rrZoelMFak3DqAR2tt_5CapS1XTY,830
  metaflow/unbounded_foreach.py,sha256=p184WMbrMJ3xKYHwewj27ZhRUsSj_kw1jlye5gA9xJk,387
  metaflow/util.py,sha256=w7oylILPaNAjtM8MR8dfUazTVBArV_CKPpqGs4HnowM,13785
  metaflow/vendor.py,sha256=FchtA9tH22JM-eEtJ2c9FpUdMn8sSb1VHuQS56EcdZk,5139
- metaflow/version.py,sha256=1QFMy_kk8f34f8BuD9G4-ZA0vcxYQo1O9WhpDuY8MXQ,29
+ metaflow/version.py,sha256=DYRO3aeKT1fdQdfo72nEW7XP8dt5jMsZSc1x4ewMnmU,29
  metaflow/_vendor/__init__.py,sha256=y_CiwUD3l4eAKvTVDZeqgVujMy31cAM1qjAB-HfI-9s,353
  metaflow/_vendor/typing_extensions.py,sha256=0nUs5p1A_UrZigrAVBoOEM6TxU37zzPDUtiij1ZwpNc,110417
  metaflow/_vendor/zipp.py,sha256=ajztOH-9I7KA_4wqDYygtHa6xUBVZgFpmZ8FE74HHHI,8425
@@ -151,7 +151,7 @@ metaflow/plugins/environment_decorator.py,sha256=6m9j2B77d-Ja_l_9CTJ__0O6aB2a8Qt
  metaflow/plugins/events_decorator.py,sha256=8YSapp_sT3UzNrb6cYBJ19-wmX_CKow6OOJN8kNVnpg,26456
  metaflow/plugins/logs_cli.py,sha256=77W5UNagU2mOKSMMvrQxQmBLRzvmjK-c8dWxd-Ygbqs,11410
  metaflow/plugins/package_cli.py,sha256=-J6D4cupHfWSZ4GEFo2yy9Je9oL3owRWm5pEJwaiqd4,1649
- metaflow/plugins/parallel_decorator.py,sha256=GIjZZVTqkvtnMuGE8RNtObX6CAJavZTxttqRujGmnGs,8973
+ metaflow/plugins/parallel_decorator.py,sha256=GR6LKIW7_S7AoU50Ar2_0nndVtO2epdn3LuthE0vKMQ,9127
  metaflow/plugins/project_decorator.py,sha256=eJOe0Ea7CbUCReEhR_XQvRkhV6jyRqDxM72oZI7EMCk,5336
  metaflow/plugins/resources_decorator.py,sha256=AtoOwg4mHYHYthg-CAfbfam-QiT0ViuDLDoukoDvF6Q,1347
  metaflow/plugins/retry_decorator.py,sha256=tz_2Tq6GLg3vjDBZp0KKVTk3ADlCvqaWTSf7blmFdUw,1548
@@ -160,7 +160,7 @@ metaflow/plugins/tag_cli.py,sha256=10039-0DUF0cmhudoDNrRGLWq8tCGQJ7tBsQAGAmkBQ,1
  metaflow/plugins/test_unbounded_foreach_decorator.py,sha256=33p5aCWnyk9MT5DmXcm4Q_Qnwfd4y4xvVTEfeqs4by0,5957
  metaflow/plugins/timeout_decorator.py,sha256=R-X8rKeMqd-xhfJFqskWb6ZpmZt2JB14U1BZJSRriwM,3648
  metaflow/plugins/airflow/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
- metaflow/plugins/airflow/airflow.py,sha256=V9a96L5Cb1Yg5CvcbX6WDa7J8lQh5AjIvnVQod20_7k,32160
+ metaflow/plugins/airflow/airflow.py,sha256=GDaKLdzzySttJfhl_OiYjkK_ubIaRKR4YgcLaKRCQLk,32293
  metaflow/plugins/airflow/airflow_cli.py,sha256=2JzrGUY9mPNyXRO6fJbgdtiOZXsEUv4Fn2MSPcoPszU,14692
  metaflow/plugins/airflow/airflow_decorator.py,sha256=IWT6M9gga8t65FR4Wi7pIZvOupk3hE75B5NRg9tMEps,1781
  metaflow/plugins/airflow/airflow_utils.py,sha256=dvRllfQeOWfDUseFnOocIGaL3gRI_A7cEHnC1w01vfk,28905
@@ -175,7 +175,7 @@ metaflow/plugins/airflow/sensors/s3_sensor.py,sha256=iDReG-7FKnumrtQg-HY6cCUAAqN
  metaflow/plugins/argo/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
  metaflow/plugins/argo/argo_client.py,sha256=Z_A1TO9yw4Y-a8VAlwrFS0BwunWzXpbtik-j_xjcuHE,16303
  metaflow/plugins/argo/argo_events.py,sha256=_C1KWztVqgi3zuH57pInaE9OzABc2NnncC-zdwOMZ-w,5909
- metaflow/plugins/argo/argo_workflows.py,sha256=d-d_xvjUeb5t11kaHZ98yIrGAqYg__FhxNcQTY8LVoA,173881
+ metaflow/plugins/argo/argo_workflows.py,sha256=G1yu-bBGpOyoZKRlgKdJLxNueRCv25sPuDM_gbQkBJw,174184
  metaflow/plugins/argo/argo_workflows_cli.py,sha256=NdLwzfBcTsR72qLycZBesR4Pwv48o3Z_v6OfYrZuVEY,36721
  metaflow/plugins/argo/argo_workflows_decorator.py,sha256=QdM1rK9gM-lDhyZldK8WqvFqJDvfJ7i3JPR5Uzaq2as,7887
  metaflow/plugins/argo/argo_workflows_deployer.py,sha256=6kHxEnYXJwzNCM9swI8-0AckxtPWqwhZLerYkX8fxUM,4444
@@ -281,13 +281,13 @@ metaflow/plugins/gcp/gs_tail.py,sha256=qz0QZKT-5LvL8qgZZK2yyMOwuEnx1YOz-pTSAUmwv
  metaflow/plugins/gcp/gs_utils.py,sha256=ZmIGFse1qYyvAVrwga23PQUzF6dXEDLLsZ2F-YRmvow,2030
  metaflow/plugins/gcp/includefile_support.py,sha256=OQO0IVWv4ObboL0VqEZwcDOyj9ORLdur66JToxQ84vU,3887
  metaflow/plugins/kubernetes/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
- metaflow/plugins/kubernetes/kube_utils.py,sha256=fYDlvqi8jYPsWijDwT6Z2qhQswyFqv7tiwtic_I80Vg,749
- metaflow/plugins/kubernetes/kubernetes.py,sha256=_cq_4N8l40cP0kifYMDAL8Te0DnIKUhVCqee3covwdM,31642
- metaflow/plugins/kubernetes/kubernetes_cli.py,sha256=TAYOKTQegYxex5piasLc53kNQPLzxuO8FzHXxkdjPjY,13472
+ metaflow/plugins/kubernetes/kube_utils.py,sha256=CbJRMn-sQyGqG-hKMBBjA6xmw15_DyQmhU8TxNyWqcQ,2124
+ metaflow/plugins/kubernetes/kubernetes.py,sha256=FrIL2wRUzy4bJr6pNz3I-tNFH-OJWHJcrarJsBKRPLE,31728
+ metaflow/plugins/kubernetes/kubernetes_cli.py,sha256=tvxwgBojuiezIUdum95t9fHzDs50q-a-gkPyIApWnCY,13633
  metaflow/plugins/kubernetes/kubernetes_client.py,sha256=tuvXP-QKpdeSmzVolB2R_TaacOr5DIb0j642eKcjsiM,6491
- metaflow/plugins/kubernetes/kubernetes_decorator.py,sha256=Gq3lGKA8SPh3pHDbP_FCkUQPMRrIxvbcmw6Jly5PhEY,27846
- metaflow/plugins/kubernetes/kubernetes_job.py,sha256=Cfkee8LbXC17jSXWoeNdomQRvF_8YSeXNg1gvxm6E_M,31806
- metaflow/plugins/kubernetes/kubernetes_jobsets.py,sha256=iehUEKv2KogyJKnp5jejdGP8R-TtF2aX9Wx1WpjKLvM,42030
+ metaflow/plugins/kubernetes/kubernetes_decorator.py,sha256=wHwm5pQvhjI9Dcj6yBdKZGprFGrSEgd88-TZew6vgNs,28569
+ metaflow/plugins/kubernetes/kubernetes_job.py,sha256=CoDzG0eEcJezfMTmgYJ4Ea9G_o5INYm0w1DvjGwJT2A,31916
+ metaflow/plugins/kubernetes/kubernetes_jobsets.py,sha256=0SGOfStlh6orXVpF3s6Mu26OFR36eh4dj-sFYwg3HaA,42066
  metaflow/plugins/metadata_providers/__init__.py,sha256=AbpHGcgLb-kRsJGnwFEktk7uzpZOCcBY74-YBdrKVGs,1
  metaflow/plugins/metadata_providers/local.py,sha256=9UAxe9caN6kU1lkSlIoJbRGgTqsMa62cBTnyMwhqiaA,22446
  metaflow/plugins/metadata_providers/service.py,sha256=NKZfFMamx6upP6aFRJfXlfYIhySgFNzz6kbp1yPD7LA,20222
@@ -348,9 +348,9 @@ metaflow/tutorials/07-worldview/README.md,sha256=5vQTrFqulJ7rWN6r20dhot9lI2sVj9W
  metaflow/tutorials/07-worldview/worldview.ipynb,sha256=ztPZPI9BXxvW1QdS2Tfe7LBuVzvFvv0AToDnsDJhLdE,2237
  metaflow/tutorials/08-autopilot/README.md,sha256=GnePFp_q76jPs991lMUqfIIh5zSorIeWznyiUxzeUVE,1039
  metaflow/tutorials/08-autopilot/autopilot.ipynb,sha256=DQoJlILV7Mq9vfPBGW-QV_kNhWPjS5n6SJLqePjFYLY,3191
- metaflow-2.12.34.dist-info/LICENSE,sha256=nl_Lt5v9VvJ-5lWJDT4ddKAG-VZ-2IaLmbzpgYDz2hU,11343
- metaflow-2.12.34.dist-info/METADATA,sha256=nyJPMX2oVYbAC9lY9KuNPMngDomft-j5DJoeHGAnJuY,5907
- metaflow-2.12.34.dist-info/WHEEL,sha256=pxeNX5JdtCe58PUSYP9upmc7jdRPgvT0Gm9kb1SHlVw,109
- metaflow-2.12.34.dist-info/entry_points.txt,sha256=IKwTN1T3I5eJL3uo_vnkyxVffcgnRdFbKwlghZfn27k,57
- metaflow-2.12.34.dist-info/top_level.txt,sha256=v1pDHoWaSaKeuc5fKTRSfsXCKSdW1zvNVmvA-i0if3o,9
- metaflow-2.12.34.dist-info/RECORD,,
+ metaflow-2.12.36.dist-info/LICENSE,sha256=nl_Lt5v9VvJ-5lWJDT4ddKAG-VZ-2IaLmbzpgYDz2hU,11343
+ metaflow-2.12.36.dist-info/METADATA,sha256=QSuhOl0_WqTjlMSRhMgAt6sxTsOzHzEyA1F5fzwXe_w,5907
+ metaflow-2.12.36.dist-info/WHEEL,sha256=pxeNX5JdtCe58PUSYP9upmc7jdRPgvT0Gm9kb1SHlVw,109
+ metaflow-2.12.36.dist-info/entry_points.txt,sha256=IKwTN1T3I5eJL3uo_vnkyxVffcgnRdFbKwlghZfn27k,57
+ metaflow-2.12.36.dist-info/top_level.txt,sha256=v1pDHoWaSaKeuc5fKTRSfsXCKSdW1zvNVmvA-i0if3o,9
+ metaflow-2.12.36.dist-info/RECORD,,