metaflow 2.12.7__py2.py3-none-any.whl → 2.12.9__py2.py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- metaflow/__init__.py +2 -0
- metaflow/cli.py +12 -4
- metaflow/extension_support/plugins.py +1 -0
- metaflow/flowspec.py +8 -1
- metaflow/lint.py +13 -0
- metaflow/metaflow_current.py +0 -8
- metaflow/plugins/__init__.py +12 -0
- metaflow/plugins/argo/argo_workflows.py +462 -42
- metaflow/plugins/argo/argo_workflows_cli.py +60 -3
- metaflow/plugins/argo/argo_workflows_decorator.py +38 -7
- metaflow/plugins/argo/argo_workflows_deployer.py +290 -0
- metaflow/plugins/argo/jobset_input_paths.py +16 -0
- metaflow/plugins/aws/batch/batch_decorator.py +16 -13
- metaflow/plugins/aws/step_functions/step_functions_cli.py +45 -3
- metaflow/plugins/aws/step_functions/step_functions_deployer.py +251 -0
- metaflow/plugins/cards/card_cli.py +1 -1
- metaflow/plugins/kubernetes/kubernetes.py +279 -52
- metaflow/plugins/kubernetes/kubernetes_cli.py +26 -8
- metaflow/plugins/kubernetes/kubernetes_client.py +0 -1
- metaflow/plugins/kubernetes/kubernetes_decorator.py +56 -44
- metaflow/plugins/kubernetes/kubernetes_job.py +6 -6
- metaflow/plugins/kubernetes/kubernetes_jobsets.py +510 -272
- metaflow/plugins/parallel_decorator.py +108 -8
- metaflow/plugins/pypi/bootstrap.py +1 -1
- metaflow/plugins/pypi/micromamba.py +1 -1
- metaflow/plugins/secrets/secrets_decorator.py +12 -3
- metaflow/plugins/test_unbounded_foreach_decorator.py +39 -4
- metaflow/runner/deployer.py +386 -0
- metaflow/runner/metaflow_runner.py +1 -20
- metaflow/runner/nbdeploy.py +130 -0
- metaflow/runner/nbrun.py +4 -28
- metaflow/runner/utils.py +49 -0
- metaflow/runtime.py +246 -134
- metaflow/version.py +1 -1
- {metaflow-2.12.7.dist-info → metaflow-2.12.9.dist-info}/METADATA +2 -2
- {metaflow-2.12.7.dist-info → metaflow-2.12.9.dist-info}/RECORD +40 -34
- {metaflow-2.12.7.dist-info → metaflow-2.12.9.dist-info}/WHEEL +1 -1
- {metaflow-2.12.7.dist-info → metaflow-2.12.9.dist-info}/LICENSE +0 -0
- {metaflow-2.12.7.dist-info → metaflow-2.12.9.dist-info}/entry_points.txt +0 -0
- {metaflow-2.12.7.dist-info → metaflow-2.12.9.dist-info}/top_level.txt +0 -0
metaflow/runtime.py
CHANGED
@@ -16,6 +16,7 @@ from functools import partial
 from concurrent import futures
 
 from metaflow.datastore.exceptions import DataException
+from contextlib import contextmanager
 
 from . import get_namespace
 from .metadata import MetaDatum
@@ -109,6 +110,8 @@ class NativeRuntime(object):
         self._clone_run_id = clone_run_id
         self._clone_only = clone_only
         self._clone_steps = {} if clone_steps is None else clone_steps
+        self._cloned_tasks = []
+        self._cloned_task_index = set()
         self._reentrant = reentrant
         self._run_url = None
 
@@ -203,6 +206,22 @@ class NativeRuntime(object):
 
         self._is_cloned[self._params_task.path] = self._params_task.is_cloned
 
+    def should_skip_clone_only_execution(self):
+        (
+            should_skip_clone_only_execution,
+            skip_reason,
+        ) = self._should_skip_clone_only_execution()
+        if should_skip_clone_only_execution:
+            self._logger(skip_reason, system_msg=True)
+            return True
+        return False
+
+    @contextmanager
+    def run_heartbeat(self):
+        self._metadata.start_run_heartbeat(self._flow.name, self._run_id)
+        yield
+        self._metadata.stop_heartbeat()
+
     def print_workflow_info(self):
         self._run_url = (
             "%s/%s/%s" % (UI_URL.rstrip("/"), self._flow.name, self._run_id)
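Note: the two methods added above replace heartbeat bookkeeping that was previously inlined in clone_original_run and execute (removed in the hunks below). A minimal usage sketch, assuming a NativeRuntime instance named runtime (hypothetical caller code, not part of this diff):

    # The context manager scopes the run heartbeat to a block, starting it on
    # entry and stopping it on a clean exit, instead of pairing
    # start_run_heartbeat/stop_heartbeat calls by hand.
    with runtime.run_heartbeat():
        if not runtime.should_skip_clone_only_execution():
            runtime.clone_original_run(generate_task_obj=True)
            runtime.execute()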
@@ -235,41 +254,58 @@ class NativeRuntime(object):
         )
         return False, None
 
-    def clone_task(
-        self
+    def clone_task(
+        self,
+        step_name,
+        task_id,
+        pathspec_index,
+        ubf_context,
+        generate_task_obj,
+        verbose=False,
+    ):
+        try:
+            new_task_id = task_id
+            if generate_task_obj:
+                task = self._new_task(step_name, pathspec_index=pathspec_index)
+                if ubf_context:
+                    task.ubf_context = ubf_context
+                new_task_id = task.task_id
+                self._cloned_tasks.append(task)
+                self._cloned_task_index.add(task.task_index)
+
+            if verbose:
+                self._logger(
+                    "Cloning task from {}/{}/{}/{} to {}/{}/{}/{}".format(
+                        self._flow.name,
+                        self._clone_run_id,
+                        step_name,
+                        task_id,
+                        self._flow.name,
+                        self._run_id,
+                        step_name,
+                        new_task_id,
+                    ),
+                    system_msg=True,
+                )
+            clone_task_helper(
                 self._flow.name,
                 self._clone_run_id,
-            step_name,
-            task_id,
-            self._flow.name,
                 self._run_id,
                 step_name,
-            task_id,
-            self._flow_datastore,
-            self._metadata,
-            origin_ds_set=self._origin_ds_set,
-        )
+                task_id,  # origin_task_id
+                new_task_id,
+                self._flow_datastore,
+                self._metadata,
+                origin_ds_set=self._origin_ds_set,
+            )
+        except Exception as e:
+            self._logger(
+                "Cloning task from {}/{}/{} failed with error: {}".format(
+                    self._clone_run_id, step_name, task_id, str(e)
+                )
+            )
 
-    def clone_original_run(self):
-        (
-            should_skip_clone_only_execution,
-            skip_reason,
-        ) = self._should_skip_clone_only_execution()
-        if should_skip_clone_only_execution:
-            self._logger(skip_reason, system_msg=True)
-            return
-        self._metadata.start_run_heartbeat(self._flow.name, self._run_id)
+    def clone_original_run(self, generate_task_obj=False, verbose=True):
         self._logger(
             "Start cloning original run: {}/{}".format(
                 self._flow.name, self._clone_run_id
@@ -279,43 +315,106 @@ class NativeRuntime(object):
 
         inputs = []
 
+        ubf_mapper_tasks_to_clone = []
+        # We only clone ubf mapper tasks if the control task is complete.
+        # Here we need to check which control tasks are complete, and then get the
+        # corresponding mapper tasks.
         for task_ds in self._origin_ds_set:
             _, step_name, task_id = task_ds.pathspec.split("/")
+            pathspec_index = task_ds.pathspec_index
+            if task_ds["_task_ok"] and step_name != "_parameters":
+                # Only control tasks can have _control_mapper_tasks. We then store
+                # the corresponding mapper task pathspecs.
+                control_mapper_tasks = (
+                    []
+                    if "_control_mapper_tasks" not in task_ds
+                    else task_ds["_control_mapper_tasks"]
+                )
+                ubf_mapper_tasks_to_clone.extend(control_mapper_tasks)
+
+        for task_ds in self._origin_ds_set:
+            _, step_name, task_id = task_ds.pathspec.split("/")
+            pathspec_index = task_ds.pathspec_index
+
             if task_ds["_task_ok"] and step_name != "_parameters":
+                # "_unbounded_foreach" is a special flag to indicate that the transition
+                # is an unbounded foreach. Both the parent and the split children tasks
+                # will have this flag set. The split control/mapper tasks have no
+                # "foreach_param" because UBF is always followed by a join step.
+                is_ubf_task = (
+                    "_unbounded_foreach" in task_ds and task_ds["_unbounded_foreach"]
+                ) and (self._graph[step_name].foreach_param is None)
+
+                # Only the control task has the "_control_mapper_tasks" artifact.
+                is_ubf_control_task = (
+                    is_ubf_task
+                    and ("_control_mapper_tasks" in task_ds)
+                    and task_ds["_control_mapper_tasks"]
+                )
+                is_ubf_mapper_tasks = is_ubf_task and (not is_ubf_control_task)
+                if is_ubf_mapper_tasks and (
+                    task_ds.pathspec not in ubf_mapper_tasks_to_clone
+                ):
+                    # Skip copying UBF mapper tasks if the control task is incomplete.
+                    continue
+
+                ubf_context = None
+                if is_ubf_task:
+                    ubf_context = "ubf_test" if is_ubf_mapper_tasks else "ubf_control"
+                inputs.append(
+                    (
+                        step_name,
+                        task_id,
+                        pathspec_index,
+                        is_ubf_mapper_tasks,
+                        ubf_context,
+                    )
+                )
 
         with futures.ThreadPoolExecutor(max_workers=self._max_workers) as executor:
             all_tasks = [
-                executor.submit(
+                executor.submit(
+                    self.clone_task,
+                    step_name,
+                    task_id,
+                    pathspec_index,
+                    ubf_context=ubf_context,
+                    generate_task_obj=generate_task_obj and (not is_ubf_mapper_tasks),
+                    verbose=verbose,
+                )
+                for (
+                    step_name,
+                    task_id,
+                    pathspec_index,
+                    is_ubf_mapper_tasks,
+                    ubf_context,
+                ) in inputs
             ]
             _, _ = futures.wait(all_tasks)
         self._logger("Cloning original run is done", system_msg=True)
         self._params_task.mark_resume_done()
-        self._metadata.stop_heartbeat()
 
     def execute(self):
-        (
-            should_skip_clone_only_execution,
-            skip_reason,
-        ) = self._should_skip_clone_only_execution()
-        if should_skip_clone_only_execution:
-            self._logger(skip_reason, system_msg=True)
-            return
-        self._metadata.start_run_heartbeat(self._flow.name, self._run_id)
-
-        if self._params_task:
-            self._queue_push("start", {"input_paths": [self._params_task.path]})
+        if len(self._cloned_tasks) > 0:
+            # mutable list storing the cloned tasks.
+            self._run_queue = []
+            self._active_tasks[0] = 0
         else:
-            self._queue_push("start", {})
+            if self._params_task:
+                self._queue_push("start", {"input_paths": [self._params_task.path]})
+            else:
+                self._queue_push("start", {})
         progress_tstamp = time.time()
         try:
             # main scheduling loop
             exception = None
-            while self._run_queue or self._active_tasks[0] > 0:
+            while self._run_queue or self._active_tasks[0] > 0 or self._cloned_tasks:
                 # 1. are any of the current workers finished?
-                finished_tasks = list(self._poll_workers())
+                if self._cloned_tasks:
+                    finished_tasks = self._cloned_tasks
+                    # reset the list of cloned tasks and let poll_workers handle
+                    # the remaining transition
+                    self._cloned_tasks = []
+                else:
+                    finished_tasks = list(self._poll_workers())
                 # 2. push new tasks triggered by the finished tasks to the queue
                 self._queue_tasks(finished_tasks)
                 # 3. if there are available worker slots, pop and start tasks
@@ -381,8 +480,6 @@ class NativeRuntime(object):
             for deco in step.decorators:
                 deco.runtime_finished(exception)
 
-        self._metadata.stop_heartbeat()
-
         # assert that end was executed and it was successful
         if ("end", ()) in self._finished:
             if self._run_url:
@@ -432,9 +529,41 @@ class NativeRuntime(object):
         for _ in range(3):
             list(self._poll_workers())
 
+    # Given the current task information (task_index), the type of transition,
+    # and the split index, return the new task index.
+    def _translate_index(self, task, next_step, type, split_index=None):
+        import re
+
+        match = re.match(r"^(.+)\[(.*)\]$", task.task_index)
+        if match:
+            _, foreach_index = match.groups()
+            # Convert foreach_index to a list of indices
+            if len(foreach_index) > 0:
+                foreach_index = foreach_index.split(",")
+            else:
+                foreach_index = []
+        else:
+            raise ValueError(
+                "Index not in the format of {run_id}/{step_name}[{foreach_index}]"
+            )
+        if type == "linear":
+            return "%s[%s]" % (next_step, ",".join(foreach_index))
+        elif type == "join":
+            indices = []
+            if len(foreach_index) > 0:
+                indices = foreach_index[:-1]
+            return "%s[%s]" % (next_step, ",".join(indices))
+        elif type == "split":
+            foreach_index.append(str(split_index))
+            return "%s[%s]" % (next_step, ",".join(foreach_index))
+
     # Store the parameters needed for task creation, so that pushing on items
     # onto the run_queue is an inexpensive operation.
-    def _queue_push(self, step, task_kwargs):
+    def _queue_push(self, step, task_kwargs, index=None):
+        # If the to-be-pushed task is already cloned before, we don't need
+        # to re-run it.
+        if index and index in self._cloned_task_index:
+            return
         self._run_queue.insert(0, (step, task_kwargs))
         # For foreaches, this will happen multiple time but is ok, becomes a no-op
         self._unprocessed_steps.discard(step)
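_translate_index (added above) derives a deterministic task index of the form {step_name}[{comma-separated foreach indices}], which _queue_push compares against _cloned_task_index so that already-cloned tasks are not re-queued. A standalone sketch of the translation scheme, with the task object replaced by a plain string (a hypothetical helper mirroring the method above, not part of the diff):

    import re

    def translate_index(task_index, next_step, transition, split_index=None):
        # task_index looks like "b[0,2]"; the bracketed part is the foreach stack
        _, raw = re.match(r"^(.+)\[(.*)\]$", task_index).groups()
        stack = raw.split(",") if raw else []
        if transition == "linear":  # same foreach depth; the stack carries over
            return "%s[%s]" % (next_step, ",".join(stack))
        if transition == "join":    # pop one foreach level
            return "%s[%s]" % (next_step, ",".join(stack[:-1]))
        if transition == "split":   # push the new split index
            return "%s[%s]" % (next_step, ",".join(stack + [str(split_index)]))

    assert translate_index("start[]", "a", "split", 0) == "a[0]"
    assert translate_index("a[0]", "b", "linear") == "b[0]"
    assert translate_index("b[0]", "join_ab", "join") == "join_ab[]"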
@@ -493,30 +622,19 @@ class NativeRuntime(object):
             )
             num_splits = len(mapper_tasks)
             self._control_num_splits[task.path] = num_splits
-            )
-            else:
-                # Update _finished since these tasks were successfully
-                # run elsewhere so that join will be unblocked.
-                _, foreach_stack = task.finished_id
-                top = foreach_stack[-1]
-                bottom = list(foreach_stack[:-1])
-                for i in range(num_splits):
-                    s = tuple(bottom + [top._replace(index=i)])
-                    self._finished[(task.step, s)] = mapper_tasks[i]
-                    self._is_cloned[mapper_tasks[i]] = False
+
+            # If the control task is cloned, all mapper tasks should have been cloned
+            # as well, so we no longer need to handle cloning of mapper tasks in runtime.
+
+            # Update _finished since these tasks were successfully
+            # run elsewhere so that join will be unblocked.
+            _, foreach_stack = task.finished_id
+            top = foreach_stack[-1]
+            bottom = list(foreach_stack[:-1])
+            for i in range(num_splits):
+                s = tuple(bottom + [top._replace(index=i)])
+                self._finished[(task.step, s)] = mapper_tasks[i]
+                self._is_cloned[mapper_tasks[i]] = False
 
         # Find and check status of control task and retrieve its pathspec
         # for retrieving unbounded foreach cardinality.
@@ -541,16 +659,18 @@ class NativeRuntime(object):
                 required_tasks.append(self._finished.get((task.step, s)))
 
             if all(required_tasks):
+                index = self._translate_index(task, next_step, "join")
                 # all tasks to be joined are ready. Schedule the next join step.
                 self._queue_push(
                     next_step,
                     {"input_paths": required_tasks, "join_type": "foreach"},
+                    index,
                 )
         else:
             # matching_split is the split-parent of the finished task
             matching_split = self._graph[self._graph[next_step].split_parents[-1]]
             _, foreach_stack = task.finished_id
-
+            index = ""
             if matching_split.type == "foreach":
                 # next step is a foreach join
 
@@ -565,6 +685,7 @@ class NativeRuntime(object):
                     self._finished.get((task.step, s)) for s in siblings(foreach_stack)
                 ]
                 join_type = "foreach"
+                index = self._translate_index(task, next_step, "join")
             else:
                 # next step is a split
                 # required tasks are all branches joined by the next step
@@ -573,11 +694,14 @@ class NativeRuntime(object):
                     for step in self._graph[next_step].in_funcs
                 ]
                 join_type = "linear"
+                index = self._translate_index(task, next_step, "linear")
 
             if all(required_tasks):
                 # all tasks to be joined are ready. Schedule the next join step.
                 self._queue_push(
-                    next_step,
+                    next_step,
+                    {"input_paths": required_tasks, "join_type": join_type},
+                    index,
                 )
 
     def _queue_task_foreach(self, task, next_steps):
@@ -598,6 +722,12 @@ class NativeRuntime(object):
             # Need to push control process related task.
             ubf_iter_name = task.results.get("_foreach_var")
             ubf_iter = task.results.get(ubf_iter_name)
+            # UBF control task has no split index, hence "None" as place holder.
+
+            if task.results.get("_control_task_is_mapper_zero", False):
+                index = self._translate_index(task, next_step, "split", 0)
+            else:
+                index = self._translate_index(task, next_step, "split", None)
             self._queue_push(
                 next_step,
                 {
@@ -605,6 +735,7 @@ class NativeRuntime(object):
                     "ubf_context": UBF_CONTROL,
                     "ubf_iter": ubf_iter,
                 },
+                index,
             )
         else:
             num_splits = task.results["_foreach_num_splits"]
@@ -624,8 +755,11 @@ class NativeRuntime(object):
 
             # schedule all splits
             for i in range(num_splits):
+                index = self._translate_index(task, next_step, "split", i)
                 self._queue_push(
-                    next_step,
+                    next_step,
+                    {"split_index": str(i), "input_paths": [task.path]},
+                    index,
                 )
 
     def _queue_tasks(self, finished_tasks):
|
@@ -673,7 +807,8 @@ class NativeRuntime(object):
|
|
673
807
|
else:
|
674
808
|
# Next steps are normal linear steps
|
675
809
|
for step in next_steps:
|
676
|
-
self.
|
810
|
+
index = self._translate_index(task, step, "linear")
|
811
|
+
self._queue_push(step, {"input_paths": [task.path]}, index)
|
677
812
|
|
678
813
|
def _poll_workers(self):
|
679
814
|
if self._workers:
|
@@ -794,6 +929,7 @@ class Task(object):
         join_type=None,
         task_id=None,
         resume_identifier=None,
+        pathspec_index=None,
     ):
         self.step = step
         self.flow = flow
@@ -836,10 +972,9 @@ class Task(object):
         self._is_resume_leader = None
         self._resume_done = None
         self._resume_identifier = resume_identifier
-
         origin = None
         if clone_run_id and may_clone:
-            origin = self._find_origin_task(clone_run_id, join_type)
+            origin = self._find_origin_task(clone_run_id, join_type, pathspec_index)
         if origin and origin["_task_ok"]:
             # At this point, we know we are going to clone
             self._is_cloned = True
@@ -960,10 +1095,11 @@ class Task(object):
             )
 
         if self._is_resume_leader:
+            if reentrant:
+                self.log(
+                    "Selected as the reentrant clone leader.",
+                    system_msg=True,
+                )
             # Clone in place without relying on run_queue.
             self.new_attempt()
             self._ds.clone(origin)
@@ -1108,63 +1244,34 @@ class Task(object):
 
     def _get_task_id(self, task_id):
         already_existed = True
+        tags = []
         if self.ubf_context == UBF_CONTROL:
-            # We associate the control task-id to be 1:1 with the split node
-            # where the unbounded-foreach was defined.
-            # We prefer encoding the corresponding split into the task_id of
-            # the control node; so it has access to this information quite
-            # easily. There is anyway a corresponding int id stored in the
-            # metadata backend - so this should be fine.
-            task_id = "control-%s-%s-%s" % (run, input_step, input_task)
-        # Register only regular Metaflow (non control) tasks.
+            tags = [CONTROL_TASK_TAG]
+        # Register Metaflow tasks.
         if task_id is None:
-            task_id = str(
+            task_id = str(
+                self.metadata.new_task_id(self.run_id, self.step, sys_tags=tags)
+            )
             already_existed = False
         else:
-            # task_id is preset only by persist_constants()
-                attempt_id,
-                sys_tags=tags,
-            )
-            # A Task's tags are now those of its ancestral Run, so we are not able
-            # to rely on a task's tags to indicate the presence of a control task
-            # so, on top of adding the tags above, we also add a task metadata
-            # entry indicating that this is a "control task".
-            #
-            # Here we will also add a task metadata entry to indicate "control task".
-            # Within the metaflow repo, the only dependency of such a "control task"
-            # indicator is in the integration test suite (see Step.control_tasks() in
-            # client API).
-            task_metadata_list = [
-                MetaDatum(
-                    field="internal_task_type",
-                    value=CONTROL_TASK_TAG,
-                    type="internal_task_type",
-                    tags=["attempt_id:{0}".format(attempt_id)],
-                )
-            ]
-            self.metadata.register_metadata(
-                self.run_id, self.step, task_id, task_metadata_list
-            )
-        else:
-            already_existed = not self.metadata.register_task_id(
-                self.run_id, self.step, task_id, 0
-            )
+            # task_id is preset only by persist_constants().
+            already_existed = not self.metadata.register_task_id(
+                self.run_id,
+                self.step,
+                task_id,
+                0,
+                sys_tags=tags,
+            )
 
         self.task_id = task_id
         self._path = "%s/%s/%s" % (self.run_id, self.step, self.task_id)
         return already_existed
 
-    def _find_origin_task(self, clone_run_id, join_type):
-        if self.step == "_parameters":
+    def _find_origin_task(self, clone_run_id, join_type, pathspec_index=None):
+        if pathspec_index:
+            origin = self.origin_ds_set.get_with_pathspec_index(pathspec_index)
+            return origin
+        elif self.step == "_parameters":
             pathspec = "%s/_parameters[]" % clone_run_id
             origin = self.origin_ds_set.get_with_pathspec_index(pathspec)
 
@@ -1214,6 +1321,11 @@ class Task(object):
         )
         return self._results_ds
 
+    @property
+    def task_index(self):
+        _, task_index = self.results.pathspec_index.split("/")
+        return task_index
+
     @property
     def finished_id(self):
         # note: id is not available before the task has finished.
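The task_index property strips the run id from pathspec_index, yielding exactly the {step_name}[{foreach_index}] form that _translate_index parses. For illustration, with hypothetical values (format inferred from the "%s/_parameters[]" pathspec built in _find_origin_task above):

    pathspec_index = "221/train[0,2]"  # "{run_id}/{step_name}[{foreach_index}]"
    _, task_index = pathspec_index.split("/")
    assert task_index == "train[0,2]"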
metaflow/version.py
CHANGED
@@ -1 +1 @@
-metaflow_version = "2.12.7"
+metaflow_version = "2.12.9"
{metaflow-2.12.7.dist-info → metaflow-2.12.9.dist-info}/METADATA
CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: metaflow
-Version: 2.12.7
+Version: 2.12.9
 Summary: Metaflow: More Data Science, Less Engineering
 Author: Metaflow Developers
 Author-email: help@metaflow.org
@@ -26,7 +26,7 @@ License-File: LICENSE
 Requires-Dist: requests
 Requires-Dist: boto3
 Provides-Extra: stubs
-Requires-Dist: metaflow-stubs ==2.12.7 ; extra == 'stubs'
+Requires-Dist: metaflow-stubs ==2.12.9 ; extra == 'stubs'
 
 
 