metaflow 2.18.12__py2.py3-none-any.whl → 2.19.0__py2.py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (52)
  1. metaflow/__init__.py +1 -0
  2. metaflow/cli.py +78 -13
  3. metaflow/cli_components/run_cmds.py +182 -39
  4. metaflow/cli_components/step_cmd.py +160 -4
  5. metaflow/client/__init__.py +1 -0
  6. metaflow/client/core.py +162 -99
  7. metaflow/client/filecache.py +59 -32
  8. metaflow/cmd/code/__init__.py +2 -1
  9. metaflow/datastore/__init__.py +1 -0
  10. metaflow/datastore/content_addressed_store.py +40 -9
  11. metaflow/datastore/datastore_set.py +10 -1
  12. metaflow/datastore/flow_datastore.py +123 -4
  13. metaflow/datastore/spin_datastore.py +91 -0
  14. metaflow/datastore/task_datastore.py +86 -2
  15. metaflow/decorators.py +75 -6
  16. metaflow/extension_support/__init__.py +372 -305
  17. metaflow/flowspec.py +3 -2
  18. metaflow/graph.py +2 -2
  19. metaflow/metaflow_config.py +41 -0
  20. metaflow/metaflow_profile.py +18 -0
  21. metaflow/packaging_sys/utils.py +2 -39
  22. metaflow/packaging_sys/v1.py +63 -16
  23. metaflow/plugins/__init__.py +2 -0
  24. metaflow/plugins/argo/argo_workflows.py +20 -25
  25. metaflow/plugins/argo/param_val.py +19 -0
  26. metaflow/plugins/cards/card_datastore.py +13 -13
  27. metaflow/plugins/cards/card_decorator.py +1 -0
  28. metaflow/plugins/cards/card_modules/basic.py +9 -3
  29. metaflow/plugins/datastores/local_storage.py +12 -6
  30. metaflow/plugins/datastores/spin_storage.py +12 -0
  31. metaflow/plugins/datatools/s3/s3.py +29 -10
  32. metaflow/plugins/datatools/s3/s3op.py +90 -62
  33. metaflow/plugins/metadata_providers/local.py +76 -82
  34. metaflow/plugins/metadata_providers/spin.py +16 -0
  35. metaflow/runner/click_api.py +4 -2
  36. metaflow/runner/metaflow_runner.py +210 -19
  37. metaflow/runtime.py +348 -21
  38. metaflow/task.py +61 -12
  39. metaflow/user_configs/config_parameters.py +2 -4
  40. metaflow/user_decorators/mutable_flow.py +1 -1
  41. metaflow/user_decorators/user_step_decorator.py +10 -1
  42. metaflow/util.py +191 -1
  43. metaflow/version.py +1 -1
  44. {metaflow-2.18.12.data → metaflow-2.19.0.data}/data/share/metaflow/devtools/Makefile +10 -0
  45. {metaflow-2.18.12.dist-info → metaflow-2.19.0.dist-info}/METADATA +2 -4
  46. {metaflow-2.18.12.dist-info → metaflow-2.19.0.dist-info}/RECORD +52 -48
  47. {metaflow-2.18.12.data → metaflow-2.19.0.data}/data/share/metaflow/devtools/Tiltfile +0 -0
  48. {metaflow-2.18.12.data → metaflow-2.19.0.data}/data/share/metaflow/devtools/pick_services.sh +0 -0
  49. {metaflow-2.18.12.dist-info → metaflow-2.19.0.dist-info}/WHEEL +0 -0
  50. {metaflow-2.18.12.dist-info → metaflow-2.19.0.dist-info}/entry_points.txt +0 -0
  51. {metaflow-2.18.12.dist-info → metaflow-2.19.0.dist-info}/licenses/LICENSE +0 -0
  52. {metaflow-2.18.12.dist-info → metaflow-2.19.0.dist-info}/top_level.txt +0 -0
metaflow/client/core.py CHANGED
@@ -207,6 +207,20 @@ def default_namespace() -> str:
     return get_namespace()
 
 
+def inspect_spin(datastore_root: str = "."):
+    """
+    Set metadata provider to spin metadata so that users can inspect spin
+    steps, tasks, and artifacts.
+
+    Parameters
+    ----------
+    datastore_root : str, default "."
+        The root path to the spin datastore.
+    """
+    metadata_str = f"spin@{datastore_root}"
+    metadata(metadata_str)
+
+
 MetaflowArtifacts = NamedTuple
 
 
@@ -277,6 +291,7 @@ class MetaflowObject(object):
         self._attempt = attempt
         self._current_namespace = _current_namespace or get_namespace()
         self._namespace_check = _namespace_check
+
         # If the current namespace is False, we disable checking for namespace for this
         # and all children objects. Not setting namespace_check to False has the consequence
         # of preventing access to children objects after the namespace changes
@@ -1189,149 +1204,197 @@ class Task(MetaflowObject):
     _PARENT_CLASS = "step"
     _CHILD_CLASS = "artifact"
 
-    def __init__(self, *args, **kwargs):
-        super(Task, self).__init__(*args, **kwargs)
-
     def _iter_filter(self, x):
         # exclude private data artifacts
         return x.id[0] != "_"
 
-    def _iter_matching_tasks(self, steps, metadata_key, metadata_pattern):
+    def _get_matching_pathspecs(self, steps, metadata_key, metadata_pattern):
         """
-        Yield tasks from specified steps matching a foreach path pattern.
+        Yield pathspecs of tasks from specified steps that match a given metadata pattern.
 
         Parameters
         ----------
         steps : List[str]
-            List of step names to search for tasks
-        pattern : str
-            Regex pattern to match foreach-indices metadata
+            List of Step objects to search for tasks.
+        metadata_key : str
+            Metadata key to filter tasks on (e.g., 'foreach-execution-path').
+        metadata_pattern : str
+            Regular expression pattern to match against the metadata value.
 
-        Returns
-        -------
-        Iterator[Task]
-            Tasks matching the foreach path pattern
+        Yields
+        ------
+        str
+            Pathspec of each task whose metadata value for the specified key matches the pattern.
         """
         flow_id, run_id, _, _ = self.path_components
-
         for step in steps:
             task_pathspecs = self._metaflow.metadata.filter_tasks_by_metadata(
-                flow_id, run_id, step.id, metadata_key, metadata_pattern
+                flow_id, run_id, step, metadata_key, metadata_pattern
             )
             for task_pathspec in task_pathspecs:
-                yield Task(pathspec=task_pathspec, _namespace_check=False)
+                yield task_pathspec
+
+    @staticmethod
+    def _get_previous_steps(graph_info, step_name):
+        # Get the parent steps
+        steps = []
+        for node_name, attributes in graph_info["steps"].items():
+            if step_name in attributes["next"]:
+                steps.append(node_name)
+        return steps
 
     @property
-    def parent_tasks(self) -> Iterator["Task"]:
+    def parent_task_pathspecs(self) -> Iterator[str]:
         """
-        Yields all parent tasks of the current task if one exists.
+        Yields pathspecs of all parent tasks of the current task.
 
         Yields
         ------
-        Task
-            Parent task of the current task
-
+        str
+            Pathspec of the parent task of the current task
         """
-        flow_id, run_id, _, _ = self.path_components
+        _, _, step_name, _ = self.path_components
+        metadata_dict = self.metadata_dict
+        graph_info = self["_graph_info"].data
 
-        steps = list(self.parent.parent_steps)
-        if not steps:
-            return []
-
-        current_path = self.metadata_dict.get("foreach-execution-path", "")
+        # Get the parent steps
+        steps = self._get_previous_steps(graph_info, step_name)
+        node_type = graph_info["steps"][step_name]["type"]
+        metadata_key = "foreach-execution-path"
+        current_path = metadata_dict.get(metadata_key)
 
         if len(steps) > 1:
             # Static join - use exact path matching
             pattern = current_path or ".*"
-            yield from self._iter_matching_tasks(
-                steps, "foreach-execution-path", pattern
-            )
-            return
-
-        # Handle single step case
-        target_task = Step(
-            f"{flow_id}/{run_id}/{steps[0].id}", _namespace_check=False
-        ).task
-        target_path = target_task.metadata_dict.get("foreach-execution-path")
-
-        if not target_path or not current_path:
-            # (Current task, "A:10") and (Parent task, "")
-            # Pattern: ".*"
-            pattern = ".*"
         else:
-            current_depth = len(current_path.split(","))
-            target_depth = len(target_path.split(","))
-
-            if current_depth < target_depth:
-                # Foreach join
-                # (Current task, "A:10,B:13") and (Parent task, "A:10,B:13,C:21")
-                # Pattern: "A:10,B:13,.*"
-                pattern = f"{current_path},.*"
-            else:
-                # Foreach split or linear step
-                # Option 1:
-                # (Current task, "A:10,B:13,C:21") and (Parent task, "A:10,B:13")
-                # Option 2:
-                # (Current task, "A:10,B:13") and (Parent task, "A:10,B:13")
-                # Pattern: "A:10,B:13"
-                pattern = ",".join(current_path.split(",")[:target_depth])
+            if not steps:
+                return  # No parent steps, yield nothing
 
-        yield from self._iter_matching_tasks(steps, "foreach-execution-path", pattern)
+            if not current_path:
+                # Current task is not part of a foreach
+                # Pattern: ".*"
+                pattern = ".*"
+            else:
+                current_depth = len(current_path.split(","))
+                if node_type == "join":
+                    # Foreach join
+                    # (Current task, "A:10,B:13") and (Parent task, "A:10,B:13,C:21")
+                    # Pattern: "A:10,B:13,.*"
+                    pattern = f"{current_path},.*"
+                else:
+                    # Foreach split or linear step
+                    # Pattern: "A:10,B:13"
+                    parent_step_type = graph_info["steps"][steps[0]]["type"]
+                    target_depth = current_depth
+                    if (
+                        parent_step_type == "split-foreach"
+                        or parent_step_type == "split-parallel"
+                    ) and current_depth == 1:
+                        # (Current task, "A:10") and (Parent task, "")
+                        pattern = ".*"
+                    else:
+                        # (Current task, "A:10,B:13,C:21") and (Parent task, "A:10,B:13")
+                        # (Current task, "A:10,B:13") and (Parent task, "A:10,B:13")
+                        if (
+                            parent_step_type == "split-foreach"
+                            or parent_step_type == "split-parallel"
+                        ):
+                            target_depth = current_depth - 1
+                        pattern = ",".join(current_path.split(",")[:target_depth])
+
+        for pathspec in self._get_matching_pathspecs(steps, metadata_key, pattern):
+            yield pathspec
 
     @property
-    def child_tasks(self) -> Iterator["Task"]:
+    def child_task_pathspecs(self) -> Iterator[str]:
         """
-        Yield all child tasks of the current task if one exists.
+        Yields pathspecs of all child tasks of the current task.
 
         Yields
         ------
-        Task
-            Child task of the current task
+        str
+            Pathspec of the child task of the current task
         """
-        flow_id, run_id, _, _ = self.path_components
-        steps = list(self.parent.child_steps)
-        if not steps:
-            return []
+        flow_id, run_id, step_name, _ = self.path_components
+        metadata_dict = self.metadata_dict
+        graph_info = self["_graph_info"].data
+
+        # Get the child steps
+        steps = graph_info["steps"][step_name]["next"]
 
-        current_path = self.metadata_dict.get("foreach-execution-path", "")
+        node_type = graph_info["steps"][step_name]["type"]
+        metadata_key = "foreach-execution-path"
+        current_path = metadata_dict.get(metadata_key)
 
         if len(steps) > 1:
             # Static split - use exact path matching
             pattern = current_path or ".*"
-            yield from self._iter_matching_tasks(
-                steps, "foreach-execution-path", pattern
-            )
-            return
-
-        # Handle single step case
-        target_task = Step(
-            f"{flow_id}/{run_id}/{steps[0].id}", _namespace_check=False
-        ).task
-        target_path = target_task.metadata_dict.get("foreach-execution-path")
-
-        if not target_path or not current_path:
-            # (Current task, "A:10") and (Child task, "")
-            # Pattern: ".*"
-            pattern = ".*"
         else:
-            current_depth = len(current_path.split(","))
-            target_depth = len(target_path.split(","))
-
-            if current_depth < target_depth:
-                # Foreach split
-                # (Current task, "A:10,B:13") and (Child task, "A:10,B:13,C:21")
-                # Pattern: "A:10,B:13,.*"
-                pattern = f"{current_path},.*"
+            if not steps:
+                return  # No child steps, yield nothing
+
+            if not current_path:
+                # Current task is not part of a foreach
+                # Pattern: ".*"
+                pattern = ".*"
             else:
-                # Foreach join or linear step
-                # Option 1:
-                # (Current task, "A:10,B:13,C:21") and (Child task, "A:10,B:13")
-                # Option 2:
-                # (Current task, "A:10,B:13") and (Child task, "A:10,B:13")
-                # Pattern: "A:10,B:13"
-                pattern = ",".join(current_path.split(",")[:target_depth])
-
-        yield from self._iter_matching_tasks(steps, "foreach-execution-path", pattern)
+                current_depth = len(current_path.split(","))
+                if node_type == "split-foreach" or node_type == "split-parallel":
+                    # Foreach split
+                    # (Current task, "A:10,B:13") and (Child task, "A:10,B:13,C:21")
+                    # Pattern: "A:10,B:13,.*"
+                    pattern = f"{current_path},.*"
+                else:
+                    # Foreach join or linear step
+                    # Pattern: "A:10,B:13"
+                    child_step_type = graph_info["steps"][steps[0]]["type"]
+
+                    # We need to know if the child step is a foreach join or a static join
+                    child_step_prev_steps = self._get_previous_steps(
+                        graph_info, steps[0]
+                    )
+                    if len(child_step_prev_steps) > 1:
+                        child_step_type = "static-join"
+                    target_depth = current_depth
+                    if child_step_type == "join" and current_depth == 1:
+                        # (Current task, "A:10") and (Child task, "")
+                        pattern = ".*"
+                    else:
+                        # (Current task, "A:10,B:13,C:21") and (Child task, "A:10,B:13")
+                        # (Current task, "A:10,B:13") and (Child task, "A:10,B:13")
+                        if child_step_type == "join":
+                            target_depth = current_depth - 1
+                        pattern = ",".join(current_path.split(",")[:target_depth])
+
+        for pathspec in self._get_matching_pathspecs(steps, metadata_key, pattern):
+            yield pathspec
+
+    @property
+    def parent_tasks(self) -> Iterator["Task"]:
+        """
+        Yields all parent tasks of the current task if one exists.
+
+        Yields
+        ------
+        Task
+            Parent task of the current task
+        """
+        parent_task_pathspecs = self.parent_task_pathspecs
+        for pathspec in parent_task_pathspecs:
+            yield Task(pathspec=pathspec, _namespace_check=False)
+
+    @property
+    def child_tasks(self) -> Iterator["Task"]:
+        """
+        Yields all child tasks of the current task if one exists.
+
+        Yields
+        ------
+        Task
+            Child task of the current task
+        """
+        for pathspec in self.child_task_pathspecs:
+            yield Task(pathspec=pathspec, _namespace_check=False)
 
     @property
     def metadata(self) -> List[Metadata]:
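The new pathspec-based properties walk the run graph using the `_graph_info` artifact and the `foreach-execution-path` metadata, yielding plain pathspec strings so callers only pay for constructing Task objects when they need them; `inspect_spin` simply points the client's metadata provider at a local spin datastore. A minimal usage sketch (the flow, run, step, and task ids are hypothetical, and the import path for `inspect_spin` is an assumption):

from metaflow import Task, namespace

namespace(None)  # ad-hoc inspection across namespaces
task = Task("HelloFlow/12/join_step/456")  # hypothetical pathspec

# Cheap lineage inspection: pathspec strings only
for pathspec in task.parent_task_pathspecs:
    print("parent:", pathspec)

# Full Task objects when artifacts or metadata are needed
for child in task.child_tasks:
    print(child.pathspec, child.finished_at)

# To browse locally stored spin steps/tasks/artifacts, point the client at the
# spin datastore first (inspect_spin wraps metadata(f"spin@{datastore_root}")):
# from metaflow import inspect_spin  # assumed export; see the __init__.py changes
# inspect_spin(datastore_root=".")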
metaflow/client/filecache.py CHANGED
@@ -1,5 +1,6 @@
 from __future__ import print_function
 from collections import OrderedDict
+import json
 import os
 import sys
 import time
@@ -10,13 +11,14 @@ from urllib.parse import urlparse
 
 from metaflow.datastore import FlowDataStore
 from metaflow.datastore.content_addressed_store import BlobCache
+from metaflow.datastore.flow_datastore import MetadataCache
 from metaflow.exception import MetaflowException
 from metaflow.metaflow_config import (
     CLIENT_CACHE_PATH,
     CLIENT_CACHE_MAX_SIZE,
     CLIENT_CACHE_MAX_FLOWDATASTORE_COUNT,
-    CLIENT_CACHE_MAX_TASKDATASTORE_COUNT,
 )
+from metaflow.metaflow_profile import from_start
 
 from metaflow.plugins import DATASTORES
 
@@ -63,8 +65,8 @@ class FileCache(object):
         # when querying for sizes of artifacts. Once we have queried for the size
         # of one artifact in a TaskDatastore, caching this means that any
         # queries on that same TaskDatastore will be quick (since we already
-        # have all the metadata)
-        self._task_metadata_caches = OrderedDict()
+        # have all the metadata). We keep track of this in a file so it persists
+        # across processes.
 
     @property
     def cache_dir(self):
@@ -87,7 +89,7 @@ class FileCache(object):
     ):
         ds_cls = self._get_datastore_storage_impl(ds_type)
         ds_root = ds_cls.path_join(*ds_cls.path_split(location)[:-5])
-        cache_id = self._flow_ds_id(ds_type, ds_root, flow_name)
+        cache_id = self.flow_ds_id(ds_type, ds_root, flow_name)
 
         token = (
             "%s.cached"
@@ -311,13 +313,13 @@ class FileCache(object):
         self._objects = sorted(objects, reverse=False)
 
     @staticmethod
-    def _flow_ds_id(ds_type, ds_root, flow_name):
+    def flow_ds_id(ds_type, ds_root, flow_name):
         p = urlparse(ds_root)
         sanitized_root = (p.netloc + p.path).replace("/", "_")
         return ".".join([ds_type, sanitized_root, flow_name])
 
     @staticmethod
-    def _task_ds_id(ds_type, ds_root, flow_name, run_id, step_name, task_id, attempt):
+    def task_ds_id(ds_type, ds_root, flow_name, run_id, step_name, task_id, attempt):
         p = urlparse(ds_root)
         sanitized_root = (p.netloc + p.path).replace("/", "_")
         return ".".join(
@@ -365,7 +367,7 @@ class FileCache(object):
         return storage_impl[0]
 
     def _get_flow_datastore(self, ds_type, ds_root, flow_name):
-        cache_id = self._flow_ds_id(ds_type, ds_root, flow_name)
+        cache_id = self.flow_ds_id(ds_type, ds_root, flow_name)
         cached_flow_datastore = self._store_caches.get(cache_id)
 
         if cached_flow_datastore:
@@ -380,9 +382,14 @@ class FileCache(object):
             ds_root=ds_root,
         )
         blob_cache = self._blob_caches.setdefault(
-            cache_id, FileBlobCache(self, cache_id)
+            cache_id,
+            (
+                FileBlobCache(self, cache_id),
+                TaskMetadataCache(self, ds_type, ds_root, flow_name),
+            ),
         )
-        cached_flow_datastore.ca_store.set_blob_cache(blob_cache)
+        cached_flow_datastore.ca_store.set_blob_cache(blob_cache[0])
+        cached_flow_datastore.set_metadata_cache(blob_cache[1])
         self._store_caches[cache_id] = cached_flow_datastore
         if len(self._store_caches) > CLIENT_CACHE_MAX_FLOWDATASTORE_COUNT:
             cache_id_to_remove, _ = self._store_caches.popitem(last=False)
@@ -393,32 +400,52 @@
         self, ds_type, ds_root, flow_name, run_id, step_name, task_id, attempt
     ):
         flow_ds = self._get_flow_datastore(ds_type, ds_root, flow_name)
-        cached_metadata = None
-        if attempt is not None:
-            cache_id = self._task_ds_id(
-                ds_type, ds_root, flow_name, run_id, step_name, task_id, attempt
+
+        return flow_ds.get_task_datastore(run_id, step_name, task_id, attempt=attempt)
+
+
+class TaskMetadataCache(MetadataCache):
+    def __init__(self, filecache, ds_type, ds_root, flow_name):
+        self._filecache = filecache
+        self._ds_type = ds_type
+        self._ds_root = ds_root
+        self._flow_name = flow_name
+
+    def _path(self, run_id, step_name, task_id, attempt):
+        if attempt is None:
+            raise MetaflowException(
+                "Attempt number must be specified to use task metadata cache. Raise an issue "
+                "on Metaflow GitHub if you see this message.",
             )
-            cached_metadata = self._task_metadata_caches.get(cache_id)
-            if cached_metadata:
-                od_move_to_end(self._task_metadata_caches, cache_id)
-                return flow_ds.get_task_datastore(
-                    run_id,
-                    step_name,
-                    task_id,
-                    attempt=attempt,
-                    data_metadata=cached_metadata,
-                )
-        # If we are here, we either have attempt=None or nothing in the cache
-        task_ds = flow_ds.get_task_datastore(
-            run_id, step_name, task_id, attempt=attempt
+        cache_id = self._filecache.task_ds_id(
+            self._ds_type,
+            self._ds_root,
+            self._flow_name,
+            run_id,
+            step_name,
+            task_id,
+            attempt,
        )
-        cache_id = self._task_ds_id(
-            ds_type, ds_root, flow_name, run_id, step_name, task_id, task_ds.attempt
+        token = (
+            "%s.cached"
+            % sha1(
+                os.path.join(
+                    run_id, step_name, task_id, str(attempt), "metadata"
+                ).encode("utf-8")
+            ).hexdigest()
+        )
+        return os.path.join(self._filecache.cache_dir, cache_id, token[:2], token)
+
+    def load_metadata(self, run_id, step_name, task_id, attempt):
+        d = self._filecache.read_file(self._path(run_id, step_name, task_id, attempt))
+        if d:
+            return json.loads(d)
+
+    def store_metadata(self, run_id, step_name, task_id, attempt, metadata_dict):
+        self._filecache.create_file(
+            self._path(run_id, step_name, task_id, attempt),
+            json.dumps(metadata_dict).encode("utf-8"),
         )
-        self._task_metadata_caches[cache_id] = task_ds.ds_metadata
-        if len(self._task_metadata_caches) > CLIENT_CACHE_MAX_TASKDATASTORE_COUNT:
-            self._task_metadata_caches.popitem(last=False)
-        return task_ds
 
 
 class FileBlobCache(BlobCache):
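The per-task metadata cache now lives on disk (one JSON file per task attempt under the client cache directory) instead of an in-process OrderedDict, so it survives across client processes. The contract comes from the new MetadataCache base class in metaflow.datastore.flow_datastore: implement load_metadata and store_metadata and attach the object with set_metadata_cache. A minimal in-memory sketch of that interface, for illustration only:

from metaflow.datastore.flow_datastore import MetadataCache


class InMemoryMetadataCache(MetadataCache):
    # Illustrative only: keeps task metadata in a dict instead of cache files.

    def __init__(self):
        self._cache = {}

    def load_metadata(self, run_id, step_name, task_id, attempt):
        # Return None on a miss, like TaskMetadataCache.load_metadata above
        return self._cache.get((run_id, step_name, task_id, attempt))

    def store_metadata(self, run_id, step_name, task_id, attempt, metadata_dict):
        self._cache[(run_id, step_name, task_id, attempt)] = metadata_dict


# Wired up the same way FileCache attaches TaskMetadataCache:
# flow_datastore.set_metadata_cache(InMemoryMetadataCache())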
metaflow/cmd/code/__init__.py CHANGED
@@ -6,6 +6,7 @@ from tempfile import TemporaryDirectory
 from typing import Any, Callable, List, Mapping, Optional, cast
 
 from metaflow import Run
+from metaflow.util import walk_without_cycles
 from metaflow._vendor import click
 from metaflow.cli import echo_always
 
@@ -51,7 +52,7 @@ def perform_diff(
         target_dir = os.getcwd()
 
     diffs = []
-    for dirpath, dirnames, filenames in os.walk(source_dir, followlinks=True):
+    for dirpath, _, filenames in walk_without_cycles(source_dir):
         for fname in filenames:
             # NOTE: the paths below need to be set up carefully
             # for the `patch` command to work. Better not to touch
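Replacing os.walk(source_dir, followlinks=True) with walk_without_cycles (added to metaflow/util.py in this release) avoids walking forever when a symlink points back into an ancestor directory. The shipped helper lives in metaflow.util; a rough sketch of the idea, not the actual implementation, is to skip any directory whose resolved path has already been visited:

import os


def walk_skipping_cycles(top):
    # Illustrative sketch: follow symlinks but never descend into a
    # directory whose real path was already seen.
    seen = set()
    for dirpath, dirnames, filenames in os.walk(top, followlinks=True):
        real = os.path.realpath(dirpath)
        if real in seen:
            dirnames[:] = []  # prune: do not descend again
            continue
        seen.add(real)
        yield dirpath, dirnames, filenames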
metaflow/datastore/__init__.py CHANGED
@@ -2,3 +2,4 @@ from .inputs import Inputs
 from .flow_datastore import FlowDataStore
 from .datastore_set import TaskDataStoreSet
 from .task_datastore import TaskDataStore
+from .spin_datastore import SpinTaskDatastore
metaflow/datastore/content_addressed_store.py CHANGED
@@ -38,7 +38,7 @@ class ContentAddressedStore(object):
     def set_blob_cache(self, blob_cache):
         self._blob_cache = blob_cache
 
-    def save_blobs(self, blob_iter, raw=False, len_hint=0):
+    def save_blobs(self, blob_iter, raw=False, len_hint=0, is_transfer=False):
         """
         Saves blobs of data to the datastore
 
@@ -60,11 +60,16 @@
 
         Parameters
         ----------
-        blob_iter : Iterator over bytes objects to save
-        raw : bool, optional
+        blob_iter : Iterator
+            Iterator over bytes objects to save
+        raw : bool, default False
             Whether to save the bytes directly or process them, by default False
-        len_hint : Hint of the number of blobs that will be produced by the
+        len_hint : int, default 0
+            Hint of the number of blobs that will be produced by the
             iterator, by default 0
+        is_transfer : bool, default False
+            If True, this indicates we are saving blobs directly from the output of another
+            content addressed store's
 
         Returns
         -------
@@ -76,6 +81,20 @@
 
         def packing_iter():
             for blob in blob_iter:
+                if is_transfer:
+                    key, blob_data, meta = blob
+                    path = self._storage_impl.path_join(self._prefix, key[:2], key)
+                    # Transfer data is always raw/decompressed, so mark it as such
+                    meta_corrected = {"cas_raw": True, "cas_version": 1}
+
+                    results.append(
+                        self.save_blobs_result(
+                            uri=self._storage_impl.full_uri(path),
+                            key=key,
+                        )
+                    )
+                    yield path, (BytesIO(blob_data), meta_corrected)
+                    continue
                 sha = sha1(blob).hexdigest()
                 path = self._storage_impl.path_join(self._prefix, sha[:2], sha)
                 results.append(
@@ -100,7 +119,7 @@ class ContentAddressedStore(object):
         self._storage_impl.save_bytes(packing_iter(), overwrite=True, len_hint=len_hint)
         return results
 
-    def load_blobs(self, keys, force_raw=False):
+    def load_blobs(self, keys, force_raw=False, is_transfer=False):
         """
         Mirror function of save_blobs
 
@@ -111,15 +130,20 @@
         ----------
         keys : List of string
             Key describing the object to load
-        force_raw : bool, optional
+        force_raw : bool, default False
             Support for backward compatibility with previous datastores. If
             True, this will force the key to be loaded as is (raw). By default,
             False
+        is_transfer : bool, default False
+            If True, this indicates we are loading blobs to transfer them directly
+            to another datastore. We will, in this case, also transfer the metadata
+            and do minimal processing. This is for internal use only.
 
         Returns
         -------
         Returns an iterator of (string, bytes) tuples; the iterator may return keys
-        in a different order than were passed in.
+        in a different order than were passed in. If is_transfer is True, the tuple
+        has three elements with the third one being the metadata.
         """
         load_paths = []
         for key in keys:
@@ -127,7 +151,11 @@
             if self._blob_cache:
                 blob = self._blob_cache.load_key(key)
                 if blob is not None:
-                    yield key, blob
+                    if is_transfer:
+                        # Cached blobs are decompressed/processed bytes regardless of original format
+                        yield key, blob, {"cas_raw": False, "cas_version": 1}
+                    else:
+                        yield key, blob
                 else:
                     path = self._storage_impl.path_join(self._prefix, key[:2], key)
                     load_paths.append((key, path))
@@ -169,7 +197,10 @@
             if self._blob_cache:
                 self._blob_cache.store_key(key, blob)
 
-            yield key, blob
+            if is_transfer:
+                yield key, blob, meta  # Preserve exact original metadata from storage
+            else:
+                yield key, blob
 
     def _unpack_backward_compatible(self, blob):
         # This is the backward compatible unpack
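With is_transfer=True, load_blobs yields (key, bytes, metadata) triples instead of (key, bytes) pairs, and save_blobs accepts those triples directly, writing them under the same content-addressed key without re-hashing or re-packing the payload. A hedged sketch of the round trip between two stores (the helper name is hypothetical, and both stores are assumed to be ContentAddressedStore instances obtained elsewhere):

def transfer_blobs(source_store, dest_store, keys):
    # load_blobs(..., is_transfer=True) yields (key, blob_bytes, metadata) triples
    triples = source_store.load_blobs(keys, is_transfer=True)
    # save_blobs(..., is_transfer=True) stores each blob under its original key,
    # reusing the supplied metadata instead of hashing and packing the payload
    return dest_store.save_blobs(triples, is_transfer=True, len_hint=len(keys))

# Each result is a save_blobs_result with `key` and `uri`, e.g.:
# for res in transfer_blobs(src.ca_store, dst.ca_store, keys):
#     print(res.key, res.uri)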
metaflow/datastore/datastore_set.py CHANGED
@@ -21,9 +21,18 @@ class TaskDataStoreSet(object):
         pathspecs=None,
         prefetch_data_artifacts=None,
         allow_not_done=False,
+        join_type=None,
+        orig_flow_datastore=None,
+        spin_artifacts=None,
     ):
         self.task_datastores = flow_datastore.get_task_datastores(
-            run_id, steps=steps, pathspecs=pathspecs, allow_not_done=allow_not_done
+            run_id,
+            steps=steps,
+            pathspecs=pathspecs,
+            allow_not_done=allow_not_done,
+            join_type=join_type,
+            orig_flow_datastore=orig_flow_datastore,
+            spin_artifacts=spin_artifacts,
         )
 
         if prefetch_data_artifacts: