metaflow 2.18.12__py2.py3-none-any.whl → 2.19.0__py2.py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (52)
  1. metaflow/__init__.py +1 -0
  2. metaflow/cli.py +78 -13
  3. metaflow/cli_components/run_cmds.py +182 -39
  4. metaflow/cli_components/step_cmd.py +160 -4
  5. metaflow/client/__init__.py +1 -0
  6. metaflow/client/core.py +162 -99
  7. metaflow/client/filecache.py +59 -32
  8. metaflow/cmd/code/__init__.py +2 -1
  9. metaflow/datastore/__init__.py +1 -0
  10. metaflow/datastore/content_addressed_store.py +40 -9
  11. metaflow/datastore/datastore_set.py +10 -1
  12. metaflow/datastore/flow_datastore.py +123 -4
  13. metaflow/datastore/spin_datastore.py +91 -0
  14. metaflow/datastore/task_datastore.py +86 -2
  15. metaflow/decorators.py +75 -6
  16. metaflow/extension_support/__init__.py +372 -305
  17. metaflow/flowspec.py +3 -2
  18. metaflow/graph.py +2 -2
  19. metaflow/metaflow_config.py +41 -0
  20. metaflow/metaflow_profile.py +18 -0
  21. metaflow/packaging_sys/utils.py +2 -39
  22. metaflow/packaging_sys/v1.py +63 -16
  23. metaflow/plugins/__init__.py +2 -0
  24. metaflow/plugins/argo/argo_workflows.py +20 -25
  25. metaflow/plugins/argo/param_val.py +19 -0
  26. metaflow/plugins/cards/card_datastore.py +13 -13
  27. metaflow/plugins/cards/card_decorator.py +1 -0
  28. metaflow/plugins/cards/card_modules/basic.py +9 -3
  29. metaflow/plugins/datastores/local_storage.py +12 -6
  30. metaflow/plugins/datastores/spin_storage.py +12 -0
  31. metaflow/plugins/datatools/s3/s3.py +29 -10
  32. metaflow/plugins/datatools/s3/s3op.py +90 -62
  33. metaflow/plugins/metadata_providers/local.py +76 -82
  34. metaflow/plugins/metadata_providers/spin.py +16 -0
  35. metaflow/runner/click_api.py +4 -2
  36. metaflow/runner/metaflow_runner.py +210 -19
  37. metaflow/runtime.py +348 -21
  38. metaflow/task.py +61 -12
  39. metaflow/user_configs/config_parameters.py +2 -4
  40. metaflow/user_decorators/mutable_flow.py +1 -1
  41. metaflow/user_decorators/user_step_decorator.py +10 -1
  42. metaflow/util.py +191 -1
  43. metaflow/version.py +1 -1
  44. {metaflow-2.18.12.data → metaflow-2.19.0.data}/data/share/metaflow/devtools/Makefile +10 -0
  45. {metaflow-2.18.12.dist-info → metaflow-2.19.0.dist-info}/METADATA +2 -4
  46. {metaflow-2.18.12.dist-info → metaflow-2.19.0.dist-info}/RECORD +52 -48
  47. {metaflow-2.18.12.data → metaflow-2.19.0.data}/data/share/metaflow/devtools/Tiltfile +0 -0
  48. {metaflow-2.18.12.data → metaflow-2.19.0.data}/data/share/metaflow/devtools/pick_services.sh +0 -0
  49. {metaflow-2.18.12.dist-info → metaflow-2.19.0.dist-info}/WHEEL +0 -0
  50. {metaflow-2.18.12.dist-info → metaflow-2.19.0.dist-info}/entry_points.txt +0 -0
  51. {metaflow-2.18.12.dist-info → metaflow-2.19.0.dist-info}/licenses/LICENSE +0 -0
  52. {metaflow-2.18.12.dist-info → metaflow-2.19.0.dist-info}/top_level.txt +0 -0
metaflow/client/core.py CHANGED
@@ -207,6 +207,20 @@ def default_namespace() -> str:
     return get_namespace()
 
 
+def inspect_spin(datastore_root: str = "."):
+    """
+    Set metadata provider to spin metadata so that users can inspect spin
+    steps, tasks, and artifacts.
+
+    Parameters
+    ----------
+    datastore_root : str, default "."
+        The root path to the spin datastore.
+    """
+    metadata_str = f"spin@{datastore_root}"
+    metadata(metadata_str)
+
+
 MetaflowArtifacts = NamedTuple
 
 
@@ -277,6 +291,7 @@ class MetaflowObject(object):
         self._attempt = attempt
         self._current_namespace = _current_namespace or get_namespace()
         self._namespace_check = _namespace_check
+
         # If the current namespace is False, we disable checking for namespace for this
         # and all children objects. Not setting namespace_check to False has the consequence
         # of preventing access to children objects after the namespace changes
@@ -1189,149 +1204,197 @@ class Task(MetaflowObject):
     _PARENT_CLASS = "step"
     _CHILD_CLASS = "artifact"
 
-    def __init__(self, *args, **kwargs):
-        super(Task, self).__init__(*args, **kwargs)
-
     def _iter_filter(self, x):
         # exclude private data artifacts
         return x.id[0] != "_"
 
-    def _iter_matching_tasks(self, steps, metadata_key, metadata_pattern):
+    def _get_matching_pathspecs(self, steps, metadata_key, metadata_pattern):
         """
-        Yield tasks from specified steps matching a foreach path pattern.
+        Yield pathspecs of tasks from specified steps that match a given metadata pattern.
 
         Parameters
         ----------
         steps : List[str]
-            List of step names to search for tasks
-        pattern : str
-            Regex pattern to match foreach-indices metadata
+            List of Step objects to search for tasks.
+        metadata_key : str
+            Metadata key to filter tasks on (e.g., 'foreach-execution-path').
+        metadata_pattern : str
+            Regular expression pattern to match against the metadata value.
 
-        Returns
-        -------
-        Iterator[Task]
-            Tasks matching the foreach path pattern
+        Yields
+        ------
+        str
+            Pathspec of each task whose metadata value for the specified key matches the pattern.
         """
         flow_id, run_id, _, _ = self.path_components
-
         for step in steps:
             task_pathspecs = self._metaflow.metadata.filter_tasks_by_metadata(
-                flow_id, run_id, step.id, metadata_key, metadata_pattern
+                flow_id, run_id, step, metadata_key, metadata_pattern
             )
             for task_pathspec in task_pathspecs:
-                yield Task(pathspec=task_pathspec, _namespace_check=False)
+                yield task_pathspec
+
+    @staticmethod
+    def _get_previous_steps(graph_info, step_name):
+        # Get the parent steps
+        steps = []
+        for node_name, attributes in graph_info["steps"].items():
+            if step_name in attributes["next"]:
+                steps.append(node_name)
+        return steps
 
     @property
-    def parent_tasks(self) -> Iterator["Task"]:
+    def parent_task_pathspecs(self) -> Iterator[str]:
         """
-        Yields all parent tasks of the current task if one exists.
+        Yields pathspecs of all parent tasks of the current task.
 
         Yields
         ------
-        Task
-            Parent task of the current task
-
+        str
+            Pathspec of the parent task of the current task
         """
-        flow_id, run_id, _, _ = self.path_components
+        _, _, step_name, _ = self.path_components
+        metadata_dict = self.metadata_dict
+        graph_info = self["_graph_info"].data
 
-        steps = list(self.parent.parent_steps)
-        if not steps:
-            return []
-
-        current_path = self.metadata_dict.get("foreach-execution-path", "")
+        # Get the parent steps
+        steps = self._get_previous_steps(graph_info, step_name)
+        node_type = graph_info["steps"][step_name]["type"]
+        metadata_key = "foreach-execution-path"
+        current_path = metadata_dict.get(metadata_key)
 
         if len(steps) > 1:
             # Static join - use exact path matching
             pattern = current_path or ".*"
-            yield from self._iter_matching_tasks(
-                steps, "foreach-execution-path", pattern
-            )
-            return
-
-        # Handle single step case
-        target_task = Step(
-            f"{flow_id}/{run_id}/{steps[0].id}", _namespace_check=False
-        ).task
-        target_path = target_task.metadata_dict.get("foreach-execution-path")
-
-        if not target_path or not current_path:
-            # (Current task, "A:10") and (Parent task, "")
-            # Pattern: ".*"
-            pattern = ".*"
         else:
-            current_depth = len(current_path.split(","))
-            target_depth = len(target_path.split(","))
-
-            if current_depth < target_depth:
-                # Foreach join
-                # (Current task, "A:10,B:13") and (Parent task, "A:10,B:13,C:21")
-                # Pattern: "A:10,B:13,.*"
-                pattern = f"{current_path},.*"
-            else:
-                # Foreach split or linear step
-                # Option 1:
-                # (Current task, "A:10,B:13,C:21") and (Parent task, "A:10,B:13")
-                # Option 2:
-                # (Current task, "A:10,B:13") and (Parent task, "A:10,B:13")
-                # Pattern: "A:10,B:13"
-                pattern = ",".join(current_path.split(",")[:target_depth])
+            if not steps:
+                return  # No parent steps, yield nothing
 
-        yield from self._iter_matching_tasks(steps, "foreach-execution-path", pattern)
+            if not current_path:
+                # Current task is not part of a foreach
+                # Pattern: ".*"
+                pattern = ".*"
+            else:
+                current_depth = len(current_path.split(","))
+                if node_type == "join":
+                    # Foreach join
+                    # (Current task, "A:10,B:13") and (Parent task, "A:10,B:13,C:21")
+                    # Pattern: "A:10,B:13,.*"
+                    pattern = f"{current_path},.*"
+                else:
+                    # Foreach split or linear step
+                    # Pattern: "A:10,B:13"
+                    parent_step_type = graph_info["steps"][steps[0]]["type"]
+                    target_depth = current_depth
+                    if (
+                        parent_step_type == "split-foreach"
+                        or parent_step_type == "split-parallel"
+                    ) and current_depth == 1:
+                        # (Current task, "A:10") and (Parent task, "")
+                        pattern = ".*"
+                    else:
+                        # (Current task, "A:10,B:13,C:21") and (Parent task, "A:10,B:13")
+                        # (Current task, "A:10,B:13") and (Parent task, "A:10,B:13")
+                        if (
+                            parent_step_type == "split-foreach"
+                            or parent_step_type == "split-parallel"
+                        ):
+                            target_depth = current_depth - 1
+                        pattern = ",".join(current_path.split(",")[:target_depth])
+
+        for pathspec in self._get_matching_pathspecs(steps, metadata_key, pattern):
+            yield pathspec
 
     @property
-    def child_tasks(self) -> Iterator["Task"]:
+    def child_task_pathspecs(self) -> Iterator[str]:
         """
-        Yield all child tasks of the current task if one exists.
+        Yields pathspecs of all child tasks of the current task.
 
         Yields
         ------
-        Task
-            Child task of the current task
+        str
+            Pathspec of the child task of the current task
         """
-        flow_id, run_id, _, _ = self.path_components
-        steps = list(self.parent.child_steps)
-        if not steps:
-            return []
+        flow_id, run_id, step_name, _ = self.path_components
+        metadata_dict = self.metadata_dict
+        graph_info = self["_graph_info"].data
+
+        # Get the child steps
+        steps = graph_info["steps"][step_name]["next"]
 
-        current_path = self.metadata_dict.get("foreach-execution-path", "")
+        node_type = graph_info["steps"][step_name]["type"]
+        metadata_key = "foreach-execution-path"
+        current_path = metadata_dict.get(metadata_key)
 
         if len(steps) > 1:
             # Static split - use exact path matching
             pattern = current_path or ".*"
-            yield from self._iter_matching_tasks(
-                steps, "foreach-execution-path", pattern
-            )
-            return
-
-        # Handle single step case
-        target_task = Step(
-            f"{flow_id}/{run_id}/{steps[0].id}", _namespace_check=False
-        ).task
-        target_path = target_task.metadata_dict.get("foreach-execution-path")
-
-        if not target_path or not current_path:
-            # (Current task, "A:10") and (Child task, "")
-            # Pattern: ".*"
-            pattern = ".*"
         else:
-            current_depth = len(current_path.split(","))
-            target_depth = len(target_path.split(","))
-
-            if current_depth < target_depth:
-                # Foreach split
-                # (Current task, "A:10,B:13") and (Child task, "A:10,B:13,C:21")
-                # Pattern: "A:10,B:13,.*"
-                pattern = f"{current_path},.*"
+            if not steps:
+                return  # No child steps, yield nothing
+
+            if not current_path:
+                # Current task is not part of a foreach
+                # Pattern: ".*"
+                pattern = ".*"
             else:
-                # Foreach join or linear step
-                # Option 1:
-                # (Current task, "A:10,B:13,C:21") and (Child task, "A:10,B:13")
-                # Option 2:
-                # (Current task, "A:10,B:13") and (Child task, "A:10,B:13")
-                # Pattern: "A:10,B:13"
-                pattern = ",".join(current_path.split(",")[:target_depth])
-
-        yield from self._iter_matching_tasks(steps, "foreach-execution-path", pattern)
+                current_depth = len(current_path.split(","))
+                if node_type == "split-foreach" or node_type == "split-parallel":
+                    # Foreach split
+                    # (Current task, "A:10,B:13") and (Child task, "A:10,B:13,C:21")
+                    # Pattern: "A:10,B:13,.*"
+                    pattern = f"{current_path},.*"
+                else:
+                    # Foreach join or linear step
+                    # Pattern: "A:10,B:13"
+                    child_step_type = graph_info["steps"][steps[0]]["type"]
+
+                    # We need to know if the child step is a foreach join or a static join
+                    child_step_prev_steps = self._get_previous_steps(
+                        graph_info, steps[0]
+                    )
+                    if len(child_step_prev_steps) > 1:
+                        child_step_type = "static-join"
+                    target_depth = current_depth
+                    if child_step_type == "join" and current_depth == 1:
+                        # (Current task, "A:10") and (Child task, "")
+                        pattern = ".*"
+                    else:
+                        # (Current task, "A:10,B:13,C:21") and (Child task, "A:10,B:13")
+                        # (Current task, "A:10,B:13") and (Child task, "A:10,B:13")
+                        if child_step_type == "join":
+                            target_depth = current_depth - 1
+                        pattern = ",".join(current_path.split(",")[:target_depth])
+
+        for pathspec in self._get_matching_pathspecs(steps, metadata_key, pattern):
+            yield pathspec
+
+    @property
+    def parent_tasks(self) -> Iterator["Task"]:
+        """
+        Yields all parent tasks of the current task if one exists.
+
+        Yields
+        ------
+        Task
+            Parent task of the current task
+        """
+        parent_task_pathspecs = self.parent_task_pathspecs
+        for pathspec in parent_task_pathspecs:
+            yield Task(pathspec=pathspec, _namespace_check=False)
+
+    @property
+    def child_tasks(self) -> Iterator["Task"]:
+        """
+        Yields all child tasks of the current task if one exists.
+
+        Yields
+        ------
+        Task
+            Child task of the current task
+        """
+        for pathspec in self.child_task_pathspecs:
+            yield Task(pathspec=pathspec, _namespace_check=False)
 
     @property
     def metadata(self) -> List[Metadata]:
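The new pathspec-based properties walk the run graph using the `_graph_info` artifact and the `foreach-execution-path` metadata, yielding plain pathspec strings so callers only pay for constructing Task objects when they need them; `inspect_spin` simply points the client's metadata provider at a local spin datastore. A minimal usage sketch (the flow, run, step, and task ids are hypothetical, and the import path for `inspect_spin` is an assumption):

from metaflow import Task, namespace

namespace(None)  # ad-hoc inspection across namespaces
task = Task("HelloFlow/12/join_step/456")  # hypothetical pathspec

# Cheap lineage inspection: pathspec strings only
for pathspec in task.parent_task_pathspecs:
    print("parent:", pathspec)

# Full Task objects when artifacts or metadata are needed
for child in task.child_tasks:
    print(child.pathspec, child.finished_at)

# To browse locally stored spin steps/tasks/artifacts, point the client at the
# spin datastore first (inspect_spin wraps metadata(f"spin@{datastore_root}")):
# from metaflow import inspect_spin  # assumed export; see the __init__.py changes
# inspect_spin(datastore_root=".")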
metaflow/client/filecache.py CHANGED
@@ -1,5 +1,6 @@
 from __future__ import print_function
 from collections import OrderedDict
+import json
 import os
 import sys
 import time
@@ -10,13 +11,14 @@ from urllib.parse import urlparse
 
 from metaflow.datastore import FlowDataStore
 from metaflow.datastore.content_addressed_store import BlobCache
+from metaflow.datastore.flow_datastore import MetadataCache
 from metaflow.exception import MetaflowException
 from metaflow.metaflow_config import (
     CLIENT_CACHE_PATH,
     CLIENT_CACHE_MAX_SIZE,
     CLIENT_CACHE_MAX_FLOWDATASTORE_COUNT,
-    CLIENT_CACHE_MAX_TASKDATASTORE_COUNT,
 )
+from metaflow.metaflow_profile import from_start
 
 from metaflow.plugins import DATASTORES
 
@@ -63,8 +65,8 @@ class FileCache(object):
         # when querying for sizes of artifacts. Once we have queried for the size
         # of one artifact in a TaskDatastore, caching this means that any
         # queries on that same TaskDatastore will be quick (since we already
-        # have all the metadata)
-        self._task_metadata_caches = OrderedDict()
+        # have all the metadata). We keep track of this in a file so it persists
+        # across processes.
 
     @property
     def cache_dir(self):
@@ -87,7 +89,7 @@ class FileCache(object):
     ):
         ds_cls = self._get_datastore_storage_impl(ds_type)
         ds_root = ds_cls.path_join(*ds_cls.path_split(location)[:-5])
-        cache_id = self._flow_ds_id(ds_type, ds_root, flow_name)
+        cache_id = self.flow_ds_id(ds_type, ds_root, flow_name)
 
         token = (
             "%s.cached"
@@ -311,13 +313,13 @@ class FileCache(object):
         self._objects = sorted(objects, reverse=False)
 
     @staticmethod
-    def _flow_ds_id(ds_type, ds_root, flow_name):
+    def flow_ds_id(ds_type, ds_root, flow_name):
         p = urlparse(ds_root)
         sanitized_root = (p.netloc + p.path).replace("/", "_")
         return ".".join([ds_type, sanitized_root, flow_name])
 
     @staticmethod
-    def _task_ds_id(ds_type, ds_root, flow_name, run_id, step_name, task_id, attempt):
+    def task_ds_id(ds_type, ds_root, flow_name, run_id, step_name, task_id, attempt):
         p = urlparse(ds_root)
         sanitized_root = (p.netloc + p.path).replace("/", "_")
         return ".".join(
@@ -365,7 +367,7 @@ class FileCache(object):
         return storage_impl[0]
 
     def _get_flow_datastore(self, ds_type, ds_root, flow_name):
-        cache_id = self._flow_ds_id(ds_type, ds_root, flow_name)
+        cache_id = self.flow_ds_id(ds_type, ds_root, flow_name)
         cached_flow_datastore = self._store_caches.get(cache_id)
 
         if cached_flow_datastore:
@@ -380,9 +382,14 @@ class FileCache(object):
             ds_root=ds_root,
         )
         blob_cache = self._blob_caches.setdefault(
-            cache_id, FileBlobCache(self, cache_id)
+            cache_id,
+            (
+                FileBlobCache(self, cache_id),
+                TaskMetadataCache(self, ds_type, ds_root, flow_name),
+            ),
         )
-        cached_flow_datastore.ca_store.set_blob_cache(blob_cache)
+        cached_flow_datastore.ca_store.set_blob_cache(blob_cache[0])
+        cached_flow_datastore.set_metadata_cache(blob_cache[1])
         self._store_caches[cache_id] = cached_flow_datastore
         if len(self._store_caches) > CLIENT_CACHE_MAX_FLOWDATASTORE_COUNT:
             cache_id_to_remove, _ = self._store_caches.popitem(last=False)
@@ -393,32 +400,52 @@
         self, ds_type, ds_root, flow_name, run_id, step_name, task_id, attempt
     ):
         flow_ds = self._get_flow_datastore(ds_type, ds_root, flow_name)
-        cached_metadata = None
-        if attempt is not None:
-            cache_id = self._task_ds_id(
-                ds_type, ds_root, flow_name, run_id, step_name, task_id, attempt
+
+        return flow_ds.get_task_datastore(run_id, step_name, task_id, attempt=attempt)
+
+
+class TaskMetadataCache(MetadataCache):
+    def __init__(self, filecache, ds_type, ds_root, flow_name):
+        self._filecache = filecache
+        self._ds_type = ds_type
+        self._ds_root = ds_root
+        self._flow_name = flow_name
+
+    def _path(self, run_id, step_name, task_id, attempt):
+        if attempt is None:
+            raise MetaflowException(
+                "Attempt number must be specified to use task metadata cache. Raise an issue "
+                "on Metaflow GitHub if you see this message.",
             )
-            cached_metadata = self._task_metadata_caches.get(cache_id)
-            if cached_metadata:
-                od_move_to_end(self._task_metadata_caches, cache_id)
-                return flow_ds.get_task_datastore(
-                    run_id,
-                    step_name,
-                    task_id,
-                    attempt=attempt,
-                    data_metadata=cached_metadata,
-                )
-        # If we are here, we either have attempt=None or nothing in the cache
-        task_ds = flow_ds.get_task_datastore(
-            run_id, step_name, task_id, attempt=attempt
+        cache_id = self._filecache.task_ds_id(
+            self._ds_type,
+            self._ds_root,
+            self._flow_name,
+            run_id,
+            step_name,
+            task_id,
+            attempt,
        )
-        cache_id = self._task_ds_id(
-            ds_type, ds_root, flow_name, run_id, step_name, task_id, task_ds.attempt
+        token = (
+            "%s.cached"
+            % sha1(
+                os.path.join(
+                    run_id, step_name, task_id, str(attempt), "metadata"
+                ).encode("utf-8")
+            ).hexdigest()
+        )
+        return os.path.join(self._filecache.cache_dir, cache_id, token[:2], token)
+
+    def load_metadata(self, run_id, step_name, task_id, attempt):
+        d = self._filecache.read_file(self._path(run_id, step_name, task_id, attempt))
+        if d:
+            return json.loads(d)
+
+    def store_metadata(self, run_id, step_name, task_id, attempt, metadata_dict):
+        self._filecache.create_file(
+            self._path(run_id, step_name, task_id, attempt),
+            json.dumps(metadata_dict).encode("utf-8"),
         )
-        self._task_metadata_caches[cache_id] = task_ds.ds_metadata
-        if len(self._task_metadata_caches) > CLIENT_CACHE_MAX_TASKDATASTORE_COUNT:
-            self._task_metadata_caches.popitem(last=False)
-        return task_ds
 
 
 class FileBlobCache(BlobCache):
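The per-task metadata cache now lives on disk (one JSON file per task attempt under the client cache directory) instead of an in-process OrderedDict, so it survives across client processes. The contract comes from the new MetadataCache base class in metaflow.datastore.flow_datastore: implement load_metadata and store_metadata and attach the object with set_metadata_cache. A minimal in-memory sketch of that interface, for illustration only:

from metaflow.datastore.flow_datastore import MetadataCache


class InMemoryMetadataCache(MetadataCache):
    # Illustrative only: keeps task metadata in a dict instead of cache files.

    def __init__(self):
        self._cache = {}

    def load_metadata(self, run_id, step_name, task_id, attempt):
        # Return None on a miss, like TaskMetadataCache.load_metadata above
        return self._cache.get((run_id, step_name, task_id, attempt))

    def store_metadata(self, run_id, step_name, task_id, attempt, metadata_dict):
        self._cache[(run_id, step_name, task_id, attempt)] = metadata_dict


# Wired up the same way FileCache attaches TaskMetadataCache:
# flow_datastore.set_metadata_cache(InMemoryMetadataCache())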
metaflow/cmd/code/__init__.py CHANGED
@@ -6,6 +6,7 @@ from tempfile import TemporaryDirectory
 from typing import Any, Callable, List, Mapping, Optional, cast
 
 from metaflow import Run
+from metaflow.util import walk_without_cycles
 from metaflow._vendor import click
 from metaflow.cli import echo_always
 
@@ -51,7 +52,7 @@ def perform_diff(
         target_dir = os.getcwd()
 
     diffs = []
-    for dirpath, dirnames, filenames in os.walk(source_dir, followlinks=True):
+    for dirpath, _, filenames in walk_without_cycles(source_dir):
         for fname in filenames:
             # NOTE: the paths below need to be set up carefully
             # for the `patch` command to work. Better not to touch
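Replacing os.walk(source_dir, followlinks=True) with walk_without_cycles (added to metaflow/util.py in this release) avoids walking forever when a symlink points back into an ancestor directory. The shipped helper lives in metaflow.util; a rough sketch of the idea, not the actual implementation, is to skip any directory whose resolved path has already been visited:

import os


def walk_skipping_cycles(top):
    # Illustrative sketch: follow symlinks but never descend into a
    # directory whose real path was already seen.
    seen = set()
    for dirpath, dirnames, filenames in os.walk(top, followlinks=True):
        real = os.path.realpath(dirpath)
        if real in seen:
            dirnames[:] = []  # prune: do not descend again
            continue
        seen.add(real)
        yield dirpath, dirnames, filenames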
metaflow/datastore/__init__.py CHANGED
@@ -2,3 +2,4 @@ from .inputs import Inputs
 from .flow_datastore import FlowDataStore
 from .datastore_set import TaskDataStoreSet
 from .task_datastore import TaskDataStore
+from .spin_datastore import SpinTaskDatastore
metaflow/datastore/content_addressed_store.py CHANGED
@@ -38,7 +38,7 @@ class ContentAddressedStore(object):
     def set_blob_cache(self, blob_cache):
         self._blob_cache = blob_cache
 
-    def save_blobs(self, blob_iter, raw=False, len_hint=0):
+    def save_blobs(self, blob_iter, raw=False, len_hint=0, is_transfer=False):
         """
         Saves blobs of data to the datastore
 
@@ -60,11 +60,16 @@
 
         Parameters
         ----------
-        blob_iter : Iterator over bytes objects to save
-        raw : bool, optional
+        blob_iter : Iterator
+            Iterator over bytes objects to save
+        raw : bool, default False
             Whether to save the bytes directly or process them, by default False
-        len_hint : Hint of the number of blobs that will be produced by the
+        len_hint : int, default 0
+            Hint of the number of blobs that will be produced by the
             iterator, by default 0
+        is_transfer : bool, default False
+            If True, this indicates we are saving blobs directly from the output of another
+            content addressed store's
 
         Returns
         -------
@@ -76,6 +81,20 @@
 
         def packing_iter():
             for blob in blob_iter:
+                if is_transfer:
+                    key, blob_data, meta = blob
+                    path = self._storage_impl.path_join(self._prefix, key[:2], key)
+                    # Transfer data is always raw/decompressed, so mark it as such
+                    meta_corrected = {"cas_raw": True, "cas_version": 1}
+
+                    results.append(
+                        self.save_blobs_result(
+                            uri=self._storage_impl.full_uri(path),
+                            key=key,
+                        )
+                    )
+                    yield path, (BytesIO(blob_data), meta_corrected)
+                    continue
                 sha = sha1(blob).hexdigest()
                 path = self._storage_impl.path_join(self._prefix, sha[:2], sha)
                 results.append(
@@ -100,7 +119,7 @@ class ContentAddressedStore(object):
         self._storage_impl.save_bytes(packing_iter(), overwrite=True, len_hint=len_hint)
         return results
 
-    def load_blobs(self, keys, force_raw=False):
+    def load_blobs(self, keys, force_raw=False, is_transfer=False):
         """
         Mirror function of save_blobs
 
@@ -111,15 +130,20 @@
         ----------
         keys : List of string
             Key describing the object to load
-        force_raw : bool, optional
+        force_raw : bool, default False
             Support for backward compatibility with previous datastores. If
             True, this will force the key to be loaded as is (raw). By default,
             False
+        is_transfer : bool, default False
+            If True, this indicates we are loading blobs to transfer them directly
+            to another datastore. We will, in this case, also transfer the metadata
+            and do minimal processing. This is for internal use only.
 
         Returns
         -------
         Returns an iterator of (string, bytes) tuples; the iterator may return keys
-        in a different order than were passed in.
+        in a different order than were passed in. If is_transfer is True, the tuple
+        has three elements with the third one being the metadata.
         """
         load_paths = []
         for key in keys:
@@ -127,7 +151,11 @@
             if self._blob_cache:
                 blob = self._blob_cache.load_key(key)
                 if blob is not None:
-                    yield key, blob
+                    if is_transfer:
+                        # Cached blobs are decompressed/processed bytes regardless of original format
+                        yield key, blob, {"cas_raw": False, "cas_version": 1}
+                    else:
+                        yield key, blob
                 else:
                     path = self._storage_impl.path_join(self._prefix, key[:2], key)
                     load_paths.append((key, path))
@@ -169,7 +197,10 @@
             if self._blob_cache:
                 self._blob_cache.store_key(key, blob)
 
-            yield key, blob
+            if is_transfer:
+                yield key, blob, meta  # Preserve exact original metadata from storage
+            else:
+                yield key, blob
 
     def _unpack_backward_compatible(self, blob):
         # This is the backward compatible unpack
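With is_transfer=True, load_blobs yields (key, bytes, metadata) triples instead of (key, bytes) pairs, and save_blobs accepts those triples directly, writing them under the same content-addressed key without re-hashing or re-packing the payload. A hedged sketch of the round trip between two stores (the helper name is hypothetical, and both stores are assumed to be ContentAddressedStore instances obtained elsewhere):

def transfer_blobs(source_store, dest_store, keys):
    # load_blobs(..., is_transfer=True) yields (key, blob_bytes, metadata) triples
    triples = source_store.load_blobs(keys, is_transfer=True)
    # save_blobs(..., is_transfer=True) stores each blob under its original key,
    # reusing the supplied metadata instead of hashing and packing the payload
    return dest_store.save_blobs(triples, is_transfer=True, len_hint=len(keys))

# Each result is a save_blobs_result with `key` and `uri`, e.g.:
# for res in transfer_blobs(src.ca_store, dst.ca_store, keys):
#     print(res.key, res.uri)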
metaflow/datastore/datastore_set.py CHANGED
@@ -21,9 +21,18 @@ class TaskDataStoreSet(object):
         pathspecs=None,
         prefetch_data_artifacts=None,
         allow_not_done=False,
+        join_type=None,
+        orig_flow_datastore=None,
+        spin_artifacts=None,
     ):
         self.task_datastores = flow_datastore.get_task_datastores(
-            run_id, steps=steps, pathspecs=pathspecs, allow_not_done=allow_not_done
+            run_id,
+            steps=steps,
+            pathspecs=pathspecs,
+            allow_not_done=allow_not_done,
+            join_type=join_type,
+            orig_flow_datastore=orig_flow_datastore,
+            spin_artifacts=spin_artifacts,
         )
 
         if prefetch_data_artifacts: