ob-metaflow 2.15.13.1__py2.py3-none-any.whl → 2.19.7.1rc0__py2.py3-none-any.whl

This diff shows the changes between two publicly released versions of the package, as they appear in their public registry. It is provided for informational purposes only.
Files changed (169)
  1. metaflow/__init__.py +10 -3
  2. metaflow/_vendor/imghdr/__init__.py +186 -0
  3. metaflow/_vendor/yaml/__init__.py +427 -0
  4. metaflow/_vendor/yaml/composer.py +139 -0
  5. metaflow/_vendor/yaml/constructor.py +748 -0
  6. metaflow/_vendor/yaml/cyaml.py +101 -0
  7. metaflow/_vendor/yaml/dumper.py +62 -0
  8. metaflow/_vendor/yaml/emitter.py +1137 -0
  9. metaflow/_vendor/yaml/error.py +75 -0
  10. metaflow/_vendor/yaml/events.py +86 -0
  11. metaflow/_vendor/yaml/loader.py +63 -0
  12. metaflow/_vendor/yaml/nodes.py +49 -0
  13. metaflow/_vendor/yaml/parser.py +589 -0
  14. metaflow/_vendor/yaml/reader.py +185 -0
  15. metaflow/_vendor/yaml/representer.py +389 -0
  16. metaflow/_vendor/yaml/resolver.py +227 -0
  17. metaflow/_vendor/yaml/scanner.py +1435 -0
  18. metaflow/_vendor/yaml/serializer.py +111 -0
  19. metaflow/_vendor/yaml/tokens.py +104 -0
  20. metaflow/cards.py +4 -0
  21. metaflow/cli.py +125 -21
  22. metaflow/cli_components/init_cmd.py +1 -0
  23. metaflow/cli_components/run_cmds.py +204 -40
  24. metaflow/cli_components/step_cmd.py +160 -4
  25. metaflow/client/__init__.py +1 -0
  26. metaflow/client/core.py +198 -130
  27. metaflow/client/filecache.py +59 -32
  28. metaflow/cmd/code/__init__.py +2 -1
  29. metaflow/cmd/develop/stub_generator.py +49 -18
  30. metaflow/cmd/develop/stubs.py +9 -27
  31. metaflow/cmd/make_wrapper.py +30 -0
  32. metaflow/datastore/__init__.py +1 -0
  33. metaflow/datastore/content_addressed_store.py +40 -9
  34. metaflow/datastore/datastore_set.py +10 -1
  35. metaflow/datastore/flow_datastore.py +124 -4
  36. metaflow/datastore/spin_datastore.py +91 -0
  37. metaflow/datastore/task_datastore.py +92 -6
  38. metaflow/debug.py +5 -0
  39. metaflow/decorators.py +331 -82
  40. metaflow/extension_support/__init__.py +414 -356
  41. metaflow/extension_support/_empty_file.py +2 -2
  42. metaflow/flowspec.py +322 -82
  43. metaflow/graph.py +178 -15
  44. metaflow/includefile.py +25 -3
  45. metaflow/lint.py +94 -3
  46. metaflow/meta_files.py +13 -0
  47. metaflow/metadata_provider/metadata.py +13 -2
  48. metaflow/metaflow_config.py +66 -4
  49. metaflow/metaflow_environment.py +91 -25
  50. metaflow/metaflow_profile.py +18 -0
  51. metaflow/metaflow_version.py +16 -1
  52. metaflow/package/__init__.py +673 -0
  53. metaflow/packaging_sys/__init__.py +880 -0
  54. metaflow/packaging_sys/backend.py +128 -0
  55. metaflow/packaging_sys/distribution_support.py +153 -0
  56. metaflow/packaging_sys/tar_backend.py +99 -0
  57. metaflow/packaging_sys/utils.py +54 -0
  58. metaflow/packaging_sys/v1.py +527 -0
  59. metaflow/parameters.py +6 -2
  60. metaflow/plugins/__init__.py +6 -0
  61. metaflow/plugins/airflow/airflow.py +11 -1
  62. metaflow/plugins/airflow/airflow_cli.py +16 -5
  63. metaflow/plugins/argo/argo_client.py +42 -20
  64. metaflow/plugins/argo/argo_events.py +6 -6
  65. metaflow/plugins/argo/argo_workflows.py +1023 -344
  66. metaflow/plugins/argo/argo_workflows_cli.py +396 -94
  67. metaflow/plugins/argo/argo_workflows_decorator.py +9 -0
  68. metaflow/plugins/argo/argo_workflows_deployer_objects.py +75 -49
  69. metaflow/plugins/argo/capture_error.py +5 -2
  70. metaflow/plugins/argo/conditional_input_paths.py +35 -0
  71. metaflow/plugins/argo/exit_hooks.py +209 -0
  72. metaflow/plugins/argo/param_val.py +19 -0
  73. metaflow/plugins/aws/aws_client.py +6 -0
  74. metaflow/plugins/aws/aws_utils.py +33 -1
  75. metaflow/plugins/aws/batch/batch.py +72 -5
  76. metaflow/plugins/aws/batch/batch_cli.py +24 -3
  77. metaflow/plugins/aws/batch/batch_decorator.py +57 -6
  78. metaflow/plugins/aws/step_functions/step_functions.py +28 -3
  79. metaflow/plugins/aws/step_functions/step_functions_cli.py +49 -4
  80. metaflow/plugins/aws/step_functions/step_functions_deployer.py +3 -0
  81. metaflow/plugins/aws/step_functions/step_functions_deployer_objects.py +30 -0
  82. metaflow/plugins/cards/card_cli.py +20 -1
  83. metaflow/plugins/cards/card_creator.py +24 -1
  84. metaflow/plugins/cards/card_datastore.py +21 -49
  85. metaflow/plugins/cards/card_decorator.py +58 -6
  86. metaflow/plugins/cards/card_modules/basic.py +38 -9
  87. metaflow/plugins/cards/card_modules/bundle.css +1 -1
  88. metaflow/plugins/cards/card_modules/chevron/renderer.py +1 -1
  89. metaflow/plugins/cards/card_modules/components.py +592 -3
  90. metaflow/plugins/cards/card_modules/convert_to_native_type.py +34 -5
  91. metaflow/plugins/cards/card_modules/json_viewer.py +232 -0
  92. metaflow/plugins/cards/card_modules/main.css +1 -0
  93. metaflow/plugins/cards/card_modules/main.js +56 -41
  94. metaflow/plugins/cards/card_modules/test_cards.py +22 -6
  95. metaflow/plugins/cards/component_serializer.py +1 -8
  96. metaflow/plugins/cards/metadata.py +22 -0
  97. metaflow/plugins/catch_decorator.py +9 -0
  98. metaflow/plugins/datastores/local_storage.py +12 -6
  99. metaflow/plugins/datastores/spin_storage.py +12 -0
  100. metaflow/plugins/datatools/s3/s3.py +49 -17
  101. metaflow/plugins/datatools/s3/s3op.py +113 -66
  102. metaflow/plugins/env_escape/client_modules.py +102 -72
  103. metaflow/plugins/events_decorator.py +127 -121
  104. metaflow/plugins/exit_hook/__init__.py +0 -0
  105. metaflow/plugins/exit_hook/exit_hook_decorator.py +46 -0
  106. metaflow/plugins/exit_hook/exit_hook_script.py +52 -0
  107. metaflow/plugins/kubernetes/kubernetes.py +12 -1
  108. metaflow/plugins/kubernetes/kubernetes_cli.py +11 -0
  109. metaflow/plugins/kubernetes/kubernetes_decorator.py +25 -6
  110. metaflow/plugins/kubernetes/kubernetes_job.py +12 -4
  111. metaflow/plugins/kubernetes/kubernetes_jobsets.py +31 -30
  112. metaflow/plugins/metadata_providers/local.py +76 -82
  113. metaflow/plugins/metadata_providers/service.py +13 -9
  114. metaflow/plugins/metadata_providers/spin.py +16 -0
  115. metaflow/plugins/package_cli.py +36 -24
  116. metaflow/plugins/parallel_decorator.py +11 -2
  117. metaflow/plugins/parsers.py +16 -0
  118. metaflow/plugins/pypi/bootstrap.py +7 -1
  119. metaflow/plugins/pypi/conda_decorator.py +41 -82
  120. metaflow/plugins/pypi/conda_environment.py +14 -6
  121. metaflow/plugins/pypi/micromamba.py +9 -1
  122. metaflow/plugins/pypi/pip.py +41 -5
  123. metaflow/plugins/pypi/pypi_decorator.py +4 -4
  124. metaflow/plugins/pypi/utils.py +22 -0
  125. metaflow/plugins/secrets/__init__.py +3 -0
  126. metaflow/plugins/secrets/secrets_decorator.py +14 -178
  127. metaflow/plugins/secrets/secrets_func.py +49 -0
  128. metaflow/plugins/secrets/secrets_spec.py +101 -0
  129. metaflow/plugins/secrets/utils.py +74 -0
  130. metaflow/plugins/test_unbounded_foreach_decorator.py +2 -2
  131. metaflow/plugins/timeout_decorator.py +0 -1
  132. metaflow/plugins/uv/bootstrap.py +29 -1
  133. metaflow/plugins/uv/uv_environment.py +5 -3
  134. metaflow/pylint_wrapper.py +5 -1
  135. metaflow/runner/click_api.py +79 -26
  136. metaflow/runner/deployer.py +208 -6
  137. metaflow/runner/deployer_impl.py +32 -12
  138. metaflow/runner/metaflow_runner.py +266 -33
  139. metaflow/runner/subprocess_manager.py +21 -1
  140. metaflow/runner/utils.py +27 -16
  141. metaflow/runtime.py +660 -66
  142. metaflow/task.py +255 -26
  143. metaflow/user_configs/config_options.py +33 -21
  144. metaflow/user_configs/config_parameters.py +220 -58
  145. metaflow/user_decorators/__init__.py +0 -0
  146. metaflow/user_decorators/common.py +144 -0
  147. metaflow/user_decorators/mutable_flow.py +512 -0
  148. metaflow/user_decorators/mutable_step.py +424 -0
  149. metaflow/user_decorators/user_flow_decorator.py +264 -0
  150. metaflow/user_decorators/user_step_decorator.py +749 -0
  151. metaflow/util.py +197 -7
  152. metaflow/vendor.py +23 -7
  153. metaflow/version.py +1 -1
  154. {ob_metaflow-2.15.13.1.data → ob_metaflow-2.19.7.1rc0.data}/data/share/metaflow/devtools/Makefile +13 -2
  155. {ob_metaflow-2.15.13.1.data → ob_metaflow-2.19.7.1rc0.data}/data/share/metaflow/devtools/Tiltfile +107 -7
  156. {ob_metaflow-2.15.13.1.data → ob_metaflow-2.19.7.1rc0.data}/data/share/metaflow/devtools/pick_services.sh +1 -0
  157. {ob_metaflow-2.15.13.1.dist-info → ob_metaflow-2.19.7.1rc0.dist-info}/METADATA +2 -3
  158. {ob_metaflow-2.15.13.1.dist-info → ob_metaflow-2.19.7.1rc0.dist-info}/RECORD +162 -121
  159. {ob_metaflow-2.15.13.1.dist-info → ob_metaflow-2.19.7.1rc0.dist-info}/WHEEL +1 -1
  160. metaflow/_vendor/v3_5/__init__.py +0 -1
  161. metaflow/_vendor/v3_5/importlib_metadata/__init__.py +0 -644
  162. metaflow/_vendor/v3_5/importlib_metadata/_compat.py +0 -152
  163. metaflow/_vendor/v3_5/zipp.py +0 -329
  164. metaflow/info_file.py +0 -25
  165. metaflow/package.py +0 -203
  166. metaflow/user_configs/config_decorators.py +0 -568
  167. {ob_metaflow-2.15.13.1.dist-info → ob_metaflow-2.19.7.1rc0.dist-info}/entry_points.txt +0 -0
  168. {ob_metaflow-2.15.13.1.dist-info → ob_metaflow-2.19.7.1rc0.dist-info}/licenses/LICENSE +0 -0
  169. {ob_metaflow-2.15.13.1.dist-info → ob_metaflow-2.19.7.1rc0.dist-info}/top_level.txt +0 -0
metaflow/client/core.py CHANGED
@@ -32,11 +32,12 @@ from metaflow.exception import (
  from metaflow.includefile import IncludedFile
  from metaflow.metaflow_config import DEFAULT_METADATA, MAX_ATTEMPTS
  from metaflow.metaflow_environment import MetaflowEnvironment
+ from metaflow.package import MetaflowPackage
+ from metaflow.packaging_sys import ContentType
  from metaflow.plugins import ENVIRONMENTS, METADATA_PROVIDERS
  from metaflow.unbounded_foreach import CONTROL_TASK_TAG
  from metaflow.util import cached_property, is_stringish, resolve_identity, to_unicode

- from ..info_file import INFO_FILE
  from .filecache import FileCache

  if TYPE_CHECKING:
@@ -206,6 +207,20 @@ def default_namespace() -> str:
      return get_namespace()


+ def inspect_spin(datastore_root: str = "."):
+     """
+     Set metadata provider to spin metadata so that users can inspect spin
+     steps, tasks, and artifacts.
+
+     Parameters
+     ----------
+     datastore_root : str, default "."
+         The root path to the spin datastore.
+     """
+     metadata_str = f"spin@{datastore_root}"
+     metadata(metadata_str)
+
+
  MetaflowArtifacts = NamedTuple

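For orientation, a minimal usage sketch of the new helper. This is hedged: the pathspec is made up, and the top-level re-export is an assumption (the `metaflow/client/__init__.py +1 -0` entry above suggests one, but only `metaflow.client.core` is confirmed by this diff):

    # Hypothetical usage: point the client at a local spin datastore, then
    # browse spin tasks with the regular client API.
    from metaflow import Task
    from metaflow.client.core import inspect_spin

    inspect_spin(datastore_root=".")        # equivalent to metadata("spin@.")
    task = Task("MyFlow/spin-123/start/1")  # illustrative pathspec
    print(task.artifacts)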
@@ -276,6 +291,7 @@ class MetaflowObject(object):
          self._attempt = attempt
          self._current_namespace = _current_namespace or get_namespace()
          self._namespace_check = _namespace_check
+
          # If the current namespace is False, we disable checking for namespace for this
          # and all children objects. Not setting namespace_check to False has the consequence
          # of preventing access to children objects after the namespace changes
@@ -302,7 +318,7 @@
          # distinguish between "attempt will happen" and "no such
          # attempt exists".

-         if pathspec:
+         if pathspec and _object is None:
              ids = pathspec.split("/")

              if self._NAME == "flow" and len(ids) != 1:
@@ -816,20 +832,29 @@ class MetaflowCode(object):
          self._path = info["location"]
          self._ds_type = info["ds_type"]
          self._sha = info["sha"]
+         self._code_metadata = info.get(
+             "metadata",
+             '{"version": 0, "archive_format": "tgz", "mfcontent_version": 0}',
+         )
+
+         self._backend = MetaflowPackage.get_backend(self._code_metadata)

          if filecache is None:
              filecache = FileCache()
          _, blobdata = filecache.get_data(
              self._ds_type, self._flow_name, self._path, self._sha
          )
-         code_obj = BytesIO(blobdata)
-         self._tar = tarfile.open(fileobj=code_obj, mode="r:gz")
-         # The JSON module in Python3 deals with Unicode. Tar gives bytes.
-         info_str = (
-             self._tar.extractfile(os.path.basename(INFO_FILE)).read().decode("utf-8")
-         )
-         self._info = json.loads(info_str)
-         self._flowspec = self._tar.extractfile(self._info["script"]).read()
+         self._code_obj = BytesIO(blobdata)
+         self._info = MetaflowPackage.cls_get_info(self._code_metadata, self._code_obj)
+         self._code_obj.seek(0)
+         if self._info:
+             self._flowspec = MetaflowPackage.cls_get_content(
+                 self._code_metadata, self._code_obj, self._info["script"]
+             )
+             self._code_obj.seek(0)
+         else:
+             raise MetaflowInternalError("Code package metadata is invalid.")
+         self._tarball = None

      @property
      def path(self) -> str:
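As a side note, the default code-package metadata above is a plain JSON string; a toy decode (not Metaflow code) shows the fields that backend selection presumably keys off:

    import json

    meta = json.loads('{"version": 0, "archive_format": "tgz", "mfcontent_version": 0}')
    assert meta["archive_format"] == "tgz"  # the format get_backend() appears to dispatch on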
@@ -877,7 +902,14 @@ class MetaflowCode(object):
          TarFile
              TarFile for everything in this code package
          """
-         return self._tar
+         # We only return one tarball because the different TarFile objects share
+         # a common bytes buffer (self._code_obj).
+         if self._tarball is not None:
+             return self._tarball
+         if self._backend.type == "tgz":
+             self._tarball = self._backend.cls_open(self._code_obj)
+             return self._tarball
+         raise RuntimeError("Archive is not a tarball")

      def extract(self) -> TemporaryDirectory:
          """
@@ -908,27 +940,15 @@
          The directory and its contents are automatically deleted when
          this object is garbage collected.
          """
-         exclusions = [
-             "metaflow/",
-             "metaflow_extensions/",
-             "INFO",
-             "CONFIG_PARAMETERS",
-             "conda.manifest",
-             # This file is created when using the conda/pypi features available in
-             # nflx-metaflow-extensions: https://github.com/Netflix/metaflow-nflx-extensions
-             "condav2-1.cnd",
-         ]
-         members = [
-             m
-             for m in self.tarball.getmembers()
-             if not any(
-                 (x.endswith("/") and m.name.startswith(x)) or (m.name == x)
-                 for x in exclusions
-             )
-         ]
-
          tmp = TemporaryDirectory()
-         self.tarball.extractall(tmp.name, members)
+         # We save the position we are in _code_obj -- in case tarball is using it at
+         # the same time -- so we can reset it to not perturb tarball.
+         pos = self._code_obj.tell()
+         self._code_obj.seek(0)
+         MetaflowPackage.cls_extract_into(
+             self._code_metadata, self._code_obj, tmp.name, ContentType.USER_CONTENT
+         )
+         self._code_obj.seek(pos)
          return tmp

      @property
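A hedged sketch of how the reworked accessors are consumed from the client side (the pathspec is illustrative, and `code` may be None for runs without a code package):

    # Illustrative use of MetaflowCode after this change; pathspec is made up.
    from metaflow import Task

    code = Task("MyFlow/123/start/1").code
    if code is not None:
        print(code.flowspec[:80])  # raw bytes of the flow script
        tmp = code.extract()       # TemporaryDirectory holding user content only
        print(tmp.name)            # deleted when `tmp` is garbage collected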
@@ -1184,149 +1204,197 @@ class Task(MetaflowObject):
      _PARENT_CLASS = "step"
      _CHILD_CLASS = "artifact"

-     def __init__(self, *args, **kwargs):
-         super(Task, self).__init__(*args, **kwargs)
-
      def _iter_filter(self, x):
          # exclude private data artifacts
          return x.id[0] != "_"

-     def _iter_matching_tasks(self, steps, metadata_key, metadata_pattern):
+     def _get_matching_pathspecs(self, steps, metadata_key, metadata_pattern):
          """
-         Yield tasks from specified steps matching a foreach path pattern.
+         Yield pathspecs of tasks from specified steps that match a given metadata pattern.

          Parameters
          ----------
          steps : List[str]
-             List of step names to search for tasks
-         pattern : str
-             Regex pattern to match foreach-indices metadata
+             List of Step objects to search for tasks.
+         metadata_key : str
+             Metadata key to filter tasks on (e.g., 'foreach-execution-path').
+         metadata_pattern : str
+             Regular expression pattern to match against the metadata value.

-         Returns
-         -------
-         Iterator[Task]
-             Tasks matching the foreach path pattern
+         Yields
+         ------
+         str
+             Pathspec of each task whose metadata value for the specified key matches the pattern.
          """
          flow_id, run_id, _, _ = self.path_components
-
          for step in steps:
              task_pathspecs = self._metaflow.metadata.filter_tasks_by_metadata(
-                 flow_id, run_id, step.id, metadata_key, metadata_pattern
+                 flow_id, run_id, step, metadata_key, metadata_pattern
              )
              for task_pathspec in task_pathspecs:
-                 yield Task(pathspec=task_pathspec, _namespace_check=False)
+                 yield task_pathspec
+
+     @staticmethod
+     def _get_previous_steps(graph_info, step_name):
+         # Get the parent steps
+         steps = []
+         for node_name, attributes in graph_info["steps"].items():
+             if step_name in attributes["next"]:
+                 steps.append(node_name)
+         return steps

      @property
-     def parent_tasks(self) -> Iterator["Task"]:
+     def parent_task_pathspecs(self) -> Iterator[str]:
          """
-         Yields all parent tasks of the current task if one exists.
+         Yields pathspecs of all parent tasks of the current task.

          Yields
          ------
-         Task
-             Parent task of the current task
-
+         str
+             Pathspec of the parent task of the current task
          """
-         flow_id, run_id, _, _ = self.path_components
+         _, _, step_name, _ = self.path_components
+         metadata_dict = self.metadata_dict
+         graph_info = self["_graph_info"].data

-         steps = list(self.parent.parent_steps)
-         if not steps:
-             return []
-
-         current_path = self.metadata_dict.get("foreach-execution-path", "")
+         # Get the parent steps
+         steps = self._get_previous_steps(graph_info, step_name)
+         node_type = graph_info["steps"][step_name]["type"]
+         metadata_key = "foreach-execution-path"
+         current_path = metadata_dict.get(metadata_key)

          if len(steps) > 1:
              # Static join - use exact path matching
              pattern = current_path or ".*"
-             yield from self._iter_matching_tasks(
-                 steps, "foreach-execution-path", pattern
-             )
-             return
-
-         # Handle single step case
-         target_task = Step(
-             f"{flow_id}/{run_id}/{steps[0].id}", _namespace_check=False
-         ).task
-         target_path = target_task.metadata_dict.get("foreach-execution-path")
-
-         if not target_path or not current_path:
-             # (Current task, "A:10") and (Parent task, "")
-             # Pattern: ".*"
-             pattern = ".*"
          else:
-             current_depth = len(current_path.split(","))
-             target_depth = len(target_path.split(","))
-
-             if current_depth < target_depth:
-                 # Foreach join
-                 # (Current task, "A:10,B:13") and (Parent task, "A:10,B:13,C:21")
-                 # Pattern: "A:10,B:13,.*"
-                 pattern = f"{current_path},.*"
-             else:
-                 # Foreach split or linear step
-                 # Option 1:
-                 # (Current task, "A:10,B:13,C:21") and (Parent task, "A:10,B:13")
-                 # Option 2:
-                 # (Current task, "A:10,B:13") and (Parent task, "A:10,B:13")
-                 # Pattern: "A:10,B:13"
-                 pattern = ",".join(current_path.split(",")[:target_depth])
+             if not steps:
+                 return  # No parent steps, yield nothing

-         yield from self._iter_matching_tasks(steps, "foreach-execution-path", pattern)
+             if not current_path:
+                 # Current task is not part of a foreach
+                 # Pattern: ".*"
+                 pattern = ".*"
+             else:
+                 current_depth = len(current_path.split(","))
+                 if node_type == "join":
+                     # Foreach join
+                     # (Current task, "A:10,B:13") and (Parent task, "A:10,B:13,C:21")
+                     # Pattern: "A:10,B:13,.*"
+                     pattern = f"{current_path},.*"
+                 else:
+                     # Foreach split or linear step
+                     # Pattern: "A:10,B:13"
+                     parent_step_type = graph_info["steps"][steps[0]]["type"]
+                     target_depth = current_depth
+                     if (
+                         parent_step_type == "split-foreach"
+                         or parent_step_type == "split-parallel"
+                     ) and current_depth == 1:
+                         # (Current task, "A:10") and (Parent task, "")
+                         pattern = ".*"
+                     else:
+                         # (Current task, "A:10,B:13,C:21") and (Parent task, "A:10,B:13")
+                         # (Current task, "A:10,B:13") and (Parent task, "A:10,B:13")
+                         if (
+                             parent_step_type == "split-foreach"
+                             or parent_step_type == "split-parallel"
+                         ):
+                             target_depth = current_depth - 1
+                         pattern = ",".join(current_path.split(",")[:target_depth])
+
+         for pathspec in self._get_matching_pathspecs(steps, metadata_key, pattern):
+             yield pathspec

      @property
-     def child_tasks(self) -> Iterator["Task"]:
+     def child_task_pathspecs(self) -> Iterator[str]:
          """
-         Yield all child tasks of the current task if one exists.
+         Yields pathspecs of all child tasks of the current task.

          Yields
          ------
-         Task
-             Child task of the current task
+         str
+             Pathspec of the child task of the current task
          """
-         flow_id, run_id, _, _ = self.path_components
-         steps = list(self.parent.child_steps)
-         if not steps:
-             return []
+         flow_id, run_id, step_name, _ = self.path_components
+         metadata_dict = self.metadata_dict
+         graph_info = self["_graph_info"].data

-         current_path = self.metadata_dict.get("foreach-execution-path", "")
+         # Get the child steps
+         steps = graph_info["steps"][step_name]["next"]
+
+         node_type = graph_info["steps"][step_name]["type"]
+         metadata_key = "foreach-execution-path"
+         current_path = metadata_dict.get(metadata_key)

          if len(steps) > 1:
              # Static split - use exact path matching
              pattern = current_path or ".*"
-             yield from self._iter_matching_tasks(
-                 steps, "foreach-execution-path", pattern
-             )
-             return
-
-         # Handle single step case
-         target_task = Step(
-             f"{flow_id}/{run_id}/{steps[0].id}", _namespace_check=False
-         ).task
-         target_path = target_task.metadata_dict.get("foreach-execution-path")
-
-         if not target_path or not current_path:
-             # (Current task, "A:10") and (Child task, "")
-             # Pattern: ".*"
-             pattern = ".*"
          else:
-             current_depth = len(current_path.split(","))
-             target_depth = len(target_path.split(","))
-
-             if current_depth < target_depth:
-                 # Foreach split
-                 # (Current task, "A:10,B:13") and (Child task, "A:10,B:13,C:21")
-                 # Pattern: "A:10,B:13,.*"
-                 pattern = f"{current_path},.*"
+             if not steps:
+                 return  # No child steps, yield nothing
+
+             if not current_path:
+                 # Current task is not part of a foreach
+                 # Pattern: ".*"
+                 pattern = ".*"
              else:
-                 # Foreach join or linear step
-                 # Option 1:
-                 # (Current task, "A:10,B:13,C:21") and (Child task, "A:10,B:13")
-                 # Option 2:
-                 # (Current task, "A:10,B:13") and (Child task, "A:10,B:13")
-                 # Pattern: "A:10,B:13"
-                 pattern = ",".join(current_path.split(",")[:target_depth])
-
-         yield from self._iter_matching_tasks(steps, "foreach-execution-path", pattern)
+                 current_depth = len(current_path.split(","))
+                 if node_type == "split-foreach" or node_type == "split-parallel":
+                     # Foreach split
+                     # (Current task, "A:10,B:13") and (Child task, "A:10,B:13,C:21")
+                     # Pattern: "A:10,B:13,.*"
+                     pattern = f"{current_path},.*"
+                 else:
+                     # Foreach join or linear step
+                     # Pattern: "A:10,B:13"
+                     child_step_type = graph_info["steps"][steps[0]]["type"]
+
+                     # We need to know if the child step is a foreach join or a static join
+                     child_step_prev_steps = self._get_previous_steps(
+                         graph_info, steps[0]
+                     )
+                     if len(child_step_prev_steps) > 1:
+                         child_step_type = "static-join"
+                     target_depth = current_depth
+                     if child_step_type == "join" and current_depth == 1:
+                         # (Current task, "A:10") and (Child task, "")
+                         pattern = ".*"
+                     else:
+                         # (Current task, "A:10,B:13,C:21") and (Child task, "A:10,B:13")
+                         # (Current task, "A:10,B:13") and (Child task, "A:10,B:13")
+                         if child_step_type == "join":
+                             target_depth = current_depth - 1
+                         pattern = ",".join(current_path.split(",")[:target_depth])
+
+         for pathspec in self._get_matching_pathspecs(steps, metadata_key, pattern):
+             yield pathspec
+
+     @property
+     def parent_tasks(self) -> Iterator["Task"]:
+         """
+         Yields all parent tasks of the current task if one exists.
+
+         Yields
+         ------
+         Task
+             Parent task of the current task
+         """
+         parent_task_pathspecs = self.parent_task_pathspecs
+         for pathspec in parent_task_pathspecs:
+             yield Task(pathspec=pathspec, _namespace_check=False)
+
+     @property
+     def child_tasks(self) -> Iterator["Task"]:
+         """
+         Yields all child tasks of the current task if one exists.
+
+         Yields
+         ------
+         Task
+             Child task of the current task
+         """
+         for pathspec in self.child_task_pathspecs:
+             yield Task(pathspec=pathspec, _namespace_check=False)

      @property
      def metadata(self) -> List[Metadata]:
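The depth arithmetic in the two properties above is easiest to see on a concrete value; a toy example, not Metaflow code:

    # A task one foreach level below a split finds its parent by truncating
    # its own execution path by one entry and using that as a regex prefix.
    current_path = "A:10,B:13,C:21"  # hypothetical foreach-execution-path value
    target_depth = len(current_path.split(",")) - 1
    pattern = ",".join(current_path.split(",")[:target_depth])
    assert pattern == "A:10,B:13"    # matches the parent's execution path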
metaflow/client/filecache.py CHANGED
@@ -1,5 +1,6 @@
  from __future__ import print_function
  from collections import OrderedDict
+ import json
  import os
  import sys
  import time
@@ -10,13 +11,14 @@ from urllib.parse import urlparse

  from metaflow.datastore import FlowDataStore
  from metaflow.datastore.content_addressed_store import BlobCache
+ from metaflow.datastore.flow_datastore import MetadataCache
  from metaflow.exception import MetaflowException
  from metaflow.metaflow_config import (
      CLIENT_CACHE_PATH,
      CLIENT_CACHE_MAX_SIZE,
      CLIENT_CACHE_MAX_FLOWDATASTORE_COUNT,
-     CLIENT_CACHE_MAX_TASKDATASTORE_COUNT,
  )
+ from metaflow.metaflow_profile import from_start

  from metaflow.plugins import DATASTORES

@@ -63,8 +65,8 @@ class FileCache(object):
          # when querying for sizes of artifacts. Once we have queried for the size
          # of one artifact in a TaskDatastore, caching this means that any
          # queries on that same TaskDatastore will be quick (since we already
-         # have all the metadata)
-         self._task_metadata_caches = OrderedDict()
+         # have all the metadata). We keep track of this in a file so it persists
+         # across processes.

      @property
      def cache_dir(self):
@@ -87,7 +89,7 @@
      ):
          ds_cls = self._get_datastore_storage_impl(ds_type)
          ds_root = ds_cls.path_join(*ds_cls.path_split(location)[:-5])
-         cache_id = self._flow_ds_id(ds_type, ds_root, flow_name)
+         cache_id = self.flow_ds_id(ds_type, ds_root, flow_name)

          token = (
              "%s.cached"
@@ -311,13 +313,13 @@
          self._objects = sorted(objects, reverse=False)

      @staticmethod
-     def _flow_ds_id(ds_type, ds_root, flow_name):
+     def flow_ds_id(ds_type, ds_root, flow_name):
          p = urlparse(ds_root)
          sanitized_root = (p.netloc + p.path).replace("/", "_")
          return ".".join([ds_type, sanitized_root, flow_name])

      @staticmethod
-     def _task_ds_id(ds_type, ds_root, flow_name, run_id, step_name, task_id, attempt):
+     def task_ds_id(ds_type, ds_root, flow_name, run_id, step_name, task_id, attempt):
          p = urlparse(ds_root)
          sanitized_root = (p.netloc + p.path).replace("/", "_")
          return ".".join(
@@ -365,7 +367,7 @@
          return storage_impl[0]

      def _get_flow_datastore(self, ds_type, ds_root, flow_name):
-         cache_id = self._flow_ds_id(ds_type, ds_root, flow_name)
+         cache_id = self.flow_ds_id(ds_type, ds_root, flow_name)
          cached_flow_datastore = self._store_caches.get(cache_id)

          if cached_flow_datastore:
@@ -380,9 +382,14 @@
              ds_root=ds_root,
          )
          blob_cache = self._blob_caches.setdefault(
-             cache_id, FileBlobCache(self, cache_id)
+             cache_id,
+             (
+                 FileBlobCache(self, cache_id),
+                 TaskMetadataCache(self, ds_type, ds_root, flow_name),
+             ),
          )
-         cached_flow_datastore.ca_store.set_blob_cache(blob_cache)
+         cached_flow_datastore.ca_store.set_blob_cache(blob_cache[0])
+         cached_flow_datastore.set_metadata_cache(blob_cache[1])
          self._store_caches[cache_id] = cached_flow_datastore
          if len(self._store_caches) > CLIENT_CACHE_MAX_FLOWDATASTORE_COUNT:
              cache_id_to_remove, _ = self._store_caches.popitem(last=False)
@@ -393,32 +400,52 @@
          self, ds_type, ds_root, flow_name, run_id, step_name, task_id, attempt
      ):
          flow_ds = self._get_flow_datastore(ds_type, ds_root, flow_name)
-         cached_metadata = None
-         if attempt is not None:
-             cache_id = self._task_ds_id(
-                 ds_type, ds_root, flow_name, run_id, step_name, task_id, attempt
+
+         return flow_ds.get_task_datastore(run_id, step_name, task_id, attempt=attempt)
+
+
+ class TaskMetadataCache(MetadataCache):
+     def __init__(self, filecache, ds_type, ds_root, flow_name):
+         self._filecache = filecache
+         self._ds_type = ds_type
+         self._ds_root = ds_root
+         self._flow_name = flow_name
+
+     def _path(self, run_id, step_name, task_id, attempt):
+         if attempt is None:
+             raise MetaflowException(
+                 "Attempt number must be specified to use task metadata cache. Raise an issue "
+                 "on Metaflow GitHub if you see this message.",
              )
-             cached_metadata = self._task_metadata_caches.get(cache_id)
-             if cached_metadata:
-                 od_move_to_end(self._task_metadata_caches, cache_id)
-                 return flow_ds.get_task_datastore(
-                     run_id,
-                     step_name,
-                     task_id,
-                     attempt=attempt,
-                     data_metadata=cached_metadata,
-                 )
-         # If we are here, we either have attempt=None or nothing in the cache
-         task_ds = flow_ds.get_task_datastore(
-             run_id, step_name, task_id, attempt=attempt
+         cache_id = self._filecache.task_ds_id(
+             self._ds_type,
+             self._ds_root,
+             self._flow_name,
+             run_id,
+             step_name,
+             task_id,
+             attempt,
          )
-         cache_id = self._task_ds_id(
-             ds_type, ds_root, flow_name, run_id, step_name, task_id, task_ds.attempt
+         token = (
+             "%s.cached"
+             % sha1(
+                 os.path.join(
+                     run_id, step_name, task_id, str(attempt), "metadata"
+                 ).encode("utf-8")
+             ).hexdigest()
+         )
+         return os.path.join(self._filecache.cache_dir, cache_id, token[:2], token)
+
+     def load_metadata(self, run_id, step_name, task_id, attempt):
+         d = self._filecache.read_file(self._path(run_id, step_name, task_id, attempt))
+         if d:
+             return json.loads(d)
+
+     def store_metadata(self, run_id, step_name, task_id, attempt, metadata_dict):
+         self._filecache.create_file(
+             self._path(run_id, step_name, task_id, attempt),
+             json.dumps(metadata_dict).encode("utf-8"),
          )
-         self._task_metadata_caches[cache_id] = task_ds.ds_metadata
-         if len(self._task_metadata_caches) > CLIENT_CACHE_MAX_TASKDATASTORE_COUNT:
-             self._task_metadata_caches.popitem(last=False)
-         return task_ds


  class FileBlobCache(BlobCache):
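The new cache's on-disk layout mirrors FileBlobCache's token scheme; a toy recomputation with made-up values:

    import os
    from hashlib import sha1

    # Recompute the path produced by TaskMetadataCache._path() above.
    key = os.path.join("run_id", "step_name", "task_id", "0", "metadata")
    token = "%s.cached" % sha1(key.encode("utf-8")).hexdigest()
    print(os.path.join("<cache_dir>", "<cache_id>", token[:2], token))
    # the token[:2] fan-out keeps any single cache directory small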
metaflow/cmd/code/__init__.py CHANGED
@@ -6,6 +6,7 @@ from tempfile import TemporaryDirectory
  from typing import Any, Callable, List, Mapping, Optional, cast

  from metaflow import Run
+ from metaflow.util import walk_without_cycles
  from metaflow._vendor import click
  from metaflow.cli import echo_always

@@ -51,7 +52,7 @@
      target_dir = os.getcwd()

      diffs = []
-     for dirpath, dirnames, filenames in os.walk(source_dir, followlinks=True):
+     for dirpath, _, filenames in walk_without_cycles(source_dir):
          for fname in filenames:
              # NOTE: the paths below need to be set up carefully
              # for the `patch` command to work. Better not to touch
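`walk_without_cycles` itself is not shown in this diff (it lands in `metaflow/util.py`, per the file list above); a minimal sketch of the idea, assuming a device/inode visited set — the real implementation may differ:

    import os

    def walk_without_cycles_sketch(top):
        # Like os.walk(top, followlinks=True), but never revisits a directory
        # reached a second time via symlinks, so symlink loops terminate.
        seen = set()
        for dirpath, dirnames, filenames in os.walk(top, followlinks=True):
            st = os.stat(dirpath)
            if (st.st_dev, st.st_ino) in seen:
                dirnames[:] = []  # already visited: do not descend again
                continue
            seen.add((st.st_dev, st.st_ino))
            yield dirpath, dirnames, filenames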