ob-metaflow 2.15.13.1__py2.py3-none-any.whl → 2.19.7.1rc0__py2.py3-none-any.whl

This diff shows the content of publicly available package versions released to a supported registry. It is provided for informational purposes only and reflects the changes between the two versions as they appear in the public registry.
Files changed (169)
  1. metaflow/__init__.py +10 -3
  2. metaflow/_vendor/imghdr/__init__.py +186 -0
  3. metaflow/_vendor/yaml/__init__.py +427 -0
  4. metaflow/_vendor/yaml/composer.py +139 -0
  5. metaflow/_vendor/yaml/constructor.py +748 -0
  6. metaflow/_vendor/yaml/cyaml.py +101 -0
  7. metaflow/_vendor/yaml/dumper.py +62 -0
  8. metaflow/_vendor/yaml/emitter.py +1137 -0
  9. metaflow/_vendor/yaml/error.py +75 -0
  10. metaflow/_vendor/yaml/events.py +86 -0
  11. metaflow/_vendor/yaml/loader.py +63 -0
  12. metaflow/_vendor/yaml/nodes.py +49 -0
  13. metaflow/_vendor/yaml/parser.py +589 -0
  14. metaflow/_vendor/yaml/reader.py +185 -0
  15. metaflow/_vendor/yaml/representer.py +389 -0
  16. metaflow/_vendor/yaml/resolver.py +227 -0
  17. metaflow/_vendor/yaml/scanner.py +1435 -0
  18. metaflow/_vendor/yaml/serializer.py +111 -0
  19. metaflow/_vendor/yaml/tokens.py +104 -0
  20. metaflow/cards.py +4 -0
  21. metaflow/cli.py +125 -21
  22. metaflow/cli_components/init_cmd.py +1 -0
  23. metaflow/cli_components/run_cmds.py +204 -40
  24. metaflow/cli_components/step_cmd.py +160 -4
  25. metaflow/client/__init__.py +1 -0
  26. metaflow/client/core.py +198 -130
  27. metaflow/client/filecache.py +59 -32
  28. metaflow/cmd/code/__init__.py +2 -1
  29. metaflow/cmd/develop/stub_generator.py +49 -18
  30. metaflow/cmd/develop/stubs.py +9 -27
  31. metaflow/cmd/make_wrapper.py +30 -0
  32. metaflow/datastore/__init__.py +1 -0
  33. metaflow/datastore/content_addressed_store.py +40 -9
  34. metaflow/datastore/datastore_set.py +10 -1
  35. metaflow/datastore/flow_datastore.py +124 -4
  36. metaflow/datastore/spin_datastore.py +91 -0
  37. metaflow/datastore/task_datastore.py +92 -6
  38. metaflow/debug.py +5 -0
  39. metaflow/decorators.py +331 -82
  40. metaflow/extension_support/__init__.py +414 -356
  41. metaflow/extension_support/_empty_file.py +2 -2
  42. metaflow/flowspec.py +322 -82
  43. metaflow/graph.py +178 -15
  44. metaflow/includefile.py +25 -3
  45. metaflow/lint.py +94 -3
  46. metaflow/meta_files.py +13 -0
  47. metaflow/metadata_provider/metadata.py +13 -2
  48. metaflow/metaflow_config.py +66 -4
  49. metaflow/metaflow_environment.py +91 -25
  50. metaflow/metaflow_profile.py +18 -0
  51. metaflow/metaflow_version.py +16 -1
  52. metaflow/package/__init__.py +673 -0
  53. metaflow/packaging_sys/__init__.py +880 -0
  54. metaflow/packaging_sys/backend.py +128 -0
  55. metaflow/packaging_sys/distribution_support.py +153 -0
  56. metaflow/packaging_sys/tar_backend.py +99 -0
  57. metaflow/packaging_sys/utils.py +54 -0
  58. metaflow/packaging_sys/v1.py +527 -0
  59. metaflow/parameters.py +6 -2
  60. metaflow/plugins/__init__.py +6 -0
  61. metaflow/plugins/airflow/airflow.py +11 -1
  62. metaflow/plugins/airflow/airflow_cli.py +16 -5
  63. metaflow/plugins/argo/argo_client.py +42 -20
  64. metaflow/plugins/argo/argo_events.py +6 -6
  65. metaflow/plugins/argo/argo_workflows.py +1023 -344
  66. metaflow/plugins/argo/argo_workflows_cli.py +396 -94
  67. metaflow/plugins/argo/argo_workflows_decorator.py +9 -0
  68. metaflow/plugins/argo/argo_workflows_deployer_objects.py +75 -49
  69. metaflow/plugins/argo/capture_error.py +5 -2
  70. metaflow/plugins/argo/conditional_input_paths.py +35 -0
  71. metaflow/plugins/argo/exit_hooks.py +209 -0
  72. metaflow/plugins/argo/param_val.py +19 -0
  73. metaflow/plugins/aws/aws_client.py +6 -0
  74. metaflow/plugins/aws/aws_utils.py +33 -1
  75. metaflow/plugins/aws/batch/batch.py +72 -5
  76. metaflow/plugins/aws/batch/batch_cli.py +24 -3
  77. metaflow/plugins/aws/batch/batch_decorator.py +57 -6
  78. metaflow/plugins/aws/step_functions/step_functions.py +28 -3
  79. metaflow/plugins/aws/step_functions/step_functions_cli.py +49 -4
  80. metaflow/plugins/aws/step_functions/step_functions_deployer.py +3 -0
  81. metaflow/plugins/aws/step_functions/step_functions_deployer_objects.py +30 -0
  82. metaflow/plugins/cards/card_cli.py +20 -1
  83. metaflow/plugins/cards/card_creator.py +24 -1
  84. metaflow/plugins/cards/card_datastore.py +21 -49
  85. metaflow/plugins/cards/card_decorator.py +58 -6
  86. metaflow/plugins/cards/card_modules/basic.py +38 -9
  87. metaflow/plugins/cards/card_modules/bundle.css +1 -1
  88. metaflow/plugins/cards/card_modules/chevron/renderer.py +1 -1
  89. metaflow/plugins/cards/card_modules/components.py +592 -3
  90. metaflow/plugins/cards/card_modules/convert_to_native_type.py +34 -5
  91. metaflow/plugins/cards/card_modules/json_viewer.py +232 -0
  92. metaflow/plugins/cards/card_modules/main.css +1 -0
  93. metaflow/plugins/cards/card_modules/main.js +56 -41
  94. metaflow/plugins/cards/card_modules/test_cards.py +22 -6
  95. metaflow/plugins/cards/component_serializer.py +1 -8
  96. metaflow/plugins/cards/metadata.py +22 -0
  97. metaflow/plugins/catch_decorator.py +9 -0
  98. metaflow/plugins/datastores/local_storage.py +12 -6
  99. metaflow/plugins/datastores/spin_storage.py +12 -0
  100. metaflow/plugins/datatools/s3/s3.py +49 -17
  101. metaflow/plugins/datatools/s3/s3op.py +113 -66
  102. metaflow/plugins/env_escape/client_modules.py +102 -72
  103. metaflow/plugins/events_decorator.py +127 -121
  104. metaflow/plugins/exit_hook/__init__.py +0 -0
  105. metaflow/plugins/exit_hook/exit_hook_decorator.py +46 -0
  106. metaflow/plugins/exit_hook/exit_hook_script.py +52 -0
  107. metaflow/plugins/kubernetes/kubernetes.py +12 -1
  108. metaflow/plugins/kubernetes/kubernetes_cli.py +11 -0
  109. metaflow/plugins/kubernetes/kubernetes_decorator.py +25 -6
  110. metaflow/plugins/kubernetes/kubernetes_job.py +12 -4
  111. metaflow/plugins/kubernetes/kubernetes_jobsets.py +31 -30
  112. metaflow/plugins/metadata_providers/local.py +76 -82
  113. metaflow/plugins/metadata_providers/service.py +13 -9
  114. metaflow/plugins/metadata_providers/spin.py +16 -0
  115. metaflow/plugins/package_cli.py +36 -24
  116. metaflow/plugins/parallel_decorator.py +11 -2
  117. metaflow/plugins/parsers.py +16 -0
  118. metaflow/plugins/pypi/bootstrap.py +7 -1
  119. metaflow/plugins/pypi/conda_decorator.py +41 -82
  120. metaflow/plugins/pypi/conda_environment.py +14 -6
  121. metaflow/plugins/pypi/micromamba.py +9 -1
  122. metaflow/plugins/pypi/pip.py +41 -5
  123. metaflow/plugins/pypi/pypi_decorator.py +4 -4
  124. metaflow/plugins/pypi/utils.py +22 -0
  125. metaflow/plugins/secrets/__init__.py +3 -0
  126. metaflow/plugins/secrets/secrets_decorator.py +14 -178
  127. metaflow/plugins/secrets/secrets_func.py +49 -0
  128. metaflow/plugins/secrets/secrets_spec.py +101 -0
  129. metaflow/plugins/secrets/utils.py +74 -0
  130. metaflow/plugins/test_unbounded_foreach_decorator.py +2 -2
  131. metaflow/plugins/timeout_decorator.py +0 -1
  132. metaflow/plugins/uv/bootstrap.py +29 -1
  133. metaflow/plugins/uv/uv_environment.py +5 -3
  134. metaflow/pylint_wrapper.py +5 -1
  135. metaflow/runner/click_api.py +79 -26
  136. metaflow/runner/deployer.py +208 -6
  137. metaflow/runner/deployer_impl.py +32 -12
  138. metaflow/runner/metaflow_runner.py +266 -33
  139. metaflow/runner/subprocess_manager.py +21 -1
  140. metaflow/runner/utils.py +27 -16
  141. metaflow/runtime.py +660 -66
  142. metaflow/task.py +255 -26
  143. metaflow/user_configs/config_options.py +33 -21
  144. metaflow/user_configs/config_parameters.py +220 -58
  145. metaflow/user_decorators/__init__.py +0 -0
  146. metaflow/user_decorators/common.py +144 -0
  147. metaflow/user_decorators/mutable_flow.py +512 -0
  148. metaflow/user_decorators/mutable_step.py +424 -0
  149. metaflow/user_decorators/user_flow_decorator.py +264 -0
  150. metaflow/user_decorators/user_step_decorator.py +749 -0
  151. metaflow/util.py +197 -7
  152. metaflow/vendor.py +23 -7
  153. metaflow/version.py +1 -1
  154. {ob_metaflow-2.15.13.1.data → ob_metaflow-2.19.7.1rc0.data}/data/share/metaflow/devtools/Makefile +13 -2
  155. {ob_metaflow-2.15.13.1.data → ob_metaflow-2.19.7.1rc0.data}/data/share/metaflow/devtools/Tiltfile +107 -7
  156. {ob_metaflow-2.15.13.1.data → ob_metaflow-2.19.7.1rc0.data}/data/share/metaflow/devtools/pick_services.sh +1 -0
  157. {ob_metaflow-2.15.13.1.dist-info → ob_metaflow-2.19.7.1rc0.dist-info}/METADATA +2 -3
  158. {ob_metaflow-2.15.13.1.dist-info → ob_metaflow-2.19.7.1rc0.dist-info}/RECORD +162 -121
  159. {ob_metaflow-2.15.13.1.dist-info → ob_metaflow-2.19.7.1rc0.dist-info}/WHEEL +1 -1
  160. metaflow/_vendor/v3_5/__init__.py +0 -1
  161. metaflow/_vendor/v3_5/importlib_metadata/__init__.py +0 -644
  162. metaflow/_vendor/v3_5/importlib_metadata/_compat.py +0 -152
  163. metaflow/_vendor/v3_5/zipp.py +0 -329
  164. metaflow/info_file.py +0 -25
  165. metaflow/package.py +0 -203
  166. metaflow/user_configs/config_decorators.py +0 -568
  167. {ob_metaflow-2.15.13.1.dist-info → ob_metaflow-2.19.7.1rc0.dist-info}/entry_points.txt +0 -0
  168. {ob_metaflow-2.15.13.1.dist-info → ob_metaflow-2.19.7.1rc0.dist-info}/licenses/LICENSE +0 -0
  169. {ob_metaflow-2.15.13.1.dist-info → ob_metaflow-2.19.7.1rc0.dist-info}/top_level.txt +0 -0
metaflow/cmd/develop/stub_generator.py
@@ -7,7 +7,6 @@ import pathlib
 import re
 import time
 import typing
-
 from datetime import datetime
 from io import StringIO
 from types import ModuleType
@@ -335,6 +334,8 @@ class StubGenerator:
 
         # Imports that are needed at the top of the file
         self._imports = set()  # type: Set[str]
+
+        self._sub_module_imports = set()  # type: Set[Tuple[str, str]]
         # Typing imports (behind if TYPE_CHECKING) that are needed at the top of the file
         self._typing_imports = set()  # type: Set[str]
         # Typevars that are defined
@@ -488,9 +489,6 @@ class StubGenerator:
            self._imports.add(name)
 
        def _add_to_typing_check(name, is_module=False):
-            # if name != self._current_module_name:
-            #     self._typing_imports.add(name)
-            #
            if name == "None":
                return
            if is_module:
@@ -504,6 +502,24 @@ class StubGenerator:
                # the current file
                self._typing_imports.add(splits[0])
 
+        def _format_qualified_class_name(cls: type) -> str:
+            """Helper to format a class with its qualified module name"""
+            # Special case for NoneType - return None
+            if cls.__name__ == "NoneType":
+                return "None"
+
+            module = inspect.getmodule(cls)
+            if (
+                module
+                and module.__name__ != "builtins"
+                and module.__name__ != "__main__"
+            ):
+                module_name = self._get_module_name_alias(module.__name__)
+                _add_to_typing_check(module_name, is_module=True)
+                return f"{module_name}.{cls.__name__}"
+            else:
+                return cls.__name__
+
        if isinstance(element, str):
            # Special case for self referential things (particularly in a class)
            if element == self._current_name:
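
Note: _format_qualified_class_name above qualifies class objects with their (aliased) module path so generated stubs reference importable names, registering the module as a TYPE_CHECKING import along the way. A standalone sketch of the same idea, without StubGenerator's aliasing and import bookkeeping (illustrative only, not part of the package):

    import inspect

    def format_qualified_class_name(cls: type) -> str:
        # Same shape as the helper above, minus self._get_module_name_alias
        # and the _add_to_typing_check registration.
        if cls.__name__ == "NoneType":
            return "None"
        module = inspect.getmodule(cls)
        if module and module.__name__ not in ("builtins", "__main__"):
            return f"{module.__name__}.{cls.__name__}"
        return cls.__name__

    import pathlib

    print(format_qualified_class_name(pathlib.Path))  # -> pathlib.Path
    print(format_qualified_class_name(int))           # -> int
    print(format_qualified_class_name(type(None)))    # -> None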
@@ -557,19 +573,15 @@ class StubGenerator:
            return element.__name__
        elif isinstance(element, type(Ellipsis)):
            return "..."
-        # elif (
-        #     isinstance(element, typing._GenericAlias)
-        #     and hasattr(element, "_name")
-        #     and element._name in ("List", "Tuple", "Dict", "Set")
-        # ):
-        #     # 3.7 has these as _GenericAlias but they don't behave like the ones in 3.10
-        #     _add_to_import("typing")
-        #     return str(element)
        elif isinstance(element, typing._GenericAlias):
            # We need to check things recursively in __args__ if it exists
            args_str = []
            for arg in getattr(element, "__args__", []):
-                args_str.append(self._get_element_name_with_module(arg))
+                # Special handling for class objects in type arguments
+                if isinstance(arg, type):
+                    args_str.append(_format_qualified_class_name(arg))
+                else:
+                    args_str.append(self._get_element_name_with_module(arg))
 
            _add_to_import("typing")
            if element._name:
@@ -584,12 +596,15 @@ class StubGenerator:
                args_str = [call_args, args_str[-1]]
                return "typing.%s[%s]" % (element._name, ", ".join(args_str))
            else:
-                return "%s[%s]" % (element.__origin__, ", ".join(args_str))
+                # Handle the case where we have a generic type without a _name
+                origin = element.__origin__
+                if isinstance(origin, type):
+                    origin_str = _format_qualified_class_name(origin)
+                else:
+                    origin_str = str(origin)
+                return "%s[%s]" % (origin_str, ", ".join(args_str))
        elif isinstance(element, ForwardRef):
            f_arg = self._get_module_name_alias(element.__forward_arg__)
-            # if f_arg in ("Run", "Task"): # HACK -- forward references in current.py
-            #     _add_to_import("metaflow")
-            #     f_arg = "metaflow.%s" % f_arg
            _add_to_typing_check(f_arg)
            return '"%s"' % f_arg
        elif inspect.getmodule(element) == inspect.getmodule(typing):
@@ -629,6 +644,21 @@ class StubGenerator:
                    "deployer"
                ] = (self._current_module_name + "." + name)
 
+        # Handle TypedDict gracefully for Python 3.7 compatibility
+        # _TypedDictMeta is not available in Python 3.7
+        typed_dict_meta = getattr(typing, "_TypedDictMeta", None)
+        if typed_dict_meta is not None and isinstance(clazz, typed_dict_meta):
+            self._sub_module_imports.add(("typing", "TypedDict"))
+            total_flag = getattr(clazz, "__total__", False)
+            buff = StringIO()
+            # Emit the TypedDict base and total flag
+            buff.write(f"class {name}(TypedDict, total={total_flag}):\n")
+            # Write out each field from __annotations__
+            for field_name, field_type in clazz.__annotations__.items():
+                ann = self._get_element_name_with_module(field_type)
+                buff.write(f"{TAB}{field_name}: {ann}\n")
+            return buff.getvalue()
+
        buff = StringIO()
        # Class prototype
        buff.write("class " + name.split(".")[-1] + "(")
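
Note: the TypedDict branch above bypasses the regular class-prototype emission. For a hypothetical TypedDict such as:

    from typing import TypedDict

    class TrainConfig(TypedDict, total=False):
        learning_rate: float
        epochs: int

the generator would emit a stub body along the lines of "class TrainConfig(TypedDict, total=False):" followed by one annotated line per field from __annotations__, and it records ("typing", "TypedDict") in _sub_module_imports so that "from typing import TypedDict" is written at the top of the stub file (see the writer change below).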
@@ -973,7 +1003,6 @@ class StubGenerator:
        ]
 
        docs = split_docs(raw_doc, section_boundaries)
-
        parameters, no_arg_version = parse_params_from_doc(docs["param_doc"])
 
        if docs["add_to_current_doc"]:
@@ -1501,6 +1530,8 @@ class StubGenerator:
                f.write("import " + module + "\n")
                if module == "typing":
                    imported_typing = True
+            for module, sub_module in self._sub_module_imports:
+                f.write(f"from {module} import {sub_module}\n")
            if self._typing_imports:
                if not imported_typing:
                    f.write("import typing\n")
metaflow/cmd/develop/stubs.py
@@ -12,25 +12,13 @@ from . import develop
 from .stub_generator import StubGenerator
 
 _py_ver = sys.version_info[:2]
-_metadata_package = None
 
-
-def _check_stubs_supported():
-    global _metadata_package
-    if _metadata_package is not None:
-        return _metadata_package
-    else:
-        if _py_ver >= (3, 4):
-            if _py_ver >= (3, 8):
-                from importlib import metadata
-            elif _py_ver >= (3, 7):
-                from metaflow._vendor.v3_7 import importlib_metadata as metadata
-            elif _py_ver >= (3, 6):
-                from metaflow._vendor.v3_6 import importlib_metadata as metadata
-            else:
-                from metaflow._vendor.v3_5 import importlib_metadata as metadata
-        _metadata_package = metadata
-    return _metadata_package
+if _py_ver >= (3, 8):
+    from importlib import metadata
+elif _py_ver >= (3, 7):
+    from metaflow._vendor.v3_7 import importlib_metadata as metadata
+else:
+    from metaflow._vendor.v3_6 import importlib_metadata as metadata
 
 
 @develop.group(short_help="Stubs management")
@@ -45,12 +33,6 @@ def stubs(ctx: Any):
    This CLI provides utilities to check and generate stubs for your current Metaflow
    installation.
    """
-    if _check_stubs_supported() is None:
-        raise click.UsageError(
-            "Building and installing stubs are not supported on Python %d.%d "
-            "(3.4 minimum required)" % _py_ver,
-            ctx=ctx,
-        )
 
 
 @stubs.command(short_help="Check validity of stubs")
@@ -187,7 +169,7 @@ setup(
    packages=find_namespace_packages(),
    package_data={{"metaflow-stubs": ["generated_for.txt", "py.typed", "**/*.pyi"]}},
    install_requires=["metaflow=={mf_version}"],
-    python_requires=">=3.5.2",
+    python_requires=">=3.6.1",
 )
 """
 )
@@ -330,14 +312,14 @@ def get_packages_for_stubs() -> Tuple[List[Tuple[str, str]], List[str]]:
    # some reason it shows up multiple times.
    interesting_dists = [
        d
-        for d in _metadata_package.distributions()
+        for d in metadata.distributions()
        if any(
            [
                p == "metaflow-stubs"
                for p in (d.read_text("top_level.txt") or "").split()
            ]
        )
-        and isinstance(d, _metadata_package.PathDistribution)
+        and isinstance(d, metadata.PathDistribution)
    ]
 
    for dist in interesting_dists:
metaflow/cmd/make_wrapper.py
@@ -28,6 +28,36 @@ def find_makefile():
        if makefile_candidate.is_file():
            return makefile_candidate
 
+    # 4) When developing, Metaflow might be installed with --editable, which means the devtools will not be located within site-packages.
+    # We read the actual location from package metadata in this case, but only do this heavier operation if the above lookups fail.
+    try:
+        import json
+        from importlib.metadata import Distribution
+
+        direct_url = Distribution.from_name("metaflow").read_text("direct_url.json")
+        if direct_url:
+            content = json.loads(direct_url)
+            url = content.get("url", "")
+            if not url.startswith("file://"):
+                return None
+
+            makefile_candidate = (
+                Path(url.replace("file://", "")) / "devtools" / "Makefile"
+            )
+            if makefile_candidate.is_file():
+                return makefile_candidate
+        else:
+            # No dist metadata found. This is tied to the version of pip being used
+            # Do not bother with .egg-link installs due to the handling of the file contents being a headache due to lack of a unified spec.
+            print(
+                "Could not locate an installation of Metaflow. No package metadata found."
+            )
+            print(
+                "If Metaflow is installed as editable, try upgrading the version of pip and reinstalling in order to generate proper package metadata.\n"
+            )
+    except Exception:
+        return None
+
    return None
 
 
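Note: direct_url.json is the PEP 610 record that pip writes into a package's dist-info; for an editable install its payload typically looks like {"url": "file:///home/user/src/metaflow", "dir_info": {"editable": true}} (path hypothetical). A minimal standalone version of the same lookup, assuming Python 3.8+:

    import json
    from importlib.metadata import Distribution

    raw = Distribution.from_name("metaflow").read_text("direct_url.json")
    if raw:
        info = json.loads(raw)
        # For an editable install, "url" points at the source checkout.
        print(info.get("url"), info.get("dir_info", {}).get("editable", False))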
metaflow/datastore/__init__.py
@@ -2,3 +2,4 @@ from .inputs import Inputs
 from .flow_datastore import FlowDataStore
 from .datastore_set import TaskDataStoreSet
 from .task_datastore import TaskDataStore
+from .spin_datastore import SpinTaskDatastore
metaflow/datastore/content_addressed_store.py
@@ -38,7 +38,7 @@ class ContentAddressedStore(object):
    def set_blob_cache(self, blob_cache):
        self._blob_cache = blob_cache
 
-    def save_blobs(self, blob_iter, raw=False, len_hint=0):
+    def save_blobs(self, blob_iter, raw=False, len_hint=0, is_transfer=False):
        """
        Saves blobs of data to the datastore
 
@@ -60,11 +60,16 @@ class ContentAddressedStore(object):
 
        Parameters
        ----------
-        blob_iter : Iterator over bytes objects to save
-        raw : bool, optional
+        blob_iter : Iterator
+            Iterator over bytes objects to save
+        raw : bool, default False
            Whether to save the bytes directly or process them, by default False
-        len_hint : Hint of the number of blobs that will be produced by the
+        len_hint : int, default 0
+            Hint of the number of blobs that will be produced by the
            iterator, by default 0
+        is_transfer : bool, default False
+            If True, this indicates we are saving blobs directly from the output of another
+            content addressed store's load_blobs method
 
        Returns
        -------
@@ -76,6 +81,20 @@ class ContentAddressedStore(object):
 
        def packing_iter():
            for blob in blob_iter:
+                if is_transfer:
+                    key, blob_data, meta = blob
+                    path = self._storage_impl.path_join(self._prefix, key[:2], key)
+                    # Transfer data is always raw/decompressed, so mark it as such
+                    meta_corrected = {"cas_raw": True, "cas_version": 1}
+
+                    results.append(
+                        self.save_blobs_result(
+                            uri=self._storage_impl.full_uri(path),
+                            key=key,
+                        )
+                    )
+                    yield path, (BytesIO(blob_data), meta_corrected)
+                    continue
                sha = sha1(blob).hexdigest()
                path = self._storage_impl.path_join(self._prefix, sha[:2], sha)
                results.append(
@@ -100,7 +119,7 @@ class ContentAddressedStore(object):
        self._storage_impl.save_bytes(packing_iter(), overwrite=True, len_hint=len_hint)
        return results
 
-    def load_blobs(self, keys, force_raw=False):
+    def load_blobs(self, keys, force_raw=False, is_transfer=False):
        """
        Mirror function of save_blobs
 
@@ -111,15 +130,20 @@ class ContentAddressedStore(object):
        ----------
        keys : List of string
            Key describing the object to load
-        force_raw : bool, optional
+        force_raw : bool, default False
            Support for backward compatibility with previous datastores. If
            True, this will force the key to be loaded as is (raw). By default,
            False
+        is_transfer : bool, default False
+            If True, this indicates we are loading blobs to transfer them directly
+            to another datastore. We will, in this case, also transfer the metadata
+            and do minimal processing. This is for internal use only.
 
        Returns
        -------
        Returns an iterator of (string, bytes) tuples; the iterator may return keys
-        in a different order than were passed in.
+        in a different order than were passed in. If is_transfer is True, the tuple
+        has three elements with the third one being the metadata.
        """
        load_paths = []
        for key in keys:
@@ -127,7 +151,11 @@ class ContentAddressedStore(object):
            if self._blob_cache:
                blob = self._blob_cache.load_key(key)
                if blob is not None:
-                    yield key, blob
+                    if is_transfer:
+                        # Cached blobs are decompressed/processed bytes regardless of original format
+                        yield key, blob, {"cas_raw": False, "cas_version": 1}
+                    else:
+                        yield key, blob
                else:
                    path = self._storage_impl.path_join(self._prefix, key[:2], key)
                    load_paths.append((key, path))
@@ -169,7 +197,10 @@ class ContentAddressedStore(object):
                if self._blob_cache:
                    self._blob_cache.store_key(key, blob)
 
-                yield key, blob
+                if is_transfer:
+                    yield key, blob, meta  # Preserve exact original metadata from storage
+                else:
+                    yield key, blob
 
    def _unpack_backward_compatible(self, blob):
        # This is the backward compatible unpack
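
Note: the two is_transfer flags form a fast copy path between stores: load_blobs(..., is_transfer=True) yields (key, bytes, metadata) triples, and save_blobs(..., is_transfer=True) consumes exactly those triples, reusing the existing keys instead of re-hashing the content. A sketch, assuming src_store and dst_store are two compatible ContentAddressedStore instances and keys holds existing content-addressed keys:

    def transfer_blobs(src_store, dst_store, keys):
        # Stream (key, blob_bytes, meta) triples straight from one store
        # into the other; no sha1 re-hashing on the save side.
        triples = src_store.load_blobs(keys, is_transfer=True)
        return dst_store.save_blobs(triples, is_transfer=True, len_hint=len(keys))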
metaflow/datastore/datastore_set.py
@@ -21,9 +21,18 @@ class TaskDataStoreSet(object):
        pathspecs=None,
        prefetch_data_artifacts=None,
        allow_not_done=False,
+        join_type=None,
+        orig_flow_datastore=None,
+        spin_artifacts=None,
    ):
        self.task_datastores = flow_datastore.get_task_datastores(
-            run_id, steps=steps, pathspecs=pathspecs, allow_not_done=allow_not_done
+            run_id,
+            steps=steps,
+            pathspecs=pathspecs,
+            allow_not_done=allow_not_done,
+            join_type=join_type,
+            orig_flow_datastore=orig_flow_datastore,
+            spin_artifacts=spin_artifacts,
        )
 
        if prefetch_data_artifacts:
metaflow/datastore/flow_datastore.py
@@ -1,10 +1,13 @@
 import itertools
 import json
+from abc import ABC, abstractmethod
 
 from .. import metaflow_config
 
 from .content_addressed_store import ContentAddressedStore
 from .task_datastore import TaskDataStore
+from .spin_datastore import SpinTaskDatastore
+from ..metaflow_profile import from_start
 
 
 class FlowDataStore(object):
@@ -63,10 +66,16 @@ class FlowDataStore(object):
            self._storage_impl.path_join(self.flow_name, "data"), self._storage_impl
        )
 
+        # Private
+        self._metadata_cache = None
+
    @property
    def datastore_root(self):
        return self._storage_impl.datastore_root
 
+    def set_metadata_cache(self, cache):
+        self._metadata_cache = cache
+
    def get_task_datastores(
        self,
        run_id=None,
@@ -76,6 +85,9 @@ class FlowDataStore(object):
        attempt=None,
        include_prior=False,
        mode="r",
+        join_type=None,
+        orig_flow_datastore=None,
+        spin_artifacts=None,
    ):
        """
        Return a list of TaskDataStore for a subset of the tasks.
@@ -95,7 +107,7 @@ class FlowDataStore(object):
            Steps to get the tasks from. If run_id is specified, this
            must also be specified, by default None
        pathspecs : List[str], optional
-            Full task specs (run_id/step_name/task_id). Can be used instead of
+            Full task specs (run_id/step_name/task_id[/attempt]). Can be used instead of
            specifying run_id and steps, by default None
        allow_not_done : bool, optional
            If True, returns the latest attempt of a task even if that attempt
@@ -106,6 +118,16 @@ class FlowDataStore(object):
            If True, returns all attempts up to and including attempt.
        mode : str, default "r"
            Mode to initialize the returned TaskDataStores in.
+        join_type : str, optional, default None
+            If specified, the join type for the task. This is used to determine
+            the user specified artifacts for the task in case of a spin task.
+        orig_flow_datastore : FlowDataStore, optional, default None
+            The original flow datastore in case of a spin task. If provided, the
+            returned TaskDataStore will be a SpinTaskDatastore instead of a
+            TaskDataStore.
+        spin_artifacts : Dict[str, Any], optional, default None
+            Artifacts provided by user that can override the artifacts fetched via the
+            spin pathspec.
 
        Returns
        -------
@@ -145,7 +167,14 @@ class FlowDataStore(object):
        if attempt is not None and attempt <= metaflow_config.MAX_ATTEMPTS - 1:
            attempt_range = range(attempt + 1) if include_prior else [attempt]
        for task_url in task_urls:
-            for attempt in attempt_range:
+            # task_url can have a trailing slash, so strip this to avoid empty strings in the split
+            task_splits = task_url.rstrip("/").split("/")
+            # Usually it is flow, run, step, task (so 4 components) -- if we have a
+            # fifth one, there is a specific attempt number listed as well.
+            task_attempt_range = attempt_range
+            if len(task_splits) == 5:
+                task_attempt_range = [int(task_splits[4])]
+            for attempt in task_attempt_range:
                for suffix in [
                    TaskDataStore.METADATA_DATA_SUFFIX,
                    TaskDataStore.METADATA_ATTEMPT_SUFFIX,
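
Note: with the parsing above, a task URL may pin a specific attempt via a fifth path component, e.g. (hypothetical values):

    task_url = "MyFlow/123/train/456/2/"  # trailing slash is tolerated
    task_splits = task_url.rstrip("/").split("/")
    assert task_splits == ["MyFlow", "123", "train", "456", "2"]
    # len == 5, so only attempt 2 is fetched instead of the computed range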
@@ -198,7 +227,18 @@ class FlowDataStore(object):
            else (latest_started_attempts & done_attempts)
        )
        latest_to_fetch = [
-            (v[0], v[1], v[2], v[3], data_objs.get(v), mode, allow_not_done)
+            (
+                v[0],
+                v[1],
+                v[2],
+                v[3],
+                data_objs.get(v),
+                mode,
+                allow_not_done,
+                join_type,
+                orig_flow_datastore,
+                spin_artifacts,
+            )
            for v in latest_to_fetch
        ]
        return list(itertools.starmap(self.get_task_datastore, latest_to_fetch))
@@ -212,8 +252,63 @@ class FlowDataStore(object):
        data_metadata=None,
        mode="r",
        allow_not_done=False,
+        join_type=None,
+        orig_flow_datastore=None,
+        spin_artifacts=None,
+        persist=True,
    ):
-        return TaskDataStore(
+        if orig_flow_datastore is not None:
+            # In spin step subprocess, use SpinTaskDatastore for accessing artifacts
+            if join_type is not None:
+                # If join_type is specified, we need to use the artifacts corresponding
+                # to that particular join index, specified by the parent task pathspec.
+                spin_artifacts = spin_artifacts.get(
+                    f"{run_id}/{step_name}/{task_id}", {}
+                )
+            from_start(
+                "FlowDataStore: get_task_datastore for spin task for type %s %s metadata"
+                % (self.TYPE, "without" if data_metadata is None else "with")
+            )
+            # Get the task datastore for the spun task.
+            orig_datastore = orig_flow_datastore.get_task_datastore(
+                run_id,
+                step_name,
+                task_id,
+                attempt=attempt,
+                data_metadata=data_metadata,
+                mode=mode,
+                allow_not_done=allow_not_done,
+                persist=persist,
+            )
+
+            return SpinTaskDatastore(
+                self.flow_name,
+                run_id,
+                step_name,
+                task_id,
+                orig_datastore,
+                spin_artifacts,
+            )
+
+        cache_hit = False
+        if (
+            self._metadata_cache is not None
+            and data_metadata is None
+            and attempt is not None
+            and allow_not_done is False
+        ):
+            # If we have a metadata cache, we can try to load the metadata
+            # from the cache if it is not provided.
+            data_metadata = self._metadata_cache.load_metadata(
+                run_id, step_name, task_id, attempt
+            )
+            cache_hit = data_metadata is not None
+
+        from_start(
+            "FlowDataStore: get_task_datastore for regular task for type %s %s metadata"
+            % (self.TYPE, "without" if data_metadata is None else "with")
+        )
+        task_datastore = TaskDataStore(
            self,
            run_id,
            step_name,
@@ -222,8 +317,23 @@ class FlowDataStore(object):
            data_metadata=data_metadata,
            mode=mode,
            allow_not_done=allow_not_done,
+            persist=persist,
        )
 
+        # Only persist in cache if it is non-changing (so done only) and we have
+        # a non-None attempt
+        if (
+            not cache_hit
+            and self._metadata_cache is not None
+            and allow_not_done is False
+            and attempt is not None
+        ):
+            self._metadata_cache.store_metadata(
+                run_id, step_name, task_id, attempt, task_datastore.ds_metadata
+            )
+
+        return task_datastore
+
    def save_data(self, data_iter, len_hint=0):
        """Saves data to the underlying content-addressed store
 
@@ -265,3 +375,13 @@ class FlowDataStore(object):
        """
        for key, blob in self.ca_store.load_blobs(keys, force_raw=force_raw):
            yield key, blob
+
+
+class MetadataCache(ABC):
+    @abstractmethod
+    def load_metadata(self, run_id, step_name, task_id, attempt):
+        raise NotImplementedError()
+
+    @abstractmethod
+    def store_metadata(self, run_id, step_name, task_id, attempt, metadata_dict):
+        raise NotImplementedError()
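
Note: MetadataCache is only a contract here; the diff wires it in via set_metadata_cache and the cache_hit logic in get_task_datastore but ships no concrete implementation in this file. A minimal in-memory sketch of the interface (illustrative only):

    class InMemoryMetadataCache(MetadataCache):
        def __init__(self):
            self._entries = {}

        def load_metadata(self, run_id, step_name, task_id, attempt):
            # Returning None signals a cache miss to get_task_datastore
            return self._entries.get((run_id, step_name, task_id, attempt))

        def store_metadata(self, run_id, step_name, task_id, attempt, metadata_dict):
            self._entries[(run_id, step_name, task_id, attempt)] = metadata_dict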
metaflow/datastore/spin_datastore.py (new file)
@@ -0,0 +1,91 @@
+from typing import Dict, Any
+from .task_datastore import TaskDataStore, require_mode
+from ..metaflow_profile import from_start
+
+
+class SpinTaskDatastore(object):
+    def __init__(
+        self,
+        flow_name: str,
+        run_id: str,
+        step_name: str,
+        task_id: str,
+        orig_datastore: TaskDataStore,
+        spin_artifacts: Dict[str, Any],
+    ):
+        """
+        SpinTaskDatastore is a datastore for a task that is used to retrieve
+        artifacts and attributes for a spin step. It uses the task pathspec
+        from a previous execution of the step to access the artifacts and attributes.
+
+        Parameters:
+        -----------
+        flow_name : str
+            Name of the flow
+        run_id : str
+            Run ID of the flow
+        step_name : str
+            Name of the step
+        task_id : str
+            Task ID of the step
+        orig_datastore : TaskDataStore
+            The datastore for the underlying task that is being spun.
+        spin_artifacts : Dict[str, Any]
+            User provided artifacts that are to be used in the spin task. This is a dictionary
+            where keys are artifact names and values are the actual data or metadata.
+        """
+        self.flow_name = flow_name
+        self.run_id = run_id
+        self.step_name = step_name
+        self.task_id = task_id
+        self.orig_datastore = orig_datastore
+        self.spin_artifacts = spin_artifacts
+        self._task = None
+
+        # Update _objects and _info in order to persist artifacts
+        # See `persist` method in `TaskDatastore` for more details
+        self._objects = self.orig_datastore._objects.copy()
+        self._info = self.orig_datastore._info.copy()
+
+        # We strip out some of the control ones
+        for key in ("_transition",):
+            if key in self._objects:
+                del self._objects[key]
+                del self._info[key]
+
+        from_start("SpinTaskDatastore: Initialized artifacts")
+
+    @require_mode(None)
+    def __getitem__(self, name):
+        try:
+            # Check if it's an artifact in the spin_artifacts
+            return self.spin_artifacts[name]
+        except KeyError:
+            try:
+                # Check if it's an attribute of the task
+                # _foreach_stack, _foreach_index, ...
+                return self.orig_datastore[name]
+            except (KeyError, AttributeError) as e:
+                raise KeyError(
+                    f"Attribute '{name}' not found in the previous execution "
+                    f"of the tasks for `{self.step_name}`."
+                ) from e
+
+    @require_mode(None)
+    def is_none(self, name):
+        val = self.__getitem__(name)
+        return val is None
+
+    @require_mode(None)
+    def __contains__(self, name):
+        try:
+            _ = self.__getitem__(name)
+            return True
+        except KeyError:
+            return False
+
+    @require_mode(None)
+    def items(self):
+        if self._objects:
+            return self._objects.items()
+        return {}
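
Note: the lookup order implemented by __getitem__ above is user-supplied spin artifacts first, then the original task's datastore. A usage sketch with hypothetical names, where orig_ds is a TaskDataStore from the previously executed task:

    ds = SpinTaskDatastore(
        "MyFlow", "123", "train", "456",
        orig_datastore=orig_ds,
        spin_artifacts={"alpha": 0.5},  # user override
    )
    assert ds["alpha"] == 0.5       # served from spin_artifacts
    found = "_foreach_stack" in ds  # falls through to orig_ds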