ob-metaflow 2.15.13.1__py2.py3-none-any.whl → 2.19.7.1rc0__py2.py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (169)
  1. metaflow/__init__.py +10 -3
  2. metaflow/_vendor/imghdr/__init__.py +186 -0
  3. metaflow/_vendor/yaml/__init__.py +427 -0
  4. metaflow/_vendor/yaml/composer.py +139 -0
  5. metaflow/_vendor/yaml/constructor.py +748 -0
  6. metaflow/_vendor/yaml/cyaml.py +101 -0
  7. metaflow/_vendor/yaml/dumper.py +62 -0
  8. metaflow/_vendor/yaml/emitter.py +1137 -0
  9. metaflow/_vendor/yaml/error.py +75 -0
  10. metaflow/_vendor/yaml/events.py +86 -0
  11. metaflow/_vendor/yaml/loader.py +63 -0
  12. metaflow/_vendor/yaml/nodes.py +49 -0
  13. metaflow/_vendor/yaml/parser.py +589 -0
  14. metaflow/_vendor/yaml/reader.py +185 -0
  15. metaflow/_vendor/yaml/representer.py +389 -0
  16. metaflow/_vendor/yaml/resolver.py +227 -0
  17. metaflow/_vendor/yaml/scanner.py +1435 -0
  18. metaflow/_vendor/yaml/serializer.py +111 -0
  19. metaflow/_vendor/yaml/tokens.py +104 -0
  20. metaflow/cards.py +4 -0
  21. metaflow/cli.py +125 -21
  22. metaflow/cli_components/init_cmd.py +1 -0
  23. metaflow/cli_components/run_cmds.py +204 -40
  24. metaflow/cli_components/step_cmd.py +160 -4
  25. metaflow/client/__init__.py +1 -0
  26. metaflow/client/core.py +198 -130
  27. metaflow/client/filecache.py +59 -32
  28. metaflow/cmd/code/__init__.py +2 -1
  29. metaflow/cmd/develop/stub_generator.py +49 -18
  30. metaflow/cmd/develop/stubs.py +9 -27
  31. metaflow/cmd/make_wrapper.py +30 -0
  32. metaflow/datastore/__init__.py +1 -0
  33. metaflow/datastore/content_addressed_store.py +40 -9
  34. metaflow/datastore/datastore_set.py +10 -1
  35. metaflow/datastore/flow_datastore.py +124 -4
  36. metaflow/datastore/spin_datastore.py +91 -0
  37. metaflow/datastore/task_datastore.py +92 -6
  38. metaflow/debug.py +5 -0
  39. metaflow/decorators.py +331 -82
  40. metaflow/extension_support/__init__.py +414 -356
  41. metaflow/extension_support/_empty_file.py +2 -2
  42. metaflow/flowspec.py +322 -82
  43. metaflow/graph.py +178 -15
  44. metaflow/includefile.py +25 -3
  45. metaflow/lint.py +94 -3
  46. metaflow/meta_files.py +13 -0
  47. metaflow/metadata_provider/metadata.py +13 -2
  48. metaflow/metaflow_config.py +66 -4
  49. metaflow/metaflow_environment.py +91 -25
  50. metaflow/metaflow_profile.py +18 -0
  51. metaflow/metaflow_version.py +16 -1
  52. metaflow/package/__init__.py +673 -0
  53. metaflow/packaging_sys/__init__.py +880 -0
  54. metaflow/packaging_sys/backend.py +128 -0
  55. metaflow/packaging_sys/distribution_support.py +153 -0
  56. metaflow/packaging_sys/tar_backend.py +99 -0
  57. metaflow/packaging_sys/utils.py +54 -0
  58. metaflow/packaging_sys/v1.py +527 -0
  59. metaflow/parameters.py +6 -2
  60. metaflow/plugins/__init__.py +6 -0
  61. metaflow/plugins/airflow/airflow.py +11 -1
  62. metaflow/plugins/airflow/airflow_cli.py +16 -5
  63. metaflow/plugins/argo/argo_client.py +42 -20
  64. metaflow/plugins/argo/argo_events.py +6 -6
  65. metaflow/plugins/argo/argo_workflows.py +1023 -344
  66. metaflow/plugins/argo/argo_workflows_cli.py +396 -94
  67. metaflow/plugins/argo/argo_workflows_decorator.py +9 -0
  68. metaflow/plugins/argo/argo_workflows_deployer_objects.py +75 -49
  69. metaflow/plugins/argo/capture_error.py +5 -2
  70. metaflow/plugins/argo/conditional_input_paths.py +35 -0
  71. metaflow/plugins/argo/exit_hooks.py +209 -0
  72. metaflow/plugins/argo/param_val.py +19 -0
  73. metaflow/plugins/aws/aws_client.py +6 -0
  74. metaflow/plugins/aws/aws_utils.py +33 -1
  75. metaflow/plugins/aws/batch/batch.py +72 -5
  76. metaflow/plugins/aws/batch/batch_cli.py +24 -3
  77. metaflow/plugins/aws/batch/batch_decorator.py +57 -6
  78. metaflow/plugins/aws/step_functions/step_functions.py +28 -3
  79. metaflow/plugins/aws/step_functions/step_functions_cli.py +49 -4
  80. metaflow/plugins/aws/step_functions/step_functions_deployer.py +3 -0
  81. metaflow/plugins/aws/step_functions/step_functions_deployer_objects.py +30 -0
  82. metaflow/plugins/cards/card_cli.py +20 -1
  83. metaflow/plugins/cards/card_creator.py +24 -1
  84. metaflow/plugins/cards/card_datastore.py +21 -49
  85. metaflow/plugins/cards/card_decorator.py +58 -6
  86. metaflow/plugins/cards/card_modules/basic.py +38 -9
  87. metaflow/plugins/cards/card_modules/bundle.css +1 -1
  88. metaflow/plugins/cards/card_modules/chevron/renderer.py +1 -1
  89. metaflow/plugins/cards/card_modules/components.py +592 -3
  90. metaflow/plugins/cards/card_modules/convert_to_native_type.py +34 -5
  91. metaflow/plugins/cards/card_modules/json_viewer.py +232 -0
  92. metaflow/plugins/cards/card_modules/main.css +1 -0
  93. metaflow/plugins/cards/card_modules/main.js +56 -41
  94. metaflow/plugins/cards/card_modules/test_cards.py +22 -6
  95. metaflow/plugins/cards/component_serializer.py +1 -8
  96. metaflow/plugins/cards/metadata.py +22 -0
  97. metaflow/plugins/catch_decorator.py +9 -0
  98. metaflow/plugins/datastores/local_storage.py +12 -6
  99. metaflow/plugins/datastores/spin_storage.py +12 -0
  100. metaflow/plugins/datatools/s3/s3.py +49 -17
  101. metaflow/plugins/datatools/s3/s3op.py +113 -66
  102. metaflow/plugins/env_escape/client_modules.py +102 -72
  103. metaflow/plugins/events_decorator.py +127 -121
  104. metaflow/plugins/exit_hook/__init__.py +0 -0
  105. metaflow/plugins/exit_hook/exit_hook_decorator.py +46 -0
  106. metaflow/plugins/exit_hook/exit_hook_script.py +52 -0
  107. metaflow/plugins/kubernetes/kubernetes.py +12 -1
  108. metaflow/plugins/kubernetes/kubernetes_cli.py +11 -0
  109. metaflow/plugins/kubernetes/kubernetes_decorator.py +25 -6
  110. metaflow/plugins/kubernetes/kubernetes_job.py +12 -4
  111. metaflow/plugins/kubernetes/kubernetes_jobsets.py +31 -30
  112. metaflow/plugins/metadata_providers/local.py +76 -82
  113. metaflow/plugins/metadata_providers/service.py +13 -9
  114. metaflow/plugins/metadata_providers/spin.py +16 -0
  115. metaflow/plugins/package_cli.py +36 -24
  116. metaflow/plugins/parallel_decorator.py +11 -2
  117. metaflow/plugins/parsers.py +16 -0
  118. metaflow/plugins/pypi/bootstrap.py +7 -1
  119. metaflow/plugins/pypi/conda_decorator.py +41 -82
  120. metaflow/plugins/pypi/conda_environment.py +14 -6
  121. metaflow/plugins/pypi/micromamba.py +9 -1
  122. metaflow/plugins/pypi/pip.py +41 -5
  123. metaflow/plugins/pypi/pypi_decorator.py +4 -4
  124. metaflow/plugins/pypi/utils.py +22 -0
  125. metaflow/plugins/secrets/__init__.py +3 -0
  126. metaflow/plugins/secrets/secrets_decorator.py +14 -178
  127. metaflow/plugins/secrets/secrets_func.py +49 -0
  128. metaflow/plugins/secrets/secrets_spec.py +101 -0
  129. metaflow/plugins/secrets/utils.py +74 -0
  130. metaflow/plugins/test_unbounded_foreach_decorator.py +2 -2
  131. metaflow/plugins/timeout_decorator.py +0 -1
  132. metaflow/plugins/uv/bootstrap.py +29 -1
  133. metaflow/plugins/uv/uv_environment.py +5 -3
  134. metaflow/pylint_wrapper.py +5 -1
  135. metaflow/runner/click_api.py +79 -26
  136. metaflow/runner/deployer.py +208 -6
  137. metaflow/runner/deployer_impl.py +32 -12
  138. metaflow/runner/metaflow_runner.py +266 -33
  139. metaflow/runner/subprocess_manager.py +21 -1
  140. metaflow/runner/utils.py +27 -16
  141. metaflow/runtime.py +660 -66
  142. metaflow/task.py +255 -26
  143. metaflow/user_configs/config_options.py +33 -21
  144. metaflow/user_configs/config_parameters.py +220 -58
  145. metaflow/user_decorators/__init__.py +0 -0
  146. metaflow/user_decorators/common.py +144 -0
  147. metaflow/user_decorators/mutable_flow.py +512 -0
  148. metaflow/user_decorators/mutable_step.py +424 -0
  149. metaflow/user_decorators/user_flow_decorator.py +264 -0
  150. metaflow/user_decorators/user_step_decorator.py +749 -0
  151. metaflow/util.py +197 -7
  152. metaflow/vendor.py +23 -7
  153. metaflow/version.py +1 -1
  154. {ob_metaflow-2.15.13.1.data → ob_metaflow-2.19.7.1rc0.data}/data/share/metaflow/devtools/Makefile +13 -2
  155. {ob_metaflow-2.15.13.1.data → ob_metaflow-2.19.7.1rc0.data}/data/share/metaflow/devtools/Tiltfile +107 -7
  156. {ob_metaflow-2.15.13.1.data → ob_metaflow-2.19.7.1rc0.data}/data/share/metaflow/devtools/pick_services.sh +1 -0
  157. {ob_metaflow-2.15.13.1.dist-info → ob_metaflow-2.19.7.1rc0.dist-info}/METADATA +2 -3
  158. {ob_metaflow-2.15.13.1.dist-info → ob_metaflow-2.19.7.1rc0.dist-info}/RECORD +162 -121
  159. {ob_metaflow-2.15.13.1.dist-info → ob_metaflow-2.19.7.1rc0.dist-info}/WHEEL +1 -1
  160. metaflow/_vendor/v3_5/__init__.py +0 -1
  161. metaflow/_vendor/v3_5/importlib_metadata/__init__.py +0 -644
  162. metaflow/_vendor/v3_5/importlib_metadata/_compat.py +0 -152
  163. metaflow/_vendor/v3_5/zipp.py +0 -329
  164. metaflow/info_file.py +0 -25
  165. metaflow/package.py +0 -203
  166. metaflow/user_configs/config_decorators.py +0 -568
  167. {ob_metaflow-2.15.13.1.dist-info → ob_metaflow-2.19.7.1rc0.dist-info}/entry_points.txt +0 -0
  168. {ob_metaflow-2.15.13.1.dist-info → ob_metaflow-2.19.7.1rc0.dist-info}/licenses/LICENSE +0 -0
  169. {ob_metaflow-2.15.13.1.dist-info → ob_metaflow-2.19.7.1rc0.dist-info}/top_level.txt +0 -0
metaflow/plugins/cards/card_modules/test_cards.py
@@ -34,7 +34,7 @@ class TestPathSpecCard(MetaflowCard):
 class TestEditableCard(MetaflowCard):
     type = "test_editable_card"
 
-    seperator = "$&#!!@*"
+    separator = "$&#!!@*"
 
     ALLOW_USER_COMPONENTS = True
 
@@ -42,13 +42,13 @@ class TestEditableCard(MetaflowCard):
         self._components = components
 
     def render(self, task):
-        return self.seperator.join([str(comp) for comp in self._components])
+        return self.separator.join([str(comp) for comp in self._components])
 
 
 class TestEditableCard2(MetaflowCard):
     type = "test_editable_card_2"
 
-    seperator = "$&#!!@*"
+    separator = "$&#!!@*"
 
     ALLOW_USER_COMPONENTS = True
 
@@ -56,19 +56,19 @@ class TestEditableCard2(MetaflowCard):
         self._components = components
 
     def render(self, task):
-        return self.seperator.join([str(comp) for comp in self._components])
+        return self.separator.join([str(comp) for comp in self._components])
 
 
 class TestNonEditableCard(MetaflowCard):
     type = "test_non_editable_card"
 
-    seperator = "$&#!!@*"
+    separator = "$&#!!@*"
 
     def __init__(self, components=[], **kwargs):
         self._components = components
 
     def render(self, task):
-        return self.seperator.join([str(comp) for comp in self._components])
+        return self.separator.join([str(comp) for comp in self._components])
 
 
 class TestMockCard(MetaflowCard):
@@ -213,3 +213,19 @@ class TestRefreshComponentCard(MetaflowCard):
         if task.finished:
             return "final"
         return "runtime-%s" % _component_values_to_hash(data["components"])
+
+
+class TestImageCard(MetaflowCard):
+    """Card that renders a tiny PNG using ``TaskToDict.parse_image``."""
+
+    type = "test_image_card"
+
+    def render(self, task):
+        from .convert_to_native_type import TaskToDict
+        import base64
+
+        png_bytes = base64.b64decode(
+            "iVBORw0KGgoAAAANSUhEUgAAAAEAAAABCAYAAAAfFcSJAAAADUlEQVR4nGNgYGBgAAAABQABRDE8UwAAAABJRU5ErkJggg=="
+        )
+        img_src = TaskToDict().parse_image(png_bytes)
+        return f"<html><img src='{img_src}' /></html>"
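As a sanity check on the new TestImageCard: the embedded base64 blob does decode to a valid single-pixel PNG. A quick, standard-library-only way to verify the payload:

    import base64

    png = base64.b64decode(
        "iVBORw0KGgoAAAANSUhEUgAAAAEAAAABCAYAAAAfFcSJAAAADUlEQVR4nGNgYGBgAAAABQABRDE8UwAAAABJRU5ErkJggg=="
    )
    # The first eight bytes are the PNG file signature
    print(png[:8] == b"\x89PNG\r\n\x1a\n")  # True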
metaflow/plugins/cards/component_serializer.py
@@ -57,15 +57,8 @@ class ComponentStore:
         The `_component_map` attribute is supposed to be a dictionary so that we can access the components by their ids.
         But we also want to maintain order in which components are inserted since all of these components are going to be visible on a UI.
         Since python3.6 dictionaries are ordered by default so we can use the default python `dict`.
-        For python3.5 and below we need to use an OrderedDict since `dict`'s are not ordered by default.
         """
-        python_version = int(platform.python_version_tuple()[0]) * 10 + int(
-            platform.python_version_tuple()[1]
-        )
-        if python_version < 36:
-            self._component_map = OrderedDict()
-        else:
-            self._component_map = {}
+        self._component_map = {}
 
     def __init__(self, logger, card_type=None, components=None, user_set_id=None):
         self._logger = logger
metaflow/plugins/cards/metadata.py (new file)
@@ -0,0 +1,22 @@
+import json
+from metaflow.metadata_provider import MetaDatum
+
+
+def _save_metadata(
+    metadata_provider,
+    run_id,
+    step_name,
+    task_id,
+    attempt_id,
+    card_uuid,
+    save_metadata,
+):
+    entries = [
+        MetaDatum(
+            field=card_uuid,
+            value=json.dumps(save_metadata),
+            type="card-info",
+            tags=["attempt_id:{0}".format(attempt_id)],
+        )
+    ]
+    metadata_provider.register_metadata(run_id, step_name, task_id, entries)
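For context, this new helper records one datum per card. A minimal sketch of the record it builds, with hypothetical values for the UUID and payload (the real MetaDatum comes from Metaflow's metadata provider; only the field layout below is taken from the code above):

    import json

    card_uuid = "9f81c2aa51b94b2e"            # hypothetical
    save_metadata = {"card_type": "default"}  # hypothetical payload
    attempt_id = 0

    # Field layout mirrors the MetaDatum constructed above
    datum = {
        "field": card_uuid,
        "value": json.dumps(save_metadata),
        "type": "card-info",
        "tags": ["attempt_id:{0}".format(attempt_id)],
    }
    print(datum["tags"])  # ['attempt_id:0']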
metaflow/plugins/catch_decorator.py
@@ -52,6 +52,15 @@ class CatchDecorator(StepDecorator):
                 "split steps." % step
             )
 
+        # Do not support catch on switch steps for now.
+        # When applying @catch to a switch step, we can not guarantee that the flow attribute used for the switching condition gets properly recorded.
+        if graph[step].type == "split-switch":
+            raise MetaflowException(
+                "@catch is defined for the step *%s* "
+                "but @catch is not supported in conditional "
+                "switch steps." % step
+            )
+
     def _print_exception(self, step, flow):
         self.logger(head="@catch caught an exception from %s" % flow, timestamp=False)
         for line in traceback.format_exc().splitlines():
metaflow/plugins/datastores/local_storage.py
@@ -1,24 +1,29 @@
 import json
 import os
 
-from metaflow.metaflow_config import DATASTORE_LOCAL_DIR, DATASTORE_SYSROOT_LOCAL
+from metaflow.metaflow_config import (
+    DATASTORE_LOCAL_DIR,
+    DATASTORE_SYSROOT_LOCAL,
+)
 from metaflow.datastore.datastore_storage import CloseAfterUse, DataStoreStorage
 
 
 class LocalStorage(DataStoreStorage):
     TYPE = "local"
     METADATA_DIR = "_meta"
+    DATASTORE_DIR = DATASTORE_LOCAL_DIR  # ".metaflow"
+    SYSROOT_VAR = DATASTORE_SYSROOT_LOCAL
 
     @classmethod
     def get_datastore_root_from_config(cls, echo, create_on_absent=True):
-        result = DATASTORE_SYSROOT_LOCAL
+        result = cls.SYSROOT_VAR
         if result is None:
             try:
                 # Python2
                 current_path = os.getcwdu()
             except:  # noqa E722
                 current_path = os.getcwd()
-            check_dir = os.path.join(current_path, DATASTORE_LOCAL_DIR)
+            check_dir = os.path.join(current_path, cls.DATASTORE_DIR)
             check_dir = os.path.realpath(check_dir)
             orig_path = check_dir
             top_level_reached = False
@@ -28,12 +33,13 @@ class LocalStorage(DataStoreStorage):
                     top_level_reached = True
                     break  # We are no longer making upward progress
                 current_path = new_path
-                check_dir = os.path.join(current_path, DATASTORE_LOCAL_DIR)
+                check_dir = os.path.join(current_path, cls.DATASTORE_DIR)
             if top_level_reached:
                 if create_on_absent:
                     # Could not find any directory to use so create a new one
                     echo(
-                        "Creating local datastore in current directory (%s)" % orig_path
+                        "Creating %s datastore in current directory (%s)"
+                        % (cls.TYPE, orig_path)
                     )
                     os.mkdir(orig_path)
                     result = orig_path
@@ -42,7 +48,7 @@ class LocalStorage(DataStoreStorage):
             else:
                 result = check_dir
         else:
-            result = os.path.join(result, DATASTORE_LOCAL_DIR)
+            result = os.path.join(result, cls.DATASTORE_DIR)
         return result
 
     @staticmethod
metaflow/plugins/datastores/spin_storage.py (new file)
@@ -0,0 +1,12 @@
+from metaflow.metaflow_config import (
+    DATASTORE_SPIN_LOCAL_DIR,
+    DATASTORE_SYSROOT_SPIN,
+)
+from metaflow.plugins.datastores.local_storage import LocalStorage
+
+
+class SpinStorage(LocalStorage):
+    TYPE = "spin"
+    METADATA_DIR = "_meta"
+    DATASTORE_DIR = DATASTORE_SPIN_LOCAL_DIR  # ".metaflow_spin"
+    SYSROOT_VAR = DATASTORE_SYSROOT_SPIN
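SpinStorage works only because the LocalStorage changes above route every lookup through cls.DATASTORE_DIR and cls.SYSROOT_VAR. A minimal sketch of that override pattern, with stand-in values for the Metaflow config constants:

    import os


    class LocalStorageSketch:
        TYPE = "local"
        DATASTORE_DIR = ".metaflow"  # stand-in for DATASTORE_LOCAL_DIR
        SYSROOT_VAR = None           # stand-in for DATASTORE_SYSROOT_LOCAL

        @classmethod
        def datastore_root(cls, cwd):
            # References go through `cls`, so subclasses that override the
            # class attributes inherit the lookup logic unchanged.
            if cls.SYSROOT_VAR is not None:
                return os.path.join(cls.SYSROOT_VAR, cls.DATASTORE_DIR)
            return os.path.join(cwd, cls.DATASTORE_DIR)


    class SpinStorageSketch(LocalStorageSketch):
        TYPE = "spin"
        DATASTORE_DIR = ".metaflow_spin"  # stand-in for DATASTORE_SPIN_LOCAL_DIR


    print(LocalStorageSketch.datastore_root("/tmp"))  # /tmp/.metaflow
    print(SpinStorageSketch.datastore_root("/tmp"))   # /tmp/.metaflow_spin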
metaflow/plugins/datatools/s3/s3.py
@@ -18,6 +18,7 @@ from metaflow.metaflow_config import (
     DATATOOLS_S3ROOT,
     S3_RETRY_COUNT,
     S3_TRANSIENT_RETRY_COUNT,
+    S3_LOG_TRANSIENT_RETRIES,
     S3_SERVER_SIDE_ENCRYPTION,
     S3_WORKER_COUNT,
     TEMPDIR,
@@ -498,16 +499,18 @@ class S3(object):
 
     Parameters
     ----------
-    tmproot : str, default: '.'
+    tmproot : str, default '.'
         Where to store the temporary directory.
-    bucket : str, optional
+    bucket : str, optional, default None
         Override the bucket from `DATATOOLS_S3ROOT` when `run` is specified.
-    prefix : str, optional
+    prefix : str, optional, default None
         Override the path from `DATATOOLS_S3ROOT` when `run` is specified.
-    run : FlowSpec or Run, optional
+    run : FlowSpec or Run, optional, default None
         Derive path prefix from the current or a past run ID, e.g. S3(run=self).
-    s3root : str, optional
+    s3root : str, optional, default None
         If `run` is not specified, use this as the S3 prefix.
+    encryption : str, optional, default None
+        Server-side encryption to use when uploading objects to S3.
     """
 
     TYPE = "s3"
@@ -578,7 +581,13 @@ class S3(object):
         self._s3_inject_failures = kwargs.get(
             "inject_failure_rate", TEST_INJECT_RETRYABLE_FAILURES
         )
-        self._tmpdir = mkdtemp(dir=tmproot, prefix="metaflow.s3.")
+        # Storing tmproot, bucket, ... as members to allow easier reconstruction
+        # during JSON deserialization
+        self._tmproot = tmproot
+        self._bucket = bucket
+        self._prefix = prefix
+        self._run = run
+        self._tmpdir = mkdtemp(dir=self._tmproot, prefix="metaflow.s3.")
         self._encryption = encryption
 
     def __enter__(self) -> "S3":
@@ -629,7 +638,9 @@ class S3(object):
                 "Don't use absolute S3 URLs when the S3 client is "
                 "initialized with a prefix. URL: %s" % key
             )
-        return os.path.join(self._s3root, key)
+        # Strip leading slashes to ensure os.path.join works correctly
+        # os.path.join discards the first argument if the second starts with '/'
+        return os.path.join(self._s3root, key.lstrip("/"))
     else:
         return self._s3root
 
@@ -1385,6 +1396,9 @@ class S3(object):
         except OSError as e:
             if e.errno == errno.ENOSPC:
                 raise MetaflowS3InsufficientDiskSpace(str(e))
+        except MetaflowException as ex:
+            # Re-raise Metaflow exceptions (including TimeoutException)
+            raise
         except Exception as ex:
             error = str(ex)
             if tmp:
@@ -1757,17 +1771,35 @@ class S3(object):
                 # due to a transient failure so we try again.
                 transient_retry_count += 1
                 total_ok_count += last_ok_count
-                print(
-                    "Transient S3 failure (attempt #%d) -- total success: %d, "
-                    "last attempt %d/%d -- remaining: %d"
-                    % (
-                        transient_retry_count,
-                        total_ok_count,
-                        last_ok_count,
-                        last_ok_count + last_retry_count,
-                        len(pending_retries),
+
+                if S3_LOG_TRANSIENT_RETRIES:
+                    # Extract transient error type from pending retry lines
+                    error_info = ""
+                    if pending_retries:
+                        try:
+                            # Parse the first line to get transient error type
+                            first_retry = json.loads(
+                                pending_retries[0].decode("utf-8").strip()
+                            )
+                            if "transient_error_type" in first_retry:
+                                error_info = (
+                                    " (%s)" % first_retry["transient_error_type"]
+                                )
+                        except (json.JSONDecodeError, IndexError, KeyError):
+                            pass
+
+                    print(
+                        "Transient S3 failure (attempt #%d) -- total success: %d, "
+                        "last attempt %d/%d -- remaining: %d%s"
+                        % (
+                            transient_retry_count,
+                            total_ok_count,
+                            last_ok_count,
+                            last_ok_count + last_retry_count,
+                            len(pending_retries),
+                            error_info,
+                        )
                     )
-                )
                 if inject_failures == 0:
                     # Don't sleep when we are "faking" the failures
                     self._jitter_sleep(transient_retry_count)
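With S3_LOG_TRANSIENT_RETRIES enabled, the retry message gains a parenthesized error type pulled from the first pending retry line. An illustrative rendering with made-up numbers:

    msg = (
        "Transient S3 failure (attempt #%d) -- total success: %d, "
        "last attempt %d/%d -- remaining: %d%s"
        % (2, 140, 120, 128, 8, " (SlowDown)")
    )
    print(msg)
    # -> Transient S3 failure (attempt #2) -- total success: 140, last attempt 120/128 -- remaining: 8 (SlowDown)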
metaflow/plugins/datatools/s3/s3op.py
@@ -50,6 +50,7 @@ import metaflow.tracing as tracing
 from metaflow.metaflow_config import (
     S3_WORKER_COUNT,
 )
+from metaflow.exception import MetaflowException
 
 DOWNLOAD_FILE_THRESHOLD = 2 * TransferConfig().multipart_threshold
 DOWNLOAD_MAX_CHUNK = 2 * 1024 * 1024 * 1024 - 1
@@ -131,7 +132,7 @@ def normalize_client_error(err):
     except ValueError:
         if error_code in ("AccessDenied", "AllAccessDisabled", "InvalidAccessKeyId"):
             return 403
-        if error_code == "NoSuchKey":
+        if error_code in ("NoSuchKey", "NoSuchBucket"):
            return 404
        if error_code == "InvalidRange":
            return 416
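The practical effect of this change is that a missing bucket now normalizes to 404 instead of falling through. A sketch of the mapping, operating on the raw code string rather than a botocore error object (the 503 fallback is an assumption about unlisted codes):

    def normalize_error_code_sketch(error_code):
        try:
            return int(error_code)
        except ValueError:
            if error_code in ("AccessDenied", "AllAccessDisabled", "InvalidAccessKeyId"):
                return 403
            if error_code in ("NoSuchKey", "NoSuchBucket"):  # NoSuchBucket is new
                return 404
            if error_code == "InvalidRange":
                return 416
        return 503  # assumption: treat anything unlisted as transient

    print(normalize_error_code_sketch("NoSuchBucket"))  # 404
    print(normalize_error_code_sketch("SlowDown"))      # 503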
@@ -284,14 +285,23 @@ def worker(result_file_name, queue, mode, s3config):
                             "%d %d\n" % (idx, -ERROR_OUT_OF_DISK_SPACE)
                         )
                     else:
-                        result_file.write("%d %d\n" % (idx, -ERROR_TRANSIENT))
+                        result_file.write(
+                            "%d %d %s\n" % (idx, -ERROR_TRANSIENT, "OSError")
+                        )
                     result_file.flush()
                     continue
+                except MetaflowException:
+                    # Re-raise Metaflow exceptions (including TimeoutException)
+                    tmp.close()
+                    os.unlink(tmp.name)
+                    raise
                 except (SSLError, Exception) as e:
                     tmp.close()
                     os.unlink(tmp.name)
                     # assume anything else is transient
-                    result_file.write("%d %d\n" % (idx, -ERROR_TRANSIENT))
+                    result_file.write(
+                        "%d %d %s\n" % (idx, -ERROR_TRANSIENT, type(e).__name__)
+                    )
                     result_file.flush()
                     continue
             # If we need the metadata, get it and write it out
@@ -357,9 +367,14 @@ def worker(result_file_name, queue, mode, s3config):
                     err = convert_to_client_error(e)
                     handle_client_error(err, idx, result_file)
                     continue
+                except MetaflowException:
+                    # Re-raise Metaflow exceptions (including TimeoutException)
+                    raise
                 except (SSLError, Exception) as e:
                     # assume anything else is transient
-                    result_file.write("%d %d\n" % (idx, -ERROR_TRANSIENT))
+                    result_file.write(
+                        "%d %d %s\n" % (idx, -ERROR_TRANSIENT, type(e).__name__)
+                    )
                     result_file.flush()
                     continue
                 except:
@@ -385,7 +400,13 @@ def convert_to_client_error(e):
 
 
 def handle_client_error(err, idx, result_file):
+    # Handle all MetaflowExceptions as fatal
+    if isinstance(err, MetaflowException):
+        raise err
+
     error_code = normalize_client_error(err)
+    original_error_code = err.response["Error"]["Code"]
+
     if error_code == 404:
         result_file.write("%d %d\n" % (idx, -ERROR_URL_NOT_FOUND))
         result_file.flush()
@@ -393,13 +414,12 @@ def handle_client_error(err, idx, result_file):
         result_file.write("%d %d\n" % (idx, -ERROR_URL_ACCESS_DENIED))
         result_file.flush()
     elif error_code == 503:
-        result_file.write("%d %d\n" % (idx, -ERROR_TRANSIENT))
+        result_file.write("%d %d %s\n" % (idx, -ERROR_TRANSIENT, original_error_code))
         result_file.flush()
     else:
         # optimistically assume it is a transient error
-        result_file.write("%d %d\n" % (idx, -ERROR_TRANSIENT))
+        result_file.write("%d %d %s\n" % (idx, -ERROR_TRANSIENT, original_error_code))
         result_file.flush()
-        # TODO specific error message for out of disk space
 
 
 def start_workers(mode, urls, num_workers, inject_failure, s3config):
@@ -411,6 +431,7 @@ def start_workers(mode, urls, num_workers, inject_failure, s3config):
     random.seed()
 
     sz_results = []
+    transient_error_type = None
     # 1. push sources and destinations to the queue
     # We only push if we don't inject a failure; otherwise, we already set the sz_results
     # appropriately with the result of the injected failure.
@@ -465,13 +486,19 @@ def start_workers(mode, urls, num_workers, inject_failure, s3config):
                 # Read the output file if all went well
                 with open(out_path, "r") as out_file:
                     for line in out_file:
-                        line_split = line.split(" ")
-                        sz_results[int(line_split[0])] = int(line_split[1])
+                        line_split = line.split(" ", 2)
+                        idx = int(line_split[0])
+                        size = int(line_split[1])
+                        sz_results[idx] = size
+
+                        # For transient errors, store the transient error type (should be the same for all)
+                        if size == -ERROR_TRANSIENT and len(line_split) > 2:
+                            transient_error_type = line_split[2].strip()
             else:
                 # Put this process back in the processes to check
                 new_procs[proc] = out_path
         procs = new_procs
-    return sz_results
+    return sz_results, transient_error_type
 
 
 def process_urls(mode, urls, verbose, inject_failure, num_workers, s3config):
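Worker result lines now carry an optional third field, so the reader splits with a bound. A minimal sketch of the parse (the ERROR_TRANSIENT value here is a stand-in; the real constant is defined in s3op.py):

    ERROR_TRANSIENT = 10  # stand-in value

    line = "7 -10 SlowDown\n"
    line_split = line.split(" ", 2)
    idx, size = int(line_split[0]), int(line_split[1])
    error_type = line_split[2].strip() if len(line_split) > 2 else None
    if size == -ERROR_TRANSIENT:
        print(idx, "transient:", error_type)  # 7 transient: SlowDown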
@@ -480,7 +507,9 @@ def process_urls(mode, urls, verbose, inject_failure, num_workers, s3config):
         print("%sing %d files.." % (mode.capitalize(), len(urls)), file=sys.stderr)
 
     start = time.time()
-    sz_results = start_workers(mode, urls, num_workers, inject_failure, s3config)
+    sz_results, transient_error_type = start_workers(
+        mode, urls, num_workers, inject_failure, s3config
+    )
     end = time.time()
 
     if verbose:
@@ -497,7 +526,7 @@ def process_urls(mode, urls, verbose, inject_failure, num_workers, s3config):
         ),
         file=sys.stderr,
     )
-    return sz_results
+    return sz_results, transient_error_type
 
 
 # Utility functions
@@ -582,11 +611,12 @@ class S3Ops(object):
             # - the trailing slash is significant in S3
             if "Contents" in page:
                 for key in page.get("Contents", []):
-                    url = url_base + key["Key"]
+                    key_path = key["Key"].lstrip("/")
+                    url = url_base + key_path
                     urlobj = S3Url(
                         url=url,
                         bucket=prefix_url.bucket,
-                        path=key["Key"],
+                        path=key_path,
                         local=generate_local_path(url),
                         prefix=prefix_url.url,
                     )
694
724
  quoted = url_quote(url)
695
725
  fname = quoted.split(b"/")[-1].replace(b".", b"_").replace(b"-", b"_")
696
726
  sha = sha1(quoted).hexdigest()
727
+
728
+ # Truncate fname to ensure the final filename doesn't exceed filesystem limits.
729
+ # Most filesystems have a 255 character limit. The structure is:
730
+ # <40-char-sha>-<fname>-<range>[-<suffix>]
731
+ # We need to leave room for: sha (40) + hyphens (2-3) + range (~10) + suffix (~10)
732
+ # This leaves roughly 190 characters for fname. We use 150 to be safe.
733
+ fname_decoded = fname.decode("utf-8")
734
+ max_fname_len = 150
735
+ if len(fname_decoded) > max_fname_len:
736
+ # Truncate and add an ellipsis to indicate truncation
737
+ fname_decoded = fname_decoded[:max_fname_len] + "..."
738
+
697
739
  if suffix:
698
- return "-".join((sha, fname.decode("utf-8"), range, suffix))
699
- return "-".join((sha, fname.decode("utf-8"), range))
740
+ return "-".join((sha, fname_decoded, range, suffix))
741
+ return "-".join((sha, fname_decoded, range))
700
742
 
701
743
 
702
744
  def parallel_op(op, lst, num_workers):
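A sketch of the resulting cache-file naming scheme, approximating Metaflow's url_quote with urllib quoting, to show that even very long keys stay under the 255-character filename limit:

    from hashlib import sha1
    from urllib.parse import quote

    def local_path_sketch(url, range_="whole", max_fname_len=150):
        # Approximates generate_local_path above
        quoted = quote(url).encode("utf-8")
        fname = quoted.split(b"/")[-1].replace(b".", b"_").replace(b"-", b"_")
        name = fname.decode("utf-8")
        if len(name) > max_fname_len:
            name = name[:max_fname_len] + "..."
        return "-".join((sha1(quoted).hexdigest(), name, range_))

    p = local_path_sketch("s3://bucket/" + "x" * 300)
    print(len(p) < 255)  # True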
@@ -833,11 +875,16 @@ def lst(
     urllist = []
     to_iterate, _ = _populate_prefixes(prefixes, inputs)
     for _, prefix, url, _ in to_iterate:
-        src = urlparse(url)
+        src = urlparse(url, allow_fragments=False)
+        # We always consider the path being passed in to be a directory path so
+        # we add a trailing slash to the path if it doesn't already have one.
+        path_with_slash = src.path.lstrip("/")
+        if not path_with_slash.endswith("/"):
+            path_with_slash += "/"
         url = S3Url(
             url=url,
             bucket=src.netloc,
-            path=src.path.lstrip("/"),
+            path=path_with_slash,
             local=None,
             prefix=prefix,
         )
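The switch to allow_fragments=False (here and in put, get, and info below) matters because '#' is a legal character in S3 keys, while urlparse treats it as a fragment delimiter by default:

    from urllib.parse import urlparse

    u = "s3://bucket/data/file#1.csv"
    print(urlparse(u).path)                         # '/data/file' -- key truncated at '#'
    print(urlparse(u, allow_fragments=False).path)  # '/data/file#1.csv'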
@@ -939,7 +986,7 @@ def put(
             yield input_line_idx, local, url, content_type, metadata, encryption
 
     def _make_url(idx, local, user_url, content_type, metadata, encryption):
-        src = urlparse(user_url)
+        src = urlparse(user_url, allow_fragments=False)
         url = S3Url(
             url=user_url,
             bucket=src.netloc,
@@ -967,7 +1014,7 @@ def put(
     ul_op = "upload"
     if not overwrite:
         ul_op = "info_upload"
-    sz_results = process_urls(
+    sz_results, transient_error_type = process_urls(
         ul_op, urls, verbose, inject_failure, num_workers, s3config
     )
     retry_lines = []
@@ -985,19 +1032,17 @@ def put(
         elif listing and sz == 0:
             out_lines.append(format_result_line(url.idx, url.url) + "\n")
         elif sz == -ERROR_TRANSIENT:
-            retry_lines.append(
-                json.dumps(
-                    {
-                        "idx": url.idx,
-                        "url": url.url,
-                        "local": url.local,
-                        "content_type": url.content_type,
-                        "metadata": url.metadata,
-                        "encryption": url.encryption,
-                    }
-                )
-                + "\n"
-            )
+            retry_data = {
+                "idx": url.idx,
+                "url": url.url,
+                "local": url.local,
+                "content_type": url.content_type,
+                "metadata": url.metadata,
+                "encryption": url.encryption,
+            }
+            if transient_error_type:
+                retry_data["transient_error_type"] = transient_error_type
+            retry_lines.append(json.dumps(retry_data) + "\n")
             # Output something to get a total count the first time around
             if not is_transient_retry:
                 out_lines.append("%d %s\n" % (url.idx, TRANSIENT_RETRY_LINE_CONTENT))
@@ -1035,22 +1080,21 @@ def _populate_prefixes(prefixes, inputs):
             for idx, l in enumerate(f, start=len(prefixes)):
                 s = l.split(b" ")
                 if len(s) == 1:
+                    # User input format: <url>
                     url = url_unquote(s[0].strip())
                     prefixes.append((idx, url, url, None))
                 elif len(s) == 2:
+                    # User input format: <url> <range>
                     url = url_unquote(s[0].strip())
                     prefixes.append((idx, url, url, url_unquote(s[1].strip())))
-                else:
+                elif len(s) in (4, 5):
+                    # Retry format: <idx> <prefix> <url> <range> [<transient_error_type>]
+                    # The transient_error_type (5th field) is optional and only used for logging.
+                    # Lines with other field counts (e.g., 3) are silently ignored as invalid.
                     is_transient_retry = True
-                    if len(s) == 3:
-                        prefix = url = url_unquote(s[1].strip())
-                        range_info = url_unquote(s[2].strip())
-                    else:
-                        # Special case when we have both prefix and URL -- this is
-                        # used in recursive gets for example
-                        prefix = url_unquote(s[1].strip())
-                        url = url_unquote(s[2].strip())
-                        range_info = url_unquote(s[3].strip())
+                    prefix = url_unquote(s[1].strip())
+                    url = url_unquote(s[2].strip())
+                    range_info = url_unquote(s[3].strip())
                     if range_info == "<norange>":
                         range_info = None
                     prefixes.append(
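A sketch of parsing the simplified retry-line format `<idx> <prefix> <url> <range> [<transient_error_type>]` that this branch now consumes (fields are url-quoted, `<norange>` marks an absent range; the values below are made up):

    from urllib.parse import unquote

    line = b"3 s3%3A//bucket/pfx s3%3A//bucket/pfx/key <norange> SlowDown"
    s = line.split(b" ")
    assert len(s) in (4, 5)
    idx = int(s[0])
    prefix, url, range_info = (unquote(part.decode()) for part in s[1:4])
    if range_info == "<norange>":
        range_info = None
    error_type = s[4].decode() if len(s) == 5 else None
    print(idx, url, range_info, error_type)  # 3 s3://bucket/pfx/key None SlowDown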
@@ -1114,7 +1158,7 @@ def get(
     urllist = []
     to_iterate, is_transient_retry = _populate_prefixes(prefixes, inputs)
     for idx, prefix, url, r in to_iterate:
-        src = urlparse(url)
+        src = urlparse(url, allow_fragments=False)
         url = S3Url(
             url=url,
             bucket=src.netloc,
@@ -1161,7 +1205,7 @@ def get(
 
     # exclude the non-existent files from loading
     to_load = [url for url, size in urls if size is not None]
-    sz_results = process_urls(
+    sz_results, transient_error_type = process_urls(
         dl_op, to_load, verbose, inject_failure, num_workers, s3config
     )
     # We check if there is any access denied
@@ -1197,21 +1241,19 @@ def get(
                     break
             out_lines.append(format_result_line(url.idx, url.url) + "\n")
         elif sz == -ERROR_TRANSIENT:
-            retry_lines.append(
-                " ".join(
-                    [
-                        str(url.idx),
-                        url_quote(url.prefix).decode(encoding="utf-8"),
-                        url_quote(url.url).decode(encoding="utf-8"),
-                        (
-                            url_quote(url.range).decode(encoding="utf-8")
-                            if url.range
-                            else "<norange>"
-                        ),
-                    ]
-                )
-                + "\n"
-            )
+            retry_line_parts = [
+                str(url.idx),
+                url_quote(url.prefix).decode(encoding="utf-8"),
+                url_quote(url.url).decode(encoding="utf-8"),
+                (
+                    url_quote(url.range).decode(encoding="utf-8")
+                    if url.range
+                    else "<norange>"
+                ),
+            ]
+            if transient_error_type:
+                retry_line_parts.append(transient_error_type)
+            retry_lines.append(" ".join(retry_line_parts) + "\n")
             # First time around, we output something to indicate the total length
             if not is_transient_retry:
                 out_lines.append("%d %s\n" % (url.idx, TRANSIENT_RETRY_LINE_CONTENT))
@@ -1263,7 +1305,7 @@ def info(
     urllist = []
     to_iterate, is_transient_retry = _populate_prefixes(prefixes, inputs)
     for idx, prefix, url, _ in to_iterate:
-        src = urlparse(url)
+        src = urlparse(url, allow_fragments=False)
         url = S3Url(
             url=url,
             bucket=src.netloc,
@@ -1277,7 +1319,7 @@ def info(
             exit(ERROR_INVALID_URL, url)
         urllist.append(url)
 
-    sz_results = process_urls(
+    sz_results, transient_error_type = process_urls(
         "info", urllist, verbose, inject_failure, num_workers, s3config
     )
 
@@ -1290,10 +1332,15 @@ def info(
                 format_result_line(url.idx, url.prefix, url.url, url.local) + "\n"
             )
         else:
-            retry_lines.append(
-                "%d %s <norange>\n"
-                % (url.idx, url_quote(url.url).decode(encoding="utf-8"))
-            )
+            retry_line_parts = [
+                str(url.idx),
+                url_quote(url.prefix).decode(encoding="utf-8"),
+                url_quote(url.url).decode(encoding="utf-8"),
+                "<norange>",
+            ]
+            if transient_error_type:
+                retry_line_parts.append(transient_error_type)
+            retry_lines.append(" ".join(retry_line_parts) + "\n")
         if not is_transient_retry:
             out_lines.append("%d %s\n" % (url.idx, TRANSIENT_RETRY_LINE_CONTENT))