ob-metaflow 2.11.13.1__py2.py3-none-any.whl → 2.19.7.1rc0__py2.py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (289)
  1. metaflow/R.py +10 -7
  2. metaflow/__init__.py +40 -25
  3. metaflow/_vendor/imghdr/__init__.py +186 -0
  4. metaflow/_vendor/importlib_metadata/__init__.py +1063 -0
  5. metaflow/_vendor/importlib_metadata/_adapters.py +68 -0
  6. metaflow/_vendor/importlib_metadata/_collections.py +30 -0
  7. metaflow/_vendor/importlib_metadata/_compat.py +71 -0
  8. metaflow/_vendor/importlib_metadata/_functools.py +104 -0
  9. metaflow/_vendor/importlib_metadata/_itertools.py +73 -0
  10. metaflow/_vendor/importlib_metadata/_meta.py +48 -0
  11. metaflow/_vendor/importlib_metadata/_text.py +99 -0
  12. metaflow/_vendor/importlib_metadata/py.typed +0 -0
  13. metaflow/_vendor/typeguard/__init__.py +48 -0
  14. metaflow/_vendor/typeguard/_checkers.py +1070 -0
  15. metaflow/_vendor/typeguard/_config.py +108 -0
  16. metaflow/_vendor/typeguard/_decorators.py +233 -0
  17. metaflow/_vendor/typeguard/_exceptions.py +42 -0
  18. metaflow/_vendor/typeguard/_functions.py +308 -0
  19. metaflow/_vendor/typeguard/_importhook.py +213 -0
  20. metaflow/_vendor/typeguard/_memo.py +48 -0
  21. metaflow/_vendor/typeguard/_pytest_plugin.py +127 -0
  22. metaflow/_vendor/typeguard/_suppression.py +86 -0
  23. metaflow/_vendor/typeguard/_transformer.py +1229 -0
  24. metaflow/_vendor/typeguard/_union_transformer.py +55 -0
  25. metaflow/_vendor/typeguard/_utils.py +173 -0
  26. metaflow/_vendor/typeguard/py.typed +0 -0
  27. metaflow/_vendor/typing_extensions.py +3641 -0
  28. metaflow/_vendor/v3_7/importlib_metadata/__init__.py +1063 -0
  29. metaflow/_vendor/v3_7/importlib_metadata/_adapters.py +68 -0
  30. metaflow/_vendor/v3_7/importlib_metadata/_collections.py +30 -0
  31. metaflow/_vendor/v3_7/importlib_metadata/_compat.py +71 -0
  32. metaflow/_vendor/v3_7/importlib_metadata/_functools.py +104 -0
  33. metaflow/_vendor/v3_7/importlib_metadata/_itertools.py +73 -0
  34. metaflow/_vendor/v3_7/importlib_metadata/_meta.py +48 -0
  35. metaflow/_vendor/v3_7/importlib_metadata/_text.py +99 -0
  36. metaflow/_vendor/v3_7/importlib_metadata/py.typed +0 -0
  37. metaflow/_vendor/v3_7/typeguard/__init__.py +48 -0
  38. metaflow/_vendor/v3_7/typeguard/_checkers.py +906 -0
  39. metaflow/_vendor/v3_7/typeguard/_config.py +108 -0
  40. metaflow/_vendor/v3_7/typeguard/_decorators.py +237 -0
  41. metaflow/_vendor/v3_7/typeguard/_exceptions.py +42 -0
  42. metaflow/_vendor/v3_7/typeguard/_functions.py +310 -0
  43. metaflow/_vendor/v3_7/typeguard/_importhook.py +213 -0
  44. metaflow/_vendor/v3_7/typeguard/_memo.py +48 -0
  45. metaflow/_vendor/v3_7/typeguard/_pytest_plugin.py +100 -0
  46. metaflow/_vendor/v3_7/typeguard/_suppression.py +88 -0
  47. metaflow/_vendor/v3_7/typeguard/_transformer.py +1207 -0
  48. metaflow/_vendor/v3_7/typeguard/_union_transformer.py +54 -0
  49. metaflow/_vendor/v3_7/typeguard/_utils.py +169 -0
  50. metaflow/_vendor/v3_7/typeguard/py.typed +0 -0
  51. metaflow/_vendor/v3_7/typing_extensions.py +3072 -0
  52. metaflow/_vendor/yaml/__init__.py +427 -0
  53. metaflow/_vendor/yaml/composer.py +139 -0
  54. metaflow/_vendor/yaml/constructor.py +748 -0
  55. metaflow/_vendor/yaml/cyaml.py +101 -0
  56. metaflow/_vendor/yaml/dumper.py +62 -0
  57. metaflow/_vendor/yaml/emitter.py +1137 -0
  58. metaflow/_vendor/yaml/error.py +75 -0
  59. metaflow/_vendor/yaml/events.py +86 -0
  60. metaflow/_vendor/yaml/loader.py +63 -0
  61. metaflow/_vendor/yaml/nodes.py +49 -0
  62. metaflow/_vendor/yaml/parser.py +589 -0
  63. metaflow/_vendor/yaml/reader.py +185 -0
  64. metaflow/_vendor/yaml/representer.py +389 -0
  65. metaflow/_vendor/yaml/resolver.py +227 -0
  66. metaflow/_vendor/yaml/scanner.py +1435 -0
  67. metaflow/_vendor/yaml/serializer.py +111 -0
  68. metaflow/_vendor/yaml/tokens.py +104 -0
  69. metaflow/cards.py +5 -0
  70. metaflow/cli.py +331 -785
  71. metaflow/cli_args.py +17 -0
  72. metaflow/cli_components/__init__.py +0 -0
  73. metaflow/cli_components/dump_cmd.py +96 -0
  74. metaflow/cli_components/init_cmd.py +52 -0
  75. metaflow/cli_components/run_cmds.py +546 -0
  76. metaflow/cli_components/step_cmd.py +334 -0
  77. metaflow/cli_components/utils.py +140 -0
  78. metaflow/client/__init__.py +1 -0
  79. metaflow/client/core.py +467 -73
  80. metaflow/client/filecache.py +75 -35
  81. metaflow/clone_util.py +7 -1
  82. metaflow/cmd/code/__init__.py +231 -0
  83. metaflow/cmd/develop/stub_generator.py +756 -288
  84. metaflow/cmd/develop/stubs.py +12 -28
  85. metaflow/cmd/main_cli.py +6 -4
  86. metaflow/cmd/make_wrapper.py +78 -0
  87. metaflow/datastore/__init__.py +1 -0
  88. metaflow/datastore/content_addressed_store.py +41 -10
  89. metaflow/datastore/datastore_set.py +11 -2
  90. metaflow/datastore/flow_datastore.py +156 -10
  91. metaflow/datastore/spin_datastore.py +91 -0
  92. metaflow/datastore/task_datastore.py +154 -39
  93. metaflow/debug.py +5 -0
  94. metaflow/decorators.py +404 -78
  95. metaflow/exception.py +8 -2
  96. metaflow/extension_support/__init__.py +527 -376
  97. metaflow/extension_support/_empty_file.py +2 -2
  98. metaflow/extension_support/plugins.py +49 -31
  99. metaflow/flowspec.py +482 -33
  100. metaflow/graph.py +210 -42
  101. metaflow/includefile.py +84 -40
  102. metaflow/lint.py +141 -22
  103. metaflow/meta_files.py +13 -0
  104. metaflow/{metadata → metadata_provider}/heartbeat.py +24 -8
  105. metaflow/{metadata → metadata_provider}/metadata.py +86 -1
  106. metaflow/metaflow_config.py +175 -28
  107. metaflow/metaflow_config_funcs.py +51 -3
  108. metaflow/metaflow_current.py +4 -10
  109. metaflow/metaflow_environment.py +139 -53
  110. metaflow/metaflow_git.py +115 -0
  111. metaflow/metaflow_profile.py +18 -0
  112. metaflow/metaflow_version.py +150 -66
  113. metaflow/mflog/__init__.py +4 -3
  114. metaflow/mflog/save_logs.py +2 -2
  115. metaflow/multicore_utils.py +31 -14
  116. metaflow/package/__init__.py +673 -0
  117. metaflow/packaging_sys/__init__.py +880 -0
  118. metaflow/packaging_sys/backend.py +128 -0
  119. metaflow/packaging_sys/distribution_support.py +153 -0
  120. metaflow/packaging_sys/tar_backend.py +99 -0
  121. metaflow/packaging_sys/utils.py +54 -0
  122. metaflow/packaging_sys/v1.py +527 -0
  123. metaflow/parameters.py +149 -28
  124. metaflow/plugins/__init__.py +74 -5
  125. metaflow/plugins/airflow/airflow.py +40 -25
  126. metaflow/plugins/airflow/airflow_cli.py +22 -5
  127. metaflow/plugins/airflow/airflow_decorator.py +1 -1
  128. metaflow/plugins/airflow/airflow_utils.py +5 -3
  129. metaflow/plugins/airflow/sensors/base_sensor.py +4 -4
  130. metaflow/plugins/airflow/sensors/external_task_sensor.py +2 -2
  131. metaflow/plugins/airflow/sensors/s3_sensor.py +2 -2
  132. metaflow/plugins/argo/argo_client.py +78 -33
  133. metaflow/plugins/argo/argo_events.py +6 -6
  134. metaflow/plugins/argo/argo_workflows.py +2410 -527
  135. metaflow/plugins/argo/argo_workflows_cli.py +571 -121
  136. metaflow/plugins/argo/argo_workflows_decorator.py +43 -12
  137. metaflow/plugins/argo/argo_workflows_deployer.py +106 -0
  138. metaflow/plugins/argo/argo_workflows_deployer_objects.py +453 -0
  139. metaflow/plugins/argo/capture_error.py +73 -0
  140. metaflow/plugins/argo/conditional_input_paths.py +35 -0
  141. metaflow/plugins/argo/exit_hooks.py +209 -0
  142. metaflow/plugins/argo/jobset_input_paths.py +15 -0
  143. metaflow/plugins/argo/param_val.py +19 -0
  144. metaflow/plugins/aws/aws_client.py +10 -3
  145. metaflow/plugins/aws/aws_utils.py +55 -2
  146. metaflow/plugins/aws/batch/batch.py +72 -5
  147. metaflow/plugins/aws/batch/batch_cli.py +33 -10
  148. metaflow/plugins/aws/batch/batch_client.py +4 -3
  149. metaflow/plugins/aws/batch/batch_decorator.py +102 -35
  150. metaflow/plugins/aws/secrets_manager/aws_secrets_manager_secrets_provider.py +13 -10
  151. metaflow/plugins/aws/step_functions/dynamo_db_client.py +0 -3
  152. metaflow/plugins/aws/step_functions/production_token.py +1 -1
  153. metaflow/plugins/aws/step_functions/step_functions.py +65 -8
  154. metaflow/plugins/aws/step_functions/step_functions_cli.py +101 -7
  155. metaflow/plugins/aws/step_functions/step_functions_decorator.py +1 -2
  156. metaflow/plugins/aws/step_functions/step_functions_deployer.py +97 -0
  157. metaflow/plugins/aws/step_functions/step_functions_deployer_objects.py +264 -0
  158. metaflow/plugins/azure/azure_exceptions.py +1 -1
  159. metaflow/plugins/azure/azure_secret_manager_secrets_provider.py +240 -0
  160. metaflow/plugins/azure/azure_tail.py +1 -1
  161. metaflow/plugins/azure/includefile_support.py +2 -0
  162. metaflow/plugins/cards/card_cli.py +66 -30
  163. metaflow/plugins/cards/card_creator.py +25 -1
  164. metaflow/plugins/cards/card_datastore.py +21 -49
  165. metaflow/plugins/cards/card_decorator.py +132 -8
  166. metaflow/plugins/cards/card_modules/basic.py +112 -17
  167. metaflow/plugins/cards/card_modules/bundle.css +1 -1
  168. metaflow/plugins/cards/card_modules/card.py +16 -1
  169. metaflow/plugins/cards/card_modules/chevron/renderer.py +1 -1
  170. metaflow/plugins/cards/card_modules/components.py +665 -28
  171. metaflow/plugins/cards/card_modules/convert_to_native_type.py +36 -7
  172. metaflow/plugins/cards/card_modules/json_viewer.py +232 -0
  173. metaflow/plugins/cards/card_modules/main.css +1 -0
  174. metaflow/plugins/cards/card_modules/main.js +68 -49
  175. metaflow/plugins/cards/card_modules/renderer_tools.py +1 -0
  176. metaflow/plugins/cards/card_modules/test_cards.py +26 -12
  177. metaflow/plugins/cards/card_server.py +39 -14
  178. metaflow/plugins/cards/component_serializer.py +2 -9
  179. metaflow/plugins/cards/metadata.py +22 -0
  180. metaflow/plugins/catch_decorator.py +9 -0
  181. metaflow/plugins/datastores/azure_storage.py +10 -1
  182. metaflow/plugins/datastores/gs_storage.py +6 -2
  183. metaflow/plugins/datastores/local_storage.py +12 -6
  184. metaflow/plugins/datastores/spin_storage.py +12 -0
  185. metaflow/plugins/datatools/local.py +2 -0
  186. metaflow/plugins/datatools/s3/s3.py +126 -75
  187. metaflow/plugins/datatools/s3/s3op.py +254 -121
  188. metaflow/plugins/env_escape/__init__.py +3 -3
  189. metaflow/plugins/env_escape/client_modules.py +102 -72
  190. metaflow/plugins/env_escape/server.py +7 -0
  191. metaflow/plugins/env_escape/stub.py +24 -5
  192. metaflow/plugins/events_decorator.py +343 -185
  193. metaflow/plugins/exit_hook/__init__.py +0 -0
  194. metaflow/plugins/exit_hook/exit_hook_decorator.py +46 -0
  195. metaflow/plugins/exit_hook/exit_hook_script.py +52 -0
  196. metaflow/plugins/gcp/__init__.py +1 -1
  197. metaflow/plugins/gcp/gcp_secret_manager_secrets_provider.py +11 -6
  198. metaflow/plugins/gcp/gs_tail.py +10 -6
  199. metaflow/plugins/gcp/includefile_support.py +3 -0
  200. metaflow/plugins/kubernetes/kube_utils.py +108 -0
  201. metaflow/plugins/kubernetes/kubernetes.py +411 -130
  202. metaflow/plugins/kubernetes/kubernetes_cli.py +168 -36
  203. metaflow/plugins/kubernetes/kubernetes_client.py +104 -2
  204. metaflow/plugins/kubernetes/kubernetes_decorator.py +246 -88
  205. metaflow/plugins/kubernetes/kubernetes_job.py +253 -581
  206. metaflow/plugins/kubernetes/kubernetes_jobsets.py +1071 -0
  207. metaflow/plugins/kubernetes/spot_metadata_cli.py +69 -0
  208. metaflow/plugins/kubernetes/spot_monitor_sidecar.py +109 -0
  209. metaflow/plugins/logs_cli.py +359 -0
  210. metaflow/plugins/{metadata → metadata_providers}/local.py +144 -84
  211. metaflow/plugins/{metadata → metadata_providers}/service.py +103 -26
  212. metaflow/plugins/metadata_providers/spin.py +16 -0
  213. metaflow/plugins/package_cli.py +36 -24
  214. metaflow/plugins/parallel_decorator.py +128 -11
  215. metaflow/plugins/parsers.py +16 -0
  216. metaflow/plugins/project_decorator.py +51 -5
  217. metaflow/plugins/pypi/bootstrap.py +357 -105
  218. metaflow/plugins/pypi/conda_decorator.py +82 -81
  219. metaflow/plugins/pypi/conda_environment.py +187 -52
  220. metaflow/plugins/pypi/micromamba.py +157 -47
  221. metaflow/plugins/pypi/parsers.py +268 -0
  222. metaflow/plugins/pypi/pip.py +88 -13
  223. metaflow/plugins/pypi/pypi_decorator.py +37 -1
  224. metaflow/plugins/pypi/utils.py +48 -2
  225. metaflow/plugins/resources_decorator.py +2 -2
  226. metaflow/plugins/secrets/__init__.py +3 -0
  227. metaflow/plugins/secrets/secrets_decorator.py +26 -181
  228. metaflow/plugins/secrets/secrets_func.py +49 -0
  229. metaflow/plugins/secrets/secrets_spec.py +101 -0
  230. metaflow/plugins/secrets/utils.py +74 -0
  231. metaflow/plugins/tag_cli.py +4 -7
  232. metaflow/plugins/test_unbounded_foreach_decorator.py +41 -6
  233. metaflow/plugins/timeout_decorator.py +3 -3
  234. metaflow/plugins/uv/__init__.py +0 -0
  235. metaflow/plugins/uv/bootstrap.py +128 -0
  236. metaflow/plugins/uv/uv_environment.py +72 -0
  237. metaflow/procpoll.py +1 -1
  238. metaflow/pylint_wrapper.py +5 -1
  239. metaflow/runner/__init__.py +0 -0
  240. metaflow/runner/click_api.py +717 -0
  241. metaflow/runner/deployer.py +470 -0
  242. metaflow/runner/deployer_impl.py +201 -0
  243. metaflow/runner/metaflow_runner.py +714 -0
  244. metaflow/runner/nbdeploy.py +132 -0
  245. metaflow/runner/nbrun.py +225 -0
  246. metaflow/runner/subprocess_manager.py +650 -0
  247. metaflow/runner/utils.py +335 -0
  248. metaflow/runtime.py +1078 -260
  249. metaflow/sidecar/sidecar_worker.py +1 -1
  250. metaflow/system/__init__.py +5 -0
  251. metaflow/system/system_logger.py +85 -0
  252. metaflow/system/system_monitor.py +108 -0
  253. metaflow/system/system_utils.py +19 -0
  254. metaflow/task.py +521 -225
  255. metaflow/tracing/__init__.py +7 -7
  256. metaflow/tracing/span_exporter.py +31 -38
  257. metaflow/tracing/tracing_modules.py +38 -43
  258. metaflow/tuple_util.py +27 -0
  259. metaflow/user_configs/__init__.py +0 -0
  260. metaflow/user_configs/config_options.py +563 -0
  261. metaflow/user_configs/config_parameters.py +598 -0
  262. metaflow/user_decorators/__init__.py +0 -0
  263. metaflow/user_decorators/common.py +144 -0
  264. metaflow/user_decorators/mutable_flow.py +512 -0
  265. metaflow/user_decorators/mutable_step.py +424 -0
  266. metaflow/user_decorators/user_flow_decorator.py +264 -0
  267. metaflow/user_decorators/user_step_decorator.py +749 -0
  268. metaflow/util.py +243 -27
  269. metaflow/vendor.py +23 -7
  270. metaflow/version.py +1 -1
  271. ob_metaflow-2.19.7.1rc0.data/data/share/metaflow/devtools/Makefile +355 -0
  272. ob_metaflow-2.19.7.1rc0.data/data/share/metaflow/devtools/Tiltfile +726 -0
  273. ob_metaflow-2.19.7.1rc0.data/data/share/metaflow/devtools/pick_services.sh +105 -0
  274. ob_metaflow-2.19.7.1rc0.dist-info/METADATA +87 -0
  275. ob_metaflow-2.19.7.1rc0.dist-info/RECORD +445 -0
  276. {ob_metaflow-2.11.13.1.dist-info → ob_metaflow-2.19.7.1rc0.dist-info}/WHEEL +1 -1
  277. {ob_metaflow-2.11.13.1.dist-info → ob_metaflow-2.19.7.1rc0.dist-info}/entry_points.txt +1 -0
  278. metaflow/_vendor/v3_5/__init__.py +0 -1
  279. metaflow/_vendor/v3_5/importlib_metadata/__init__.py +0 -644
  280. metaflow/_vendor/v3_5/importlib_metadata/_compat.py +0 -152
  281. metaflow/package.py +0 -188
  282. ob_metaflow-2.11.13.1.dist-info/METADATA +0 -85
  283. ob_metaflow-2.11.13.1.dist-info/RECORD +0 -308
  284. /metaflow/_vendor/{v3_5/zipp.py → zipp.py} +0 -0
  285. /metaflow/{metadata → metadata_provider}/__init__.py +0 -0
  286. /metaflow/{metadata → metadata_provider}/util.py +0 -0
  287. /metaflow/plugins/{metadata → metadata_providers}/__init__.py +0 -0
  288. {ob_metaflow-2.11.13.1.dist-info → ob_metaflow-2.19.7.1rc0.dist-info/licenses}/LICENSE +0 -0
  289. {ob_metaflow-2.11.13.1.dist-info → ob_metaflow-2.19.7.1rc0.dist-info}/top_level.txt +0 -0
metaflow/runtime.py CHANGED
@@ -4,39 +4,61 @@ Local backend
 Execute the flow with a native runtime
 using local / remote processes
 """
+
 from __future__ import print_function
+import json
 import os
 import sys
 import fcntl
+import re
+import tempfile
 import time
 import subprocess
 from datetime import datetime
+from enum import Enum
 from io import BytesIO
+from itertools import chain
 from functools import partial
 from concurrent import futures
 
+from typing import Dict, Tuple
 from metaflow.datastore.exceptions import DataException
+from contextlib import contextmanager
 
 from . import get_namespace
-from .metadata import MetaDatum
-from .metaflow_config import MAX_ATTEMPTS, UI_URL
+from .client.filecache import FileCache, FileBlobCache, TaskMetadataCache
+from .metadata_provider import MetaDatum
+from .metaflow_config import (
+    FEAT_ALWAYS_UPLOAD_CODE_PACKAGE,
+    MAX_ATTEMPTS,
+    UI_URL,
+    SPIN_ALLOWED_DECORATORS,
+    SPIN_DISALLOWED_DECORATORS,
+)
+from .metaflow_profile import from_start
+from .plugins import DATASTORES
 from .exception import (
     MetaflowException,
     MetaflowInternalError,
     METAFLOW_EXIT_DISALLOW_RETRY,
 )
 from . import procpoll
-from .datastore import TaskDataStoreSet
+from .datastore import FlowDataStore, TaskDataStoreSet
 from .debug import debug
 from .decorators import flow_decorators
+from .flowspec import FlowStateItems
 from .mflog import mflog, RUNTIME_LOG_SOURCE
-from .util import to_unicode, compress_list, unicode_type
+from .util import to_unicode, compress_list, unicode_type, get_latest_task_pathspec
 from .clone_util import clone_task_helper
 from .unbounded_foreach import (
     CONTROL_TASK_TAG,
     UBF_CONTROL,
     UBF_TASK,
 )
+
+from .user_configs.config_options import ConfigInput
+from .user_configs.config_parameters import dump_config_values
+
 import metaflow.tracing as tracing
 
 MAX_WORKERS = 16
@@ -47,9 +69,24 @@ PROGRESS_INTERVAL = 300  # s
 # The following is a list of the (data) artifacts used by the runtime while
 # executing a flow. These are prefetched during the resume operation by
 # leveraging the TaskDataStoreSet.
-PREFETCH_DATA_ARTIFACTS = ["_foreach_stack", "_task_ok", "_transition"]
+PREFETCH_DATA_ARTIFACTS = [
+    "_foreach_stack",
+    "_iteration_stack",
+    "_task_ok",
+    "_transition",
+    "_control_mapper_tasks",
+    "_control_task_is_mapper_zero",
+]
 RESUME_POLL_SECONDS = 60
 
+
+class LoopBehavior(Enum):
+    NONE = "none"
+    ENTERING = "entering"
+    EXITING = "exiting"
+    LOOPING = "looping"
+
+
 # Runtime must use logsource=RUNTIME_LOG_SOURCE for all loglines that it
 # formats according to mflog. See a comment in mflog.__init__
 mflog_msg = partial(mflog.decorate, RUNTIME_LOG_SOURCE)
@@ -57,6 +94,253 @@ mflog_msg = partial(mflog.decorate, RUNTIME_LOG_SOURCE)
 # TODO option: output dot graph periodically about execution
 
 
+class SpinRuntime(object):
+    def __init__(
+        self,
+        flow,
+        graph,
+        flow_datastore,
+        metadata,
+        environment,
+        package,
+        logger,
+        entrypoint,
+        event_logger,
+        monitor,
+        step_func,
+        step_name,
+        spin_pathspec,
+        skip_decorators=False,
+        artifacts_module=None,
+        persist=True,
+        max_log_size=MAX_LOG_SIZE,
+    ):
+        from metaflow import Task
+
+        self._flow = flow
+        self._graph = graph
+        self._flow_datastore = flow_datastore
+        self._metadata = metadata
+        self._environment = environment
+        self._package = package
+        self._logger = logger
+        self._entrypoint = entrypoint
+        self._event_logger = event_logger
+        self._monitor = monitor
+
+        self._step_func = step_func
+
+        # Determine if we have a complete pathspec or need to get the task
+        if spin_pathspec:
+            parts = spin_pathspec.split("/")
+            if len(parts) == 4:
+                # Complete pathspec: flow/run/step/task_id
+                try:
+                    # If user provides whole pathspec, we do not need to check namespace
+                    task = Task(spin_pathspec, _namespace_check=False)
+                except Exception:
+                    raise MetaflowException(
+                        f"Invalid pathspec: {spin_pathspec} for step: {step_name}"
+                    )
+            elif len(parts) == 3:
+                # Partial pathspec: flow/run/step - need to get the task
+                _, run_id, _ = parts
+                task = get_latest_task_pathspec(flow.name, step_name, run_id=run_id)
+                logger(
+                    f"To make spin even faster, provide complete pathspec with task_id: {task.pathspec}",
+                    system_msg=True,
+                )
+            else:
+                raise MetaflowException(
+                    f"Invalid pathspec format: {spin_pathspec}. Expected flow/run/step or flow/run/step/task_id"
+                )
+        else:
+            # No pathspec provided, get latest task for this step
+            task = get_latest_task_pathspec(flow.name, step_name)
+            logger(
+                f"To make spin even faster, provide complete pathspec {task.pathspec}",
+                system_msg=True,
+            )
+        from_start("SpinRuntime: after getting task")
+
+        # Get the original FlowDatastore so we can use it to access artifacts from the
+        # spun task
+        meta_dict = task.metadata_dict
+        ds_type = meta_dict["ds-type"]
+        ds_root = meta_dict["ds-root"]
+        orig_datastore_impl = [d for d in DATASTORES if d.TYPE == ds_type][0]
+        orig_datastore_impl.datastore_root = ds_root
+        spin_pathspec = task.pathspec
+        orig_flow_datastore = FlowDataStore(
+            flow.name,
+            environment=None,
+            storage_impl=orig_datastore_impl,
+            ds_root=ds_root,
+        )
+
+        self._filecache = FileCache()
+        orig_flow_datastore.set_metadata_cache(
+            TaskMetadataCache(self._filecache, ds_type, ds_root, flow.name)
+        )
+        orig_flow_datastore.ca_store.set_blob_cache(
+            FileBlobCache(
+                self._filecache, FileCache.flow_ds_id(ds_type, ds_root, flow.name)
+            )
+        )
+
+        self._orig_flow_datastore = orig_flow_datastore
+        self._spin_pathspec = spin_pathspec
+        self._persist = persist
+        self._spin_task = task
+        self._input_paths = None
+        self._split_index = None
+        self._whitelist_decorators = None
+        self._config_file_name = None
+        self._skip_decorators = skip_decorators
+        self._artifacts_module = artifacts_module
+        self._max_log_size = max_log_size
+        self._encoding = sys.stdout.encoding or "UTF-8"
+
+        # Create a new run_id for the spin task
+        self.run_id = self._metadata.new_run_id()
+        # Raise exception if we have a black listed decorator
+        for deco in self._step_func.decorators:
+            if deco.name in SPIN_DISALLOWED_DECORATORS:
+                raise MetaflowException(
+                    f"Spinning steps with @{deco.name} decorator is not supported."
+                )
+
+        for deco in self.whitelist_decorators:
+            deco.runtime_init(flow, graph, package, self.run_id)
+        from_start("SpinRuntime: after init decorators")
+
+    @property
+    def split_index(self):
+        """
+        Returns the split index, caching the result after the first access.
+        """
+        if self._split_index is None:
+            self._split_index = getattr(self._spin_task, "index", None)
+
+        return self._split_index
+
+    @property
+    def input_paths(self):
+        def _format_input_paths(task_pathspec, attempt):
+            _, run_id, step_name, task_id = task_pathspec.split("/")
+            return f"{run_id}/{step_name}/{task_id}/{attempt}"
+
+        if self._input_paths:
+            return self._input_paths
+
+        if self._step_func.name == "start":
+            from metaflow import Step
+
+            flow_name, run_id, _, _ = self._spin_pathspec.split("/")
+            task = Step(
+                f"{flow_name}/{run_id}/_parameters", _namespace_check=False
+            ).task
+            self._input_paths = [
+                _format_input_paths(task.pathspec, task.current_attempt)
+            ]
+        else:
+            parent_tasks = self._spin_task.parent_tasks
+            self._input_paths = [
+                _format_input_paths(t.pathspec, t.current_attempt) for t in parent_tasks
+            ]
+        return self._input_paths
+
+    @property
+    def whitelist_decorators(self):
+        if self._skip_decorators:
+            self._whitelist_decorators = []
+            return self._whitelist_decorators
+        if self._whitelist_decorators:
+            return self._whitelist_decorators
+        self._whitelist_decorators = [
+            deco
+            for deco in self._step_func.decorators
+            if any(deco.name.startswith(prefix) for prefix in SPIN_ALLOWED_DECORATORS)
+        ]
+        return self._whitelist_decorators
+
+    def _new_task(self, step, input_paths=None, **kwargs):
+        return Task(
+            flow_datastore=self._flow_datastore,
+            flow=self._flow,
+            step=step,
+            run_id=self.run_id,
+            metadata=self._metadata,
+            environment=self._environment,
+            entrypoint=self._entrypoint,
+            event_logger=self._event_logger,
+            monitor=self._monitor,
+            input_paths=input_paths,
+            decos=self.whitelist_decorators,
+            logger=self._logger,
+            split_index=self.split_index,
+            **kwargs,
+        )
+
+    def execute(self):
+        exception = None
+        with tempfile.NamedTemporaryFile(mode="w", encoding="utf-8") as config_file:
+            config_value = dump_config_values(self._flow)
+            if config_value:
+                json.dump(config_value, config_file)
+                config_file.flush()
+                self._config_file_name = config_file.name
+            else:
+                self._config_file_name = None
+            from_start("SpinRuntime: config values processed")
+            self.task = self._new_task(self._step_func.name, self.input_paths)
+            try:
+                self._launch_and_monitor_task()
+            except Exception as ex:
+                self._logger("Task failed.", system_msg=True, bad=True)
+                exception = ex
+                raise
+            finally:
+                for deco in self.whitelist_decorators:
+                    deco.runtime_finished(exception)
+
+    def _launch_and_monitor_task(self):
+        worker = Worker(
+            self.task,
+            self._max_log_size,
+            self._config_file_name,
+            orig_flow_datastore=self._orig_flow_datastore,
+            spin_pathspec=self._spin_pathspec,
+            artifacts_module=self._artifacts_module,
+            persist=self._persist,
+            skip_decorators=self._skip_decorators,
+        )
+        from_start("SpinRuntime: created worker")
+
+        poll = procpoll.make_poll()
+        fds = worker.fds()
+        for fd in fds:
+            poll.add(fd)
+
+        active_fds = set(fds)
+
+        while active_fds:
+            events = poll.poll(POLL_TIMEOUT)
+            for event in events:
+                if event.can_read:
+                    worker.read_logline(event.fd)
+                if event.is_terminated:
+                    poll.remove(event.fd)
+                    active_fds.remove(event.fd)
+        from_start("SpinRuntime: read loglines")
+        returncode = worker.terminate()
+        from_start("SpinRuntime: worker terminated")
+        if returncode != 0:
+            raise TaskFailed(self.task, f"Task failed with return code {returncode}")
+        else:
+            self._logger("Task finished successfully.", system_msg=True)
+
+
 class NativeRuntime(object):
     def __init__(
         self,
@@ -74,11 +358,12 @@ class NativeRuntime(object):
         clone_run_id=None,
         clone_only=False,
         reentrant=False,
-        clone_steps=None,
+        steps_to_rerun=None,
         max_workers=MAX_WORKERS,
         max_num_splits=MAX_NUM_SPLITS,
         max_log_size=MAX_LOG_SIZE,
         resume_identifier=None,
+        skip_decorator_hooks=False,
     ):
         if run_id is None:
             self._run_id = metadata.new_run_id()
@@ -91,6 +376,7 @@ class NativeRuntime(object):
         self._flow_datastore = flow_datastore
         self._metadata = metadata
         self._environment = environment
+        self._package = package
         self._logger = logger
         self._max_workers = max_workers
         self._active_tasks = dict()  # Key: step name;
@@ -108,9 +394,21 @@ class NativeRuntime(object):
 
         self._clone_run_id = clone_run_id
         self._clone_only = clone_only
-        self._clone_steps = {} if clone_steps is None else clone_steps
+        self._cloned_tasks = []
+        self._ran_or_scheduled_task_index = set()
         self._reentrant = reentrant
         self._run_url = None
+        self._skip_decorator_hooks = skip_decorator_hooks
+
+        # If steps_to_rerun is specified, we will not clone them in resume mode.
+        self._steps_to_rerun = steps_to_rerun or {}
+        # sorted_nodes are in topological order already, so we only need to
+        # iterate through the nodes once to get a stable set of rerun steps.
+        for step_name in self._graph.sorted_nodes:
+            if step_name in self._steps_to_rerun:
+                out_funcs = self._graph[step_name].out_funcs or []
+                for next_step in out_funcs:
+                    self._steps_to_rerun.add(next_step)
 
         self._origin_ds_set = None
         if clone_run_id:
@@ -152,21 +450,21 @@ class NativeRuntime(object):
         # finished.
         self._control_num_splits = {}  # control_task -> num_splits mapping
 
-        for step in flow:
-            for deco in step.decorators:
-                deco.runtime_init(flow, graph, package, self._run_id)
+        if not self._skip_decorator_hooks:
+            for step in flow:
+                for deco in step.decorators:
+                    deco.runtime_init(flow, graph, package, self._run_id)
 
     def _new_task(self, step, input_paths=None, **kwargs):
-
         if input_paths is None:
             may_clone = True
         else:
             may_clone = all(self._is_cloned[path] for path in input_paths)
 
-        if step in self._clone_steps:
+        if step in self._steps_to_rerun:
             may_clone = False
 
-        if step == "_parameters":
+        if step == "_parameters" or self._skip_decorator_hooks:
             decos = []
         else:
             decos = getattr(self._flow, step).decorators
@@ -204,6 +502,22 @@ class NativeRuntime(object):
 
         self._is_cloned[self._params_task.path] = self._params_task.is_cloned
 
+    def should_skip_clone_only_execution(self):
+        (
+            should_skip_clone_only_execution,
+            skip_reason,
+        ) = self._should_skip_clone_only_execution()
+        if should_skip_clone_only_execution:
+            self._logger(skip_reason, system_msg=True)
+            return True
+        return False
+
+    @contextmanager
+    def run_heartbeat(self):
+        self._metadata.start_run_heartbeat(self._flow.name, self._run_id)
+        yield
+        self._metadata.stop_heartbeat()
+
     def print_workflow_info(self):
         self._run_url = (
             "%s/%s/%s" % (UI_URL.rstrip("/"), self._flow.name, self._run_id)
@@ -236,157 +550,375 @@ class NativeRuntime(object):
             )
             return False, None
 
-    def clone_task(self, step_name, task_id):
-        self._logger(
-            "Cloning task from {}/{}/{}/{} to {}/{}/{}/{}".format(
+    def clone_task(
+        self,
+        step_name,
+        task_id,
+        pathspec_index,
+        cloned_task_pathspec_index,
+        finished_tuple,
+        iteration_tuple,
+        ubf_context,
+        generate_task_obj,
+        verbose=False,
+    ):
+        try:
+            new_task_id = task_id
+            if generate_task_obj:
+                task = self._new_task(step_name, pathspec_index=pathspec_index)
+                if ubf_context:
+                    task.ubf_context = ubf_context
+                new_task_id = task.task_id
+                self._cloned_tasks.append(task)
+                self._ran_or_scheduled_task_index.add(cloned_task_pathspec_index)
+                task_pathspec = "{}/{}/{}".format(self._run_id, step_name, new_task_id)
+            else:
+                task_pathspec = "{}/{}/{}".format(self._run_id, step_name, new_task_id)
+                Task.clone_pathspec_mapping[task_pathspec] = "{}/{}/{}".format(
+                    self._clone_run_id, step_name, task_id
+                )
+            if verbose:
+                self._logger(
+                    "Cloning task from {}/{}/{}/{} to {}/{}/{}/{}".format(
+                        self._flow.name,
+                        self._clone_run_id,
+                        step_name,
+                        task_id,
+                        self._flow.name,
+                        self._run_id,
+                        step_name,
+                        new_task_id,
+                    ),
+                    system_msg=True,
+                )
+            clone_task_helper(
                 self._flow.name,
                 self._clone_run_id,
-                step_name,
-                task_id,
-                self._flow.name,
                 self._run_id,
                 step_name,
-                task_id,
-            ),
-            system_msg=True,
-        )
-        clone_task_helper(
-            self._flow.name,
-            self._clone_run_id,
-            self._run_id,
-            step_name,
-            task_id,  # origin_task_id
-            task_id,
-            self._flow_datastore,
-            self._metadata,
-            origin_ds_set=self._origin_ds_set,
-        )
+                task_id,  # origin_task_id
+                new_task_id,
+                self._flow_datastore,
+                self._metadata,
+                origin_ds_set=self._origin_ds_set,
+            )
+            self._finished[(step_name, finished_tuple, iteration_tuple)] = task_pathspec
+            self._is_cloned[task_pathspec] = True
+        except Exception as e:
+            self._logger(
+                "Cloning {}/{}/{}/{} failed with error: {}".format(
+                    self._flow.name, self._clone_run_id, step_name, task_id, str(e)
+                )
+            )
 
-    def clone_original_run(self):
-        (
-            should_skip_clone_only_execution,
-            skip_reason,
-        ) = self._should_skip_clone_only_execution()
-        if should_skip_clone_only_execution:
-            self._logger(skip_reason, system_msg=True)
-            return
-        self._metadata.start_run_heartbeat(self._flow.name, self._run_id)
+    def clone_original_run(self, generate_task_obj=False, verbose=True):
         self._logger(
-            "Start cloning original run: {}/{}".format(
-                self._flow.name, self._clone_run_id
-            ),
+            "Cloning {}/{}".format(self._flow.name, self._clone_run_id),
             system_msg=True,
         )
 
         inputs = []
 
+        ubf_mapper_tasks_to_clone = set()
+        ubf_control_tasks = set()
+        # We only clone ubf mapper tasks if the control task is complete.
+        # Here we need to check which control tasks are complete, and then get the corresponding
+        # mapper tasks.
         for task_ds in self._origin_ds_set:
             _, step_name, task_id = task_ds.pathspec.split("/")
+            pathspec_index = task_ds.pathspec_index
             if task_ds["_task_ok"] and step_name != "_parameters":
-                inputs.append((step_name, task_id))
+                # Control task contains "_control_mapper_tasks" but, in the case of
+                # @parallel decorator, the control task is also a mapper task so we
+                # need to distinguish this using _control_task_is_mapper_zero
+                control_mapper_tasks = (
+                    []
+                    if "_control_mapper_tasks" not in task_ds
+                    else task_ds["_control_mapper_tasks"]
+                )
+                if control_mapper_tasks:
+                    if task_ds.get("_control_task_is_mapper_zero", False):
+                        # Strip out the control task of list of mapper tasks
+                        ubf_control_tasks.add(control_mapper_tasks[0])
+                        ubf_mapper_tasks_to_clone.update(control_mapper_tasks[1:])
+                    else:
+                        ubf_mapper_tasks_to_clone.update(control_mapper_tasks)
+                        # Since we only add mapper tasks here, if we are not in the list
+                        # we are a control task
+                        if task_ds.pathspec not in ubf_mapper_tasks_to_clone:
+                            ubf_control_tasks.add(task_ds.pathspec)
+
+        for task_ds in self._origin_ds_set:
+            _, step_name, task_id = task_ds.pathspec.split("/")
+            pathspec_index = task_ds.pathspec_index
+
+            if (
+                task_ds["_task_ok"]
+                and step_name != "_parameters"
+                and (step_name not in self._steps_to_rerun)
+            ):
+                # "_unbounded_foreach" is a special flag to indicate that the transition
+                # is an unbounded foreach.
+                # Both parent and splitted children tasks will have this flag set.
+                # The splitted control/mapper tasks
+                # are not foreach types because UBF is always followed by a join step.
+                is_ubf_task = (
+                    "_unbounded_foreach" in task_ds and task_ds["_unbounded_foreach"]
+                ) and (self._graph[step_name].type != "foreach")
+
+                is_ubf_control_task = task_ds.pathspec in ubf_control_tasks
+
+                is_ubf_mapper_task = is_ubf_task and (not is_ubf_control_task)
+
+                if is_ubf_mapper_task and (
+                    task_ds.pathspec not in ubf_mapper_tasks_to_clone
+                ):
+                    # Skip copying UBF mapper tasks if control task is incomplete.
+                    continue
+
+                ubf_context = None
+                if is_ubf_task:
+                    ubf_context = "ubf_test" if is_ubf_mapper_task else "ubf_control"
+
+                finished_tuple = tuple(
+                    [s._replace(value=0) for s in task_ds.get("_foreach_stack", ())]
+                )
+                iteration_tuple = tuple(task_ds.get("_iteration_stack", ()))
+                cloned_task_pathspec_index = pathspec_index.split("/")[1]
+                if task_ds.get("_control_task_is_mapper_zero", False):
+                    # Replace None with index 0 for control task as it is part of the
+                    # UBF (as a mapper as well)
+                    finished_tuple = finished_tuple[:-1] + (
+                        finished_tuple[-1]._replace(index=0),
+                    )
+                    # We need this reverse override though because when we check
+                    # if a task has been cloned in _queue_push, the index will be None
+                    # because the _control_task_is_mapper_zero is set in the control
+                    # task *itself* and *not* in the one that is launching the UBF nest.
+                    # This means that _translate_index will use None.
+                    cloned_task_pathspec_index = re.sub(
+                        r"(\[(?:\d+, ?)*)0\]",
+                        lambda m: (m.group(1) or "[") + "None]",
+                        cloned_task_pathspec_index,
+                    )
+
+                inputs.append(
+                    (
+                        step_name,
+                        task_id,
+                        pathspec_index,
+                        cloned_task_pathspec_index,
+                        finished_tuple,
+                        iteration_tuple,
+                        is_ubf_mapper_task,
+                        ubf_context,
+                    )
+                )
 
         with futures.ThreadPoolExecutor(max_workers=self._max_workers) as executor:
             all_tasks = [
-                executor.submit(self.clone_task, step_name, task_id)
-                for (step_name, task_id) in inputs
+                executor.submit(
+                    self.clone_task,
+                    step_name,
+                    task_id,
+                    pathspec_index,
+                    cloned_task_pathspec_index,
+                    finished_tuple,
+                    iteration_tuple,
+                    ubf_context=ubf_context,
+                    generate_task_obj=generate_task_obj and (not is_ubf_mapper_task),
+                    verbose=verbose,
+                )
+                for (
+                    step_name,
+                    task_id,
+                    pathspec_index,
+                    cloned_task_pathspec_index,
+                    finished_tuple,
+                    iteration_tuple,
+                    is_ubf_mapper_task,
+                    ubf_context,
+                ) in inputs
             ]
             _, _ = futures.wait(all_tasks)
-        self._logger("Cloning original run is done", system_msg=True)
+        self._logger(
+            "{}/{} cloned!".format(self._flow.name, self._clone_run_id), system_msg=True
+        )
         self._params_task.mark_resume_done()
-        self._metadata.stop_heartbeat()
 
     def execute(self):
-        (
-            should_skip_clone_only_execution,
-            skip_reason,
-        ) = self._should_skip_clone_only_execution()
-        if should_skip_clone_only_execution:
-            self._logger(skip_reason, system_msg=True)
-            return
-        self._metadata.start_run_heartbeat(self._flow.name, self._run_id)
-
-        if self._params_task:
-            self._queue_push("start", {"input_paths": [self._params_task.path]})
+        if len(self._cloned_tasks) > 0:
+            # mutable list storing the cloned tasks.
+            self._run_queue = []
+            self._active_tasks[0] = 0
         else:
-            self._queue_push("start", {})
+            if self._params_task:
+                self._queue_push("start", {"input_paths": [self._params_task.path]})
+            else:
+                self._queue_push("start", {})
 
         progress_tstamp = time.time()
-        try:
-            # main scheduling loop
-            exception = None
-            while self._run_queue or self._active_tasks[0] > 0:
-
-                # 1. are any of the current workers finished?
-                finished_tasks = list(self._poll_workers())
-                # 2. push new tasks triggered by the finished tasks to the queue
-                self._queue_tasks(finished_tasks)
-                # 3. if there are available worker slots, pop and start tasks
-                #    from the queue.
-                self._launch_workers()
-
-                if time.time() - progress_tstamp > PROGRESS_INTERVAL:
-                    progress_tstamp = time.time()
-                    tasks_print = ", ".join(
-                        [
-                            "%s (%d running; %d done)" % (k, v[0], v[1])
-                            for k, v in self._active_tasks.items()
-                            if k != 0 and v[0] > 0
-                        ]
-                    )
-                    if self._active_tasks[0] == 0:
-                        msg = "No tasks are running."
-                    else:
-                        if self._active_tasks[0] == 1:
-                            msg = "1 task is running: "
-                        else:
-                            msg = "%d tasks are running: " % self._active_tasks[0]
-                        msg += "%s." % tasks_print
-
-                    self._logger(msg, system_msg=True)
+        with tempfile.NamedTemporaryFile(mode="w", encoding="utf-8") as config_file:
+            # Configurations are passed through a file to avoid overloading the
+            # command-line. We only need to create this file once and it can be reused
+            # for any task launch
+            config_value = dump_config_values(self._flow)
+            if config_value:
+                json.dump(config_value, config_file)
+                config_file.flush()
+                self._config_file_name = config_file.name
+            else:
+                self._config_file_name = None
+            try:
+                # main scheduling loop
+                exception = None
+                while (
+                    self._run_queue or self._active_tasks[0] > 0 or self._cloned_tasks
+                ):
+                    # 1. are any of the current workers finished?
+                    if self._cloned_tasks:
+                        finished_tasks = []
+
+                        # For loops (right now just recursive steps), we need to find
+                        # the exact frontier because if we queue all "successors" to all
+                        # the finished iterations, we would incorrectly launch multiple
+                        # successors. We therefore have to strip out all non-last
+                        # iterations *per* foreach branch.
+                        idx_per_finished_id = (
+                            {}
+                        )  # type: Dict[Tuple[str, Tuple[int, ...], Tuple[int, Tuple[int, ...]]]]
+                        for task in self._cloned_tasks:
+                            step_name, foreach_stack, iteration_stack = task.finished_id
+                            existing_task_idx = idx_per_finished_id.get(
+                                (step_name, foreach_stack), None
+                            )
+                            if existing_task_idx is not None:
+                                len_diff = len(iteration_stack) - len(
+                                    existing_task_idx[1]
+                                )
+                                # In this case, we need to keep only the latest iteration
+                                if (
+                                    len_diff == 0
+                                    and iteration_stack > existing_task_idx[1]
+                                ) or len_diff == -1:
+                                    # We remove the one we currently have and replace
+                                    # by this one. The second option means that we are
+                                    # adding the finished iteration marker.
+                                    existing_task = finished_tasks[existing_task_idx[0]]
+                                    # These are the first two lines of _queue_tasks
+                                    # We still consider the tasks finished so we need
+                                    # to update state to be clean.
+                                    self._finished[existing_task.finished_id] = (
+                                        existing_task.path
+                                    )
+                                    self._is_cloned[existing_task.path] = (
+                                        existing_task.is_cloned
+                                    )
+
+                                    finished_tasks[existing_task_idx[0]] = task
+                                    idx_per_finished_id[(step_name, foreach_stack)] = (
+                                        existing_task_idx[0],
+                                        iteration_stack,
+                                    )
+                                elif (
+                                    len_diff == 0
+                                    and iteration_stack < existing_task_idx[1]
+                                ) or len_diff == 1:
+                                    # The second option is when we have already marked
+                                    # the end of the iteration in self._finished and
+                                    # are now seeing a previous iteration.
+                                    # We just mark the task as finished but we don't
+                                    # put it in the finished_tasks list to pass to
+                                    # the _queue_tasks function
+                                    self._finished[task.finished_id] = task.path
+                                    self._is_cloned[task.path] = task.is_cloned
+                                else:
+                                    raise MetaflowInternalError(
+                                        "Unexpected recursive cloned tasks -- "
+                                        "this is a bug, please report it."
+                                    )
+                            else:
+                                # New entry
+                                finished_tasks.append(task)
+                                idx_per_finished_id[(step_name, foreach_stack)] = (
+                                    len(finished_tasks) - 1,
+                                    iteration_stack,
+                                )
 
-                if len(self._run_queue) == 0:
-                    msg = "No tasks are waiting in the queue."
+                        # reset the list of cloned tasks and let poll_workers handle
+                        # the remaining transition
+                        self._cloned_tasks = []
                     else:
-                    if len(self._run_queue) == 1:
-                        msg = "1 task is waiting in the queue: "
+                        finished_tasks = list(self._poll_workers())
+                    # 2. push new tasks triggered by the finished tasks to the queue
+                    self._queue_tasks(finished_tasks)
+                    # 3. if there are available worker slots, pop and start tasks
+                    #    from the queue.
+                    self._launch_workers()
+
+                    if time.time() - progress_tstamp > PROGRESS_INTERVAL:
+                        progress_tstamp = time.time()
+                        tasks_print = ", ".join(
+                            [
+                                "%s (%d running; %d done)" % (k, v[0], v[1])
+                                for k, v in self._active_tasks.items()
+                                if k != 0 and v[0] > 0
+                            ]
+                        )
+                        if self._active_tasks[0] == 0:
+                            msg = "No tasks are running."
                         else:
-                    msg = "%d tasks are waiting in the queue." % len(
-                        self._run_queue
-                    )
+                            if self._active_tasks[0] == 1:
+                                msg = "1 task is running: "
+                            else:
+                                msg = "%d tasks are running: " % self._active_tasks[0]
+                            msg += "%s." % tasks_print
 
-                self._logger(msg, system_msg=True)
-                if len(self._unprocessed_steps) > 0:
-                    if len(self._unprocessed_steps) == 1:
-                        msg = "%s step has not started" % (
-                            next(iter(self._unprocessed_steps)),
-                        )
-                    else:
-                        msg = "%d steps have not started: " % len(
-                            self._unprocessed_steps
-                        )
-                    msg += "%s." % ", ".join(self._unprocessed_steps)
                         self._logger(msg, system_msg=True)
 
-        except KeyboardInterrupt as ex:
-            self._logger("Workflow interrupted.", system_msg=True, bad=True)
-            self._killall()
-            exception = ex
-            raise
-        except Exception as ex:
-            self._logger("Workflow failed.", system_msg=True, bad=True)
-            self._killall()
-            exception = ex
-            raise
-        finally:
-            # on finish clean tasks
-            for step in self._flow:
-                for deco in step.decorators:
-                    deco.runtime_finished(exception)
+                        if len(self._run_queue) == 0:
+                            msg = "No tasks are waiting in the queue."
+                        else:
+                            if len(self._run_queue) == 1:
+                                msg = "1 task is waiting in the queue: "
+                            else:
+                                msg = "%d tasks are waiting in the queue." % len(
+                                    self._run_queue
+                                )
 
-        self._metadata.stop_heartbeat()
+                        self._logger(msg, system_msg=True)
+                        if len(self._unprocessed_steps) > 0:
+                            if len(self._unprocessed_steps) == 1:
+                                msg = "%s step has not started" % (
+                                    next(iter(self._unprocessed_steps)),
+                                )
+                            else:
+                                msg = "%d steps have not started: " % len(
+                                    self._unprocessed_steps
+                                )
+                            msg += "%s." % ", ".join(self._unprocessed_steps)
+                            self._logger(msg, system_msg=True)
+
+            except KeyboardInterrupt as ex:
+                self._logger("Workflow interrupted.", system_msg=True, bad=True)
+                self._killall()
+                exception = ex
+                raise
+            except Exception as ex:
+                self._logger("Workflow failed.", system_msg=True, bad=True)
+                self._killall()
+                exception = ex
+                raise
+            finally:
+                # on finish clean tasks
+                if not self._skip_decorator_hooks:
+                    for step in self._flow:
+                        for deco in step.decorators:
+                            deco.runtime_finished(exception)
+                self._run_exit_hooks()
 
         # assert that end was executed and it was successful
-        if ("end", ()) in self._finished:
+        if ("end", (), ()) in self._finished:
             if self._run_url:
                 self._logger(
                     "Done! See the run in the UI at %s" % self._run_url,
@@ -406,6 +938,51 @@ class NativeRuntime(object):
                 "The *end* step was not successful by the end of flow."
             )
 
+    def _run_exit_hooks(self):
+        try:
+            flow_decos = self._flow._flow_state[FlowStateItems.FLOW_DECORATORS]
+            exit_hook_decos = flow_decos.get("exit_hook", [])
+            if not exit_hook_decos:
+                return
+
+            successful = ("end", (), ()) in self._finished or self._clone_only
+            pathspec = f"{self._graph.name}/{self._run_id}"
+            flow_file = self._environment.get_environment_info()["script"]
+
+            def _call(fn_name):
+                try:
+                    result = (
+                        subprocess.check_output(
+                            args=[
+                                sys.executable,
+                                "-m",
+                                "metaflow.plugins.exit_hook.exit_hook_script",
+                                flow_file,
+                                fn_name,
+                                pathspec,
+                            ],
+                            env=os.environ,
+                        )
+                        .decode()
+                        .strip()
+                    )
+                    print(result)
+                except subprocess.CalledProcessError as e:
+                    print(f"[exit_hook] Hook '{fn_name}' failed with error: {e}")
+                except Exception as e:
+                    print(f"[exit_hook] Unexpected error in hook '{fn_name}': {e}")
+
+            # Call all exit hook functions regardless of individual failures
+            for fn_name in [
+                name
+                for deco in exit_hook_decos
+                for name in (deco.success_hooks if successful else deco.error_hooks)
+            ]:
+                _call(fn_name)
+
+        except Exception as ex:
+            pass  # do not fail due to exit hooks for whatever reason.
+
     def _killall(self):
         # If we are here, all children have received a signal and are shutting down.
         # We want to give them an opportunity to do so and then kill
@@ -434,9 +1011,88 @@ class NativeRuntime(object):
         for _ in range(3):
             list(self._poll_workers())
 
+    # Given the current task information (task_index), the type of transition,
+    # and the split index, return the new task index.
+    def _translate_index(
+        self, task, next_step, type, split_index=None, loop_mode=LoopBehavior.NONE
+    ):
+        match = re.match(r"^(.+)\[(.*)\]\[(.*)\]$", task.task_index)
+        old_match = re.match(r"^(.+)\[(.*)\]$", task.task_index)
+        if match:
+            _, foreach_index, iteration_index = match.groups()
+            # Convert foreach_index to a list of integers
+            if len(foreach_index) > 0:
+                foreach_index = foreach_index.split(",")
+            else:
+                foreach_index = []
+            # Ditto for iteration_index
+            if len(iteration_index) > 0:
+                iteration_index = iteration_index.split(",")
+            else:
+                iteration_index = []
+        elif old_match:
+            _, foreach_index = old_match.groups()
+            # Convert foreach_index to a list of integers
+            if len(foreach_index) > 0:
+                foreach_index = foreach_index.split(",")
+            else:
+                foreach_index = []
+            # Legacy case fallback. No iteration index exists for these runs.
+            iteration_index = []
+        else:
+            raise ValueError(
+                "Index not in the format of {run_id}/{step_name}[{foreach_index}][{iteration_index}]"
+            )
+        if loop_mode == LoopBehavior.NONE:
+            # Check if we are entering a looping construct. Right now, only recursive
+            # steps are looping constructs
+            next_step_node = self._graph[next_step]
+            if (
+                next_step_node.type == "split-switch"
+                and next_step in next_step_node.out_funcs
+            ):
+                loop_mode = LoopBehavior.ENTERING
+
+        # Update iteration_index
+        if loop_mode == LoopBehavior.ENTERING:
+            # We are entering a loop, so we add a new iteration level
+            iteration_index.append("0")
+        elif loop_mode == LoopBehavior.EXITING:
+            iteration_index = iteration_index[:-1]
+        elif loop_mode == LoopBehavior.LOOPING:
+            if len(iteration_index) == 0:
+                raise MetaflowInternalError(
+                    "In looping mode but there is no iteration index"
+                )
+            iteration_index[-1] = str(int(iteration_index[-1]) + 1)
+        iteration_index = ",".join(iteration_index)
+
+        if type == "linear":
+            return "%s[%s][%s]" % (next_step, ",".join(foreach_index), iteration_index)
+        elif type == "join":
+            indices = []
+            if len(foreach_index) > 0:
+                indices = foreach_index[:-1]
+            return "%s[%s][%s]" % (next_step, ",".join(indices), iteration_index)
+        elif type == "split":
+            foreach_index.append(str(split_index))
+            return "%s[%s][%s]" % (next_step, ",".join(foreach_index), iteration_index)
+
     # Store the parameters needed for task creation, so that pushing on items
     # onto the run_queue is an inexpensive operation.
-    def _queue_push(self, step, task_kwargs):
+    def _queue_push(self, step, task_kwargs, index=None):
+        # In the case of cloning, we set all the cloned tasks as the
+        # finished tasks when pushing tasks using _queue_tasks. This means that we
+        # could potentially try to push the same task multiple times (for example
+        # if multiple parents of a join are cloned). We therefore keep track of what
+        # has executed (been cloned) or what has been scheduled and avoid scheduling
+        # it again.
+        if index:
+            if index in self._ran_or_scheduled_task_index:
+                # It has already run or been scheduled
+                return
+            # Note that we are scheduling this to run
+            self._ran_or_scheduled_task_index.add(index)
         self._run_queue.insert(0, (step, task_kwargs))
         # For foreaches, this will happen multiple time but is ok, becomes a no-op
         self._unprocessed_steps.discard(step)
@@ -495,34 +1151,28 @@ class NativeRuntime(object):
                     )
                 num_splits = len(mapper_tasks)
                 self._control_num_splits[task.path] = num_splits
-                if task.is_cloned:
-                    # Add mapper tasks to be cloned.
-                    for i in range(num_splits):
-                        # NOTE: For improved robustness, introduce
-                        # `clone_options` as an enum so that we can force that
-                        # clone must occur for this task.
-                        self._queue_push(
-                            task.step,
-                            {
-                                "input_paths": task.input_paths,
-                                "split_index": str(i),
-                                "ubf_context": UBF_TASK,
-                            },
-                        )
-                else:
-                    # Update _finished since these tasks were successfully
-                    # run elsewhere so that join will be unblocked.
-                    _, foreach_stack = task.finished_id
+
+                # If the control task is cloned, all mapper tasks should have been cloned
+                # as well, so we no longer need to handle cloning of mapper tasks in runtime.
+
+                # Update _finished if we are not cloned. If we were cloned, we already
+                # updated _finished with the new tasks. Note that the *value* of mapper
+                # tasks is incorrect and contains the pathspec of the *cloned* run
+                # but we don't use it for anything. We could look to clean it up though
+                if not task.is_cloned:
+                    _, foreach_stack, iteration_stack = task.finished_id
                     top = foreach_stack[-1]
                     bottom = list(foreach_stack[:-1])
                     for i in range(num_splits):
                         s = tuple(bottom + [top._replace(index=i)])
-                        self._finished[(task.step, s)] = mapper_tasks[i]
+                        self._finished[(task.step, s, iteration_stack)] = mapper_tasks[
+                            i
+                        ]
                         self._is_cloned[mapper_tasks[i]] = False
 
             # Find and check status of control task and retrieve its pathspec
             # for retrieving unbounded foreach cardinality.
-            _, foreach_stack = task.finished_id
+            _, foreach_stack, iteration_stack = task.finished_id
             top = foreach_stack[-1]
             bottom = list(foreach_stack[:-1])
             s = tuple(bottom + [top._replace(index=None)])
@@ -531,7 +1181,7 @@ class NativeRuntime(object):
            # it will have index=0 instead of index=None.
            if task.results.get("_control_task_is_mapper_zero", False):
                s = tuple(bottom + [top._replace(index=0)])
-            control_path = self._finished.get((task.step, s))
+            control_path = self._finished.get((task.step, s, iteration_stack))
            if control_path:
                # Control task was successful.
                # Additionally check the state of (sibling) mapper tasks as well
@@ -540,21 +1190,27 @@ class NativeRuntime(object):
                required_tasks = []
                for i in range(num_splits):
                    s = tuple(bottom + [top._replace(index=i)])
-                    required_tasks.append(self._finished.get((task.step, s)))
+                    required_tasks.append(
+                        self._finished.get((task.step, s, iteration_stack))
+                    )
 
                if all(required_tasks):
+                    index = self._translate_index(task, next_step, "join")
                    # all tasks to be joined are ready. Schedule the next join step.
                    self._queue_push(
                        next_step,
                        {"input_paths": required_tasks, "join_type": "foreach"},
+                        index,
                    )
        else:
            # matching_split is the split-parent of the finished task
            matching_split = self._graph[self._graph[next_step].split_parents[-1]]
-            _, foreach_stack = task.finished_id
+            _, foreach_stack, iteration_stack = task.finished_id
 
+            direct_parents = set(self._graph[next_step].in_funcs)
+
+            # next step is a foreach join
            if matching_split.type == "foreach":
-                # next step is a foreach join
 
                def siblings(foreach_stack):
                    top = foreach_stack[-1]
@@ -563,27 +1219,57 @@ class NativeRuntime(object):
                         yield tuple(bottom + [top._replace(index=index)])
 
                 # required tasks are all split-siblings of the finished task
-                required_tasks = [
-                    self._finished.get((task.step, s)) for s in siblings(foreach_stack)
-                ]
+                required_tasks = list(
+                    filter(
+                        lambda x: x is not None,
+                        [
+                            self._finished.get((p, s, iteration_stack))
+                            for p in direct_parents
+                            for s in siblings(foreach_stack)
+                        ],
+                    )
+                )
+                required_count = task.finished_id[1][-1].num_splits
                 join_type = "foreach"
+                index = self._translate_index(task, next_step, "join")
             else:
                 # next step is a split
-                # required tasks are all branches joined by the next step
-                required_tasks = [
-                    self._finished.get((step, foreach_stack))
-                    for step in self._graph[next_step].in_funcs
-                ]
-                join_type = "linear"
+                required_tasks = list(
+                    filter(
+                        lambda x: x is not None,
+                        [
+                            self._finished.get((p, foreach_stack, iteration_stack))
+                            for p in direct_parents
+                        ],
+                    )
+                )
 
-            if all(required_tasks):
-                # all tasks to be joined are ready. Schedule the next join step.
+                required_count = len(matching_split.out_funcs)
+                join_type = "linear"
+                index = self._translate_index(task, next_step, "linear")
+            if len(required_tasks) == required_count:
+                # We have all the required previous tasks to schedule a join
                 self._queue_push(
-                    next_step, {"input_paths": required_tasks, "join_type": join_type}
+                    next_step,
+                    {"input_paths": required_tasks, "join_type": join_type},
+                    index,
                 )
 
-    def _queue_task_foreach(self, task, next_steps):
+    def _queue_task_switch(self, task, next_steps, is_recursive):
+        chosen_step = next_steps[0]
+
+        loop_mode = LoopBehavior.NONE
+        if is_recursive:
+            if chosen_step != task.step:
+                # We are exiting a loop
+                loop_mode = LoopBehavior.EXITING
+            else:
+                # We are staying in the loop
+                loop_mode = LoopBehavior.LOOPING
+        index = self._translate_index(task, chosen_step, "linear", None, loop_mode)
+        self._queue_push(chosen_step, {"input_paths": [task.path]}, index)
 
+    def _queue_task_foreach(self, task, next_steps):
         # CHECK: this condition should be enforced by the linter but
         # let's assert that the assumption holds
         if len(next_steps) > 1:
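`LoopBehavior` is referenced here but defined elsewhere in the new runtime; only its three states are visible in this hunk. A minimal sketch of the decision logic, assuming `LoopBehavior` is a plain enum (the actual definition is outside this diff):

    from enum import Enum, auto

    class LoopBehavior(Enum):
        # Assumed shape; mirrors the three states _queue_task_switch uses.
        NONE = auto()     # not a recursive switch
        LOOPING = auto()  # the switch chose itself again: stay in the loop
        EXITING = auto()  # the switch chose another step: leave the loop

    def classify(chosen_step, current_step, is_recursive):
        # Same branching as _queue_task_switch above.
        if not is_recursive:
            return LoopBehavior.NONE
        if chosen_step == current_step:
            return LoopBehavior.LOOPING
        return LoopBehavior.EXITING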
@@ -601,6 +1287,12 @@ class NativeRuntime(object):
             # Need to push control process related task.
             ubf_iter_name = task.results.get("_foreach_var")
             ubf_iter = task.results.get(ubf_iter_name)
+            # The UBF control task has no split index, hence None as a placeholder.
+
+            if task.results.get("_control_task_is_mapper_zero", False):
+                index = self._translate_index(task, next_step, "split", 0)
+            else:
+                index = self._translate_index(task, next_step, "split", None)
             self._queue_push(
                 next_step,
                 {
@@ -608,6 +1300,7 @@ class NativeRuntime(object):
                     "ubf_context": UBF_CONTROL,
                     "ubf_iter": ubf_iter,
                 },
+                index,
             )
         else:
             num_splits = task.results["_foreach_num_splits"]
@@ -627,8 +1320,11 @@ class NativeRuntime(object):
 
             # schedule all splits
             for i in range(num_splits):
+                index = self._translate_index(task, next_step, "split", i)
                 self._queue_push(
-                    next_step, {"split_index": str(i), "input_paths": [task.path]}
+                    next_step,
+                    {"split_index": str(i), "input_paths": [task.path]},
+                    index,
                 )
 
     def _queue_tasks(self, finished_tasks):
@@ -649,7 +1345,39 @@ class NativeRuntime(object):
                 next_steps = []
                 foreach = None
             expected = self._graph[task.step].out_funcs
-            if next_steps != expected:
+
+            if self._graph[task.step].type == "split-switch":
+                is_recursive = task.step in self._graph[task.step].out_funcs
+                if len(next_steps) != 1:
+                    msg = (
+                        "Switch step *{step}* should transition to exactly "
+                        "one step at runtime, but got: {actual}"
+                    )
+                    raise MetaflowInternalError(
+                        msg.format(step=task.step, actual=", ".join(next_steps))
+                    )
+                if next_steps[0] not in expected:
+                    msg = (
+                        "Switch step *{step}* transitioned to unexpected "
+                        "step *{actual}*. Expected one of: {expected}"
+                    )
+                    raise MetaflowInternalError(
+                        msg.format(
+                            step=task.step,
+                            actual=next_steps[0],
+                            expected=", ".join(expected),
+                        )
+                    )
+                # When exiting a recursive loop, we mark that the loop itself has
+                # finished by adding a special entry in self._finished which has
+                # an iteration stack that is shorter (i.e., we are out of the loop) so
+                # that we can then find it when looking at successor tasks to launch.
+                if is_recursive and next_steps[0] != task.step:
+                    step_name, finished_tuple, iteration_tuple = task.finished_id
+                    self._finished[
+                        (step_name, finished_tuple, iteration_tuple[:-1])
+                    ] = task.path
+            elif next_steps != expected:
                 msg = (
                     "Based on static analysis of the code, step *{step}* "
                     "was expected to transition to step(s) *{expected}*. "
@@ -673,10 +1401,14 @@ class NativeRuntime(object):
             elif foreach:
                 # Next step is a foreach child
                 self._queue_task_foreach(task, next_steps)
+            elif self._graph[task.step].type == "split-switch":
+                # The current step is a switch: queue the chosen step.
+                self._queue_task_switch(task, next_steps, is_recursive)
             else:
                 # Next steps are normal linear steps
                 for step in next_steps:
-                    self._queue_push(step, {"input_paths": [task.path]})
+                    index = self._translate_index(task, step, "linear")
+                    self._queue_push(step, {"input_paths": [task.path]}, index)
 
     def _poll_workers(self):
         if self._workers:
@@ -728,6 +1460,22 @@ class NativeRuntime(object):
             # Initialize the task (which can be expensive using remote datastores)
             # before launching the worker so that cost is amortized over time, instead
             # of doing it during _queue_push.
+            if (
+                FEAT_ALWAYS_UPLOAD_CODE_PACKAGE
+                and "METAFLOW_CODE_SHA" not in os.environ
+            ):
+                # We check if the code package is uploaded and, if so, we set the
+                # environment variables that will cause the metadata service to
+                # register the code package with the task created in _new_task below
+                code_sha = self._package.package_sha(timeout=0.01)
+                if code_sha:
+                    os.environ["METAFLOW_CODE_SHA"] = code_sha
+                    os.environ["METAFLOW_CODE_URL"] = self._package.package_url()
+                    os.environ["METAFLOW_CODE_DS"] = self._flow_datastore.TYPE
+                    os.environ["METAFLOW_CODE_METADATA"] = (
+                        self._package.package_metadata
+                    )
+
             task = self._new_task(step, **task_kwargs)
             self._launch_worker(task)
 
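Because `Worker._launch` (later in this diff) builds the subprocess environment from a copy of `os.environ`, setting these variables once in the scheduling loop makes them visible to every subsequently launched task. A self-contained sketch of that inheritance, with a placeholder value:

    import os
    import subprocess

    # Placeholder; in the runtime this comes from self._package.package_sha(...).
    os.environ["METAFLOW_CODE_SHA"] = "example-sha"

    # Children spawned with a copy of os.environ see the variable.
    env = dict(os.environ)
    out = subprocess.run(
        ["python", "-c", "import os; print(os.environ['METAFLOW_CODE_SHA'])"],
        env=env,
        capture_output=True,
        text=True,
    )
    assert out.stdout.strip() == "example-sha"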
@@ -755,7 +1503,7 @@ class NativeRuntime(object):
             )
             return
 
-        worker = Worker(task, self._max_log_size)
+        worker = Worker(task, self._max_log_size, self._config_file_name)
         for fd in worker.fds():
             self._workers[fd] = worker
             self._poll.add(fd)
@@ -797,9 +1545,10 @@ class Task(object):
         join_type=None,
         task_id=None,
         resume_identifier=None,
+        pathspec_index=None,
     ):
-
         self.step = step
+        self.flow = flow
         self.flow_name = flow.name
         self.run_id = run_id
         self.task_id = None
@@ -839,10 +1588,9 @@ class Task(object):
         self._is_resume_leader = None
         self._resume_done = None
         self._resume_identifier = resume_identifier
-
         origin = None
         if clone_run_id and may_clone:
-            origin = self._find_origin_task(clone_run_id, join_type)
+            origin = self._find_origin_task(clone_run_id, join_type, pathspec_index)
         if origin and origin["_task_ok"]:
             # At this point, we know we are going to clone
             self._is_cloned = True
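`pathspec_index` is a new way for the runtime to pin the clone origin directly instead of re-deriving it from `join_type`; the `_parameters` branch of `_find_origin_task` (later in this diff) shows the expected format, with an empty bracket for top-level steps. A hedged illustration with hypothetical values:

    clone_run_id = "42"  # hypothetical origin run id

    # Top-level step: empty foreach index, exactly as in the _parameters branch.
    params_index = "%s/_parameters[]" % clone_run_id   # "42/_parameters[]"

    # Assumed shape for a step nested inside foreaches (comma-separated indices):
    nested_index = "%s/train[0,3]" % clone_run_id      # "42/train[0,3]"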
@@ -934,8 +1682,7 @@ class Task(object):
                 # To avoid the edge case where the resume leader is selected but has not
                 # yet written the _resume_leader metadata, we will wait for a few seconds.
                 # We will wait for the resume leader at most 3 times.
-                for resume_leader_wait_retry in range(3):
-
+                for _ in range(3):
                     if ds.has_metadata("_resume_leader", add_attempt=False):
                         resume_leader = ds.load_metadata(
                             ["_resume_leader"], add_attempt=False
@@ -964,10 +1711,11 @@ class Task(object):
                     )
 
                 if self._is_resume_leader:
-                    self.log(
-                        "Selected as the reentrant clone leader.",
-                        system_msg=True,
-                    )
+                    if reentrant:
+                        self.log(
+                            "Selected as the reentrant clone leader.",
+                            system_msg=True,
+                        )
                     # Clone in place without relying on run_queue.
                     self.new_attempt()
                     self._ds.clone(origin)
@@ -1016,13 +1764,13 @@ class Task(object):
                 self._should_skip_cloning = task_completed
                 if self._should_skip_cloning:
                     self.log(
-                        "Skip cloning of previously run task %s" % self.clone_origin,
+                        "Skipping cloning of previously run task %s"
+                        % self.clone_origin,
                         system_msg=True,
                     )
                 else:
                     self.log(
-                        "Cloning results of a previously run task %s"
-                        % self.clone_origin,
+                        "Cloning previously run task %s" % self.clone_origin,
                         system_msg=True,
                     )
             else:
@@ -1035,7 +1783,6 @@ class Task(object):
         # Open the output datastore only if the task is not being cloned.
         if not self._is_cloned:
             self.new_attempt()
-
         for deco in decos:
             deco.runtime_task_created(
                 self._ds,
@@ -1112,63 +1859,34 @@ class Task(object):
 
     def _get_task_id(self, task_id):
         already_existed = True
+        tags = []
         if self.ubf_context == UBF_CONTROL:
-            [input_path] = self.input_paths
-            run, input_step, input_task = input_path.split("/")
-            # We associate the control task-id to be 1:1 with the split node
-            # where the unbounded-foreach was defined.
-            # We prefer encoding the corresponding split into the task_id of
-            # the control node; so it has access to this information quite
-            # easily. There is anyway a corresponding int id stored in the
-            # metadata backend - so this should be fine.
-            task_id = "control-%s-%s-%s" % (run, input_step, input_task)
-        # Register only regular Metaflow (non control) tasks.
+            tags = [CONTROL_TASK_TAG]
+        # Register Metaflow tasks.
         if task_id is None:
-            task_id = str(self.metadata.new_task_id(self.run_id, self.step))
+            task_id = str(
+                self.metadata.new_task_id(self.run_id, self.step, sys_tags=tags)
+            )
             already_existed = False
         else:
-            # task_id is preset only by persist_constants() or control tasks.
-            if self.ubf_context == UBF_CONTROL:
-                tags = [CONTROL_TASK_TAG]
-                attempt_id = 0
-                already_existed = not self.metadata.register_task_id(
-                    self.run_id,
-                    self.step,
-                    task_id,
-                    attempt_id,
-                    sys_tags=tags,
-                )
-                # A Task's tags are now those of its ancestral Run, so we are not able
-                # to rely on a task's tags to indicate the presence of a control task
-                # so, on top of adding the tags above, we also add a task metadata
-                # entry indicating that this is a "control task".
-                #
-                # Here we will also add a task metadata entry to indicate "control task".
-                # Within the metaflow repo, the only dependency of such a "control task"
-                # indicator is in the integration test suite (see Step.control_tasks() in
-                # client API).
-                task_metadata_list = [
-                    MetaDatum(
-                        field="internal_task_type",
-                        value=CONTROL_TASK_TAG,
-                        type="internal_task_type",
-                        tags=["attempt_id:{0}".format(attempt_id)],
-                    )
-                ]
-                self.metadata.register_metadata(
-                    self.run_id, self.step, task_id, task_metadata_list
-                )
-            else:
-                already_existed = not self.metadata.register_task_id(
-                    self.run_id, self.step, task_id, 0
-                )
+            # task_id is preset only by persist_constants().
+            already_existed = not self.metadata.register_task_id(
+                self.run_id,
+                self.step,
+                task_id,
+                0,
+                sys_tags=tags,
+            )
 
         self.task_id = task_id
         self._path = "%s/%s/%s" % (self.run_id, self.step, self.task_id)
         return already_existed
 
-    def _find_origin_task(self, clone_run_id, join_type):
-        if self.step == "_parameters":
+    def _find_origin_task(self, clone_run_id, join_type, pathspec_index=None):
+        if pathspec_index:
+            origin = self.origin_ds_set.get_with_pathspec_index(pathspec_index)
+            return origin
+        elif self.step == "_parameters":
             pathspec = "%s/_parameters[]" % clone_run_id
             origin = self.origin_ds_set.get_with_pathspec_index(pathspec)
 
@@ -1218,16 +1936,23 @@ class Task(object):
         )
         return self._results_ds
 
+    @property
+    def task_index(self):
+        _, task_index = self.results.pathspec_index.split("/")
+        return task_index
+
     @property
     def finished_id(self):
         # note: id is not available before the task has finished.
-        # Index already identifies the task within the foreach,
-        # we will remove foreach value so that it is easier to
+        # Index already identifies the task within the foreach and loop.
+        # We will remove the foreach value so that it is easier to
         # identify siblings within a foreach.
         foreach_stack_tuple = tuple(
             [s._replace(value=0) for s in self.results["_foreach_stack"]]
         )
-        return (self.step, foreach_stack_tuple)
+        # _iteration_stack requires a fallback, as it does not exist for runs before v2.17.4
+        iteration_stack_tuple = tuple(self.results.get("_iteration_stack", []))
+        return (self.step, foreach_stack_tuple, iteration_stack_tuple)
 
     @property
     def is_cloned(self):
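A small example of the shape change in `finished_id` (the foreach frame is simplified to the three fields used elsewhere in this diff; real frames carry more):

    from collections import namedtuple

    # Assumed minimal frame shape; real _foreach_stack entries have more fields.
    Frame = namedtuple("Frame", ["index", "value", "num_splits"])

    # The old finished_id was (step, foreach_stack); the new one appends the
    # iteration stack, which falls back to () for pre-2.17.4 runs that have
    # no _iteration_stack artifact.
    old_id = ("join_step", (Frame(index=2, value=0, num_splits=3),))
    new_id = ("join_step", (Frame(index=2, value=0, num_splits=3),), ())
    assert new_id[:2] == old_id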
@@ -1301,9 +2026,29 @@ class CLIArgs(object):
     for step execution in StepDecorator.runtime_step_cli().
     """
 
-    def __init__(self, task):
+    def __init__(
+        self,
+        task,
+        orig_flow_datastore=None,
+        spin_pathspec=None,
+        artifacts_module=None,
+        persist=True,
+        skip_decorators=False,
+    ):
         self.task = task
+        if orig_flow_datastore is not None:
+            self.orig_flow_datastore = "%s@%s" % (
+                orig_flow_datastore.TYPE,
+                orig_flow_datastore.datastore_root,
+            )
+        else:
+            self.orig_flow_datastore = None
+        self.spin_pathspec = spin_pathspec
+        self.artifacts_module = artifacts_module
+        self.persist = persist
+        self.skip_decorators = skip_decorators
         self.entrypoint = list(task.entrypoint)
+        step_obj = getattr(self.task.flow, self.task.step)
         self.top_level_options = {
             "quiet": True,
             "metadata": self.task.metadata_type,
@@ -1315,38 +2060,77 @@ class CLIArgs(object):
             "datastore-root": self.task.datastore_sysroot,
             "with": [
                 deco.make_decorator_spec()
-                for deco in self.task.decos
-                if not deco.statically_defined
+                for deco in chain(
+                    self.task.decos,
+                    step_obj.wrappers,
+                    step_obj.config_decorators,
+                )
+                if not deco.statically_defined and deco.inserted_by is None
             ],
         }
 
         # FlowDecorators can define their own top-level options. They are
         # responsible for adding their own top-level options and values through
         # the get_top_level_options() hook.
-        for deco in flow_decorators():
+        for deco in flow_decorators(self.task.flow):
             self.top_level_options.update(deco.get_top_level_options())
 
+        # We also pass configuration options using the kv.<name> syntax which will cause
+        # the configuration options to be loaded from the CONFIG file (or local-config-file
+        # in the case of the local runtime)
+        configs = self.task.flow._flow_state[FlowStateItems.CONFIGS]
+        if configs:
+            self.top_level_options["config-value"] = [
+                (k, ConfigInput.make_key_name(k)) for k in configs
+            ]
+
+        if spin_pathspec:
+            self.spin_args()
+        else:
+            self.default_args()
+
+    def default_args(self):
         self.commands = ["step"]
         self.command_args = [self.task.step]
         self.command_options = {
-            "run-id": task.run_id,
-            "task-id": task.task_id,
-            "input-paths": compress_list(task.input_paths),
-            "split-index": task.split_index,
-            "retry-count": task.retries,
-            "max-user-code-retries": task.user_code_retries,
-            "tag": task.tags,
+            "run-id": self.task.run_id,
+            "task-id": self.task.task_id,
+            "input-paths": compress_list(self.task.input_paths),
+            "split-index": self.task.split_index,
+            "retry-count": self.task.retries,
+            "max-user-code-retries": self.task.user_code_retries,
+            "tag": self.task.tags,
             "namespace": get_namespace() or "",
-            "ubf-context": task.ubf_context,
+            "ubf-context": self.task.ubf_context,
         }
         self.env = {}
 
-    def get_args(self):
+    def spin_args(self):
+        self.commands = ["spin-step"]
+        self.command_args = [self.task.step]
 
+        self.command_options = {
+            "run-id": self.task.run_id,
+            "task-id": self.task.task_id,
+            "input-paths": compress_list(self.task.input_paths),
+            "split-index": self.task.split_index,
+            "retry-count": self.task.retries,
+            "max-user-code-retries": self.task.user_code_retries,
+            "namespace": get_namespace() or "",
+            "orig-flow-datastore": self.orig_flow_datastore,
+            "artifacts-module": self.artifacts_module,
+            "skip-decorators": self.skip_decorators,
+        }
+        if self.persist:
+            self.command_options["persist"] = True
+        else:
+            self.command_options["no-persist"] = True
+        self.env = {}
+
+    def get_args(self):
         # TODO: Make one with dict_to_cli_options; see cli_args.py for more detail
         def _options(mapping):
             for k, v in mapping.items():
-
                 # None or False arguments are ignored
                 # v needs to be explicitly False, not falsy, e.g. 0 is an acceptable value
                 if v is None or v is False:
@@ -1361,12 +2145,15 @@ class CLIArgs(object):
                 for value in v:
                     yield "--%s" % k
                     if not isinstance(value, bool):
-                        yield to_unicode(value)
+                        value = value if isinstance(value, tuple) else (value,)
+                        for vv in value:
+                            yield to_unicode(vv)
 
         args = list(self.entrypoint)
         args.extend(_options(self.top_level_options))
         args.extend(self.commands)
         args.extend(self.command_args)
+
         args.extend(_options(self.command_options))
         return args
 
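The tuple handling added to `_options` is what lets list-valued options carry multi-token entries such as the `("name", "kv.name")` config-value pairs built in `CLIArgs.__init__` above. A standalone sketch of just that generator behavior (simplified to `str` instead of `to_unicode`):

    def _options(mapping):
        # Each value of a list-valued option gets its own --flag, and tuple
        # values expand to multiple tokens after the flag.
        for k, v in mapping.items():
            if v is None or v is False:
                continue
            values = v if isinstance(v, list) else [v]
            for value in values:
                yield "--%s" % k
                if not isinstance(value, bool):
                    value = value if isinstance(value, tuple) else (value,)
                    for vv in value:
                        yield str(vv)

    print(list(_options({"config-value": [("cfg", "kv.cfg")], "quiet": True})))
    # ['--config-value', 'cfg', 'kv.cfg', '--quiet']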
@@ -1378,8 +2165,24 @@ class CLIArgs(object):
 
 
 class Worker(object):
-    def __init__(self, task, max_logs_size):
+    def __init__(
+        self,
+        task,
+        max_logs_size,
+        config_file_name,
+        orig_flow_datastore=None,
+        spin_pathspec=None,
+        artifacts_module=None,
+        persist=True,
+        skip_decorators=False,
+    ):
         self.task = task
+        self._config_file_name = config_file_name
+        self._orig_flow_datastore = orig_flow_datastore
+        self._spin_pathspec = spin_pathspec
+        self._artifacts_module = artifacts_module
+        self._skip_decorators = skip_decorators
+        self._persist = persist
         self._proc = self._launch()
 
         if task.retries > task.user_code_retries:
@@ -1411,7 +2214,14 @@ class Worker(object):
         # not it is properly shut down)
 
     def _launch(self):
-        args = CLIArgs(self.task)
+        args = CLIArgs(
+            self.task,
+            orig_flow_datastore=self._orig_flow_datastore,
+            spin_pathspec=self._spin_pathspec,
+            artifacts_module=self._artifacts_module,
+            persist=self._persist,
+            skip_decorators=self._skip_decorators,
+        )
         env = dict(os.environ)
 
         if self.task.clone_run_id:
@@ -1431,6 +2241,12 @@ class Worker(object):
                 self.task.user_code_retries,
                 self.task.ubf_context,
             )
+
+        # Add user configurations using a file to avoid using up too much space on the
+        # command line
+        if self._config_file_name:
+            args.top_level_options["local-config-file"] = self._config_file_name
+        # Pass configuration options
         env.update(args.get_env())
         env["PYTHONUNBUFFERED"] = "x"
         tracing.inject_tracing_vars(env)
@@ -1438,6 +2254,7 @@ class Worker(object):
         # by read_logline() below that relies on readline() not blocking
         # print('running', args)
         cmdline = args.get_args()
+        from_start(f"Command line: {' '.join(cmdline)}")
         debug.subcommand_exec(cmdline)
         return subprocess.Popen(
             cmdline,
@@ -1560,13 +2377,14 @@ class Worker(object):
             else:
                 self.emit_log(b"Task failed.", self._stderr, system_msg=True)
         else:
-            num = self.task.results["_foreach_num_splits"]
-            if num:
-                self.task.log(
-                    "Foreach yields %d child steps." % num,
-                    system_msg=True,
-                    pid=self._proc.pid,
-                )
+            if not self._spin_pathspec:
+                num = self.task.results["_foreach_num_splits"]
+                if num:
+                    self.task.log(
+                        "Foreach yields %d child steps." % num,
+                        system_msg=True,
+                        pid=self._proc.pid,
+                    )
             self.task.log(
                 "Task finished successfully.", system_msg=True, pid=self._proc.pid
             )