ob-metaflow 2.15.13.1__py2.py3-none-any.whl → 2.19.7.1rc0__py2.py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
- metaflow/__init__.py +10 -3
- metaflow/_vendor/imghdr/__init__.py +186 -0
- metaflow/_vendor/yaml/__init__.py +427 -0
- metaflow/_vendor/yaml/composer.py +139 -0
- metaflow/_vendor/yaml/constructor.py +748 -0
- metaflow/_vendor/yaml/cyaml.py +101 -0
- metaflow/_vendor/yaml/dumper.py +62 -0
- metaflow/_vendor/yaml/emitter.py +1137 -0
- metaflow/_vendor/yaml/error.py +75 -0
- metaflow/_vendor/yaml/events.py +86 -0
- metaflow/_vendor/yaml/loader.py +63 -0
- metaflow/_vendor/yaml/nodes.py +49 -0
- metaflow/_vendor/yaml/parser.py +589 -0
- metaflow/_vendor/yaml/reader.py +185 -0
- metaflow/_vendor/yaml/representer.py +389 -0
- metaflow/_vendor/yaml/resolver.py +227 -0
- metaflow/_vendor/yaml/scanner.py +1435 -0
- metaflow/_vendor/yaml/serializer.py +111 -0
- metaflow/_vendor/yaml/tokens.py +104 -0
- metaflow/cards.py +4 -0
- metaflow/cli.py +125 -21
- metaflow/cli_components/init_cmd.py +1 -0
- metaflow/cli_components/run_cmds.py +204 -40
- metaflow/cli_components/step_cmd.py +160 -4
- metaflow/client/__init__.py +1 -0
- metaflow/client/core.py +198 -130
- metaflow/client/filecache.py +59 -32
- metaflow/cmd/code/__init__.py +2 -1
- metaflow/cmd/develop/stub_generator.py +49 -18
- metaflow/cmd/develop/stubs.py +9 -27
- metaflow/cmd/make_wrapper.py +30 -0
- metaflow/datastore/__init__.py +1 -0
- metaflow/datastore/content_addressed_store.py +40 -9
- metaflow/datastore/datastore_set.py +10 -1
- metaflow/datastore/flow_datastore.py +124 -4
- metaflow/datastore/spin_datastore.py +91 -0
- metaflow/datastore/task_datastore.py +92 -6
- metaflow/debug.py +5 -0
- metaflow/decorators.py +331 -82
- metaflow/extension_support/__init__.py +414 -356
- metaflow/extension_support/_empty_file.py +2 -2
- metaflow/flowspec.py +322 -82
- metaflow/graph.py +178 -15
- metaflow/includefile.py +25 -3
- metaflow/lint.py +94 -3
- metaflow/meta_files.py +13 -0
- metaflow/metadata_provider/metadata.py +13 -2
- metaflow/metaflow_config.py +66 -4
- metaflow/metaflow_environment.py +91 -25
- metaflow/metaflow_profile.py +18 -0
- metaflow/metaflow_version.py +16 -1
- metaflow/package/__init__.py +673 -0
- metaflow/packaging_sys/__init__.py +880 -0
- metaflow/packaging_sys/backend.py +128 -0
- metaflow/packaging_sys/distribution_support.py +153 -0
- metaflow/packaging_sys/tar_backend.py +99 -0
- metaflow/packaging_sys/utils.py +54 -0
- metaflow/packaging_sys/v1.py +527 -0
- metaflow/parameters.py +6 -2
- metaflow/plugins/__init__.py +6 -0
- metaflow/plugins/airflow/airflow.py +11 -1
- metaflow/plugins/airflow/airflow_cli.py +16 -5
- metaflow/plugins/argo/argo_client.py +42 -20
- metaflow/plugins/argo/argo_events.py +6 -6
- metaflow/plugins/argo/argo_workflows.py +1023 -344
- metaflow/plugins/argo/argo_workflows_cli.py +396 -94
- metaflow/plugins/argo/argo_workflows_decorator.py +9 -0
- metaflow/plugins/argo/argo_workflows_deployer_objects.py +75 -49
- metaflow/plugins/argo/capture_error.py +5 -2
- metaflow/plugins/argo/conditional_input_paths.py +35 -0
- metaflow/plugins/argo/exit_hooks.py +209 -0
- metaflow/plugins/argo/param_val.py +19 -0
- metaflow/plugins/aws/aws_client.py +6 -0
- metaflow/plugins/aws/aws_utils.py +33 -1
- metaflow/plugins/aws/batch/batch.py +72 -5
- metaflow/plugins/aws/batch/batch_cli.py +24 -3
- metaflow/plugins/aws/batch/batch_decorator.py +57 -6
- metaflow/plugins/aws/step_functions/step_functions.py +28 -3
- metaflow/plugins/aws/step_functions/step_functions_cli.py +49 -4
- metaflow/plugins/aws/step_functions/step_functions_deployer.py +3 -0
- metaflow/plugins/aws/step_functions/step_functions_deployer_objects.py +30 -0
- metaflow/plugins/cards/card_cli.py +20 -1
- metaflow/plugins/cards/card_creator.py +24 -1
- metaflow/plugins/cards/card_datastore.py +21 -49
- metaflow/plugins/cards/card_decorator.py +58 -6
- metaflow/plugins/cards/card_modules/basic.py +38 -9
- metaflow/plugins/cards/card_modules/bundle.css +1 -1
- metaflow/plugins/cards/card_modules/chevron/renderer.py +1 -1
- metaflow/plugins/cards/card_modules/components.py +592 -3
- metaflow/plugins/cards/card_modules/convert_to_native_type.py +34 -5
- metaflow/plugins/cards/card_modules/json_viewer.py +232 -0
- metaflow/plugins/cards/card_modules/main.css +1 -0
- metaflow/plugins/cards/card_modules/main.js +56 -41
- metaflow/plugins/cards/card_modules/test_cards.py +22 -6
- metaflow/plugins/cards/component_serializer.py +1 -8
- metaflow/plugins/cards/metadata.py +22 -0
- metaflow/plugins/catch_decorator.py +9 -0
- metaflow/plugins/datastores/local_storage.py +12 -6
- metaflow/plugins/datastores/spin_storage.py +12 -0
- metaflow/plugins/datatools/s3/s3.py +49 -17
- metaflow/plugins/datatools/s3/s3op.py +113 -66
- metaflow/plugins/env_escape/client_modules.py +102 -72
- metaflow/plugins/events_decorator.py +127 -121
- metaflow/plugins/exit_hook/__init__.py +0 -0
- metaflow/plugins/exit_hook/exit_hook_decorator.py +46 -0
- metaflow/plugins/exit_hook/exit_hook_script.py +52 -0
- metaflow/plugins/kubernetes/kubernetes.py +12 -1
- metaflow/plugins/kubernetes/kubernetes_cli.py +11 -0
- metaflow/plugins/kubernetes/kubernetes_decorator.py +25 -6
- metaflow/plugins/kubernetes/kubernetes_job.py +12 -4
- metaflow/plugins/kubernetes/kubernetes_jobsets.py +31 -30
- metaflow/plugins/metadata_providers/local.py +76 -82
- metaflow/plugins/metadata_providers/service.py +13 -9
- metaflow/plugins/metadata_providers/spin.py +16 -0
- metaflow/plugins/package_cli.py +36 -24
- metaflow/plugins/parallel_decorator.py +11 -2
- metaflow/plugins/parsers.py +16 -0
- metaflow/plugins/pypi/bootstrap.py +7 -1
- metaflow/plugins/pypi/conda_decorator.py +41 -82
- metaflow/plugins/pypi/conda_environment.py +14 -6
- metaflow/plugins/pypi/micromamba.py +9 -1
- metaflow/plugins/pypi/pip.py +41 -5
- metaflow/plugins/pypi/pypi_decorator.py +4 -4
- metaflow/plugins/pypi/utils.py +22 -0
- metaflow/plugins/secrets/__init__.py +3 -0
- metaflow/plugins/secrets/secrets_decorator.py +14 -178
- metaflow/plugins/secrets/secrets_func.py +49 -0
- metaflow/plugins/secrets/secrets_spec.py +101 -0
- metaflow/plugins/secrets/utils.py +74 -0
- metaflow/plugins/test_unbounded_foreach_decorator.py +2 -2
- metaflow/plugins/timeout_decorator.py +0 -1
- metaflow/plugins/uv/bootstrap.py +29 -1
- metaflow/plugins/uv/uv_environment.py +5 -3
- metaflow/pylint_wrapper.py +5 -1
- metaflow/runner/click_api.py +79 -26
- metaflow/runner/deployer.py +208 -6
- metaflow/runner/deployer_impl.py +32 -12
- metaflow/runner/metaflow_runner.py +266 -33
- metaflow/runner/subprocess_manager.py +21 -1
- metaflow/runner/utils.py +27 -16
- metaflow/runtime.py +660 -66
- metaflow/task.py +255 -26
- metaflow/user_configs/config_options.py +33 -21
- metaflow/user_configs/config_parameters.py +220 -58
- metaflow/user_decorators/__init__.py +0 -0
- metaflow/user_decorators/common.py +144 -0
- metaflow/user_decorators/mutable_flow.py +512 -0
- metaflow/user_decorators/mutable_step.py +424 -0
- metaflow/user_decorators/user_flow_decorator.py +264 -0
- metaflow/user_decorators/user_step_decorator.py +749 -0
- metaflow/util.py +197 -7
- metaflow/vendor.py +23 -7
- metaflow/version.py +1 -1
- {ob_metaflow-2.15.13.1.data → ob_metaflow-2.19.7.1rc0.data}/data/share/metaflow/devtools/Makefile +13 -2
- {ob_metaflow-2.15.13.1.data → ob_metaflow-2.19.7.1rc0.data}/data/share/metaflow/devtools/Tiltfile +107 -7
- {ob_metaflow-2.15.13.1.data → ob_metaflow-2.19.7.1rc0.data}/data/share/metaflow/devtools/pick_services.sh +1 -0
- {ob_metaflow-2.15.13.1.dist-info → ob_metaflow-2.19.7.1rc0.dist-info}/METADATA +2 -3
- {ob_metaflow-2.15.13.1.dist-info → ob_metaflow-2.19.7.1rc0.dist-info}/RECORD +162 -121
- {ob_metaflow-2.15.13.1.dist-info → ob_metaflow-2.19.7.1rc0.dist-info}/WHEEL +1 -1
- metaflow/_vendor/v3_5/__init__.py +0 -1
- metaflow/_vendor/v3_5/importlib_metadata/__init__.py +0 -644
- metaflow/_vendor/v3_5/importlib_metadata/_compat.py +0 -152
- metaflow/_vendor/v3_5/zipp.py +0 -329
- metaflow/info_file.py +0 -25
- metaflow/package.py +0 -203
- metaflow/user_configs/config_decorators.py +0 -568
- {ob_metaflow-2.15.13.1.dist-info → ob_metaflow-2.19.7.1rc0.dist-info}/entry_points.txt +0 -0
- {ob_metaflow-2.15.13.1.dist-info → ob_metaflow-2.19.7.1rc0.dist-info}/licenses/LICENSE +0 -0
- {ob_metaflow-2.15.13.1.dist-info → ob_metaflow-2.19.7.1rc0.dist-info}/top_level.txt +0 -0
metaflow/runtime.py
CHANGED
@@ -15,28 +15,40 @@ import tempfile
 import time
 import subprocess
 from datetime import datetime
+from enum import Enum
 from io import BytesIO
+from itertools import chain
 from functools import partial
 from concurrent import futures
 
+from typing import Dict, Tuple
 from metaflow.datastore.exceptions import DataException
 from contextlib import contextmanager
 
 from . import get_namespace
+from .client.filecache import FileCache, FileBlobCache, TaskMetadataCache
 from .metadata_provider import MetaDatum
-from .metaflow_config import
+from .metaflow_config import (
+    FEAT_ALWAYS_UPLOAD_CODE_PACKAGE,
+    MAX_ATTEMPTS,
+    UI_URL,
+    SPIN_ALLOWED_DECORATORS,
+    SPIN_DISALLOWED_DECORATORS,
+)
+from .metaflow_profile import from_start
+from .plugins import DATASTORES
 from .exception import (
     MetaflowException,
     MetaflowInternalError,
     METAFLOW_EXIT_DISALLOW_RETRY,
 )
 from . import procpoll
-from .datastore import TaskDataStoreSet
+from .datastore import FlowDataStore, TaskDataStoreSet
 from .debug import debug
 from .decorators import flow_decorators
-from .flowspec import
+from .flowspec import FlowStateItems
 from .mflog import mflog, RUNTIME_LOG_SOURCE
-from .util import to_unicode, compress_list, unicode_type
+from .util import to_unicode, compress_list, unicode_type, get_latest_task_pathspec
 from .clone_util import clone_task_helper
 from .unbounded_foreach import (
     CONTROL_TASK_TAG,
@@ -59,6 +71,7 @@ PROGRESS_INTERVAL = 300  # s
 # leveraging the TaskDataStoreSet.
 PREFETCH_DATA_ARTIFACTS = [
     "_foreach_stack",
+    "_iteration_stack",
     "_task_ok",
     "_transition",
     "_control_mapper_tasks",
@@ -66,6 +79,14 @@ PREFETCH_DATA_ARTIFACTS = [
 ]
 RESUME_POLL_SECONDS = 60
 
+
+class LoopBehavior(Enum):
+    NONE = "none"
+    ENTERING = "entering"
+    EXITING = "exiting"
+    LOOPING = "looping"
+
+
 # Runtime must use logsource=RUNTIME_LOG_SOURCE for all loglines that it
 # formats according to mflog. See a comment in mflog.__init__
 mflog_msg = partial(mflog.decorate, RUNTIME_LOG_SOURCE)
@@ -73,6 +94,253 @@ mflog_msg = partial(mflog.decorate, RUNTIME_LOG_SOURCE)
 # TODO option: output dot graph periodically about execution
 
 
+class SpinRuntime(object):
+    def __init__(
+        self,
+        flow,
+        graph,
+        flow_datastore,
+        metadata,
+        environment,
+        package,
+        logger,
+        entrypoint,
+        event_logger,
+        monitor,
+        step_func,
+        step_name,
+        spin_pathspec,
+        skip_decorators=False,
+        artifacts_module=None,
+        persist=True,
+        max_log_size=MAX_LOG_SIZE,
+    ):
+        from metaflow import Task
+
+        self._flow = flow
+        self._graph = graph
+        self._flow_datastore = flow_datastore
+        self._metadata = metadata
+        self._environment = environment
+        self._package = package
+        self._logger = logger
+        self._entrypoint = entrypoint
+        self._event_logger = event_logger
+        self._monitor = monitor
+
+        self._step_func = step_func
+
+        # Determine if we have a complete pathspec or need to get the task
+        if spin_pathspec:
+            parts = spin_pathspec.split("/")
+            if len(parts) == 4:
+                # Complete pathspec: flow/run/step/task_id
+                try:
+                    # If user provides whole pathspec, we do not need to check namespace
+                    task = Task(spin_pathspec, _namespace_check=False)
+                except Exception:
+                    raise MetaflowException(
+                        f"Invalid pathspec: {spin_pathspec} for step: {step_name}"
+                    )
+            elif len(parts) == 3:
+                # Partial pathspec: flow/run/step - need to get the task
+                _, run_id, _ = parts
+                task = get_latest_task_pathspec(flow.name, step_name, run_id=run_id)
+                logger(
+                    f"To make spin even faster, provide complete pathspec with task_id: {task.pathspec}",
+                    system_msg=True,
+                )
+            else:
+                raise MetaflowException(
+                    f"Invalid pathspec format: {spin_pathspec}. Expected flow/run/step or flow/run/step/task_id"
+                )
+        else:
+            # No pathspec provided, get latest task for this step
+            task = get_latest_task_pathspec(flow.name, step_name)
+            logger(
+                f"To make spin even faster, provide complete pathspec {task.pathspec}",
+                system_msg=True,
+            )
+        from_start("SpinRuntime: after getting task")
+
+        # Get the original FlowDatastore so we can use it to access artifacts from the
+        # spun task
+        meta_dict = task.metadata_dict
+        ds_type = meta_dict["ds-type"]
+        ds_root = meta_dict["ds-root"]
+        orig_datastore_impl = [d for d in DATASTORES if d.TYPE == ds_type][0]
+        orig_datastore_impl.datastore_root = ds_root
+        spin_pathspec = task.pathspec
+        orig_flow_datastore = FlowDataStore(
+            flow.name,
+            environment=None,
+            storage_impl=orig_datastore_impl,
+            ds_root=ds_root,
+        )
+
+        self._filecache = FileCache()
+        orig_flow_datastore.set_metadata_cache(
+            TaskMetadataCache(self._filecache, ds_type, ds_root, flow.name)
+        )
+        orig_flow_datastore.ca_store.set_blob_cache(
+            FileBlobCache(
+                self._filecache, FileCache.flow_ds_id(ds_type, ds_root, flow.name)
+            )
+        )
+
+        self._orig_flow_datastore = orig_flow_datastore
+        self._spin_pathspec = spin_pathspec
+        self._persist = persist
+        self._spin_task = task
+        self._input_paths = None
+        self._split_index = None
+        self._whitelist_decorators = None
+        self._config_file_name = None
+        self._skip_decorators = skip_decorators
+        self._artifacts_module = artifacts_module
+        self._max_log_size = max_log_size
+        self._encoding = sys.stdout.encoding or "UTF-8"
+
+        # Create a new run_id for the spin task
+        self.run_id = self._metadata.new_run_id()
+        # Raise exception if we have a black listed decorator
+        for deco in self._step_func.decorators:
+            if deco.name in SPIN_DISALLOWED_DECORATORS:
+                raise MetaflowException(
+                    f"Spinning steps with @{deco.name} decorator is not supported."
+                )
+
+        for deco in self.whitelist_decorators:
+            deco.runtime_init(flow, graph, package, self.run_id)
+        from_start("SpinRuntime: after init decorators")
+
+    @property
+    def split_index(self):
+        """
+        Returns the split index, caching the result after the first access.
+        """
+        if self._split_index is None:
+            self._split_index = getattr(self._spin_task, "index", None)
+
+        return self._split_index
+
+    @property
+    def input_paths(self):
+        def _format_input_paths(task_pathspec, attempt):
+            _, run_id, step_name, task_id = task_pathspec.split("/")
+            return f"{run_id}/{step_name}/{task_id}/{attempt}"
+
+        if self._input_paths:
+            return self._input_paths
+
+        if self._step_func.name == "start":
+            from metaflow import Step
+
+            flow_name, run_id, _, _ = self._spin_pathspec.split("/")
+            task = Step(
+                f"{flow_name}/{run_id}/_parameters", _namespace_check=False
+            ).task
+            self._input_paths = [
+                _format_input_paths(task.pathspec, task.current_attempt)
+            ]
+        else:
+            parent_tasks = self._spin_task.parent_tasks
+            self._input_paths = [
+                _format_input_paths(t.pathspec, t.current_attempt) for t in parent_tasks
+            ]
+        return self._input_paths
+
+    @property
+    def whitelist_decorators(self):
+        if self._skip_decorators:
+            self._whitelist_decorators = []
+            return self._whitelist_decorators
+        if self._whitelist_decorators:
+            return self._whitelist_decorators
+        self._whitelist_decorators = [
+            deco
+            for deco in self._step_func.decorators
+            if any(deco.name.startswith(prefix) for prefix in SPIN_ALLOWED_DECORATORS)
+        ]
+        return self._whitelist_decorators
+
+    def _new_task(self, step, input_paths=None, **kwargs):
+        return Task(
+            flow_datastore=self._flow_datastore,
+            flow=self._flow,
+            step=step,
+            run_id=self.run_id,
+            metadata=self._metadata,
+            environment=self._environment,
+            entrypoint=self._entrypoint,
+            event_logger=self._event_logger,
+            monitor=self._monitor,
+            input_paths=input_paths,
+            decos=self.whitelist_decorators,
+            logger=self._logger,
+            split_index=self.split_index,
+            **kwargs,
+        )
+
+    def execute(self):
+        exception = None
+        with tempfile.NamedTemporaryFile(mode="w", encoding="utf-8") as config_file:
+            config_value = dump_config_values(self._flow)
+            if config_value:
+                json.dump(config_value, config_file)
+                config_file.flush()
+                self._config_file_name = config_file.name
+            else:
+                self._config_file_name = None
+            from_start("SpinRuntime: config values processed")
+            self.task = self._new_task(self._step_func.name, self.input_paths)
+            try:
+                self._launch_and_monitor_task()
+            except Exception as ex:
+                self._logger("Task failed.", system_msg=True, bad=True)
+                exception = ex
+                raise
+            finally:
+                for deco in self.whitelist_decorators:
+                    deco.runtime_finished(exception)
+
+    def _launch_and_monitor_task(self):
+        worker = Worker(
+            self.task,
+            self._max_log_size,
+            self._config_file_name,
+            orig_flow_datastore=self._orig_flow_datastore,
+            spin_pathspec=self._spin_pathspec,
+            artifacts_module=self._artifacts_module,
+            persist=self._persist,
+            skip_decorators=self._skip_decorators,
+        )
+        from_start("SpinRuntime: created worker")
+
+        poll = procpoll.make_poll()
+        fds = worker.fds()
+        for fd in fds:
+            poll.add(fd)
+
+        active_fds = set(fds)
+
+        while active_fds:
+            events = poll.poll(POLL_TIMEOUT)
+            for event in events:
+                if event.can_read:
+                    worker.read_logline(event.fd)
+                if event.is_terminated:
+                    poll.remove(event.fd)
+                    active_fds.remove(event.fd)
+        from_start("SpinRuntime: read loglines")
+        returncode = worker.terminate()
+        from_start("SpinRuntime: worker terminated")
+        if returncode != 0:
+            raise TaskFailed(self.task, f"Task failed with return code {returncode}")
+        else:
+            self._logger("Task finished successfully.", system_msg=True)
+
+
 class NativeRuntime(object):
     def __init__(
         self,
@@ -95,6 +363,7 @@ class NativeRuntime(object):
         max_num_splits=MAX_NUM_SPLITS,
         max_log_size=MAX_LOG_SIZE,
         resume_identifier=None,
+        skip_decorator_hooks=False,
     ):
         if run_id is None:
             self._run_id = metadata.new_run_id()
@@ -107,6 +376,7 @@ class NativeRuntime(object):
         self._flow_datastore = flow_datastore
         self._metadata = metadata
         self._environment = environment
+        self._package = package
         self._logger = logger
         self._max_workers = max_workers
         self._active_tasks = dict()  # Key: step name;
@@ -128,6 +398,7 @@ class NativeRuntime(object):
         self._ran_or_scheduled_task_index = set()
         self._reentrant = reentrant
         self._run_url = None
+        self._skip_decorator_hooks = skip_decorator_hooks
 
         # If steps_to_rerun is specified, we will not clone them in resume mode.
         self._steps_to_rerun = steps_to_rerun or {}
@@ -179,9 +450,10 @@ class NativeRuntime(object):
         # finished.
         self._control_num_splits = {}  # control_task -> num_splits mapping
 
-
-for
-deco
+        if not self._skip_decorator_hooks:
+            for step in flow:
+                for deco in step.decorators:
+                    deco.runtime_init(flow, graph, package, self._run_id)
 
     def _new_task(self, step, input_paths=None, **kwargs):
         if input_paths is None:
@@ -192,7 +464,7 @@ class NativeRuntime(object):
         if step in self._steps_to_rerun:
             may_clone = False
 
-        if step == "_parameters":
+        if step == "_parameters" or self._skip_decorator_hooks:
            decos = []
        else:
            decos = getattr(self._flow, step).decorators
@@ -285,6 +557,7 @@ class NativeRuntime(object):
        pathspec_index,
        cloned_task_pathspec_index,
        finished_tuple,
+        iteration_tuple,
        ubf_context,
        generate_task_obj,
        verbose=False,
@@ -329,7 +602,7 @@ class NativeRuntime(object):
                self._metadata,
                origin_ds_set=self._origin_ds_set,
            )
-            self._finished[(step_name, finished_tuple)] = task_pathspec
+            self._finished[(step_name, finished_tuple, iteration_tuple)] = task_pathspec
            self._is_cloned[task_pathspec] = True
        except Exception as e:
            self._logger(
@@ -410,6 +683,7 @@ class NativeRuntime(object):
            finished_tuple = tuple(
                [s._replace(value=0) for s in task_ds.get("_foreach_stack", ())]
            )
+            iteration_tuple = tuple(task_ds.get("_iteration_stack", ()))
            cloned_task_pathspec_index = pathspec_index.split("/")[1]
            if task_ds.get("_control_task_is_mapper_zero", False):
                # Replace None with index 0 for control task as it is part of the
@@ -435,6 +709,7 @@ class NativeRuntime(object):
                    pathspec_index,
                    cloned_task_pathspec_index,
                    finished_tuple,
+                    iteration_tuple,
                    is_ubf_mapper_task,
                    ubf_context,
                )
@@ -449,6 +724,7 @@ class NativeRuntime(object):
                pathspec_index,
                cloned_task_pathspec_index,
                finished_tuple,
+                iteration_tuple,
                ubf_context=ubf_context,
                generate_task_obj=generate_task_obj and (not is_ubf_mapper_task),
                verbose=verbose,
@@ -459,6 +735,7 @@ class NativeRuntime(object):
                pathspec_index,
                cloned_task_pathspec_index,
                finished_tuple,
+                iteration_tuple,
                is_ubf_mapper_task,
                ubf_context,
            ) in inputs
@@ -479,6 +756,7 @@ class NativeRuntime(object):
            self._queue_push("start", {"input_paths": [self._params_task.path]})
        else:
            self._queue_push("start", {})
+
        progress_tstamp = time.time()
        with tempfile.NamedTemporaryFile(mode="w", encoding="utf-8") as config_file:
            # Configurations are passed through a file to avoid overloading the
@@ -499,7 +777,74 @@ class NativeRuntime(object):
                ):
                    # 1. are any of the current workers finished?
                    if self._cloned_tasks:
-                        finished_tasks =
+                        finished_tasks = []
+
+                        # For loops (right now just recursive steps), we need to find
+                        # the exact frontier because if we queue all "successors" to all
+                        # the finished iterations, we would incorrectly launch multiple
+                        # successors. We therefore have to strip out all non-last
+                        # iterations *per* foreach branch.
+                        idx_per_finished_id = (
+                            {}
+                        )  # type: Dict[Tuple[str, Tuple[int, ...], Tuple[int, Tuple[int, ...]]]]
+                        for task in self._cloned_tasks:
+                            step_name, foreach_stack, iteration_stack = task.finished_id
+                            existing_task_idx = idx_per_finished_id.get(
+                                (step_name, foreach_stack), None
+                            )
+                            if existing_task_idx is not None:
+                                len_diff = len(iteration_stack) - len(
+                                    existing_task_idx[1]
+                                )
+                                # In this case, we need to keep only the latest iteration
+                                if (
+                                    len_diff == 0
+                                    and iteration_stack > existing_task_idx[1]
+                                ) or len_diff == -1:
+                                    # We remove the one we currently have and replace
+                                    # by this one. The second option means that we are
+                                    # adding the finished iteration marker.
+                                    existing_task = finished_tasks[existing_task_idx[0]]
+                                    # These are the first two lines of _queue_tasks
+                                    # We still consider the tasks finished so we need
+                                    # to update state to be clean.
+                                    self._finished[existing_task.finished_id] = (
+                                        existing_task.path
+                                    )
+                                    self._is_cloned[existing_task.path] = (
+                                        existing_task.is_cloned
+                                    )
+
+                                    finished_tasks[existing_task_idx[0]] = task
+                                    idx_per_finished_id[(step_name, foreach_stack)] = (
+                                        existing_task_idx[0],
+                                        iteration_stack,
+                                    )
+                                elif (
+                                    len_diff == 0
+                                    and iteration_stack < existing_task_idx[1]
+                                ) or len_diff == 1:
+                                    # The second option is when we have already marked
+                                    # the end of the iteration in self._finished and
+                                    # are now seeing a previous iteration.
+                                    # We just mark the task as finished but we don't
+                                    # put it in the finished_tasks list to pass to
+                                    # the _queue_tasks function
+                                    self._finished[task.finished_id] = task.path
+                                    self._is_cloned[task.path] = task.is_cloned
+                                else:
+                                    raise MetaflowInternalError(
+                                        "Unexpected recursive cloned tasks -- "
+                                        "this is a bug, please report it."
+                                    )
+                            else:
+                                # New entry
+                                finished_tasks.append(task)
+                                idx_per_finished_id[(step_name, foreach_stack)] = (
+                                    len(finished_tasks) - 1,
+                                    iteration_stack,
+                                )
+
                        # reset the list of cloned tasks and let poll_workers handle
                        # the remaining transition
                        self._cloned_tasks = []
@@ -566,12 +911,14 @@ class NativeRuntime(object):
            raise
        finally:
            # on finish clean tasks
-
-for
-deco.
+            if not self._skip_decorator_hooks:
+                for step in self._flow:
+                    for deco in step.decorators:
+                        deco.runtime_finished(exception)
+            self._run_exit_hooks()
 
        # assert that end was executed and it was successful
-        if ("end", ()) in self._finished:
+        if ("end", (), ()) in self._finished:
            if self._run_url:
                self._logger(
                    "Done! See the run in the UI at %s" % self._run_url,
@@ -591,6 +938,51 @@ class NativeRuntime(object):
                "The *end* step was not successful by the end of flow."
            )
 
+    def _run_exit_hooks(self):
+        try:
+            flow_decos = self._flow._flow_state[FlowStateItems.FLOW_DECORATORS]
+            exit_hook_decos = flow_decos.get("exit_hook", [])
+            if not exit_hook_decos:
+                return
+
+            successful = ("end", (), ()) in self._finished or self._clone_only
+            pathspec = f"{self._graph.name}/{self._run_id}"
+            flow_file = self._environment.get_environment_info()["script"]
+
+            def _call(fn_name):
+                try:
+                    result = (
+                        subprocess.check_output(
+                            args=[
+                                sys.executable,
+                                "-m",
+                                "metaflow.plugins.exit_hook.exit_hook_script",
+                                flow_file,
+                                fn_name,
+                                pathspec,
+                            ],
+                            env=os.environ,
+                        )
+                        .decode()
+                        .strip()
+                    )
+                    print(result)
+                except subprocess.CalledProcessError as e:
+                    print(f"[exit_hook] Hook '{fn_name}' failed with error: {e}")
+                except Exception as e:
+                    print(f"[exit_hook] Unexpected error in hook '{fn_name}': {e}")
+
+            # Call all exit hook functions regardless of individual failures
+            for fn_name in [
+                name
+                for deco in exit_hook_decos
+                for name in (deco.success_hooks if successful else deco.error_hooks)
+            ]:
+                _call(fn_name)
+
+        except Exception as ex:
+            pass  # do not fail due to exit hooks for whatever reason.
+
    def _killall(self):
        # If we are here, all children have received a signal and are shutting down.
        # We want to give them an opportunity to do so and then kill
@@ -621,30 +1013,70 @@ class NativeRuntime(object):
 
    # Given the current task information (task_index), the type of transition,
    # and the split index, return the new task index.
-    def _translate_index(
-
-
+    def _translate_index(
+        self, task, next_step, type, split_index=None, loop_mode=LoopBehavior.NONE
+    ):
+        match = re.match(r"^(.+)\[(.*)\]\[(.*)\]$", task.task_index)
+        old_match = re.match(r"^(.+)\[(.*)\]$", task.task_index)
        if match:
-            _, foreach_index = match.groups()
+            _, foreach_index, iteration_index = match.groups()
            # Convert foreach_index to a list of integers
            if len(foreach_index) > 0:
                foreach_index = foreach_index.split(",")
            else:
                foreach_index = []
+            # Ditto for iteration_index
+            if len(iteration_index) > 0:
+                iteration_index = iteration_index.split(",")
+            else:
+                iteration_index = []
+        elif old_match:
+            _, foreach_index = old_match.groups()
+            # Convert foreach_index to a list of integers
+            if len(foreach_index) > 0:
+                foreach_index = foreach_index.split(",")
+            else:
+                foreach_index = []
+            # Legacy case fallback. No iteration index exists for these runs.
+            iteration_index = []
        else:
            raise ValueError(
-                "Index not in the format of {run_id}/{step_name}[{foreach_index}]"
+                "Index not in the format of {run_id}/{step_name}[{foreach_index}][{iteration_index}]"
            )
+        if loop_mode == LoopBehavior.NONE:
+            # Check if we are entering a looping construct. Right now, only recursive
+            # steps are looping constructs
+            next_step_node = self._graph[next_step]
+            if (
+                next_step_node.type == "split-switch"
+                and next_step in next_step_node.out_funcs
+            ):
+                loop_mode = LoopBehavior.ENTERING
+
+        # Update iteration_index
+        if loop_mode == LoopBehavior.ENTERING:
+            # We are entering a loop, so we add a new iteration level
+            iteration_index.append("0")
+        elif loop_mode == LoopBehavior.EXITING:
+            iteration_index = iteration_index[:-1]
+        elif loop_mode == LoopBehavior.LOOPING:
+            if len(iteration_index) == 0:
+                raise MetaflowInternalError(
+                    "In looping mode but there is no iteration index"
+                )
+            iteration_index[-1] = str(int(iteration_index[-1]) + 1)
+        iteration_index = ",".join(iteration_index)
+
        if type == "linear":
-            return "%s[%s]" % (next_step, ",".join(foreach_index))
+            return "%s[%s][%s]" % (next_step, ",".join(foreach_index), iteration_index)
        elif type == "join":
            indices = []
            if len(foreach_index) > 0:
                indices = foreach_index[:-1]
-            return "%s[%s]" % (next_step, ",".join(indices))
+            return "%s[%s][%s]" % (next_step, ",".join(indices), iteration_index)
        elif type == "split":
            foreach_index.append(str(split_index))
-            return "%s[%s]" % (next_step, ",".join(foreach_index))
+            return "%s[%s][%s]" % (next_step, ",".join(foreach_index), iteration_index)
 
    # Store the parameters needed for task creation, so that pushing on items
    # onto the run_queue is an inexpensive operation.
@@ -728,17 +1160,19 @@ class NativeRuntime(object):
            # tasks is incorrect and contains the pathspec of the *cloned* run
            # but we don't use it for anything. We could look to clean it up though
            if not task.is_cloned:
-                _, foreach_stack = task.finished_id
+                _, foreach_stack, iteration_stack = task.finished_id
                top = foreach_stack[-1]
                bottom = list(foreach_stack[:-1])
                for i in range(num_splits):
                    s = tuple(bottom + [top._replace(index=i)])
-                    self._finished[(task.step, s)] = mapper_tasks[
+                    self._finished[(task.step, s, iteration_stack)] = mapper_tasks[
+                        i
+                    ]
                    self._is_cloned[mapper_tasks[i]] = False
 
            # Find and check status of control task and retrieve its pathspec
            # for retrieving unbounded foreach cardinality.
-            _, foreach_stack = task.finished_id
+            _, foreach_stack, iteration_stack = task.finished_id
            top = foreach_stack[-1]
            bottom = list(foreach_stack[:-1])
            s = tuple(bottom + [top._replace(index=None)])
@@ -747,7 +1181,7 @@ class NativeRuntime(object):
            # it will have index=0 instead of index=None.
            if task.results.get("_control_task_is_mapper_zero", False):
                s = tuple(bottom + [top._replace(index=0)])
-            control_path = self._finished.get((task.step, s))
+            control_path = self._finished.get((task.step, s, iteration_stack))
            if control_path:
                # Control task was successful.
                # Additionally check the state of (sibling) mapper tasks as well
@@ -756,7 +1190,9 @@ class NativeRuntime(object):
                required_tasks = []
                for i in range(num_splits):
                    s = tuple(bottom + [top._replace(index=i)])
-                    required_tasks.append(
+                    required_tasks.append(
+                        self._finished.get((task.step, s, iteration_stack))
+                    )
 
                if all(required_tasks):
                    index = self._translate_index(task, next_step, "join")
@@ -769,10 +1205,12 @@ class NativeRuntime(object):
        else:
            # matching_split is the split-parent of the finished task
            matching_split = self._graph[self._graph[next_step].split_parents[-1]]
-            _, foreach_stack = task.finished_id
-
+            _, foreach_stack, iteration_stack = task.finished_id
+
+            direct_parents = set(self._graph[next_step].in_funcs)
+
+            # next step is a foreach join
            if matching_split.type == "foreach":
-                # next step is a foreach join
 
                def siblings(foreach_stack):
                    top = foreach_stack[-1]
@@ -781,29 +1219,56 @@ class NativeRuntime(object):
                        yield tuple(bottom + [top._replace(index=index)])
 
                # required tasks are all split-siblings of the finished task
-                required_tasks =
-
-
+                required_tasks = list(
+                    filter(
+                        lambda x: x is not None,
+                        [
+                            self._finished.get((p, s, iteration_stack))
+                            for p in direct_parents
+                            for s in siblings(foreach_stack)
+                        ],
+                    )
+                )
+                required_count = task.finished_id[1][-1].num_splits
                join_type = "foreach"
                index = self._translate_index(task, next_step, "join")
            else:
                # next step is a split
-
-
-
-
-
+                required_tasks = list(
+                    filter(
+                        lambda x: x is not None,
+                        [
+                            self._finished.get((p, foreach_stack, iteration_stack))
+                            for p in direct_parents
+                        ],
+                    )
+                )
+
+                required_count = len(matching_split.out_funcs)
                join_type = "linear"
                index = self._translate_index(task, next_step, "linear")
-
-
-            # all tasks to be joined are ready. Schedule the next join step.
+            if len(required_tasks) == required_count:
+                # We have all the required previous tasks to schedule a join
                self._queue_push(
                    next_step,
                    {"input_paths": required_tasks, "join_type": join_type},
                    index,
                )
 
+    def _queue_task_switch(self, task, next_steps, is_recursive):
+        chosen_step = next_steps[0]
+
+        loop_mode = LoopBehavior.NONE
+        if is_recursive:
+            if chosen_step != task.step:
+                # We are exiting a loop
+                loop_mode = LoopBehavior.EXITING
+            else:
+                # We are staying in the loop
+                loop_mode = LoopBehavior.LOOPING
+        index = self._translate_index(task, chosen_step, "linear", None, loop_mode)
+        self._queue_push(chosen_step, {"input_paths": [task.path]}, index)
+
    def _queue_task_foreach(self, task, next_steps):
        # CHECK: this condition should be enforced by the linter but
        # let's assert that the assumption holds
@@ -880,7 +1345,39 @@ class NativeRuntime(object):
                next_steps = []
                foreach = None
            expected = self._graph[task.step].out_funcs
-
+
+            if self._graph[task.step].type == "split-switch":
+                is_recursive = task.step in self._graph[task.step].out_funcs
+                if len(next_steps) != 1:
+                    msg = (
+                        "Switch step *{step}* should transition to exactly "
+                        "one step at runtime, but got: {actual}"
+                    )
+                    raise MetaflowInternalError(
+                        msg.format(step=task.step, actual=", ".join(next_steps))
+                    )
+                if next_steps[0] not in expected:
+                    msg = (
+                        "Switch step *{step}* transitioned to unexpected "
+                        "step *{actual}*. Expected one of: {expected}"
+                    )
+                    raise MetaflowInternalError(
+                        msg.format(
+                            step=task.step,
+                            actual=next_steps[0],
+                            expected=", ".join(expected),
+                        )
+                    )
+                # When exiting a recursive loop, we mark that the loop itself has
+                # finished by adding a special entry in self._finished which has
+                # an iteration stack that is shorter (ie: we are out of the loop) so
+                # that we can then find it when looking at successor tasks to launch.
+                if is_recursive and next_steps[0] != task.step:
+                    step_name, finished_tuple, iteration_tuple = task.finished_id
+                    self._finished[
+                        (step_name, finished_tuple, iteration_tuple[:-1])
+                    ] = task.path
+            elif next_steps != expected:
                msg = (
                    "Based on static analysis of the code, step *{step}* "
                    "was expected to transition to step(s) *{expected}*. "
@@ -904,6 +1401,9 @@ class NativeRuntime(object):
            elif foreach:
                # Next step is a foreach child
                self._queue_task_foreach(task, next_steps)
+            elif self._graph[task.step].type == "split-switch":
+                # Current step is switch - queue the chosen step
+                self._queue_task_switch(task, next_steps, is_recursive)
            else:
                # Next steps are normal linear steps
                for step in next_steps:
@@ -960,6 +1460,22 @@ class NativeRuntime(object):
            # Initialize the task (which can be expensive using remote datastores)
            # before launching the worker so that cost is amortized over time, instead
            # of doing it during _queue_push.
+            if (
+                FEAT_ALWAYS_UPLOAD_CODE_PACKAGE
+                and "METAFLOW_CODE_SHA" not in os.environ
+            ):
+                # We check if the code package is uploaded and, if so, we set the
+                # environment variables that will cause the metadata service to
+                # register the code package with the task created in _new_task below
+                code_sha = self._package.package_sha(timeout=0.01)
+                if code_sha:
+                    os.environ["METAFLOW_CODE_SHA"] = code_sha
+                    os.environ["METAFLOW_CODE_URL"] = self._package.package_url()
+                    os.environ["METAFLOW_CODE_DS"] = self._flow_datastore.TYPE
+                    os.environ["METAFLOW_CODE_METADATA"] = (
+                        self._package.package_metadata
+                    )
+
            task = self._new_task(step, **task_kwargs)
            self._launch_worker(task)
 
@@ -1428,13 +1944,15 @@ class Task(object):
    @property
    def finished_id(self):
        # note: id is not available before the task has finished.
-        # Index already identifies the task within the foreach
-        #
+        # Index already identifies the task within the foreach and loop.
+        # We will remove foreach value so that it is easier to
        # identify siblings within a foreach.
        foreach_stack_tuple = tuple(
            [s._replace(value=0) for s in self.results["_foreach_stack"]]
        )
-
+        # _iteration_stack requires a fallback, as it does not exist for runs before v2.17.4
+        iteration_stack_tuple = tuple(self.results.get("_iteration_stack", []))
+        return (self.step, foreach_stack_tuple, iteration_stack_tuple)
 
    @property
    def is_cloned(self):
@@ -1508,9 +2026,29 @@ class CLIArgs(object):
    for step execution in StepDecorator.runtime_step_cli().
    """
 
-    def __init__(
+    def __init__(
+        self,
+        task,
+        orig_flow_datastore=None,
+        spin_pathspec=None,
+        artifacts_module=None,
+        persist=True,
+        skip_decorators=False,
+    ):
        self.task = task
+        if orig_flow_datastore is not None:
+            self.orig_flow_datastore = "%s@%s" % (
+                orig_flow_datastore.TYPE,
+                orig_flow_datastore.datastore_root,
+            )
+        else:
+            self.orig_flow_datastore = None
+        self.spin_pathspec = spin_pathspec
+        self.artifacts_module = artifacts_module
+        self.persist = persist
+        self.skip_decorators = skip_decorators
        self.entrypoint = list(task.entrypoint)
+        step_obj = getattr(self.task.flow, self.task.step)
        self.top_level_options = {
            "quiet": True,
            "metadata": self.task.metadata_type,
@@ -1522,8 +2060,12 @@ class CLIArgs(object):
            "datastore-root": self.task.datastore_sysroot,
            "with": [
                deco.make_decorator_spec()
-                for deco in
-
+                for deco in chain(
+                    self.task.decos,
+                    step_obj.wrappers,
+                    step_obj.config_decorators,
+                )
+                if not deco.statically_defined and deco.inserted_by is None
            ],
        }
 
@@ -1536,27 +2078,55 @@ class CLIArgs(object):
        # We also pass configuration options using the kv.<name> syntax which will cause
        # the configuration options to be loaded from the CONFIG file (or local-config-file
        # in the case of the local runtime)
-        configs = self.task.flow._flow_state.
+        configs = self.task.flow._flow_state[FlowStateItems.CONFIGS]
        if configs:
            self.top_level_options["config-value"] = [
                (k, ConfigInput.make_key_name(k)) for k in configs
            ]
 
+        if spin_pathspec:
+            self.spin_args()
+        else:
+            self.default_args()
+
+    def default_args(self):
        self.commands = ["step"]
        self.command_args = [self.task.step]
        self.command_options = {
-            "run-id": task.run_id,
-            "task-id": task.task_id,
-            "input-paths": compress_list(task.input_paths),
-            "split-index": task.split_index,
-            "retry-count": task.retries,
-            "max-user-code-retries": task.user_code_retries,
-            "tag": task.tags,
+            "run-id": self.task.run_id,
+            "task-id": self.task.task_id,
+            "input-paths": compress_list(self.task.input_paths),
+            "split-index": self.task.split_index,
+            "retry-count": self.task.retries,
+            "max-user-code-retries": self.task.user_code_retries,
+            "tag": self.task.tags,
            "namespace": get_namespace() or "",
-            "ubf-context": task.ubf_context,
+            "ubf-context": self.task.ubf_context,
        }
        self.env = {}
 
+    def spin_args(self):
+        self.commands = ["spin-step"]
+        self.command_args = [self.task.step]
+
+        self.command_options = {
+            "run-id": self.task.run_id,
+            "task-id": self.task.task_id,
+            "input-paths": compress_list(self.task.input_paths),
+            "split-index": self.task.split_index,
+            "retry-count": self.task.retries,
+            "max-user-code-retries": self.task.user_code_retries,
+            "namespace": get_namespace() or "",
+            "orig-flow-datastore": self.orig_flow_datastore,
+            "artifacts-module": self.artifacts_module,
+            "skip-decorators": self.skip_decorators,
+        }
+        if self.persist:
+            self.command_options["persist"] = True
+        else:
+            self.command_options["no-persist"] = True
+        self.env = {}
+
    def get_args(self):
        # TODO: Make one with dict_to_cli_options; see cli_args.py for more detail
        def _options(mapping):
@@ -1595,9 +2165,24 @@ class CLIArgs(object):
 
 
 class Worker(object):
-    def __init__(
+    def __init__(
+        self,
+        task,
+        max_logs_size,
+        config_file_name,
+        orig_flow_datastore=None,
+        spin_pathspec=None,
+        artifacts_module=None,
+        persist=True,
+        skip_decorators=False,
+    ):
        self.task = task
        self._config_file_name = config_file_name
+        self._orig_flow_datastore = orig_flow_datastore
+        self._spin_pathspec = spin_pathspec
+        self._artifacts_module = artifacts_module
+        self._skip_decorators = skip_decorators
+        self._persist = persist
        self._proc = self._launch()
 
        if task.retries > task.user_code_retries:
@@ -1629,7 +2214,14 @@ class Worker(object):
    # not it is properly shut down)
 
    def _launch(self):
-        args = CLIArgs(
+        args = CLIArgs(
+            self.task,
+            orig_flow_datastore=self._orig_flow_datastore,
+            spin_pathspec=self._spin_pathspec,
+            artifacts_module=self._artifacts_module,
+            persist=self._persist,
+            skip_decorators=self._skip_decorators,
+        )
        env = dict(os.environ)
 
        if self.task.clone_run_id:
@@ -1662,6 +2254,7 @@ class Worker(object):
        # by read_logline() below that relies on readline() not blocking
        # print('running', args)
        cmdline = args.get_args()
+        from_start(f"Command line: {' '.join(cmdline)}")
        debug.subcommand_exec(cmdline)
        return subprocess.Popen(
            cmdline,
@@ -1784,13 +2377,14 @@ class Worker(object):
            else:
                self.emit_log(b"Task failed.", self._stderr, system_msg=True)
        else:
-
-
-
-
-
-
-
+            if not self._spin_pathspec:
+                num = self.task.results["_foreach_num_splits"]
+                if num:
+                    self.task.log(
+                        "Foreach yields %d child steps." % num,
+                        system_msg=True,
+                        pid=self._proc.pid,
+                    )
            self.task.log(
                "Task finished successfully.", system_msg=True, pid=self._proc.pid
            )