ob-metaflow 2.11.13.1__py2.py3-none-any.whl → 2.19.7.1rc0__py2.py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- metaflow/R.py +10 -7
- metaflow/__init__.py +40 -25
- metaflow/_vendor/imghdr/__init__.py +186 -0
- metaflow/_vendor/importlib_metadata/__init__.py +1063 -0
- metaflow/_vendor/importlib_metadata/_adapters.py +68 -0
- metaflow/_vendor/importlib_metadata/_collections.py +30 -0
- metaflow/_vendor/importlib_metadata/_compat.py +71 -0
- metaflow/_vendor/importlib_metadata/_functools.py +104 -0
- metaflow/_vendor/importlib_metadata/_itertools.py +73 -0
- metaflow/_vendor/importlib_metadata/_meta.py +48 -0
- metaflow/_vendor/importlib_metadata/_text.py +99 -0
- metaflow/_vendor/importlib_metadata/py.typed +0 -0
- metaflow/_vendor/typeguard/__init__.py +48 -0
- metaflow/_vendor/typeguard/_checkers.py +1070 -0
- metaflow/_vendor/typeguard/_config.py +108 -0
- metaflow/_vendor/typeguard/_decorators.py +233 -0
- metaflow/_vendor/typeguard/_exceptions.py +42 -0
- metaflow/_vendor/typeguard/_functions.py +308 -0
- metaflow/_vendor/typeguard/_importhook.py +213 -0
- metaflow/_vendor/typeguard/_memo.py +48 -0
- metaflow/_vendor/typeguard/_pytest_plugin.py +127 -0
- metaflow/_vendor/typeguard/_suppression.py +86 -0
- metaflow/_vendor/typeguard/_transformer.py +1229 -0
- metaflow/_vendor/typeguard/_union_transformer.py +55 -0
- metaflow/_vendor/typeguard/_utils.py +173 -0
- metaflow/_vendor/typeguard/py.typed +0 -0
- metaflow/_vendor/typing_extensions.py +3641 -0
- metaflow/_vendor/v3_7/importlib_metadata/__init__.py +1063 -0
- metaflow/_vendor/v3_7/importlib_metadata/_adapters.py +68 -0
- metaflow/_vendor/v3_7/importlib_metadata/_collections.py +30 -0
- metaflow/_vendor/v3_7/importlib_metadata/_compat.py +71 -0
- metaflow/_vendor/v3_7/importlib_metadata/_functools.py +104 -0
- metaflow/_vendor/v3_7/importlib_metadata/_itertools.py +73 -0
- metaflow/_vendor/v3_7/importlib_metadata/_meta.py +48 -0
- metaflow/_vendor/v3_7/importlib_metadata/_text.py +99 -0
- metaflow/_vendor/v3_7/importlib_metadata/py.typed +0 -0
- metaflow/_vendor/v3_7/typeguard/__init__.py +48 -0
- metaflow/_vendor/v3_7/typeguard/_checkers.py +906 -0
- metaflow/_vendor/v3_7/typeguard/_config.py +108 -0
- metaflow/_vendor/v3_7/typeguard/_decorators.py +237 -0
- metaflow/_vendor/v3_7/typeguard/_exceptions.py +42 -0
- metaflow/_vendor/v3_7/typeguard/_functions.py +310 -0
- metaflow/_vendor/v3_7/typeguard/_importhook.py +213 -0
- metaflow/_vendor/v3_7/typeguard/_memo.py +48 -0
- metaflow/_vendor/v3_7/typeguard/_pytest_plugin.py +100 -0
- metaflow/_vendor/v3_7/typeguard/_suppression.py +88 -0
- metaflow/_vendor/v3_7/typeguard/_transformer.py +1207 -0
- metaflow/_vendor/v3_7/typeguard/_union_transformer.py +54 -0
- metaflow/_vendor/v3_7/typeguard/_utils.py +169 -0
- metaflow/_vendor/v3_7/typeguard/py.typed +0 -0
- metaflow/_vendor/v3_7/typing_extensions.py +3072 -0
- metaflow/_vendor/yaml/__init__.py +427 -0
- metaflow/_vendor/yaml/composer.py +139 -0
- metaflow/_vendor/yaml/constructor.py +748 -0
- metaflow/_vendor/yaml/cyaml.py +101 -0
- metaflow/_vendor/yaml/dumper.py +62 -0
- metaflow/_vendor/yaml/emitter.py +1137 -0
- metaflow/_vendor/yaml/error.py +75 -0
- metaflow/_vendor/yaml/events.py +86 -0
- metaflow/_vendor/yaml/loader.py +63 -0
- metaflow/_vendor/yaml/nodes.py +49 -0
- metaflow/_vendor/yaml/parser.py +589 -0
- metaflow/_vendor/yaml/reader.py +185 -0
- metaflow/_vendor/yaml/representer.py +389 -0
- metaflow/_vendor/yaml/resolver.py +227 -0
- metaflow/_vendor/yaml/scanner.py +1435 -0
- metaflow/_vendor/yaml/serializer.py +111 -0
- metaflow/_vendor/yaml/tokens.py +104 -0
- metaflow/cards.py +5 -0
- metaflow/cli.py +331 -785
- metaflow/cli_args.py +17 -0
- metaflow/cli_components/__init__.py +0 -0
- metaflow/cli_components/dump_cmd.py +96 -0
- metaflow/cli_components/init_cmd.py +52 -0
- metaflow/cli_components/run_cmds.py +546 -0
- metaflow/cli_components/step_cmd.py +334 -0
- metaflow/cli_components/utils.py +140 -0
- metaflow/client/__init__.py +1 -0
- metaflow/client/core.py +467 -73
- metaflow/client/filecache.py +75 -35
- metaflow/clone_util.py +7 -1
- metaflow/cmd/code/__init__.py +231 -0
- metaflow/cmd/develop/stub_generator.py +756 -288
- metaflow/cmd/develop/stubs.py +12 -28
- metaflow/cmd/main_cli.py +6 -4
- metaflow/cmd/make_wrapper.py +78 -0
- metaflow/datastore/__init__.py +1 -0
- metaflow/datastore/content_addressed_store.py +41 -10
- metaflow/datastore/datastore_set.py +11 -2
- metaflow/datastore/flow_datastore.py +156 -10
- metaflow/datastore/spin_datastore.py +91 -0
- metaflow/datastore/task_datastore.py +154 -39
- metaflow/debug.py +5 -0
- metaflow/decorators.py +404 -78
- metaflow/exception.py +8 -2
- metaflow/extension_support/__init__.py +527 -376
- metaflow/extension_support/_empty_file.py +2 -2
- metaflow/extension_support/plugins.py +49 -31
- metaflow/flowspec.py +482 -33
- metaflow/graph.py +210 -42
- metaflow/includefile.py +84 -40
- metaflow/lint.py +141 -22
- metaflow/meta_files.py +13 -0
- metaflow/{metadata → metadata_provider}/heartbeat.py +24 -8
- metaflow/{metadata → metadata_provider}/metadata.py +86 -1
- metaflow/metaflow_config.py +175 -28
- metaflow/metaflow_config_funcs.py +51 -3
- metaflow/metaflow_current.py +4 -10
- metaflow/metaflow_environment.py +139 -53
- metaflow/metaflow_git.py +115 -0
- metaflow/metaflow_profile.py +18 -0
- metaflow/metaflow_version.py +150 -66
- metaflow/mflog/__init__.py +4 -3
- metaflow/mflog/save_logs.py +2 -2
- metaflow/multicore_utils.py +31 -14
- metaflow/package/__init__.py +673 -0
- metaflow/packaging_sys/__init__.py +880 -0
- metaflow/packaging_sys/backend.py +128 -0
- metaflow/packaging_sys/distribution_support.py +153 -0
- metaflow/packaging_sys/tar_backend.py +99 -0
- metaflow/packaging_sys/utils.py +54 -0
- metaflow/packaging_sys/v1.py +527 -0
- metaflow/parameters.py +149 -28
- metaflow/plugins/__init__.py +74 -5
- metaflow/plugins/airflow/airflow.py +40 -25
- metaflow/plugins/airflow/airflow_cli.py +22 -5
- metaflow/plugins/airflow/airflow_decorator.py +1 -1
- metaflow/plugins/airflow/airflow_utils.py +5 -3
- metaflow/plugins/airflow/sensors/base_sensor.py +4 -4
- metaflow/plugins/airflow/sensors/external_task_sensor.py +2 -2
- metaflow/plugins/airflow/sensors/s3_sensor.py +2 -2
- metaflow/plugins/argo/argo_client.py +78 -33
- metaflow/plugins/argo/argo_events.py +6 -6
- metaflow/plugins/argo/argo_workflows.py +2410 -527
- metaflow/plugins/argo/argo_workflows_cli.py +571 -121
- metaflow/plugins/argo/argo_workflows_decorator.py +43 -12
- metaflow/plugins/argo/argo_workflows_deployer.py +106 -0
- metaflow/plugins/argo/argo_workflows_deployer_objects.py +453 -0
- metaflow/plugins/argo/capture_error.py +73 -0
- metaflow/plugins/argo/conditional_input_paths.py +35 -0
- metaflow/plugins/argo/exit_hooks.py +209 -0
- metaflow/plugins/argo/jobset_input_paths.py +15 -0
- metaflow/plugins/argo/param_val.py +19 -0
- metaflow/plugins/aws/aws_client.py +10 -3
- metaflow/plugins/aws/aws_utils.py +55 -2
- metaflow/plugins/aws/batch/batch.py +72 -5
- metaflow/plugins/aws/batch/batch_cli.py +33 -10
- metaflow/plugins/aws/batch/batch_client.py +4 -3
- metaflow/plugins/aws/batch/batch_decorator.py +102 -35
- metaflow/plugins/aws/secrets_manager/aws_secrets_manager_secrets_provider.py +13 -10
- metaflow/plugins/aws/step_functions/dynamo_db_client.py +0 -3
- metaflow/plugins/aws/step_functions/production_token.py +1 -1
- metaflow/plugins/aws/step_functions/step_functions.py +65 -8
- metaflow/plugins/aws/step_functions/step_functions_cli.py +101 -7
- metaflow/plugins/aws/step_functions/step_functions_decorator.py +1 -2
- metaflow/plugins/aws/step_functions/step_functions_deployer.py +97 -0
- metaflow/plugins/aws/step_functions/step_functions_deployer_objects.py +264 -0
- metaflow/plugins/azure/azure_exceptions.py +1 -1
- metaflow/plugins/azure/azure_secret_manager_secrets_provider.py +240 -0
- metaflow/plugins/azure/azure_tail.py +1 -1
- metaflow/plugins/azure/includefile_support.py +2 -0
- metaflow/plugins/cards/card_cli.py +66 -30
- metaflow/plugins/cards/card_creator.py +25 -1
- metaflow/plugins/cards/card_datastore.py +21 -49
- metaflow/plugins/cards/card_decorator.py +132 -8
- metaflow/plugins/cards/card_modules/basic.py +112 -17
- metaflow/plugins/cards/card_modules/bundle.css +1 -1
- metaflow/plugins/cards/card_modules/card.py +16 -1
- metaflow/plugins/cards/card_modules/chevron/renderer.py +1 -1
- metaflow/plugins/cards/card_modules/components.py +665 -28
- metaflow/plugins/cards/card_modules/convert_to_native_type.py +36 -7
- metaflow/plugins/cards/card_modules/json_viewer.py +232 -0
- metaflow/plugins/cards/card_modules/main.css +1 -0
- metaflow/plugins/cards/card_modules/main.js +68 -49
- metaflow/plugins/cards/card_modules/renderer_tools.py +1 -0
- metaflow/plugins/cards/card_modules/test_cards.py +26 -12
- metaflow/plugins/cards/card_server.py +39 -14
- metaflow/plugins/cards/component_serializer.py +2 -9
- metaflow/plugins/cards/metadata.py +22 -0
- metaflow/plugins/catch_decorator.py +9 -0
- metaflow/plugins/datastores/azure_storage.py +10 -1
- metaflow/plugins/datastores/gs_storage.py +6 -2
- metaflow/plugins/datastores/local_storage.py +12 -6
- metaflow/plugins/datastores/spin_storage.py +12 -0
- metaflow/plugins/datatools/local.py +2 -0
- metaflow/plugins/datatools/s3/s3.py +126 -75
- metaflow/plugins/datatools/s3/s3op.py +254 -121
- metaflow/plugins/env_escape/__init__.py +3 -3
- metaflow/plugins/env_escape/client_modules.py +102 -72
- metaflow/plugins/env_escape/server.py +7 -0
- metaflow/plugins/env_escape/stub.py +24 -5
- metaflow/plugins/events_decorator.py +343 -185
- metaflow/plugins/exit_hook/__init__.py +0 -0
- metaflow/plugins/exit_hook/exit_hook_decorator.py +46 -0
- metaflow/plugins/exit_hook/exit_hook_script.py +52 -0
- metaflow/plugins/gcp/__init__.py +1 -1
- metaflow/plugins/gcp/gcp_secret_manager_secrets_provider.py +11 -6
- metaflow/plugins/gcp/gs_tail.py +10 -6
- metaflow/plugins/gcp/includefile_support.py +3 -0
- metaflow/plugins/kubernetes/kube_utils.py +108 -0
- metaflow/plugins/kubernetes/kubernetes.py +411 -130
- metaflow/plugins/kubernetes/kubernetes_cli.py +168 -36
- metaflow/plugins/kubernetes/kubernetes_client.py +104 -2
- metaflow/plugins/kubernetes/kubernetes_decorator.py +246 -88
- metaflow/plugins/kubernetes/kubernetes_job.py +253 -581
- metaflow/plugins/kubernetes/kubernetes_jobsets.py +1071 -0
- metaflow/plugins/kubernetes/spot_metadata_cli.py +69 -0
- metaflow/plugins/kubernetes/spot_monitor_sidecar.py +109 -0
- metaflow/plugins/logs_cli.py +359 -0
- metaflow/plugins/{metadata → metadata_providers}/local.py +144 -84
- metaflow/plugins/{metadata → metadata_providers}/service.py +103 -26
- metaflow/plugins/metadata_providers/spin.py +16 -0
- metaflow/plugins/package_cli.py +36 -24
- metaflow/plugins/parallel_decorator.py +128 -11
- metaflow/plugins/parsers.py +16 -0
- metaflow/plugins/project_decorator.py +51 -5
- metaflow/plugins/pypi/bootstrap.py +357 -105
- metaflow/plugins/pypi/conda_decorator.py +82 -81
- metaflow/plugins/pypi/conda_environment.py +187 -52
- metaflow/plugins/pypi/micromamba.py +157 -47
- metaflow/plugins/pypi/parsers.py +268 -0
- metaflow/plugins/pypi/pip.py +88 -13
- metaflow/plugins/pypi/pypi_decorator.py +37 -1
- metaflow/plugins/pypi/utils.py +48 -2
- metaflow/plugins/resources_decorator.py +2 -2
- metaflow/plugins/secrets/__init__.py +3 -0
- metaflow/plugins/secrets/secrets_decorator.py +26 -181
- metaflow/plugins/secrets/secrets_func.py +49 -0
- metaflow/plugins/secrets/secrets_spec.py +101 -0
- metaflow/plugins/secrets/utils.py +74 -0
- metaflow/plugins/tag_cli.py +4 -7
- metaflow/plugins/test_unbounded_foreach_decorator.py +41 -6
- metaflow/plugins/timeout_decorator.py +3 -3
- metaflow/plugins/uv/__init__.py +0 -0
- metaflow/plugins/uv/bootstrap.py +128 -0
- metaflow/plugins/uv/uv_environment.py +72 -0
- metaflow/procpoll.py +1 -1
- metaflow/pylint_wrapper.py +5 -1
- metaflow/runner/__init__.py +0 -0
- metaflow/runner/click_api.py +717 -0
- metaflow/runner/deployer.py +470 -0
- metaflow/runner/deployer_impl.py +201 -0
- metaflow/runner/metaflow_runner.py +714 -0
- metaflow/runner/nbdeploy.py +132 -0
- metaflow/runner/nbrun.py +225 -0
- metaflow/runner/subprocess_manager.py +650 -0
- metaflow/runner/utils.py +335 -0
- metaflow/runtime.py +1078 -260
- metaflow/sidecar/sidecar_worker.py +1 -1
- metaflow/system/__init__.py +5 -0
- metaflow/system/system_logger.py +85 -0
- metaflow/system/system_monitor.py +108 -0
- metaflow/system/system_utils.py +19 -0
- metaflow/task.py +521 -225
- metaflow/tracing/__init__.py +7 -7
- metaflow/tracing/span_exporter.py +31 -38
- metaflow/tracing/tracing_modules.py +38 -43
- metaflow/tuple_util.py +27 -0
- metaflow/user_configs/__init__.py +0 -0
- metaflow/user_configs/config_options.py +563 -0
- metaflow/user_configs/config_parameters.py +598 -0
- metaflow/user_decorators/__init__.py +0 -0
- metaflow/user_decorators/common.py +144 -0
- metaflow/user_decorators/mutable_flow.py +512 -0
- metaflow/user_decorators/mutable_step.py +424 -0
- metaflow/user_decorators/user_flow_decorator.py +264 -0
- metaflow/user_decorators/user_step_decorator.py +749 -0
- metaflow/util.py +243 -27
- metaflow/vendor.py +23 -7
- metaflow/version.py +1 -1
- ob_metaflow-2.19.7.1rc0.data/data/share/metaflow/devtools/Makefile +355 -0
- ob_metaflow-2.19.7.1rc0.data/data/share/metaflow/devtools/Tiltfile +726 -0
- ob_metaflow-2.19.7.1rc0.data/data/share/metaflow/devtools/pick_services.sh +105 -0
- ob_metaflow-2.19.7.1rc0.dist-info/METADATA +87 -0
- ob_metaflow-2.19.7.1rc0.dist-info/RECORD +445 -0
- {ob_metaflow-2.11.13.1.dist-info → ob_metaflow-2.19.7.1rc0.dist-info}/WHEEL +1 -1
- {ob_metaflow-2.11.13.1.dist-info → ob_metaflow-2.19.7.1rc0.dist-info}/entry_points.txt +1 -0
- metaflow/_vendor/v3_5/__init__.py +0 -1
- metaflow/_vendor/v3_5/importlib_metadata/__init__.py +0 -644
- metaflow/_vendor/v3_5/importlib_metadata/_compat.py +0 -152
- metaflow/package.py +0 -188
- ob_metaflow-2.11.13.1.dist-info/METADATA +0 -85
- ob_metaflow-2.11.13.1.dist-info/RECORD +0 -308
- /metaflow/_vendor/{v3_5/zipp.py → zipp.py} +0 -0
- /metaflow/{metadata → metadata_provider}/__init__.py +0 -0
- /metaflow/{metadata → metadata_provider}/util.py +0 -0
- /metaflow/plugins/{metadata → metadata_providers}/__init__.py +0 -0
- {ob_metaflow-2.11.13.1.dist-info → ob_metaflow-2.19.7.1rc0.dist-info/licenses}/LICENSE +0 -0
- {ob_metaflow-2.11.13.1.dist-info → ob_metaflow-2.19.7.1rc0.dist-info}/top_level.txt +0 -0
metaflow/runtime.py
CHANGED
|
@@ -4,39 +4,61 @@ Local backend
|
|
|
4
4
|
Execute the flow with a native runtime
|
|
5
5
|
using local / remote processes
|
|
6
6
|
"""
|
|
7
|
+
|
|
7
8
|
from __future__ import print_function
|
|
9
|
+
import json
|
|
8
10
|
import os
|
|
9
11
|
import sys
|
|
10
12
|
import fcntl
|
|
13
|
+
import re
|
|
14
|
+
import tempfile
|
|
11
15
|
import time
|
|
12
16
|
import subprocess
|
|
13
17
|
from datetime import datetime
|
|
18
|
+
from enum import Enum
|
|
14
19
|
from io import BytesIO
|
|
20
|
+
from itertools import chain
|
|
15
21
|
from functools import partial
|
|
16
22
|
from concurrent import futures
|
|
17
23
|
|
|
24
|
+
from typing import Dict, Tuple
|
|
18
25
|
from metaflow.datastore.exceptions import DataException
|
|
26
|
+
from contextlib import contextmanager
|
|
19
27
|
|
|
20
28
|
from . import get_namespace
|
|
21
|
-
from .
|
|
22
|
-
from .
|
|
29
|
+
from .client.filecache import FileCache, FileBlobCache, TaskMetadataCache
|
|
30
|
+
from .metadata_provider import MetaDatum
|
|
31
|
+
from .metaflow_config import (
|
|
32
|
+
FEAT_ALWAYS_UPLOAD_CODE_PACKAGE,
|
|
33
|
+
MAX_ATTEMPTS,
|
|
34
|
+
UI_URL,
|
|
35
|
+
SPIN_ALLOWED_DECORATORS,
|
|
36
|
+
SPIN_DISALLOWED_DECORATORS,
|
|
37
|
+
)
|
|
38
|
+
from .metaflow_profile import from_start
|
|
39
|
+
from .plugins import DATASTORES
|
|
23
40
|
from .exception import (
|
|
24
41
|
MetaflowException,
|
|
25
42
|
MetaflowInternalError,
|
|
26
43
|
METAFLOW_EXIT_DISALLOW_RETRY,
|
|
27
44
|
)
|
|
28
45
|
from . import procpoll
|
|
29
|
-
from .datastore import TaskDataStoreSet
|
|
46
|
+
from .datastore import FlowDataStore, TaskDataStoreSet
|
|
30
47
|
from .debug import debug
|
|
31
48
|
from .decorators import flow_decorators
|
|
49
|
+
from .flowspec import FlowStateItems
|
|
32
50
|
from .mflog import mflog, RUNTIME_LOG_SOURCE
|
|
33
|
-
from .util import to_unicode, compress_list, unicode_type
|
|
51
|
+
from .util import to_unicode, compress_list, unicode_type, get_latest_task_pathspec
|
|
34
52
|
from .clone_util import clone_task_helper
|
|
35
53
|
from .unbounded_foreach import (
|
|
36
54
|
CONTROL_TASK_TAG,
|
|
37
55
|
UBF_CONTROL,
|
|
38
56
|
UBF_TASK,
|
|
39
57
|
)
|
|
58
|
+
|
|
59
|
+
from .user_configs.config_options import ConfigInput
|
|
60
|
+
from .user_configs.config_parameters import dump_config_values
|
|
61
|
+
|
|
40
62
|
import metaflow.tracing as tracing
|
|
41
63
|
|
|
42
64
|
MAX_WORKERS = 16
|
|
@@ -47,9 +69,24 @@ PROGRESS_INTERVAL = 300 # s
|
|
|
47
69
|
# The following is a list of the (data) artifacts used by the runtime while
|
|
48
70
|
# executing a flow. These are prefetched during the resume operation by
|
|
49
71
|
# leveraging the TaskDataStoreSet.
|
|
50
|
-
PREFETCH_DATA_ARTIFACTS = [
|
|
72
|
+
PREFETCH_DATA_ARTIFACTS = [
|
|
73
|
+
"_foreach_stack",
|
|
74
|
+
"_iteration_stack",
|
|
75
|
+
"_task_ok",
|
|
76
|
+
"_transition",
|
|
77
|
+
"_control_mapper_tasks",
|
|
78
|
+
"_control_task_is_mapper_zero",
|
|
79
|
+
]
|
|
51
80
|
RESUME_POLL_SECONDS = 60
|
|
52
81
|
|
|
82
|
+
|
|
83
|
+
class LoopBehavior(Enum):
|
|
84
|
+
NONE = "none"
|
|
85
|
+
ENTERING = "entering"
|
|
86
|
+
EXITING = "exiting"
|
|
87
|
+
LOOPING = "looping"
|
|
88
|
+
|
|
89
|
+
|
|
53
90
|
# Runtime must use logsource=RUNTIME_LOG_SOURCE for all loglines that it
|
|
54
91
|
# formats according to mflog. See a comment in mflog.__init__
|
|
55
92
|
mflog_msg = partial(mflog.decorate, RUNTIME_LOG_SOURCE)
|
|
@@ -57,6 +94,253 @@ mflog_msg = partial(mflog.decorate, RUNTIME_LOG_SOURCE)
|
|
|
57
94
|
# TODO option: output dot graph periodically about execution
|
|
58
95
|
|
|
59
96
|
|
|
97
|
+
class SpinRuntime(object):
|
|
98
|
+
def __init__(
|
|
99
|
+
self,
|
|
100
|
+
flow,
|
|
101
|
+
graph,
|
|
102
|
+
flow_datastore,
|
|
103
|
+
metadata,
|
|
104
|
+
environment,
|
|
105
|
+
package,
|
|
106
|
+
logger,
|
|
107
|
+
entrypoint,
|
|
108
|
+
event_logger,
|
|
109
|
+
monitor,
|
|
110
|
+
step_func,
|
|
111
|
+
step_name,
|
|
112
|
+
spin_pathspec,
|
|
113
|
+
skip_decorators=False,
|
|
114
|
+
artifacts_module=None,
|
|
115
|
+
persist=True,
|
|
116
|
+
max_log_size=MAX_LOG_SIZE,
|
|
117
|
+
):
|
|
118
|
+
from metaflow import Task
|
|
119
|
+
|
|
120
|
+
self._flow = flow
|
|
121
|
+
self._graph = graph
|
|
122
|
+
self._flow_datastore = flow_datastore
|
|
123
|
+
self._metadata = metadata
|
|
124
|
+
self._environment = environment
|
|
125
|
+
self._package = package
|
|
126
|
+
self._logger = logger
|
|
127
|
+
self._entrypoint = entrypoint
|
|
128
|
+
self._event_logger = event_logger
|
|
129
|
+
self._monitor = monitor
|
|
130
|
+
|
|
131
|
+
self._step_func = step_func
|
|
132
|
+
|
|
133
|
+
# Determine if we have a complete pathspec or need to get the task
|
|
134
|
+
if spin_pathspec:
|
|
135
|
+
parts = spin_pathspec.split("/")
|
|
136
|
+
if len(parts) == 4:
|
|
137
|
+
# Complete pathspec: flow/run/step/task_id
|
|
138
|
+
try:
|
|
139
|
+
# If user provides whole pathspec, we do not need to check namespace
|
|
140
|
+
task = Task(spin_pathspec, _namespace_check=False)
|
|
141
|
+
except Exception:
|
|
142
|
+
raise MetaflowException(
|
|
143
|
+
f"Invalid pathspec: {spin_pathspec} for step: {step_name}"
|
|
144
|
+
)
|
|
145
|
+
elif len(parts) == 3:
|
|
146
|
+
# Partial pathspec: flow/run/step - need to get the task
|
|
147
|
+
_, run_id, _ = parts
|
|
148
|
+
task = get_latest_task_pathspec(flow.name, step_name, run_id=run_id)
|
|
149
|
+
logger(
|
|
150
|
+
f"To make spin even faster, provide complete pathspec with task_id: {task.pathspec}",
|
|
151
|
+
system_msg=True,
|
|
152
|
+
)
|
|
153
|
+
else:
|
|
154
|
+
raise MetaflowException(
|
|
155
|
+
f"Invalid pathspec format: {spin_pathspec}. Expected flow/run/step or flow/run/step/task_id"
|
|
156
|
+
)
|
|
157
|
+
else:
|
|
158
|
+
# No pathspec provided, get latest task for this step
|
|
159
|
+
task = get_latest_task_pathspec(flow.name, step_name)
|
|
160
|
+
logger(
|
|
161
|
+
f"To make spin even faster, provide complete pathspec {task.pathspec}",
|
|
162
|
+
system_msg=True,
|
|
163
|
+
)
|
|
164
|
+
from_start("SpinRuntime: after getting task")
|
|
165
|
+
|
|
166
|
+
# Get the original FlowDatastore so we can use it to access artifacts from the
|
|
167
|
+
# spun task
|
|
168
|
+
meta_dict = task.metadata_dict
|
|
169
|
+
ds_type = meta_dict["ds-type"]
|
|
170
|
+
ds_root = meta_dict["ds-root"]
|
|
171
|
+
orig_datastore_impl = [d for d in DATASTORES if d.TYPE == ds_type][0]
|
|
172
|
+
orig_datastore_impl.datastore_root = ds_root
|
|
173
|
+
spin_pathspec = task.pathspec
|
|
174
|
+
orig_flow_datastore = FlowDataStore(
|
|
175
|
+
flow.name,
|
|
176
|
+
environment=None,
|
|
177
|
+
storage_impl=orig_datastore_impl,
|
|
178
|
+
ds_root=ds_root,
|
|
179
|
+
)
|
|
180
|
+
|
|
181
|
+
self._filecache = FileCache()
|
|
182
|
+
orig_flow_datastore.set_metadata_cache(
|
|
183
|
+
TaskMetadataCache(self._filecache, ds_type, ds_root, flow.name)
|
|
184
|
+
)
|
|
185
|
+
orig_flow_datastore.ca_store.set_blob_cache(
|
|
186
|
+
FileBlobCache(
|
|
187
|
+
self._filecache, FileCache.flow_ds_id(ds_type, ds_root, flow.name)
|
|
188
|
+
)
|
|
189
|
+
)
|
|
190
|
+
|
|
191
|
+
self._orig_flow_datastore = orig_flow_datastore
|
|
192
|
+
self._spin_pathspec = spin_pathspec
|
|
193
|
+
self._persist = persist
|
|
194
|
+
self._spin_task = task
|
|
195
|
+
self._input_paths = None
|
|
196
|
+
self._split_index = None
|
|
197
|
+
self._whitelist_decorators = None
|
|
198
|
+
self._config_file_name = None
|
|
199
|
+
self._skip_decorators = skip_decorators
|
|
200
|
+
self._artifacts_module = artifacts_module
|
|
201
|
+
self._max_log_size = max_log_size
|
|
202
|
+
self._encoding = sys.stdout.encoding or "UTF-8"
|
|
203
|
+
|
|
204
|
+
# Create a new run_id for the spin task
|
|
205
|
+
self.run_id = self._metadata.new_run_id()
|
|
206
|
+
# Raise exception if we have a black listed decorator
|
|
207
|
+
for deco in self._step_func.decorators:
|
|
208
|
+
if deco.name in SPIN_DISALLOWED_DECORATORS:
|
|
209
|
+
raise MetaflowException(
|
|
210
|
+
f"Spinning steps with @{deco.name} decorator is not supported."
|
|
211
|
+
)
|
|
212
|
+
|
|
213
|
+
for deco in self.whitelist_decorators:
|
|
214
|
+
deco.runtime_init(flow, graph, package, self.run_id)
|
|
215
|
+
from_start("SpinRuntime: after init decorators")
|
|
216
|
+
|
|
217
|
+
@property
|
|
218
|
+
def split_index(self):
|
|
219
|
+
"""
|
|
220
|
+
Returns the split index, caching the result after the first access.
|
|
221
|
+
"""
|
|
222
|
+
if self._split_index is None:
|
|
223
|
+
self._split_index = getattr(self._spin_task, "index", None)
|
|
224
|
+
|
|
225
|
+
return self._split_index
|
|
226
|
+
|
|
227
|
+
@property
|
|
228
|
+
def input_paths(self):
|
|
229
|
+
def _format_input_paths(task_pathspec, attempt):
|
|
230
|
+
_, run_id, step_name, task_id = task_pathspec.split("/")
|
|
231
|
+
return f"{run_id}/{step_name}/{task_id}/{attempt}"
|
|
232
|
+
|
|
233
|
+
if self._input_paths:
|
|
234
|
+
return self._input_paths
|
|
235
|
+
|
|
236
|
+
if self._step_func.name == "start":
|
|
237
|
+
from metaflow import Step
|
|
238
|
+
|
|
239
|
+
flow_name, run_id, _, _ = self._spin_pathspec.split("/")
|
|
240
|
+
task = Step(
|
|
241
|
+
f"{flow_name}/{run_id}/_parameters", _namespace_check=False
|
|
242
|
+
).task
|
|
243
|
+
self._input_paths = [
|
|
244
|
+
_format_input_paths(task.pathspec, task.current_attempt)
|
|
245
|
+
]
|
|
246
|
+
else:
|
|
247
|
+
parent_tasks = self._spin_task.parent_tasks
|
|
248
|
+
self._input_paths = [
|
|
249
|
+
_format_input_paths(t.pathspec, t.current_attempt) for t in parent_tasks
|
|
250
|
+
]
|
|
251
|
+
return self._input_paths
|
|
252
|
+
|
|
253
|
+
@property
|
|
254
|
+
def whitelist_decorators(self):
|
|
255
|
+
if self._skip_decorators:
|
|
256
|
+
self._whitelist_decorators = []
|
|
257
|
+
return self._whitelist_decorators
|
|
258
|
+
if self._whitelist_decorators:
|
|
259
|
+
return self._whitelist_decorators
|
|
260
|
+
self._whitelist_decorators = [
|
|
261
|
+
deco
|
|
262
|
+
for deco in self._step_func.decorators
|
|
263
|
+
if any(deco.name.startswith(prefix) for prefix in SPIN_ALLOWED_DECORATORS)
|
|
264
|
+
]
|
|
265
|
+
return self._whitelist_decorators
|
|
266
|
+
|
|
267
|
+
def _new_task(self, step, input_paths=None, **kwargs):
|
|
268
|
+
return Task(
|
|
269
|
+
flow_datastore=self._flow_datastore,
|
|
270
|
+
flow=self._flow,
|
|
271
|
+
step=step,
|
|
272
|
+
run_id=self.run_id,
|
|
273
|
+
metadata=self._metadata,
|
|
274
|
+
environment=self._environment,
|
|
275
|
+
entrypoint=self._entrypoint,
|
|
276
|
+
event_logger=self._event_logger,
|
|
277
|
+
monitor=self._monitor,
|
|
278
|
+
input_paths=input_paths,
|
|
279
|
+
decos=self.whitelist_decorators,
|
|
280
|
+
logger=self._logger,
|
|
281
|
+
split_index=self.split_index,
|
|
282
|
+
**kwargs,
|
|
283
|
+
)
|
|
284
|
+
|
|
285
|
+
def execute(self):
|
|
286
|
+
exception = None
|
|
287
|
+
with tempfile.NamedTemporaryFile(mode="w", encoding="utf-8") as config_file:
|
|
288
|
+
config_value = dump_config_values(self._flow)
|
|
289
|
+
if config_value:
|
|
290
|
+
json.dump(config_value, config_file)
|
|
291
|
+
config_file.flush()
|
|
292
|
+
self._config_file_name = config_file.name
|
|
293
|
+
else:
|
|
294
|
+
self._config_file_name = None
|
|
295
|
+
from_start("SpinRuntime: config values processed")
|
|
296
|
+
self.task = self._new_task(self._step_func.name, self.input_paths)
|
|
297
|
+
try:
|
|
298
|
+
self._launch_and_monitor_task()
|
|
299
|
+
except Exception as ex:
|
|
300
|
+
self._logger("Task failed.", system_msg=True, bad=True)
|
|
301
|
+
exception = ex
|
|
302
|
+
raise
|
|
303
|
+
finally:
|
|
304
|
+
for deco in self.whitelist_decorators:
|
|
305
|
+
deco.runtime_finished(exception)
|
|
306
|
+
|
|
307
|
+
def _launch_and_monitor_task(self):
|
|
308
|
+
worker = Worker(
|
|
309
|
+
self.task,
|
|
310
|
+
self._max_log_size,
|
|
311
|
+
self._config_file_name,
|
|
312
|
+
orig_flow_datastore=self._orig_flow_datastore,
|
|
313
|
+
spin_pathspec=self._spin_pathspec,
|
|
314
|
+
artifacts_module=self._artifacts_module,
|
|
315
|
+
persist=self._persist,
|
|
316
|
+
skip_decorators=self._skip_decorators,
|
|
317
|
+
)
|
|
318
|
+
from_start("SpinRuntime: created worker")
|
|
319
|
+
|
|
320
|
+
poll = procpoll.make_poll()
|
|
321
|
+
fds = worker.fds()
|
|
322
|
+
for fd in fds:
|
|
323
|
+
poll.add(fd)
|
|
324
|
+
|
|
325
|
+
active_fds = set(fds)
|
|
326
|
+
|
|
327
|
+
while active_fds:
|
|
328
|
+
events = poll.poll(POLL_TIMEOUT)
|
|
329
|
+
for event in events:
|
|
330
|
+
if event.can_read:
|
|
331
|
+
worker.read_logline(event.fd)
|
|
332
|
+
if event.is_terminated:
|
|
333
|
+
poll.remove(event.fd)
|
|
334
|
+
active_fds.remove(event.fd)
|
|
335
|
+
from_start("SpinRuntime: read loglines")
|
|
336
|
+
returncode = worker.terminate()
|
|
337
|
+
from_start("SpinRuntime: worker terminated")
|
|
338
|
+
if returncode != 0:
|
|
339
|
+
raise TaskFailed(self.task, f"Task failed with return code {returncode}")
|
|
340
|
+
else:
|
|
341
|
+
self._logger("Task finished successfully.", system_msg=True)
|
|
342
|
+
|
|
343
|
+
|
|
60
344
|
class NativeRuntime(object):
|
|
61
345
|
def __init__(
|
|
62
346
|
self,
|
|
@@ -74,11 +358,12 @@ class NativeRuntime(object):
|
|
|
74
358
|
clone_run_id=None,
|
|
75
359
|
clone_only=False,
|
|
76
360
|
reentrant=False,
|
|
77
|
-
|
|
361
|
+
steps_to_rerun=None,
|
|
78
362
|
max_workers=MAX_WORKERS,
|
|
79
363
|
max_num_splits=MAX_NUM_SPLITS,
|
|
80
364
|
max_log_size=MAX_LOG_SIZE,
|
|
81
365
|
resume_identifier=None,
|
|
366
|
+
skip_decorator_hooks=False,
|
|
82
367
|
):
|
|
83
368
|
if run_id is None:
|
|
84
369
|
self._run_id = metadata.new_run_id()
|
|
@@ -91,6 +376,7 @@ class NativeRuntime(object):
|
|
|
91
376
|
self._flow_datastore = flow_datastore
|
|
92
377
|
self._metadata = metadata
|
|
93
378
|
self._environment = environment
|
|
379
|
+
self._package = package
|
|
94
380
|
self._logger = logger
|
|
95
381
|
self._max_workers = max_workers
|
|
96
382
|
self._active_tasks = dict() # Key: step name;
|
|
@@ -108,9 +394,21 @@ class NativeRuntime(object):
|
|
|
108
394
|
|
|
109
395
|
self._clone_run_id = clone_run_id
|
|
110
396
|
self._clone_only = clone_only
|
|
111
|
-
self.
|
|
397
|
+
self._cloned_tasks = []
|
|
398
|
+
self._ran_or_scheduled_task_index = set()
|
|
112
399
|
self._reentrant = reentrant
|
|
113
400
|
self._run_url = None
|
|
401
|
+
self._skip_decorator_hooks = skip_decorator_hooks
|
|
402
|
+
|
|
403
|
+
# If steps_to_rerun is specified, we will not clone them in resume mode.
|
|
404
|
+
self._steps_to_rerun = steps_to_rerun or {}
|
|
405
|
+
# sorted_nodes are in topological order already, so we only need to
|
|
406
|
+
# iterate through the nodes once to get a stable set of rerun steps.
|
|
407
|
+
for step_name in self._graph.sorted_nodes:
|
|
408
|
+
if step_name in self._steps_to_rerun:
|
|
409
|
+
out_funcs = self._graph[step_name].out_funcs or []
|
|
410
|
+
for next_step in out_funcs:
|
|
411
|
+
self._steps_to_rerun.add(next_step)
|
|
114
412
|
|
|
115
413
|
self._origin_ds_set = None
|
|
116
414
|
if clone_run_id:
|
|
@@ -152,21 +450,21 @@ class NativeRuntime(object):
|
|
|
152
450
|
# finished.
|
|
153
451
|
self._control_num_splits = {} # control_task -> num_splits mapping
|
|
154
452
|
|
|
155
|
-
|
|
156
|
-
for
|
|
157
|
-
deco
|
|
453
|
+
if not self._skip_decorator_hooks:
|
|
454
|
+
for step in flow:
|
|
455
|
+
for deco in step.decorators:
|
|
456
|
+
deco.runtime_init(flow, graph, package, self._run_id)
|
|
158
457
|
|
|
159
458
|
def _new_task(self, step, input_paths=None, **kwargs):
|
|
160
|
-
|
|
161
459
|
if input_paths is None:
|
|
162
460
|
may_clone = True
|
|
163
461
|
else:
|
|
164
462
|
may_clone = all(self._is_cloned[path] for path in input_paths)
|
|
165
463
|
|
|
166
|
-
if step in self.
|
|
464
|
+
if step in self._steps_to_rerun:
|
|
167
465
|
may_clone = False
|
|
168
466
|
|
|
169
|
-
if step == "_parameters":
|
|
467
|
+
if step == "_parameters" or self._skip_decorator_hooks:
|
|
170
468
|
decos = []
|
|
171
469
|
else:
|
|
172
470
|
decos = getattr(self._flow, step).decorators
|
|
@@ -204,6 +502,22 @@ class NativeRuntime(object):
|
|
|
204
502
|
|
|
205
503
|
self._is_cloned[self._params_task.path] = self._params_task.is_cloned
|
|
206
504
|
|
|
505
|
+
def should_skip_clone_only_execution(self):
|
|
506
|
+
(
|
|
507
|
+
should_skip_clone_only_execution,
|
|
508
|
+
skip_reason,
|
|
509
|
+
) = self._should_skip_clone_only_execution()
|
|
510
|
+
if should_skip_clone_only_execution:
|
|
511
|
+
self._logger(skip_reason, system_msg=True)
|
|
512
|
+
return True
|
|
513
|
+
return False
|
|
514
|
+
|
|
515
|
+
@contextmanager
|
|
516
|
+
def run_heartbeat(self):
|
|
517
|
+
self._metadata.start_run_heartbeat(self._flow.name, self._run_id)
|
|
518
|
+
yield
|
|
519
|
+
self._metadata.stop_heartbeat()
|
|
520
|
+
|
|
207
521
|
def print_workflow_info(self):
|
|
208
522
|
self._run_url = (
|
|
209
523
|
"%s/%s/%s" % (UI_URL.rstrip("/"), self._flow.name, self._run_id)
|
|
@@ -236,157 +550,375 @@ class NativeRuntime(object):
|
|
|
236
550
|
)
|
|
237
551
|
return False, None
|
|
238
552
|
|
|
239
|
-
def clone_task(
|
|
240
|
-
self
|
|
241
|
-
|
|
553
|
+
def clone_task(
|
|
554
|
+
self,
|
|
555
|
+
step_name,
|
|
556
|
+
task_id,
|
|
557
|
+
pathspec_index,
|
|
558
|
+
cloned_task_pathspec_index,
|
|
559
|
+
finished_tuple,
|
|
560
|
+
iteration_tuple,
|
|
561
|
+
ubf_context,
|
|
562
|
+
generate_task_obj,
|
|
563
|
+
verbose=False,
|
|
564
|
+
):
|
|
565
|
+
try:
|
|
566
|
+
new_task_id = task_id
|
|
567
|
+
if generate_task_obj:
|
|
568
|
+
task = self._new_task(step_name, pathspec_index=pathspec_index)
|
|
569
|
+
if ubf_context:
|
|
570
|
+
task.ubf_context = ubf_context
|
|
571
|
+
new_task_id = task.task_id
|
|
572
|
+
self._cloned_tasks.append(task)
|
|
573
|
+
self._ran_or_scheduled_task_index.add(cloned_task_pathspec_index)
|
|
574
|
+
task_pathspec = "{}/{}/{}".format(self._run_id, step_name, new_task_id)
|
|
575
|
+
else:
|
|
576
|
+
task_pathspec = "{}/{}/{}".format(self._run_id, step_name, new_task_id)
|
|
577
|
+
Task.clone_pathspec_mapping[task_pathspec] = "{}/{}/{}".format(
|
|
578
|
+
self._clone_run_id, step_name, task_id
|
|
579
|
+
)
|
|
580
|
+
if verbose:
|
|
581
|
+
self._logger(
|
|
582
|
+
"Cloning task from {}/{}/{}/{} to {}/{}/{}/{}".format(
|
|
583
|
+
self._flow.name,
|
|
584
|
+
self._clone_run_id,
|
|
585
|
+
step_name,
|
|
586
|
+
task_id,
|
|
587
|
+
self._flow.name,
|
|
588
|
+
self._run_id,
|
|
589
|
+
step_name,
|
|
590
|
+
new_task_id,
|
|
591
|
+
),
|
|
592
|
+
system_msg=True,
|
|
593
|
+
)
|
|
594
|
+
clone_task_helper(
|
|
242
595
|
self._flow.name,
|
|
243
596
|
self._clone_run_id,
|
|
244
|
-
step_name,
|
|
245
|
-
task_id,
|
|
246
|
-
self._flow.name,
|
|
247
597
|
self._run_id,
|
|
248
598
|
step_name,
|
|
249
|
-
task_id,
|
|
250
|
-
|
|
251
|
-
|
|
252
|
-
|
|
253
|
-
|
|
254
|
-
|
|
255
|
-
self.
|
|
256
|
-
self.
|
|
257
|
-
|
|
258
|
-
|
|
259
|
-
|
|
260
|
-
|
|
261
|
-
|
|
262
|
-
|
|
263
|
-
)
|
|
599
|
+
task_id, # origin_task_id
|
|
600
|
+
new_task_id,
|
|
601
|
+
self._flow_datastore,
|
|
602
|
+
self._metadata,
|
|
603
|
+
origin_ds_set=self._origin_ds_set,
|
|
604
|
+
)
|
|
605
|
+
self._finished[(step_name, finished_tuple, iteration_tuple)] = task_pathspec
|
|
606
|
+
self._is_cloned[task_pathspec] = True
|
|
607
|
+
except Exception as e:
|
|
608
|
+
self._logger(
|
|
609
|
+
"Cloning {}/{}/{}/{} failed with error: {}".format(
|
|
610
|
+
self._flow.name, self._clone_run_id, step_name, task_id, str(e)
|
|
611
|
+
)
|
|
612
|
+
)
|
|
264
613
|
|
|
265
|
-
def clone_original_run(self):
|
|
266
|
-
(
|
|
267
|
-
should_skip_clone_only_execution,
|
|
268
|
-
skip_reason,
|
|
269
|
-
) = self._should_skip_clone_only_execution()
|
|
270
|
-
if should_skip_clone_only_execution:
|
|
271
|
-
self._logger(skip_reason, system_msg=True)
|
|
272
|
-
return
|
|
273
|
-
self._metadata.start_run_heartbeat(self._flow.name, self._run_id)
|
|
614
|
+
def clone_original_run(self, generate_task_obj=False, verbose=True):
|
|
274
615
|
self._logger(
|
|
275
|
-
"
|
|
276
|
-
self._flow.name, self._clone_run_id
|
|
277
|
-
),
|
|
616
|
+
"Cloning {}/{}".format(self._flow.name, self._clone_run_id),
|
|
278
617
|
system_msg=True,
|
|
279
618
|
)
|
|
280
619
|
|
|
281
620
|
inputs = []
|
|
282
621
|
|
|
622
|
+
ubf_mapper_tasks_to_clone = set()
|
|
623
|
+
ubf_control_tasks = set()
|
|
624
|
+
# We only clone ubf mapper tasks if the control task is complete.
|
|
625
|
+
# Here we need to check which control tasks are complete, and then get the corresponding
|
|
626
|
+
# mapper tasks.
|
|
283
627
|
for task_ds in self._origin_ds_set:
|
|
284
628
|
_, step_name, task_id = task_ds.pathspec.split("/")
|
|
629
|
+
pathspec_index = task_ds.pathspec_index
|
|
285
630
|
if task_ds["_task_ok"] and step_name != "_parameters":
|
|
286
|
-
|
|
631
|
+
# Control task contains "_control_mapper_tasks" but, in the case of
|
|
632
|
+
# @parallel decorator, the control task is also a mapper task so we
|
|
633
|
+
# need to distinguish this using _control_task_is_mapper_zero
|
|
634
|
+
control_mapper_tasks = (
|
|
635
|
+
[]
|
|
636
|
+
if "_control_mapper_tasks" not in task_ds
|
|
637
|
+
else task_ds["_control_mapper_tasks"]
|
|
638
|
+
)
|
|
639
|
+
if control_mapper_tasks:
|
|
640
|
+
if task_ds.get("_control_task_is_mapper_zero", False):
|
|
641
|
+
# Strip out the control task of list of mapper tasks
|
|
642
|
+
ubf_control_tasks.add(control_mapper_tasks[0])
|
|
643
|
+
ubf_mapper_tasks_to_clone.update(control_mapper_tasks[1:])
|
|
644
|
+
else:
|
|
645
|
+
ubf_mapper_tasks_to_clone.update(control_mapper_tasks)
|
|
646
|
+
# Since we only add mapper tasks here, if we are not in the list
|
|
647
|
+
# we are a control task
|
|
648
|
+
if task_ds.pathspec not in ubf_mapper_tasks_to_clone:
|
|
649
|
+
ubf_control_tasks.add(task_ds.pathspec)
|
|
650
|
+
|
|
651
|
+
for task_ds in self._origin_ds_set:
|
|
652
|
+
_, step_name, task_id = task_ds.pathspec.split("/")
|
|
653
|
+
pathspec_index = task_ds.pathspec_index
|
|
654
|
+
|
|
655
|
+
if (
|
|
656
|
+
task_ds["_task_ok"]
|
|
657
|
+
and step_name != "_parameters"
|
|
658
|
+
and (step_name not in self._steps_to_rerun)
|
|
659
|
+
):
|
|
660
|
+
# "_unbounded_foreach" is a special flag to indicate that the transition
|
|
661
|
+
# is an unbounded foreach.
|
|
662
|
+
# Both parent and splitted children tasks will have this flag set.
|
|
663
|
+
# The splitted control/mapper tasks
|
|
664
|
+
# are not foreach types because UBF is always followed by a join step.
|
|
665
|
+
is_ubf_task = (
|
|
666
|
+
"_unbounded_foreach" in task_ds and task_ds["_unbounded_foreach"]
|
|
667
|
+
) and (self._graph[step_name].type != "foreach")
|
|
668
|
+
|
|
669
|
+
is_ubf_control_task = task_ds.pathspec in ubf_control_tasks
|
|
670
|
+
|
|
671
|
+
is_ubf_mapper_task = is_ubf_task and (not is_ubf_control_task)
|
|
672
|
+
|
|
673
|
+
if is_ubf_mapper_task and (
|
|
674
|
+
task_ds.pathspec not in ubf_mapper_tasks_to_clone
|
|
675
|
+
):
|
|
676
|
+
# Skip copying UBF mapper tasks if control task is incomplete.
|
|
677
|
+
continue
|
|
678
|
+
|
|
679
|
+
ubf_context = None
|
|
680
|
+
if is_ubf_task:
|
|
681
|
+
ubf_context = "ubf_test" if is_ubf_mapper_task else "ubf_control"
|
|
682
|
+
|
|
683
|
+
finished_tuple = tuple(
|
|
684
|
+
[s._replace(value=0) for s in task_ds.get("_foreach_stack", ())]
|
|
685
|
+
)
|
|
686
|
+
iteration_tuple = tuple(task_ds.get("_iteration_stack", ()))
|
|
687
|
+
cloned_task_pathspec_index = pathspec_index.split("/")[1]
|
|
688
|
+
if task_ds.get("_control_task_is_mapper_zero", False):
|
|
689
|
+
# Replace None with index 0 for control task as it is part of the
|
|
690
|
+
# UBF (as a mapper as well)
|
|
691
|
+
finished_tuple = finished_tuple[:-1] + (
|
|
692
|
+
finished_tuple[-1]._replace(index=0),
|
|
693
|
+
)
|
|
694
|
+
# We need this reverse override though because when we check
|
|
695
|
+
# if a task has been cloned in _queue_push, the index will be None
|
|
696
|
+
# because the _control_task_is_mapper_zero is set in the control
|
|
697
|
+
# task *itself* and *not* in the one that is launching the UBF nest.
|
|
698
|
+
# This means that _translate_index will use None.
|
|
699
|
+
cloned_task_pathspec_index = re.sub(
|
|
700
|
+
r"(\[(?:\d+, ?)*)0\]",
|
|
701
|
+
lambda m: (m.group(1) or "[") + "None]",
|
|
702
|
+
cloned_task_pathspec_index,
|
|
703
|
+
)
|
|
704
|
+
|
|
705
|
+
inputs.append(
|
|
706
|
+
(
|
|
707
|
+
step_name,
|
|
708
|
+
task_id,
|
|
709
|
+
pathspec_index,
|
|
710
|
+
cloned_task_pathspec_index,
|
|
711
|
+
finished_tuple,
|
|
712
|
+
iteration_tuple,
|
|
713
|
+
is_ubf_mapper_task,
|
|
714
|
+
ubf_context,
|
|
715
|
+
)
|
|
716
|
+
)
|
|
287
717
|
|
|
288
718
|
with futures.ThreadPoolExecutor(max_workers=self._max_workers) as executor:
|
|
289
719
|
all_tasks = [
|
|
290
|
-
executor.submit(
|
|
291
|
-
|
|
720
|
+
executor.submit(
|
|
721
|
+
self.clone_task,
|
|
722
|
+
step_name,
|
|
723
|
+
task_id,
|
|
724
|
+
pathspec_index,
|
|
725
|
+
cloned_task_pathspec_index,
|
|
726
|
+
finished_tuple,
|
|
727
|
+
iteration_tuple,
|
|
728
|
+
ubf_context=ubf_context,
|
|
729
|
+
generate_task_obj=generate_task_obj and (not is_ubf_mapper_task),
|
|
730
|
+
verbose=verbose,
|
|
731
|
+
)
|
|
732
|
+
for (
|
|
733
|
+
step_name,
|
|
734
|
+
task_id,
|
|
735
|
+
pathspec_index,
|
|
736
|
+
cloned_task_pathspec_index,
|
|
737
|
+
finished_tuple,
|
|
738
|
+
iteration_tuple,
|
|
739
|
+
is_ubf_mapper_task,
|
|
740
|
+
ubf_context,
|
|
741
|
+
) in inputs
|
|
292
742
|
]
|
|
293
743
|
_, _ = futures.wait(all_tasks)
|
|
294
|
-
self._logger(
|
|
744
|
+
self._logger(
|
|
745
|
+
"{}/{} cloned!".format(self._flow.name, self._clone_run_id), system_msg=True
|
|
746
|
+
)
|
|
295
747
|
self._params_task.mark_resume_done()
|
|
296
|
-
self._metadata.stop_heartbeat()
|
|
297
748
|
|
|
298
749
|
def execute(self):
|
|
299
|
-
(
|
|
300
|
-
|
|
301
|
-
|
|
302
|
-
|
|
303
|
-
if should_skip_clone_only_execution:
|
|
304
|
-
self._logger(skip_reason, system_msg=True)
|
|
305
|
-
return
|
|
306
|
-
self._metadata.start_run_heartbeat(self._flow.name, self._run_id)
|
|
307
|
-
|
|
308
|
-
if self._params_task:
|
|
309
|
-
self._queue_push("start", {"input_paths": [self._params_task.path]})
|
|
750
|
+
if len(self._cloned_tasks) > 0:
|
|
751
|
+
# mutable list storing the cloned tasks.
|
|
752
|
+
self._run_queue = []
|
|
753
|
+
self._active_tasks[0] = 0
|
|
310
754
|
else:
|
|
311
|
-
self.
|
|
755
|
+
if self._params_task:
|
|
756
|
+
self._queue_push("start", {"input_paths": [self._params_task.path]})
|
|
757
|
+
else:
|
|
758
|
+
self._queue_push("start", {})
|
|
312
759
|
|
|
313
760
|
progress_tstamp = time.time()
|
|
314
|
-
|
|
315
|
-
#
|
|
316
|
-
|
|
317
|
-
|
|
318
|
-
|
|
319
|
-
|
|
320
|
-
|
|
321
|
-
|
|
322
|
-
self.
|
|
323
|
-
|
|
324
|
-
|
|
325
|
-
|
|
326
|
-
|
|
327
|
-
|
|
328
|
-
|
|
329
|
-
|
|
330
|
-
|
|
331
|
-
|
|
332
|
-
|
|
333
|
-
|
|
334
|
-
|
|
335
|
-
|
|
336
|
-
|
|
337
|
-
|
|
338
|
-
|
|
339
|
-
|
|
340
|
-
|
|
341
|
-
|
|
342
|
-
|
|
343
|
-
|
|
344
|
-
|
|
345
|
-
|
|
761
|
+
with tempfile.NamedTemporaryFile(mode="w", encoding="utf-8") as config_file:
|
|
762
|
+
# Configurations are passed through a file to avoid overloading the
|
|
763
|
+
# command-line. We only need to create this file once and it can be reused
|
|
764
|
+
# for any task launch
|
|
765
|
+
config_value = dump_config_values(self._flow)
|
|
766
|
+
if config_value:
|
|
767
|
+
json.dump(config_value, config_file)
|
|
768
|
+
config_file.flush()
|
|
769
|
+
self._config_file_name = config_file.name
|
|
770
|
+
else:
|
|
771
|
+
self._config_file_name = None
|
|
772
|
+
try:
|
|
773
|
+
# main scheduling loop
|
|
774
|
+
exception = None
|
|
775
|
+
while (
|
|
776
|
+
self._run_queue or self._active_tasks[0] > 0 or self._cloned_tasks
|
|
777
|
+
):
|
|
778
|
+
# 1. are any of the current workers finished?
|
|
779
|
+
if self._cloned_tasks:
|
|
780
|
+
finished_tasks = []
|
|
781
|
+
|
|
782
|
+
# For loops (right now just recursive steps), we need to find
|
|
783
|
+
# the exact frontier because if we queue all "successors" to all
|
|
784
|
+
# the finished iterations, we would incorrectly launch multiple
|
|
785
|
+
# successors. We therefore have to strip out all non-last
|
|
786
|
+
# iterations *per* foreach branch.
|
|
787
|
+
idx_per_finished_id = (
|
|
788
|
+
{}
|
|
789
|
+
) # type: Dict[Tuple[str, Tuple[int, ...], Tuple[int, Tuple[int, ...]]]]
|
|
790
|
+
for task in self._cloned_tasks:
|
|
791
|
+
step_name, foreach_stack, iteration_stack = task.finished_id
|
|
792
|
+
existing_task_idx = idx_per_finished_id.get(
|
|
793
|
+
(step_name, foreach_stack), None
|
|
794
|
+
)
|
|
795
|
+
if existing_task_idx is not None:
|
|
796
|
+
len_diff = len(iteration_stack) - len(
|
|
797
|
+
existing_task_idx[1]
|
|
798
|
+
)
|
|
799
|
+
# In this case, we need to keep only the latest iteration
|
|
800
|
+
if (
|
|
801
|
+
len_diff == 0
|
|
802
|
+
and iteration_stack > existing_task_idx[1]
|
|
803
|
+
) or len_diff == -1:
|
|
804
|
+
# We remove the one we currently have and replace
|
|
805
|
+
# by this one. The second option means that we are
|
|
806
|
+
# adding the finished iteration marker.
|
|
807
|
+
existing_task = finished_tasks[existing_task_idx[0]]
|
|
808
|
+
# These are the first two lines of _queue_tasks
|
|
809
|
+
# We still consider the tasks finished so we need
|
|
810
|
+
# to update state to be clean.
|
|
811
|
+
self._finished[existing_task.finished_id] = (
|
|
812
|
+
existing_task.path
|
|
813
|
+
)
|
|
814
|
+
self._is_cloned[existing_task.path] = (
|
|
815
|
+
existing_task.is_cloned
|
|
816
|
+
)
|
|
817
|
+
|
|
818
|
+
finished_tasks[existing_task_idx[0]] = task
|
|
819
|
+
idx_per_finished_id[(step_name, foreach_stack)] = (
|
|
820
|
+
existing_task_idx[0],
|
|
821
|
+
iteration_stack,
|
|
822
|
+
)
|
|
823
|
+
elif (
|
|
824
|
+
len_diff == 0
|
|
825
|
+
and iteration_stack < existing_task_idx[1]
|
|
826
|
+
) or len_diff == 1:
|
|
827
|
+
# The second option is when we have already marked
|
|
828
|
+
# the end of the iteration in self._finished and
|
|
829
|
+
# are now seeing a previous iteration.
|
|
830
|
+
# We just mark the task as finished but we don't
|
|
831
|
+
# put it in the finished_tasks list to pass to
|
|
832
|
+
# the _queue_tasks function
|
|
833
|
+
self._finished[task.finished_id] = task.path
|
|
834
|
+
self._is_cloned[task.path] = task.is_cloned
|
|
835
|
+
else:
|
|
836
|
+
raise MetaflowInternalError(
|
|
837
|
+
"Unexpected recursive cloned tasks -- "
|
|
838
|
+
"this is a bug, please report it."
|
|
839
|
+
)
|
|
840
|
+
else:
|
|
841
|
+
# New entry
|
|
842
|
+
finished_tasks.append(task)
|
|
843
|
+
idx_per_finished_id[(step_name, foreach_stack)] = (
|
|
844
|
+
len(finished_tasks) - 1,
|
|
845
|
+
iteration_stack,
|
|
846
|
+
)
|
|
346
847
|
|
|
347
|
-
|
|
348
|
-
|
|
848
|
+
# reset the list of cloned tasks and let poll_workers handle
|
|
849
|
+
# the remaining transition
|
|
850
|
+
self._cloned_tasks = []
|
|
349
851
|
else:
|
|
350
|
-
|
|
351
|
-
|
|
852
|
+
finished_tasks = list(self._poll_workers())
|
|
853
|
+
# 2. push new tasks triggered by the finished tasks to the queue
|
|
854
|
+
self._queue_tasks(finished_tasks)
|
|
855
|
+
# 3. if there are available worker slots, pop and start tasks
|
|
856
|
+
# from the queue.
|
|
857
|
+
self._launch_workers()
|
|
858
|
+
|
|
859
|
+
if time.time() - progress_tstamp > PROGRESS_INTERVAL:
|
|
860
|
+
progress_tstamp = time.time()
|
|
861
|
+
tasks_print = ", ".join(
|
|
862
|
+
[
|
|
863
|
+
"%s (%d running; %d done)" % (k, v[0], v[1])
|
|
864
|
+
for k, v in self._active_tasks.items()
|
|
865
|
+
if k != 0 and v[0] > 0
|
|
866
|
+
]
|
|
867
|
+
)
|
|
868
|
+
if self._active_tasks[0] == 0:
|
|
869
|
+
msg = "No tasks are running."
|
|
352
870
|
else:
|
|
353
|
-
|
|
354
|
-
|
|
355
|
-
|
|
871
|
+
if self._active_tasks[0] == 1:
|
|
872
|
+
msg = "1 task is running: "
|
|
873
|
+
else:
|
|
874
|
+
msg = "%d tasks are running: " % self._active_tasks[0]
|
|
875
|
+
msg += "%s." % tasks_print
|
|
356
876
|
|
|
357
|
-
self._logger(msg, system_msg=True)
|
|
358
|
-
if len(self._unprocessed_steps) > 0:
|
|
359
|
-
if len(self._unprocessed_steps) == 1:
|
|
360
|
-
msg = "%s step has not started" % (
|
|
361
|
-
next(iter(self._unprocessed_steps)),
|
|
362
|
-
)
|
|
363
|
-
else:
|
|
364
|
-
msg = "%d steps have not started: " % len(
|
|
365
|
-
self._unprocessed_steps
|
|
366
|
-
)
|
|
367
|
-
msg += "%s." % ", ".join(self._unprocessed_steps)
|
|
368
877
|
self._logger(msg, system_msg=True)
|
|
369
878
|
|
|
370
|
-
|
|
371
|
-
|
|
372
|
-
|
|
373
|
-
|
|
374
|
-
|
|
375
|
-
|
|
376
|
-
|
|
377
|
-
|
|
378
|
-
|
|
379
|
-
raise
|
|
380
|
-
finally:
|
|
381
|
-
# on finish clean tasks
|
|
382
|
-
for step in self._flow:
|
|
383
|
-
for deco in step.decorators:
|
|
384
|
-
deco.runtime_finished(exception)
|
|
879
|
+
if len(self._run_queue) == 0:
|
|
880
|
+
msg = "No tasks are waiting in the queue."
|
|
881
|
+
else:
|
|
882
|
+
if len(self._run_queue) == 1:
|
|
883
|
+
msg = "1 task is waiting in the queue: "
|
|
884
|
+
else:
|
|
885
|
+
msg = "%d tasks are waiting in the queue." % len(
|
|
886
|
+
self._run_queue
|
|
887
|
+
)
|
|
385
888
|
|
|
386
|
-
|
|
889
|
+
self._logger(msg, system_msg=True)
|
|
890
|
+
if len(self._unprocessed_steps) > 0:
|
|
891
|
+
if len(self._unprocessed_steps) == 1:
|
|
892
|
+
msg = "%s step has not started" % (
|
|
893
|
+
next(iter(self._unprocessed_steps)),
|
|
894
|
+
)
|
|
895
|
+
else:
|
|
896
|
+
msg = "%d steps have not started: " % len(
|
|
897
|
+
self._unprocessed_steps
|
|
898
|
+
)
|
|
899
|
+
msg += "%s." % ", ".join(self._unprocessed_steps)
|
|
900
|
+
self._logger(msg, system_msg=True)
|
|
901
|
+
|
|
902
|
+
except KeyboardInterrupt as ex:
|
|
903
|
+
self._logger("Workflow interrupted.", system_msg=True, bad=True)
|
|
904
|
+
self._killall()
|
|
905
|
+
exception = ex
|
|
906
|
+
raise
|
|
907
|
+
except Exception as ex:
|
|
908
|
+
self._logger("Workflow failed.", system_msg=True, bad=True)
|
|
909
|
+
self._killall()
|
|
910
|
+
exception = ex
|
|
911
|
+
raise
|
|
912
|
+
finally:
|
|
913
|
+
# on finish clean tasks
|
|
914
|
+
if not self._skip_decorator_hooks:
|
|
915
|
+
for step in self._flow:
|
|
916
|
+
for deco in step.decorators:
|
|
917
|
+
deco.runtime_finished(exception)
|
|
918
|
+
self._run_exit_hooks()
|
|
387
919
|
|
|
388
920
|
# assert that end was executed and it was successful
|
|
389
|
-
if ("end", ()) in self._finished:
|
|
921
|
+
if ("end", (), ()) in self._finished:
|
|
390
922
|
if self._run_url:
|
|
391
923
|
self._logger(
|
|
392
924
|
"Done! See the run in the UI at %s" % self._run_url,
|
|
@@ -406,6 +938,51 @@ class NativeRuntime(object):
|
|
|
406
938
|
"The *end* step was not successful by the end of flow."
|
|
407
939
|
)
|
|
408
940
|
|
|
941
|
+
def _run_exit_hooks(self):
|
|
942
|
+
try:
|
|
943
|
+
flow_decos = self._flow._flow_state[FlowStateItems.FLOW_DECORATORS]
|
|
944
|
+
exit_hook_decos = flow_decos.get("exit_hook", [])
|
|
945
|
+
if not exit_hook_decos:
|
|
946
|
+
return
|
|
947
|
+
|
|
948
|
+
successful = ("end", (), ()) in self._finished or self._clone_only
|
|
949
|
+
pathspec = f"{self._graph.name}/{self._run_id}"
|
|
950
|
+
flow_file = self._environment.get_environment_info()["script"]
|
|
951
|
+
|
|
952
|
+
def _call(fn_name):
|
|
953
|
+
try:
|
|
954
|
+
result = (
|
|
955
|
+
subprocess.check_output(
|
|
956
|
+
args=[
|
|
957
|
+
sys.executable,
|
|
958
|
+
"-m",
|
|
959
|
+
"metaflow.plugins.exit_hook.exit_hook_script",
|
|
960
|
+
flow_file,
|
|
961
|
+
fn_name,
|
|
962
|
+
pathspec,
|
|
963
|
+
],
|
|
964
|
+
env=os.environ,
|
|
965
|
+
)
|
|
966
|
+
.decode()
|
|
967
|
+
.strip()
|
|
968
|
+
)
|
|
969
|
+
print(result)
|
|
970
|
+
except subprocess.CalledProcessError as e:
|
|
971
|
+
print(f"[exit_hook] Hook '{fn_name}' failed with error: {e}")
|
|
972
|
+
except Exception as e:
|
|
973
|
+
print(f"[exit_hook] Unexpected error in hook '{fn_name}': {e}")
|
|
974
|
+
|
|
975
|
+
# Call all exit hook functions regardless of individual failures
|
|
976
|
+
for fn_name in [
|
|
977
|
+
name
|
|
978
|
+
for deco in exit_hook_decos
|
|
979
|
+
for name in (deco.success_hooks if successful else deco.error_hooks)
|
|
980
|
+
]:
|
|
981
|
+
_call(fn_name)
|
|
982
|
+
|
|
983
|
+
except Exception as ex:
|
|
984
|
+
pass # do not fail due to exit hooks for whatever reason.
|
|
985
|
+
|
|
409
986
|
def _killall(self):
|
|
410
987
|
# If we are here, all children have received a signal and are shutting down.
|
|
411
988
|
# We want to give them an opportunity to do so and then kill
|
|
@@ -434,9 +1011,88 @@ class NativeRuntime(object):
|
|
|
434
1011
|
for _ in range(3):
|
|
435
1012
|
list(self._poll_workers())
|
|
436
1013
|
|
|
1014
|
+
# Given the current task information (task_index), the type of transition,
|
|
1015
|
+
# and the split index, return the new task index.
|
|
1016
|
+
def _translate_index(
|
|
1017
|
+
self, task, next_step, type, split_index=None, loop_mode=LoopBehavior.NONE
|
|
1018
|
+
):
|
|
1019
|
+
match = re.match(r"^(.+)\[(.*)\]\[(.*)\]$", task.task_index)
|
|
1020
|
+
old_match = re.match(r"^(.+)\[(.*)\]$", task.task_index)
|
|
1021
|
+
if match:
|
|
1022
|
+
_, foreach_index, iteration_index = match.groups()
|
|
1023
|
+
# Convert foreach_index to a list of integers
|
|
1024
|
+
if len(foreach_index) > 0:
|
|
1025
|
+
foreach_index = foreach_index.split(",")
|
|
1026
|
+
else:
|
|
1027
|
+
foreach_index = []
|
|
1028
|
+
# Ditto for iteration_index
|
|
1029
|
+
if len(iteration_index) > 0:
|
|
1030
|
+
iteration_index = iteration_index.split(",")
|
|
1031
|
+
else:
|
|
1032
|
+
iteration_index = []
|
|
1033
|
+
elif old_match:
|
|
1034
|
+
_, foreach_index = old_match.groups()
|
|
1035
|
+
# Convert foreach_index to a list of integers
|
|
1036
|
+
if len(foreach_index) > 0:
|
|
1037
|
+
foreach_index = foreach_index.split(",")
|
|
1038
|
+
else:
|
|
1039
|
+
foreach_index = []
|
|
1040
|
+
# Legacy case fallback. No iteration index exists for these runs.
|
|
1041
|
+
iteration_index = []
|
|
1042
|
+
else:
|
|
1043
|
+
raise ValueError(
|
|
1044
|
+
"Index not in the format of {run_id}/{step_name}[{foreach_index}][{iteration_index}]"
|
|
1045
|
+
)
|
|
1046
|
+
if loop_mode == LoopBehavior.NONE:
|
|
1047
|
+
# Check if we are entering a looping construct. Right now, only recursive
|
|
1048
|
+
# steps are looping constructs
|
|
1049
|
+
next_step_node = self._graph[next_step]
|
|
1050
|
+
if (
|
|
1051
|
+
next_step_node.type == "split-switch"
|
|
1052
|
+
and next_step in next_step_node.out_funcs
|
|
1053
|
+
):
|
|
1054
|
+
loop_mode = LoopBehavior.ENTERING
|
|
1055
|
+
|
|
1056
|
+
# Update iteration_index
|
|
1057
|
+
if loop_mode == LoopBehavior.ENTERING:
|
|
1058
|
+
# We are entering a loop, so we add a new iteration level
|
|
1059
|
+
iteration_index.append("0")
|
|
1060
|
+
elif loop_mode == LoopBehavior.EXITING:
|
|
1061
|
+
iteration_index = iteration_index[:-1]
|
|
1062
|
+
elif loop_mode == LoopBehavior.LOOPING:
|
|
1063
|
+
if len(iteration_index) == 0:
|
|
1064
|
+
raise MetaflowInternalError(
|
|
1065
|
+
"In looping mode but there is no iteration index"
|
|
1066
|
+
)
|
|
1067
|
+
iteration_index[-1] = str(int(iteration_index[-1]) + 1)
|
|
1068
|
+
iteration_index = ",".join(iteration_index)
|
|
1069
|
+
|
|
1070
|
+
if type == "linear":
|
|
1071
|
+
return "%s[%s][%s]" % (next_step, ",".join(foreach_index), iteration_index)
|
|
1072
|
+
elif type == "join":
|
|
1073
|
+
indices = []
|
|
1074
|
+
if len(foreach_index) > 0:
|
|
1075
|
+
indices = foreach_index[:-1]
|
|
1076
|
+
return "%s[%s][%s]" % (next_step, ",".join(indices), iteration_index)
|
|
1077
|
+
elif type == "split":
|
|
1078
|
+
foreach_index.append(str(split_index))
|
|
1079
|
+
return "%s[%s][%s]" % (next_step, ",".join(foreach_index), iteration_index)
|
|
1080
|
+
|
|
437
1081
|
# Store the parameters needed for task creation, so that pushing on items
|
|
438
1082
|
# onto the run_queue is an inexpensive operation.
|
|
439
|
-
def _queue_push(self, step, task_kwargs):
|
|
1083
|
+
def _queue_push(self, step, task_kwargs, index=None):
|
|
1084
|
+
# In the case of cloning, we set all the cloned tasks as the
|
|
1085
|
+
# finished tasks when pushing tasks using _queue_tasks. This means that we
|
|
1086
|
+
# could potentially try to push the same task multiple times (for example
|
|
1087
|
+
# if multiple parents of a join are cloned). We therefore keep track of what
|
|
1088
|
+
# has executed (been cloned) or what has been scheduled and avoid scheduling
|
|
1089
|
+
# it again.
|
|
1090
|
+
if index:
|
|
1091
|
+
if index in self._ran_or_scheduled_task_index:
|
|
1092
|
+
# It has already run or been scheduled
|
|
1093
|
+
return
|
|
1094
|
+
# Note that we are scheduling this to run
|
|
1095
|
+
self._ran_or_scheduled_task_index.add(index)
|
|
440
1096
|
self._run_queue.insert(0, (step, task_kwargs))
|
|
441
1097
|
# For foreaches, this will happen multiple time but is ok, becomes a no-op
|
|
442
1098
|
self._unprocessed_steps.discard(step)
|
|
@@ -495,34 +1151,28 @@ class NativeRuntime(object):
|
|
|
495
1151
|
)
|
|
496
1152
|
num_splits = len(mapper_tasks)
|
|
497
1153
|
self._control_num_splits[task.path] = num_splits
|
|
498
|
-
|
|
499
|
-
|
|
500
|
-
|
|
501
|
-
|
|
502
|
-
|
|
503
|
-
|
|
504
|
-
|
|
505
|
-
|
|
506
|
-
|
|
507
|
-
|
|
508
|
-
"split_index": str(i),
|
|
509
|
-
"ubf_context": UBF_TASK,
|
|
510
|
-
},
|
|
511
|
-
)
|
|
512
|
-
else:
|
|
513
|
-
# Update _finished since these tasks were successfully
|
|
514
|
-
# run elsewhere so that join will be unblocked.
|
|
515
|
-
_, foreach_stack = task.finished_id
|
|
1154
|
+
|
|
1155
|
+
# If the control task is cloned, all mapper tasks should have been cloned
|
|
1156
|
+
# as well, so we no longer need to handle cloning of mapper tasks in runtime.
|
|
1157
|
+
|
|
1158
|
+
# Update _finished if we are not cloned. If we were cloned, we already
|
|
1159
|
+
# updated _finished with the new tasks. Note that the *value* of mapper
|
|
1160
|
+
# tasks is incorrect and contains the pathspec of the *cloned* run
|
|
1161
|
+
# but we don't use it for anything. We could look to clean it up though
|
|
1162
|
+
if not task.is_cloned:
|
|
1163
|
+
_, foreach_stack, iteration_stack = task.finished_id
|
|
516
1164
|
top = foreach_stack[-1]
|
|
517
1165
|
bottom = list(foreach_stack[:-1])
|
|
518
1166
|
for i in range(num_splits):
|
|
519
1167
|
s = tuple(bottom + [top._replace(index=i)])
|
|
520
|
-
self._finished[(task.step, s)] = mapper_tasks[
|
|
1168
|
+
self._finished[(task.step, s, iteration_stack)] = mapper_tasks[
|
|
1169
|
+
i
|
|
1170
|
+
]
|
|
521
1171
|
self._is_cloned[mapper_tasks[i]] = False
|
|
522
1172
|
|
|
523
1173
|
# Find and check status of control task and retrieve its pathspec
|
|
524
1174
|
# for retrieving unbounded foreach cardinality.
|
|
525
|
-
_, foreach_stack = task.finished_id
|
|
1175
|
+
_, foreach_stack, iteration_stack = task.finished_id
|
|
526
1176
|
top = foreach_stack[-1]
|
|
527
1177
|
bottom = list(foreach_stack[:-1])
|
|
528
1178
|
s = tuple(bottom + [top._replace(index=None)])
|
|
@@ -531,7 +1181,7 @@ class NativeRuntime(object):
|
|
|
531
1181
|
# it will have index=0 instead of index=None.
|
|
532
1182
|
if task.results.get("_control_task_is_mapper_zero", False):
|
|
533
1183
|
s = tuple(bottom + [top._replace(index=0)])
|
|
534
|
-
control_path = self._finished.get((task.step, s))
|
|
1184
|
+
control_path = self._finished.get((task.step, s, iteration_stack))
|
|
535
1185
|
if control_path:
|
|
536
1186
|
# Control task was successful.
|
|
537
1187
|
# Additionally check the state of (sibling) mapper tasks as well
|
|
@@ -540,21 +1190,27 @@ class NativeRuntime(object):
|
|
|
540
1190
|
required_tasks = []
|
|
541
1191
|
for i in range(num_splits):
|
|
542
1192
|
s = tuple(bottom + [top._replace(index=i)])
|
|
543
|
-
required_tasks.append(
|
|
1193
|
+
required_tasks.append(
|
|
1194
|
+
self._finished.get((task.step, s, iteration_stack))
|
|
1195
|
+
)
|
|
544
1196
|
|
|
545
1197
|
if all(required_tasks):
|
|
1198
|
+
index = self._translate_index(task, next_step, "join")
|
|
546
1199
|
# all tasks to be joined are ready. Schedule the next join step.
|
|
547
1200
|
self._queue_push(
|
|
548
1201
|
next_step,
|
|
549
1202
|
{"input_paths": required_tasks, "join_type": "foreach"},
|
|
1203
|
+
index,
|
|
550
1204
|
)
|
|
551
1205
|
else:
|
|
552
1206
|
# matching_split is the split-parent of the finished task
|
|
553
1207
|
matching_split = self._graph[self._graph[next_step].split_parents[-1]]
|
|
554
|
-
_, foreach_stack = task.finished_id
|
|
1208
|
+
_, foreach_stack, iteration_stack = task.finished_id
|
|
555
1209
|
|
|
1210
|
+
direct_parents = set(self._graph[next_step].in_funcs)
|
|
1211
|
+
|
|
1212
|
+
# next step is a foreach join
|
|
556
1213
|
if matching_split.type == "foreach":
|
|
557
|
-
# next step is a foreach join
|
|
558
1214
|
|
|
559
1215
|
def siblings(foreach_stack):
|
|
560
1216
|
top = foreach_stack[-1]
|
|
@@ -563,27 +1219,57 @@ class NativeRuntime(object):
|
|
|
563
1219
|
yield tuple(bottom + [top._replace(index=index)])
|
|
564
1220
|
|
|
565
1221
|
# required tasks are all split-siblings of the finished task
|
|
566
|
-
required_tasks =
|
|
567
|
-
|
|
568
|
-
|
|
1222
|
+
required_tasks = list(
|
|
1223
|
+
filter(
|
|
1224
|
+
lambda x: x is not None,
|
|
1225
|
+
[
|
|
1226
|
+
self._finished.get((p, s, iteration_stack))
|
|
1227
|
+
for p in direct_parents
|
|
1228
|
+
for s in siblings(foreach_stack)
|
|
1229
|
+
],
|
|
1230
|
+
)
|
|
1231
|
+
)
|
|
1232
|
+
required_count = task.finished_id[1][-1].num_splits
|
|
569
1233
|
join_type = "foreach"
|
|
1234
|
+
index = self._translate_index(task, next_step, "join")
|
|
570
1235
|
else:
|
|
571
1236
|
# next step is a split
|
|
572
|
-
|
|
573
|
-
|
|
574
|
-
|
|
575
|
-
|
|
576
|
-
|
|
577
|
-
|
|
1237
|
+
required_tasks = list(
|
|
1238
|
+
filter(
|
|
1239
|
+
lambda x: x is not None,
|
|
1240
|
+
[
|
|
1241
|
+
self._finished.get((p, foreach_stack, iteration_stack))
|
|
1242
|
+
for p in direct_parents
|
|
1243
|
+
],
|
|
1244
|
+
)
|
|
1245
|
+
)
|
|
578
1246
|
|
|
579
|
-
|
|
580
|
-
|
|
1247
|
+
required_count = len(matching_split.out_funcs)
|
|
1248
|
+
join_type = "linear"
|
|
1249
|
+
index = self._translate_index(task, next_step, "linear")
|
|
1250
|
+
if len(required_tasks) == required_count:
|
|
1251
|
+
# We have all the required previous tasks to schedule a join
|
|
581
1252
|
self._queue_push(
|
|
582
|
-
next_step,
|
|
1253
|
+
next_step,
|
|
1254
|
+
{"input_paths": required_tasks, "join_type": join_type},
|
|
1255
|
+
index,
|
|
583
1256
|
)
|
|
584
1257
|
|
|
585
|
-
def
|
|
1258
|
+
def _queue_task_switch(self, task, next_steps, is_recursive):
|
|
1259
|
+
chosen_step = next_steps[0]
|
|
1260
|
+
|
|
1261
|
+
loop_mode = LoopBehavior.NONE
|
|
1262
|
+
if is_recursive:
|
|
1263
|
+
if chosen_step != task.step:
|
|
1264
|
+
# We are exiting a loop
|
|
1265
|
+
loop_mode = LoopBehavior.EXITING
|
|
1266
|
+
else:
|
|
1267
|
+
# We are staying in the loop
|
|
1268
|
+
loop_mode = LoopBehavior.LOOPING
|
|
1269
|
+
index = self._translate_index(task, chosen_step, "linear", None, loop_mode)
|
|
1270
|
+
self._queue_push(chosen_step, {"input_paths": [task.path]}, index)
|
|
586
1271
|
|
|
1272
|
+
def _queue_task_foreach(self, task, next_steps):
|
|
587
1273
|
# CHECK: this condition should be enforced by the linter but
|
|
588
1274
|
# let's assert that the assumption holds
|
|
589
1275
|
if len(next_steps) > 1:
|
|
@@ -601,6 +1287,12 @@ class NativeRuntime(object):
|
|
|
601
1287
|
# Need to push control process related task.
|
|
602
1288
|
ubf_iter_name = task.results.get("_foreach_var")
|
|
603
1289
|
ubf_iter = task.results.get(ubf_iter_name)
|
|
1290
|
+
# UBF control task has no split index, hence "None" as place holder.
|
|
1291
|
+
|
|
1292
|
+
if task.results.get("_control_task_is_mapper_zero", False):
|
|
1293
|
+
index = self._translate_index(task, next_step, "split", 0)
|
|
1294
|
+
else:
|
|
1295
|
+
index = self._translate_index(task, next_step, "split", None)
|
|
604
1296
|
self._queue_push(
|
|
605
1297
|
next_step,
|
|
606
1298
|
{
|
|
@@ -608,6 +1300,7 @@ class NativeRuntime(object):
|
|
|
608
1300
|
"ubf_context": UBF_CONTROL,
|
|
609
1301
|
"ubf_iter": ubf_iter,
|
|
610
1302
|
},
|
|
1303
|
+
index,
|
|
611
1304
|
)
|
|
612
1305
|
else:
|
|
613
1306
|
num_splits = task.results["_foreach_num_splits"]
|
|
@@ -627,8 +1320,11 @@ class NativeRuntime(object):
|
|
|
627
1320
|
|
|
628
1321
|
# schedule all splits
|
|
629
1322
|
for i in range(num_splits):
|
|
1323
|
+
index = self._translate_index(task, next_step, "split", i)
|
|
630
1324
|
self._queue_push(
|
|
631
|
-
next_step,
|
|
1325
|
+
next_step,
|
|
1326
|
+
{"split_index": str(i), "input_paths": [task.path]},
|
|
1327
|
+
index,
|
|
632
1328
|
)
|
|
633
1329
|
|
|
634
1330
|
def _queue_tasks(self, finished_tasks):
|
|
@@ -649,7 +1345,39 @@ class NativeRuntime(object):
|
|
|
649
1345
|
next_steps = []
|
|
650
1346
|
foreach = None
|
|
651
1347
|
expected = self._graph[task.step].out_funcs
|
|
652
|
-
|
|
1348
|
+
|
|
1349
|
+
if self._graph[task.step].type == "split-switch":
|
|
1350
|
+
is_recursive = task.step in self._graph[task.step].out_funcs
|
|
1351
|
+
if len(next_steps) != 1:
|
|
1352
|
+
msg = (
|
|
1353
|
+
"Switch step *{step}* should transition to exactly "
|
|
1354
|
+
"one step at runtime, but got: {actual}"
|
|
1355
|
+
)
|
|
1356
|
+
raise MetaflowInternalError(
|
|
1357
|
+
msg.format(step=task.step, actual=", ".join(next_steps))
|
|
1358
|
+
)
|
|
1359
|
+
if next_steps[0] not in expected:
|
|
1360
|
+
msg = (
|
|
1361
|
+
"Switch step *{step}* transitioned to unexpected "
|
|
1362
|
+
"step *{actual}*. Expected one of: {expected}"
|
|
1363
|
+
)
|
|
1364
|
+
raise MetaflowInternalError(
|
|
1365
|
+
msg.format(
|
|
1366
|
+
step=task.step,
|
|
1367
|
+
actual=next_steps[0],
|
|
1368
|
+
expected=", ".join(expected),
|
|
1369
|
+
)
|
|
1370
|
+
)
|
|
1371
|
+
# When exiting a recursive loop, we mark that the loop itself has
|
|
1372
|
+
# finished by adding a special entry in self._finished which has
|
|
1373
|
+
# an iteration stack that is shorter (ie: we are out of the loop) so
|
|
1374
|
+
# that we can then find it when looking at successor tasks to launch.
|
|
1375
|
+
if is_recursive and next_steps[0] != task.step:
|
|
1376
|
+
step_name, finished_tuple, iteration_tuple = task.finished_id
|
|
1377
|
+
self._finished[
|
|
1378
|
+
(step_name, finished_tuple, iteration_tuple[:-1])
|
|
1379
|
+
] = task.path
|
|
1380
|
+
elif next_steps != expected:
|
|
653
1381
|
msg = (
|
|
654
1382
|
"Based on static analysis of the code, step *{step}* "
|
|
655
1383
|
"was expected to transition to step(s) *{expected}*. "
|
|
@@ -673,10 +1401,14 @@ class NativeRuntime(object):
|
|
|
673
1401
|
elif foreach:
|
|
674
1402
|
# Next step is a foreach child
|
|
675
1403
|
self._queue_task_foreach(task, next_steps)
|
|
1404
|
+
elif self._graph[task.step].type == "split-switch":
|
|
1405
|
+
# Current step is switch - queue the chosen step
|
|
1406
|
+
self._queue_task_switch(task, next_steps, is_recursive)
|
|
676
1407
|
else:
|
|
677
1408
|
# Next steps are normal linear steps
|
|
678
1409
|
for step in next_steps:
|
|
679
|
-
self.
|
|
1410
|
+
index = self._translate_index(task, step, "linear")
|
|
1411
|
+
self._queue_push(step, {"input_paths": [task.path]}, index)
|
|
680
1412
|
|
|
681
1413
|
def _poll_workers(self):
|
|
682
1414
|
if self._workers:
|
|
@@ -728,6 +1460,22 @@ class NativeRuntime(object):
|
|
|
728
1460
|
# Initialize the task (which can be expensive using remote datastores)
|
|
729
1461
|
# before launching the worker so that cost is amortized over time, instead
|
|
730
1462
|
# of doing it during _queue_push.
|
|
1463
|
+
if (
|
|
1464
|
+
FEAT_ALWAYS_UPLOAD_CODE_PACKAGE
|
|
1465
|
+
and "METAFLOW_CODE_SHA" not in os.environ
|
|
1466
|
+
):
|
|
1467
|
+
# We check if the code package is uploaded and, if so, we set the
|
|
1468
|
+
# environment variables that will cause the metadata service to
|
|
1469
|
+
# register the code package with the task created in _new_task below
|
|
1470
|
+
code_sha = self._package.package_sha(timeout=0.01)
|
|
1471
|
+
if code_sha:
|
|
1472
|
+
os.environ["METAFLOW_CODE_SHA"] = code_sha
|
|
1473
|
+
os.environ["METAFLOW_CODE_URL"] = self._package.package_url()
|
|
1474
|
+
os.environ["METAFLOW_CODE_DS"] = self._flow_datastore.TYPE
|
|
1475
|
+
os.environ["METAFLOW_CODE_METADATA"] = (
|
|
1476
|
+
self._package.package_metadata
|
|
1477
|
+
)
|
|
1478
|
+
|
|
731
1479
|
task = self._new_task(step, **task_kwargs)
|
|
732
1480
|
self._launch_worker(task)
|
|
733
1481
|
|
|
@@ -755,7 +1503,7 @@ class NativeRuntime(object):
|
|
|
755
1503
|
)
|
|
756
1504
|
return
|
|
757
1505
|
|
|
758
|
-
worker = Worker(task, self._max_log_size)
|
|
1506
|
+
worker = Worker(task, self._max_log_size, self._config_file_name)
|
|
759
1507
|
for fd in worker.fds():
|
|
760
1508
|
self._workers[fd] = worker
|
|
761
1509
|
self._poll.add(fd)
|
|
@@ -797,9 +1545,10 @@ class Task(object):
|
|
|
797
1545
|
join_type=None,
|
|
798
1546
|
task_id=None,
|
|
799
1547
|
resume_identifier=None,
|
|
1548
|
+
pathspec_index=None,
|
|
800
1549
|
):
|
|
801
|
-
|
|
802
1550
|
self.step = step
|
|
1551
|
+
self.flow = flow
|
|
803
1552
|
self.flow_name = flow.name
|
|
804
1553
|
self.run_id = run_id
|
|
805
1554
|
self.task_id = None
|
|
@@ -839,10 +1588,9 @@ class Task(object):
|
|
|
839
1588
|
self._is_resume_leader = None
|
|
840
1589
|
self._resume_done = None
|
|
841
1590
|
self._resume_identifier = resume_identifier
|
|
842
|
-
|
|
843
1591
|
origin = None
|
|
844
1592
|
if clone_run_id and may_clone:
|
|
845
|
-
origin = self._find_origin_task(clone_run_id, join_type)
|
|
1593
|
+
origin = self._find_origin_task(clone_run_id, join_type, pathspec_index)
|
|
846
1594
|
if origin and origin["_task_ok"]:
|
|
847
1595
|
# At this point, we know we are going to clone
|
|
848
1596
|
self._is_cloned = True
|
|
@@ -934,8 +1682,7 @@ class Task(object):
|
|
|
934
1682
|
# To avoid the edge case where the resume leader is selected but has not
|
|
935
1683
|
# yet written the _resume_leader metadata, we will wait for a few seconds.
|
|
936
1684
|
# We will wait for resume leader for at most 3 times.
|
|
937
|
-
for
|
|
938
|
-
|
|
1685
|
+
for _ in range(3):
|
|
939
1686
|
if ds.has_metadata("_resume_leader", add_attempt=False):
|
|
940
1687
|
resume_leader = ds.load_metadata(
|
|
941
1688
|
["_resume_leader"], add_attempt=False
|
|
@@ -964,10 +1711,11 @@ class Task(object):
|
|
|
964
1711
|
)
|
|
965
1712
|
|
|
966
1713
|
if self._is_resume_leader:
|
|
967
|
-
|
|
968
|
-
|
|
969
|
-
|
|
970
|
-
|
|
1714
|
+
if reentrant:
|
|
1715
|
+
self.log(
|
|
1716
|
+
"Selected as the reentrant clone leader.",
|
|
1717
|
+
system_msg=True,
|
|
1718
|
+
)
|
|
971
1719
|
# Clone in place without relying on run_queue.
|
|
972
1720
|
self.new_attempt()
|
|
973
1721
|
self._ds.clone(origin)
|
|
@@ -1016,13 +1764,13 @@ class Task(object):
|
|
|
1016
1764
|
self._should_skip_cloning = task_completed
|
|
1017
1765
|
if self._should_skip_cloning:
|
|
1018
1766
|
self.log(
|
|
1019
|
-
"
|
|
1767
|
+
"Skipping cloning of previously run task %s"
|
|
1768
|
+
% self.clone_origin,
|
|
1020
1769
|
system_msg=True,
|
|
1021
1770
|
)
|
|
1022
1771
|
else:
|
|
1023
1772
|
self.log(
|
|
1024
|
-
"Cloning
|
|
1025
|
-
% self.clone_origin,
|
|
1773
|
+
"Cloning previously run task %s" % self.clone_origin,
|
|
1026
1774
|
system_msg=True,
|
|
1027
1775
|
)
|
|
1028
1776
|
else:
|
|
@@ -1035,7 +1783,6 @@ class Task(object):
|
|
|
1035
1783
|
# Open the output datastore only if the task is not being cloned.
|
|
1036
1784
|
if not self._is_cloned:
|
|
1037
1785
|
self.new_attempt()
|
|
1038
|
-
|
|
1039
1786
|
for deco in decos:
|
|
1040
1787
|
deco.runtime_task_created(
|
|
1041
1788
|
self._ds,
|
|
@@ -1112,63 +1859,34 @@ class Task(object):
|
|
|
1112
1859
|
|
|
1113
1860
|
def _get_task_id(self, task_id):
|
|
1114
1861
|
already_existed = True
|
|
1862
|
+
tags = []
|
|
1115
1863
|
if self.ubf_context == UBF_CONTROL:
|
|
1116
|
-
|
|
1117
|
-
|
|
1118
|
-
# We associate the control task-id to be 1:1 with the split node
|
|
1119
|
-
# where the unbounded-foreach was defined.
|
|
1120
|
-
# We prefer encoding the corresponding split into the task_id of
|
|
1121
|
-
# the control node; so it has access to this information quite
|
|
1122
|
-
# easily. There is anyway a corresponding int id stored in the
|
|
1123
|
-
# metadata backend - so this should be fine.
|
|
1124
|
-
task_id = "control-%s-%s-%s" % (run, input_step, input_task)
|
|
1125
|
-
# Register only regular Metaflow (non control) tasks.
|
|
1864
|
+
tags = [CONTROL_TASK_TAG]
|
|
1865
|
+
# Register Metaflow tasks.
|
|
1126
1866
|
if task_id is None:
|
|
1127
|
-
task_id = str(
|
|
1867
|
+
task_id = str(
|
|
1868
|
+
self.metadata.new_task_id(self.run_id, self.step, sys_tags=tags)
|
|
1869
|
+
)
|
|
1128
1870
|
already_existed = False
|
|
1129
1871
|
else:
|
|
1130
|
-
# task_id is preset only by persist_constants()
|
|
1131
|
-
|
|
1132
|
-
|
|
1133
|
-
|
|
1134
|
-
|
|
1135
|
-
|
|
1136
|
-
|
|
1137
|
-
|
|
1138
|
-
attempt_id,
|
|
1139
|
-
sys_tags=tags,
|
|
1140
|
-
)
|
|
1141
|
-
# A Task's tags are now those of its ancestral Run, so we are not able
|
|
1142
|
-
# to rely on a task's tags to indicate the presence of a control task
|
|
1143
|
-
# so, on top of adding the tags above, we also add a task metadata
|
|
1144
|
-
# entry indicating that this is a "control task".
|
|
1145
|
-
#
|
|
1146
|
-
# Here we will also add a task metadata entry to indicate "control task".
|
|
1147
|
-
# Within the metaflow repo, the only dependency of such a "control task"
|
|
1148
|
-
# indicator is in the integration test suite (see Step.control_tasks() in
|
|
1149
|
-
# client API).
|
|
1150
|
-
task_metadata_list = [
|
|
1151
|
-
MetaDatum(
|
|
1152
|
-
field="internal_task_type",
|
|
1153
|
-
value=CONTROL_TASK_TAG,
|
|
1154
|
-
type="internal_task_type",
|
|
1155
|
-
tags=["attempt_id:{0}".format(attempt_id)],
|
|
1156
|
-
)
|
|
1157
|
-
]
|
|
1158
|
-
self.metadata.register_metadata(
|
|
1159
|
-
self.run_id, self.step, task_id, task_metadata_list
|
|
1160
|
-
)
|
|
1161
|
-
else:
|
|
1162
|
-
already_existed = not self.metadata.register_task_id(
|
|
1163
|
-
self.run_id, self.step, task_id, 0
|
|
1164
|
-
)
|
|
1872
|
+
# task_id is preset only by persist_constants().
|
|
1873
|
+
already_existed = not self.metadata.register_task_id(
|
|
1874
|
+
self.run_id,
|
|
1875
|
+
self.step,
|
|
1876
|
+
task_id,
|
|
1877
|
+
0,
|
|
1878
|
+
sys_tags=tags,
|
|
1879
|
+
)
|
|
1165
1880
|
|
|
1166
1881
|
self.task_id = task_id
|
|
1167
1882
|
self._path = "%s/%s/%s" % (self.run_id, self.step, self.task_id)
|
|
1168
1883
|
return already_existed
|
|
1169
1884
|
|
|
1170
|
-
def _find_origin_task(self, clone_run_id, join_type):
|
|
1171
|
-
if
|
|
1885
|
+
def _find_origin_task(self, clone_run_id, join_type, pathspec_index=None):
|
|
1886
|
+
if pathspec_index:
|
|
1887
|
+
origin = self.origin_ds_set.get_with_pathspec_index(pathspec_index)
|
|
1888
|
+
return origin
|
|
1889
|
+
elif self.step == "_parameters":
|
|
1172
1890
|
pathspec = "%s/_parameters[]" % clone_run_id
|
|
1173
1891
|
origin = self.origin_ds_set.get_with_pathspec_index(pathspec)
|
|
1174
1892
|
|
|
@@ -1218,16 +1936,23 @@ class Task(object):
|
|
|
1218
1936
|
)
|
|
1219
1937
|
return self._results_ds
|
|
1220
1938
|
|
|
1939
|
+
@property
|
|
1940
|
+
def task_index(self):
|
|
1941
|
+
_, task_index = self.results.pathspec_index.split("/")
|
|
1942
|
+
return task_index
|
|
1943
|
+
|
|
1221
1944
|
@property
|
|
1222
1945
|
def finished_id(self):
|
|
1223
1946
|
# note: id is not available before the task has finished.
|
|
1224
|
-
# Index already identifies the task within the foreach
|
|
1225
|
-
#
|
|
1947
|
+
# Index already identifies the task within the foreach and loop.
|
|
1948
|
+
# We will remove foreach value so that it is easier to
|
|
1226
1949
|
# identify siblings within a foreach.
|
|
1227
1950
|
foreach_stack_tuple = tuple(
|
|
1228
1951
|
[s._replace(value=0) for s in self.results["_foreach_stack"]]
|
|
1229
1952
|
)
|
|
1230
|
-
|
|
1953
|
+
# _iteration_stack requires a fallback, as it does not exist for runs before v2.17.4
|
|
1954
|
+
iteration_stack_tuple = tuple(self.results.get("_iteration_stack", []))
|
|
1955
|
+
return (self.step, foreach_stack_tuple, iteration_stack_tuple)
|
|
1231
1956
|
|
|
1232
1957
|
@property
|
|
1233
1958
|
def is_cloned(self):
|
|
@@ -1301,9 +2026,29 @@ class CLIArgs(object):
|
|
|
1301
2026
|
for step execution in StepDecorator.runtime_step_cli().
|
|
1302
2027
|
"""
|
|
1303
2028
|
|
|
1304
|
-
def __init__(
|
|
2029
|
+
def __init__(
|
|
2030
|
+
self,
|
|
2031
|
+
task,
|
|
2032
|
+
orig_flow_datastore=None,
|
|
2033
|
+
spin_pathspec=None,
|
|
2034
|
+
artifacts_module=None,
|
|
2035
|
+
persist=True,
|
|
2036
|
+
skip_decorators=False,
|
|
2037
|
+
):
|
|
1305
2038
|
self.task = task
|
|
2039
|
+
if orig_flow_datastore is not None:
|
|
2040
|
+
self.orig_flow_datastore = "%s@%s" % (
|
|
2041
|
+
orig_flow_datastore.TYPE,
|
|
2042
|
+
orig_flow_datastore.datastore_root,
|
|
2043
|
+
)
|
|
2044
|
+
else:
|
|
2045
|
+
self.orig_flow_datastore = None
|
|
2046
|
+
self.spin_pathspec = spin_pathspec
|
|
2047
|
+
self.artifacts_module = artifacts_module
|
|
2048
|
+
self.persist = persist
|
|
2049
|
+
self.skip_decorators = skip_decorators
|
|
1306
2050
|
self.entrypoint = list(task.entrypoint)
|
|
2051
|
+
step_obj = getattr(self.task.flow, self.task.step)
|
|
1307
2052
|
self.top_level_options = {
|
|
1308
2053
|
"quiet": True,
|
|
1309
2054
|
"metadata": self.task.metadata_type,
|
|
@@ -1315,38 +2060,77 @@ class CLIArgs(object):
|
|
|
1315
2060
|
"datastore-root": self.task.datastore_sysroot,
|
|
1316
2061
|
"with": [
|
|
1317
2062
|
deco.make_decorator_spec()
|
|
1318
|
-
for deco in
|
|
1319
|
-
|
|
2063
|
+
for deco in chain(
|
|
2064
|
+
self.task.decos,
|
|
2065
|
+
step_obj.wrappers,
|
|
2066
|
+
step_obj.config_decorators,
|
|
2067
|
+
)
|
|
2068
|
+
if not deco.statically_defined and deco.inserted_by is None
|
|
1320
2069
|
],
|
|
1321
2070
|
}
|
|
1322
2071
|
|
|
1323
2072
|
# FlowDecorators can define their own top-level options. They are
|
|
1324
2073
|
# responsible for adding their own top-level options and values through
|
|
1325
2074
|
# the get_top_level_options() hook.
|
|
1326
|
-
for deco in flow_decorators():
|
|
2075
|
+
for deco in flow_decorators(self.task.flow):
|
|
1327
2076
|
self.top_level_options.update(deco.get_top_level_options())
|
|
1328
2077
|
|
|
2078
|
+
# We also pass configuration options using the kv.<name> syntax which will cause
|
|
2079
|
+
# the configuration options to be loaded from the CONFIG file (or local-config-file
|
|
2080
|
+
# in the case of the local runtime)
|
|
2081
|
+
configs = self.task.flow._flow_state[FlowStateItems.CONFIGS]
|
|
2082
|
+
if configs:
|
|
2083
|
+
self.top_level_options["config-value"] = [
|
|
2084
|
+
(k, ConfigInput.make_key_name(k)) for k in configs
|
|
2085
|
+
]
|
|
2086
|
+
|
|
2087
|
+
if spin_pathspec:
|
|
2088
|
+
self.spin_args()
|
|
2089
|
+
else:
|
|
2090
|
+
self.default_args()
|
|
2091
|
+
|
|
2092
|
+
def default_args(self):
|
|
1329
2093
|
self.commands = ["step"]
|
|
1330
2094
|
self.command_args = [self.task.step]
|
|
1331
2095
|
self.command_options = {
|
|
1332
|
-
"run-id": task.run_id,
|
|
1333
|
-
"task-id": task.task_id,
|
|
1334
|
-
"input-paths": compress_list(task.input_paths),
|
|
1335
|
-
"split-index": task.split_index,
|
|
1336
|
-
"retry-count": task.retries,
|
|
1337
|
-
"max-user-code-retries": task.user_code_retries,
|
|
1338
|
-
"tag": task.tags,
|
|
2096
|
+
"run-id": self.task.run_id,
|
|
2097
|
+
"task-id": self.task.task_id,
|
|
2098
|
+
"input-paths": compress_list(self.task.input_paths),
|
|
2099
|
+
"split-index": self.task.split_index,
|
|
2100
|
+
"retry-count": self.task.retries,
|
|
2101
|
+
"max-user-code-retries": self.task.user_code_retries,
|
|
2102
|
+
"tag": self.task.tags,
|
|
1339
2103
|
"namespace": get_namespace() or "",
|
|
1340
|
-
"ubf-context": task.ubf_context,
|
|
2104
|
+
"ubf-context": self.task.ubf_context,
|
|
1341
2105
|
}
|
|
1342
2106
|
self.env = {}
|
|
1343
2107
|
|
|
1344
|
-
def
|
|
2108
|
+
def spin_args(self):
|
|
2109
|
+
self.commands = ["spin-step"]
|
|
2110
|
+
self.command_args = [self.task.step]
|
|
1345
2111
|
|
|
2112
|
+
self.command_options = {
|
|
2113
|
+
"run-id": self.task.run_id,
|
|
2114
|
+
"task-id": self.task.task_id,
|
|
2115
|
+
"input-paths": compress_list(self.task.input_paths),
|
|
2116
|
+
"split-index": self.task.split_index,
|
|
2117
|
+
"retry-count": self.task.retries,
|
|
2118
|
+
"max-user-code-retries": self.task.user_code_retries,
|
|
2119
|
+
"namespace": get_namespace() or "",
|
|
2120
|
+
"orig-flow-datastore": self.orig_flow_datastore,
|
|
2121
|
+
"artifacts-module": self.artifacts_module,
|
|
2122
|
+
"skip-decorators": self.skip_decorators,
|
|
2123
|
+
}
|
|
2124
|
+
if self.persist:
|
|
2125
|
+
self.command_options["persist"] = True
|
|
2126
|
+
else:
|
|
2127
|
+
self.command_options["no-persist"] = True
|
|
2128
|
+
self.env = {}
|
|
2129
|
+
|
|
2130
|
+
def get_args(self):
|
|
1346
2131
|
# TODO: Make one with dict_to_cli_options; see cli_args.py for more detail
|
|
1347
2132
|
def _options(mapping):
|
|
1348
2133
|
for k, v in mapping.items():
|
|
1349
|
-
|
|
1350
2134
|
# None or False arguments are ignored
|
|
1351
2135
|
# v needs to be explicitly False, not falsy, e.g. 0 is an acceptable value
|
|
1352
2136
|
if v is None or v is False:
|
|
@@ -1361,12 +2145,15 @@ class CLIArgs(object):
|
|
|
1361
2145
|
for value in v:
|
|
1362
2146
|
yield "--%s" % k
|
|
1363
2147
|
if not isinstance(value, bool):
|
|
1364
|
-
|
|
2148
|
+
value = value if isinstance(value, tuple) else (value,)
|
|
2149
|
+
for vv in value:
|
|
2150
|
+
yield to_unicode(vv)
|
|
1365
2151
|
|
|
1366
2152
|
args = list(self.entrypoint)
|
|
1367
2153
|
args.extend(_options(self.top_level_options))
|
|
1368
2154
|
args.extend(self.commands)
|
|
1369
2155
|
args.extend(self.command_args)
|
|
2156
|
+
|
|
1370
2157
|
args.extend(_options(self.command_options))
|
|
1371
2158
|
return args
|
|
1372
2159
|
|
|
@@ -1378,8 +2165,24 @@ class CLIArgs(object):
|
|
|
1378
2165
|
|
|
1379
2166
|
|
|
1380
2167
|
class Worker(object):
|
|
1381
|
-
def __init__(
|
|
2168
|
+
def __init__(
|
|
2169
|
+
self,
|
|
2170
|
+
task,
|
|
2171
|
+
max_logs_size,
|
|
2172
|
+
config_file_name,
|
|
2173
|
+
orig_flow_datastore=None,
|
|
2174
|
+
spin_pathspec=None,
|
|
2175
|
+
artifacts_module=None,
|
|
2176
|
+
persist=True,
|
|
2177
|
+
skip_decorators=False,
|
|
2178
|
+
):
|
|
1382
2179
|
self.task = task
|
|
2180
|
+
self._config_file_name = config_file_name
|
|
2181
|
+
self._orig_flow_datastore = orig_flow_datastore
|
|
2182
|
+
self._spin_pathspec = spin_pathspec
|
|
2183
|
+
self._artifacts_module = artifacts_module
|
|
2184
|
+
self._skip_decorators = skip_decorators
|
|
2185
|
+
self._persist = persist
|
|
1383
2186
|
self._proc = self._launch()
|
|
1384
2187
|
|
|
1385
2188
|
if task.retries > task.user_code_retries:
|
|
@@ -1411,7 +2214,14 @@ class Worker(object):
|
|
|
1411
2214
|
# not it is properly shut down)
|
|
1412
2215
|
|
|
1413
2216
|
def _launch(self):
|
|
1414
|
-
args = CLIArgs(
|
|
2217
|
+
args = CLIArgs(
|
|
2218
|
+
self.task,
|
|
2219
|
+
orig_flow_datastore=self._orig_flow_datastore,
|
|
2220
|
+
spin_pathspec=self._spin_pathspec,
|
|
2221
|
+
artifacts_module=self._artifacts_module,
|
|
2222
|
+
persist=self._persist,
|
|
2223
|
+
skip_decorators=self._skip_decorators,
|
|
2224
|
+
)
|
|
1415
2225
|
env = dict(os.environ)
|
|
1416
2226
|
|
|
1417
2227
|
if self.task.clone_run_id:
|
|
@@ -1431,6 +2241,12 @@ class Worker(object):
|
|
|
1431
2241
|
self.task.user_code_retries,
|
|
1432
2242
|
self.task.ubf_context,
|
|
1433
2243
|
)
|
|
2244
|
+
|
|
2245
|
+
# Add user configurations using a file to avoid using up too much space on the
|
|
2246
|
+
# command line
|
|
2247
|
+
if self._config_file_name:
|
|
2248
|
+
args.top_level_options["local-config-file"] = self._config_file_name
|
|
2249
|
+
# Pass configuration options
|
|
1434
2250
|
env.update(args.get_env())
|
|
1435
2251
|
env["PYTHONUNBUFFERED"] = "x"
|
|
1436
2252
|
tracing.inject_tracing_vars(env)
|
|
@@ -1438,6 +2254,7 @@ class Worker(object):
|
|
|
1438
2254
|
# by read_logline() below that relies on readline() not blocking
|
|
1439
2255
|
# print('running', args)
|
|
1440
2256
|
cmdline = args.get_args()
|
|
2257
|
+
from_start(f"Command line: {' '.join(cmdline)}")
|
|
1441
2258
|
debug.subcommand_exec(cmdline)
|
|
1442
2259
|
return subprocess.Popen(
|
|
1443
2260
|
cmdline,
|
|
@@ -1560,13 +2377,14 @@ class Worker(object):
|
|
|
1560
2377
|
else:
|
|
1561
2378
|
self.emit_log(b"Task failed.", self._stderr, system_msg=True)
|
|
1562
2379
|
else:
|
|
1563
|
-
|
|
1564
|
-
|
|
1565
|
-
|
|
1566
|
-
|
|
1567
|
-
|
|
1568
|
-
|
|
1569
|
-
|
|
2380
|
+
if not self._spin_pathspec:
|
|
2381
|
+
num = self.task.results["_foreach_num_splits"]
|
|
2382
|
+
if num:
|
|
2383
|
+
self.task.log(
|
|
2384
|
+
"Foreach yields %d child steps." % num,
|
|
2385
|
+
system_msg=True,
|
|
2386
|
+
pid=self._proc.pid,
|
|
2387
|
+
)
|
|
1570
2388
|
self.task.log(
|
|
1571
2389
|
"Task finished successfully.", system_msg=True, pid=self._proc.pid
|
|
1572
2390
|
)
|