ob-metaflow 2.11.13.1__py2.py3-none-any.whl → 2.19.7.1rc0__py2.py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- metaflow/R.py +10 -7
- metaflow/__init__.py +40 -25
- metaflow/_vendor/imghdr/__init__.py +186 -0
- metaflow/_vendor/importlib_metadata/__init__.py +1063 -0
- metaflow/_vendor/importlib_metadata/_adapters.py +68 -0
- metaflow/_vendor/importlib_metadata/_collections.py +30 -0
- metaflow/_vendor/importlib_metadata/_compat.py +71 -0
- metaflow/_vendor/importlib_metadata/_functools.py +104 -0
- metaflow/_vendor/importlib_metadata/_itertools.py +73 -0
- metaflow/_vendor/importlib_metadata/_meta.py +48 -0
- metaflow/_vendor/importlib_metadata/_text.py +99 -0
- metaflow/_vendor/importlib_metadata/py.typed +0 -0
- metaflow/_vendor/typeguard/__init__.py +48 -0
- metaflow/_vendor/typeguard/_checkers.py +1070 -0
- metaflow/_vendor/typeguard/_config.py +108 -0
- metaflow/_vendor/typeguard/_decorators.py +233 -0
- metaflow/_vendor/typeguard/_exceptions.py +42 -0
- metaflow/_vendor/typeguard/_functions.py +308 -0
- metaflow/_vendor/typeguard/_importhook.py +213 -0
- metaflow/_vendor/typeguard/_memo.py +48 -0
- metaflow/_vendor/typeguard/_pytest_plugin.py +127 -0
- metaflow/_vendor/typeguard/_suppression.py +86 -0
- metaflow/_vendor/typeguard/_transformer.py +1229 -0
- metaflow/_vendor/typeguard/_union_transformer.py +55 -0
- metaflow/_vendor/typeguard/_utils.py +173 -0
- metaflow/_vendor/typeguard/py.typed +0 -0
- metaflow/_vendor/typing_extensions.py +3641 -0
- metaflow/_vendor/v3_7/importlib_metadata/__init__.py +1063 -0
- metaflow/_vendor/v3_7/importlib_metadata/_adapters.py +68 -0
- metaflow/_vendor/v3_7/importlib_metadata/_collections.py +30 -0
- metaflow/_vendor/v3_7/importlib_metadata/_compat.py +71 -0
- metaflow/_vendor/v3_7/importlib_metadata/_functools.py +104 -0
- metaflow/_vendor/v3_7/importlib_metadata/_itertools.py +73 -0
- metaflow/_vendor/v3_7/importlib_metadata/_meta.py +48 -0
- metaflow/_vendor/v3_7/importlib_metadata/_text.py +99 -0
- metaflow/_vendor/v3_7/importlib_metadata/py.typed +0 -0
- metaflow/_vendor/v3_7/typeguard/__init__.py +48 -0
- metaflow/_vendor/v3_7/typeguard/_checkers.py +906 -0
- metaflow/_vendor/v3_7/typeguard/_config.py +108 -0
- metaflow/_vendor/v3_7/typeguard/_decorators.py +237 -0
- metaflow/_vendor/v3_7/typeguard/_exceptions.py +42 -0
- metaflow/_vendor/v3_7/typeguard/_functions.py +310 -0
- metaflow/_vendor/v3_7/typeguard/_importhook.py +213 -0
- metaflow/_vendor/v3_7/typeguard/_memo.py +48 -0
- metaflow/_vendor/v3_7/typeguard/_pytest_plugin.py +100 -0
- metaflow/_vendor/v3_7/typeguard/_suppression.py +88 -0
- metaflow/_vendor/v3_7/typeguard/_transformer.py +1207 -0
- metaflow/_vendor/v3_7/typeguard/_union_transformer.py +54 -0
- metaflow/_vendor/v3_7/typeguard/_utils.py +169 -0
- metaflow/_vendor/v3_7/typeguard/py.typed +0 -0
- metaflow/_vendor/v3_7/typing_extensions.py +3072 -0
- metaflow/_vendor/yaml/__init__.py +427 -0
- metaflow/_vendor/yaml/composer.py +139 -0
- metaflow/_vendor/yaml/constructor.py +748 -0
- metaflow/_vendor/yaml/cyaml.py +101 -0
- metaflow/_vendor/yaml/dumper.py +62 -0
- metaflow/_vendor/yaml/emitter.py +1137 -0
- metaflow/_vendor/yaml/error.py +75 -0
- metaflow/_vendor/yaml/events.py +86 -0
- metaflow/_vendor/yaml/loader.py +63 -0
- metaflow/_vendor/yaml/nodes.py +49 -0
- metaflow/_vendor/yaml/parser.py +589 -0
- metaflow/_vendor/yaml/reader.py +185 -0
- metaflow/_vendor/yaml/representer.py +389 -0
- metaflow/_vendor/yaml/resolver.py +227 -0
- metaflow/_vendor/yaml/scanner.py +1435 -0
- metaflow/_vendor/yaml/serializer.py +111 -0
- metaflow/_vendor/yaml/tokens.py +104 -0
- metaflow/cards.py +5 -0
- metaflow/cli.py +331 -785
- metaflow/cli_args.py +17 -0
- metaflow/cli_components/__init__.py +0 -0
- metaflow/cli_components/dump_cmd.py +96 -0
- metaflow/cli_components/init_cmd.py +52 -0
- metaflow/cli_components/run_cmds.py +546 -0
- metaflow/cli_components/step_cmd.py +334 -0
- metaflow/cli_components/utils.py +140 -0
- metaflow/client/__init__.py +1 -0
- metaflow/client/core.py +467 -73
- metaflow/client/filecache.py +75 -35
- metaflow/clone_util.py +7 -1
- metaflow/cmd/code/__init__.py +231 -0
- metaflow/cmd/develop/stub_generator.py +756 -288
- metaflow/cmd/develop/stubs.py +12 -28
- metaflow/cmd/main_cli.py +6 -4
- metaflow/cmd/make_wrapper.py +78 -0
- metaflow/datastore/__init__.py +1 -0
- metaflow/datastore/content_addressed_store.py +41 -10
- metaflow/datastore/datastore_set.py +11 -2
- metaflow/datastore/flow_datastore.py +156 -10
- metaflow/datastore/spin_datastore.py +91 -0
- metaflow/datastore/task_datastore.py +154 -39
- metaflow/debug.py +5 -0
- metaflow/decorators.py +404 -78
- metaflow/exception.py +8 -2
- metaflow/extension_support/__init__.py +527 -376
- metaflow/extension_support/_empty_file.py +2 -2
- metaflow/extension_support/plugins.py +49 -31
- metaflow/flowspec.py +482 -33
- metaflow/graph.py +210 -42
- metaflow/includefile.py +84 -40
- metaflow/lint.py +141 -22
- metaflow/meta_files.py +13 -0
- metaflow/{metadata → metadata_provider}/heartbeat.py +24 -8
- metaflow/{metadata → metadata_provider}/metadata.py +86 -1
- metaflow/metaflow_config.py +175 -28
- metaflow/metaflow_config_funcs.py +51 -3
- metaflow/metaflow_current.py +4 -10
- metaflow/metaflow_environment.py +139 -53
- metaflow/metaflow_git.py +115 -0
- metaflow/metaflow_profile.py +18 -0
- metaflow/metaflow_version.py +150 -66
- metaflow/mflog/__init__.py +4 -3
- metaflow/mflog/save_logs.py +2 -2
- metaflow/multicore_utils.py +31 -14
- metaflow/package/__init__.py +673 -0
- metaflow/packaging_sys/__init__.py +880 -0
- metaflow/packaging_sys/backend.py +128 -0
- metaflow/packaging_sys/distribution_support.py +153 -0
- metaflow/packaging_sys/tar_backend.py +99 -0
- metaflow/packaging_sys/utils.py +54 -0
- metaflow/packaging_sys/v1.py +527 -0
- metaflow/parameters.py +149 -28
- metaflow/plugins/__init__.py +74 -5
- metaflow/plugins/airflow/airflow.py +40 -25
- metaflow/plugins/airflow/airflow_cli.py +22 -5
- metaflow/plugins/airflow/airflow_decorator.py +1 -1
- metaflow/plugins/airflow/airflow_utils.py +5 -3
- metaflow/plugins/airflow/sensors/base_sensor.py +4 -4
- metaflow/plugins/airflow/sensors/external_task_sensor.py +2 -2
- metaflow/plugins/airflow/sensors/s3_sensor.py +2 -2
- metaflow/plugins/argo/argo_client.py +78 -33
- metaflow/plugins/argo/argo_events.py +6 -6
- metaflow/plugins/argo/argo_workflows.py +2410 -527
- metaflow/plugins/argo/argo_workflows_cli.py +571 -121
- metaflow/plugins/argo/argo_workflows_decorator.py +43 -12
- metaflow/plugins/argo/argo_workflows_deployer.py +106 -0
- metaflow/plugins/argo/argo_workflows_deployer_objects.py +453 -0
- metaflow/plugins/argo/capture_error.py +73 -0
- metaflow/plugins/argo/conditional_input_paths.py +35 -0
- metaflow/plugins/argo/exit_hooks.py +209 -0
- metaflow/plugins/argo/jobset_input_paths.py +15 -0
- metaflow/plugins/argo/param_val.py +19 -0
- metaflow/plugins/aws/aws_client.py +10 -3
- metaflow/plugins/aws/aws_utils.py +55 -2
- metaflow/plugins/aws/batch/batch.py +72 -5
- metaflow/plugins/aws/batch/batch_cli.py +33 -10
- metaflow/plugins/aws/batch/batch_client.py +4 -3
- metaflow/plugins/aws/batch/batch_decorator.py +102 -35
- metaflow/plugins/aws/secrets_manager/aws_secrets_manager_secrets_provider.py +13 -10
- metaflow/plugins/aws/step_functions/dynamo_db_client.py +0 -3
- metaflow/plugins/aws/step_functions/production_token.py +1 -1
- metaflow/plugins/aws/step_functions/step_functions.py +65 -8
- metaflow/plugins/aws/step_functions/step_functions_cli.py +101 -7
- metaflow/plugins/aws/step_functions/step_functions_decorator.py +1 -2
- metaflow/plugins/aws/step_functions/step_functions_deployer.py +97 -0
- metaflow/plugins/aws/step_functions/step_functions_deployer_objects.py +264 -0
- metaflow/plugins/azure/azure_exceptions.py +1 -1
- metaflow/plugins/azure/azure_secret_manager_secrets_provider.py +240 -0
- metaflow/plugins/azure/azure_tail.py +1 -1
- metaflow/plugins/azure/includefile_support.py +2 -0
- metaflow/plugins/cards/card_cli.py +66 -30
- metaflow/plugins/cards/card_creator.py +25 -1
- metaflow/plugins/cards/card_datastore.py +21 -49
- metaflow/plugins/cards/card_decorator.py +132 -8
- metaflow/plugins/cards/card_modules/basic.py +112 -17
- metaflow/plugins/cards/card_modules/bundle.css +1 -1
- metaflow/plugins/cards/card_modules/card.py +16 -1
- metaflow/plugins/cards/card_modules/chevron/renderer.py +1 -1
- metaflow/plugins/cards/card_modules/components.py +665 -28
- metaflow/plugins/cards/card_modules/convert_to_native_type.py +36 -7
- metaflow/plugins/cards/card_modules/json_viewer.py +232 -0
- metaflow/plugins/cards/card_modules/main.css +1 -0
- metaflow/plugins/cards/card_modules/main.js +68 -49
- metaflow/plugins/cards/card_modules/renderer_tools.py +1 -0
- metaflow/plugins/cards/card_modules/test_cards.py +26 -12
- metaflow/plugins/cards/card_server.py +39 -14
- metaflow/plugins/cards/component_serializer.py +2 -9
- metaflow/plugins/cards/metadata.py +22 -0
- metaflow/plugins/catch_decorator.py +9 -0
- metaflow/plugins/datastores/azure_storage.py +10 -1
- metaflow/plugins/datastores/gs_storage.py +6 -2
- metaflow/plugins/datastores/local_storage.py +12 -6
- metaflow/plugins/datastores/spin_storage.py +12 -0
- metaflow/plugins/datatools/local.py +2 -0
- metaflow/plugins/datatools/s3/s3.py +126 -75
- metaflow/plugins/datatools/s3/s3op.py +254 -121
- metaflow/plugins/env_escape/__init__.py +3 -3
- metaflow/plugins/env_escape/client_modules.py +102 -72
- metaflow/plugins/env_escape/server.py +7 -0
- metaflow/plugins/env_escape/stub.py +24 -5
- metaflow/plugins/events_decorator.py +343 -185
- metaflow/plugins/exit_hook/__init__.py +0 -0
- metaflow/plugins/exit_hook/exit_hook_decorator.py +46 -0
- metaflow/plugins/exit_hook/exit_hook_script.py +52 -0
- metaflow/plugins/gcp/__init__.py +1 -1
- metaflow/plugins/gcp/gcp_secret_manager_secrets_provider.py +11 -6
- metaflow/plugins/gcp/gs_tail.py +10 -6
- metaflow/plugins/gcp/includefile_support.py +3 -0
- metaflow/plugins/kubernetes/kube_utils.py +108 -0
- metaflow/plugins/kubernetes/kubernetes.py +411 -130
- metaflow/plugins/kubernetes/kubernetes_cli.py +168 -36
- metaflow/plugins/kubernetes/kubernetes_client.py +104 -2
- metaflow/plugins/kubernetes/kubernetes_decorator.py +246 -88
- metaflow/plugins/kubernetes/kubernetes_job.py +253 -581
- metaflow/plugins/kubernetes/kubernetes_jobsets.py +1071 -0
- metaflow/plugins/kubernetes/spot_metadata_cli.py +69 -0
- metaflow/plugins/kubernetes/spot_monitor_sidecar.py +109 -0
- metaflow/plugins/logs_cli.py +359 -0
- metaflow/plugins/{metadata → metadata_providers}/local.py +144 -84
- metaflow/plugins/{metadata → metadata_providers}/service.py +103 -26
- metaflow/plugins/metadata_providers/spin.py +16 -0
- metaflow/plugins/package_cli.py +36 -24
- metaflow/plugins/parallel_decorator.py +128 -11
- metaflow/plugins/parsers.py +16 -0
- metaflow/plugins/project_decorator.py +51 -5
- metaflow/plugins/pypi/bootstrap.py +357 -105
- metaflow/plugins/pypi/conda_decorator.py +82 -81
- metaflow/plugins/pypi/conda_environment.py +187 -52
- metaflow/plugins/pypi/micromamba.py +157 -47
- metaflow/plugins/pypi/parsers.py +268 -0
- metaflow/plugins/pypi/pip.py +88 -13
- metaflow/plugins/pypi/pypi_decorator.py +37 -1
- metaflow/plugins/pypi/utils.py +48 -2
- metaflow/plugins/resources_decorator.py +2 -2
- metaflow/plugins/secrets/__init__.py +3 -0
- metaflow/plugins/secrets/secrets_decorator.py +26 -181
- metaflow/plugins/secrets/secrets_func.py +49 -0
- metaflow/plugins/secrets/secrets_spec.py +101 -0
- metaflow/plugins/secrets/utils.py +74 -0
- metaflow/plugins/tag_cli.py +4 -7
- metaflow/plugins/test_unbounded_foreach_decorator.py +41 -6
- metaflow/plugins/timeout_decorator.py +3 -3
- metaflow/plugins/uv/__init__.py +0 -0
- metaflow/plugins/uv/bootstrap.py +128 -0
- metaflow/plugins/uv/uv_environment.py +72 -0
- metaflow/procpoll.py +1 -1
- metaflow/pylint_wrapper.py +5 -1
- metaflow/runner/__init__.py +0 -0
- metaflow/runner/click_api.py +717 -0
- metaflow/runner/deployer.py +470 -0
- metaflow/runner/deployer_impl.py +201 -0
- metaflow/runner/metaflow_runner.py +714 -0
- metaflow/runner/nbdeploy.py +132 -0
- metaflow/runner/nbrun.py +225 -0
- metaflow/runner/subprocess_manager.py +650 -0
- metaflow/runner/utils.py +335 -0
- metaflow/runtime.py +1078 -260
- metaflow/sidecar/sidecar_worker.py +1 -1
- metaflow/system/__init__.py +5 -0
- metaflow/system/system_logger.py +85 -0
- metaflow/system/system_monitor.py +108 -0
- metaflow/system/system_utils.py +19 -0
- metaflow/task.py +521 -225
- metaflow/tracing/__init__.py +7 -7
- metaflow/tracing/span_exporter.py +31 -38
- metaflow/tracing/tracing_modules.py +38 -43
- metaflow/tuple_util.py +27 -0
- metaflow/user_configs/__init__.py +0 -0
- metaflow/user_configs/config_options.py +563 -0
- metaflow/user_configs/config_parameters.py +598 -0
- metaflow/user_decorators/__init__.py +0 -0
- metaflow/user_decorators/common.py +144 -0
- metaflow/user_decorators/mutable_flow.py +512 -0
- metaflow/user_decorators/mutable_step.py +424 -0
- metaflow/user_decorators/user_flow_decorator.py +264 -0
- metaflow/user_decorators/user_step_decorator.py +749 -0
- metaflow/util.py +243 -27
- metaflow/vendor.py +23 -7
- metaflow/version.py +1 -1
- ob_metaflow-2.19.7.1rc0.data/data/share/metaflow/devtools/Makefile +355 -0
- ob_metaflow-2.19.7.1rc0.data/data/share/metaflow/devtools/Tiltfile +726 -0
- ob_metaflow-2.19.7.1rc0.data/data/share/metaflow/devtools/pick_services.sh +105 -0
- ob_metaflow-2.19.7.1rc0.dist-info/METADATA +87 -0
- ob_metaflow-2.19.7.1rc0.dist-info/RECORD +445 -0
- {ob_metaflow-2.11.13.1.dist-info → ob_metaflow-2.19.7.1rc0.dist-info}/WHEEL +1 -1
- {ob_metaflow-2.11.13.1.dist-info → ob_metaflow-2.19.7.1rc0.dist-info}/entry_points.txt +1 -0
- metaflow/_vendor/v3_5/__init__.py +0 -1
- metaflow/_vendor/v3_5/importlib_metadata/__init__.py +0 -644
- metaflow/_vendor/v3_5/importlib_metadata/_compat.py +0 -152
- metaflow/package.py +0 -188
- ob_metaflow-2.11.13.1.dist-info/METADATA +0 -85
- ob_metaflow-2.11.13.1.dist-info/RECORD +0 -308
- /metaflow/_vendor/{v3_5/zipp.py → zipp.py} +0 -0
- /metaflow/{metadata → metadata_provider}/__init__.py +0 -0
- /metaflow/{metadata → metadata_provider}/util.py +0 -0
- /metaflow/plugins/{metadata → metadata_providers}/__init__.py +0 -0
- {ob_metaflow-2.11.13.1.dist-info → ob_metaflow-2.19.7.1rc0.dist-info/licenses}/LICENSE +0 -0
- {ob_metaflow-2.11.13.1.dist-info → ob_metaflow-2.19.7.1rc0.dist-info}/top_level.txt +0 -0
|
@@ -1,12 +1,8 @@
|
|
|
1
1
|
import json
|
|
2
2
|
import math
|
|
3
3
|
import os
|
|
4
|
-
import re
|
|
5
4
|
import shlex
|
|
6
5
|
import time
|
|
7
|
-
import copy
|
|
8
|
-
from typing import Dict, List, Optional
|
|
9
|
-
import uuid
|
|
10
6
|
from uuid import uuid4
|
|
11
7
|
|
|
12
8
|
from metaflow import current, util
|
|
@@ -15,10 +11,13 @@ from metaflow.metaflow_config import (
|
|
|
15
11
|
ARGO_EVENTS_EVENT,
|
|
16
12
|
ARGO_EVENTS_EVENT_BUS,
|
|
17
13
|
ARGO_EVENTS_EVENT_SOURCE,
|
|
18
|
-
ARGO_EVENTS_SERVICE_ACCOUNT,
|
|
19
14
|
ARGO_EVENTS_INTERNAL_WEBHOOK_URL,
|
|
20
|
-
|
|
15
|
+
ARGO_EVENTS_SERVICE_ACCOUNT,
|
|
21
16
|
ARGO_EVENTS_WEBHOOK_AUTH,
|
|
17
|
+
ARGO_WORKFLOWS_KUBERNETES_SECRETS,
|
|
18
|
+
ARGO_WORKFLOWS_ENV_VARS_TO_SKIP,
|
|
19
|
+
AWS_SECRETS_MANAGER_DEFAULT_REGION,
|
|
20
|
+
AZURE_KEY_VAULT_PREFIX,
|
|
22
21
|
AZURE_STORAGE_BLOB_SERVICE_ENDPOINT,
|
|
23
22
|
CARD_AZUREROOT,
|
|
24
23
|
CARD_GSROOT,
|
|
@@ -33,16 +32,16 @@ from metaflow.metaflow_config import (
|
|
|
33
32
|
DEFAULT_SECRETS_BACKEND_TYPE,
|
|
34
33
|
GCP_SECRET_MANAGER_PREFIX,
|
|
35
34
|
KUBERNETES_FETCH_EC2_METADATA,
|
|
36
|
-
KUBERNETES_LABELS,
|
|
37
35
|
KUBERNETES_SANDBOX_INIT_SCRIPT,
|
|
36
|
+
OTEL_ENDPOINT,
|
|
38
37
|
S3_ENDPOINT_URL,
|
|
38
|
+
S3_SERVER_SIDE_ENCRYPTION,
|
|
39
39
|
SERVICE_HEADERS,
|
|
40
|
+
KUBERNETES_SECRETS,
|
|
40
41
|
SERVICE_INTERNAL_URL,
|
|
41
|
-
S3_SERVER_SIDE_ENCRYPTION,
|
|
42
|
-
OTEL_ENDPOINT,
|
|
43
42
|
)
|
|
44
|
-
from metaflow.
|
|
45
|
-
|
|
43
|
+
from metaflow.unbounded_foreach import UBF_CONTROL, UBF_TASK
|
|
44
|
+
from metaflow.metaflow_config_funcs import config_values, init_config
|
|
46
45
|
from metaflow.mflog import (
|
|
47
46
|
BASH_SAVE_LOGS,
|
|
48
47
|
bash_capture_logs,
|
|
@@ -60,6 +59,10 @@ STDERR_FILE = "mflog_stderr"
|
|
|
60
59
|
STDOUT_PATH = os.path.join(LOGS_DIR, STDOUT_FILE)
|
|
61
60
|
STDERR_PATH = os.path.join(LOGS_DIR, STDERR_FILE)
|
|
62
61
|
|
|
62
|
+
METAFLOW_PARALLEL_STEP_CLI_OPTIONS_TEMPLATE = (
|
|
63
|
+
"{METAFLOW_PARALLEL_STEP_CLI_OPTIONS_TEMPLATE}"
|
|
64
|
+
)
|
|
65
|
+
|
|
63
66
|
|
|
64
67
|
class KubernetesException(MetaflowException):
|
|
65
68
|
headline = "Kubernetes error"
|
|
@@ -87,6 +90,7 @@ class Kubernetes(object):
|
|
|
87
90
|
step_name,
|
|
88
91
|
task_id,
|
|
89
92
|
attempt,
|
|
93
|
+
code_package_metadata,
|
|
90
94
|
code_package_url,
|
|
91
95
|
step_cmds,
|
|
92
96
|
):
|
|
@@ -101,7 +105,7 @@ class Kubernetes(object):
|
|
|
101
105
|
stderr_path=STDERR_PATH,
|
|
102
106
|
)
|
|
103
107
|
init_cmds = self._environment.get_package_commands(
|
|
104
|
-
code_package_url, self._datastore.TYPE
|
|
108
|
+
code_package_url, self._datastore.TYPE, code_package_metadata
|
|
105
109
|
)
|
|
106
110
|
init_expr = " && ".join(init_cmds)
|
|
107
111
|
step_expr = bash_capture_logs(
|
|
@@ -143,9 +147,335 @@ class Kubernetes(object):
|
|
|
143
147
|
return shlex.split('bash -c "%s"' % cmd_str)
|
|
144
148
|
|
|
145
149
|
def launch_job(self, **kwargs):
|
|
146
|
-
|
|
150
|
+
if (
|
|
151
|
+
"num_parallel" in kwargs
|
|
152
|
+
and kwargs["num_parallel"]
|
|
153
|
+
and int(kwargs["num_parallel"]) > 0
|
|
154
|
+
):
|
|
155
|
+
self._job = self.create_jobset(**kwargs).execute()
|
|
156
|
+
else:
|
|
157
|
+
kwargs.pop("num_parallel", None)
|
|
158
|
+
kwargs["name_pattern"] = "t-{uid}-".format(uid=str(uuid4())[:8])
|
|
159
|
+
self._job = self.create_job_object(**kwargs).create().execute()
|
|
160
|
+
|
|
161
|
+
def create_jobset(
|
|
162
|
+
self,
|
|
163
|
+
flow_name,
|
|
164
|
+
run_id,
|
|
165
|
+
step_name,
|
|
166
|
+
task_id,
|
|
167
|
+
attempt,
|
|
168
|
+
user,
|
|
169
|
+
code_package_metadata,
|
|
170
|
+
code_package_sha,
|
|
171
|
+
code_package_url,
|
|
172
|
+
code_package_ds,
|
|
173
|
+
docker_image,
|
|
174
|
+
docker_image_pull_policy,
|
|
175
|
+
image_pull_secrets=None,
|
|
176
|
+
step_cli=None,
|
|
177
|
+
service_account=None,
|
|
178
|
+
secrets=None,
|
|
179
|
+
node_selector=None,
|
|
180
|
+
namespace=None,
|
|
181
|
+
cpu=None,
|
|
182
|
+
gpu=None,
|
|
183
|
+
gpu_vendor=None,
|
|
184
|
+
disk=None,
|
|
185
|
+
memory=None,
|
|
186
|
+
use_tmpfs=None,
|
|
187
|
+
tmpfs_tempdir=None,
|
|
188
|
+
tmpfs_size=None,
|
|
189
|
+
tmpfs_path=None,
|
|
190
|
+
run_time_limit=None,
|
|
191
|
+
env=None,
|
|
192
|
+
persistent_volume_claims=None,
|
|
193
|
+
tolerations=None,
|
|
194
|
+
labels=None,
|
|
195
|
+
annotations=None,
|
|
196
|
+
shared_memory=None,
|
|
197
|
+
port=None,
|
|
198
|
+
num_parallel=None,
|
|
199
|
+
qos=None,
|
|
200
|
+
security_context=None,
|
|
201
|
+
):
|
|
202
|
+
name = "js-%s" % str(uuid4())[:6]
|
|
203
|
+
jobset = (
|
|
204
|
+
KubernetesClient()
|
|
205
|
+
.jobset(
|
|
206
|
+
name=name,
|
|
207
|
+
namespace=namespace,
|
|
208
|
+
service_account=service_account,
|
|
209
|
+
node_selector=node_selector,
|
|
210
|
+
image=docker_image,
|
|
211
|
+
image_pull_policy=docker_image_pull_policy,
|
|
212
|
+
image_pull_secrets=image_pull_secrets,
|
|
213
|
+
cpu=cpu,
|
|
214
|
+
memory=memory,
|
|
215
|
+
disk=disk,
|
|
216
|
+
gpu=gpu,
|
|
217
|
+
gpu_vendor=gpu_vendor,
|
|
218
|
+
timeout_in_seconds=run_time_limit,
|
|
219
|
+
# Retries are handled by Metaflow runtime
|
|
220
|
+
retries=0,
|
|
221
|
+
step_name=step_name,
|
|
222
|
+
# We set the jobset name as the subdomain.
|
|
223
|
+
# todo: [final-refactor] ask @shri what was the motive when we did initial implementation
|
|
224
|
+
subdomain=name,
|
|
225
|
+
tolerations=tolerations,
|
|
226
|
+
use_tmpfs=use_tmpfs,
|
|
227
|
+
tmpfs_tempdir=tmpfs_tempdir,
|
|
228
|
+
tmpfs_size=tmpfs_size,
|
|
229
|
+
tmpfs_path=tmpfs_path,
|
|
230
|
+
persistent_volume_claims=persistent_volume_claims,
|
|
231
|
+
shared_memory=shared_memory,
|
|
232
|
+
port=port,
|
|
233
|
+
num_parallel=num_parallel,
|
|
234
|
+
qos=qos,
|
|
235
|
+
security_context=security_context,
|
|
236
|
+
)
|
|
237
|
+
.environment_variable("METAFLOW_CODE_METADATA", code_package_metadata)
|
|
238
|
+
.environment_variable("METAFLOW_CODE_SHA", code_package_sha)
|
|
239
|
+
.environment_variable("METAFLOW_CODE_URL", code_package_url)
|
|
240
|
+
.environment_variable("METAFLOW_CODE_DS", code_package_ds)
|
|
241
|
+
.environment_variable("METAFLOW_USER", user)
|
|
242
|
+
.environment_variable("METAFLOW_SERVICE_URL", SERVICE_INTERNAL_URL)
|
|
243
|
+
.environment_variable(
|
|
244
|
+
"METAFLOW_SERVICE_HEADERS",
|
|
245
|
+
json.dumps(SERVICE_HEADERS),
|
|
246
|
+
)
|
|
247
|
+
.environment_variable("METAFLOW_DATASTORE_SYSROOT_S3", DATASTORE_SYSROOT_S3)
|
|
248
|
+
.environment_variable("METAFLOW_DATATOOLS_S3ROOT", DATATOOLS_S3ROOT)
|
|
249
|
+
.environment_variable("METAFLOW_DEFAULT_DATASTORE", self._datastore.TYPE)
|
|
250
|
+
.environment_variable("METAFLOW_DEFAULT_METADATA", DEFAULT_METADATA)
|
|
251
|
+
.environment_variable("METAFLOW_KUBERNETES_WORKLOAD", 1)
|
|
252
|
+
.environment_variable(
|
|
253
|
+
"METAFLOW_KUBERNETES_FETCH_EC2_METADATA", KUBERNETES_FETCH_EC2_METADATA
|
|
254
|
+
)
|
|
255
|
+
.environment_variable("METAFLOW_RUNTIME_ENVIRONMENT", "kubernetes")
|
|
256
|
+
.environment_variable(
|
|
257
|
+
"METAFLOW_DEFAULT_SECRETS_BACKEND_TYPE", DEFAULT_SECRETS_BACKEND_TYPE
|
|
258
|
+
)
|
|
259
|
+
.environment_variable("METAFLOW_CARD_S3ROOT", CARD_S3ROOT)
|
|
260
|
+
.environment_variable(
|
|
261
|
+
"METAFLOW_DEFAULT_AWS_CLIENT_PROVIDER", DEFAULT_AWS_CLIENT_PROVIDER
|
|
262
|
+
)
|
|
263
|
+
.environment_variable(
|
|
264
|
+
"METAFLOW_DEFAULT_GCP_CLIENT_PROVIDER", DEFAULT_GCP_CLIENT_PROVIDER
|
|
265
|
+
)
|
|
266
|
+
.environment_variable(
|
|
267
|
+
"METAFLOW_AWS_SECRETS_MANAGER_DEFAULT_REGION",
|
|
268
|
+
AWS_SECRETS_MANAGER_DEFAULT_REGION,
|
|
269
|
+
)
|
|
270
|
+
.environment_variable(
|
|
271
|
+
"METAFLOW_GCP_SECRET_MANAGER_PREFIX", GCP_SECRET_MANAGER_PREFIX
|
|
272
|
+
)
|
|
273
|
+
.environment_variable(
|
|
274
|
+
"METAFLOW_AZURE_KEY_VAULT_PREFIX", AZURE_KEY_VAULT_PREFIX
|
|
275
|
+
)
|
|
276
|
+
.environment_variable("METAFLOW_S3_ENDPOINT_URL", S3_ENDPOINT_URL)
|
|
277
|
+
.environment_variable(
|
|
278
|
+
"METAFLOW_AZURE_STORAGE_BLOB_SERVICE_ENDPOINT",
|
|
279
|
+
AZURE_STORAGE_BLOB_SERVICE_ENDPOINT,
|
|
280
|
+
)
|
|
281
|
+
.environment_variable(
|
|
282
|
+
"METAFLOW_DATASTORE_SYSROOT_AZURE", DATASTORE_SYSROOT_AZURE
|
|
283
|
+
)
|
|
284
|
+
.environment_variable("METAFLOW_CARD_AZUREROOT", CARD_AZUREROOT)
|
|
285
|
+
.environment_variable("METAFLOW_DATASTORE_SYSROOT_GS", DATASTORE_SYSROOT_GS)
|
|
286
|
+
.environment_variable("METAFLOW_CARD_GSROOT", CARD_GSROOT)
|
|
287
|
+
# support Metaflow sandboxes
|
|
288
|
+
.environment_variable(
|
|
289
|
+
"METAFLOW_INIT_SCRIPT", KUBERNETES_SANDBOX_INIT_SCRIPT
|
|
290
|
+
)
|
|
291
|
+
.environment_variable(
|
|
292
|
+
"METAFLOW_KUBERNETES_SANDBOX_INIT_SCRIPT",
|
|
293
|
+
KUBERNETES_SANDBOX_INIT_SCRIPT,
|
|
294
|
+
)
|
|
295
|
+
.environment_variable(
|
|
296
|
+
"METAFLOW_ARGO_WORKFLOWS_KUBERNETES_SECRETS",
|
|
297
|
+
ARGO_WORKFLOWS_KUBERNETES_SECRETS,
|
|
298
|
+
)
|
|
299
|
+
.environment_variable(
|
|
300
|
+
"METAFLOW_ARGO_WORKFLOWS_ENV_VARS_TO_SKIP",
|
|
301
|
+
ARGO_WORKFLOWS_ENV_VARS_TO_SKIP,
|
|
302
|
+
)
|
|
303
|
+
.environment_variable("METAFLOW_OTEL_ENDPOINT", OTEL_ENDPOINT)
|
|
304
|
+
# Skip setting METAFLOW_DATASTORE_SYSROOT_LOCAL because metadata sync
|
|
305
|
+
# between the local user instance and the remote Kubernetes pod
|
|
306
|
+
# assumes metadata is stored in DATASTORE_LOCAL_DIR on the Kubernetes
|
|
307
|
+
# pod; this happens when METAFLOW_DATASTORE_SYSROOT_LOCAL is NOT set (
|
|
308
|
+
# see get_datastore_root_from_config in datastore/local.py).
|
|
309
|
+
)
|
|
310
|
+
|
|
311
|
+
for k in list(
|
|
312
|
+
[] if not secrets else [secrets] if isinstance(secrets, str) else secrets
|
|
313
|
+
) + KUBERNETES_SECRETS.split(","):
|
|
314
|
+
jobset.secret(k)
|
|
315
|
+
|
|
316
|
+
initial_configs = init_config()
|
|
317
|
+
for entry in ["OBP_PERIMETER", "OBP_INTEGRATIONS_URL"]:
|
|
318
|
+
if entry not in initial_configs:
|
|
319
|
+
raise KubernetesException(
|
|
320
|
+
f"{entry} was not found in metaflow config. Please make sure to run `outerbounds configure <...>` command which can be found on the Ourebounds UI or reach out to your Outerbounds support team."
|
|
321
|
+
)
|
|
322
|
+
|
|
323
|
+
additional_obp_configs = {
|
|
324
|
+
"OBP_PERIMETER": initial_configs["OBP_PERIMETER"],
|
|
325
|
+
"OBP_INTEGRATIONS_URL": initial_configs[
|
|
326
|
+
"OBP_INTEGRATIONS_URL"
|
|
327
|
+
],
|
|
328
|
+
}
|
|
329
|
+
for k, v in additional_obp_configs.items():
|
|
330
|
+
jobset.environment_variable(k, v)
|
|
331
|
+
|
|
332
|
+
jobset.environment_variables_from_selectors(
|
|
333
|
+
{
|
|
334
|
+
"METAFLOW_KUBERNETES_NAMESPACE": "metadata.namespace",
|
|
335
|
+
"METAFLOW_KUBERNETES_POD_NAMESPACE": "metadata.namespace",
|
|
336
|
+
"METAFLOW_KUBERNETES_POD_NAME": "metadata.name",
|
|
337
|
+
"METAFLOW_KUBERNETES_POD_ID": "metadata.uid",
|
|
338
|
+
"METAFLOW_KUBERNETES_SERVICE_ACCOUNT_NAME": "spec.serviceAccountName",
|
|
339
|
+
"METAFLOW_KUBERNETES_NODE_IP": "status.hostIP",
|
|
340
|
+
}
|
|
341
|
+
)
|
|
342
|
+
|
|
343
|
+
# Temporary passing of *some* environment variables. Do not rely on this
|
|
344
|
+
# mechanism as it will be removed in the near future
|
|
345
|
+
for k, v in config_values():
|
|
346
|
+
if k.startswith("METAFLOW_CONDA_") or k.startswith("METAFLOW_DEBUG_"):
|
|
347
|
+
jobset.environment_variable(k, v)
|
|
348
|
+
|
|
349
|
+
if S3_SERVER_SIDE_ENCRYPTION is not None:
|
|
350
|
+
jobset.environment_variable(
|
|
351
|
+
"METAFLOW_S3_SERVER_SIDE_ENCRYPTION", S3_SERVER_SIDE_ENCRYPTION
|
|
352
|
+
)
|
|
353
|
+
|
|
354
|
+
# Set environment variables to support metaflow.integrations.ArgoEvent
|
|
355
|
+
jobset.environment_variable(
|
|
356
|
+
"METAFLOW_ARGO_EVENTS_WEBHOOK_URL", ARGO_EVENTS_INTERNAL_WEBHOOK_URL
|
|
357
|
+
)
|
|
358
|
+
jobset.environment_variable("METAFLOW_ARGO_EVENTS_EVENT", ARGO_EVENTS_EVENT)
|
|
359
|
+
jobset.environment_variable(
|
|
360
|
+
"METAFLOW_ARGO_EVENTS_EVENT_BUS", ARGO_EVENTS_EVENT_BUS
|
|
361
|
+
)
|
|
362
|
+
jobset.environment_variable(
|
|
363
|
+
"METAFLOW_ARGO_EVENTS_EVENT_SOURCE", ARGO_EVENTS_EVENT_SOURCE
|
|
364
|
+
)
|
|
365
|
+
jobset.environment_variable(
|
|
366
|
+
"METAFLOW_ARGO_EVENTS_SERVICE_ACCOUNT", ARGO_EVENTS_SERVICE_ACCOUNT
|
|
367
|
+
)
|
|
368
|
+
jobset.environment_variable(
|
|
369
|
+
"METAFLOW_ARGO_EVENTS_WEBHOOK_AUTH",
|
|
370
|
+
ARGO_EVENTS_WEBHOOK_AUTH,
|
|
371
|
+
)
|
|
372
|
+
|
|
373
|
+
## -----Jobset specific env vars START here-----
|
|
374
|
+
jobset.environment_variable("MF_MASTER_ADDR", jobset.jobset_control_addr)
|
|
375
|
+
jobset.environment_variable("MF_MASTER_PORT", str(port))
|
|
376
|
+
jobset.environment_variable("MF_WORLD_SIZE", str(num_parallel))
|
|
377
|
+
jobset.environment_variable_from_selector(
|
|
378
|
+
"JOBSET_RESTART_ATTEMPT",
|
|
379
|
+
"metadata.annotations['jobset.sigs.k8s.io/restart-attempt']",
|
|
380
|
+
)
|
|
381
|
+
jobset.environment_variable_from_selector(
|
|
382
|
+
"METAFLOW_KUBERNETES_JOBSET_NAME",
|
|
383
|
+
"metadata.annotations['jobset.sigs.k8s.io/jobset-name']",
|
|
384
|
+
)
|
|
385
|
+
jobset.environment_variable_from_selector(
|
|
386
|
+
"MF_WORKER_REPLICA_INDEX",
|
|
387
|
+
"metadata.annotations['jobset.sigs.k8s.io/job-index']",
|
|
388
|
+
)
|
|
389
|
+
## -----Jobset specific env vars END here-----
|
|
390
|
+
|
|
391
|
+
tmpfs_enabled = use_tmpfs or (tmpfs_size and not use_tmpfs)
|
|
392
|
+
if tmpfs_enabled and tmpfs_tempdir:
|
|
393
|
+
jobset.environment_variable("METAFLOW_TEMPDIR", tmpfs_path)
|
|
394
|
+
|
|
395
|
+
for name, value in env.items():
|
|
396
|
+
jobset.environment_variable(name, value)
|
|
397
|
+
|
|
398
|
+
system_annotations = {
|
|
399
|
+
"metaflow/user": user,
|
|
400
|
+
"metaflow/flow_name": flow_name,
|
|
401
|
+
"metaflow/control-task-id": task_id,
|
|
402
|
+
"metaflow/run_id": run_id,
|
|
403
|
+
"metaflow/step_name": step_name,
|
|
404
|
+
"metaflow/attempt": attempt,
|
|
405
|
+
}
|
|
406
|
+
if current.get("project_name"):
|
|
407
|
+
system_annotations.update(
|
|
408
|
+
{
|
|
409
|
+
"metaflow/project_name": current.project_name,
|
|
410
|
+
"metaflow/branch_name": current.branch_name,
|
|
411
|
+
"metaflow/project_flow_name": current.project_flow_name,
|
|
412
|
+
}
|
|
413
|
+
)
|
|
414
|
+
|
|
415
|
+
system_labels = {
|
|
416
|
+
"app.kubernetes.io/name": "metaflow-task",
|
|
417
|
+
"app.kubernetes.io/part-of": "metaflow",
|
|
418
|
+
}
|
|
419
|
+
|
|
420
|
+
jobset.labels({**({} if not labels else labels), **system_labels})
|
|
421
|
+
|
|
422
|
+
jobset.annotations(
|
|
423
|
+
{**({} if not annotations else annotations), **system_annotations}
|
|
424
|
+
)
|
|
425
|
+
# We need this task-id set so that all the nodes are aware of the control
|
|
426
|
+
# task's task-id. These "MF_" variables populate the `current.parallel` namedtuple
|
|
427
|
+
jobset.environment_variable("MF_PARALLEL_CONTROL_TASK_ID", str(task_id))
|
|
428
|
+
|
|
429
|
+
## ----------- control/worker specific values START here -----------
|
|
430
|
+
# We will now set the appropriate command for the control/worker job
|
|
431
|
+
_get_command = lambda index, _tskid: self._command(
|
|
432
|
+
flow_name=flow_name,
|
|
433
|
+
run_id=run_id,
|
|
434
|
+
step_name=step_name,
|
|
435
|
+
task_id=_tskid,
|
|
436
|
+
attempt=attempt,
|
|
437
|
+
code_package_metadata=code_package_metadata,
|
|
438
|
+
code_package_url=code_package_url,
|
|
439
|
+
step_cmds=[
|
|
440
|
+
step_cli.replace(
|
|
441
|
+
METAFLOW_PARALLEL_STEP_CLI_OPTIONS_TEMPLATE,
|
|
442
|
+
"--ubf-context $UBF_CONTEXT --split-index %s --task-id %s"
|
|
443
|
+
% (index, _tskid),
|
|
444
|
+
)
|
|
445
|
+
],
|
|
446
|
+
)
|
|
447
|
+
jobset.control.replicas(1)
|
|
448
|
+
jobset.worker.replicas(num_parallel - 1)
|
|
449
|
+
|
|
450
|
+
# We set the appropriate command for the control/worker job
|
|
451
|
+
# and also set the task-id/split-index for the control/worker job
|
|
452
|
+
# appropriately.
|
|
453
|
+
jobset.control.command(_get_command("0", str(task_id)))
|
|
454
|
+
jobset.worker.command(
|
|
455
|
+
_get_command(
|
|
456
|
+
"`expr $[MF_WORKER_REPLICA_INDEX] + 1`",
|
|
457
|
+
"-".join(
|
|
458
|
+
[
|
|
459
|
+
str(task_id),
|
|
460
|
+
"worker",
|
|
461
|
+
"$MF_WORKER_REPLICA_INDEX",
|
|
462
|
+
]
|
|
463
|
+
),
|
|
464
|
+
)
|
|
465
|
+
)
|
|
147
466
|
|
|
148
|
-
|
|
467
|
+
jobset.control.environment_variable("UBF_CONTEXT", UBF_CONTROL)
|
|
468
|
+
jobset.worker.environment_variable("UBF_CONTEXT", UBF_TASK)
|
|
469
|
+
# Every control job requires an environment variable of MF_CONTROL_INDEX
|
|
470
|
+
# set to 0 so that we can derive the MF_PARALLEL_NODE_INDEX correctly.
|
|
471
|
+
# Since only the control job has MF_CONTROL_INDEX set to 0, all worker nodes
|
|
472
|
+
# will use MF_WORKER_REPLICA_INDEX
|
|
473
|
+
jobset.control.environment_variable("MF_CONTROL_INDEX", "0")
|
|
474
|
+
## ----------- control/worker specific values END here -----------
|
|
475
|
+
|
|
476
|
+
return jobset
|
|
477
|
+
|
|
478
|
+
def create_job_object(
|
|
149
479
|
self,
|
|
150
480
|
flow_name,
|
|
151
481
|
run_id,
|
|
@@ -153,12 +483,14 @@ class Kubernetes(object):
|
|
|
153
483
|
task_id,
|
|
154
484
|
attempt,
|
|
155
485
|
user,
|
|
486
|
+
code_package_metadata,
|
|
156
487
|
code_package_sha,
|
|
157
488
|
code_package_url,
|
|
158
489
|
code_package_ds,
|
|
159
490
|
step_cli,
|
|
160
491
|
docker_image,
|
|
161
492
|
docker_image_pull_policy,
|
|
493
|
+
image_pull_secrets=None,
|
|
162
494
|
service_account=None,
|
|
163
495
|
secrets=None,
|
|
164
496
|
node_selector=None,
|
|
@@ -177,19 +509,19 @@ class Kubernetes(object):
|
|
|
177
509
|
persistent_volume_claims=None,
|
|
178
510
|
tolerations=None,
|
|
179
511
|
labels=None,
|
|
180
|
-
annotations=None,
|
|
181
|
-
num_parallel=0,
|
|
182
|
-
attrs={},
|
|
183
512
|
shared_memory=None,
|
|
184
513
|
port=None,
|
|
514
|
+
name_pattern=None,
|
|
515
|
+
qos=None,
|
|
516
|
+
annotations=None,
|
|
517
|
+
security_context=None,
|
|
185
518
|
):
|
|
186
519
|
if env is None:
|
|
187
520
|
env = {}
|
|
188
|
-
|
|
189
521
|
job = (
|
|
190
522
|
KubernetesClient()
|
|
191
523
|
.job(
|
|
192
|
-
generate_name=
|
|
524
|
+
generate_name=name_pattern,
|
|
193
525
|
namespace=namespace,
|
|
194
526
|
service_account=service_account,
|
|
195
527
|
secrets=secrets,
|
|
@@ -200,11 +532,13 @@ class Kubernetes(object):
|
|
|
200
532
|
step_name=step_name,
|
|
201
533
|
task_id=task_id,
|
|
202
534
|
attempt=attempt,
|
|
535
|
+
code_package_metadata=code_package_metadata,
|
|
203
536
|
code_package_url=code_package_url,
|
|
204
537
|
step_cmds=[step_cli],
|
|
205
538
|
),
|
|
206
539
|
image=docker_image,
|
|
207
540
|
image_pull_policy=docker_image_pull_policy,
|
|
541
|
+
image_pull_secrets=image_pull_secrets,
|
|
208
542
|
cpu=cpu,
|
|
209
543
|
memory=memory,
|
|
210
544
|
disk=disk,
|
|
@@ -215,17 +549,19 @@ class Kubernetes(object):
|
|
|
215
549
|
retries=0,
|
|
216
550
|
step_name=step_name,
|
|
217
551
|
tolerations=tolerations,
|
|
218
|
-
labels=
|
|
552
|
+
labels=labels,
|
|
553
|
+
annotations=annotations,
|
|
219
554
|
use_tmpfs=use_tmpfs,
|
|
220
555
|
tmpfs_tempdir=tmpfs_tempdir,
|
|
221
556
|
tmpfs_size=tmpfs_size,
|
|
222
557
|
tmpfs_path=tmpfs_path,
|
|
223
558
|
persistent_volume_claims=persistent_volume_claims,
|
|
224
|
-
num_parallel=num_parallel,
|
|
225
|
-
attrs=attrs,
|
|
226
559
|
shared_memory=shared_memory,
|
|
227
560
|
port=port,
|
|
561
|
+
qos=qos,
|
|
562
|
+
security_context=security_context,
|
|
228
563
|
)
|
|
564
|
+
.environment_variable("METAFLOW_CODE_METADATA", code_package_metadata)
|
|
229
565
|
.environment_variable("METAFLOW_CODE_SHA", code_package_sha)
|
|
230
566
|
.environment_variable("METAFLOW_CODE_URL", code_package_url)
|
|
231
567
|
.environment_variable("METAFLOW_CODE_DS", code_package_ds)
|
|
@@ -261,6 +597,9 @@ class Kubernetes(object):
|
|
|
261
597
|
.environment_variable(
|
|
262
598
|
"METAFLOW_GCP_SECRET_MANAGER_PREFIX", GCP_SECRET_MANAGER_PREFIX
|
|
263
599
|
)
|
|
600
|
+
.environment_variable(
|
|
601
|
+
"METAFLOW_AZURE_KEY_VAULT_PREFIX", AZURE_KEY_VAULT_PREFIX
|
|
602
|
+
)
|
|
264
603
|
.environment_variable("METAFLOW_S3_ENDPOINT_URL", S3_ENDPOINT_URL)
|
|
265
604
|
.environment_variable(
|
|
266
605
|
"METAFLOW_AZURE_STORAGE_BLOB_SERVICE_ENDPOINT",
|
|
@@ -276,6 +615,18 @@ class Kubernetes(object):
|
|
|
276
615
|
.environment_variable(
|
|
277
616
|
"METAFLOW_INIT_SCRIPT", KUBERNETES_SANDBOX_INIT_SCRIPT
|
|
278
617
|
)
|
|
618
|
+
.environment_variable(
|
|
619
|
+
"METAFLOW_KUBERNETES_SANDBOX_INIT_SCRIPT",
|
|
620
|
+
KUBERNETES_SANDBOX_INIT_SCRIPT,
|
|
621
|
+
)
|
|
622
|
+
.environment_variable(
|
|
623
|
+
"METAFLOW_ARGO_WORKFLOWS_KUBERNETES_SECRETS",
|
|
624
|
+
ARGO_WORKFLOWS_KUBERNETES_SECRETS,
|
|
625
|
+
)
|
|
626
|
+
.environment_variable(
|
|
627
|
+
"METAFLOW_ARGO_WORKFLOWS_ENV_VARS_TO_SKIP",
|
|
628
|
+
ARGO_WORKFLOWS_ENV_VARS_TO_SKIP,
|
|
629
|
+
)
|
|
279
630
|
.environment_variable("METAFLOW_OTEL_ENDPOINT", OTEL_ENDPOINT)
|
|
280
631
|
# Skip setting METAFLOW_DATASTORE_SYSROOT_LOCAL because metadata sync
|
|
281
632
|
# between the local user instance and the remote Kubernetes pod
|
|
@@ -284,7 +635,6 @@ class Kubernetes(object):
|
|
|
284
635
|
# see get_datastore_root_from_config in datastore/local.py).
|
|
285
636
|
)
|
|
286
637
|
|
|
287
|
-
self.num_parallel = num_parallel
|
|
288
638
|
# Temporary passing of *some* environment variables. Do not rely on this
|
|
289
639
|
# mechanism as it will be removed in the near future
|
|
290
640
|
for k, v in config_values():
|
|
@@ -321,13 +671,25 @@ class Kubernetes(object):
|
|
|
321
671
|
|
|
322
672
|
for name, value in env.items():
|
|
323
673
|
job.environment_variable(name, value)
|
|
674
|
+
# Add job specific labels
|
|
675
|
+
system_labels = {
|
|
676
|
+
"app.kubernetes.io/name": "metaflow-task",
|
|
677
|
+
"app.kubernetes.io/part-of": "metaflow",
|
|
678
|
+
}
|
|
679
|
+
for name, value in system_labels.items():
|
|
680
|
+
job.label(name, value)
|
|
324
681
|
|
|
325
|
-
annotations
|
|
326
|
-
|
|
682
|
+
# Add job specific annotations not set in the decorator.
|
|
683
|
+
system_annotations = {
|
|
327
684
|
"metaflow/flow_name": flow_name,
|
|
685
|
+
"metaflow/run_id": run_id,
|
|
686
|
+
"metaflow/step_name": step_name,
|
|
687
|
+
"metaflow/task_id": task_id,
|
|
688
|
+
"metaflow/attempt": attempt,
|
|
689
|
+
"metaflow/user": user,
|
|
328
690
|
}
|
|
329
691
|
if current.get("project_name"):
|
|
330
|
-
|
|
692
|
+
system_annotations.update(
|
|
331
693
|
{
|
|
332
694
|
"metaflow/project_name": current.project_name,
|
|
333
695
|
"metaflow/branch_name": current.branch_name,
|
|
@@ -335,18 +697,12 @@ class Kubernetes(object):
|
|
|
335
697
|
}
|
|
336
698
|
)
|
|
337
699
|
|
|
338
|
-
for name, value in
|
|
700
|
+
for name, value in system_annotations.items():
|
|
339
701
|
job.annotation(name, value)
|
|
340
702
|
|
|
341
|
-
|
|
342
|
-
job.annotation("metaflow/run_id", run_id)
|
|
343
|
-
.annotation("metaflow/step_name", step_name)
|
|
344
|
-
.annotation("metaflow/task_id", task_id)
|
|
345
|
-
.annotation("metaflow/attempt", attempt)
|
|
346
|
-
.label("app.kubernetes.io/name", "metaflow-task")
|
|
347
|
-
.label("app.kubernetes.io/part-of", "metaflow")
|
|
348
|
-
)
|
|
703
|
+
return job
|
|
349
704
|
|
|
705
|
+
def create_k8sjob(self, job):
|
|
350
706
|
return job.create()
|
|
351
707
|
|
|
352
708
|
def wait(self, stdout_location, stderr_location, echo=None):
|
|
@@ -360,7 +716,7 @@ class Kubernetes(object):
|
|
|
360
716
|
sigmoid = 1.0 / (1.0 + math.exp(-0.01 * secs_since_start + 9.0))
|
|
361
717
|
return 0.5 + sigmoid * 30.0
|
|
362
718
|
|
|
363
|
-
def wait_for_launch(job
|
|
719
|
+
def wait_for_launch(job):
|
|
364
720
|
status = job.status
|
|
365
721
|
echo(
|
|
366
722
|
"Task is starting (%s)..." % status,
|
|
@@ -370,60 +726,43 @@ class Kubernetes(object):
|
|
|
370
726
|
t = time.time()
|
|
371
727
|
start_time = time.time()
|
|
372
728
|
while job.is_waiting:
|
|
373
|
-
|
|
374
|
-
if status !=
|
|
375
|
-
|
|
376
|
-
child_statuses = ""
|
|
377
|
-
else:
|
|
378
|
-
status_keys = set(
|
|
379
|
-
[child_job.status for child_job in child_jobs]
|
|
380
|
-
)
|
|
381
|
-
status_counts = [
|
|
382
|
-
(
|
|
383
|
-
status,
|
|
384
|
-
len(
|
|
385
|
-
[
|
|
386
|
-
child_job.status == status
|
|
387
|
-
for child_job in child_jobs
|
|
388
|
-
]
|
|
389
|
-
),
|
|
390
|
-
)
|
|
391
|
-
for status in status_keys
|
|
392
|
-
]
|
|
393
|
-
child_statuses = " (parallel node status: [{}])".format(
|
|
394
|
-
", ".join(
|
|
395
|
-
[
|
|
396
|
-
"{}:{}".format(status, num)
|
|
397
|
-
for (status, num) in sorted(status_counts)
|
|
398
|
-
]
|
|
399
|
-
)
|
|
400
|
-
)
|
|
401
|
-
|
|
402
|
-
status = job.status
|
|
729
|
+
new_status = job.status
|
|
730
|
+
if status != new_status or (time.time() - t) > 30:
|
|
731
|
+
status = new_status
|
|
403
732
|
echo(
|
|
404
|
-
"Task is starting (
|
|
733
|
+
"Task is starting (%s)..." % status,
|
|
405
734
|
"stderr",
|
|
406
735
|
job_id=job.id,
|
|
407
736
|
)
|
|
408
737
|
t = time.time()
|
|
409
738
|
time.sleep(update_delay(time.time() - start_time))
|
|
410
739
|
|
|
411
|
-
prefix = b"[%s] " % util.to_bytes(self._job.id)
|
|
740
|
+
prefix = lambda: b"[%s] " % util.to_bytes(self._job.id)
|
|
412
741
|
|
|
413
742
|
stdout_tail = get_log_tailer(stdout_location, self._datastore.TYPE)
|
|
414
743
|
stderr_tail = get_log_tailer(stderr_location, self._datastore.TYPE)
|
|
415
744
|
|
|
416
|
-
child_jobs = []
|
|
417
745
|
# 1) Loop until the job has started
|
|
418
|
-
wait_for_launch(self._job
|
|
746
|
+
wait_for_launch(self._job)
|
|
419
747
|
|
|
420
748
|
# 2) Tail logs until the job has finished
|
|
749
|
+
self._output_final_logs = False
|
|
750
|
+
|
|
751
|
+
def _has_updates():
|
|
752
|
+
if self._job.is_running:
|
|
753
|
+
return True
|
|
754
|
+
# Make sure to output final tail for a job that has finished.
|
|
755
|
+
if not self._output_final_logs:
|
|
756
|
+
self._output_final_logs = True
|
|
757
|
+
return True
|
|
758
|
+
return False
|
|
759
|
+
|
|
421
760
|
tail_logs(
|
|
422
|
-
prefix=prefix,
|
|
761
|
+
prefix=prefix(),
|
|
423
762
|
stdout_tail=stdout_tail,
|
|
424
763
|
stderr_tail=stderr_tail,
|
|
425
764
|
echo=echo,
|
|
426
|
-
has_log_updates=
|
|
765
|
+
has_log_updates=_has_updates,
|
|
427
766
|
)
|
|
428
767
|
# 3) Fetch remaining logs
|
|
429
768
|
#
|
|
@@ -435,7 +774,6 @@ class Kubernetes(object):
|
|
|
435
774
|
# exists prior to calling S3Tail and notify the user about
|
|
436
775
|
# truncated logs if it doesn't.
|
|
437
776
|
# TODO : For hard crashes, we can fetch logs from the pod.
|
|
438
|
-
|
|
439
777
|
if self._job.has_failed:
|
|
440
778
|
exit_code, reason = self._job.reason
|
|
441
779
|
msg = next(
|
|
@@ -469,60 +807,3 @@ class Kubernetes(object):
|
|
|
469
807
|
"stderr",
|
|
470
808
|
job_id=self._job.id,
|
|
471
809
|
)
|
|
472
|
-
|
|
473
|
-
@staticmethod
|
|
474
|
-
def _get_labels(extra_labels=None):
|
|
475
|
-
if extra_labels is None:
|
|
476
|
-
extra_labels = {}
|
|
477
|
-
env_labels = KUBERNETES_LABELS.split(",") if KUBERNETES_LABELS else []
|
|
478
|
-
env_labels = parse_kube_keyvalue_list(env_labels, False)
|
|
479
|
-
labels = {**env_labels, **extra_labels}
|
|
480
|
-
validate_kube_labels(labels)
|
|
481
|
-
return labels
|
|
482
|
-
|
|
483
|
-
|
|
484
|
-
def validate_kube_labels(
|
|
485
|
-
labels: Optional[Dict[str, Optional[str]]],
|
|
486
|
-
) -> bool:
|
|
487
|
-
"""Validate label values.
|
|
488
|
-
|
|
489
|
-
This validates the kubernetes label values. It does not validate the keys.
|
|
490
|
-
Ideally, keys should be static and also the validation rules for keys are
|
|
491
|
-
more complex than those for values. For full validation rules, see:
|
|
492
|
-
|
|
493
|
-
https://kubernetes.io/docs/concepts/overview/working-with-objects/labels/#syntax-and-character-set
|
|
494
|
-
"""
|
|
495
|
-
|
|
496
|
-
def validate_label(s: Optional[str]):
|
|
497
|
-
regex_match = r"^(([A-Za-z0-9][-A-Za-z0-9_.]{0,61})?[A-Za-z0-9])?$"
|
|
498
|
-
if not s:
|
|
499
|
-
# allow empty label
|
|
500
|
-
return True
|
|
501
|
-
if not re.search(regex_match, s):
|
|
502
|
-
raise KubernetesException(
|
|
503
|
-
'Invalid value: "%s"\n'
|
|
504
|
-
"A valid label must be an empty string or one that\n"
|
|
505
|
-
" - Consist of alphanumeric, '-', '_' or '.' characters\n"
|
|
506
|
-
" - Begins and ends with an alphanumeric character\n"
|
|
507
|
-
" - Is at most 63 characters" % s
|
|
508
|
-
)
|
|
509
|
-
return True
|
|
510
|
-
|
|
511
|
-
return all([validate_label(v) for v in labels.values()]) if labels else True
|
|
512
|
-
|
|
513
|
-
|
|
514
|
-
def parse_kube_keyvalue_list(items: List[str], requires_both: bool = True):
|
|
515
|
-
try:
|
|
516
|
-
ret = {}
|
|
517
|
-
for item_str in items:
|
|
518
|
-
item = item_str.split("=", 1)
|
|
519
|
-
if requires_both:
|
|
520
|
-
item[1] # raise IndexError
|
|
521
|
-
if str(item[0]) in ret:
|
|
522
|
-
raise KubernetesException("Duplicate key found: %s" % str(item[0]))
|
|
523
|
-
ret[str(item[0])] = str(item[1]) if len(item) > 1 else None
|
|
524
|
-
return ret
|
|
525
|
-
except KubernetesException as e:
|
|
526
|
-
raise e
|
|
527
|
-
except (AttributeError, IndexError):
|
|
528
|
-
raise KubernetesException("Unable to parse kubernetes list: %s" % items)
|