ob-metaflow 2.11.13.1__py2.py3-none-any.whl → 2.19.7.1rc0__py2.py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- metaflow/R.py +10 -7
- metaflow/__init__.py +40 -25
- metaflow/_vendor/imghdr/__init__.py +186 -0
- metaflow/_vendor/importlib_metadata/__init__.py +1063 -0
- metaflow/_vendor/importlib_metadata/_adapters.py +68 -0
- metaflow/_vendor/importlib_metadata/_collections.py +30 -0
- metaflow/_vendor/importlib_metadata/_compat.py +71 -0
- metaflow/_vendor/importlib_metadata/_functools.py +104 -0
- metaflow/_vendor/importlib_metadata/_itertools.py +73 -0
- metaflow/_vendor/importlib_metadata/_meta.py +48 -0
- metaflow/_vendor/importlib_metadata/_text.py +99 -0
- metaflow/_vendor/importlib_metadata/py.typed +0 -0
- metaflow/_vendor/typeguard/__init__.py +48 -0
- metaflow/_vendor/typeguard/_checkers.py +1070 -0
- metaflow/_vendor/typeguard/_config.py +108 -0
- metaflow/_vendor/typeguard/_decorators.py +233 -0
- metaflow/_vendor/typeguard/_exceptions.py +42 -0
- metaflow/_vendor/typeguard/_functions.py +308 -0
- metaflow/_vendor/typeguard/_importhook.py +213 -0
- metaflow/_vendor/typeguard/_memo.py +48 -0
- metaflow/_vendor/typeguard/_pytest_plugin.py +127 -0
- metaflow/_vendor/typeguard/_suppression.py +86 -0
- metaflow/_vendor/typeguard/_transformer.py +1229 -0
- metaflow/_vendor/typeguard/_union_transformer.py +55 -0
- metaflow/_vendor/typeguard/_utils.py +173 -0
- metaflow/_vendor/typeguard/py.typed +0 -0
- metaflow/_vendor/typing_extensions.py +3641 -0
- metaflow/_vendor/v3_7/importlib_metadata/__init__.py +1063 -0
- metaflow/_vendor/v3_7/importlib_metadata/_adapters.py +68 -0
- metaflow/_vendor/v3_7/importlib_metadata/_collections.py +30 -0
- metaflow/_vendor/v3_7/importlib_metadata/_compat.py +71 -0
- metaflow/_vendor/v3_7/importlib_metadata/_functools.py +104 -0
- metaflow/_vendor/v3_7/importlib_metadata/_itertools.py +73 -0
- metaflow/_vendor/v3_7/importlib_metadata/_meta.py +48 -0
- metaflow/_vendor/v3_7/importlib_metadata/_text.py +99 -0
- metaflow/_vendor/v3_7/importlib_metadata/py.typed +0 -0
- metaflow/_vendor/v3_7/typeguard/__init__.py +48 -0
- metaflow/_vendor/v3_7/typeguard/_checkers.py +906 -0
- metaflow/_vendor/v3_7/typeguard/_config.py +108 -0
- metaflow/_vendor/v3_7/typeguard/_decorators.py +237 -0
- metaflow/_vendor/v3_7/typeguard/_exceptions.py +42 -0
- metaflow/_vendor/v3_7/typeguard/_functions.py +310 -0
- metaflow/_vendor/v3_7/typeguard/_importhook.py +213 -0
- metaflow/_vendor/v3_7/typeguard/_memo.py +48 -0
- metaflow/_vendor/v3_7/typeguard/_pytest_plugin.py +100 -0
- metaflow/_vendor/v3_7/typeguard/_suppression.py +88 -0
- metaflow/_vendor/v3_7/typeguard/_transformer.py +1207 -0
- metaflow/_vendor/v3_7/typeguard/_union_transformer.py +54 -0
- metaflow/_vendor/v3_7/typeguard/_utils.py +169 -0
- metaflow/_vendor/v3_7/typeguard/py.typed +0 -0
- metaflow/_vendor/v3_7/typing_extensions.py +3072 -0
- metaflow/_vendor/yaml/__init__.py +427 -0
- metaflow/_vendor/yaml/composer.py +139 -0
- metaflow/_vendor/yaml/constructor.py +748 -0
- metaflow/_vendor/yaml/cyaml.py +101 -0
- metaflow/_vendor/yaml/dumper.py +62 -0
- metaflow/_vendor/yaml/emitter.py +1137 -0
- metaflow/_vendor/yaml/error.py +75 -0
- metaflow/_vendor/yaml/events.py +86 -0
- metaflow/_vendor/yaml/loader.py +63 -0
- metaflow/_vendor/yaml/nodes.py +49 -0
- metaflow/_vendor/yaml/parser.py +589 -0
- metaflow/_vendor/yaml/reader.py +185 -0
- metaflow/_vendor/yaml/representer.py +389 -0
- metaflow/_vendor/yaml/resolver.py +227 -0
- metaflow/_vendor/yaml/scanner.py +1435 -0
- metaflow/_vendor/yaml/serializer.py +111 -0
- metaflow/_vendor/yaml/tokens.py +104 -0
- metaflow/cards.py +5 -0
- metaflow/cli.py +331 -785
- metaflow/cli_args.py +17 -0
- metaflow/cli_components/__init__.py +0 -0
- metaflow/cli_components/dump_cmd.py +96 -0
- metaflow/cli_components/init_cmd.py +52 -0
- metaflow/cli_components/run_cmds.py +546 -0
- metaflow/cli_components/step_cmd.py +334 -0
- metaflow/cli_components/utils.py +140 -0
- metaflow/client/__init__.py +1 -0
- metaflow/client/core.py +467 -73
- metaflow/client/filecache.py +75 -35
- metaflow/clone_util.py +7 -1
- metaflow/cmd/code/__init__.py +231 -0
- metaflow/cmd/develop/stub_generator.py +756 -288
- metaflow/cmd/develop/stubs.py +12 -28
- metaflow/cmd/main_cli.py +6 -4
- metaflow/cmd/make_wrapper.py +78 -0
- metaflow/datastore/__init__.py +1 -0
- metaflow/datastore/content_addressed_store.py +41 -10
- metaflow/datastore/datastore_set.py +11 -2
- metaflow/datastore/flow_datastore.py +156 -10
- metaflow/datastore/spin_datastore.py +91 -0
- metaflow/datastore/task_datastore.py +154 -39
- metaflow/debug.py +5 -0
- metaflow/decorators.py +404 -78
- metaflow/exception.py +8 -2
- metaflow/extension_support/__init__.py +527 -376
- metaflow/extension_support/_empty_file.py +2 -2
- metaflow/extension_support/plugins.py +49 -31
- metaflow/flowspec.py +482 -33
- metaflow/graph.py +210 -42
- metaflow/includefile.py +84 -40
- metaflow/lint.py +141 -22
- metaflow/meta_files.py +13 -0
- metaflow/{metadata → metadata_provider}/heartbeat.py +24 -8
- metaflow/{metadata → metadata_provider}/metadata.py +86 -1
- metaflow/metaflow_config.py +175 -28
- metaflow/metaflow_config_funcs.py +51 -3
- metaflow/metaflow_current.py +4 -10
- metaflow/metaflow_environment.py +139 -53
- metaflow/metaflow_git.py +115 -0
- metaflow/metaflow_profile.py +18 -0
- metaflow/metaflow_version.py +150 -66
- metaflow/mflog/__init__.py +4 -3
- metaflow/mflog/save_logs.py +2 -2
- metaflow/multicore_utils.py +31 -14
- metaflow/package/__init__.py +673 -0
- metaflow/packaging_sys/__init__.py +880 -0
- metaflow/packaging_sys/backend.py +128 -0
- metaflow/packaging_sys/distribution_support.py +153 -0
- metaflow/packaging_sys/tar_backend.py +99 -0
- metaflow/packaging_sys/utils.py +54 -0
- metaflow/packaging_sys/v1.py +527 -0
- metaflow/parameters.py +149 -28
- metaflow/plugins/__init__.py +74 -5
- metaflow/plugins/airflow/airflow.py +40 -25
- metaflow/plugins/airflow/airflow_cli.py +22 -5
- metaflow/plugins/airflow/airflow_decorator.py +1 -1
- metaflow/plugins/airflow/airflow_utils.py +5 -3
- metaflow/plugins/airflow/sensors/base_sensor.py +4 -4
- metaflow/plugins/airflow/sensors/external_task_sensor.py +2 -2
- metaflow/plugins/airflow/sensors/s3_sensor.py +2 -2
- metaflow/plugins/argo/argo_client.py +78 -33
- metaflow/plugins/argo/argo_events.py +6 -6
- metaflow/plugins/argo/argo_workflows.py +2410 -527
- metaflow/plugins/argo/argo_workflows_cli.py +571 -121
- metaflow/plugins/argo/argo_workflows_decorator.py +43 -12
- metaflow/plugins/argo/argo_workflows_deployer.py +106 -0
- metaflow/plugins/argo/argo_workflows_deployer_objects.py +453 -0
- metaflow/plugins/argo/capture_error.py +73 -0
- metaflow/plugins/argo/conditional_input_paths.py +35 -0
- metaflow/plugins/argo/exit_hooks.py +209 -0
- metaflow/plugins/argo/jobset_input_paths.py +15 -0
- metaflow/plugins/argo/param_val.py +19 -0
- metaflow/plugins/aws/aws_client.py +10 -3
- metaflow/plugins/aws/aws_utils.py +55 -2
- metaflow/plugins/aws/batch/batch.py +72 -5
- metaflow/plugins/aws/batch/batch_cli.py +33 -10
- metaflow/plugins/aws/batch/batch_client.py +4 -3
- metaflow/plugins/aws/batch/batch_decorator.py +102 -35
- metaflow/plugins/aws/secrets_manager/aws_secrets_manager_secrets_provider.py +13 -10
- metaflow/plugins/aws/step_functions/dynamo_db_client.py +0 -3
- metaflow/plugins/aws/step_functions/production_token.py +1 -1
- metaflow/plugins/aws/step_functions/step_functions.py +65 -8
- metaflow/plugins/aws/step_functions/step_functions_cli.py +101 -7
- metaflow/plugins/aws/step_functions/step_functions_decorator.py +1 -2
- metaflow/plugins/aws/step_functions/step_functions_deployer.py +97 -0
- metaflow/plugins/aws/step_functions/step_functions_deployer_objects.py +264 -0
- metaflow/plugins/azure/azure_exceptions.py +1 -1
- metaflow/plugins/azure/azure_secret_manager_secrets_provider.py +240 -0
- metaflow/plugins/azure/azure_tail.py +1 -1
- metaflow/plugins/azure/includefile_support.py +2 -0
- metaflow/plugins/cards/card_cli.py +66 -30
- metaflow/plugins/cards/card_creator.py +25 -1
- metaflow/plugins/cards/card_datastore.py +21 -49
- metaflow/plugins/cards/card_decorator.py +132 -8
- metaflow/plugins/cards/card_modules/basic.py +112 -17
- metaflow/plugins/cards/card_modules/bundle.css +1 -1
- metaflow/plugins/cards/card_modules/card.py +16 -1
- metaflow/plugins/cards/card_modules/chevron/renderer.py +1 -1
- metaflow/plugins/cards/card_modules/components.py +665 -28
- metaflow/plugins/cards/card_modules/convert_to_native_type.py +36 -7
- metaflow/plugins/cards/card_modules/json_viewer.py +232 -0
- metaflow/plugins/cards/card_modules/main.css +1 -0
- metaflow/plugins/cards/card_modules/main.js +68 -49
- metaflow/plugins/cards/card_modules/renderer_tools.py +1 -0
- metaflow/plugins/cards/card_modules/test_cards.py +26 -12
- metaflow/plugins/cards/card_server.py +39 -14
- metaflow/plugins/cards/component_serializer.py +2 -9
- metaflow/plugins/cards/metadata.py +22 -0
- metaflow/plugins/catch_decorator.py +9 -0
- metaflow/plugins/datastores/azure_storage.py +10 -1
- metaflow/plugins/datastores/gs_storage.py +6 -2
- metaflow/plugins/datastores/local_storage.py +12 -6
- metaflow/plugins/datastores/spin_storage.py +12 -0
- metaflow/plugins/datatools/local.py +2 -0
- metaflow/plugins/datatools/s3/s3.py +126 -75
- metaflow/plugins/datatools/s3/s3op.py +254 -121
- metaflow/plugins/env_escape/__init__.py +3 -3
- metaflow/plugins/env_escape/client_modules.py +102 -72
- metaflow/plugins/env_escape/server.py +7 -0
- metaflow/plugins/env_escape/stub.py +24 -5
- metaflow/plugins/events_decorator.py +343 -185
- metaflow/plugins/exit_hook/__init__.py +0 -0
- metaflow/plugins/exit_hook/exit_hook_decorator.py +46 -0
- metaflow/plugins/exit_hook/exit_hook_script.py +52 -0
- metaflow/plugins/gcp/__init__.py +1 -1
- metaflow/plugins/gcp/gcp_secret_manager_secrets_provider.py +11 -6
- metaflow/plugins/gcp/gs_tail.py +10 -6
- metaflow/plugins/gcp/includefile_support.py +3 -0
- metaflow/plugins/kubernetes/kube_utils.py +108 -0
- metaflow/plugins/kubernetes/kubernetes.py +411 -130
- metaflow/plugins/kubernetes/kubernetes_cli.py +168 -36
- metaflow/plugins/kubernetes/kubernetes_client.py +104 -2
- metaflow/plugins/kubernetes/kubernetes_decorator.py +246 -88
- metaflow/plugins/kubernetes/kubernetes_job.py +253 -581
- metaflow/plugins/kubernetes/kubernetes_jobsets.py +1071 -0
- metaflow/plugins/kubernetes/spot_metadata_cli.py +69 -0
- metaflow/plugins/kubernetes/spot_monitor_sidecar.py +109 -0
- metaflow/plugins/logs_cli.py +359 -0
- metaflow/plugins/{metadata → metadata_providers}/local.py +144 -84
- metaflow/plugins/{metadata → metadata_providers}/service.py +103 -26
- metaflow/plugins/metadata_providers/spin.py +16 -0
- metaflow/plugins/package_cli.py +36 -24
- metaflow/plugins/parallel_decorator.py +128 -11
- metaflow/plugins/parsers.py +16 -0
- metaflow/plugins/project_decorator.py +51 -5
- metaflow/plugins/pypi/bootstrap.py +357 -105
- metaflow/plugins/pypi/conda_decorator.py +82 -81
- metaflow/plugins/pypi/conda_environment.py +187 -52
- metaflow/plugins/pypi/micromamba.py +157 -47
- metaflow/plugins/pypi/parsers.py +268 -0
- metaflow/plugins/pypi/pip.py +88 -13
- metaflow/plugins/pypi/pypi_decorator.py +37 -1
- metaflow/plugins/pypi/utils.py +48 -2
- metaflow/plugins/resources_decorator.py +2 -2
- metaflow/plugins/secrets/__init__.py +3 -0
- metaflow/plugins/secrets/secrets_decorator.py +26 -181
- metaflow/plugins/secrets/secrets_func.py +49 -0
- metaflow/plugins/secrets/secrets_spec.py +101 -0
- metaflow/plugins/secrets/utils.py +74 -0
- metaflow/plugins/tag_cli.py +4 -7
- metaflow/plugins/test_unbounded_foreach_decorator.py +41 -6
- metaflow/plugins/timeout_decorator.py +3 -3
- metaflow/plugins/uv/__init__.py +0 -0
- metaflow/plugins/uv/bootstrap.py +128 -0
- metaflow/plugins/uv/uv_environment.py +72 -0
- metaflow/procpoll.py +1 -1
- metaflow/pylint_wrapper.py +5 -1
- metaflow/runner/__init__.py +0 -0
- metaflow/runner/click_api.py +717 -0
- metaflow/runner/deployer.py +470 -0
- metaflow/runner/deployer_impl.py +201 -0
- metaflow/runner/metaflow_runner.py +714 -0
- metaflow/runner/nbdeploy.py +132 -0
- metaflow/runner/nbrun.py +225 -0
- metaflow/runner/subprocess_manager.py +650 -0
- metaflow/runner/utils.py +335 -0
- metaflow/runtime.py +1078 -260
- metaflow/sidecar/sidecar_worker.py +1 -1
- metaflow/system/__init__.py +5 -0
- metaflow/system/system_logger.py +85 -0
- metaflow/system/system_monitor.py +108 -0
- metaflow/system/system_utils.py +19 -0
- metaflow/task.py +521 -225
- metaflow/tracing/__init__.py +7 -7
- metaflow/tracing/span_exporter.py +31 -38
- metaflow/tracing/tracing_modules.py +38 -43
- metaflow/tuple_util.py +27 -0
- metaflow/user_configs/__init__.py +0 -0
- metaflow/user_configs/config_options.py +563 -0
- metaflow/user_configs/config_parameters.py +598 -0
- metaflow/user_decorators/__init__.py +0 -0
- metaflow/user_decorators/common.py +144 -0
- metaflow/user_decorators/mutable_flow.py +512 -0
- metaflow/user_decorators/mutable_step.py +424 -0
- metaflow/user_decorators/user_flow_decorator.py +264 -0
- metaflow/user_decorators/user_step_decorator.py +749 -0
- metaflow/util.py +243 -27
- metaflow/vendor.py +23 -7
- metaflow/version.py +1 -1
- ob_metaflow-2.19.7.1rc0.data/data/share/metaflow/devtools/Makefile +355 -0
- ob_metaflow-2.19.7.1rc0.data/data/share/metaflow/devtools/Tiltfile +726 -0
- ob_metaflow-2.19.7.1rc0.data/data/share/metaflow/devtools/pick_services.sh +105 -0
- ob_metaflow-2.19.7.1rc0.dist-info/METADATA +87 -0
- ob_metaflow-2.19.7.1rc0.dist-info/RECORD +445 -0
- {ob_metaflow-2.11.13.1.dist-info → ob_metaflow-2.19.7.1rc0.dist-info}/WHEEL +1 -1
- {ob_metaflow-2.11.13.1.dist-info → ob_metaflow-2.19.7.1rc0.dist-info}/entry_points.txt +1 -0
- metaflow/_vendor/v3_5/__init__.py +0 -1
- metaflow/_vendor/v3_5/importlib_metadata/__init__.py +0 -644
- metaflow/_vendor/v3_5/importlib_metadata/_compat.py +0 -152
- metaflow/package.py +0 -188
- ob_metaflow-2.11.13.1.dist-info/METADATA +0 -85
- ob_metaflow-2.11.13.1.dist-info/RECORD +0 -308
- /metaflow/_vendor/{v3_5/zipp.py → zipp.py} +0 -0
- /metaflow/{metadata → metadata_provider}/__init__.py +0 -0
- /metaflow/{metadata → metadata_provider}/util.py +0 -0
- /metaflow/plugins/{metadata → metadata_providers}/__init__.py +0 -0
- {ob_metaflow-2.11.13.1.dist-info → ob_metaflow-2.19.7.1rc0.dist-info/licenses}/LICENSE +0 -0
- {ob_metaflow-2.11.13.1.dist-info → ob_metaflow-2.19.7.1rc0.dist-info}/top_level.txt +0 -0
|
@@ -7,26 +7,31 @@ import time
|
|
|
7
7
|
from metaflow import current
|
|
8
8
|
from metaflow.decorators import StepDecorator
|
|
9
9
|
from metaflow.exception import MetaflowException
|
|
10
|
-
from metaflow.
|
|
11
|
-
from metaflow.
|
|
10
|
+
from metaflow.metadata_provider import MetaDatum
|
|
11
|
+
from metaflow.metadata_provider.util import sync_local_metadata_to_datastore
|
|
12
12
|
from metaflow.metaflow_config import (
|
|
13
13
|
DATASTORE_LOCAL_DIR,
|
|
14
|
+
FEAT_ALWAYS_UPLOAD_CODE_PACKAGE,
|
|
14
15
|
KUBERNETES_CONTAINER_IMAGE,
|
|
15
16
|
KUBERNETES_CONTAINER_REGISTRY,
|
|
17
|
+
KUBERNETES_CPU,
|
|
18
|
+
KUBERNETES_DISK,
|
|
16
19
|
KUBERNETES_FETCH_EC2_METADATA,
|
|
17
|
-
KUBERNETES_IMAGE_PULL_POLICY,
|
|
18
20
|
KUBERNETES_GPU_VENDOR,
|
|
21
|
+
KUBERNETES_IMAGE_PULL_POLICY,
|
|
22
|
+
KUBERNETES_IMAGE_PULL_SECRETS,
|
|
23
|
+
KUBERNETES_MEMORY,
|
|
24
|
+
KUBERNETES_LABELS,
|
|
25
|
+
KUBERNETES_ANNOTATIONS,
|
|
19
26
|
KUBERNETES_NAMESPACE,
|
|
20
27
|
KUBERNETES_NODE_SELECTOR,
|
|
21
28
|
KUBERNETES_PERSISTENT_VOLUME_CLAIMS,
|
|
22
|
-
KUBERNETES_TOLERATIONS,
|
|
23
|
-
KUBERNETES_SERVICE_ACCOUNT,
|
|
24
29
|
KUBERNETES_PORT,
|
|
30
|
+
KUBERNETES_SERVICE_ACCOUNT,
|
|
25
31
|
KUBERNETES_SHARED_MEMORY,
|
|
26
|
-
|
|
27
|
-
|
|
28
|
-
|
|
29
|
-
KUBERNETES_DISK,
|
|
32
|
+
KUBERNETES_TOLERATIONS,
|
|
33
|
+
KUBERNETES_QOS,
|
|
34
|
+
KUBERNETES_CONDA_ARCH,
|
|
30
35
|
)
|
|
31
36
|
from metaflow.plugins.resources_decorator import ResourcesDecorator
|
|
32
37
|
from metaflow.plugins.timeout_decorator import get_run_time_limit_for_task
|
|
@@ -34,7 +39,8 @@ from metaflow.sidecar import Sidecar
|
|
|
34
39
|
from metaflow.unbounded_foreach import UBF_CONTROL
|
|
35
40
|
|
|
36
41
|
from ..aws.aws_utils import get_docker_registry, get_ec2_instance_metadata
|
|
37
|
-
from .kubernetes import KubernetesException
|
|
42
|
+
from .kubernetes import KubernetesException
|
|
43
|
+
from .kube_utils import validate_kube_labels, parse_kube_keyvalue_list
|
|
38
44
|
|
|
39
45
|
from metaflow.metaflow_config import MAX_MEMORY_PER_TASK, MAX_CPU_PER_TASK
|
|
40
46
|
|
|
@@ -44,6 +50,8 @@ except NameError:
|
|
|
44
50
|
unicode = str
|
|
45
51
|
basestring = str
|
|
46
52
|
|
|
53
|
+
SUPPORTED_KUBERNETES_QOS_CLASSES = ["Guaranteed", "Burstable"]
|
|
54
|
+
|
|
47
55
|
|
|
48
56
|
class KubernetesDecorator(StepDecorator):
|
|
49
57
|
"""
|
|
@@ -68,12 +76,21 @@ class KubernetesDecorator(StepDecorator):
|
|
|
68
76
|
not, a default Docker image mapping to the current version of Python is used.
|
|
69
77
|
image_pull_policy: str, default KUBERNETES_IMAGE_PULL_POLICY
|
|
70
78
|
If given, the imagePullPolicy to be applied to the Docker image of the step.
|
|
79
|
+
image_pull_secrets: List[str], default []
|
|
80
|
+
The default is extracted from METAFLOW_KUBERNETES_IMAGE_PULL_SECRETS.
|
|
81
|
+
Kubernetes image pull secrets to use when pulling container images
|
|
82
|
+
in Kubernetes.
|
|
71
83
|
service_account : str, default METAFLOW_KUBERNETES_SERVICE_ACCOUNT
|
|
72
84
|
Kubernetes service account to use when launching pod in Kubernetes.
|
|
73
85
|
secrets : List[str], optional, default None
|
|
74
86
|
Kubernetes secrets to use when launching pod in Kubernetes. These
|
|
75
87
|
secrets are in addition to the ones defined in `METAFLOW_KUBERNETES_SECRETS`
|
|
76
88
|
in Metaflow configuration.
|
|
89
|
+
node_selector: Union[Dict[str,str], str], optional, default None
|
|
90
|
+
Kubernetes node selector(s) to apply to the pod running the task.
|
|
91
|
+
Can be passed in as a comma separated string of values e.g.
|
|
92
|
+
'kubernetes.io/os=linux,kubernetes.io/arch=amd64' or as a dictionary
|
|
93
|
+
{'kubernetes.io/os': 'linux', 'kubernetes.io/arch': 'amd64'}
|
|
77
94
|
namespace : str, default METAFLOW_KUBERNETES_NAMESPACE
|
|
78
95
|
Kubernetes namespace to use when launching pod in Kubernetes.
|
|
79
96
|
gpu : int, optional, default None
|
|
@@ -81,9 +98,13 @@ class KubernetesDecorator(StepDecorator):
|
|
|
81
98
|
the scheduled node should not have GPUs.
|
|
82
99
|
gpu_vendor : str, default KUBERNETES_GPU_VENDOR
|
|
83
100
|
The vendor of the GPUs to be used for this step.
|
|
84
|
-
tolerations : List[str], default []
|
|
101
|
+
tolerations : List[Dict[str,str]], default []
|
|
85
102
|
The default is extracted from METAFLOW_KUBERNETES_TOLERATIONS.
|
|
86
103
|
Kubernetes tolerations to use when launching pod in Kubernetes.
|
|
104
|
+
labels: Dict[str, str], default: METAFLOW_KUBERNETES_LABELS
|
|
105
|
+
Kubernetes labels to use when launching pod in Kubernetes.
|
|
106
|
+
annotations: Dict[str, str], default: METAFLOW_KUBERNETES_ANNOTATIONS
|
|
107
|
+
Kubernetes annotations to use when launching pod in Kubernetes.
|
|
87
108
|
use_tmpfs : bool, default False
|
|
88
109
|
This enables an explicit tmpfs mount for this step.
|
|
89
110
|
tmpfs_tempdir : bool, default True
|
|
@@ -101,6 +122,22 @@ class KubernetesDecorator(StepDecorator):
|
|
|
101
122
|
Shared memory size (in MiB) required for this step
|
|
102
123
|
port: int, optional
|
|
103
124
|
Port number to specify in the Kubernetes job object
|
|
125
|
+
compute_pool : str, optional, default None
|
|
126
|
+
Compute pool to be used for this step.
|
|
127
|
+
If not specified, any accessible compute pool within the perimeter is used.
|
|
128
|
+
hostname_resolution_timeout: int, default 10 * 60
|
|
129
|
+
Timeout in seconds for the worker tasks in the gang-scheduled cluster to resolve the hostname of the control task.
|
|
130
|
+
Only applicable when @parallel is used.
|
|
131
|
+
qos: str, default: Burstable
|
|
132
|
+
Quality of Service class to assign to the pod. Supported values are: Guaranteed, Burstable
|
|
133
|
+
|
|
134
|
+
security_context: Dict[str, Any], optional, default None
|
|
135
|
+
Container security context. Applies to the task container. Allows the following keys:
|
|
136
|
+
- privileged: bool, optional, default None
|
|
137
|
+
- allow_privilege_escalation: bool, optional, default None
|
|
138
|
+
- run_as_user: int, optional, default None
|
|
139
|
+
- run_as_group: int, optional, default None
|
|
140
|
+
- run_as_non_root: bool, optional, default None
|
|
104
141
|
"""
|
|
105
142
|
|
|
106
143
|
name = "kubernetes"
|
|
@@ -110,6 +147,7 @@ class KubernetesDecorator(StepDecorator):
|
|
|
110
147
|
"disk": "10240",
|
|
111
148
|
"image": None,
|
|
112
149
|
"image_pull_policy": None,
|
|
150
|
+
"image_pull_secrets": None, # e.g., ["regcred"]
|
|
113
151
|
"service_account": None,
|
|
114
152
|
"secrets": None, # e.g., mysecret
|
|
115
153
|
"node_selector": None, # e.g., kubernetes.io/os=linux
|
|
@@ -118,6 +156,8 @@ class KubernetesDecorator(StepDecorator):
|
|
|
118
156
|
"gpu_vendor": None,
|
|
119
157
|
"tolerations": None, # e.g., [{"key": "arch", "operator": "Equal", "value": "amd"},
|
|
120
158
|
# {"key": "foo", "operator": "Equal", "value": "bar"}]
|
|
159
|
+
"labels": None, # e.g. {"test-label": "value", "another-label":"value2"}
|
|
160
|
+
"annotations": None, # e.g. {"note": "value", "another-note": "value2"}
|
|
121
161
|
"use_tmpfs": None,
|
|
122
162
|
"tmpfs_tempdir": True,
|
|
123
163
|
"tmpfs_size": None,
|
|
@@ -125,14 +165,22 @@ class KubernetesDecorator(StepDecorator):
|
|
|
125
165
|
"persistent_volume_claims": None, # e.g., {"pvc-name": "/mnt/vol", "another-pvc": "/mnt/vol2"}
|
|
126
166
|
"shared_memory": None,
|
|
127
167
|
"port": None,
|
|
168
|
+
"compute_pool": None,
|
|
169
|
+
"executable": None,
|
|
170
|
+
"hostname_resolution_timeout": 10 * 60,
|
|
171
|
+
"qos": KUBERNETES_QOS,
|
|
172
|
+
"security_context": None,
|
|
128
173
|
}
|
|
174
|
+
package_metadata = None
|
|
129
175
|
package_url = None
|
|
130
176
|
package_sha = None
|
|
131
177
|
run_time_limit = None
|
|
132
178
|
|
|
133
|
-
|
|
134
|
-
|
|
179
|
+
# Conda environment support
|
|
180
|
+
supports_conda_environment = True
|
|
181
|
+
target_platform = KUBERNETES_CONDA_ARCH or "linux-64"
|
|
135
182
|
|
|
183
|
+
def init(self):
|
|
136
184
|
if not self.attributes["namespace"]:
|
|
137
185
|
self.attributes["namespace"] = KUBERNETES_NAMESPACE
|
|
138
186
|
if not self.attributes["service_account"]:
|
|
@@ -152,11 +200,21 @@ class KubernetesDecorator(StepDecorator):
|
|
|
152
200
|
)
|
|
153
201
|
if not self.attributes["image_pull_policy"] and KUBERNETES_IMAGE_PULL_POLICY:
|
|
154
202
|
self.attributes["image_pull_policy"] = KUBERNETES_IMAGE_PULL_POLICY
|
|
203
|
+
if not self.attributes["image_pull_secrets"] and KUBERNETES_IMAGE_PULL_SECRETS:
|
|
204
|
+
self.attributes["image_pull_secrets"] = json.loads(
|
|
205
|
+
KUBERNETES_IMAGE_PULL_SECRETS
|
|
206
|
+
)
|
|
155
207
|
|
|
156
208
|
if isinstance(self.attributes["node_selector"], str):
|
|
157
209
|
self.attributes["node_selector"] = parse_kube_keyvalue_list(
|
|
158
210
|
self.attributes["node_selector"].split(",")
|
|
159
211
|
)
|
|
212
|
+
if self.attributes["compute_pool"]:
|
|
213
|
+
if self.attributes["node_selector"] is None:
|
|
214
|
+
self.attributes["node_selector"] = {}
|
|
215
|
+
self.attributes["node_selector"].update(
|
|
216
|
+
{"outerbounds.co/compute-pool": self.attributes["compute_pool"]}
|
|
217
|
+
)
|
|
160
218
|
|
|
161
219
|
if self.attributes["tolerations"]:
|
|
162
220
|
try:
|
|
@@ -190,6 +248,36 @@ class KubernetesDecorator(StepDecorator):
|
|
|
190
248
|
self.attributes["memory"] = KUBERNETES_MEMORY
|
|
191
249
|
if self.attributes["disk"] == self.defaults["disk"] and KUBERNETES_DISK:
|
|
192
250
|
self.attributes["disk"] = KUBERNETES_DISK
|
|
251
|
+
# Label source precedence (decreasing):
|
|
252
|
+
# - System labels (set outside of decorator)
|
|
253
|
+
# - Decorator labels: @kubernetes(labels={})
|
|
254
|
+
# - Environment variable labels: METAFLOW_KUBERNETES_LABELS=
|
|
255
|
+
deco_labels = {}
|
|
256
|
+
if self.attributes["labels"] is not None:
|
|
257
|
+
deco_labels = self.attributes["labels"]
|
|
258
|
+
|
|
259
|
+
env_labels = {}
|
|
260
|
+
if KUBERNETES_LABELS:
|
|
261
|
+
env_labels = parse_kube_keyvalue_list(KUBERNETES_LABELS.split(","), False)
|
|
262
|
+
|
|
263
|
+
self.attributes["labels"] = {**env_labels, **deco_labels}
|
|
264
|
+
|
|
265
|
+
# Annotations
|
|
266
|
+
# annotation precedence (decreasing):
|
|
267
|
+
# - System annotations (set outside of decorator)
|
|
268
|
+
# - Decorator annotations: @kubernetes(annotations={})
|
|
269
|
+
# - Environment annotations: METAFLOW_KUBERNETES_ANNOTATIONS=
|
|
270
|
+
deco_annotations = {}
|
|
271
|
+
if self.attributes["annotations"] is not None:
|
|
272
|
+
deco_annotations = self.attributes["annotations"]
|
|
273
|
+
|
|
274
|
+
env_annotations = {}
|
|
275
|
+
if KUBERNETES_ANNOTATIONS:
|
|
276
|
+
env_annotations = parse_kube_keyvalue_list(
|
|
277
|
+
KUBERNETES_ANNOTATIONS.split(","), False
|
|
278
|
+
)
|
|
279
|
+
|
|
280
|
+
self.attributes["annotations"] = {**env_annotations, **deco_annotations}
|
|
193
281
|
|
|
194
282
|
# If no docker image is explicitly specified, impute a default image.
|
|
195
283
|
if not self.attributes["image"]:
|
|
@@ -238,12 +326,33 @@ class KubernetesDecorator(StepDecorator):
|
|
|
238
326
|
self.step = step
|
|
239
327
|
self.flow_datastore = flow_datastore
|
|
240
328
|
|
|
329
|
+
if (
|
|
330
|
+
self.attributes["qos"] is not None
|
|
331
|
+
# case insensitive matching.
|
|
332
|
+
and self.attributes["qos"].lower()
|
|
333
|
+
not in [c.lower() for c in SUPPORTED_KUBERNETES_QOS_CLASSES]
|
|
334
|
+
):
|
|
335
|
+
raise MetaflowException(
|
|
336
|
+
"*%s* is not a valid Kubernetes QoS class. Choose one of the following: %s"
|
|
337
|
+
% (self.attributes["qos"], ", ".join(SUPPORTED_KUBERNETES_QOS_CLASSES))
|
|
338
|
+
)
|
|
339
|
+
|
|
241
340
|
if any([deco.name == "batch" for deco in decos]):
|
|
242
341
|
raise MetaflowException(
|
|
243
342
|
"Step *{step}* is marked for execution both on AWS Batch and "
|
|
244
343
|
"Kubernetes. Please use one or the other.".format(step=step)
|
|
245
344
|
)
|
|
246
345
|
|
|
346
|
+
if any([deco.name == "parallel" for deco in decos]) and any(
|
|
347
|
+
[deco.name == "catch" for deco in decos]
|
|
348
|
+
):
|
|
349
|
+
raise MetaflowException(
|
|
350
|
+
"Step *{step}* contains a @parallel decorator "
|
|
351
|
+
"with the @catch decorator. @catch is not supported with @parallel on Kubernetes.".format(
|
|
352
|
+
step=step
|
|
353
|
+
)
|
|
354
|
+
)
|
|
355
|
+
|
|
247
356
|
# Set run time limit for the Kubernetes job.
|
|
248
357
|
self.run_time_limit = get_run_time_limit_for_task(decos)
|
|
249
358
|
if self.run_time_limit < 60:
|
|
@@ -327,7 +436,7 @@ class KubernetesDecorator(StepDecorator):
|
|
|
327
436
|
|
|
328
437
|
if self.attributes["shared_memory"]:
|
|
329
438
|
if not (
|
|
330
|
-
isinstance(self.attributes["shared_memory"],
|
|
439
|
+
isinstance(self.attributes["shared_memory"], int)
|
|
331
440
|
and int(self.attributes["shared_memory"]) > 0
|
|
332
441
|
):
|
|
333
442
|
raise KubernetesException(
|
|
@@ -336,6 +445,9 @@ class KubernetesDecorator(StepDecorator):
|
|
|
336
445
|
)
|
|
337
446
|
)
|
|
338
447
|
|
|
448
|
+
validate_kube_labels(self.attributes["labels"])
|
|
449
|
+
# TODO: add validation to annotations as well?
|
|
450
|
+
|
|
339
451
|
def package_init(self, flow, step_name, environment):
|
|
340
452
|
try:
|
|
341
453
|
# Kubernetes is a soft dependency.
|
|
@@ -374,12 +486,17 @@ class KubernetesDecorator(StepDecorator):
|
|
|
374
486
|
# to execute on Kubernetes anymore. We can execute possible fallback
|
|
375
487
|
# code locally.
|
|
376
488
|
cli_args.commands = ["kubernetes", "step"]
|
|
489
|
+
cli_args.command_args.append(self.package_metadata)
|
|
377
490
|
cli_args.command_args.append(self.package_sha)
|
|
378
491
|
cli_args.command_args.append(self.package_url)
|
|
379
492
|
|
|
493
|
+
# skip certain keys as CLI arguments
|
|
494
|
+
_skip_keys = ["compute_pool", "hostname_resolution_timeout"]
|
|
380
495
|
# --namespace is used to specify Metaflow namespace (a different
|
|
381
496
|
# concept from k8s namespace).
|
|
382
497
|
for k, v in self.attributes.items():
|
|
498
|
+
if k in _skip_keys:
|
|
499
|
+
continue
|
|
383
500
|
if k == "namespace":
|
|
384
501
|
cli_args.command_options["k8s_namespace"] = v
|
|
385
502
|
elif k in {"node_selector"} and v:
|
|
@@ -387,7 +504,14 @@ class KubernetesDecorator(StepDecorator):
|
|
|
387
504
|
"=".join([key, str(val)]) if val else key
|
|
388
505
|
for key, val in v.items()
|
|
389
506
|
]
|
|
390
|
-
elif k in [
|
|
507
|
+
elif k in [
|
|
508
|
+
"image_pull_secrets",
|
|
509
|
+
"tolerations",
|
|
510
|
+
"persistent_volume_claims",
|
|
511
|
+
"labels",
|
|
512
|
+
"annotations",
|
|
513
|
+
"security_context",
|
|
514
|
+
]:
|
|
391
515
|
cli_args.command_options[k] = json.dumps(v)
|
|
392
516
|
else:
|
|
393
517
|
cli_args.command_options[k] = v
|
|
@@ -422,8 +546,8 @@ class KubernetesDecorator(StepDecorator):
|
|
|
422
546
|
# check for the existence of METAFLOW_KUBERNETES_WORKLOAD environment
|
|
423
547
|
# variable.
|
|
424
548
|
|
|
549
|
+
meta = {}
|
|
425
550
|
if "METAFLOW_KUBERNETES_WORKLOAD" in os.environ:
|
|
426
|
-
meta = {}
|
|
427
551
|
meta["kubernetes-pod-name"] = os.environ["METAFLOW_KUBERNETES_POD_NAME"]
|
|
428
552
|
meta["kubernetes-pod-namespace"] = os.environ[
|
|
429
553
|
"METAFLOW_KUBERNETES_POD_NAMESPACE"
|
|
@@ -434,10 +558,14 @@ class KubernetesDecorator(StepDecorator):
|
|
|
434
558
|
]
|
|
435
559
|
meta["kubernetes-node-ip"] = os.environ["METAFLOW_KUBERNETES_NODE_IP"]
|
|
436
560
|
|
|
561
|
+
meta["kubernetes-jobset-name"] = os.environ.get(
|
|
562
|
+
"METAFLOW_KUBERNETES_JOBSET_NAME"
|
|
563
|
+
)
|
|
564
|
+
|
|
437
565
|
# TODO (savin): Introduce equivalent support for Microsoft Azure and
|
|
438
566
|
# Google Cloud Platform
|
|
439
|
-
# TODO: Introduce a way to detect Cloud Provider, so unnecessary requests
|
|
440
|
-
# can be avoided by not having to try out all providers.
|
|
567
|
+
# TODO: Introduce a way to detect Cloud Provider, so unnecessary requests
|
|
568
|
+
# (and delays) can be avoided by not having to try out all providers.
|
|
441
569
|
if KUBERNETES_FETCH_EC2_METADATA:
|
|
442
570
|
instance_meta = get_ec2_instance_metadata()
|
|
443
571
|
meta.update(instance_meta)
|
|
@@ -453,38 +581,51 @@ class KubernetesDecorator(StepDecorator):
|
|
|
453
581
|
# "METAFLOW_KUBERNETES_POD_NAME"
|
|
454
582
|
# ].rpartition("-")[0]
|
|
455
583
|
|
|
456
|
-
entries = [
|
|
457
|
-
MetaDatum(field=k, value=v, type=k, tags=[])
|
|
458
|
-
for k, v in meta.items()
|
|
459
|
-
if v is not None
|
|
460
|
-
]
|
|
461
|
-
# Register book-keeping metadata for debugging.
|
|
462
|
-
metadata.register_metadata(run_id, step_name, task_id, entries)
|
|
463
|
-
|
|
464
584
|
# Start MFLog sidecar to collect task logs.
|
|
465
585
|
self._save_logs_sidecar = Sidecar("save_logs_periodically")
|
|
466
586
|
self._save_logs_sidecar.start()
|
|
467
587
|
|
|
468
|
-
|
|
469
|
-
|
|
588
|
+
# Start spot termination monitor sidecar.
|
|
589
|
+
current._update_env(
|
|
590
|
+
{"spot_termination_notice": "/tmp/spot_termination_notice"}
|
|
591
|
+
)
|
|
592
|
+
self._spot_monitor_sidecar = Sidecar("spot_termination_monitor")
|
|
593
|
+
self._spot_monitor_sidecar.start()
|
|
594
|
+
|
|
595
|
+
num_parallel = None
|
|
596
|
+
if hasattr(flow, "_parallel_ubf_iter"):
|
|
597
|
+
num_parallel = flow._parallel_ubf_iter.num_parallel
|
|
598
|
+
|
|
599
|
+
if num_parallel and num_parallel > 1:
|
|
600
|
+
_setup_multinode_environment(
|
|
601
|
+
ubf_context, self.attributes["hostname_resolution_timeout"]
|
|
602
|
+
)
|
|
603
|
+
# current.parallel.node_index will be correctly available over here.
|
|
604
|
+
meta.update({"parallel-node-index": current.parallel.node_index})
|
|
470
605
|
if ubf_context == UBF_CONTROL:
|
|
471
|
-
control_task_id = current.task_id
|
|
472
|
-
top_task_id = control_task_id.replace("control-", "")
|
|
473
|
-
mapper_task_ids = [control_task_id] + [
|
|
474
|
-
"%s-node-%d" % (top_task_id, node_idx)
|
|
475
|
-
for node_idx in range(1, num_parallel)
|
|
476
|
-
]
|
|
477
606
|
flow._control_mapper_tasks = [
|
|
478
|
-
"
|
|
479
|
-
for
|
|
607
|
+
"{}/{}/{}".format(run_id, step_name, task_id)
|
|
608
|
+
for task_id in [task_id]
|
|
609
|
+
+ [
|
|
610
|
+
"%s-worker-%d" % (task_id, idx)
|
|
611
|
+
for idx in range(num_parallel - 1)
|
|
612
|
+
]
|
|
480
613
|
]
|
|
481
614
|
flow._control_task_is_mapper_zero = True
|
|
482
|
-
else:
|
|
483
|
-
worker_job_rank = int(os.environ["RANK"])
|
|
484
|
-
os.environ["RANK"] = str(worker_job_rank + 1)
|
|
485
615
|
|
|
486
|
-
if
|
|
487
|
-
|
|
616
|
+
if len(meta) > 0:
|
|
617
|
+
entries = [
|
|
618
|
+
MetaDatum(
|
|
619
|
+
field=k,
|
|
620
|
+
value=v,
|
|
621
|
+
type=k,
|
|
622
|
+
tags=["attempt_id:{0}".format(retry_count)],
|
|
623
|
+
)
|
|
624
|
+
for k, v in meta.items()
|
|
625
|
+
if v is not None
|
|
626
|
+
]
|
|
627
|
+
# Register book-keeping metadata for debugging.
|
|
628
|
+
metadata.register_metadata(run_id, step_name, task_id, entries)
|
|
488
629
|
|
|
489
630
|
def task_finished(
|
|
490
631
|
self, step_name, flow, graph, is_task_ok, retry_count, max_retries
|
|
@@ -497,10 +638,10 @@ class KubernetesDecorator(StepDecorator):
|
|
|
497
638
|
# local file system after the user code has finished execution.
|
|
498
639
|
# This happens via datastore as a communication bridge.
|
|
499
640
|
|
|
500
|
-
# TODO: There is no guarantee that
|
|
501
|
-
# task_finished is invoked.
|
|
502
|
-
#
|
|
503
|
-
if self.metadata.TYPE == "local":
|
|
641
|
+
# TODO: There is no guarantee that task_pre_step executes before
|
|
642
|
+
# task_finished is invoked.
|
|
643
|
+
# For now we guard against the missing metadata object in this case.
|
|
644
|
+
if hasattr(self, "metadata") and self.metadata.TYPE == "local":
|
|
504
645
|
# Note that the datastore is *always* Amazon S3 (see
|
|
505
646
|
# runtime_task_created function).
|
|
506
647
|
sync_local_metadata_to_datastore(
|
|
@@ -509,57 +650,74 @@ class KubernetesDecorator(StepDecorator):
|
|
|
509
650
|
|
|
510
651
|
try:
|
|
511
652
|
self._save_logs_sidecar.terminate()
|
|
653
|
+
self._spot_monitor_sidecar.terminate()
|
|
512
654
|
except:
|
|
513
655
|
# Best effort kill
|
|
514
656
|
pass
|
|
515
657
|
|
|
516
|
-
|
|
517
|
-
|
|
658
|
+
@classmethod
|
|
659
|
+
def _save_package_once(cls, flow_datastore, package):
|
|
660
|
+
if cls.package_url is None:
|
|
661
|
+
if not FEAT_ALWAYS_UPLOAD_CODE_PACKAGE:
|
|
662
|
+
cls.package_url, cls.package_sha = flow_datastore.save_data(
|
|
663
|
+
[package.blob], len_hint=1
|
|
664
|
+
)[0]
|
|
665
|
+
cls.package_metadata = package.package_metadata
|
|
666
|
+
else:
|
|
667
|
+
# Blocks until the package is uploaded
|
|
668
|
+
cls.package_url = package.package_url()
|
|
669
|
+
cls.package_sha = package.package_sha()
|
|
670
|
+
cls.package_metadata = package.package_metadata
|
|
671
|
+
|
|
672
|
+
|
|
673
|
+
# TODO: Unify this method with the multi-node setup in @batch
|
|
674
|
+
def _setup_multinode_environment(ubf_context, hostname_resolution_timeout):
|
|
675
|
+
import socket
|
|
518
676
|
|
|
519
|
-
def
|
|
677
|
+
def _wait_for_hostname_resolution(max_wait_timeout=10 * 60):
|
|
520
678
|
"""
|
|
521
|
-
|
|
522
|
-
|
|
523
|
-
|
|
524
|
-
|
|
679
|
+
keep trying to resolve the hostname of the control task until the hostname is resolved
|
|
680
|
+
or the max_wait_timeout is reached. This is a workaround for the issue where the control
|
|
681
|
+
task is not scheduled before the worker task and the worker task fails because it cannot
|
|
682
|
+
resolve the hostname of the control task.
|
|
525
683
|
"""
|
|
526
|
-
|
|
527
|
-
|
|
528
|
-
TIMEOUT = 600
|
|
529
|
-
last_completion_timeout = time.time() + TIMEOUT
|
|
530
|
-
print("Waiting for batch secondary tasks to finish")
|
|
531
|
-
while last_completion_timeout > time.time():
|
|
532
|
-
time.sleep(2)
|
|
684
|
+
start_time = time.time()
|
|
685
|
+
while True:
|
|
533
686
|
try:
|
|
534
|
-
|
|
535
|
-
|
|
536
|
-
if
|
|
537
|
-
|
|
538
|
-
|
|
539
|
-
|
|
540
|
-
return True
|
|
541
|
-
else:
|
|
542
|
-
print(
|
|
543
|
-
"Waiting for all parallel tasks to finish. Finished: {}/{}".format(
|
|
544
|
-
len(tasks),
|
|
545
|
-
len(flow._control_mapper_tasks),
|
|
687
|
+
return socket.gethostbyname(os.environ["MF_MASTER_ADDR"])
|
|
688
|
+
except socket.gaierror:
|
|
689
|
+
if time.time() - start_time > max_wait_timeout:
|
|
690
|
+
raise MetaflowException(
|
|
691
|
+
"Failed to get host by name for MF_MASTER_ADDR after waiting for {} seconds.".format(
|
|
692
|
+
max_wait_timeout
|
|
546
693
|
)
|
|
547
694
|
)
|
|
548
|
-
|
|
549
|
-
|
|
550
|
-
|
|
551
|
-
|
|
552
|
-
|
|
553
|
-
|
|
554
|
-
|
|
555
|
-
|
|
556
|
-
if
|
|
557
|
-
|
|
558
|
-
|
|
559
|
-
)
|
|
695
|
+
time.sleep(1)
|
|
696
|
+
|
|
697
|
+
try:
|
|
698
|
+
# Even if Kubernetes may deploy control pods before worker pods, there is always a
|
|
699
|
+
# possibility that the worker pods may start before the control. In the case that this happens,
|
|
700
|
+
# the worker pods will not be able to resolve the control pod's IP address and this will cause
|
|
701
|
+
# the worker pods to fail. So if the worker pods are requesting a hostname resolution, we will
|
|
702
|
+
# make it wait for the name to be resolved within a reasonable timeout period.
|
|
703
|
+
if ubf_context != UBF_CONTROL:
|
|
704
|
+
os.environ["MF_PARALLEL_MAIN_IP"] = _wait_for_hostname_resolution(
|
|
705
|
+
hostname_resolution_timeout
|
|
706
|
+
)
|
|
707
|
+
else:
|
|
708
|
+
os.environ["MF_PARALLEL_MAIN_IP"] = socket.gethostbyname(
|
|
709
|
+
os.environ["MF_MASTER_ADDR"]
|
|
710
|
+
)
|
|
560
711
|
|
|
561
|
-
|
|
562
|
-
|
|
563
|
-
|
|
564
|
-
|
|
565
|
-
|
|
712
|
+
os.environ["MF_PARALLEL_NUM_NODES"] = os.environ["MF_WORLD_SIZE"]
|
|
713
|
+
os.environ["MF_PARALLEL_NODE_INDEX"] = (
|
|
714
|
+
str(0)
|
|
715
|
+
if "MF_CONTROL_INDEX" in os.environ
|
|
716
|
+
else str(int(os.environ["MF_WORKER_REPLICA_INDEX"]) + 1)
|
|
717
|
+
)
|
|
718
|
+
except KeyError as e:
|
|
719
|
+
raise MetaflowException("Environment variable {} is missing.".format(e))
|
|
720
|
+
except socket.gaierror:
|
|
721
|
+
raise MetaflowException("Failed to get host by name for MF_MASTER_ADDR.")
|
|
722
|
+
except ValueError:
|
|
723
|
+
raise MetaflowException("Invalid value for MF_WORKER_REPLICA_INDEX.")
|