ob-metaflow 2.11.13.1__py2.py3-none-any.whl → 2.19.7.1rc0__py2.py3-none-any.whl
This diff compares two publicly released versions of the package as published to a supported public registry. It is provided for informational purposes only and reflects the package contents as they appear in that registry. The per-file summary below is followed by the full diff of metaflow/plugins/kubernetes/kubernetes_job.py.
- metaflow/R.py +10 -7
- metaflow/__init__.py +40 -25
- metaflow/_vendor/imghdr/__init__.py +186 -0
- metaflow/_vendor/importlib_metadata/__init__.py +1063 -0
- metaflow/_vendor/importlib_metadata/_adapters.py +68 -0
- metaflow/_vendor/importlib_metadata/_collections.py +30 -0
- metaflow/_vendor/importlib_metadata/_compat.py +71 -0
- metaflow/_vendor/importlib_metadata/_functools.py +104 -0
- metaflow/_vendor/importlib_metadata/_itertools.py +73 -0
- metaflow/_vendor/importlib_metadata/_meta.py +48 -0
- metaflow/_vendor/importlib_metadata/_text.py +99 -0
- metaflow/_vendor/importlib_metadata/py.typed +0 -0
- metaflow/_vendor/typeguard/__init__.py +48 -0
- metaflow/_vendor/typeguard/_checkers.py +1070 -0
- metaflow/_vendor/typeguard/_config.py +108 -0
- metaflow/_vendor/typeguard/_decorators.py +233 -0
- metaflow/_vendor/typeguard/_exceptions.py +42 -0
- metaflow/_vendor/typeguard/_functions.py +308 -0
- metaflow/_vendor/typeguard/_importhook.py +213 -0
- metaflow/_vendor/typeguard/_memo.py +48 -0
- metaflow/_vendor/typeguard/_pytest_plugin.py +127 -0
- metaflow/_vendor/typeguard/_suppression.py +86 -0
- metaflow/_vendor/typeguard/_transformer.py +1229 -0
- metaflow/_vendor/typeguard/_union_transformer.py +55 -0
- metaflow/_vendor/typeguard/_utils.py +173 -0
- metaflow/_vendor/typeguard/py.typed +0 -0
- metaflow/_vendor/typing_extensions.py +3641 -0
- metaflow/_vendor/v3_7/importlib_metadata/__init__.py +1063 -0
- metaflow/_vendor/v3_7/importlib_metadata/_adapters.py +68 -0
- metaflow/_vendor/v3_7/importlib_metadata/_collections.py +30 -0
- metaflow/_vendor/v3_7/importlib_metadata/_compat.py +71 -0
- metaflow/_vendor/v3_7/importlib_metadata/_functools.py +104 -0
- metaflow/_vendor/v3_7/importlib_metadata/_itertools.py +73 -0
- metaflow/_vendor/v3_7/importlib_metadata/_meta.py +48 -0
- metaflow/_vendor/v3_7/importlib_metadata/_text.py +99 -0
- metaflow/_vendor/v3_7/importlib_metadata/py.typed +0 -0
- metaflow/_vendor/v3_7/typeguard/__init__.py +48 -0
- metaflow/_vendor/v3_7/typeguard/_checkers.py +906 -0
- metaflow/_vendor/v3_7/typeguard/_config.py +108 -0
- metaflow/_vendor/v3_7/typeguard/_decorators.py +237 -0
- metaflow/_vendor/v3_7/typeguard/_exceptions.py +42 -0
- metaflow/_vendor/v3_7/typeguard/_functions.py +310 -0
- metaflow/_vendor/v3_7/typeguard/_importhook.py +213 -0
- metaflow/_vendor/v3_7/typeguard/_memo.py +48 -0
- metaflow/_vendor/v3_7/typeguard/_pytest_plugin.py +100 -0
- metaflow/_vendor/v3_7/typeguard/_suppression.py +88 -0
- metaflow/_vendor/v3_7/typeguard/_transformer.py +1207 -0
- metaflow/_vendor/v3_7/typeguard/_union_transformer.py +54 -0
- metaflow/_vendor/v3_7/typeguard/_utils.py +169 -0
- metaflow/_vendor/v3_7/typeguard/py.typed +0 -0
- metaflow/_vendor/v3_7/typing_extensions.py +3072 -0
- metaflow/_vendor/yaml/__init__.py +427 -0
- metaflow/_vendor/yaml/composer.py +139 -0
- metaflow/_vendor/yaml/constructor.py +748 -0
- metaflow/_vendor/yaml/cyaml.py +101 -0
- metaflow/_vendor/yaml/dumper.py +62 -0
- metaflow/_vendor/yaml/emitter.py +1137 -0
- metaflow/_vendor/yaml/error.py +75 -0
- metaflow/_vendor/yaml/events.py +86 -0
- metaflow/_vendor/yaml/loader.py +63 -0
- metaflow/_vendor/yaml/nodes.py +49 -0
- metaflow/_vendor/yaml/parser.py +589 -0
- metaflow/_vendor/yaml/reader.py +185 -0
- metaflow/_vendor/yaml/representer.py +389 -0
- metaflow/_vendor/yaml/resolver.py +227 -0
- metaflow/_vendor/yaml/scanner.py +1435 -0
- metaflow/_vendor/yaml/serializer.py +111 -0
- metaflow/_vendor/yaml/tokens.py +104 -0
- metaflow/cards.py +5 -0
- metaflow/cli.py +331 -785
- metaflow/cli_args.py +17 -0
- metaflow/cli_components/__init__.py +0 -0
- metaflow/cli_components/dump_cmd.py +96 -0
- metaflow/cli_components/init_cmd.py +52 -0
- metaflow/cli_components/run_cmds.py +546 -0
- metaflow/cli_components/step_cmd.py +334 -0
- metaflow/cli_components/utils.py +140 -0
- metaflow/client/__init__.py +1 -0
- metaflow/client/core.py +467 -73
- metaflow/client/filecache.py +75 -35
- metaflow/clone_util.py +7 -1
- metaflow/cmd/code/__init__.py +231 -0
- metaflow/cmd/develop/stub_generator.py +756 -288
- metaflow/cmd/develop/stubs.py +12 -28
- metaflow/cmd/main_cli.py +6 -4
- metaflow/cmd/make_wrapper.py +78 -0
- metaflow/datastore/__init__.py +1 -0
- metaflow/datastore/content_addressed_store.py +41 -10
- metaflow/datastore/datastore_set.py +11 -2
- metaflow/datastore/flow_datastore.py +156 -10
- metaflow/datastore/spin_datastore.py +91 -0
- metaflow/datastore/task_datastore.py +154 -39
- metaflow/debug.py +5 -0
- metaflow/decorators.py +404 -78
- metaflow/exception.py +8 -2
- metaflow/extension_support/__init__.py +527 -376
- metaflow/extension_support/_empty_file.py +2 -2
- metaflow/extension_support/plugins.py +49 -31
- metaflow/flowspec.py +482 -33
- metaflow/graph.py +210 -42
- metaflow/includefile.py +84 -40
- metaflow/lint.py +141 -22
- metaflow/meta_files.py +13 -0
- metaflow/{metadata → metadata_provider}/heartbeat.py +24 -8
- metaflow/{metadata → metadata_provider}/metadata.py +86 -1
- metaflow/metaflow_config.py +175 -28
- metaflow/metaflow_config_funcs.py +51 -3
- metaflow/metaflow_current.py +4 -10
- metaflow/metaflow_environment.py +139 -53
- metaflow/metaflow_git.py +115 -0
- metaflow/metaflow_profile.py +18 -0
- metaflow/metaflow_version.py +150 -66
- metaflow/mflog/__init__.py +4 -3
- metaflow/mflog/save_logs.py +2 -2
- metaflow/multicore_utils.py +31 -14
- metaflow/package/__init__.py +673 -0
- metaflow/packaging_sys/__init__.py +880 -0
- metaflow/packaging_sys/backend.py +128 -0
- metaflow/packaging_sys/distribution_support.py +153 -0
- metaflow/packaging_sys/tar_backend.py +99 -0
- metaflow/packaging_sys/utils.py +54 -0
- metaflow/packaging_sys/v1.py +527 -0
- metaflow/parameters.py +149 -28
- metaflow/plugins/__init__.py +74 -5
- metaflow/plugins/airflow/airflow.py +40 -25
- metaflow/plugins/airflow/airflow_cli.py +22 -5
- metaflow/plugins/airflow/airflow_decorator.py +1 -1
- metaflow/plugins/airflow/airflow_utils.py +5 -3
- metaflow/plugins/airflow/sensors/base_sensor.py +4 -4
- metaflow/plugins/airflow/sensors/external_task_sensor.py +2 -2
- metaflow/plugins/airflow/sensors/s3_sensor.py +2 -2
- metaflow/plugins/argo/argo_client.py +78 -33
- metaflow/plugins/argo/argo_events.py +6 -6
- metaflow/plugins/argo/argo_workflows.py +2410 -527
- metaflow/plugins/argo/argo_workflows_cli.py +571 -121
- metaflow/plugins/argo/argo_workflows_decorator.py +43 -12
- metaflow/plugins/argo/argo_workflows_deployer.py +106 -0
- metaflow/plugins/argo/argo_workflows_deployer_objects.py +453 -0
- metaflow/plugins/argo/capture_error.py +73 -0
- metaflow/plugins/argo/conditional_input_paths.py +35 -0
- metaflow/plugins/argo/exit_hooks.py +209 -0
- metaflow/plugins/argo/jobset_input_paths.py +15 -0
- metaflow/plugins/argo/param_val.py +19 -0
- metaflow/plugins/aws/aws_client.py +10 -3
- metaflow/plugins/aws/aws_utils.py +55 -2
- metaflow/plugins/aws/batch/batch.py +72 -5
- metaflow/plugins/aws/batch/batch_cli.py +33 -10
- metaflow/plugins/aws/batch/batch_client.py +4 -3
- metaflow/plugins/aws/batch/batch_decorator.py +102 -35
- metaflow/plugins/aws/secrets_manager/aws_secrets_manager_secrets_provider.py +13 -10
- metaflow/plugins/aws/step_functions/dynamo_db_client.py +0 -3
- metaflow/plugins/aws/step_functions/production_token.py +1 -1
- metaflow/plugins/aws/step_functions/step_functions.py +65 -8
- metaflow/plugins/aws/step_functions/step_functions_cli.py +101 -7
- metaflow/plugins/aws/step_functions/step_functions_decorator.py +1 -2
- metaflow/plugins/aws/step_functions/step_functions_deployer.py +97 -0
- metaflow/plugins/aws/step_functions/step_functions_deployer_objects.py +264 -0
- metaflow/plugins/azure/azure_exceptions.py +1 -1
- metaflow/plugins/azure/azure_secret_manager_secrets_provider.py +240 -0
- metaflow/plugins/azure/azure_tail.py +1 -1
- metaflow/plugins/azure/includefile_support.py +2 -0
- metaflow/plugins/cards/card_cli.py +66 -30
- metaflow/plugins/cards/card_creator.py +25 -1
- metaflow/plugins/cards/card_datastore.py +21 -49
- metaflow/plugins/cards/card_decorator.py +132 -8
- metaflow/plugins/cards/card_modules/basic.py +112 -17
- metaflow/plugins/cards/card_modules/bundle.css +1 -1
- metaflow/plugins/cards/card_modules/card.py +16 -1
- metaflow/plugins/cards/card_modules/chevron/renderer.py +1 -1
- metaflow/plugins/cards/card_modules/components.py +665 -28
- metaflow/plugins/cards/card_modules/convert_to_native_type.py +36 -7
- metaflow/plugins/cards/card_modules/json_viewer.py +232 -0
- metaflow/plugins/cards/card_modules/main.css +1 -0
- metaflow/plugins/cards/card_modules/main.js +68 -49
- metaflow/plugins/cards/card_modules/renderer_tools.py +1 -0
- metaflow/plugins/cards/card_modules/test_cards.py +26 -12
- metaflow/plugins/cards/card_server.py +39 -14
- metaflow/plugins/cards/component_serializer.py +2 -9
- metaflow/plugins/cards/metadata.py +22 -0
- metaflow/plugins/catch_decorator.py +9 -0
- metaflow/plugins/datastores/azure_storage.py +10 -1
- metaflow/plugins/datastores/gs_storage.py +6 -2
- metaflow/plugins/datastores/local_storage.py +12 -6
- metaflow/plugins/datastores/spin_storage.py +12 -0
- metaflow/plugins/datatools/local.py +2 -0
- metaflow/plugins/datatools/s3/s3.py +126 -75
- metaflow/plugins/datatools/s3/s3op.py +254 -121
- metaflow/plugins/env_escape/__init__.py +3 -3
- metaflow/plugins/env_escape/client_modules.py +102 -72
- metaflow/plugins/env_escape/server.py +7 -0
- metaflow/plugins/env_escape/stub.py +24 -5
- metaflow/plugins/events_decorator.py +343 -185
- metaflow/plugins/exit_hook/__init__.py +0 -0
- metaflow/plugins/exit_hook/exit_hook_decorator.py +46 -0
- metaflow/plugins/exit_hook/exit_hook_script.py +52 -0
- metaflow/plugins/gcp/__init__.py +1 -1
- metaflow/plugins/gcp/gcp_secret_manager_secrets_provider.py +11 -6
- metaflow/plugins/gcp/gs_tail.py +10 -6
- metaflow/plugins/gcp/includefile_support.py +3 -0
- metaflow/plugins/kubernetes/kube_utils.py +108 -0
- metaflow/plugins/kubernetes/kubernetes.py +411 -130
- metaflow/plugins/kubernetes/kubernetes_cli.py +168 -36
- metaflow/plugins/kubernetes/kubernetes_client.py +104 -2
- metaflow/plugins/kubernetes/kubernetes_decorator.py +246 -88
- metaflow/plugins/kubernetes/kubernetes_job.py +253 -581
- metaflow/plugins/kubernetes/kubernetes_jobsets.py +1071 -0
- metaflow/plugins/kubernetes/spot_metadata_cli.py +69 -0
- metaflow/plugins/kubernetes/spot_monitor_sidecar.py +109 -0
- metaflow/plugins/logs_cli.py +359 -0
- metaflow/plugins/{metadata → metadata_providers}/local.py +144 -84
- metaflow/plugins/{metadata → metadata_providers}/service.py +103 -26
- metaflow/plugins/metadata_providers/spin.py +16 -0
- metaflow/plugins/package_cli.py +36 -24
- metaflow/plugins/parallel_decorator.py +128 -11
- metaflow/plugins/parsers.py +16 -0
- metaflow/plugins/project_decorator.py +51 -5
- metaflow/plugins/pypi/bootstrap.py +357 -105
- metaflow/plugins/pypi/conda_decorator.py +82 -81
- metaflow/plugins/pypi/conda_environment.py +187 -52
- metaflow/plugins/pypi/micromamba.py +157 -47
- metaflow/plugins/pypi/parsers.py +268 -0
- metaflow/plugins/pypi/pip.py +88 -13
- metaflow/plugins/pypi/pypi_decorator.py +37 -1
- metaflow/plugins/pypi/utils.py +48 -2
- metaflow/plugins/resources_decorator.py +2 -2
- metaflow/plugins/secrets/__init__.py +3 -0
- metaflow/plugins/secrets/secrets_decorator.py +26 -181
- metaflow/plugins/secrets/secrets_func.py +49 -0
- metaflow/plugins/secrets/secrets_spec.py +101 -0
- metaflow/plugins/secrets/utils.py +74 -0
- metaflow/plugins/tag_cli.py +4 -7
- metaflow/plugins/test_unbounded_foreach_decorator.py +41 -6
- metaflow/plugins/timeout_decorator.py +3 -3
- metaflow/plugins/uv/__init__.py +0 -0
- metaflow/plugins/uv/bootstrap.py +128 -0
- metaflow/plugins/uv/uv_environment.py +72 -0
- metaflow/procpoll.py +1 -1
- metaflow/pylint_wrapper.py +5 -1
- metaflow/runner/__init__.py +0 -0
- metaflow/runner/click_api.py +717 -0
- metaflow/runner/deployer.py +470 -0
- metaflow/runner/deployer_impl.py +201 -0
- metaflow/runner/metaflow_runner.py +714 -0
- metaflow/runner/nbdeploy.py +132 -0
- metaflow/runner/nbrun.py +225 -0
- metaflow/runner/subprocess_manager.py +650 -0
- metaflow/runner/utils.py +335 -0
- metaflow/runtime.py +1078 -260
- metaflow/sidecar/sidecar_worker.py +1 -1
- metaflow/system/__init__.py +5 -0
- metaflow/system/system_logger.py +85 -0
- metaflow/system/system_monitor.py +108 -0
- metaflow/system/system_utils.py +19 -0
- metaflow/task.py +521 -225
- metaflow/tracing/__init__.py +7 -7
- metaflow/tracing/span_exporter.py +31 -38
- metaflow/tracing/tracing_modules.py +38 -43
- metaflow/tuple_util.py +27 -0
- metaflow/user_configs/__init__.py +0 -0
- metaflow/user_configs/config_options.py +563 -0
- metaflow/user_configs/config_parameters.py +598 -0
- metaflow/user_decorators/__init__.py +0 -0
- metaflow/user_decorators/common.py +144 -0
- metaflow/user_decorators/mutable_flow.py +512 -0
- metaflow/user_decorators/mutable_step.py +424 -0
- metaflow/user_decorators/user_flow_decorator.py +264 -0
- metaflow/user_decorators/user_step_decorator.py +749 -0
- metaflow/util.py +243 -27
- metaflow/vendor.py +23 -7
- metaflow/version.py +1 -1
- ob_metaflow-2.19.7.1rc0.data/data/share/metaflow/devtools/Makefile +355 -0
- ob_metaflow-2.19.7.1rc0.data/data/share/metaflow/devtools/Tiltfile +726 -0
- ob_metaflow-2.19.7.1rc0.data/data/share/metaflow/devtools/pick_services.sh +105 -0
- ob_metaflow-2.19.7.1rc0.dist-info/METADATA +87 -0
- ob_metaflow-2.19.7.1rc0.dist-info/RECORD +445 -0
- {ob_metaflow-2.11.13.1.dist-info → ob_metaflow-2.19.7.1rc0.dist-info}/WHEEL +1 -1
- {ob_metaflow-2.11.13.1.dist-info → ob_metaflow-2.19.7.1rc0.dist-info}/entry_points.txt +1 -0
- metaflow/_vendor/v3_5/__init__.py +0 -1
- metaflow/_vendor/v3_5/importlib_metadata/__init__.py +0 -644
- metaflow/_vendor/v3_5/importlib_metadata/_compat.py +0 -152
- metaflow/package.py +0 -188
- ob_metaflow-2.11.13.1.dist-info/METADATA +0 -85
- ob_metaflow-2.11.13.1.dist-info/RECORD +0 -308
- /metaflow/_vendor/{v3_5/zipp.py → zipp.py} +0 -0
- /metaflow/{metadata → metadata_provider}/__init__.py +0 -0
- /metaflow/{metadata → metadata_provider}/util.py +0 -0
- /metaflow/plugins/{metadata → metadata_providers}/__init__.py +0 -0
- {ob_metaflow-2.11.13.1.dist-info → ob_metaflow-2.19.7.1rc0.dist-info/licenses}/LICENSE +0 -0
- {ob_metaflow-2.11.13.1.dist-info → ob_metaflow-2.19.7.1rc0.dist-info}/top_level.txt +0 -0
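For reference, the same comparison can be regenerated locally from the two wheels. The sketch below is illustrative only: the wheel file names and the chosen member path are assumptions based on the header and file list above, and both wheels are assumed to have been downloaded already (for example with `pip download`).

# Illustrative sketch: diff one module between the two wheels listed above.
# Assumes both wheel files are present in the working directory; the exact
# file names are inferred from the page header and may differ.
import difflib
import zipfile

OLD_WHEEL = "ob_metaflow-2.11.13.1-py2.py3-none-any.whl"
NEW_WHEEL = "ob_metaflow-2.19.7.1rc0-py2.py3-none-any.whl"
MEMBER = "metaflow/plugins/kubernetes/kubernetes_job.py"


def read_member(wheel_path, member):
    # A wheel is a zip archive, so its members can be read directly.
    with zipfile.ZipFile(wheel_path) as wheel:
        text = wheel.read(member).decode("utf-8", errors="replace")
    return text.splitlines(keepends=True)


diff = difflib.unified_diff(
    read_member(OLD_WHEEL, MEMBER),
    read_member(NEW_WHEEL, MEMBER),
    fromfile=f"{OLD_WHEEL}/{MEMBER}",
    tofile=f"{NEW_WHEEL}/{MEMBER}",
)
print("".join(diff))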
metaflow/plugins/kubernetes/kubernetes_job.py

@@ -2,18 +2,24 @@ import json
 import math
 import random
 import time
-import os
-import socket
-import copy

 from metaflow.exception import MetaflowException
 from metaflow.metaflow_config import KUBERNETES_SECRETS
+from metaflow.tracing import inject_tracing_vars
+from metaflow.metaflow_config_funcs import init_config

 CLIENT_REFRESH_INTERVAL_SECONDS = 300

+from .kube_utils import qos_requests_and_limits
+from .kubernetes_jobsets import (
+    KubernetesJobSet,
+)  # We need this import for Kubernetes Client.
+
+
 class KubernetesJobException(MetaflowException):
     headline = "Kubernetes job error"

+
 # Implements truncated exponential backoff from
 # https://cloud.google.com/storage/docs/retry-strategy#exponential-backoff
 def k8s_retry(deadline_seconds=60, max_backoff=32):
@@ -56,19 +62,7 @@ class KubernetesJob(object):
         self._client = client
         self._kwargs = kwargs

-    def create(self):
-        # A discerning eye would notice and question the choice of using the
-        # V1Job construct over the V1Pod construct given that we don't rely much
-        # on any of the V1Job semantics. The major reasons at the moment are -
-        #     1. It makes the Kubernetes UIs (Octant, Lens) a bit easier on
-        #        the eyes, although even that can be questioned.
-        #     2. AWS Step Functions, at the moment (Apr' 22) only supports
-        #        executing Jobs and not Pods as part of it's publicly declared
-        #        API. When we ship the AWS Step Functions integration with EKS,
-        #        it will hopefully lessen our workload.
-        #
-        # Note: This implementation ensures that there is only one unique Pod
-        # (unique UID) per Metaflow task attempt.
+    def create_job_spec(self):
         client = self._client.get()

         # tmpfs variables
@@ -80,529 +74,139 @@ class KubernetesJob(object):
             if self._kwargs["shared_memory"]
             else None
         )
-
-
-
-
-
-        subdomain = jobset_name
-        master_port = int(self._kwargs['port']) if self._kwargs['port'] else None
-        shared_memory = int(self._kwargs['shared_memory']) if self._kwargs['shared_memory'] else None
-
-        passwordless_ssh = self._kwargs["attrs"]["requires_passwordless_ssh"]
-        if passwordless_ssh:
-            passwordless_ssh_service_name = subdomain
-            passwordless_ssh_service_selector = {
-                "passwordless-ssh-jobset": "true"
-            }
-        else:
-            passwordless_ssh_service_name = None
-            passwordless_ssh_service_selector = {}
-
-        fqdn_suffix = "%s.svc.cluster.local" % self._kwargs["namespace"]
-        jobset_main_addr = "%s-%s-%s-%s.%s.%s" % (
-            jobset_name,
-            main_job_name,
-            main_job_index,
-            main_pod_index,
-            subdomain,
-            fqdn_suffix,
+        qos_requests, qos_limits = qos_requests_and_limits(
+            self._kwargs["qos"],
+            self._kwargs["cpu"],
+            self._kwargs["memory"],
+            self._kwargs["disk"],
         )
-
-
-
-
-
-
-        # TODO (Eddie): Remove this and suggest to user.
-
-            import subprocess
-            import tempfile
-            import shutil
-            import os
-
-            with open(os.devnull, "wb") as devnull:
-                cwd = os.getcwd()
-                tmp_dir = tempfile.mkdtemp()
-                os.chdir(tmp_dir)
-                subprocess.check_call(
-                    ["git", "clone", repo_url], stdout=devnull, stderr=subprocess.STDOUT
-                )
-                tmp_python_sdk_path = os.path.join(tmp_dir, python_sdk_path)
-                os.chdir(tmp_python_sdk_path)
-                subprocess.check_call(
-                    ["pip", "install", "."], stdout=devnull, stderr=subprocess.STDOUT
-                )
-                os.chdir(cwd)
-                shutil.rmtree(tmp_dir)
-
-        def _get_passwordless_ssh_service():
-
-            return client.V1Service(
-                api_version="v1",
-                kind="Service",
-                metadata=client.V1ObjectMeta(
-                    name=passwordless_ssh_service_name,
-                    namespace=self._kwargs["namespace"]
-                ),
-                spec=client.V1ServiceSpec(
-                    cluster_ip="None",
-                    internal_traffic_policy="Cluster",
-                    ip_families=["IPv4"],
-                    ip_family_policy="SingleStack",
-                    selector=passwordless_ssh_service_selector,
-                    session_affinity="None",
-                    type="ClusterIP",
-                    ports=[
-                        client.V1ServicePort(
-                            name="control",
-                            port=22,
-                            protocol="TCP",
-                            target_port=22
-                        )
-                    ]
+        initial_configs = init_config()
+        for entry in ["OBP_PERIMETER", "OBP_INTEGRATIONS_URL"]:
+            if entry not in initial_configs:
+                raise KubernetesJobException(
+                    f"{entry} was not found in metaflow config. Please make sure to run `outerbounds configure <...>` command which can be found on the Ourebounds UI or reach out to your Outerbounds support team."
                 )
-            )

-
-
-
-
-
-
-
-
-
-
-
-
-
-                ttl_seconds_after_finished=7
-                * 60
-                * 60
-                * 24,
-                template=client.V1PodTemplateSpec(
-                    metadata=client.V1ObjectMeta(
-                        annotations=self._kwargs.get("annotations", {}),
-                        labels={
-                            **self._kwargs.get("labels", {}),
-                            **passwordless_ssh_service_selector,  # TODO: necessary?
-                            # TODO: cluster-name, app.kubernetes.io/name necessary?
-                        },
-                        namespace=self._kwargs["namespace"],
-                    ),
-                    spec=client.V1PodSpec(
-                        active_deadline_seconds=self._kwargs[
-                            "timeout_in_seconds"
-                        ],
-                        containers=[
-                            client.V1Container(
-                                command=command,
-                                ports=[client.V1ContainerPort(container_port=master_port)] if master_port and job_name=="control" else [],
-                                env=[
-                                    client.V1EnvVar(name=k, value=str(v))
-                                    for k, v in self._kwargs.get(
-                                        "environment_variables", {}
-                                    ).items()
-                                ]
-                                + [
-                                    client.V1EnvVar(
-                                        name=k,
-                                        value_from=client.V1EnvVarSource(
-                                            field_ref=client.V1ObjectFieldSelector(
-                                                field_path=str(v)
-                                            )
-                                        ),
-                                    )
-                                    for k, v in {
-                                        "METAFLOW_KUBERNETES_POD_NAMESPACE": "metadata.namespace",
-                                        "METAFLOW_KUBERNETES_POD_NAME": "metadata.name",
-                                        "METAFLOW_KUBERNETES_POD_ID": "metadata.uid",
-                                        "METAFLOW_KUBERNETES_SERVICE_ACCOUNT_NAME": "spec.serviceAccountName",
-                                        "METAFLOW_KUBERNETES_NODE_IP": "status.hostIP",
-                                    }.items()
-                                ]
-                                # Mimicking the AWS Batch Multinode env vars.
-                                + [
-                                    client.V1EnvVar(
-                                        name="MASTER_ADDR",
-                                        value=jobset_main_addr,
-                                    ),
-                                    client.V1EnvVar(
-                                        name="MASTER_PORT",
-                                        value=str(master_port),
-                                    ),
-                                    client.V1EnvVar(
-                                        name="RANK",
-                                        value_from=client.V1EnvVarSource(
-                                            field_ref=client.V1ObjectFieldSelector(
-                                                field_path="metadata.annotations['batch.kubernetes.io/job-completion-index']"
-                                            )
-                                        ),
-                                    ),
-                                    client.V1EnvVar(
-                                        name="WORLD_SIZE",
-                                        value=str(self._kwargs["num_parallel"]),
-                                    ),
-                                    client.V1EnvVar(
-                                        name="PYTHONUNBUFFERED",
-                                        value="0",
-                                    ),
-                                ],
-                                env_from=[
-                                    client.V1EnvFromSource(
-                                        secret_ref=client.V1SecretEnvSource(
-                                            name=str(k),
-                                            # optional=True
-                                        )
-                                    )
-                                    for k in list(
-                                        self._kwargs.get("secrets", [])
-                                    )
-                                    + KUBERNETES_SECRETS.split(",")
-                                    if k
-                                ],
-                                image=self._kwargs["image"],
-                                image_pull_policy=self._kwargs[
-                                    "image_pull_policy"
-                                ],
-                                name=self._kwargs["step_name"].replace(
-                                    "_", "-"
-                                ),
-                                resources=client.V1ResourceRequirements(
-                                    requests={
-                                        "cpu": str(self._kwargs["cpu"]),
-                                        "memory": "%sM"
-                                        % str(self._kwargs["memory"]),
-                                        "ephemeral-storage": "%sM"
-                                        % str(self._kwargs["disk"]),
-                                    },
-                                    limits={
-                                        "%s.com/gpu".lower()
-                                        % self._kwargs["gpu_vendor"]: str(
-                                            self._kwargs["gpu"]
-                                        )
-                                        for k in [0]
-                                        # Don't set GPU limits if gpu isn't specified.
-                                        if self._kwargs["gpu"] is not None
-                                    },
-                                ),
-                                volume_mounts=(
-                                    [
-                                        client.V1VolumeMount(
-                                            mount_path=self._kwargs.get(
-                                                "tmpfs_path"
-                                            ),
-                                            name="tmpfs-ephemeral-volume",
-                                        )
-                                    ]
-                                    if tmpfs_enabled
-                                    else []
-                                )
-                                + (
-                                    [
-                                        client.V1VolumeMount(
-                                            mount_path="/dev/shm",
-                                            name="dhsm"
-                                        )
-                                    ]
-                                    if shared_memory else []
-                                )
-                                + (
-                                    [
-                                        client.V1VolumeMount(
-                                            mount_path=path, name=claim
-                                        )
-                                        for claim, path in self._kwargs[
-                                            "persistent_volume_claims"
-                                        ].items()
-                                    ]
-                                    if self._kwargs["persistent_volume_claims"]
-                                    is not None
-                                    else []
-                                ),
-                            )
-                        ],
-                        node_selector=self._kwargs.get("node_selector"),
-                        restart_policy="Never",
-
-                        set_hostname_as_fqdn=True,  # configure pod hostname as pod's FQDN
-                        share_process_namespace=False,  # default
-                        subdomain=subdomain,  # FQDN = <hostname>.<subdomain>.<pod namespace>.svc.<cluster domain>
-
-                        service_account_name=self._kwargs["service_account"],
-                        termination_grace_period_seconds=0,
-                        tolerations=[
-                            client.V1Toleration(**toleration)
-                            for toleration in self._kwargs.get("tolerations")
-                            or []
-                        ],
-                        volumes=(
-                            [
-                                client.V1Volume(
-                                    name="tmpfs-ephemeral-volume",
-                                    empty_dir=client.V1EmptyDirVolumeSource(
-                                        medium="Memory",
-                                        size_limit="{}Mi".format(tmpfs_size),
-                                    ),
-                                )
-                            ]
-                            if tmpfs_enabled
-                            else []
-                        )
-                        + (
-                            [
-                                client.V1Volume(
-                                    name="dhsm",
-                                    empty_dir=client.V1EmptyDirVolumeSource(
-                                        medium="Memory",
-                                        size_limit="{}Mi".format(shared_memory),
-                                    )
-                                )
-                            ]
-                            if shared_memory else []
-                        )
-                        + (
-                            [
-                                client.V1Volume(
-                                    name=claim,
-                                    persistent_volume_claim=client.V1PersistentVolumeClaimVolumeSource(
-                                        claim_name=claim
-                                    ),
-                                )
-                                for claim in self._kwargs[
-                                    "persistent_volume_claims"
-                                ].keys()
-                            ]
-                            if self._kwargs["persistent_volume_claims"]
-                            is not None
-                            else []
-                        ),
-                    ),
-                ),
-            ),
-        ),
-    )
-
-        if "num_parallel" in self._kwargs and self._kwargs["num_parallel"] >= 1:
-
-            try:
-                import jobset
-            except ImportError:
-                _install_jobset()
-                import jobset
-
-            main_commands = copy.copy(self._kwargs["command"])
-            main_commands[-1] = main_commands[-1].replace(
-                "[multinode-args]", "--split-index 0"
-            )
-
-            task_id = self._kwargs["attrs"]["metaflow.task_id"]
-            secondary_commands = copy.copy(self._kwargs["command"])
-            # RANK needs +1 because control node is not in the worker index group, yet we want global nodes.
-            # Technically, control and worker could be same replicated job type, but cleaner to separate for future use cases.
-            secondary_commands[-1] = secondary_commands[-1].replace(
-                "[multinode-args]", "--split-index `expr $RANK + 1`"
-            )
-            secondary_commands[-1] = secondary_commands[-1].replace(
-                "ubf_control", "ubf_task"
-            )
-            secondary_commands[-1] = secondary_commands[-1].replace(
-                task_id,
-                task_id.replace("control-", "") + "-node-`expr $RANK + 1`",
-            )
-
-            if passwordless_ssh:
-                if not os.path.exists("/usr/sbin/sshd"):
-                    raise KubernetesJobException(
-                        "This @parallel decorator requires sshd to be installed in the container image."
-                        "Please install OpenSSH."
-                    )
-
-                # run sshd in background
-                main_commands[-1] = "/usr/sbin/sshd -D & %s" % main_commands[-1]
-                secondary_commands[-1] = "/usr/sbin/sshd -D & %s" % secondary_commands[-1]
-
-            replicated_jobs = [_get_replicated_job("control", 1, main_commands)]
-            if self._kwargs["num_parallel"] > 1:
-                replicated_jobs.append(
-                    _get_replicated_job("worker", self._kwargs["num_parallel"] - 1, secondary_commands)
-                )
+        additional_obp_configs = {
+            "OBP_PERIMETER": initial_configs["OBP_PERIMETER"],
+            "OBP_INTEGRATIONS_URL": initial_configs[
+                "OBP_INTEGRATIONS_URL"
+            ],
+        }
+
+        security_context = self._kwargs.get("security_context", {})
+        _security_context = {}
+        if security_context is not None and len(security_context) > 0:
+            _security_context = {
+                "security_context": client.V1SecurityContext(**security_context)
+            }

-
-
-
+        return client.V1JobSpec(
+            # Retries are handled by Metaflow when it is responsible for
+            # executing the flow. The responsibility is moved to Kubernetes
+            # when Argo Workflows is responsible for the execution.
+            backoff_limit=self._kwargs.get("retries", 0),
+            completions=self._kwargs.get("completions", 1),
+            ttl_seconds_after_finished=7
+            * 60
+            * 60  # Remove job after a week. TODO: Make this configurable
+            * 24,
+            template=client.V1PodTemplateSpec(
                 metadata=client.V1ObjectMeta(
                     annotations=self._kwargs.get("annotations", {}),
                     labels=self._kwargs.get("labels", {}),
-                    name=jobset_name,
                     namespace=self._kwargs["namespace"],
                 ),
-                spec=
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-                annotations=self._kwargs.get("annotations", {}),
-                # While labels are for Kubernetes
-                labels=self._kwargs.get("labels", {}),
-                generate_name=self._kwargs["generate_name"],
-                namespace=self._kwargs["namespace"],  # Defaults to `default`
-            ),
-            spec=client.V1JobSpec(
-                # Retries are handled by Metaflow when it is responsible for
-                # executing the flow. The responsibility is moved to Kubernetes
-                # when Argo Workflows is responsible for the execution.
-                backoff_limit=self._kwargs.get("retries", 0),
-                completions=1,  # A single non-indexed pod job
-                ttl_seconds_after_finished=7
-                * 60
-                * 60  # Remove job after a week. TODO: Make this configurable
-                * 24,
-                template=client.V1PodTemplateSpec(
-                    metadata=client.V1ObjectMeta(
-                        annotations=self._kwargs.get("annotations", {}),
-                        labels=self._kwargs.get("labels", {}),
-                        namespace=self._kwargs["namespace"],
-                    ),
-                    spec=client.V1PodSpec(
-                        # Timeout is set on the pod and not the job (important!)
-                        active_deadline_seconds=self._kwargs["timeout_in_seconds"],
-                        # TODO (savin): Enable affinities for GPU scheduling.
-                        # affinity=?,
-                        containers=[
-                            client.V1Container(
-                                command=self._kwargs["command"],
-                                ports=[
-                                    client.V1ContainerPort(
-                                        container_port=int(self._kwargs["port"])
-                                    )
-                                ]
-                                if "port" in self._kwargs and self._kwargs["port"]
-                                else None,
-                                env=[
-                                    client.V1EnvVar(name=k, value=str(v))
-                                    for k, v in self._kwargs.get(
-                                        "environment_variables", {}
-                                    ).items()
-                                ]
-                                # And some downward API magic. Add (key, value)
-                                # pairs below to make pod metadata available
-                                # within Kubernetes container.
-                                + [
-                                    client.V1EnvVar(
-                                        name=k,
-                                        value_from=client.V1EnvVarSource(
-                                            field_ref=client.V1ObjectFieldSelector(
-                                                field_path=str(v)
-                                            )
-                                        ),
-                                    )
-                                    for k, v in {
-                                        "METAFLOW_KUBERNETES_POD_NAMESPACE": "metadata.namespace",
-                                        "METAFLOW_KUBERNETES_POD_NAME": "metadata.name",
-                                        "METAFLOW_KUBERNETES_POD_ID": "metadata.uid",
-                                        "METAFLOW_KUBERNETES_SERVICE_ACCOUNT_NAME": "spec.serviceAccountName",
-                                        "METAFLOW_KUBERNETES_NODE_IP": "status.hostIP",
-                                    }.items()
-                                ],
-                                env_from=[
-                                    client.V1EnvFromSource(
-                                        secret_ref=client.V1SecretEnvSource(
-                                            name=str(k),
-                                            # optional=True
-                                        )
-                                    )
-                                    for k in list(self._kwargs.get("secrets", []))
-                                    + KUBERNETES_SECRETS.split(",")
-                                    if k
-                                ],
-                                image=self._kwargs["image"],
-                                image_pull_policy=self._kwargs["image_pull_policy"],
-                                name=self._kwargs["step_name"].replace("_", "-"),
-                                resources=client.V1ResourceRequirements(
-                                    requests={
-                                        "cpu": str(self._kwargs["cpu"]),
-                                        "memory": "%sM"
-                                        % str(self._kwargs["memory"]),
-                                        "ephemeral-storage": "%sM"
-                                        % str(self._kwargs["disk"]),
-                                    },
-                                    limits={
-                                        "%s.com/gpu".lower()
-                                        % self._kwargs["gpu_vendor"]: str(
-                                            self._kwargs["gpu"]
-                                        )
-                                        for k in [0]
-                                        # Don't set GPU limits if gpu isn't specified.
-                                        if self._kwargs["gpu"] is not None
-                                    },
-                                ),
-                                volume_mounts=(
-                                    [
-                                        client.V1VolumeMount(
-                                            mount_path=self._kwargs.get(
-                                                "tmpfs_path"
-                                            ),
-                                            name="tmpfs-ephemeral-volume",
-                                        )
-                                    ]
-                                    if tmpfs_enabled
-                                    else []
+                spec=client.V1PodSpec(
+                    # Timeout is set on the pod and not the job (important!)
+                    active_deadline_seconds=self._kwargs["timeout_in_seconds"],
+                    # TODO (savin): Enable affinities for GPU scheduling.
+                    # affinity=?,
+                    containers=[
+                        client.V1Container(
+                            command=self._kwargs["command"],
+                            termination_message_policy="FallbackToLogsOnError",
+                            ports=(
+                                []
+                                if self._kwargs["port"] is None
+                                else [
+                                    client.V1ContainerPort(
+                                        container_port=int(self._kwargs["port"])
                                     )
-
-
-
-
-
-
-
-
-
-
-
-
+                                ]
+                            ),
+                            env=[
+                                client.V1EnvVar(name=k, value=str(v))
+                                for k, v in self._kwargs.get(
+                                    "environment_variables", {}
+                                ).items()
+                            ]
+                            # And some downward API magic. Add (key, value)
+                            # pairs below to make pod metadata available
+                            # within Kubernetes container.
+                            + [
+                                client.V1EnvVar(
+                                    name=k,
+                                    value_from=client.V1EnvVarSource(
+                                        field_ref=client.V1ObjectFieldSelector(
+                                            field_path=str(v)
+                                        )
                                     ),
                                 )
+                                for k, v in {
+                                    "METAFLOW_KUBERNETES_NAMESPACE": "metadata.namespace",
+                                    "METAFLOW_KUBERNETES_POD_NAMESPACE": "metadata.namespace",
+                                    "METAFLOW_KUBERNETES_POD_NAME": "metadata.name",
+                                    "METAFLOW_KUBERNETES_POD_ID": "metadata.uid",
+                                    "METAFLOW_KUBERNETES_SERVICE_ACCOUNT_NAME": "spec.serviceAccountName",
+                                    "METAFLOW_KUBERNETES_NODE_IP": "status.hostIP",
+                                }.items()
+                            ]
+                            + [
+                                client.V1EnvVar(
+                                    name=k,
+                                    value=v,
+                                )
+                                for k, v in additional_obp_configs.items()
+                            ]
+                            + [
+                                client.V1EnvVar(name=k, value=str(v))
+                                for k, v in inject_tracing_vars({}).items()
                             ],
-
-
-
-
-
-
-
-
-
-
-                        # and let Metaflow handle the retries.
-                        restart_policy="Never",
-                        service_account_name=self._kwargs["service_account"],
-                        # Terminate the container immediately on SIGTERM
-                        termination_grace_period_seconds=0,
-                        tolerations=[
-                            client.V1Toleration(**toleration)
-                            for toleration in self._kwargs.get("tolerations") or []
+                            env_from=[
+                                client.V1EnvFromSource(
+                                    secret_ref=client.V1SecretEnvSource(
+                                        name=str(k),
+                                        # optional=True
+                                    )
+                                )
+                                for k in list(self._kwargs.get("secrets", []))
+                                + KUBERNETES_SECRETS.split(",")
+                                if k
                             ],
-
+                            image=self._kwargs["image"],
+                            image_pull_policy=self._kwargs["image_pull_policy"],
+                            name=self._kwargs["step_name"].replace("_", "-"),
+                            resources=client.V1ResourceRequirements(
+                                requests=qos_requests,
+                                limits={
+                                    **qos_limits,
+                                    **{
+                                        "%s.com/gpu".lower()
+                                        % self._kwargs["gpu_vendor"]: str(
+                                            self._kwargs["gpu"]
+                                        )
+                                        for k in [0]
+                                        # Don't set GPU limits if gpu isn't specified.
+                                        if self._kwargs["gpu"] is not None
+                                    },
+                                },
+                            ),
+                            volume_mounts=(
                                 [
-                                    client.
+                                    client.V1VolumeMount(
+                                        mount_path=self._kwargs.get("tmpfs_path"),
                                         name="tmpfs-ephemeral-volume",
-                                        empty_dir=client.V1EmptyDirVolumeSource(
-                                            medium="Memory",
-                                            # Add default unit as ours differs from Kubernetes default.
-                                            size_limit="{}Mi".format(tmpfs_size),
-                                        ),
                                     )
                                 ]
                                 if tmpfs_enabled
@@ -610,24 +214,119 @@ class KubernetesJob(object):
                                 )
                                 + (
                                     [
-                                        client.
-                                            name=
-                                            persistent_volume_claim=client.V1PersistentVolumeClaimVolumeSource(
-                                                claim_name=claim
-                                            ),
+                                    client.V1VolumeMount(
+                                        mount_path="/dev/shm", name="dhsm"
                                     )
-
+                                ]
+                                if shared_memory
+                                else []
+                            )
+                            + (
+                                [
+                                    client.V1VolumeMount(mount_path=path, name=claim)
+                                    for claim, path in self._kwargs[
                                         "persistent_volume_claims"
-                                        ].
+                                    ].items()
                                 ]
                                 if self._kwargs["persistent_volume_claims"] is not None
                                 else []
                             ),
-
-                            )
+                            **_security_context,
+                        )
+                    ],
+                    node_selector=self._kwargs.get("node_selector"),
+                    image_pull_secrets=[
+                        client.V1LocalObjectReference(secret)
+                        for secret in self._kwargs.get("image_pull_secrets") or []
+                    ],
+                    # TODO (savin): Support preemption policies
+                    # preemption_policy=?,
+                    #
+                    # A Container in a Pod may fail for a number of
+                    # reasons, such as because the process in it exited
+                    # with a non-zero exit code, or the Container was
+                    # killed due to OOM etc. If this happens, fail the pod
+                    # and let Metaflow handle the retries.
+                    restart_policy="Never",
+                    service_account_name=self._kwargs["service_account"],
+                    # Terminate the container immediately on SIGTERM
+                    termination_grace_period_seconds=0,
+                    tolerations=[
+                        client.V1Toleration(**toleration)
+                        for toleration in self._kwargs.get("tolerations") or []
+                    ],
+                    volumes=(
+                        [
+                            client.V1Volume(
+                                name="tmpfs-ephemeral-volume",
+                                empty_dir=client.V1EmptyDirVolumeSource(
+                                    medium="Memory",
+                                    # Add default unit as ours differs from Kubernetes default.
+                                    size_limit="{}Mi".format(tmpfs_size),
+                                ),
+                            )
+                        ]
+                        if tmpfs_enabled
+                        else []
+                    )
+                    + (
+                        [
+                            client.V1Volume(
+                                name="dhsm",
+                                empty_dir=client.V1EmptyDirVolumeSource(
+                                    medium="Memory",
+                                    size_limit="{}Mi".format(shared_memory),
+                                ),
+                            )
+                        ]
+                        if shared_memory
+                        else []
+                    )
+                    + (
+                        [
+                            client.V1Volume(
+                                name=claim,
+                                persistent_volume_claim=client.V1PersistentVolumeClaimVolumeSource(
+                                    claim_name=claim
+                                ),
+                            )
+                            for claim in self._kwargs["persistent_volume_claims"].keys()
+                        ]
+                        if self._kwargs["persistent_volume_claims"] is not None
+                        else []
                     ),
                 ),
-        )
+            ),
+        )
+
+    def create(self):
+        # A discerning eye would notice and question the choice of using the
+        # V1Job construct over the V1Pod construct given that we don't rely much
+        # on any of the V1Job semantics. The major reasons at the moment are -
+        #     1. It makes the Kubernetes UIs (Octant, Lens) a bit easier on
+        #        the eyes, although even that can be questioned.
+        #     2. AWS Step Functions, at the moment (Apr' 22) only supports
+        #        executing Jobs and not Pods as part of it's publicly declared
+        #        API. When we ship the AWS Step Functions integration with EKS,
+        #        it will hopefully lessen our workload.
+        #
+        # Note: This implementation ensures that there is only one unique Pod
+        # (unique UID) per Metaflow task attempt.
+        client = self._client.get()
+
+        self._job = client.V1Job(
+            api_version="batch/v1",
+            kind="Job",
+            metadata=client.V1ObjectMeta(
+                # Annotations are for humans
+                annotations=self._kwargs.get("annotations", {}),
+                # While labels are for Kubernetes
+                labels=self._kwargs.get("labels", {}),
+                generate_name=self._kwargs["generate_name"],
+                namespace=self._kwargs["namespace"],  # Defaults to `default`
+            ),
+            spec=self.create_job_spec(),
+        )
         return self

     def execute(self):
@@ -638,53 +337,19 @@ class KubernetesJob(object):
             # achieve the guarantees that we are seeking.
             # https://github.com/kubernetes/enhancements/issues/1040
             # Hopefully, we will be able to get creative with kube-batch
-
-
-
-
-                api_instance = client.CoreV1Api()
-                api_response = api_instance.create_namespaced_service(namespace=self._kwargs['namespace'], body=self._passwordless_ssh_service)
-
-                with client.ApiClient() as api_client:
-                    api_instance = client.CustomObjectsApi(api_client)
-
-                    response = api_instance.create_namespaced_custom_object(
-                        body=self._jobset,
-                        group="jobset.x-k8s.io",
-                        version="v1alpha2",
-                        namespace=self._kwargs["namespace"],
-                        plural="jobsets",
-                    )
-
-                # HACK: Give K8s some time to actually create the job
-                time.sleep(10)
-
-                # TODO (Eddie): Remove hack and make RunningJobSet.
-                # There are many jobs running that should be monitored.
-                job_name = "%s-control-0" % response["metadata"]["name"]
-                fake_id = 123
-                return RunningJob(
-                    client=self._client,
-                    name=job_name,
-                    uid=fake_id,
-                    namespace=response["metadata"]["namespace"],
-                )
-
-            else:
-                response = (
-                    client.BatchV1Api()
-                    .create_namespaced_job(
-                        body=self._job, namespace=self._kwargs["namespace"]
-                    )
-                    .to_dict()
-                )
-                return RunningJob(
-                    client=self._client,
-                    name=response["metadata"]["name"],
-                    uid=response["metadata"]["uid"],
-                    namespace=response["metadata"]["namespace"],
+            response = (
+                client.BatchV1Api()
+                .create_namespaced_job(
+                    body=self._job, namespace=self._kwargs["namespace"]
                 )
-
+                .to_dict()
+            )
+            return RunningJob(
+                client=self._client,
+                name=response["metadata"]["name"],
+                uid=response["metadata"]["uid"],
+                namespace=response["metadata"]["namespace"],
+            )
         except client.rest.ApiException as e:
             raise KubernetesJobException(
                 "Unable to launch Kubernetes job.\n %s"
@@ -793,7 +458,7 @@ class RunningJob(object):
         def best_effort_kill():
             try:
                 self.kill()
-            except:
+            except Exception:
                 pass

         atexit.register(best_effort_kill)
@@ -861,7 +526,6 @@ class RunningJob(object):
         if self.is_running:
             # Case 1.
             from kubernetes.stream import stream
-
             api_instance = client.CoreV1Api
             try:
                 # TODO: stream opens a web-socket connection. It may
@@ -927,6 +591,10 @@ class RunningJob(object):
             return self.id
         return "job %s" % self._name

+    @property
+    def is_unschedulable(self):
+        return self._job["metadata"]["annotations"].get("metaflow/job_status", "") == "Unsatisfiable_Resource_Request"
+
     @property
     def is_done(self):
         # Check if the container is done. As a side effect, also refreshes self._job and
@@ -940,6 +608,7 @@ class RunningJob(object):
                 or bool(self._job["status"].get("failed"))
                 or self._are_pod_containers_done
                 or (self._job["spec"]["parallelism"] == 0)
+                or self.is_unschedulable
             )

         if not done():
@@ -997,6 +666,7 @@ class RunningJob(object):
                 bool(self._job["status"].get("failed"))
                 or self._has_any_container_failed
                 or (self._job["spec"]["parallelism"] == 0)
+                or self.is_unschedulable
             )
             return retval

@@ -1094,6 +764,8 @@ class RunningJob(object):
                 return 0, None
             # Best effort since Pod object can disappear on us at anytime
             else:
+                if self.is_unschedulable:
+                    return 1, self._job["metadata"]["annotations"].get("metaflow/job_status_reason", "")
                 if self._pod.get("status", {}).get("phase") not in (
                     "Succeeded",
                     "Failed",