ob-metaflow 2.12.20.1__py2.py3-none-any.whl → 2.12.23.1__py2.py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- metaflow/__init__.py +11 -21
- metaflow/cli.py +24 -19
- metaflow/client/core.py +2 -2
- metaflow/cmd/develop/stub_generator.py +17 -0
- metaflow/cmd/develop/stubs.py +3 -3
- metaflow/cmd/main_cli.py +3 -2
- metaflow/extension_support/__init__.py +120 -29
- metaflow/flowspec.py +4 -0
- metaflow/info_file.py +25 -0
- metaflow/metaflow_config.py +0 -1
- metaflow/metaflow_environment.py +1 -7
- metaflow/metaflow_version.py +133 -64
- metaflow/package.py +2 -1
- metaflow/plugins/argo/argo_client.py +2 -0
- metaflow/plugins/argo/argo_workflows.py +93 -51
- metaflow/plugins/argo/argo_workflows_cli.py +26 -0
- metaflow/plugins/kubernetes/kubernetes_client.py +7 -1
- metaflow/plugins/kubernetes/kubernetes_decorator.py +5 -1
- metaflow/plugins/kubernetes/kubernetes_jobsets.py +7 -1
- metaflow/plugins/pypi/bootstrap.py +1 -1
- metaflow/plugins/pypi/conda_decorator.py +1 -1
- metaflow/plugins/pypi/micromamba.py +26 -0
- metaflow/runner/deployer.py +4 -49
- metaflow/runner/metaflow_runner.py +22 -25
- metaflow/runner/subprocess_manager.py +33 -17
- metaflow/runner/utils.py +53 -1
- metaflow/version.py +1 -1
- {ob_metaflow-2.12.20.1.dist-info → ob_metaflow-2.12.23.1.dist-info}/METADATA +2 -2
- {ob_metaflow-2.12.20.1.dist-info → ob_metaflow-2.12.23.1.dist-info}/RECORD +33 -33
- metaflow/plugins/argo/daemon.py +0 -59
- {ob_metaflow-2.12.20.1.dist-info → ob_metaflow-2.12.23.1.dist-info}/LICENSE +0 -0
- {ob_metaflow-2.12.20.1.dist-info → ob_metaflow-2.12.23.1.dist-info}/WHEEL +0 -0
- {ob_metaflow-2.12.20.1.dist-info → ob_metaflow-2.12.23.1.dist-info}/entry_points.txt +0 -0
- {ob_metaflow-2.12.20.1.dist-info → ob_metaflow-2.12.23.1.dist-info}/top_level.txt +0 -0
metaflow/metaflow_version.py
CHANGED
@@ -7,11 +7,15 @@ See the documentation of get_version for more information
 
 # This file is adapted from https://github.com/aebrahim/python-git-version
 
-… (1 old line elided)
-from os import path, name, …
-import json
+import subprocess
+from os import path, name, environ, listdir
 
-from metaflow import …
+from metaflow.extension_support import update_package_info
+from metaflow.info_file import CURRENT_DIRECTORY, read_info_file
+
+
+# True/False correspond to the value `public` in get_version
+_version_cache = {True: None, False: None}
 
 __all__ = ("get_version",)
 
@@ -57,87 +61,152 @@ if name == "nt":
 GIT_COMMAND = find_git_on_windows()
 
 
-def call_git_describe(abbrev=7):
+def call_git_describe(file_to_check, abbrev=7):
     """return the string output of git describe"""
     try:
-… (22 lines of the previous implementation elided)
+        wd = path.dirname(file_to_check)
+        filename = path.basename(file_to_check)
+
+        # First check if the file is tracked in the GIT repository we are in
+        # We do this because in some setups and for some bizarre reason, python files
+        # are installed directly into a git repository (I am looking at you brew). We
+        # don't want to consider this a GIT install in that case.
+        args = [GIT_COMMAND, "ls-files", "--error-unmatch", filename]
+        git_return_code = subprocess.run(
+            args,
+            cwd=wd,
+            stderr=subprocess.DEVNULL,
+            stdout=subprocess.DEVNULL,
+            check=False,
+        ).returncode
+
+        if git_return_code != 0:
+            return None
+
+        args = [
+            GIT_COMMAND,
+            "describe",
+            "--tags",
+            "--dirty",
+            "--long",
+            "--abbrev=%d" % abbrev,
+        ]
+        return (
+            subprocess.check_output(args, cwd=wd, stderr=subprocess.DEVNULL)
+            .decode("ascii")
+            .strip()
+        )
+
+    except (OSError, subprocess.CalledProcessError):
         return None
 
 
-def format_git_describe(git_str, …
+def format_git_describe(git_str, public=False):
     """format the result of calling 'git describe' as a python version"""
     if git_str is None:
         return None
-… (2 old lines elided)
+    splits = git_str.split("-")
+    if len(splits) == 4:
+        # Formatted as <tag>-<post>-<hash>-dirty
+        tag, post, h = splits[:3]
+        dirty = "-" + splits[3]
     else:
-… (7 old lines elided)
+        # Formatted as <tag>-<post>-<hash>
+        tag, post, h = splits
+        dirty = ""
+    if post == "0":
+        if public:
+            return tag
+        return tag + dirty
+
+    if public:
+        return "%s.post%s" % (tag, post)
+
+    return "%s.post%s-git%s%s" % (tag, post, h[1:], dirty)
 
 
 def read_info_version():
     """Read version information from INFO file"""
-… (4 old lines elided)
-    return None
+    info_file = read_info_file()
+    if info_file:
+        return info_file.get("metaflow_version")
+    return None
 
 
-def get_version(…
+def get_version(public=False):
     """Tracks the version number.
 
-… (1 old line elided)
-    When True, this function returns a version …
-… (2 old lines elided)
+    public: bool
+        When True, this function returns a *public* version specification which
+        doesn't include any local information (dirtiness or hash). See
+        https://packaging.python.org/en/latest/specifications/version-specifiers/#version-scheme
 
-… (2 old lines elided)
+    We first check the INFO file to see if we recorded a version of Metaflow. If there
+    is none, we check if we are in a GIT repository and if so, form the version
+    from that.
 
-    Otherwise, the version …
-… (1 old line elided)
-    If even that information isn't available (likely when executing on a
-    remote cloud instance), the version information is returned from INFO file
-    in the current directory.
+    Otherwise, we return the version of Metaflow that was installed.
 
     """
 
-… (5 old lines elided)
+    global _version_cache
+
+    # To get the version we do the following:
+    #  - Check if we have a cached version. If so, return that
+    #  - Then check if we have an INFO file present. If so, use that as it is
+    #    the most reliable way to get the version. In particular, when running remotely,
+    #    metaflow is installed in a directory and if any extension is using distutils to
+    #    determine its version, this would return None and querying the version directly
+    #    from the extension would fail to produce the correct result
+    #  - Then if we are in the GIT repository and if so, use the git describe
+    #  - If we don't have an INFO file, we look at the version information that is
+    #    populated by metaflow and the extensions.
+
+    if _version_cache[public] is not None:
+        return _version_cache[public]
+
+    version = (
+        read_info_version()
+    )  # Version info is cached in INFO file; includes extension info
+
+    if version:
+        _version_cache[public] = version
+        return version
+
+    # Get the version for Metaflow, favor the GIT version
+    import metaflow
+
+    version = format_git_describe(
+        call_git_describe(file_to_check=metaflow.__file__), public=public
+    )
+    if version is None:
         version = metaflow.__version__
-… (5 old lines elided)
+
+    # Look for extensions and compute their versions. Properly formed extensions have
+    # a toplevel file which will contain a __mf_extensions__ value and a __version__
+    # value. We already saved the properly formed modules when loading metaflow in
+    # __ext_tl_modules__.
+    ext_versions = []
+    for pkg_name, extension_module in metaflow.__ext_tl_modules__:
+        ext_name = getattr(extension_module, "__mf_extensions__", "<unk>")
+        ext_version = format_git_describe(
+            call_git_describe(file_to_check=extension_module.__file__), public=public
+        )
+        if ext_version is None:
+            ext_version = getattr(extension_module, "__version__", "<unk>")
+        # Update the package information about reported version for the extension
+        # (only for the full info which is called at least once -- if we update more
+        # it will error out since we can only update_package_info once)
+        if not public:
+            update_package_info(
+                package_name=pkg_name,
+                extension_name=ext_name,
+                package_version=ext_version,
+            )
+        ext_versions.append("%s(%s)" % (ext_name, ext_version))
+
+    # We now have all the information about extensions so we can form the final string
+    if ext_versions:
+        version = version + "+" + ";".join(ext_versions)
+    _version_cache[public] = version
     return version
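For orientation, a minimal sketch of how the reworked lookup behaves (the printed values are illustrative assumptions, not taken from this diff):

    from metaflow.metaflow_version import get_version

    # Full version: may carry git describe/dirty info plus per-extension versions.
    print(get_version())             # e.g. "2.12.23.1+obcheckpoint(0.1.2)"
    # Public version: a PEP 440 public identifier, with no hash or dirtiness.
    print(get_version(public=True))  # e.g. "2.12.23.1"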
metaflow/package.py
CHANGED
@@ -10,7 +10,8 @@ from .extension_support import EXT_PKG, package_mfext_all
 from .metaflow_config import DEFAULT_PACKAGE_SUFFIXES
 from .exception import MetaflowException
 from .util import to_unicode
-from . import R …
+from . import R
+from .info_file import INFO_FILE
 
 DEFAULT_SUFFIXES_LIST = DEFAULT_PACKAGE_SUFFIXES.split(",")
 METAFLOW_SUFFIXES_LIST = [".py", ".html", ".css", ".js"]
metaflow/plugins/argo/argo_client.py
CHANGED
@@ -295,6 +295,8 @@ class ArgoClient(object):
                 "suspend": schedule is None,
                 "schedule": schedule,
                 "timezone": timezone,
+                "failedJobsHistoryLimit": 10000,  # default is unfortunately 1
+                "successfulJobsHistoryLimit": 10000,  # default is unfortunately 3
                 "workflowSpec": {"workflowTemplateRef": {"name": name}},
             },
         }
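For context, a sketch of the CronWorkflow spec body this builds, with the new history limits in place (only fields visible in the hunk; the schedule, timezone, and template name are illustrative):

    cron_workflow_spec = {
        "suspend": False,  # a schedule is set
        "schedule": "0 * * * *",
        "timezone": "UTC",
        "failedJobsHistoryLimit": 10000,
        "successfulJobsHistoryLimit": 10000,
        "workflowSpec": {"workflowTemplateRef": {"name": "myflow"}},
    }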
metaflow/plugins/argo/argo_workflows.py
CHANGED
@@ -456,11 +456,17 @@ class ArgoWorkflows(object):
                 )
             seen.add(norm)
 
-… (4 old lines elided)
+            extra_attrs = {}
+            if param.kwargs.get("type") == JSONType:
+                param_type = str(param.kwargs.get("type").name)
+            elif isinstance(param.kwargs.get("type"), FilePathClass):
                 param_type = str(param.kwargs.get("type").name)
+                extra_attrs["is_text"] = getattr(
+                    param.kwargs.get("type"), "_is_text", True
+                )
+                extra_attrs["encoding"] = getattr(
+                    param.kwargs.get("type"), "_encoding", "utf-8"
+                )
             else:
                 param_type = str(param.kwargs.get("type").__name__)
 
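To illustrate the attribute extraction above in isolation, a self-contained sketch with a hypothetical stand-in for a FilePathClass-style type (the real types come from Metaflow's IncludeFile machinery):

    class FakeFilePathType:
        name = "FilePath"
        _is_text = False
        _encoding = "utf-8"

    t = FakeFilePathType()
    extra_attrs = {
        "is_text": getattr(t, "_is_text", True),
        "encoding": getattr(t, "_encoding", "utf-8"),
    }
    print(extra_attrs)  # {'is_text': False, 'encoding': 'utf-8'}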
@@ -488,6 +494,7 @@ class ArgoWorkflows(object):
                 type=param_type,
                 description=param.kwargs.get("help"),
                 is_required=is_required,
+                **extra_attrs
             )
         return parameters
 
@@ -1484,7 +1491,11 @@ class ArgoWorkflows(object):
             # {{foo.bar['param_name']}}.
             # https://argoproj.github.io/argo-events/tutorials/02-parameterization/
             # http://masterminds.github.io/sprig/strings.html
-… (1 old line elided)
+            (
+                "--%s='{{workflow.parameters.%s}}'"
+                if parameter["type"] == "JSON"
+                else "--%s={{workflow.parameters.%s}}"
+            )
             % (parameter["name"], parameter["name"])
             for parameter in self.parameters.values()
         ]
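The quoting logic is easiest to see in isolation; this runnable sketch mirrors the comprehension above (parameter names and types are illustrative):

    params = {"config": {"type": "JSON"}, "alpha": {"type": "int"}}
    opts = [
        (
            "--%s='{{workflow.parameters.%s}}'"
            if p["type"] == "JSON"
            else "--%s={{workflow.parameters.%s}}"
        )
        % (name, name)
        for name, p in params.items()
    ]
    print(opts)
    # ["--config='{{workflow.parameters.config}}'",
    #  '--alpha={{workflow.parameters.alpha}}']

JSON values get single quotes so the templated payload survives shell word-splitting inside the container command; scalar parameters stay unquoted.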
@@ -2524,10 +2535,29 @@ class ArgoWorkflows(object):
         # Use all the affordances available to _parameters task
         executable = self.environment.executable("_parameters")
         run_id = "argo-{{workflow.name}}"
-… (2 old lines elided)
+        script_name = os.path.basename(sys.argv[0])
+        entrypoint = [executable, script_name]
+        # FlowDecorators can define their own top-level options. These might affect run level information
+        # so it is important to pass these to the heartbeat process as well, as it might be the first task to register a run.
+        top_opts_dict = {}
+        for deco in flow_decorators(self.flow):
+            top_opts_dict.update(deco.get_top_level_options())
+
+        top_level = list(dict_to_cli_options(top_opts_dict)) + [
+            "--quiet",
+            "--metadata=%s" % self.metadata.TYPE,
+            "--environment=%s" % self.environment.TYPE,
+            "--datastore=%s" % self.flow_datastore.TYPE,
+            "--datastore-root=%s" % self.flow_datastore.datastore_root,
+            "--event-logger=%s" % self.event_logger.TYPE,
+            "--monitor=%s" % self.monitor.TYPE,
+            "--no-pylint",
+            "--with=argo_workflows_internal:auto-emit-argo-events=%i"
+            % self.auto_emit_argo_events,
+        ]
+        heartbeat_cmds = "{entrypoint} {top_level} argo-workflows heartbeat --run_id {run_id} {tags}".format(
             entrypoint=" ".join(entrypoint),
-… (1 old line elided)
+            top_level=" ".join(top_level) if top_level else "",
             run_id=run_id,
             tags=" ".join(["--tag %s" % t for t in self.tags]) if self.tags else "",
         )
@@ -2578,12 +2608,16 @@ class ArgoWorkflows(object):
             "METAFLOW_SERVICE_URL": SERVICE_INTERNAL_URL,
             "METAFLOW_SERVICE_HEADERS": json.dumps(SERVICE_HEADERS),
             "METAFLOW_USER": "argo-workflows",
+            "METAFLOW_DATASTORE_SYSROOT_S3": DATASTORE_SYSROOT_S3,
+            "METAFLOW_DATATOOLS_S3ROOT": DATATOOLS_S3ROOT,
             "METAFLOW_DEFAULT_DATASTORE": self.flow_datastore.TYPE,
             "METAFLOW_DEFAULT_METADATA": DEFAULT_METADATA,
+            "METAFLOW_CARD_S3ROOT": CARD_S3ROOT,
             "METAFLOW_KUBERNETES_WORKLOAD": 1,
+            "METAFLOW_KUBERNETES_FETCH_EC2_METADATA": KUBERNETES_FETCH_EC2_METADATA,
             "METAFLOW_RUNTIME_ENVIRONMENT": "kubernetes",
             "METAFLOW_OWNER": self.username,
-            "METAFLOW_PRODUCTION_TOKEN": self.production_token,
+            "METAFLOW_PRODUCTION_TOKEN": self.production_token,  # Used in identity resolving. This affects system tags.
         }
         # support Metaflow sandboxes
         env["METAFLOW_INIT_SCRIPT"] = KUBERNETES_SANDBOX_INIT_SCRIPT
@@ -2606,50 +2640,54 @@ class ArgoWorkflows(object):
         )
         from kubernetes import client as kubernetes_sdk
 
-        return …
-… (15 old lines elided)
+        return (
+            DaemonTemplate("heartbeat-daemon")
+            .service_account_name(resources["service_account"])
+            .container(
+                to_camelcase(
+                    kubernetes_sdk.V1Container(
+                        name="main",
+                        # TODO: Make the image configurable
+                        image=resources["image"],
+                        command=cmds,
+                        env=[
+                            kubernetes_sdk.V1EnvVar(name=k, value=str(v))
+                            for k, v in env.items()
+                        ],
+                        env_from=[
+                            kubernetes_sdk.V1EnvFromSource(
+                                secret_ref=kubernetes_sdk.V1SecretEnvSource(
+                                    name=str(k),
+                                    # optional=True
+                                )
                             )
-… (8 old lines elided)
+                            for k in list(
+                                []
+                                if not resources.get("secrets")
+                                else (
+                                    [resources.get("secrets")]
+                                    if isinstance(resources.get("secrets"), str)
+                                    else resources.get("secrets")
+                                )
                             )
-… (17 old lines elided)
-            )
+                            + KUBERNETES_SECRETS.split(",")
+                            + ARGO_WORKFLOWS_KUBERNETES_SECRETS.split(",")
+                            if k
+                        ],
+                        resources=kubernetes_sdk.V1ResourceRequirements(
+                            # NOTE: base resources for this are kept to a minimum to save on running costs.
+                            # This has an adverse effect on startup time for the daemon, which can be completely
+                            # alleviated by using a base image that has the required dependencies pre-installed
+                            requests={
+                                "cpu": "200m",
+                                "memory": "100Mi",
+                            },
+                            limits={
+                                "cpu": "200m",
+                                "memory": "100Mi",
+                            },
+                        ),
+                    )
                 )
             )
         )
@@ -3271,6 +3309,10 @@ class DaemonTemplate(object):
         self.payload["container"] = container
         return self
 
+    def service_account_name(self, service_account_name):
+        self.payload["serviceAccountName"] = service_account_name
+        return self
+
     def to_json(self):
         return self.payload
 
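A minimal sketch of the builder-style usage the new method enables, following the chaining pattern used by the heartbeat daemon template above (service account name illustrative):

    template = DaemonTemplate("heartbeat-daemon").service_account_name("workflow-sa")
    print(template.to_json()["serviceAccountName"])  # "workflow-sa"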
metaflow/plugins/argo/argo_workflows_cli.py
CHANGED
@@ -4,6 +4,7 @@ import platform
 import re
 import sys
 from hashlib import sha1
+from time import sleep
 
 from metaflow import JSONType, Run, current, decorators, parameters
 from metaflow._vendor import click
@@ -959,6 +960,31 @@ def list_workflow_templates(obj, all=None):
         obj.echo_always(template_name)
 
 
+# Internal CLI command to run a heartbeat daemon in an Argo Workflows Daemon container.
+@argo_workflows.command(hidden=True, help="start heartbeat process for a run")
+@click.option("--run_id", required=True)
+@click.option(
+    "--tag",
+    "tags",
+    multiple=True,
+    default=None,
+    help="Annotate all objects produced by Argo Workflows runs "
+    "with the given tag. You can specify this option multiple "
+    "times to attach multiple tags.",
+)
+@click.pass_obj
+def heartbeat(obj, run_id, tags=None):
+    # Try to register a run in case the start task has not taken care of it yet.
+    obj.metadata.register_run_id(run_id, tags)
+    # Start run heartbeat
+    obj.metadata.start_run_heartbeat(obj.flow.name, run_id)
+    # Keepalive loop
+    while True:
+        # Do not pollute daemon logs with anything unnecessary,
+        # as they might be extremely long running.
+        sleep(10)
+
+
 def validate_run_id(
     workflow_name, token_prefix, authorize, run_id, instructions_fn=None
 ):
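Put together with the heartbeat_cmds assembly in argo_workflows.py, the daemon container ends up running something along these lines (a hypothetical rendering; the exact options depend on the deployment's metadata, datastore, and logger configuration):

    # python flow.py --quiet --metadata=service --environment=conda \
    #     --datastore=s3 --datastore-root=s3://bucket/metaflow \
    #     --event-logger=nullSidecarLogger --monitor=nullSidecarMonitor \
    #     --no-pylint --with=argo_workflows_internal:auto-emit-argo-events=1 \
    #     argo-workflows heartbeat --run_id argo-{{workflow.name}} --tag team:ml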
metaflow/plugins/kubernetes/kubernetes_client.py
CHANGED
@@ -121,7 +121,10 @@ class KubernetesClient(object):
         job_api = self._client.BatchV1Api()
         pods = self._find_active_pods(flow_name, run_id, user)
 
+        active_pods = False
+
         def _kill_pod(pod):
+            active_pods = True
             echo("Killing Kubernetes pod %s\n" % pod.metadata.name)
             try:
                 stream(
@@ -155,7 +158,10 @@ class KubernetesClient(object):
                 echo("failed to kill pod %s - %s" % (pod.metadata.name, str(e)))
 
         with ThreadPoolExecutor() as executor:
-            executor.map(_kill_pod, …
+            executor.map(_kill_pod, pods)
+
+        if not active_pods:
+            echo("No active Kubernetes pods found for run *%s*" % run_id)
 
     def jobset(self, **kwargs):
         return KubernetesJobSet(self, **kwargs)
metaflow/plugins/kubernetes/kubernetes_decorator.py
CHANGED
@@ -558,7 +558,11 @@ class KubernetesDecorator(StepDecorator):
 
 # TODO: Unify this method with the multi-node setup in @batch
 def _setup_multinode_environment():
-    # FIXME …
+    # TODO [FIXME SOON]
+    # Even if Kubernetes may deploy control pods before worker pods, there is always a
+    # possibility that the worker pods may start before the control. In the case that this happens,
+    # the worker pods will not be able to resolve the control pod's IP address and this will cause
+    # the worker pods to fail. This function should account for this in the near future.
     import socket
 
     try:
metaflow/plugins/kubernetes/kubernetes_jobsets.py
CHANGED
@@ -866,7 +866,13 @@ class KubernetesJobSet(object):
             spec=dict(
                 replicatedJobs=[self.control.dump(), self.worker.dump()],
                 suspend=False,
-                startupPolicy=…
+                startupPolicy=dict(
+                    # We explicitly set an InOrder Startup policy so that
+                    # we can ensure that the control pod starts before the worker pods.
+                    # This is required so that when worker pods try to access the control's IP
+                    # we are able to resolve the control's IP address.
+                    startupPolicyOrder="InOrder"
+                ),
                 successPolicy=None,
                 # The Failure Policy helps setting the number of retries for the jobset.
                 # but we don't rely on it and instead rely on either the local scheduler
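A minimal sketch of the resulting spec fragment (control_job and worker_job are placeholders standing in for self.control.dump() and self.worker.dump()):

    control_job, worker_job = {}, {}  # placeholders for the dumped replicated jobs
    jobset_spec = dict(
        replicatedJobs=[control_job, worker_job],
        suspend=False,
        # InOrder starts replicated jobs in list order, so the control job's pod
        # exists (and its name resolves) before the workers come up.
        startupPolicy=dict(startupPolicyOrder="InOrder"),
        successPolicy=None,
    )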
metaflow/plugins/pypi/bootstrap.py
CHANGED
@@ -89,7 +89,7 @@ if __name__ == "__main__":
         # TODO: micromamba installation can be pawned off to micromamba.py
         f"""set -e;
         if ! command -v micromamba >/dev/null 2>&1; then
-            mkdir micromamba;
+            mkdir -p micromamba;
             python -c "import requests, bz2, sys; data = requests.get('https://micro.mamba.pm/api/micromamba/{architecture}/1.5.7').content; sys.stdout.buffer.write(bz2.decompress(data))" | tar -xv -C $(pwd)/micromamba bin/micromamba --strip-components 1;
             export PATH=$PATH:$(pwd)/micromamba;
             if ! command -v micromamba >/dev/null 2>&1; then
metaflow/plugins/pypi/conda_decorator.py
CHANGED
@@ -12,7 +12,7 @@ from metaflow.metadata import MetaDatum
 from metaflow.metaflow_environment import InvalidEnvironmentException
 from metaflow.util import get_metaflow_root
 
-from ... import INFO_FILE
+from ...info_file import INFO_FILE
 
 
 class CondaStepDecorator(StepDecorator):
metaflow/plugins/pypi/micromamba.py
CHANGED
@@ -253,7 +253,33 @@ class Micromamba(object):
         try:
             output = json.loads(e.output)
             err = []
+            v_pkgs = ["__cuda", "__glibc"]
             for error in output.get("solver_problems", []):
+                # raise a specific error message for virtual package related errors
+                match = next((p for p in v_pkgs if p in error), None)
+                if match is not None:
+                    vpkg_name = match[2:]
+                    # try to strip version from error msg which are of the format:
+                    # nothing provides <__vpkg> >=2.17,<3.0.a0 needed by <pkg_name>
+                    try:
+                        vpkg_version = error[
+                            len("nothing provides %s " % match) : error.index(
+                                " needed by"
+                            )
+                        ]
+                    except ValueError:
+                        vpkg_version = None
+                    raise MicromambaException(
+                        "Please set the environment variable CONDA_OVERRIDE_{var} to a specific version{version} of {name}.\n"
+                        "Here is an example of supplying environment variables through the command line -\n\n"
+                        "CONDA_OVERRIDE_{var}=<{name}-version> python flow.py <args>".format(
+                            var=vpkg_name.upper(),
+                            version=(
+                                "" if not vpkg_version else (" (%s)" % vpkg_version)
+                            ),
+                            name=vpkg_name,
+                        ),
+                    )
             err.append(error)
         raise MicromambaException(
             msg.format(