ob-metaflow 2.12.20.1__py2.py3-none-any.whl → 2.12.23.1__py2.py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (34)
  1. metaflow/__init__.py +11 -21
  2. metaflow/cli.py +24 -19
  3. metaflow/client/core.py +2 -2
  4. metaflow/cmd/develop/stub_generator.py +17 -0
  5. metaflow/cmd/develop/stubs.py +3 -3
  6. metaflow/cmd/main_cli.py +3 -2
  7. metaflow/extension_support/__init__.py +120 -29
  8. metaflow/flowspec.py +4 -0
  9. metaflow/info_file.py +25 -0
  10. metaflow/metaflow_config.py +0 -1
  11. metaflow/metaflow_environment.py +1 -7
  12. metaflow/metaflow_version.py +133 -64
  13. metaflow/package.py +2 -1
  14. metaflow/plugins/argo/argo_client.py +2 -0
  15. metaflow/plugins/argo/argo_workflows.py +93 -51
  16. metaflow/plugins/argo/argo_workflows_cli.py +26 -0
  17. metaflow/plugins/kubernetes/kubernetes_client.py +7 -1
  18. metaflow/plugins/kubernetes/kubernetes_decorator.py +5 -1
  19. metaflow/plugins/kubernetes/kubernetes_jobsets.py +7 -1
  20. metaflow/plugins/pypi/bootstrap.py +1 -1
  21. metaflow/plugins/pypi/conda_decorator.py +1 -1
  22. metaflow/plugins/pypi/micromamba.py +26 -0
  23. metaflow/runner/deployer.py +4 -49
  24. metaflow/runner/metaflow_runner.py +22 -25
  25. metaflow/runner/subprocess_manager.py +33 -17
  26. metaflow/runner/utils.py +53 -1
  27. metaflow/version.py +1 -1
  28. {ob_metaflow-2.12.20.1.dist-info → ob_metaflow-2.12.23.1.dist-info}/METADATA +2 -2
  29. {ob_metaflow-2.12.20.1.dist-info → ob_metaflow-2.12.23.1.dist-info}/RECORD +33 -33
  30. metaflow/plugins/argo/daemon.py +0 -59
  31. {ob_metaflow-2.12.20.1.dist-info → ob_metaflow-2.12.23.1.dist-info}/LICENSE +0 -0
  32. {ob_metaflow-2.12.20.1.dist-info → ob_metaflow-2.12.23.1.dist-info}/WHEEL +0 -0
  33. {ob_metaflow-2.12.20.1.dist-info → ob_metaflow-2.12.23.1.dist-info}/entry_points.txt +0 -0
  34. {ob_metaflow-2.12.20.1.dist-info → ob_metaflow-2.12.23.1.dist-info}/top_level.txt +0 -0
metaflow/metaflow_version.py CHANGED
@@ -7,11 +7,15 @@ See the documentation of get_version for more information

 # This file is adapted from https://github.com/aebrahim/python-git-version

-from subprocess import check_output, CalledProcessError
-from os import path, name, devnull, environ, listdir
-import json
+import subprocess
+from os import path, name, environ, listdir

-from metaflow import CURRENT_DIRECTORY, INFO_FILE
+from metaflow.extension_support import update_package_info
+from metaflow.info_file import CURRENT_DIRECTORY, read_info_file
+
+
+# True/False correspond to the value `public` in get_version
+_version_cache = {True: None, False: None}

 __all__ = ("get_version",)

@@ -57,87 +61,152 @@ if name == "nt":
     GIT_COMMAND = find_git_on_windows()


-def call_git_describe(abbrev=7):
+def call_git_describe(file_to_check, abbrev=7):
     """return the string output of git describe"""
     try:
-
-        # first, make sure we are actually in a Metaflow repo,
-        # not some other repo
-        with open(devnull, "w") as fnull:
-            arguments = [GIT_COMMAND, "rev-parse", "--show-toplevel"]
-            reponame = (
-                check_output(arguments, cwd=CURRENT_DIRECTORY, stderr=fnull)
-                .decode("ascii")
-                .strip()
-            )
-            if path.basename(reponame) != "metaflow":
-                return None
-
-        with open(devnull, "w") as fnull:
-            arguments = [GIT_COMMAND, "describe", "--tags", "--abbrev=%d" % abbrev]
-            return (
-                check_output(arguments, cwd=CURRENT_DIRECTORY, stderr=fnull)
-                .decode("ascii")
-                .strip()
-            )
-
-    except (OSError, CalledProcessError):
+        wd = path.dirname(file_to_check)
+        filename = path.basename(file_to_check)
+
+        # First check if the file is tracked in the GIT repository we are in
+        # We do this because in some setups and for some bizarre reason, python files
+        # are installed directly into a git repository (I am looking at you brew). We
+        # don't want to consider this a GIT install in that case.
+        args = [GIT_COMMAND, "ls-files", "--error-unmatch", filename]
+        git_return_code = subprocess.run(
+            args,
+            cwd=wd,
+            stderr=subprocess.DEVNULL,
+            stdout=subprocess.DEVNULL,
+            check=False,
+        ).returncode
+
+        if git_return_code != 0:
+            return None
+
+        args = [
+            GIT_COMMAND,
+            "describe",
+            "--tags",
+            "--dirty",
+            "--long",
+            "--abbrev=%d" % abbrev,
+        ]
+        return (
+            subprocess.check_output(args, cwd=wd, stderr=subprocess.DEVNULL)
+            .decode("ascii")
+            .strip()
+        )
+
+    except (OSError, subprocess.CalledProcessError):
         return None


-def format_git_describe(git_str, pep440=False):
+def format_git_describe(git_str, public=False):
     """format the result of calling 'git describe' as a python version"""
     if git_str is None:
         return None
-    if "-" not in git_str:  # currently at a tag
-        return git_str
+    splits = git_str.split("-")
+    if len(splits) == 4:
+        # Formatted as <tag>-<post>-<hash>-dirty
+        tag, post, h = splits[:3]
+        dirty = "-" + splits[3]
     else:
-        # formatted as version-N-githash
-        # want to convert to version.postN-githash
-        git_str = git_str.replace("-", ".post", 1)
-        if pep440:  # does not allow git hash afterwards
-            return git_str.split("-")[0]
-        else:
-            return git_str.replace("-g", "+git")
+        # Formatted as <tag>-<post>-<hash>
+        tag, post, h = splits
+        dirty = ""
+    if post == "0":
+        if public:
+            return tag
+        return tag + dirty
+
+    if public:
+        return "%s.post%s" % (tag, post)
+
+    return "%s.post%s-git%s%s" % (tag, post, h[1:], dirty)


 def read_info_version():
     """Read version information from INFO file"""
-    try:
-        with open(INFO_FILE, "r") as contents:
-            return json.load(contents).get("metaflow_version")
-    except IOError:
-        return None
+    info_file = read_info_file()
+    if info_file:
+        return info_file.get("metaflow_version")
+    return None


-def get_version(pep440=False):
+def get_version(public=False):
     """Tracks the version number.

-    pep440: bool
-        When True, this function returns a version string suitable for
-        a release as defined by PEP 440. When False, the githash (if
-        available) will be appended to the version string.
+    public: bool
+        When True, this function returns a *public* version specification which
+        doesn't include any local information (dirtiness or hash). See
+        https://packaging.python.org/en/latest/specifications/version-specifiers/#version-scheme

-    If the script is located within an active git repository,
-    git-describe is used to get the version information.
+    We first check the INFO file to see if we recorded a version of Metaflow. If there
+    is none, we check if we are in a GIT repository and if so, form the version
+    from that.

-    Otherwise, the version logged by package installer is returned.
-
-    If even that information isn't available (likely when executing on a
-    remote cloud instance), the version information is returned from INFO file
-    in the current directory.
+    Otherwise, we return the version of Metaflow that was installed.

     """

-    version = format_git_describe(call_git_describe(), pep440=pep440)
-    version_addl = None
-    if version is None:  # not a git repository
-        import metaflow
-
+    global _version_cache
+
+    # To get the version we do the following:
+    # - Check if we have a cached version. If so, return that
+    # - Then check if we have an INFO file present. If so, use that as it is
+    #   the most reliable way to get the version. In particular, when running remotely,
+    #   metaflow is installed in a directory and if any extension is using distutils to
+    #   determine its version, this would return None and querying the version directly
+    #   from the extension would fail to produce the correct result
+    # - Then if we are in the GIT repository and if so, use the git describe
+    # - If we don't have an INFO file, we look at the version information that is
+    #   populated by metaflow and the extensions.
+
+    if _version_cache[public] is not None:
+        return _version_cache[public]
+
+    version = (
+        read_info_version()
+    )  # Version info is cached in INFO file; includes extension info
+
+    if version:
+        _version_cache[public] = version
+        return version
+
+    # Get the version for Metaflow, favor the GIT version
+    import metaflow
+
+    version = format_git_describe(
+        call_git_describe(file_to_check=metaflow.__file__), public=public
+    )
+    if version is None:
         version = metaflow.__version__
-        version_addl = metaflow.__version_addl__
-    if version is None:  # not a proper python package
-        return read_info_version()
-    if version_addl:
-        return "+".join([version, version_addl])
+
+    # Look for extensions and compute their versions. Properly formed extensions have
+    # a toplevel file which will contain a __mf_extensions__ value and a __version__
+    # value. We already saved the properly formed modules when loading metaflow in
+    # __ext_tl_modules__.
+    ext_versions = []
+    for pkg_name, extension_module in metaflow.__ext_tl_modules__:
+        ext_name = getattr(extension_module, "__mf_extensions__", "<unk>")
+        ext_version = format_git_describe(
+            call_git_describe(file_to_check=extension_module.__file__), public=public
+        )
+        if ext_version is None:
+            ext_version = getattr(extension_module, "__version__", "<unk>")
+        # Update the package information about reported version for the extension
+        # (only for the full info which is called at least once -- if we update more
+        # it will error out since we can only update_package_info once)
+        if not public:
+            update_package_info(
+                package_name=pkg_name,
+                extension_name=ext_name,
+                package_version=ext_version,
+            )
+        ext_versions.append("%s(%s)" % (ext_name, ext_version))
+
+    # We now have all the information about extensions so we can form the final string
+    if ext_versions:
+        version = version + "+" + ";".join(ext_versions)
+    _version_cache[public] = version
     return version
metaflow/package.py CHANGED
@@ -10,7 +10,8 @@ from .extension_support import EXT_PKG, package_mfext_all
 from .metaflow_config import DEFAULT_PACKAGE_SUFFIXES
 from .exception import MetaflowException
 from .util import to_unicode
-from . import R, INFO_FILE
+from . import R
+from .info_file import INFO_FILE

 DEFAULT_SUFFIXES_LIST = DEFAULT_PACKAGE_SUFFIXES.split(",")
 METAFLOW_SUFFIXES_LIST = [".py", ".html", ".css", ".js"]
metaflow/plugins/argo/argo_client.py CHANGED
@@ -295,6 +295,8 @@ class ArgoClient(object):
                 "suspend": schedule is None,
                 "schedule": schedule,
                 "timezone": timezone,
+                "failedJobsHistoryLimit": 10000,  # default is unfortunately 1
+                "successfulJobsHistoryLimit": 10000,  # default is unfortunately 3
                 "workflowSpec": {"workflowTemplateRef": {"name": name}},
             },
         }
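
The two new keys land in the CronWorkflow spec that ArgoClient submits when scheduling a workflow template. A rough, illustrative shape of that spec fragment (the schedule, timezone and template name here are placeholders, not values from the package):

    cron_workflow_spec = {
        "suspend": False,  # a schedule is set
        "schedule": "0 * * * *",
        "timezone": "UTC",
        "failedJobsHistoryLimit": 10000,      # Argo's default of 1 hides failures quickly
        "successfulJobsHistoryLimit": 10000,  # Argo's default of 3 hides history quickly
        "workflowSpec": {"workflowTemplateRef": {"name": "myflow.prod.myflow"}},
    }
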
metaflow/plugins/argo/argo_workflows.py CHANGED
@@ -456,11 +456,17 @@ class ArgoWorkflows(object):
                )
            seen.add(norm)

-            if param.kwargs.get("type") == JSONType or isinstance(
-                param.kwargs.get("type"), FilePathClass
-            ):
-                # Special-case this to avoid touching core
+            extra_attrs = {}
+            if param.kwargs.get("type") == JSONType:
+                param_type = str(param.kwargs.get("type").name)
+            elif isinstance(param.kwargs.get("type"), FilePathClass):
                param_type = str(param.kwargs.get("type").name)
+                extra_attrs["is_text"] = getattr(
+                    param.kwargs.get("type"), "_is_text", True
+                )
+                extra_attrs["encoding"] = getattr(
+                    param.kwargs.get("type"), "_encoding", "utf-8"
+                )
            else:
                param_type = str(param.kwargs.get("type").__name__)

@@ -488,6 +494,7 @@ class ArgoWorkflows(object):
                type=param_type,
                description=param.kwargs.get("help"),
                is_required=is_required,
+                **extra_attrs
            )
        return parameters

@@ -1484,7 +1491,11 @@ class ArgoWorkflows(object):
                        # {{foo.bar['param_name']}}.
                        # https://argoproj.github.io/argo-events/tutorials/02-parameterization/
                        # http://masterminds.github.io/sprig/strings.html
-                        "--%s={{workflow.parameters.%s}}"
+                        (
+                            "--%s='{{workflow.parameters.%s}}'"
+                            if parameter["type"] == "JSON"
+                            else "--%s={{workflow.parameters.%s}}"
+                        )
                        % (parameter["name"], parameter["name"])
                        for parameter in self.parameters.values()
                    ]
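
The quoting change above only affects JSON-typed parameters; a small sketch (not taken from the package) of what each branch renders, using made-up parameter names:

    json_arg = "--%s='{{workflow.parameters.%s}}'" % ("config", "config")
    plain_arg = "--%s={{workflow.parameters.%s}}" % ("alpha", "alpha")
    print(json_arg)   # --config='{{workflow.parameters.config}}'
    print(plain_arg)  # --alpha={{workflow.parameters.alpha}}

The single quotes presumably keep the substituted JSON payload from being split or expanded by the shell that eventually runs the command.
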
@@ -2524,10 +2535,29 @@ class ArgoWorkflows(object):
        # Use all the affordances available to _parameters task
        executable = self.environment.executable("_parameters")
        run_id = "argo-{{workflow.name}}"
-        entrypoint = [executable, "-m metaflow.plugins.argo.daemon"]
-        heartbeat_cmds = "{entrypoint} --flow_name {flow_name} --run_id {run_id} {tags} heartbeat".format(
+        script_name = os.path.basename(sys.argv[0])
+        entrypoint = [executable, script_name]
+        # FlowDecorators can define their own top-level options. These might affect run level information
+        # so it is important to pass these to the heartbeat process as well, as it might be the first task to register a run.
+        top_opts_dict = {}
+        for deco in flow_decorators(self.flow):
+            top_opts_dict.update(deco.get_top_level_options())
+
+        top_level = list(dict_to_cli_options(top_opts_dict)) + [
+            "--quiet",
+            "--metadata=%s" % self.metadata.TYPE,
+            "--environment=%s" % self.environment.TYPE,
+            "--datastore=%s" % self.flow_datastore.TYPE,
+            "--datastore-root=%s" % self.flow_datastore.datastore_root,
+            "--event-logger=%s" % self.event_logger.TYPE,
+            "--monitor=%s" % self.monitor.TYPE,
+            "--no-pylint",
+            "--with=argo_workflows_internal:auto-emit-argo-events=%i"
+            % self.auto_emit_argo_events,
+        ]
+        heartbeat_cmds = "{entrypoint} {top_level} argo-workflows heartbeat --run_id {run_id} {tags}".format(
            entrypoint=" ".join(entrypoint),
-            flow_name=self.flow.name,
+            top_level=" ".join(top_level) if top_level else "",
            run_id=run_id,
            tags=" ".join(["--tag %s" % t for t in self.tags]) if self.tags else "",
        )
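
With the new entrypoint the heartbeat daemon no longer invokes metaflow.plugins.argo.daemon (that module is removed in this release); it re-runs the user's flow file with a hidden argo-workflows heartbeat subcommand. A sketch of the rendered command, with every value below purely illustrative:

    heartbeat_cmds = "{entrypoint} {top_level} argo-workflows heartbeat --run_id {run_id} {tags}".format(
        entrypoint="python flow.py",  # executable + script name
        top_level="--quiet --metadata=service --datastore=s3 --no-pylint",  # trimmed for brevity
        run_id="argo-{{workflow.name}}",
        tags="--tag project:demo",
    )
    # -> python flow.py --quiet --metadata=service --datastore=s3 --no-pylint \
    #    argo-workflows heartbeat --run_id argo-{{workflow.name}} --tag project:demo
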
@@ -2578,12 +2608,16 @@ class ArgoWorkflows(object):
            "METAFLOW_SERVICE_URL": SERVICE_INTERNAL_URL,
            "METAFLOW_SERVICE_HEADERS": json.dumps(SERVICE_HEADERS),
            "METAFLOW_USER": "argo-workflows",
+            "METAFLOW_DATASTORE_SYSROOT_S3": DATASTORE_SYSROOT_S3,
+            "METAFLOW_DATATOOLS_S3ROOT": DATATOOLS_S3ROOT,
            "METAFLOW_DEFAULT_DATASTORE": self.flow_datastore.TYPE,
            "METAFLOW_DEFAULT_METADATA": DEFAULT_METADATA,
+            "METAFLOW_CARD_S3ROOT": CARD_S3ROOT,
            "METAFLOW_KUBERNETES_WORKLOAD": 1,
+            "METAFLOW_KUBERNETES_FETCH_EC2_METADATA": KUBERNETES_FETCH_EC2_METADATA,
            "METAFLOW_RUNTIME_ENVIRONMENT": "kubernetes",
            "METAFLOW_OWNER": self.username,
-            "METAFLOW_PRODUCTION_TOKEN": self.production_token,
+            "METAFLOW_PRODUCTION_TOKEN": self.production_token,  # Used in identity resolving. This affects system tags.
        }
        # support Metaflow sandboxes
        env["METAFLOW_INIT_SCRIPT"] = KUBERNETES_SANDBOX_INIT_SCRIPT
@@ -2606,50 +2640,54 @@ class ArgoWorkflows(object):
        )
        from kubernetes import client as kubernetes_sdk

-        return DaemonTemplate("heartbeat-daemon").container(
-            to_camelcase(
-                kubernetes_sdk.V1Container(
-                    name="main",
-                    # TODO: Make the image configurable
-                    image=resources["image"],
-                    command=cmds,
-                    env=[
-                        kubernetes_sdk.V1EnvVar(name=k, value=str(v))
-                        for k, v in env.items()
-                    ],
-                    env_from=[
-                        kubernetes_sdk.V1EnvFromSource(
-                            secret_ref=kubernetes_sdk.V1SecretEnvSource(
-                                name=str(k),
-                                # optional=True
+        return (
+            DaemonTemplate("heartbeat-daemon")
+            .service_account_name(resources["service_account"])
+            .container(
+                to_camelcase(
+                    kubernetes_sdk.V1Container(
+                        name="main",
+                        # TODO: Make the image configurable
+                        image=resources["image"],
+                        command=cmds,
+                        env=[
+                            kubernetes_sdk.V1EnvVar(name=k, value=str(v))
+                            for k, v in env.items()
+                        ],
+                        env_from=[
+                            kubernetes_sdk.V1EnvFromSource(
+                                secret_ref=kubernetes_sdk.V1SecretEnvSource(
+                                    name=str(k),
+                                    # optional=True
+                                )
                            )
-                        )
-                        for k in list(
-                            []
-                            if not resources.get("secrets")
-                            else (
-                                [resources.get("secrets")]
-                                if isinstance(resources.get("secrets"), str)
-                                else resources.get("secrets")
+                            for k in list(
+                                []
+                                if not resources.get("secrets")
+                                else (
+                                    [resources.get("secrets")]
+                                    if isinstance(resources.get("secrets"), str)
+                                    else resources.get("secrets")
+                                )
                            )
-                        )
-                        + KUBERNETES_SECRETS.split(",")
-                        + ARGO_WORKFLOWS_KUBERNETES_SECRETS.split(",")
-                        if k
-                    ],
-                    resources=kubernetes_sdk.V1ResourceRequirements(
-                        # NOTE: base resources for this are kept to a minimum to save on running costs.
-                        # This has an adverse effect on startup time for the daemon, which can be completely
-                        # alleviated by using a base image that has the required dependencies pre-installed
-                        requests={
-                            "cpu": "200m",
-                            "memory": "100Mi",
-                        },
-                        limits={
-                            "cpu": "200m",
-                            "memory": "100Mi",
-                        },
-                    ),
+                            + KUBERNETES_SECRETS.split(",")
+                            + ARGO_WORKFLOWS_KUBERNETES_SECRETS.split(",")
+                            if k
+                        ],
+                        resources=kubernetes_sdk.V1ResourceRequirements(
+                            # NOTE: base resources for this are kept to a minimum to save on running costs.
+                            # This has an adverse effect on startup time for the daemon, which can be completely
+                            # alleviated by using a base image that has the required dependencies pre-installed
+                            requests={
+                                "cpu": "200m",
+                                "memory": "100Mi",
+                            },
+                            limits={
+                                "cpu": "200m",
+                                "memory": "100Mi",
+                            },
+                        ),
+                    )
                )
            )
        )
@@ -3271,6 +3309,10 @@ class DaemonTemplate(object):
        self.payload["container"] = container
        return self

+    def service_account_name(self, service_account_name):
+        self.payload["serviceAccountName"] = service_account_name
+        return self
+
    def to_json(self):
        return self.payload

metaflow/plugins/argo/argo_workflows_cli.py CHANGED
@@ -4,6 +4,7 @@ import platform
 import re
 import sys
 from hashlib import sha1
+from time import sleep

 from metaflow import JSONType, Run, current, decorators, parameters
 from metaflow._vendor import click
@@ -959,6 +960,31 @@ def list_workflow_templates(obj, all=None):
        obj.echo_always(template_name)


+# Internal CLI command to run a heartbeat daemon in an Argo Workflows Daemon container.
+@argo_workflows.command(hidden=True, help="start heartbeat process for a run")
+@click.option("--run_id", required=True)
+@click.option(
+    "--tag",
+    "tags",
+    multiple=True,
+    default=None,
+    help="Annotate all objects produced by Argo Workflows runs "
+    "with the given tag. You can specify this option multiple "
+    "times to attach multiple tags.",
+)
+@click.pass_obj
+def heartbeat(obj, run_id, tags=None):
+    # Try to register a run in case the start task has not taken care of it yet.
+    obj.metadata.register_run_id(run_id, tags)
+    # Start run heartbeat
+    obj.metadata.start_run_heartbeat(obj.flow.name, run_id)
+    # Keepalive loop
+    while True:
+        # Do not pollute daemon logs with anything unnecessary,
+        # as they might be extremely long running.
+        sleep(10)
+
+
 def validate_run_id(
    workflow_name, token_prefix, authorize, run_id, instructions_fn=None
 ):
metaflow/plugins/kubernetes/kubernetes_client.py CHANGED
@@ -121,7 +121,10 @@ class KubernetesClient(object):
        job_api = self._client.BatchV1Api()
        pods = self._find_active_pods(flow_name, run_id, user)

+        active_pods = False
+
        def _kill_pod(pod):
+            active_pods = True
            echo("Killing Kubernetes pod %s\n" % pod.metadata.name)
            try:
                stream(
@@ -155,7 +158,10 @@ class KubernetesClient(object):
                echo("failed to kill pod %s - %s" % (pod.metadata.name, str(e)))

        with ThreadPoolExecutor() as executor:
-            executor.map(_kill_pod, list(pods))
+            executor.map(_kill_pod, pods)
+
+        if not active_pods:
+            echo("No active Kubernetes pods found for run *%s*" % run_id)

    def jobset(self, **kwargs):
        return KubernetesJobSet(self, **kwargs)
metaflow/plugins/kubernetes/kubernetes_decorator.py CHANGED
@@ -558,7 +558,11 @@ class KubernetesDecorator(StepDecorator):

    # TODO: Unify this method with the multi-node setup in @batch
    def _setup_multinode_environment():
-        # FIXME: what about MF_MASTER_PORT
+        # TODO [FIXME SOON]
+        # Even if Kubernetes may deploy control pods before worker pods, there is always a
+        # possibility that the worker pods may start before the control. In the case that this happens,
+        # the worker pods will not be able to resolve the control pod's IP address and this will cause
+        # the worker pods to fail. This function should account for this in the near future.
        import socket

        try:
metaflow/plugins/kubernetes/kubernetes_jobsets.py CHANGED
@@ -866,7 +866,13 @@ class KubernetesJobSet(object):
            spec=dict(
                replicatedJobs=[self.control.dump(), self.worker.dump()],
                suspend=False,
-                startupPolicy=None,
+                startupPolicy=dict(
+                    # We explicitly set an InOrder Startup policy so that
+                    # we can ensure that the control pod starts before the worker pods.
+                    # This is required so that when worker pods try to access the control's IP
+                    # we are able to resolve the control's IP address.
+                    startupPolicyOrder="InOrder"
+                ),
                successPolicy=None,
                # The Failure Policy helps setting the number of retries for the jobset.
                # but we don't rely on it and instead rely on either the local scheduler
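
An illustrative fragment (not taken from the package) of the JobSet spec after this change, showing where the explicit InOrder startup policy sits relative to the replicated jobs; the job placeholders stand in for the dumped control and worker jobs:

    jobset_spec = dict(
        replicatedJobs=[control_job, worker_job],  # placeholders for self.control.dump() / self.worker.dump()
        suspend=False,
        startupPolicy=dict(startupPolicyOrder="InOrder"),  # control replicated job starts before workers
        successPolicy=None,
    )
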
metaflow/plugins/pypi/bootstrap.py CHANGED
@@ -89,7 +89,7 @@ if __name__ == "__main__":
        # TODO: micromamba installation can be pawned off to micromamba.py
        f"""set -e;
            if ! command -v micromamba >/dev/null 2>&1; then
-                mkdir micromamba;
+                mkdir -p micromamba;
                python -c "import requests, bz2, sys; data = requests.get('https://micro.mamba.pm/api/micromamba/{architecture}/1.5.7').content; sys.stdout.buffer.write(bz2.decompress(data))" | tar -xv -C $(pwd)/micromamba bin/micromamba --strip-components 1;
                export PATH=$PATH:$(pwd)/micromamba;
                if ! command -v micromamba >/dev/null 2>&1; then
metaflow/plugins/pypi/conda_decorator.py CHANGED
@@ -12,7 +12,7 @@ from metaflow.metadata import MetaDatum
 from metaflow.metaflow_environment import InvalidEnvironmentException
 from metaflow.util import get_metaflow_root

-from ... import INFO_FILE
+from ...info_file import INFO_FILE


 class CondaStepDecorator(StepDecorator):
metaflow/plugins/pypi/micromamba.py CHANGED
@@ -253,7 +253,33 @@ class Micromamba(object):
        try:
            output = json.loads(e.output)
            err = []
+            v_pkgs = ["__cuda", "__glibc"]
            for error in output.get("solver_problems", []):
+                # raise a specific error message for virtual package related errors
+                match = next((p for p in v_pkgs if p in error), None)
+                if match is not None:
+                    vpkg_name = match[2:]
+                    # try to strip version from error msg which are of the format:
+                    # nothing provides <__vpkg> >=2.17,<3.0.a0 needed by <pkg_name>
+                    try:
+                        vpkg_version = error[
+                            len("nothing provides %s " % match) : error.index(
+                                " needed by"
+                            )
+                        ]
+                    except ValueError:
+                        vpkg_version = None
+                    raise MicromambaException(
+                        "Please set the environment variable CONDA_OVERRIDE_{var} to a specific version{version} of {name}.\n"
+                        "Here is an example of supplying environment variables through the command line -\n\n"
+                        "CONDA_OVERRIDE_{var}=<{name}-version> python flow.py <args>".format(
+                            var=vpkg_name.upper(),
+                            version=(
+                                "" if not vpkg_version else (" (%s)" % vpkg_version)
+                            ),
+                            name=vpkg_name,
+                        ),
+                    )
                err.append(error)
            raise MicromambaException(
                msg.format(