ob-metaflow 2.12.10.1rc2__py2.py3-none-any.whl → 2.12.11.0__py2.py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of ob-metaflow might be problematic.

Files changed (45)
  1. metaflow/client/core.py +6 -6
  2. metaflow/client/filecache.py +16 -3
  3. metaflow/cmd/develop/stub_generator.py +62 -47
  4. metaflow/datastore/content_addressed_store.py +1 -1
  5. metaflow/datastore/task_datastore.py +1 -1
  6. metaflow/decorators.py +2 -4
  7. metaflow/extension_support/__init__.py +3 -3
  8. metaflow/extension_support/plugins.py +3 -3
  9. metaflow/metaflow_config.py +35 -18
  10. metaflow/parameters.py +3 -3
  11. metaflow/plugins/airflow/airflow.py +6 -6
  12. metaflow/plugins/airflow/airflow_utils.py +5 -3
  13. metaflow/plugins/argo/argo_workflows.py +555 -192
  14. metaflow/plugins/argo/argo_workflows_cli.py +27 -4
  15. metaflow/plugins/argo/argo_workflows_decorator.py +6 -13
  16. metaflow/plugins/argo/capture_error.py +70 -0
  17. metaflow/plugins/argo/daemon.py +59 -0
  18. metaflow/plugins/aws/step_functions/step_functions.py +3 -3
  19. metaflow/plugins/cards/card_modules/basic.py +5 -3
  20. metaflow/plugins/cards/card_modules/convert_to_native_type.py +2 -2
  21. metaflow/plugins/cards/card_modules/renderer_tools.py +1 -0
  22. metaflow/plugins/cards/card_modules/test_cards.py +0 -2
  23. metaflow/plugins/datastores/gs_storage.py +3 -10
  24. metaflow/plugins/datatools/s3/s3op.py +5 -3
  25. metaflow/plugins/kubernetes/kubernetes.py +1 -0
  26. metaflow/plugins/kubernetes/kubernetes_job.py +32 -42
  27. metaflow/plugins/kubernetes/kubernetes_jobsets.py +16 -14
  28. metaflow/plugins/logs_cli.py +1 -0
  29. metaflow/plugins/pypi/conda_environment.py +1 -3
  30. metaflow/plugins/pypi/pip.py +3 -3
  31. metaflow/plugins/storage_executor.py +1 -5
  32. metaflow/plugins/tag_cli.py +3 -3
  33. metaflow/procpoll.py +1 -1
  34. metaflow/runtime.py +1 -0
  35. metaflow/tracing/__init__.py +0 -5
  36. metaflow/tracing/tracing_modules.py +1 -4
  37. metaflow/util.py +6 -6
  38. metaflow/version.py +1 -1
  39. {ob_metaflow-2.12.10.1rc2.dist-info → ob_metaflow-2.12.11.0.dist-info}/METADATA +2 -2
  40. {ob_metaflow-2.12.10.1rc2.dist-info → ob_metaflow-2.12.11.0.dist-info}/RECORD +44 -43
  41. metaflow/tracing/threadpool.py +0 -30
  42. {ob_metaflow-2.12.10.1rc2.dist-info → ob_metaflow-2.12.11.0.dist-info}/LICENSE +0 -0
  43. {ob_metaflow-2.12.10.1rc2.dist-info → ob_metaflow-2.12.11.0.dist-info}/WHEEL +0 -0
  44. {ob_metaflow-2.12.10.1rc2.dist-info → ob_metaflow-2.12.11.0.dist-info}/entry_points.txt +0 -0
  45. {ob_metaflow-2.12.10.1rc2.dist-info → ob_metaflow-2.12.11.0.dist-info}/top_level.txt +0 -0

metaflow/plugins/argo/argo_workflows_cli.py
@@ -5,11 +5,14 @@ import re
 import sys
 from hashlib import sha1

-from metaflow import Run, JSONType, current, decorators, parameters
-from metaflow.client.core import get_metadata
-from metaflow.exception import MetaflowNotFound
+from metaflow import JSONType, Run, current, decorators, parameters
 from metaflow._vendor import click
-from metaflow.exception import MetaflowException, MetaflowInternalError
+from metaflow.client.core import get_metadata
+from metaflow.exception import (
+    MetaflowException,
+    MetaflowInternalError,
+    MetaflowNotFound,
+)
 from metaflow.metaflow_config import (
     ARGO_WORKFLOWS_UI_URL,
     KUBERNETES_NAMESPACE,
@@ -167,6 +170,12 @@ def argo_workflows(obj, name=None):
     default="",
     help="PagerDuty Events API V2 Integration key for workflow success/failure notifications.",
 )
+@click.option(
+    "--enable-heartbeat-daemon/--no-enable-heartbeat-daemon",
+    default=False,
+    show_default=True,
+    help="Use a daemon container to broadcast heartbeats.",
+)
 @click.option(
     "--deployer-attribute-file",
     default=None,
@@ -175,6 +184,12 @@ def argo_workflows(obj, name=None):
     help="Write the workflow name to the file specified. Used internally for Metaflow's Deployer API.",
     hidden=True,
 )
+@click.option(
+    "--enable-error-msg-capture/--no-enable-error-msg-capture",
+    default=False,
+    show_default=True,
+    help="Capture stack trace of first failed task in exit hook.",
+)
 @click.pass_obj
 def create(
     obj,
@@ -192,7 +207,9 @@ def create(
     notify_on_success=False,
     notify_slack_webhook_url=None,
     notify_pager_duty_integration_key=None,
+    enable_heartbeat_daemon=True,
     deployer_attribute_file=None,
+    enable_error_msg_capture=False,
 ):
     validate_tags(tags)

@@ -240,6 +257,8 @@ def create(
         notify_on_success,
         notify_slack_webhook_url,
         notify_pager_duty_integration_key,
+        enable_heartbeat_daemon,
+        enable_error_msg_capture,
     )

     if only_json:
@@ -412,6 +431,8 @@ def make_flow(
     notify_on_success,
     notify_slack_webhook_url,
     notify_pager_duty_integration_key,
+    enable_heartbeat_daemon,
+    enable_error_msg_capture,
 ):
     # TODO: Make this check less specific to Amazon S3 as we introduce
     # support for more cloud object stores.
@@ -474,6 +495,8 @@ def make_flow(
         notify_on_success=notify_on_success,
         notify_slack_webhook_url=notify_slack_webhook_url,
         notify_pager_duty_integration_key=notify_pager_duty_integration_key,
+        enable_heartbeat_daemon=enable_heartbeat_daemon,
+        enable_error_msg_capture=enable_error_msg_capture,
     )

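Taken together, the argo_workflows_cli.py hunks above add two deploy-time toggles: --enable-heartbeat-daemon and --enable-error-msg-capture. A minimal, hedged usage sketch (the flow file name below is a placeholder, not something taken from this diff):

    # Hedged sketch: redeploy a flow with both new toggles switched on.
    import subprocess

    subprocess.run(
        [
            "python", "example_flow.py", "argo-workflows", "create",
            "--enable-heartbeat-daemon",    # run a daemon container that broadcasts heartbeats
            "--enable-error-msg-capture",   # capture the first failed task's stack trace in the exit hook
        ],
        check=True,
    )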

metaflow/plugins/argo/argo_workflows_decorator.py
@@ -54,7 +54,7 @@ class ArgoWorkflowsInternalDecorator(StepDecorator):
                         "_", 1
                     )[
                         0
-                    ]  # infer type from env var key
+                    ],  # infer type from env var key
                     # Add more event metadata here in the future
                 }
             )
@@ -108,18 +108,12 @@ class ArgoWorkflowsInternalDecorator(StepDecorator):
         # we run pods with a security context. We work around this constraint by
         # mounting an emptyDir volume.
         if graph[step_name].type == "foreach":
-            # A DAGNode is considered a `parallel_step` if it is annotated by the @parallel decorator.
-            # A DAGNode is considered a `parallel_foreach` if it contains a `num_parallel` kwarg provided to the
-            # `next` method of that DAGNode.
-            # At this moment in the code we care if a node is marked as a `parallel_foreach` so that we can pass down the
-            # value of `num_parallel` to the subsequent steps.
-            # For @parallel, the implmentation uses 1 jobset object. That one jobset
-            # object internally creates 'num_parallel' jobs. So, we set foreach_num_splits
-            # to 1 here for @parallel. The parallelism of jobset is handled in
-            # kubernetes_job.py.
             if graph[step_name].parallel_foreach:
+                # If a node is marked as a `parallel_foreach`, pass down the value of
+                # `num_parallel` to the subsequent steps.
                 with open("/mnt/out/num_parallel", "w") as f:
                     json.dump(flow._parallel_ubf_iter.num_parallel, f)
+                # Set splits to 1 since parallelism is handled by JobSet.
                 flow._foreach_num_splits = 1
                 with open("/mnt/out/task_id_entropy", "w") as file:
                     import uuid
@@ -131,10 +125,9 @@ class ArgoWorkflowsInternalDecorator(StepDecorator):
             with open("/mnt/out/split_cardinality", "w") as file:
                 json.dump(flow._foreach_num_splits, file)

-        # for steps that have a `@parallel` decorator set to them, we will be relying on Jobsets
+        # For steps that have a `@parallel` decorator set to them, we will be relying on Jobsets
         # to run the task. In this case, we cannot set anything in the
-        # `/mnt/out` directory, since such form of output mounts are not available to jobset execution as
-        # argo just treats it like A K8s resource that it throws in the cluster.
+        # `/mnt/out` directory, since such form of output mounts are not available to Jobset executions.
         if not graph[step_name].parallel_step:
             # Unfortunately, we can't always use pod names as task-ids since the pod names
             # are not static across retries. We write the task-id to a file that is read

metaflow/plugins/argo/capture_error.py (new file)
@@ -0,0 +1,70 @@
+import json
+import os
+from datetime import datetime
+
+###
+# Algorithm to determine 1st error:
+# ignore the failures where message = ""
+# group the failures via templateName
+# sort each group by finishedAt
+# find the group for which the last finishedAt is earliest
+# if the earliest message is "No more retries left" then
+# get the n-1th message from that group
+# else
+# return the last message.
+###
+
+
+def parse_workflow_failures():
+    failures = json.loads(
+        json.loads(os.getenv("METAFLOW_ARGO_WORKFLOW_FAILURES", "[]"), strict=False),
+        strict=False,
+    )
+    return [wf for wf in failures if wf.get("message")]
+
+
+def group_failures_by_template(failures):
+    groups = {}
+    for failure in failures:
+        groups.setdefault(failure["templateName"], []).append(failure)
+    return groups
+
+
+def sort_by_finished_at(items):
+    return sorted(
+        items, key=lambda x: datetime.strptime(x["finishedAt"], "%Y-%m-%dT%H:%M:%SZ")
+    )
+
+
+def find_earliest_last_finished_group(groups):
+    return min(
+        groups,
+        key=lambda k: datetime.strptime(
+            groups[k][-1]["finishedAt"], "%Y-%m-%dT%H:%M:%SZ"
+        ),
+    )
+
+
+def determine_first_error():
+    failures = parse_workflow_failures()
+    if not failures:
+        return None
+
+    grouped_failures = group_failures_by_template(failures)
+    for group in grouped_failures.values():
+        group.sort(
+            key=lambda x: datetime.strptime(x["finishedAt"], "%Y-%m-%dT%H:%M:%SZ")
+        )
+
+    earliest_group = grouped_failures[
+        find_earliest_last_finished_group(grouped_failures)
+    ]
+
+    if earliest_group[-1]["message"] == "No more retries left":
+        return earliest_group[-2]
+    return earliest_group[-1]
+
+
+if __name__ == "__main__":
+    first_err = determine_first_error()
+    print(json.dumps(first_err, indent=2))
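To make the selection rule in capture_error.py concrete, here is a hedged, self-contained illustration. The field names (templateName, finishedAt, message) come from the code above; the records and the scenario are fabricated, and the import assumes this wheel is installed:

    import json
    import os

    from metaflow.plugins.argo.capture_error import determine_first_error

    # Fabricated Argo failure records (only the keys used above are populated).
    sample = [
        {"templateName": "start", "finishedAt": "2024-07-01T10:00:00Z", "message": "Error (exit code 1)"},
        {"templateName": "start", "finishedAt": "2024-07-01T10:05:00Z", "message": "No more retries left"},
        {"templateName": "train", "finishedAt": "2024-07-01T10:30:00Z", "message": "OOMKilled"},
    ]
    # parse_workflow_failures() applies json.loads twice, so encode the value twice here.
    os.environ["METAFLOW_ARGO_WORKFLOW_FAILURES"] = json.dumps(json.dumps(sample))

    # The "start" group finished failing earliest and ends with "No more retries left",
    # so the record just before it is reported as the first error.
    print(determine_first_error()["message"])  # -> "Error (exit code 1)"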

metaflow/plugins/argo/daemon.py (new file)
@@ -0,0 +1,59 @@
+from collections import namedtuple
+from time import sleep
+from metaflow.metaflow_config import DEFAULT_METADATA
+from metaflow.metaflow_environment import MetaflowEnvironment
+from metaflow.plugins import METADATA_PROVIDERS
+from metaflow._vendor import click
+
+
+class CliState:
+    pass
+
+
+@click.group()
+@click.option("--flow_name", required=True)
+@click.option("--run_id", required=True)
+@click.option(
+    "--tag",
+    "tags",
+    multiple=True,
+    default=None,
+    help="Annotate all objects produced by Argo Workflows runs "
+    "with the given tag. You can specify this option multiple "
+    "times to attach multiple tags.",
+)
+@click.pass_context
+def cli(ctx, flow_name, run_id, tags=None):
+    ctx.obj = CliState()
+    ctx.obj.flow_name = flow_name
+    ctx.obj.run_id = run_id
+    ctx.obj.tags = tags
+    # Use a dummy flow to initialize the environment and metadata service,
+    # as we only need a name for the flow object.
+    flow = namedtuple("DummyFlow", "name")
+    dummyflow = flow(flow_name)
+
+    # Initialize a proper metadata service instance
+    environment = MetaflowEnvironment(dummyflow)
+
+    ctx.obj.metadata = [m for m in METADATA_PROVIDERS if m.TYPE == DEFAULT_METADATA][0](
+        environment, dummyflow, None, None
+    )
+
+
+@cli.command(help="start heartbeat process for a run")
+@click.pass_obj
+def heartbeat(obj):
+    # Try to register a run in case the start task has not taken care of it yet.
+    obj.metadata.register_run_id(obj.run_id, obj.tags)
+    # Start run heartbeat
+    obj.metadata.start_run_heartbeat(obj.flow_name, obj.run_id)
+    # Keepalive loop
+    while True:
+        # Do not pollute daemon logs with anything unnecessary,
+        # as they might be extremely long running.
+        sleep(10)
+
+
+if __name__ == "__main__":
+    cli()
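daemon.py is a standalone click CLI. A hedged sketch of launching its heartbeat command out of band (the flow name and run id are placeholders; the actual wiring into the generated Argo workflow happens in argo_workflows.py, which this page does not show):

    # Launch the heartbeat daemon as a background process.
    import subprocess
    import sys

    proc = subprocess.Popen(
        [
            sys.executable, "-m", "metaflow.plugins.argo.daemon",
            "--flow_name", "ExampleFlow",
            "--run_id", "argo-exampleflow-abc123",
            "heartbeat",
        ]
    )
    # heartbeat registers the run id if needed, starts the run heartbeat, then sleeps
    # in a loop, so Popen (rather than run) keeps it alive for as long as needed.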

metaflow/plugins/aws/step_functions/step_functions.py
@@ -664,9 +664,9 @@ class StepFunctions(object):
             # input to those descendent tasks. We set and propagate the
             # task ids pointing to split_parents through every state.
             if any(self.graph[n].type == "foreach" for n in node.in_funcs):
-                attrs[
-                    "split_parent_task_id_%s.$" % node.split_parents[-1]
-                ] = "$.SplitParentTaskId"
+                attrs["split_parent_task_id_%s.$" % node.split_parents[-1]] = (
+                    "$.SplitParentTaskId"
+                )
                 for parent in node.split_parents[:-1]:
                     if self.graph[parent].type == "foreach":
                         attrs["split_parent_task_id_%s.$" % parent] = (

metaflow/plugins/cards/card_modules/basic.py
@@ -26,9 +26,11 @@ def transform_flow_graph(step_info):
         graph_dict[stepname] = {
             "type": node_to_type(step_info[stepname]["type"]),
             "box_next": step_info[stepname]["type"] not in ("linear", "join"),
-            "box_ends": None
-            if "matching_join" not in step_info[stepname]
-            else step_info[stepname]["matching_join"],
+            "box_ends": (
+                None
+                if "matching_join" not in step_info[stepname]
+                else step_info[stepname]["matching_join"]
+            ),
             "next": step_info[stepname]["next"],
             "doc": step_info[stepname]["doc"],
         }

metaflow/plugins/cards/card_modules/convert_to_native_type.py
@@ -314,8 +314,8 @@ class TaskToDict:
         # If there is any form of TypeError or ValueError we set the column value to "Unsupported Type"
         # We also set columns which are have null values to "null" strings
         time_format = "%Y-%m-%dT%H:%M:%S%Z"
-        truncate_long_objects = (
-            lambda x: x.astype("string").str.slice(0, 30) + "..."
+        truncate_long_objects = lambda x: (
+            x.astype("string").str.slice(0, 30) + "..."
             if len(x) > 0 and x.astype("string").str.len().max() > 30
             else x.astype("string")
         )
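The hunk above is purely a Black reformat of the truncation helper; its behavior is unchanged. A small, hedged illustration of what the lambda does to a column (pandas assumed available, values invented):

    import pandas as pd

    truncate_long_objects = lambda x: (
        x.astype("string").str.slice(0, 30) + "..."
        if len(x) > 0 and x.astype("string").str.len().max() > 30
        else x.astype("string")
    )

    col = pd.Series(["a" * 45, "short value"])
    # A column containing any value longer than 30 characters is truncated wholesale.
    print(truncate_long_objects(col).tolist())
    # ['aaaaaaaaaaaaaaaaaaaaaaaaaaaaaa...', 'short value...']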

metaflow/plugins/cards/card_modules/renderer_tools.py
@@ -40,6 +40,7 @@ def render_safely(func):
     This is a decorator that can be added to any `MetaflowCardComponent.render`
     The goal is to render subcomponents safely and ensure that they are JSON serializable.
     """
+
     # expects a renderer func
     def ret_func(self, *args, **kwargs):
         return _render_component_safely(self, func, True, *args, **kwargs)

metaflow/plugins/cards/card_modules/test_cards.py
@@ -138,7 +138,6 @@ class TestJSONComponent(MetaflowCardComponent):


 class TestRefreshCard(MetaflowCard):
-
     """
     This card takes no components and helps test the `current.card.refresh(data)` interface.
     """
@@ -178,7 +177,6 @@ def _component_values_to_hash(components):


 class TestRefreshComponentCard(MetaflowCard):
-
     """
     This card takes components and helps test the `current.card.components["A"].update()`
     interface

metaflow/plugins/datastores/gs_storage.py
@@ -8,7 +8,6 @@ from tempfile import mkdtemp

 from metaflow.datastore.datastore_storage import DataStoreStorage, CloseAfterUse
 from metaflow.exception import MetaflowInternalError
-from metaflow.tracing import traced, tracing
 from metaflow.metaflow_config import (
     DATASTORE_SYSROOT_GS,
     ARTIFACT_LOCALROOT,
@@ -71,14 +70,12 @@ class _GSRootClient(object):
         """Drives GSStorage.is_file()"""
         try:
             blob = self.get_blob_client(path)
-            with traced("exists", dict(path=path)):
-                result = blob.exists()
+            result = blob.exists()

             return result
         except Exception as e:
             process_gs_exception(e)

-    @tracing
     def list_content_single(self, path):
         """Drives GSStorage.list_content()"""

@@ -107,7 +104,6 @@ class _GSRootClient(object):
         except Exception as e:
             process_gs_exception(e)

-    @tracing
     def save_bytes_single(
         self,
         path_tmpfile_metadata_triple,
@@ -123,12 +119,10 @@ class _GSRootClient(object):
             blob.metadata = {"metaflow-user-attributes": json.dumps(metadata)}
             from google.cloud.storage.retry import DEFAULT_RETRY

-            with traced("upload_blob", dict(path=path)):
-                blob.upload_from_filename(tmpfile, retry=DEFAULT_RETRY)
+            blob.upload_from_filename(tmpfile, retry=DEFAULT_RETRY)
         except Exception as e:
             process_gs_exception(e)

-    @tracing
     def load_bytes_single(self, tmpdir, key):
         """Drives GSStorage.load_bytes()"""
         tmp_filename = os.path.join(tmpdir, str(uuid.uuid4()))
@@ -142,8 +136,7 @@ class _GSRootClient(object):
                 metaflow_user_attributes = json.loads(
                     blob.metadata["metaflow-user-attributes"]
                 )
-            with traced("download_blob", dict(path=key)):
-                blob.download_to_filename(tmp_filename)
+            blob.download_to_filename(tmp_filename)
         except google.api_core.exceptions.NotFound:
             tmp_filename = None
         return key, tmp_filename, metaflow_user_attributes

metaflow/plugins/datatools/s3/s3op.py
@@ -1119,9 +1119,11 @@ def get(
                     str(url.idx),
                     url_quote(url.prefix).decode(encoding="utf-8"),
                     url_quote(url.url).decode(encoding="utf-8"),
-                    url_quote(url.range).decode(encoding="utf-8")
-                    if url.range
-                    else "<norange>",
+                    (
+                        url_quote(url.range).decode(encoding="utf-8")
+                        if url.range
+                        else "<norange>"
+                    ),
                 ]
             )
             + "\n"

metaflow/plugins/kubernetes/kubernetes.py
@@ -299,6 +299,7 @@ class Kubernetes(object):

         jobset.environment_variables_from_selectors(
             {
+                "METAFLOW_KUBERNETES_NAMESPACE": "metadata.namespace",
                 "METAFLOW_KUBERNETES_POD_NAMESPACE": "metadata.namespace",
                 "METAFLOW_KUBERNETES_POD_NAME": "metadata.name",
                 "METAFLOW_KUBERNETES_POD_ID": "metadata.uid",

metaflow/plugins/kubernetes/kubernetes_job.py
@@ -5,7 +5,6 @@ import random
 import sys
 import time

-from metaflow.tracing import inject_tracing_vars, tracing, traced
 from metaflow.exception import MetaflowException
 from metaflow.metaflow_config import KUBERNETES_SECRETS
 from metaflow.tracing import inject_tracing_vars
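The kubernetes.py hunk above, and the kubernetes_job.py hunk further below, add METAFLOW_KUBERNETES_NAMESPACE to the environment variables populated from Kubernetes field selectors. For reference, a single downward-API environment variable built directly with the official kubernetes Python client looks roughly like this (a sketch, not code from this diff):

    from kubernetes import client

    # The pod's own namespace is resolved at runtime via the downward API.
    env_var = client.V1EnvVar(
        name="METAFLOW_KUBERNETES_NAMESPACE",
        value_from=client.V1EnvVarSource(
            field_ref=client.V1ObjectFieldSelector(field_path="metadata.namespace")
        ),
    )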
@@ -99,13 +98,16 @@ class KubernetesJob(object):
                     containers=[
                         client.V1Container(
                             command=self._kwargs["command"],
-                            ports=[]
-                            if self._kwargs["port"] is None
-                            else [
-                                client.V1ContainerPort(
-                                    container_port=int(self._kwargs["port"])
-                                )
-                            ],
+                            termination_message_policy="FallbackToLogsOnError",
+                            ports=(
+                                []
+                                if self._kwargs["port"] is None
+                                else [
+                                    client.V1ContainerPort(
+                                        container_port=int(self._kwargs["port"])
+                                    )
+                                ]
+                            ),
                             env=[
                                 client.V1EnvVar(name=k, value=str(v))
                                 for k, v in self._kwargs.get(
@@ -125,6 +127,7 @@ class KubernetesJob(object):
                                     ),
                                 )
                                 for k, v in {
+                                    "METAFLOW_KUBERNETES_NAMESPACE": "metadata.namespace",
                                     "METAFLOW_KUBERNETES_POD_NAMESPACE": "metadata.namespace",
                                     "METAFLOW_KUBERNETES_POD_NAME": "metadata.name",
                                     "METAFLOW_KUBERNETES_POD_ID": "metadata.uid",
@@ -257,7 +260,6 @@ class KubernetesJob(object):
                             if self._kwargs["persistent_volume_claims"] is not None
                             else []
                         ),
-                        # TODO (savin): Set termination_message_policy
                     ),
                 ),
             )
@@ -300,19 +302,13 @@ class KubernetesJob(object):
         # achieve the guarantees that we are seeking.
         # https://github.com/kubernetes/enhancements/issues/1040
         # Hopefully, we will be able to get creative with kube-batch
-
-        with traced(
-            "submit_kubernetes_job",
-            {"job": "%s/%s" % (self._job.metadata.name, self._kwargs["namespace"])},
-        ):
-
-            response = (
-                client.BatchV1Api()
-                .create_namespaced_job(
-                    body=self._job, namespace=self._kwargs["namespace"]
-                )
-                .to_dict()
+        response = (
+            client.BatchV1Api()
+            .create_namespaced_job(
+                body=self._job, namespace=self._kwargs["namespace"]
             )
+            .to_dict()
+        )
         return RunningJob(
             client=self._client,
             name=response["metadata"]["name"],
@@ -441,14 +437,11 @@ class RunningJob(object):
     def _fetch_job(self):
         client = self._client.get()
         try:
-            with traced(
-                "fetch_kubernetes_job", {"job": "%s/%s" % (self._name, self._namespace)}
-            ):
-                return (
-                    client.BatchV1Api()
-                    .read_namespaced_job(name=self._name, namespace=self._namespace)
-                    .to_dict()
-                )
+            return (
+                client.BatchV1Api()
+                .read_namespaced_job(name=self._name, namespace=self._namespace)
+                .to_dict()
+            )
         except client.rest.ApiException as e:
             if e.status == 404:
                 raise KubernetesJobException(
@@ -460,20 +453,17 @@ class RunningJob(object):
     def _fetch_pod(self):
         # Fetch pod metadata.
         client = self._client.get()
-        with traced(
-            "fetch_kubernetes_pod", {"job": "%s/%s" % (self._name, self._namespace)}
-        ):
-            pods = (
-                client.CoreV1Api()
-                .list_namespaced_pod(
-                    namespace=self._namespace,
-                    label_selector="job-name={}".format(self._name),
-                )
-                .to_dict()["items"]
+        pods = (
+            client.CoreV1Api()
+            .list_namespaced_pod(
+                namespace=self._namespace,
+                label_selector="job-name={}".format(self._name),
             )
-            if pods:
-                return pods[0]
-            return {}
+            .to_dict()["items"]
+        )
+        if pods:
+            return pods[0]
+        return {}

     def kill(self):
         # Terminating a Kubernetes job is a bit tricky. Issuing a

metaflow/plugins/kubernetes/kubernetes_jobsets.py
@@ -52,8 +52,6 @@ def k8s_retry(deadline_seconds=60, max_backoff=32):
     return decorator


-CONTROL_JOB_NAME = "control"
-
 JobsetStatus = namedtuple(
     "JobsetStatus",
     [
@@ -586,13 +584,18 @@ class JobSetSpec(object):
                             containers=[
                                 client.V1Container(
                                     command=self._kwargs["command"],
-                                    ports=[]
-                                    if self._kwargs["port"] is None
-                                    else [
-                                        client.V1ContainerPort(
-                                            container_port=int(self._kwargs["port"])
-                                        )
-                                    ],
+                                    termination_message_policy="FallbackToLogsOnError",
+                                    ports=(
+                                        []
+                                        if self._kwargs["port"] is None
+                                        else [
+                                            client.V1ContainerPort(
+                                                container_port=int(
+                                                    self._kwargs["port"]
+                                                )
+                                            )
+                                        ]
+                                    ),
                                     env=[
                                         client.V1EnvVar(name=k, value=str(v))
                                         for k, v in self._kwargs.get(
@@ -756,7 +759,6 @@ class JobSetSpec(object):
                                     is not None
                                     else []
                                 ),
-                                # TODO (savin): Set termination_message_policy
                             ),
                         ),
                     ),
@@ -790,14 +792,14 @@ class KubernetesJobSet(object):

         self._jobset_control_addr = _make_domain_name(
             name,
-            CONTROL_JOB_NAME,
+            "control",
             0,
             0,
             namespace,
         )

         self._control_spec = JobSetSpec(
-            client.get(), name=CONTROL_JOB_NAME, namespace=namespace, **kwargs
+            client.get(), name="control", namespace=namespace, **kwargs
         )
         self._worker_spec = JobSetSpec(
             client.get(), name="worker", namespace=namespace, **kwargs
@@ -918,14 +920,14 @@ class KubernetesArgoJobSet(object):

         self._jobset_control_addr = _make_domain_name(
             name,
-            CONTROL_JOB_NAME,
+            "control",
             0,
             0,
             namespace,
         )

         self._control_spec = JobSetSpec(
-            kubernetes_sdk, name=CONTROL_JOB_NAME, namespace=namespace, **kwargs
+            kubernetes_sdk, name="control", namespace=namespace, **kwargs
         )
         self._worker_spec = JobSetSpec(
             kubernetes_sdk, name="worker", namespace=namespace, **kwargs

metaflow/plugins/logs_cli.py
@@ -7,6 +7,7 @@ from ..datastore import TaskDataStoreSet, TaskDataStore

 from ..mflog import mflog, LOG_SOURCES

+
 # main motivation from https://github.com/pallets/click/issues/430
 # in order to support a default command being called for a Click group.
 #

metaflow/plugins/pypi/conda_environment.py
@@ -301,9 +301,7 @@ class CondaEnvironment(MetaflowEnvironment):
             lambda f: lambda obj: (
                 {k: f(f)(v) for k, v in sorted(obj.items())}
                 if isinstance(obj, dict)
-                else sorted([f(f)(e) for e in obj])
-                if isinstance(obj, list)
-                else obj
+                else sorted([f(f)(e) for e in obj]) if isinstance(obj, list) else obj
             )
         )


metaflow/plugins/pypi/pip.py
@@ -121,9 +121,9 @@ class Pip(object):
                     res["url"] = "{vcs}+{url}@{commit_id}{subdir_str}".format(
                         **vcs_info,
                         **res,
-                        subdir_str="#subdirectory=%s" % subdirectory
-                        if subdirectory
-                        else ""
+                        subdir_str=(
+                            "#subdirectory=%s" % subdirectory if subdirectory else ""
+                        )
                     )
                     # used to deduplicate the storage location in case wheel does not
                     # build with enough unique identifiers.

metaflow/plugins/storage_executor.py
@@ -6,7 +6,6 @@ import sys
 from concurrent.futures import ProcessPoolExecutor, ThreadPoolExecutor

 from metaflow.exception import MetaflowException
-from metaflow.tracing import TracedThreadPoolExecutor

 if sys.version_info[:2] < (3, 7):
     # in 3.6, Only BrokenProcessPool exists (there is no BrokenThreadPool)
@@ -133,10 +132,7 @@ class StorageExecutor(object):
                 msg="Cannot use ProcessPoolExecutor because Python version is older than 3.7 and multiprocess start method has been set to something other than 'spawn'"
             )
         else:
-            self._executor = TracedThreadPoolExecutor(
-                "StorageExecutor", max_workers=threadpool_max_workers
-            )
-            # self._executor = ThreadPoolExecutor(max_workers=threadpool_max_workers)
+            self._executor = ThreadPoolExecutor(max_workers=threadpool_max_workers)

     def warm_up(self):
         # warm up at least one process or thread in the pool.