ob-metaflow 2.12.30.2__py2.py3-none-any.whl → 2.13.6.1__py2.py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of ob-metaflow might be problematic. Click here for more details.
- metaflow/__init__.py +3 -0
- metaflow/cards.py +1 -0
- metaflow/cli.py +185 -717
- metaflow/cli_args.py +17 -0
- metaflow/cli_components/__init__.py +0 -0
- metaflow/cli_components/dump_cmd.py +96 -0
- metaflow/cli_components/init_cmd.py +51 -0
- metaflow/cli_components/run_cmds.py +362 -0
- metaflow/cli_components/step_cmd.py +176 -0
- metaflow/cli_components/utils.py +140 -0
- metaflow/cmd/develop/stub_generator.py +9 -2
- metaflow/datastore/flow_datastore.py +2 -2
- metaflow/decorators.py +63 -2
- metaflow/exception.py +8 -2
- metaflow/extension_support/plugins.py +42 -27
- metaflow/flowspec.py +176 -23
- metaflow/graph.py +28 -27
- metaflow/includefile.py +50 -22
- metaflow/lint.py +35 -20
- metaflow/metadata_provider/heartbeat.py +23 -8
- metaflow/metaflow_config.py +10 -1
- metaflow/multicore_utils.py +31 -14
- metaflow/package.py +17 -3
- metaflow/parameters.py +97 -25
- metaflow/plugins/__init__.py +22 -0
- metaflow/plugins/airflow/airflow.py +18 -17
- metaflow/plugins/airflow/airflow_cli.py +1 -0
- metaflow/plugins/argo/argo_client.py +0 -2
- metaflow/plugins/argo/argo_workflows.py +195 -132
- metaflow/plugins/argo/argo_workflows_cli.py +1 -1
- metaflow/plugins/argo/argo_workflows_decorator.py +2 -4
- metaflow/plugins/argo/argo_workflows_deployer_objects.py +51 -9
- metaflow/plugins/argo/jobset_input_paths.py +0 -1
- metaflow/plugins/aws/aws_utils.py +6 -1
- metaflow/plugins/aws/batch/batch_client.py +1 -3
- metaflow/plugins/aws/batch/batch_decorator.py +13 -13
- metaflow/plugins/aws/secrets_manager/aws_secrets_manager_secrets_provider.py +13 -10
- metaflow/plugins/aws/step_functions/dynamo_db_client.py +0 -3
- metaflow/plugins/aws/step_functions/production_token.py +1 -1
- metaflow/plugins/aws/step_functions/step_functions.py +33 -1
- metaflow/plugins/aws/step_functions/step_functions_cli.py +1 -1
- metaflow/plugins/aws/step_functions/step_functions_decorator.py +0 -1
- metaflow/plugins/aws/step_functions/step_functions_deployer_objects.py +7 -9
- metaflow/plugins/cards/card_cli.py +7 -2
- metaflow/plugins/cards/card_creator.py +1 -0
- metaflow/plugins/cards/card_decorator.py +79 -8
- metaflow/plugins/cards/card_modules/basic.py +56 -5
- metaflow/plugins/cards/card_modules/card.py +16 -1
- metaflow/plugins/cards/card_modules/components.py +64 -16
- metaflow/plugins/cards/card_modules/main.js +27 -25
- metaflow/plugins/cards/card_modules/test_cards.py +4 -4
- metaflow/plugins/cards/component_serializer.py +1 -1
- metaflow/plugins/datatools/s3/s3.py +12 -4
- metaflow/plugins/datatools/s3/s3op.py +3 -3
- metaflow/plugins/events_decorator.py +338 -186
- metaflow/plugins/kubernetes/kube_utils.py +84 -1
- metaflow/plugins/kubernetes/kubernetes.py +40 -92
- metaflow/plugins/kubernetes/kubernetes_cli.py +32 -7
- metaflow/plugins/kubernetes/kubernetes_decorator.py +76 -4
- metaflow/plugins/kubernetes/kubernetes_job.py +23 -20
- metaflow/plugins/kubernetes/kubernetes_jobsets.py +41 -20
- metaflow/plugins/kubernetes/spot_metadata_cli.py +69 -0
- metaflow/plugins/kubernetes/spot_monitor_sidecar.py +109 -0
- metaflow/plugins/parallel_decorator.py +4 -1
- metaflow/plugins/project_decorator.py +33 -5
- metaflow/plugins/pypi/bootstrap.py +249 -81
- metaflow/plugins/pypi/conda_decorator.py +20 -10
- metaflow/plugins/pypi/conda_environment.py +83 -27
- metaflow/plugins/pypi/micromamba.py +82 -37
- metaflow/plugins/pypi/pip.py +9 -6
- metaflow/plugins/pypi/pypi_decorator.py +11 -9
- metaflow/plugins/pypi/utils.py +4 -2
- metaflow/plugins/timeout_decorator.py +2 -2
- metaflow/runner/click_api.py +240 -50
- metaflow/runner/deployer.py +1 -1
- metaflow/runner/deployer_impl.py +12 -11
- metaflow/runner/metaflow_runner.py +68 -34
- metaflow/runner/nbdeploy.py +2 -0
- metaflow/runner/nbrun.py +1 -1
- metaflow/runner/subprocess_manager.py +61 -10
- metaflow/runner/utils.py +208 -44
- metaflow/runtime.py +216 -112
- metaflow/sidecar/sidecar_worker.py +1 -1
- metaflow/tracing/tracing_modules.py +4 -1
- metaflow/user_configs/__init__.py +0 -0
- metaflow/user_configs/config_decorators.py +563 -0
- metaflow/user_configs/config_options.py +548 -0
- metaflow/user_configs/config_parameters.py +436 -0
- metaflow/util.py +22 -0
- metaflow/version.py +1 -1
- {ob_metaflow-2.12.30.2.dist-info → ob_metaflow-2.13.6.1.dist-info}/METADATA +12 -3
- {ob_metaflow-2.12.30.2.dist-info → ob_metaflow-2.13.6.1.dist-info}/RECORD +96 -84
- {ob_metaflow-2.12.30.2.dist-info → ob_metaflow-2.13.6.1.dist-info}/WHEEL +1 -1
- {ob_metaflow-2.12.30.2.dist-info → ob_metaflow-2.13.6.1.dist-info}/LICENSE +0 -0
- {ob_metaflow-2.12.30.2.dist-info → ob_metaflow-2.13.6.1.dist-info}/entry_points.txt +0 -0
- {ob_metaflow-2.12.30.2.dist-info → ob_metaflow-2.13.6.1.dist-info}/top_level.txt +0 -0
|
@@ -1,17 +1,16 @@
|
|
|
1
|
-
import copy
|
|
2
1
|
import json
|
|
3
2
|
import math
|
|
4
3
|
import random
|
|
5
|
-
import sys
|
|
6
4
|
import time
|
|
7
5
|
|
|
8
6
|
from metaflow.exception import MetaflowException
|
|
9
7
|
from metaflow.metaflow_config import KUBERNETES_SECRETS
|
|
10
8
|
from metaflow.tracing import inject_tracing_vars
|
|
11
|
-
from metaflow.unbounded_foreach import UBF_CONTROL, UBF_TASK
|
|
12
9
|
from metaflow.metaflow_config_funcs import init_config
|
|
13
10
|
|
|
14
11
|
CLIENT_REFRESH_INTERVAL_SECONDS = 300
|
|
12
|
+
|
|
13
|
+
from .kube_utils import qos_requests_and_limits
|
|
15
14
|
from .kubernetes_jobsets import (
|
|
16
15
|
KubernetesJobSet,
|
|
17
16
|
) # We need this import for Kubernetes Client.
|
|
@@ -75,8 +74,14 @@ class KubernetesJob(object):
|
|
|
75
74
|
if self._kwargs["shared_memory"]
|
|
76
75
|
else None
|
|
77
76
|
)
|
|
77
|
+
qos_requests, qos_limits = qos_requests_and_limits(
|
|
78
|
+
self._kwargs["qos"],
|
|
79
|
+
self._kwargs["cpu"],
|
|
80
|
+
self._kwargs["memory"],
|
|
81
|
+
self._kwargs["disk"],
|
|
82
|
+
)
|
|
78
83
|
initial_configs = init_config()
|
|
79
|
-
for entry in ["OBP_PERIMETER", "
|
|
84
|
+
for entry in ["OBP_PERIMETER", "OBP_INTEGRATIONS_URL"]:
|
|
80
85
|
if entry not in initial_configs:
|
|
81
86
|
raise KubernetesJobException(
|
|
82
87
|
f"{entry} was not found in metaflow config. Please make sure to run `outerbounds configure <...>` command which can be found on the Ourebounds UI or reach out to your Outerbounds support team."
|
|
@@ -84,8 +89,8 @@ class KubernetesJob(object):
|
|
|
84
89
|
|
|
85
90
|
additional_obp_configs = {
|
|
86
91
|
"OBP_PERIMETER": initial_configs["OBP_PERIMETER"],
|
|
87
|
-
"
|
|
88
|
-
"
|
|
92
|
+
"OBP_INTEGRATIONS_URL": initial_configs[
|
|
93
|
+
"OBP_INTEGRATIONS_URL"
|
|
89
94
|
],
|
|
90
95
|
}
|
|
91
96
|
|
|
@@ -176,20 +181,18 @@ class KubernetesJob(object):
|
|
|
176
181
|
image_pull_policy=self._kwargs["image_pull_policy"],
|
|
177
182
|
name=self._kwargs["step_name"].replace("_", "-"),
|
|
178
183
|
resources=client.V1ResourceRequirements(
|
|
179
|
-
requests={
|
|
180
|
-
"cpu": str(self._kwargs["cpu"]),
|
|
181
|
-
"memory": "%sM" % str(self._kwargs["memory"]),
|
|
182
|
-
"ephemeral-storage": "%sM"
|
|
183
|
-
% str(self._kwargs["disk"]),
|
|
184
|
-
},
|
|
184
|
+
requests=qos_requests,
|
|
185
185
|
limits={
|
|
186
|
-
|
|
187
|
-
|
|
188
|
-
|
|
189
|
-
|
|
190
|
-
|
|
191
|
-
|
|
192
|
-
|
|
186
|
+
**qos_limits,
|
|
187
|
+
**{
|
|
188
|
+
"%s.com/gpu".lower()
|
|
189
|
+
% self._kwargs["gpu_vendor"]: str(
|
|
190
|
+
self._kwargs["gpu"]
|
|
191
|
+
)
|
|
192
|
+
for k in [0]
|
|
193
|
+
# Don't set GPU limits if gpu isn't specified.
|
|
194
|
+
if self._kwargs["gpu"] is not None
|
|
195
|
+
},
|
|
193
196
|
},
|
|
194
197
|
),
|
|
195
198
|
volume_mounts=(
|
|
@@ -445,7 +448,7 @@ class RunningJob(object):
|
|
|
445
448
|
def best_effort_kill():
|
|
446
449
|
try:
|
|
447
450
|
self.kill()
|
|
448
|
-
except Exception
|
|
451
|
+
except Exception:
|
|
449
452
|
pass
|
|
450
453
|
|
|
451
454
|
atexit.register(best_effort_kill)
|
|
@@ -1,4 +1,3 @@
|
|
|
1
|
-
import copy
|
|
2
1
|
import json
|
|
3
2
|
import math
|
|
4
3
|
import random
|
|
@@ -7,7 +6,8 @@ from collections import namedtuple
|
|
|
7
6
|
from metaflow.exception import MetaflowException
|
|
8
7
|
from metaflow.metaflow_config import KUBERNETES_JOBSET_GROUP, KUBERNETES_JOBSET_VERSION
|
|
9
8
|
from metaflow.tracing import inject_tracing_vars
|
|
10
|
-
|
|
9
|
+
|
|
10
|
+
from .kube_utils import qos_requests_and_limits
|
|
11
11
|
|
|
12
12
|
|
|
13
13
|
class KubernetesJobsetException(MetaflowException):
|
|
@@ -255,7 +255,7 @@ class RunningJobSet(object):
|
|
|
255
255
|
def best_effort_kill():
|
|
256
256
|
try:
|
|
257
257
|
self.kill()
|
|
258
|
-
except Exception
|
|
258
|
+
except Exception:
|
|
259
259
|
pass
|
|
260
260
|
|
|
261
261
|
atexit.register(best_effort_kill)
|
|
@@ -340,7 +340,7 @@ class RunningJobSet(object):
|
|
|
340
340
|
stdout=True,
|
|
341
341
|
tty=False,
|
|
342
342
|
)
|
|
343
|
-
except Exception
|
|
343
|
+
except Exception:
|
|
344
344
|
with client.ApiClient() as api_client:
|
|
345
345
|
# If we are unable to kill the control pod then
|
|
346
346
|
# Delete the jobset to kill the subsequent pods.
|
|
@@ -554,7 +554,12 @@ class JobSetSpec(object):
|
|
|
554
554
|
if self._kwargs["shared_memory"]
|
|
555
555
|
else None
|
|
556
556
|
)
|
|
557
|
-
|
|
557
|
+
qos_requests, qos_limits = qos_requests_and_limits(
|
|
558
|
+
self._kwargs["qos"],
|
|
559
|
+
self._kwargs["cpu"],
|
|
560
|
+
self._kwargs["memory"],
|
|
561
|
+
self._kwargs["disk"],
|
|
562
|
+
)
|
|
558
563
|
return dict(
|
|
559
564
|
name=self.name,
|
|
560
565
|
template=client.api_client.ApiClient().sanitize_for_serialization(
|
|
@@ -653,21 +658,18 @@ class JobSetSpec(object):
|
|
|
653
658
|
"_", "-"
|
|
654
659
|
),
|
|
655
660
|
resources=client.V1ResourceRequirements(
|
|
656
|
-
requests={
|
|
657
|
-
"cpu": str(self._kwargs["cpu"]),
|
|
658
|
-
"memory": "%sM"
|
|
659
|
-
% str(self._kwargs["memory"]),
|
|
660
|
-
"ephemeral-storage": "%sM"
|
|
661
|
-
% str(self._kwargs["disk"]),
|
|
662
|
-
},
|
|
661
|
+
requests=qos_requests,
|
|
663
662
|
limits={
|
|
664
|
-
|
|
665
|
-
|
|
666
|
-
|
|
667
|
-
|
|
668
|
-
|
|
669
|
-
|
|
670
|
-
|
|
663
|
+
**qos_limits,
|
|
664
|
+
**{
|
|
665
|
+
"%s.com/gpu".lower()
|
|
666
|
+
% self._kwargs["gpu_vendor"]: str(
|
|
667
|
+
self._kwargs["gpu"]
|
|
668
|
+
)
|
|
669
|
+
for k in [0]
|
|
670
|
+
# Don't set GPU limits if gpu isn't specified.
|
|
671
|
+
if self._kwargs["gpu"] is not None
|
|
672
|
+
},
|
|
671
673
|
},
|
|
672
674
|
),
|
|
673
675
|
volume_mounts=(
|
|
@@ -858,6 +860,16 @@ class KubernetesJobSet(object):
|
|
|
858
860
|
self._annotations = dict(self._annotations, **{name: value})
|
|
859
861
|
return self
|
|
860
862
|
|
|
863
|
+
def labels(self, labels):
|
|
864
|
+
for k, v in labels.items():
|
|
865
|
+
self.label(k, v)
|
|
866
|
+
return self
|
|
867
|
+
|
|
868
|
+
def annotations(self, annotations):
|
|
869
|
+
for k, v in annotations.items():
|
|
870
|
+
self.annotation(k, v)
|
|
871
|
+
return self
|
|
872
|
+
|
|
861
873
|
def secret(self, name):
|
|
862
874
|
self.worker.secret(name)
|
|
863
875
|
self.control.secret(name)
|
|
@@ -983,15 +995,24 @@ class KubernetesArgoJobSet(object):
|
|
|
983
995
|
self._labels = dict(self._labels, **{name: value})
|
|
984
996
|
return self
|
|
985
997
|
|
|
998
|
+
def labels(self, labels):
|
|
999
|
+
for k, v in labels.items():
|
|
1000
|
+
self.label(k, v)
|
|
1001
|
+
return self
|
|
1002
|
+
|
|
986
1003
|
def annotation(self, name, value):
|
|
987
1004
|
self.worker.annotation(name, value)
|
|
988
1005
|
self.control.annotation(name, value)
|
|
989
1006
|
self._annotations = dict(self._annotations, **{name: value})
|
|
990
1007
|
return self
|
|
991
1008
|
|
|
1009
|
+
def annotations(self, annotations):
|
|
1010
|
+
for k, v in annotations.items():
|
|
1011
|
+
self.annotation(k, v)
|
|
1012
|
+
return self
|
|
1013
|
+
|
|
992
1014
|
def dump(self):
|
|
993
1015
|
client = self._kubernetes_sdk
|
|
994
|
-
import json
|
|
995
1016
|
|
|
996
1017
|
data = json.dumps(
|
|
997
1018
|
client.ApiClient().sanitize_for_serialization(
|
|
@@ -0,0 +1,69 @@
|
|
|
1
|
+
from metaflow._vendor import click
|
|
2
|
+
from datetime import datetime, timezone
|
|
3
|
+
from metaflow.tagging_util import validate_tags
|
|
4
|
+
from metaflow.metadata_provider import MetaDatum
|
|
5
|
+
|
|
6
|
+
|
|
7
|
+
@click.group()
|
|
8
|
+
def cli():
|
|
9
|
+
pass
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
@cli.group(help="Commands related to spot metadata.")
|
|
13
|
+
def spot_metadata():
|
|
14
|
+
pass
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
@spot_metadata.command(help="Record spot termination metadata for a task.")
|
|
18
|
+
@click.option(
|
|
19
|
+
"--run-id",
|
|
20
|
+
required=True,
|
|
21
|
+
help="Run ID for which metadata is to be recorded.",
|
|
22
|
+
)
|
|
23
|
+
@click.option(
|
|
24
|
+
"--step-name",
|
|
25
|
+
required=True,
|
|
26
|
+
help="Step Name for which metadata is to be recorded.",
|
|
27
|
+
)
|
|
28
|
+
@click.option(
|
|
29
|
+
"--task-id",
|
|
30
|
+
required=True,
|
|
31
|
+
help="Task ID for which metadata is to be recorded.",
|
|
32
|
+
)
|
|
33
|
+
@click.option(
|
|
34
|
+
"--termination-notice-time",
|
|
35
|
+
required=True,
|
|
36
|
+
help="Spot termination notice time.",
|
|
37
|
+
)
|
|
38
|
+
@click.option(
|
|
39
|
+
"--tag",
|
|
40
|
+
"tags",
|
|
41
|
+
multiple=True,
|
|
42
|
+
required=False,
|
|
43
|
+
default=None,
|
|
44
|
+
help="List of tags.",
|
|
45
|
+
)
|
|
46
|
+
@click.pass_obj
|
|
47
|
+
def record(obj, run_id, step_name, task_id, termination_notice_time, tags=None):
|
|
48
|
+
validate_tags(tags)
|
|
49
|
+
|
|
50
|
+
tag_list = list(tags) if tags else []
|
|
51
|
+
|
|
52
|
+
entries = [
|
|
53
|
+
MetaDatum(
|
|
54
|
+
field="spot-termination-received-at",
|
|
55
|
+
value=datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ"),
|
|
56
|
+
type="spot-termination-received-at",
|
|
57
|
+
tags=tag_list,
|
|
58
|
+
),
|
|
59
|
+
MetaDatum(
|
|
60
|
+
field="spot-termination-time",
|
|
61
|
+
value=termination_notice_time,
|
|
62
|
+
type="spot-termination-time",
|
|
63
|
+
tags=tag_list,
|
|
64
|
+
),
|
|
65
|
+
]
|
|
66
|
+
|
|
67
|
+
obj.metadata.register_metadata(
|
|
68
|
+
run_id=run_id, step_name=step_name, task_id=task_id, metadata=entries
|
|
69
|
+
)
|
|
@@ -0,0 +1,109 @@
|
|
|
1
|
+
import os
|
|
2
|
+
import sys
|
|
3
|
+
import time
|
|
4
|
+
import signal
|
|
5
|
+
import requests
|
|
6
|
+
import subprocess
|
|
7
|
+
from multiprocessing import Process
|
|
8
|
+
from datetime import datetime, timezone
|
|
9
|
+
from metaflow.sidecar import MessageTypes
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
class SpotTerminationMonitorSidecar(object):
|
|
13
|
+
EC2_TYPE_URL = "http://169.254.169.254/latest/meta-data/instance-life-cycle"
|
|
14
|
+
METADATA_URL = "http://169.254.169.254/latest/meta-data/spot/termination-time"
|
|
15
|
+
TOKEN_URL = "http://169.254.169.254/latest/api/token"
|
|
16
|
+
POLL_INTERVAL = 5 # seconds
|
|
17
|
+
|
|
18
|
+
def __init__(self):
|
|
19
|
+
self.is_alive = True
|
|
20
|
+
self._process = None
|
|
21
|
+
self._token = None
|
|
22
|
+
self._token_expiry = 0
|
|
23
|
+
|
|
24
|
+
if self._is_aws_spot_instance():
|
|
25
|
+
self._process = Process(target=self._monitor_loop)
|
|
26
|
+
self._process.start()
|
|
27
|
+
|
|
28
|
+
def process_message(self, msg):
|
|
29
|
+
if msg.msg_type == MessageTypes.SHUTDOWN:
|
|
30
|
+
self.is_alive = False
|
|
31
|
+
if self._process:
|
|
32
|
+
self._process.terminate()
|
|
33
|
+
|
|
34
|
+
@classmethod
|
|
35
|
+
def get_worker(cls):
|
|
36
|
+
return cls
|
|
37
|
+
|
|
38
|
+
def _get_imds_token(self):
|
|
39
|
+
current_time = time.time()
|
|
40
|
+
if current_time >= self._token_expiry - 60: # Refresh 60s before expiry
|
|
41
|
+
try:
|
|
42
|
+
response = requests.put(
|
|
43
|
+
url=self.TOKEN_URL,
|
|
44
|
+
headers={"X-aws-ec2-metadata-token-ttl-seconds": "300"},
|
|
45
|
+
timeout=1,
|
|
46
|
+
)
|
|
47
|
+
if response.status_code == 200:
|
|
48
|
+
self._token = response.text
|
|
49
|
+
self._token_expiry = current_time + 240 # Slightly less than TTL
|
|
50
|
+
except requests.exceptions.RequestException:
|
|
51
|
+
pass
|
|
52
|
+
return self._token
|
|
53
|
+
|
|
54
|
+
def _make_ec2_request(self, url, timeout):
|
|
55
|
+
token = self._get_imds_token()
|
|
56
|
+
headers = {"X-aws-ec2-metadata-token": token} if token else {}
|
|
57
|
+
response = requests.get(url=url, headers=headers, timeout=timeout)
|
|
58
|
+
return response
|
|
59
|
+
|
|
60
|
+
def _is_aws_spot_instance(self):
|
|
61
|
+
try:
|
|
62
|
+
response = self._make_ec2_request(url=self.EC2_TYPE_URL, timeout=1)
|
|
63
|
+
return response.status_code == 200 and response.text == "spot"
|
|
64
|
+
except (requests.exceptions.RequestException, requests.exceptions.Timeout):
|
|
65
|
+
return False
|
|
66
|
+
|
|
67
|
+
def _monitor_loop(self):
|
|
68
|
+
while self.is_alive:
|
|
69
|
+
try:
|
|
70
|
+
response = self._make_ec2_request(url=self.METADATA_URL, timeout=1)
|
|
71
|
+
if response.status_code == 200:
|
|
72
|
+
termination_time = response.text
|
|
73
|
+
self._emit_termination_metadata(termination_time)
|
|
74
|
+
os.kill(os.getppid(), signal.SIGTERM)
|
|
75
|
+
break
|
|
76
|
+
except (requests.exceptions.RequestException, requests.exceptions.Timeout):
|
|
77
|
+
pass
|
|
78
|
+
time.sleep(self.POLL_INTERVAL)
|
|
79
|
+
|
|
80
|
+
def _emit_termination_metadata(self, termination_time):
|
|
81
|
+
flow_filename = os.getenv("METAFLOW_FLOW_FILENAME")
|
|
82
|
+
pathspec = os.getenv("MF_PATHSPEC")
|
|
83
|
+
_, run_id, step_name, task_id = pathspec.split("/")
|
|
84
|
+
retry_count = os.getenv("MF_ATTEMPT")
|
|
85
|
+
|
|
86
|
+
with open("/tmp/spot_termination_notice", "w") as fp:
|
|
87
|
+
fp.write(termination_time)
|
|
88
|
+
|
|
89
|
+
command = [
|
|
90
|
+
sys.executable,
|
|
91
|
+
f"/metaflow/{flow_filename}",
|
|
92
|
+
"spot-metadata",
|
|
93
|
+
"record",
|
|
94
|
+
"--run-id",
|
|
95
|
+
run_id,
|
|
96
|
+
"--step-name",
|
|
97
|
+
step_name,
|
|
98
|
+
"--task-id",
|
|
99
|
+
task_id,
|
|
100
|
+
"--termination-notice-time",
|
|
101
|
+
termination_time,
|
|
102
|
+
"--tag",
|
|
103
|
+
"attempt_id:{}".format(retry_count),
|
|
104
|
+
]
|
|
105
|
+
|
|
106
|
+
result = subprocess.run(command, capture_output=True, text=True)
|
|
107
|
+
|
|
108
|
+
if result.returncode != 0:
|
|
109
|
+
print(f"Failed to record spot termination metadata: {result.stderr}")
|
|
@@ -45,6 +45,8 @@ class ParallelDecorator(StepDecorator):
|
|
|
45
45
|
if ubf_context == UBF_CONTROL:
|
|
46
46
|
num_parallel = cli_args.task.ubf_iter.num_parallel
|
|
47
47
|
cli_args.command_options["num-parallel"] = str(num_parallel)
|
|
48
|
+
if os.environ.get("METAFLOW_RUNTIME_ENVIRONMENT", "local") == "local":
|
|
49
|
+
cli_args.command_options["split_index"] = "0"
|
|
48
50
|
|
|
49
51
|
def step_init(
|
|
50
52
|
self, flow, graph, step_name, decorators, environment, flow_datastore, logger
|
|
@@ -126,6 +128,8 @@ class ParallelDecorator(StepDecorator):
|
|
|
126
128
|
tags=["attempt_id:{0}".format(0)],
|
|
127
129
|
)
|
|
128
130
|
]
|
|
131
|
+
flow._control_task_is_mapper_zero = True
|
|
132
|
+
|
|
129
133
|
metadata.register_metadata(run_id, step_name, task_id, task_metadata_list)
|
|
130
134
|
|
|
131
135
|
def task_decorate(
|
|
@@ -221,7 +225,6 @@ def _local_multinode_control_task_step_func(
|
|
|
221
225
|
"%s/%s/%s" % (run_id, step_name, mapper_task_id)
|
|
222
226
|
for mapper_task_id in mapper_task_ids
|
|
223
227
|
]
|
|
224
|
-
flow._control_task_is_mapper_zero = True
|
|
225
228
|
|
|
226
229
|
# run the step function ourselves
|
|
227
230
|
os.environ["MF_PARALLEL_NODE_INDEX"] = "0"
|
|
@@ -72,7 +72,6 @@ class ProjectDecorator(FlowDecorator):
|
|
|
72
72
|
"""
|
|
73
73
|
|
|
74
74
|
name = "project"
|
|
75
|
-
defaults = {"name": None}
|
|
76
75
|
|
|
77
76
|
options = {
|
|
78
77
|
"production": dict(
|
|
@@ -91,19 +90,48 @@ class ProjectDecorator(FlowDecorator):
|
|
|
91
90
|
),
|
|
92
91
|
}
|
|
93
92
|
|
|
93
|
+
defaults = {"name": None, **{k: v["default"] for k, v in options.items()}}
|
|
94
|
+
|
|
94
95
|
def flow_init(
|
|
95
96
|
self, flow, graph, environment, flow_datastore, metadata, logger, echo, options
|
|
96
97
|
):
|
|
97
98
|
self._option_values = options
|
|
98
99
|
project_name = self.attributes.get("name")
|
|
100
|
+
for op in options:
|
|
101
|
+
if (
|
|
102
|
+
op in self._user_defined_attributes
|
|
103
|
+
and options[op] != self.defaults[op]
|
|
104
|
+
and self.attributes[op] != options[op]
|
|
105
|
+
):
|
|
106
|
+
# Exception if:
|
|
107
|
+
# - the user provides a value in the attributes field
|
|
108
|
+
# - AND the user provided a value in the command line (non default)
|
|
109
|
+
# - AND the values are different
|
|
110
|
+
# Note that this won't raise an error if the user provided the default
|
|
111
|
+
# value in the command line and provided one in attribute but although
|
|
112
|
+
# slightly inconsistent, it is not incorrect.
|
|
113
|
+
raise MetaflowException(
|
|
114
|
+
"You cannot pass %s as both a command-line argument and an attribute "
|
|
115
|
+
"of the @project decorator." % op
|
|
116
|
+
)
|
|
117
|
+
if "branch" in self._user_defined_attributes:
|
|
118
|
+
project_branch = self.attributes["branch"]
|
|
119
|
+
else:
|
|
120
|
+
project_branch = options["branch"]
|
|
121
|
+
|
|
122
|
+
if "production" in self._user_defined_attributes:
|
|
123
|
+
project_production = self.attributes["production"]
|
|
124
|
+
else:
|
|
125
|
+
project_production = options["production"]
|
|
126
|
+
|
|
99
127
|
project_flow_name, branch_name = format_name(
|
|
100
128
|
flow.name,
|
|
101
129
|
project_name,
|
|
102
|
-
|
|
103
|
-
|
|
130
|
+
project_production,
|
|
131
|
+
project_branch,
|
|
104
132
|
get_username(),
|
|
105
133
|
)
|
|
106
|
-
is_user_branch =
|
|
134
|
+
is_user_branch = project_branch is None and not project_production
|
|
107
135
|
echo(
|
|
108
136
|
"Project: *%s*, Branch: *%s*" % (project_name, branch_name),
|
|
109
137
|
fg="magenta",
|
|
@@ -114,7 +142,7 @@ class ProjectDecorator(FlowDecorator):
|
|
|
114
142
|
"project_name": project_name,
|
|
115
143
|
"branch_name": branch_name,
|
|
116
144
|
"is_user_branch": is_user_branch,
|
|
117
|
-
"is_production":
|
|
145
|
+
"is_production": project_production,
|
|
118
146
|
"project_flow_name": project_flow_name,
|
|
119
147
|
}
|
|
120
148
|
)
|