metaflow 2.11.1__py2.py3-none-any.whl → 2.11.3__py2.py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
- metaflow/flowspec.py +7 -3
- metaflow/metaflow_config.py +11 -1
- metaflow/parameters.py +6 -0
- metaflow/plugins/argo/argo_workflows.py +101 -23
- metaflow/plugins/aws/batch/batch.py +2 -0
- metaflow/plugins/aws/batch/batch_client.py +10 -2
- metaflow/plugins/aws/step_functions/dynamo_db_client.py +28 -6
- metaflow/plugins/aws/step_functions/production_token.py +1 -1
- metaflow/plugins/aws/step_functions/step_functions.py +219 -4
- metaflow/plugins/aws/step_functions/step_functions_cli.py +104 -6
- metaflow/plugins/aws/step_functions/step_functions_client.py +8 -3
- metaflow/plugins/aws/step_functions/step_functions_decorator.py +1 -1
- metaflow/plugins/cards/card_cli.py +2 -2
- metaflow/plugins/kubernetes/kubernetes.py +2 -0
- metaflow/plugins/kubernetes/kubernetes_cli.py +3 -0
- metaflow/plugins/kubernetes/kubernetes_client.py +10 -2
- metaflow/plugins/kubernetes/kubernetes_decorator.py +17 -0
- metaflow/plugins/kubernetes/kubernetes_job.py +27 -0
- metaflow/plugins/pypi/bootstrap.py +1 -1
- metaflow/plugins/pypi/conda_decorator.py +21 -1
- metaflow/plugins/pypi/conda_environment.py +21 -4
- metaflow/version.py +1 -1
- {metaflow-2.11.1.dist-info → metaflow-2.11.3.dist-info}/METADATA +2 -2
- {metaflow-2.11.1.dist-info → metaflow-2.11.3.dist-info}/RECORD +28 -28
- {metaflow-2.11.1.dist-info → metaflow-2.11.3.dist-info}/LICENSE +0 -0
- {metaflow-2.11.1.dist-info → metaflow-2.11.3.dist-info}/WHEEL +0 -0
- {metaflow-2.11.1.dist-info → metaflow-2.11.3.dist-info}/entry_points.txt +0 -0
- {metaflow-2.11.1.dist-info → metaflow-2.11.3.dist-info}/top_level.txt +0 -0
metaflow/flowspec.py
CHANGED
@@ -17,7 +17,7 @@ from .exception import (
 )
 from .graph import FlowGraph
 from .unbounded_foreach import UnboundedForeachInput
-from .metaflow_config import INCLUDE_FOREACH_STACK
+from .metaflow_config import INCLUDE_FOREACH_STACK, MAXIMUM_FOREACH_VALUE_CHARS
 
 # For Python 3 compatibility
 try:
@@ -28,6 +28,8 @@ except NameError:
 
 from .datastore.inputs import Inputs
 
+INTERNAL_ARTIFACTS_SET = set(["_foreach_values"])
+
 
 class InvalidNextException(MetaflowException):
     headline = "Invalid self.next() transition detected"
@@ -446,7 +448,9 @@ class FlowSpec(object):
             available_vars = (
                 (var, sha)
                 for var, sha in inp._datastore.items()
-                if (var not in exclude)
+                if (var not in exclude)
+                and (not hasattr(self, var))
+                and (var not in INTERNAL_ARTIFACTS_SET)
             )
             for var, sha in available_vars:
                 _, previous_sha = to_merge.setdefault(var, (inp, sha))
@@ -529,7 +533,7 @@ class FlowSpec(object):
         )
 
         value = item if _is_primitive_type(item) else reprlib.Repr().repr(item)
-        return basestring(value)
+        return basestring(value)[:MAXIMUM_FOREACH_VALUE_CHARS]
 
     def next(self, *dsts: Callable[..., None], **kwargs) -> None:
         """
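The net effect of these flowspec.py changes: internal artifacts such as _foreach_values are skipped when merging inputs, and the foreach value recorded in each ForeachFrame is capped at MAXIMUM_FOREACH_VALUE_CHARS characters (30 by default, see metaflow_config.py below). A minimal standalone sketch of the truncation logic; _is_primitive_type is approximated here with an isinstance check:

import reprlib

MAXIMUM_FOREACH_VALUE_CHARS = 30  # default from metaflow_config.py

def truncated_foreach_value(item):
    # Non-primitive items are summarized with reprlib so large objects do not
    # bloat task metadata; the resulting string is then capped.
    is_primitive = isinstance(item, (int, float, str, bool, type(None)))
    value = item if is_primitive else reprlib.Repr().repr(item)
    return str(value)[:MAXIMUM_FOREACH_VALUE_CHARS]

print(truncated_foreach_value({"key": "x" * 100}))  # at most 30 characters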
metaflow/metaflow_config.py
CHANGED
@@ -205,6 +205,8 @@ DEFAULT_CONTAINER_REGISTRY = from_conf("DEFAULT_CONTAINER_REGISTRY")
 # Controls whether to include foreach stack information in metadata.
 # TODO(Darin, 05/01/24): Remove this flag once we are confident with this feature.
 INCLUDE_FOREACH_STACK = from_conf("INCLUDE_FOREACH_STACK", False)
+# Maximum length of the foreach value string to be stored in each ForeachFrame.
+MAXIMUM_FOREACH_VALUE_CHARS = from_conf("MAXIMUM_FOREACH_VALUE_CHARS", 30)
 
 ###
 # Organization customizations
@@ -268,7 +270,13 @@ SFN_STATE_MACHINE_PREFIX = from_conf("SFN_STATE_MACHINE_PREFIX")
 # machine execution logs. This needs to be available when using the
 # `step-functions create --log-execution-history` command.
 SFN_EXECUTION_LOG_GROUP_ARN = from_conf("SFN_EXECUTION_LOG_GROUP_ARN")
-
+# Amazon S3 path for storing the results of AWS Step Functions Distributed Map
+SFN_S3_DISTRIBUTED_MAP_OUTPUT_PATH = from_conf(
+    "SFN_S3_DISTRIBUTED_MAP_OUTPUT_PATH",
+    os.path.join(DATASTORE_SYSROOT_S3, "sfn_distributed_map_output")
+    if DATASTORE_SYSROOT_S3
+    else None,
+)
 ###
 # Kubernetes configuration
 ###
@@ -299,6 +307,8 @@ KUBERNETES_CONTAINER_REGISTRY = from_conf(
 )
 # Toggle for trying to fetch EC2 instance metadata
 KUBERNETES_FETCH_EC2_METADATA = from_conf("KUBERNETES_FETCH_EC2_METADATA", False)
+# Shared memory in MB to use for this step
+KUBERNETES_SHARED_MEMORY = from_conf("KUBERNETES_SHARED_MEMORY", None)
 
 ARGO_WORKFLOWS_KUBERNETES_SECRETS = from_conf("ARGO_WORKFLOWS_KUBERNETES_SECRETS", "")
 ARGO_WORKFLOWS_ENV_VARS_TO_SKIP = from_conf("ARGO_WORKFLOWS_ENV_VARS_TO_SKIP", "")
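All three new options go through from_conf, so they can be overridden before a run or deployment via the Metaflow config file or environment variables (from_conf conventionally reads the METAFLOW_-prefixed name). A hypothetical sketch of overriding them with environment variables:

import os

# Hypothetical overrides; the variable names assume the usual METAFLOW_ prefix.
os.environ["METAFLOW_MAXIMUM_FOREACH_VALUE_CHARS"] = "60"
os.environ["METAFLOW_KUBERNETES_SHARED_MEMORY"] = "1024"  # MB
os.environ["METAFLOW_SFN_S3_DISTRIBUTED_MAP_OUTPUT_PATH"] = (
    "s3://my-bucket/metaflow/sfn_distributed_map_output"
)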
metaflow/parameters.py
CHANGED
@@ -331,6 +331,12 @@ class Parameter(object):
                 "for string parameters." % name
             )
 
+    def __repr__(self):
+        return "metaflow.Parameter(name=%s, kwargs=%s)" % (name, kwargs)
+
+    def __str__(self):
+        return "metaflow.Parameter(name=%s, kwargs=%s)" % (name, kwargs)
+
     def option_kwargs(self, deploy_mode):
         kwargs = self.kwargs
         if isinstance(kwargs.get("default"), DeployTimeField) and not deploy_mode:
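The added __repr__ and __str__ give Parameter objects a readable form when they are printed or logged. For reference, Parameter objects are typically declared as flow-level attributes, as in this hypothetical flow (standard Metaflow usage, not part of this diff):

from metaflow import FlowSpec, Parameter, step

class TrainFlow(FlowSpec):
    # A hypothetical flow; printing self.alpha exercises the Parameter machinery.
    alpha = Parameter("alpha", help="Learning rate", default=0.01)

    @step
    def start(self):
        print("alpha =", self.alpha)
        self.next(self.end)

    @step
    def end(self):
        pass

if __name__ == "__main__":
    TrainFlow()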
metaflow/plugins/argo/argo_workflows.py
CHANGED
@@ -227,8 +227,8 @@ class ArgoWorkflows(object):
 
         return schedule_deleted, sensor_deleted, workflow_deleted
 
-    @
-    def terminate(flow_name, name):
+    @classmethod
+    def terminate(cls, flow_name, name):
         client = ArgoClient(namespace=KUBERNETES_NAMESPACE)
 
         response = client.terminate_workflow(name)
@@ -1368,6 +1368,9 @@ class ArgoWorkflows(object):
             tmpfs_size = resources["tmpfs_size"]
             tmpfs_path = resources["tmpfs_path"]
             tmpfs_tempdir = resources["tmpfs_tempdir"]
+            # Set shared_memory to 0 if it isn't specified. This results
+            # in Kubernetes using it's default value when the pod is created.
+            shared_memory = resources.get("shared_memory", 0)
 
             tmpfs_enabled = use_tmpfs or (tmpfs_size and not use_tmpfs)
 
@@ -1412,6 +1415,7 @@ class ArgoWorkflows(object):
                     medium="Memory",
                     size_limit=tmpfs_size if tmpfs_enabled else 0,
                 )
+                .empty_dir_volume("dhsm", medium="Memory", size_limit=shared_memory)
                 .pvc_volumes(resources.get("persistent_volume_claims"))
                 # Set node selectors
                 .node_selectors(resources.get("node_selector"))
@@ -1505,6 +1509,17 @@ class ArgoWorkflows(object):
                         if tmpfs_enabled
                         else []
                     )
+                    # Support shared_memory
+                    + (
+                        [
+                            kubernetes_sdk.V1VolumeMount(
+                                name="dhsm",
+                                mount_path="/dev/shm",
+                            )
+                        ]
+                        if shared_memory
+                        else []
+                    )
                     # Support persistent volume claims.
                     + (
                         [
@@ -1525,7 +1540,6 @@ class ArgoWorkflows(object):
 
     # Return exit hook templates for workflow execution notifications.
     def _exit_hook_templates(self):
-        # TODO: Add details to slack message
         templates = []
         if self.notify_on_error:
             templates.append(self._slack_error_template())
@@ -1634,36 +1648,100 @@ class ArgoWorkflows(object):
 
         return links
 
+    def _get_slack_blocks(self, message):
+        """
+        Use Slack's Block Kit to add general information about the environment and
+        execution metadata, including a link to the UI and an optional message.
+        """
+        ui_link = "%s%s/argo-{{workflow.name}}" % (UI_URL, self.flow.name)
+        # fmt: off
+        if getattr(current, "project_name", None):
+            # Add @project metadata when available.
+            environment_details_block = {
+                "type": "section",
+                "text": {
+                    "type": "mrkdwn",
+                    "text": ":metaflow: Environment details"
+                },
+                "fields": [
+                    {
+                        "type": "mrkdwn",
+                        "text": "*Project:* %s" % current.project_name
+                    },
+                    {
+                        "type": "mrkdwn",
+                        "text": "*Project Branch:* %s" % current.branch_name
+                    }
+                ]
+            }
+        else:
+            environment_details_block = {
+                "type": "section",
+                "text": {
+                    "type": "mrkdwn",
+                    "text": ":metaflow: Environment details"
+                }
+            }
+
+        blocks = [
+            environment_details_block,
+            {
+                "type": "context",
+                "elements": [
+                    {
+                        "type": "mrkdwn",
+                        "text": " :information_source: *<%s>*" % ui_link,
+                    }
+                ],
+            },
+            {
+                "type": "divider"
+            },
+        ]
+
+        if message:
+            blocks += [
+                {
+                    "type": "section",
+                    "text": {
+                        "type": "mrkdwn",
+                        "text": message
+                    }
+                }
+            ]
+        # fmt: on
+        return blocks
+
     def _slack_error_template(self):
         if self.notify_slack_webhook_url is None:
             return None
+
+        message = (
+            ":rotating_light: _%s/argo-{{workflow.name}}_ failed!" % self.flow.name
+        )
+        payload = {"text": message}
+        if UI_URL:
+            blocks = self._get_slack_blocks(message)
+            payload = {"text": message, "blocks": blocks}
+
         return Template("notify-slack-on-error").http(
-            Http("POST")
-            .url(self.notify_slack_webhook_url)
-            .body(
-                json.dumps(
-                    {
-                        "text": ":rotating_light: _%s/argo-{{workflow.name}}_ failed!"
-                        % self.flow.name
-                    }
-                )
-            )
+            Http("POST").url(self.notify_slack_webhook_url).body(json.dumps(payload))
         )
 
     def _slack_success_template(self):
         if self.notify_slack_webhook_url is None:
             return None
+
+        message = (
+            ":white_check_mark: _%s/argo-{{workflow.name}}_ succeeded!" % self.flow.name
+        )
+        payload = {"text": message}
+        if UI_URL:
+            blocks = self._get_slack_blocks(message)
+            payload = {"text": message, "blocks": blocks}
+
         return Template("notify-slack-on-success").http(
-            Http("POST")
-            .url(self.notify_slack_webhook_url)
-            .body(
-                json.dumps(
-                    {
-                        "text": ":white_check_mark: _%s/argo-{{workflow.name}}_ succeeded!"
-                        % self.flow.name
-                    }
-                )
-            )
+            Http("POST").url(self.notify_slack_webhook_url).body(json.dumps(payload))
         )
 
     def _compile_sensor(self):
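The reworked notification templates post a Slack Block Kit payload (built by the new _get_slack_blocks helper) instead of a bare text message whenever UI_URL is configured. A rough sketch of the payload shape; the flow name, workflow id, and UI URL below are purely illustrative:

import json

message = ":rotating_light: _MyFlow/argo-myflow-abc123_ failed!"  # illustrative
payload = {
    "text": message,
    "blocks": [
        # Environment details section (project fields appear only with @project)
        {"type": "section",
         "text": {"type": "mrkdwn", "text": ":metaflow: Environment details"}},
        # Link back to the Metaflow UI for this Argo workflow
        {"type": "context",
         "elements": [{"type": "mrkdwn",
                       "text": " :information_source: *<https://ui.example.com/MyFlow/argo-myflow-abc123>*"}]},
        {"type": "divider"},
        # The original notification text, repeated as a section block
        {"type": "section", "text": {"type": "mrkdwn", "text": message}},
    ],
}
print(json.dumps(payload, indent=2))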
metaflow/plugins/aws/batch/batch.py
CHANGED
@@ -11,6 +11,7 @@ from metaflow.plugins.datatools.s3.s3tail import S3Tail
 from metaflow.plugins.aws.aws_utils import sanitize_batch_tag
 from metaflow.exception import MetaflowException
 from metaflow.metaflow_config import (
+    OTEL_ENDPOINT,
     SERVICE_INTERNAL_URL,
     DATATOOLS_S3ROOT,
     DATASTORE_SYSROOT_S3,
@@ -255,6 +256,7 @@ class Batch(object):
             .environment_variable("METAFLOW_DEFAULT_DATASTORE", "s3")
             .environment_variable("METAFLOW_DEFAULT_METADATA", DEFAULT_METADATA)
             .environment_variable("METAFLOW_CARD_S3ROOT", CARD_S3ROOT)
+            .environment_variable("METAFLOW_OTEL_ENDPOINT", OTEL_ENDPOINT)
             .environment_variable("METAFLOW_RUNTIME_ENVIRONMENT", "aws-batch")
         )
 
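With this change, tasks launched on AWS Batch also receive METAFLOW_OTEL_ENDPOINT in their environment. A minimal sketch of reading it from inside a running task, assuming OTEL_ENDPOINT was configured:

import os

# Inside a task container on AWS Batch; unset if OTEL_ENDPOINT is not configured.
otel_endpoint = os.environ.get("METAFLOW_OTEL_ENDPOINT")
if otel_endpoint:
    print("Exporting telemetry to", otel_endpoint)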
metaflow/plugins/aws/batch/batch_client.py
CHANGED
@@ -271,7 +271,7 @@ class BatchJob(object):
                 {
                     "containerPath": "/dev/neuron{}".format(i),
                     "hostPath": "/dev/neuron{}".format(i),
-                    "permissions": ["
+                    "permissions": ["READ", "WRITE"],
                 }
             )
 
@@ -344,7 +344,15 @@ class BatchJob(object):
                     "Invalid efa value: ({}) (should be 0 or greater)".format(efa)
                 )
             else:
-
+                if "linuxParameters" not in job_definition["containerProperties"]:
+                    job_definition["containerProperties"]["linuxParameters"] = {}
+                if (
+                    "devices"
+                    not in job_definition["containerProperties"]["linuxParameters"]
+                ):
+                    job_definition["containerProperties"]["linuxParameters"][
+                        "devices"
+                    ] = []
                 if (num_parallel or 0) > 1:
                     # Multi-node parallel jobs require the container path and permissions explicitly specified in Job definition
                     for i in range(int(efa)):
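The efa branch now makes sure containerProperties.linuxParameters.devices exists before device mappings are appended. A compact sketch of an equivalent guard using setdefault, with an illustrative device entry (the paths below are placeholders, not taken from this diff):

job_definition = {"containerProperties": {}}

# Ensure the nested structure exists before appending device mappings,
# equivalent to the two membership checks added above.
devices = (
    job_definition["containerProperties"]
    .setdefault("linuxParameters", {})
    .setdefault("devices", [])
)
devices.append(
    {
        "containerPath": "/dev/infiniband/uverbs0",  # placeholder device path
        "hostPath": "/dev/infiniband/uverbs0",
        "permissions": ["READ", "WRITE"],
    }
)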
metaflow/plugins/aws/step_functions/dynamo_db_client.py
CHANGED
@@ -1,5 +1,8 @@
 import os
+import time
+
 import requests
+
 from metaflow.metaflow_config import SFN_DYNAMO_DB_TABLE
 
 
@@ -25,12 +28,31 @@ class DynamoDbClient(object):
     def save_parent_task_id_for_foreach_join(
         self, foreach_split_task_id, foreach_join_parent_task_id
     ):
-
-
-
-
-
-
+        ex = None
+        for attempt in range(10):
+            try:
+                return self._client.update_item(
+                    TableName=self.name,
+                    Key={"pathspec": {"S": foreach_split_task_id}},
+                    UpdateExpression="ADD parent_task_ids_for_foreach_join :val",
+                    ExpressionAttributeValues={
+                        ":val": {"SS": [foreach_join_parent_task_id]}
+                    },
+                )
+            except self._client.exceptions.ClientError as error:
+                ex = error
+                if (
+                    error.response["Error"]["Code"]
+                    == "ProvisionedThroughputExceededException"
+                ):
+                    # hopefully, enough time for AWS to scale up! otherwise
+                    # ensure sufficient on-demand throughput for dynamo db
+                    # is provisioned ahead of time
+                    sleep_time = min((2**attempt) * 10, 60)
+                    time.sleep(sleep_time)
+                else:
+                    raise
+        raise ex
 
     def get_parent_task_ids_for_foreach_join(self, foreach_split_task_id):
         response = self._client.get_item(