metaflow 2.11.1__py2.py3-none-any.whl → 2.11.3__py2.py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (28)
  1. metaflow/flowspec.py +7 -3
  2. metaflow/metaflow_config.py +11 -1
  3. metaflow/parameters.py +6 -0
  4. metaflow/plugins/argo/argo_workflows.py +101 -23
  5. metaflow/plugins/aws/batch/batch.py +2 -0
  6. metaflow/plugins/aws/batch/batch_client.py +10 -2
  7. metaflow/plugins/aws/step_functions/dynamo_db_client.py +28 -6
  8. metaflow/plugins/aws/step_functions/production_token.py +1 -1
  9. metaflow/plugins/aws/step_functions/step_functions.py +219 -4
  10. metaflow/plugins/aws/step_functions/step_functions_cli.py +104 -6
  11. metaflow/plugins/aws/step_functions/step_functions_client.py +8 -3
  12. metaflow/plugins/aws/step_functions/step_functions_decorator.py +1 -1
  13. metaflow/plugins/cards/card_cli.py +2 -2
  14. metaflow/plugins/kubernetes/kubernetes.py +2 -0
  15. metaflow/plugins/kubernetes/kubernetes_cli.py +3 -0
  16. metaflow/plugins/kubernetes/kubernetes_client.py +10 -2
  17. metaflow/plugins/kubernetes/kubernetes_decorator.py +17 -0
  18. metaflow/plugins/kubernetes/kubernetes_job.py +27 -0
  19. metaflow/plugins/pypi/bootstrap.py +1 -1
  20. metaflow/plugins/pypi/conda_decorator.py +21 -1
  21. metaflow/plugins/pypi/conda_environment.py +21 -4
  22. metaflow/version.py +1 -1
  23. {metaflow-2.11.1.dist-info → metaflow-2.11.3.dist-info}/METADATA +2 -2
  24. {metaflow-2.11.1.dist-info → metaflow-2.11.3.dist-info}/RECORD +28 -28
  25. {metaflow-2.11.1.dist-info → metaflow-2.11.3.dist-info}/LICENSE +0 -0
  26. {metaflow-2.11.1.dist-info → metaflow-2.11.3.dist-info}/WHEEL +0 -0
  27. {metaflow-2.11.1.dist-info → metaflow-2.11.3.dist-info}/entry_points.txt +0 -0
  28. {metaflow-2.11.1.dist-info → metaflow-2.11.3.dist-info}/top_level.txt +0 -0
metaflow/flowspec.py CHANGED
@@ -17,7 +17,7 @@ from .exception import (
  )
  from .graph import FlowGraph
  from .unbounded_foreach import UnboundedForeachInput
- from .metaflow_config import INCLUDE_FOREACH_STACK
+ from .metaflow_config import INCLUDE_FOREACH_STACK, MAXIMUM_FOREACH_VALUE_CHARS

  # For Python 3 compatibility
  try:
@@ -28,6 +28,8 @@ except NameError:

  from .datastore.inputs import Inputs

+ INTERNAL_ARTIFACTS_SET = set(["_foreach_values"])
+

  class InvalidNextException(MetaflowException):
      headline = "Invalid self.next() transition detected"
@@ -446,7 +448,9 @@ class FlowSpec(object):
  available_vars = (
      (var, sha)
      for var, sha in inp._datastore.items()
-     if (var not in exclude) and (not hasattr(self, var))
+     if (var not in exclude)
+     and (not hasattr(self, var))
+     and (var not in INTERNAL_ARTIFACTS_SET)
  )
  for var, sha in available_vars:
      _, previous_sha = to_merge.setdefault(var, (inp, sha))
@@ -529,7 +533,7 @@ class FlowSpec(object):
  )

  value = item if _is_primitive_type(item) else reprlib.Repr().repr(item)
- return basestring(value)
+ return basestring(value)[:MAXIMUM_FOREACH_VALUE_CHARS]

  def next(self, *dsts: Callable[..., None], **kwargs) -> None:
      """
metaflow/metaflow_config.py CHANGED
@@ -205,6 +205,8 @@ DEFAULT_CONTAINER_REGISTRY = from_conf("DEFAULT_CONTAINER_REGISTRY")
  # Controls whether to include foreach stack information in metadata.
  # TODO(Darin, 05/01/24): Remove this flag once we are confident with this feature.
  INCLUDE_FOREACH_STACK = from_conf("INCLUDE_FOREACH_STACK", False)
+ # Maximum length of the foreach value string to be stored in each ForeachFrame.
+ MAXIMUM_FOREACH_VALUE_CHARS = from_conf("MAXIMUM_FOREACH_VALUE_CHARS", 30)

  ###
  # Organization customizations
@@ -268,7 +270,13 @@ SFN_STATE_MACHINE_PREFIX = from_conf("SFN_STATE_MACHINE_PREFIX")
  # machine execution logs. This needs to be available when using the
  # `step-functions create --log-execution-history` command.
  SFN_EXECUTION_LOG_GROUP_ARN = from_conf("SFN_EXECUTION_LOG_GROUP_ARN")
-
+ # Amazon S3 path for storing the results of AWS Step Functions Distributed Map
+ SFN_S3_DISTRIBUTED_MAP_OUTPUT_PATH = from_conf(
+     "SFN_S3_DISTRIBUTED_MAP_OUTPUT_PATH",
+     os.path.join(DATASTORE_SYSROOT_S3, "sfn_distributed_map_output")
+     if DATASTORE_SYSROOT_S3
+     else None,
+ )
  ###
  # Kubernetes configuration
  ###
@@ -299,6 +307,8 @@ KUBERNETES_CONTAINER_REGISTRY = from_conf(
  )
  # Toggle for trying to fetch EC2 instance metadata
  KUBERNETES_FETCH_EC2_METADATA = from_conf("KUBERNETES_FETCH_EC2_METADATA", False)
+ # Shared memory in MB to use for this step
+ KUBERNETES_SHARED_MEMORY = from_conf("KUBERNETES_SHARED_MEMORY", None)

  ARGO_WORKFLOWS_KUBERNETES_SECRETS = from_conf("ARGO_WORKFLOWS_KUBERNETES_SECRETS", "")
  ARGO_WORKFLOWS_ENV_VARS_TO_SKIP = from_conf("ARGO_WORKFLOWS_ENV_VARS_TO_SKIP", "")
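
All three new settings are plain from_conf options, so they should be configurable like any other Metaflow knob; the METAFLOW_-prefixed environment variable names below are inferred from the configuration names above rather than taken from this diff:

    import os

    # Illustrative overrides; set these before metaflow is imported, since
    # metaflow_config.py is evaluated at import time.
    os.environ["METAFLOW_MAXIMUM_FOREACH_VALUE_CHARS"] = "60"  # default: 30
    os.environ["METAFLOW_KUBERNETES_SHARED_MEMORY"] = "512"  # in MB; default: None
    os.environ["METAFLOW_SFN_S3_DISTRIBUTED_MAP_OUTPUT_PATH"] = (
        "s3://example-bucket/metaflow/sfn_distributed_map_output"  # hypothetical bucket
    )

    import metaflow  # configuration above is picked up here
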
metaflow/parameters.py CHANGED
@@ -331,6 +331,12 @@ class Parameter(object):
          "for string parameters." % name
      )

+ def __repr__(self):
+     return "metaflow.Parameter(name=%s, kwargs=%s)" % (name, kwargs)
+
+ def __str__(self):
+     return "metaflow.Parameter(name=%s, kwargs=%s)" % (name, kwargs)
+
  def option_kwargs(self, deploy_mode):
      kwargs = self.kwargs
      if isinstance(kwargs.get("default"), DeployTimeField) and not deploy_mode:
metaflow/plugins/argo/argo_workflows.py CHANGED
@@ -227,8 +227,8 @@ class ArgoWorkflows(object):

      return schedule_deleted, sensor_deleted, workflow_deleted

- @staticmethod
- def terminate(flow_name, name):
+ @classmethod
+ def terminate(cls, flow_name, name):
      client = ArgoClient(namespace=KUBERNETES_NAMESPACE)

      response = client.terminate_workflow(name)
@@ -1368,6 +1368,9 @@
  tmpfs_size = resources["tmpfs_size"]
  tmpfs_path = resources["tmpfs_path"]
  tmpfs_tempdir = resources["tmpfs_tempdir"]
+ # Set shared_memory to 0 if it isn't specified. This results
+ # in Kubernetes using it's default value when the pod is created.
+ shared_memory = resources.get("shared_memory", 0)

  tmpfs_enabled = use_tmpfs or (tmpfs_size and not use_tmpfs)

@@ -1412,6 +1415,7 @@
      medium="Memory",
      size_limit=tmpfs_size if tmpfs_enabled else 0,
  )
+ .empty_dir_volume("dhsm", medium="Memory", size_limit=shared_memory)
  .pvc_volumes(resources.get("persistent_volume_claims"))
  # Set node selectors
  .node_selectors(resources.get("node_selector"))
@@ -1505,6 +1509,17 @@
          if tmpfs_enabled
          else []
      )
+     # Support shared_memory
+     + (
+         [
+             kubernetes_sdk.V1VolumeMount(
+                 name="dhsm",
+                 mount_path="/dev/shm",
+             )
+         ]
+         if shared_memory
+         else []
+     )
      # Support persistent volume claims.
      + (
          [
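
The two hunks above wire a shared_memory resource into the generated Argo template: a Memory-medium emptyDir named "dhsm", sized by the shared_memory value and mounted at /dev/shm. A hedged usage sketch, assuming shared_memory (in MB) is exposed as a @kubernetes decorator argument via the kubernetes_decorator.py change listed in this release:

    from metaflow import FlowSpec, kubernetes, step


    class SharedMemoryFlow(FlowSpec):
        # "shared_memory" mirrors the resources["shared_memory"] key used above;
        # the exact decorator argument name is an assumption, not shown in this hunk.
        @kubernetes(memory=8000, shared_memory=2000)
        @step
        def start(self):
            # DataLoader-style multiprocessing relies on /dev/shm, which is
            # backed by the "dhsm" emptyDir mounted by the template above.
            self.next(self.end)

        @step
        def end(self):
            pass


    if __name__ == "__main__":
        SharedMemoryFlow()
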
@@ -1525,7 +1540,6 @@

  # Return exit hook templates for workflow execution notifications.
  def _exit_hook_templates(self):
-     # TODO: Add details to slack message
      templates = []
      if self.notify_on_error:
          templates.append(self._slack_error_template())
@@ -1634,36 +1648,100 @@

      return links

+ def _get_slack_blocks(self, message):
+     """
+     Use Slack's Block Kit to add general information about the environment and
+     execution metadata, including a link to the UI and an optional message.
+     """
+     ui_link = "%s%s/argo-{{workflow.name}}" % (UI_URL, self.flow.name)
+     # fmt: off
+     if getattr(current, "project_name", None):
+         # Add @project metadata when available.
+         environment_details_block = {
+             "type": "section",
+             "text": {
+                 "type": "mrkdwn",
+                 "text": ":metaflow: Environment details"
+             },
+             "fields": [
+                 {
+                     "type": "mrkdwn",
+                     "text": "*Project:* %s" % current.project_name
+                 },
+                 {
+                     "type": "mrkdwn",
+                     "text": "*Project Branch:* %s" % current.branch_name
+                 }
+             ]
+         }
+     else:
+         environment_details_block = {
+             "type": "section",
+             "text": {
+                 "type": "mrkdwn",
+                 "text": ":metaflow: Environment details"
+             }
+         }
+
+     blocks = [
+         environment_details_block,
+         {
+             "type": "context",
+             "elements": [
+                 {
+                     "type": "mrkdwn",
+                     "text": " :information_source: *<%s>*" % ui_link,
+                 }
+             ],
+         },
+         {
+             "type": "divider"
+         },
+     ]
+
+     if message:
+         blocks += [
+             {
+                 "type": "section",
+                 "text": {
+                     "type": "mrkdwn",
+                     "text": message
+                 }
+             }
+         ]
+     # fmt: on
+     return blocks
+
  def _slack_error_template(self):
      if self.notify_slack_webhook_url is None:
          return None
+
+     message = (
+         ":rotating_light: _%s/argo-{{workflow.name}}_ failed!" % self.flow.name
+     )
+     payload = {"text": message}
+     if UI_URL:
+         blocks = self._get_slack_blocks(message)
+         payload = {"text": message, "blocks": blocks}
+
      return Template("notify-slack-on-error").http(
-         Http("POST")
-         .url(self.notify_slack_webhook_url)
-         .body(
-             json.dumps(
-                 {
-                     "text": ":rotating_light: _%s/argo-{{workflow.name}}_ failed!"
-                     % self.flow.name
-                 }
-             )
-         )
+         Http("POST").url(self.notify_slack_webhook_url).body(json.dumps(payload))
      )

  def _slack_success_template(self):
      if self.notify_slack_webhook_url is None:
          return None
+
+     message = (
+         ":white_check_mark: _%s/argo-{{workflow.name}}_ succeeded!" % self.flow.name
+     )
+     payload = {"text": message}
+     if UI_URL:
+         blocks = self._get_slack_blocks(message)
+         payload = {"text": message, "blocks": blocks}
+
      return Template("notify-slack-on-success").http(
-         Http("POST")
-         .url(self.notify_slack_webhook_url)
-         .body(
-             json.dumps(
-                 {
-                     "text": ":white_check_mark: _%s/argo-{{workflow.name}}_ succeeded!"
-                     % self.flow.name
-                 }
-             )
-         )
+         Http("POST").url(self.notify_slack_webhook_url).body(json.dumps(payload))
      )

  def _compile_sensor(self):
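
With UI_URL configured, the webhook body produced by _slack_error_template now carries Block Kit content in addition to the plain-text fallback. A hand-built sketch of the resulting payload (hypothetical flow name and UI URL; the @project fields are added only when a project is set):

    payload = {
        "text": ":rotating_light: _HelloFlow/argo-{{workflow.name}}_ failed!",
        "blocks": [
            {   # environment details section
                "type": "section",
                "text": {"type": "mrkdwn", "text": ":metaflow: Environment details"},
            },
            {   # link back to the Metaflow UI
                "type": "context",
                "elements": [
                    {
                        "type": "mrkdwn",
                        "text": " :information_source: *<https://ui.example.com/HelloFlow/argo-{{workflow.name}}>*",
                    }
                ],
            },
            {"type": "divider"},
            {   # the failure message itself
                "type": "section",
                "text": {
                    "type": "mrkdwn",
                    "text": ":rotating_light: _HelloFlow/argo-{{workflow.name}}_ failed!",
                },
            },
        ],
    }
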
metaflow/plugins/aws/batch/batch.py CHANGED
@@ -11,6 +11,7 @@ from metaflow.plugins.datatools.s3.s3tail import S3Tail
  from metaflow.plugins.aws.aws_utils import sanitize_batch_tag
  from metaflow.exception import MetaflowException
  from metaflow.metaflow_config import (
+     OTEL_ENDPOINT,
      SERVICE_INTERNAL_URL,
      DATATOOLS_S3ROOT,
      DATASTORE_SYSROOT_S3,
@@ -255,6 +256,7 @@ class Batch(object):
      .environment_variable("METAFLOW_DEFAULT_DATASTORE", "s3")
      .environment_variable("METAFLOW_DEFAULT_METADATA", DEFAULT_METADATA)
      .environment_variable("METAFLOW_CARD_S3ROOT", CARD_S3ROOT)
+     .environment_variable("METAFLOW_OTEL_ENDPOINT", OTEL_ENDPOINT)
      .environment_variable("METAFLOW_RUNTIME_ENVIRONMENT", "aws-batch")
  )

metaflow/plugins/aws/batch/batch_client.py CHANGED
@@ -271,7 +271,7 @@ class BatchJob(object):
  {
      "containerPath": "/dev/neuron{}".format(i),
      "hostPath": "/dev/neuron{}".format(i),
-     "permissions": ["read", "write"],
+     "permissions": ["READ", "WRITE"],
  }
  )

@@ -344,7 +344,15 @@
          "Invalid efa value: ({}) (should be 0 or greater)".format(efa)
      )
  else:
-     job_definition["containerProperties"]["linuxParameters"]["devices"] = []
+     if "linuxParameters" not in job_definition["containerProperties"]:
+         job_definition["containerProperties"]["linuxParameters"] = {}
+     if (
+         "devices"
+         not in job_definition["containerProperties"]["linuxParameters"]
+     ):
+         job_definition["containerProperties"]["linuxParameters"][
+             "devices"
+         ] = []
  if (num_parallel or 0) > 1:
      # Multi-node parallel jobs require the container path and permissions explicitly specified in Job definition
      for i in range(int(efa)):
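
The new else-branch just makes sure the nested linuxParameters/devices keys exist before EFA devices are appended further down; functionally it is equivalent to this shorter sketch:

    container_props = job_definition["containerProperties"]
    container_props.setdefault("linuxParameters", {}).setdefault("devices", [])
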
metaflow/plugins/aws/step_functions/dynamo_db_client.py CHANGED
@@ -1,5 +1,8 @@
  import os
+ import time
+
  import requests
+
  from metaflow.metaflow_config import SFN_DYNAMO_DB_TABLE


@@ -25,12 +28,31 @@ class DynamoDbClient(object):
  def save_parent_task_id_for_foreach_join(
      self, foreach_split_task_id, foreach_join_parent_task_id
  ):
-     return self._client.update_item(
-         TableName=self.name,
-         Key={"pathspec": {"S": foreach_split_task_id}},
-         UpdateExpression="ADD parent_task_ids_for_foreach_join :val",
-         ExpressionAttributeValues={":val": {"SS": [foreach_join_parent_task_id]}},
-     )
+     ex = None
+     for attempt in range(10):
+         try:
+             return self._client.update_item(
+                 TableName=self.name,
+                 Key={"pathspec": {"S": foreach_split_task_id}},
+                 UpdateExpression="ADD parent_task_ids_for_foreach_join :val",
+                 ExpressionAttributeValues={
+                     ":val": {"SS": [foreach_join_parent_task_id]}
+                 },
+             )
+         except self._client.exceptions.ClientError as error:
+             ex = error
+             if (
+                 error.response["Error"]["Code"]
+                 == "ProvisionedThroughputExceededException"
+             ):
+                 # hopefully, enough time for AWS to scale up! otherwise
+                 # ensure sufficient on-demand throughput for dynamo db
+                 # is provisioned ahead of time
+                 sleep_time = min((2**attempt) * 10, 60)
+                 time.sleep(sleep_time)
+             else:
+                 raise
+     raise ex

  def get_parent_task_ids_for_foreach_join(self, foreach_split_task_id):
      response = self._client.get_item(
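
The retry loop above caps the back-off at one minute, so a fully throttled call sleeps 10, 20, 40 and then 60 seconds per remaining attempt before the last error is re-raised. A quick way to check the schedule:

    # Back-off schedule used by save_parent_task_id_for_foreach_join above.
    schedule = [min((2 ** attempt) * 10, 60) for attempt in range(10)]
    print(schedule)       # [10, 20, 40, 60, 60, 60, 60, 60, 60, 60]
    print(sum(schedule))  # 490 seconds (~8 minutes) of sleeping in the worst case
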
metaflow/plugins/aws/step_functions/production_token.py CHANGED
@@ -1,5 +1,5 @@
- import os
  import json
+ import os
  import random
  import string
  import zlib