ob-metaflow 2.12.7.2__py2.py3-none-any.whl → 2.12.9.1__py2.py3-none-any.whl
This diff shows the content of publicly released package versions as they appear in their respective public registries. It is provided for informational purposes only.
Potentially problematic release. This version of ob-metaflow might be problematic.
- metaflow/__init__.py +2 -0
- metaflow/cli.py +12 -4
- metaflow/extension_support/plugins.py +1 -0
- metaflow/flowspec.py +8 -1
- metaflow/lint.py +13 -0
- metaflow/metaflow_current.py +0 -8
- metaflow/plugins/__init__.py +12 -0
- metaflow/plugins/argo/argo_workflows.py +463 -42
- metaflow/plugins/argo/argo_workflows_cli.py +60 -3
- metaflow/plugins/argo/argo_workflows_decorator.py +38 -7
- metaflow/plugins/argo/argo_workflows_deployer.py +290 -0
- metaflow/plugins/argo/jobset_input_paths.py +16 -0
- metaflow/plugins/aws/batch/batch_decorator.py +16 -13
- metaflow/plugins/aws/step_functions/step_functions_cli.py +45 -3
- metaflow/plugins/aws/step_functions/step_functions_deployer.py +251 -0
- metaflow/plugins/cards/card_cli.py +1 -1
- metaflow/plugins/kubernetes/kubernetes.py +279 -52
- metaflow/plugins/kubernetes/kubernetes_cli.py +26 -8
- metaflow/plugins/kubernetes/kubernetes_client.py +0 -1
- metaflow/plugins/kubernetes/kubernetes_decorator.py +56 -44
- metaflow/plugins/kubernetes/kubernetes_job.py +6 -6
- metaflow/plugins/kubernetes/kubernetes_jobsets.py +510 -272
- metaflow/plugins/parallel_decorator.py +108 -8
- metaflow/plugins/secrets/secrets_decorator.py +12 -3
- metaflow/plugins/test_unbounded_foreach_decorator.py +39 -4
- metaflow/runner/deployer.py +386 -0
- metaflow/runner/metaflow_runner.py +1 -20
- metaflow/runner/nbdeploy.py +130 -0
- metaflow/runner/nbrun.py +4 -28
- metaflow/runner/utils.py +49 -0
- metaflow/runtime.py +246 -134
- metaflow/version.py +1 -1
- {ob_metaflow-2.12.7.2.dist-info → ob_metaflow-2.12.9.1.dist-info}/METADATA +2 -2
- {ob_metaflow-2.12.7.2.dist-info → ob_metaflow-2.12.9.1.dist-info}/RECORD +38 -32
- {ob_metaflow-2.12.7.2.dist-info → ob_metaflow-2.12.9.1.dist-info}/WHEEL +1 -1
- {ob_metaflow-2.12.7.2.dist-info → ob_metaflow-2.12.9.1.dist-info}/LICENSE +0 -0
- {ob_metaflow-2.12.7.2.dist-info → ob_metaflow-2.12.9.1.dist-info}/entry_points.txt +0 -0
- {ob_metaflow-2.12.7.2.dist-info → ob_metaflow-2.12.9.1.dist-info}/top_level.txt +0 -0
--- a/metaflow/plugins/argo/argo_workflows.py
+++ b/metaflow/plugins/argo/argo_workflows.py
@@ -4,11 +4,13 @@ import os
 import re
 import shlex
 import sys
+from typing import Tuple, List
 from collections import defaultdict
 from hashlib import sha1
 from math import inf
 
 from metaflow import JSONType, current
+from metaflow.graph import DAGNode
 from metaflow.decorators import flow_decorators
 from metaflow.exception import MetaflowException
 from metaflow.includefile import FilePathClass
@@ -48,6 +50,7 @@ from metaflow.metaflow_config import (
     UI_URL,
     PAGERDUTY_TEMPLATE_URL,
 )
+from metaflow.unbounded_foreach import UBF_CONTROL, UBF_TASK
 from metaflow.metaflow_config_funcs import config_values
 from metaflow.mflog import BASH_SAVE_LOGS, bash_capture_logs, export_mflog_env_vars
 from metaflow.parameters import deploy_time_eval
@@ -55,6 +58,7 @@ from metaflow.plugins.kubernetes.kubernetes import (
     parse_kube_keyvalue_list,
     validate_kube_labels,
 )
+from metaflow.graph import FlowGraph
 from metaflow.util import (
     compress_list,
     dict_to_cli_options,
@@ -62,6 +66,10 @@ from metaflow.util import (
     to_camelcase,
     to_unicode,
 )
+from metaflow.plugins.kubernetes.kubernetes_jobsets import (
+    KubernetesArgoJobSet,
+)
+
 from .argo_client import ArgoClient
 
 
@@ -82,14 +90,14 @@ class ArgoWorkflowsSchedulingException(MetaflowException):
 # 5. Add Metaflow tags to labels/annotations.
 # 6. Support Multi-cluster scheduling - https://github.com/argoproj/argo-workflows/issues/3523#issuecomment-792307297
 # 7. Support R lang.
-# 8. Ping @savin at slack.outerbounds.co for any feature request
+# 8. Ping @savin at slack.outerbounds.co for any feature request
 
 
 class ArgoWorkflows(object):
     def __init__(
         self,
         name,
-        graph,
+        graph: FlowGraph,
         flow,
         code_package_sha,
         code_package_url,
@@ -852,13 +860,13 @@ class ArgoWorkflows(object):
     # Visit every node and yield the uber DAGTemplate(s).
     def _dag_templates(self):
         def _visit(
-            node,
-
-
-
-
-
-
+            node,
+            exit_node=None,
+            templates=None,
+            dag_tasks=None,
+            parent_foreach=None,
+        ):  # Returns Tuple[List[Template], List[DAGTask]]
+            """ """
             # Every for-each node results in a separate subDAG and an equivalent
             # DAGTemplate rooted at the child of the for-each node. Each DAGTemplate
             # has a unique name - the top-level DAGTemplate is named as the name of
@@ -872,7 +880,6 @@ class ArgoWorkflows(object):
                 templates = []
             if exit_node is not None and exit_node is node.name:
                 return templates, dag_tasks
-
             if node.name == "start":
                 # Start node has no dependencies.
                 dag_task = DAGTask(self._sanitize(node.name)).template(
@@ -881,13 +888,86 @@ class ArgoWorkflows(object):
             elif (
                 node.is_inside_foreach
                 and self.graph[node.in_funcs[0]].type == "foreach"
+                and not self.graph[node.in_funcs[0]].parallel_foreach
+                # We need to distinguish what is a "regular" foreach (i.e something that doesn't care about to gang semantics)
+                # vs what is a "num_parallel" based foreach (i.e. something that follows gang semantics.)
+                # A `regular` foreach is basically any arbitrary kind of foreach.
             ):
                 # Child of a foreach node needs input-paths as well as split-index
                 # This child is the first node of the sub workflow and has no dependency
+
                 parameters = [
                     Parameter("input-paths").value("{{inputs.parameters.input-paths}}"),
                     Parameter("split-index").value("{{inputs.parameters.split-index}}"),
                 ]
+                dag_task = (
+                    DAGTask(self._sanitize(node.name))
+                    .template(self._sanitize(node.name))
+                    .arguments(Arguments().parameters(parameters))
+                )
+            elif node.parallel_step:
+                # This is the step where the @parallel decorator is defined.
+                # Since this DAGTask will call the for the `resource` [based templates]
+                # (https://argo-workflows.readthedocs.io/en/stable/walk-through/kubernetes-resources/)
+                # we have certain constraints on the way we can pass information inside the Jobset manifest
+                # [All templates will have access](https://argo-workflows.readthedocs.io/en/stable/variables/#all-templates)
+                # to the `inputs.parameters` so we will pass down ANY/ALL information using the
+                # input parameters.
+                # We define the usual parameters like input-paths/split-index etc. but we will also
+                # define the following:
+                # - `workerCount`: parameter which will be used to determine the number of
+                #    parallel worker jobs
+                # - `jobset-name`: parameter which will be used to determine the name of the jobset.
+                #    This parameter needs to be dynamic so that when we have retries we don't
+                #    end up using the name of the jobset again (if we do, it will crash since k8s wont allow duplicated job names)
+                # - `retryCount`: parameter which will be used to determine the number of retries
+                #    This parameter will *only* be available within the container templates like we
+                #    have it for all other DAGTasks and NOT for custom kubernetes resource templates.
+                #    So as a work-around, we will set it as the `retryCount` parameter instead of
+                #    setting it as a {{ retries }} in the CLI code. Once set as a input parameter,
+                #    we can use it in the Jobset Manifest templates as `{{inputs.parameters.retryCount}}`
+                # - `task-id-entropy`: This is a parameter which will help derive task-ids and jobset names. This parameter
+                #    contains the relevant amount of entropy to ensure that task-ids and jobset names
+                #    are uniquish. We will also use this in the join task to construct the task-ids of
+                #    all parallel tasks since the task-ids for parallel task are minted formulaically.
+                parameters = [
+                    Parameter("input-paths").value("{{inputs.parameters.input-paths}}"),
+                    Parameter("num-parallel").value(
+                        "{{inputs.parameters.num-parallel}}"
+                    ),
+                    Parameter("split-index").value("{{inputs.parameters.split-index}}"),
+                    Parameter("task-id-entropy").value(
+                        "{{inputs.parameters.task-id-entropy}}"
+                    ),
+                    # we cant just use hyphens with sprig.
+                    # https://github.com/argoproj/argo-workflows/issues/10567#issuecomment-1452410948
+                    Parameter("workerCount").value(
+                        "{{=sprig.int(sprig.sub(sprig.int(inputs.parameters['num-parallel']),1))}}"
+                    ),
+                ]
+                if any(d.name == "retry" for d in node.decorators):
+                    parameters.extend(
+                        [
+                            Parameter("retryCount").value("{{retries}}"),
+                            # The job-setname needs to be unique for each retry
+                            # and we cannot use the `generateName` field in the
+                            # Jobset Manifest since we need to construct the subdomain
+                            # and control pod domain name pre-hand. So we will use
+                            # the retry count to ensure that the jobset name is unique
+                            Parameter("jobset-name").value(
+                                "js-{{inputs.parameters.task-id-entropy}}{{retries}}",
+                            ),
+                        ]
+                    )
+                else:
+                    parameters.extend(
+                        [
+                            Parameter("jobset-name").value(
+                                "js-{{inputs.parameters.task-id-entropy}}",
+                            )
+                        ]
+                    )
+
                 dag_task = (
                     DAGTask(self._sanitize(node.name))
                     .template(self._sanitize(node.name))
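Note on the `workerCount` expression above: it is plain arithmetic, `num-parallel - 1`. The jobset always schedules one control job, so only the remaining replicas become worker jobs; the sprig syntax is only needed because hyphenated parameter names cannot be referenced directly in an expression. A minimal sketch of the same arithmetic (plain Python, not part of the package):

    def worker_count(num_parallel: int) -> int:
        # Equivalent of {{=sprig.int(sprig.sub(sprig.int(inputs.parameters['num-parallel']),1))}}:
        # one control replica is always scheduled, so num_parallel - 1 workers remain.
        return int(num_parallel) - 1

    assert worker_count(4) == 3  # num_parallel=4 -> 1 control job + 3 worker jobs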
@@ -947,8 +1027,8 @@ class ArgoWorkflows(object):
                     .template(self._sanitize(node.name))
                     .arguments(Arguments().parameters(parameters))
                 )
-                dag_tasks.append(dag_task)
 
+            dag_tasks.append(dag_task)
             # End the workflow if we have reached the end of the flow
             if node.type == "end":
                 return [
@@ -974,14 +1054,30 @@ class ArgoWorkflows(object):
                     parent_foreach,
                 )
             # For foreach nodes generate a new sub DAGTemplate
+            # We do this for "regular" foreaches (ie. `self.next(self.a, foreach=)`)
             elif node.type == "foreach":
                 foreach_template_name = self._sanitize(
                     "%s-foreach-%s"
                     % (
                         node.name,
-                        node.foreach_param
+                        "parallel" if node.parallel_foreach else node.foreach_param
+                        # Since foreach's are derived based on `self.next(self.a, foreach="<varname>")`
+                        # vs @parallel foreach are done based on `self.next(self.a, num_parallel="<some-number>")`,
+                        # we need to ensure that `foreach_template_name` suffix is appropriately set based on the kind
+                        # of foreach.
                     )
                 )
+
+                # There are two separate "DAGTask"s created for the foreach node.
+                # - The first one is a "jump-off" DAGTask where we propagate the
+                #   input-paths and split-index. This thing doesn't create
+                #   any actual containers and it responsible for only propagating
+                #   the parameters.
+                # - The DAGTask that follows first DAGTask is the one
+                #   that uses the ContainerTemplate. This DAGTask is named the same
+                #   thing as the foreach node. We will leverage a similar pattern for the
+                #   @parallel tasks.
+                #
                 foreach_task = (
                     DAGTask(foreach_template_name)
                     .dependencies([self._sanitize(node.name)])
@@ -1005,9 +1101,26 @@ class ArgoWorkflows(object):
                             if parent_foreach
                             else []
                         )
+                        + (
+                            # Disabiguate parameters for a regular `foreach` vs a `@parallel` foreach
+                            [
+                                Parameter("num-parallel").value(
+                                    "{{tasks.%s.outputs.parameters.num-parallel}}"
+                                    % self._sanitize(node.name)
+                                ),
+                                Parameter("task-id-entropy").value(
+                                    "{{tasks.%s.outputs.parameters.task-id-entropy}}"
+                                    % self._sanitize(node.name)
+                                ),
+                            ]
+                            if node.parallel_foreach
+                            else []
+                        )
                     )
                 )
                 .with_param(
+                    # For @parallel workloads `num-splits` will be explicitly set to one so that
+                    # we can piggyback on the current mechanism with which we leverage argo.
                     "{{tasks.%s.outputs.parameters.num-splits}}"
                     % self._sanitize(node.name)
                 )
@@ -1020,17 +1133,34 @@ class ArgoWorkflows(object):
                     [],
                     node.name,
                 )
+
+                # How do foreach's work on Argo:
+                # Lets say you have the following dag: (start[sets `foreach="x"`]) --> (task-a [actual foreach]) --> (join) --> (end)
+                # With argo we will :
+                # (start [sets num-splits]) --> (task-a-foreach-(0,0) [dummy task]) --> (task-a) --> (join) --> (end)
+                # The (task-a-foreach-(0,0) [dummy task]) propagates the values of the `split-index` and the input paths.
+                # to the actual foreach task.
                 templates.append(
                     Template(foreach_template_name)
                     .inputs(
                         Inputs().parameters(
                             [Parameter("input-paths"), Parameter("split-index")]
                             + ([Parameter("root-input-path")] if parent_foreach else [])
+                            + (
+                                [
+                                    Parameter("num-parallel"),
+                                    Parameter("task-id-entropy"),
+                                    # Parameter("workerCount")
+                                ]
+                                if node.parallel_foreach
+                                else []
+                            )
                         )
                     )
                     .outputs(
                         Outputs().parameters(
                             [
+                                # non @parallel tasks set task-ids as outputs
                                 Parameter("task-id").valueFrom(
                                     {
                                         "parameter": "{{tasks.%s.outputs.parameters.task-id}}"
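The comment block in this hunk describes the fan-out shape: the foreach expands through a parameter-only "jump-off" task before the real container task runs. A rough sketch of the resulting DAG task list for a flow whose `start` sets `foreach="x"` (names and structure are illustrative only, not output of a real workflow):

    # Illustrative only: the DAG shape Argo ends up with for
    #   start --(foreach="x")--> a --> join --> end
    dag_tasks = [
        {"name": "start"},                                   # emits num-splits and task-id
        {"name": "a-foreach-x",                              # "jump-off" task: no container,
         "dependencies": ["start"],                           # fans out over num-splits and only
         "withParam": "{{tasks.start.outputs.parameters.num-splits}}"},  # forwards input-paths/split-index
        {"name": "join", "dependencies": ["a-foreach-x"]},    # joins on the sub-DAG's outputs
        {"name": "end", "dependencies": ["join"]},
    ]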
@@ -1040,29 +1170,67 @@ class ArgoWorkflows(object):
                                     }
                                 )
                             ]
+                            if not node.parallel_foreach
+                            else [
+                                # @parallel tasks set `task-id-entropy` and `num-parallel`
+                                # as outputs so task-ids can be derived in the join step.
+                                # Both of these values should be propagated from the
+                                # jobset labels.
+                                Parameter("num-parallel").valueFrom(
+                                    {
+                                        "parameter": "{{tasks.%s.outputs.parameters.num-parallel}}"
+                                        % self._sanitize(
+                                            self.graph[node.matching_join].in_funcs[0]
+                                        )
+                                    }
+                                ),
+                                Parameter("task-id-entropy").valueFrom(
+                                    {
+                                        "parameter": "{{tasks.%s.outputs.parameters.task-id-entropy}}"
+                                        % self._sanitize(
+                                            self.graph[node.matching_join].in_funcs[0]
+                                        )
+                                    }
+                                ),
+                            ]
                         )
                     )
                     .dag(DAGTemplate().fail_fast().tasks(dag_tasks_1))
                 )
+
                 join_foreach_task = (
                     DAGTask(self._sanitize(self.graph[node.matching_join].name))
                     .template(self._sanitize(self.graph[node.matching_join].name))
                     .dependencies([foreach_template_name])
                     .arguments(
                         Arguments().parameters(
-                            [
-                                Parameter("input-paths").value(
-                                    "argo-{{workflow.name}}/%s/{{tasks.%s.outputs.parameters.task-id}}"
-                                    % (node.name, self._sanitize(node.name))
-                                ),
-                                Parameter("split-cardinality").value(
-                                    "{{tasks.%s.outputs.parameters.split-cardinality}}"
-                                    % self._sanitize(node.name)
-                                ),
-                            ]
+                            (
+                                [
+                                    Parameter("input-paths").value(
+                                        "argo-{{workflow.name}}/%s/{{tasks.%s.outputs.parameters.task-id}}"
+                                        % (node.name, self._sanitize(node.name))
+                                    ),
+                                    Parameter("split-cardinality").value(
+                                        "{{tasks.%s.outputs.parameters.split-cardinality}}"
+                                        % self._sanitize(node.name)
+                                    ),
+                                ]
+                                if not node.parallel_foreach
+                                else [
+                                    Parameter("num-parallel").value(
+                                        "{{tasks.%s.outputs.parameters.num-parallel}}"
+                                        % self._sanitize(node.name)
+                                    ),
+                                    Parameter("task-id-entropy").value(
+                                        "{{tasks.%s.outputs.parameters.task-id-entropy}}"
+                                        % self._sanitize(node.name)
+                                    ),
+                                ]
+                            )
                             + (
                                 [
                                     Parameter("split-index").value(
+                                        # TODO : Pass down these parameters to the jobset stuff.
                                         "{{inputs.parameters.split-index}}"
                                     ),
                                     Parameter("root-input-path").value(
@@ -1140,7 +1308,17 @@ class ArgoWorkflows(object):
             # export input_paths as it is used multiple times in the container script
             # and we do not want to repeat the values.
             input_paths_expr = "export INPUT_PATHS=''"
-            if node.name != "start":
+            # If node is not a start step or a @parallel join then we will set the input paths.
+            # To set the input-paths as a parameter, we need to ensure that the node
+            # is not (a start node or a parallel join node). Start nodes will have no
+            # input paths and parallel join will derive input paths based on a
+            # formulaic approach using `num-parallel` and `task-id-entropy`.
+            if not (
+                node.name == "start"
+                or (node.type == "join" and self.graph[node.in_funcs[0]].parallel_step)
+            ):
+                # For parallel joins we don't pass the INPUT_PATHS but are dynamically constructed.
+                # So we don't need to set the input paths.
                 input_paths_expr = (
                     "export INPUT_PATHS={{inputs.parameters.input-paths}}"
                 )
@@ -1169,13 +1347,23 @@ class ArgoWorkflows(object):
                     task_idx,
                 ]
             )
+            if node.parallel_step:
+                task_str = "-".join(
+                    [
+                        "$TASK_ID_PREFIX",
+                        "{{inputs.parameters.task-id-entropy}}",  # id_base is addition entropy to based on node-name of the workflow
+                        "$TASK_ID_SUFFIX",
+                    ]
+                )
+            else:
+                # Generated task_ids need to be non-numeric - see register_task_id in
+                # service.py. We do so by prefixing `t-`
+                _task_id_base = (
+                    "$(echo %s | md5sum | cut -d ' ' -f 1 | tail -c 9)" % task_str
+                )
+                task_str = "(t-%s)" % _task_id_base
 
-            # Generated task_ids need to be non-numeric - see register_task_id in
-            # service.py. We do so by prefixing `t-`
-            task_id_expr = (
-                "export METAFLOW_TASK_ID="
-                "(t-$(echo %s | md5sum | cut -d ' ' -f 1 | tail -c 9))" % task_str
-            )
+            task_id_expr = "export METAFLOW_TASK_ID=" "%s" % task_str
             task_id = "$METAFLOW_TASK_ID"
 
             # Resolve retry strategy.
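The two branches above mint task ids differently: @parallel steps assemble the id from a prefix ("control" or "worker"), the `task-id-entropy` parameter, and the jobset job-index, while every other step hashes `task_str` and prefixes `t-`. A rough Python equivalent of what the generated shell does (illustrative only, not code from the package):

    import hashlib

    def parallel_task_id(prefix: str, entropy: str, job_index: int) -> str:
        # $TASK_ID_PREFIX-{{inputs.parameters.task-id-entropy}}-$TASK_ID_SUFFIX,
        # where prefix is "control" or "worker" and the suffix is the jobset job-index.
        return "%s-%s-%s" % (prefix, entropy, job_index)

    def hashed_task_id(task_str: str) -> str:
        # echo %s | md5sum | cut -d ' ' -f 1 | tail -c 9  -> last 8 hex chars of the digest
        digest = hashlib.md5((task_str + "\n").encode()).hexdigest()
        return "t-%s" % digest[-8:]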
@@ -1194,9 +1382,20 @@ class ArgoWorkflows(object):
             user_code_retries = max_user_code_retries
             total_retries = max_user_code_retries + max_error_retries
             # {{retries}} is only available if retryStrategy is specified
+            # and they are only available in the container templates NOT for custom
+            # Kubernetes manifests like Jobsets.
+            # For custom kubernetes manifests, we will pass the retryCount as a parameter
+            # and use that in the manifest.
             retry_count = (
-                "{{retries}}" if total_retries else 0
+                (
+                    "{{retries}}"
+                    if not node.parallel_step
+                    else "{{inputs.parameters.retryCount}}"
+                )
+                if total_retries
+                else 0
             )
+
             minutes_between_retries = int(minutes_between_retries)
 
             # Configure log capture.
@@ -1302,13 +1501,24 @@ class ArgoWorkflows(object):
                 foreach_step = next(
                     n for n in node.in_funcs if self.graph[n].is_inside_foreach
                 )
-                input_paths = (
-                    "$(python -m metaflow.plugins.argo.generate_input_paths %s {{workflow.creationTimestamp}} %s {{inputs.parameters.split-cardinality}})"
-                    % (
-                        foreach_step,
-                        input_paths,
-                    )
+                if not self.graph[node.split_parents[-1]].parallel_foreach:
+                    input_paths = (
+                        "$(python -m metaflow.plugins.argo.generate_input_paths %s {{workflow.creationTimestamp}} %s {{inputs.parameters.split-cardinality}})"
+                        % (
+                            foreach_step,
+                            input_paths,
+                        )
+                    )
+                else:
+                    # When we run Jobsets with Argo Workflows we need to ensure that `input_paths` are generated using the a formulaic approach
+                    # because our current strategy of using volume mounts for outputs won't work with Jobsets
+                    input_paths = (
+                        "$(python -m metaflow.plugins.argo.jobset_input_paths %s %s {{inputs.parameters.task-id-entropy}} {{inputs.parameters.num-parallel}})"
+                        % (
+                            run_id,
+                            foreach_step,
+                        )
                     )
-                )
             step = [
                 "step",
                 node.name,
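`metaflow.plugins.argo.jobset_input_paths` is a new helper added in this release (see the file list above, +16 lines); its exact contents are not shown in this diff. Given the control/worker `TASK_ID_PREFIX` scheme and the formulaic task ids described earlier, it plausibly expands the entropy and parallelism into one input path per parallel task, roughly as sketched below (the function body, path format, and worker index range are assumptions, not the shipped implementation):

    def jobset_input_paths(run_id, step_name, task_id_entropy, num_parallel):
        # Hypothetical reconstruction: one control task plus (num_parallel - 1) workers,
        # each with a formulaic task id <prefix>-<entropy>-<index>.
        tasks = ["control-%s-0" % task_id_entropy] + [
            "worker-%s-%d" % (task_id_entropy, idx) for idx in range(int(num_parallel) - 1)
        ]
        return ",".join("%s/%s/%s" % (run_id, step_name, t) for t in tasks)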
@@ -1318,7 +1528,14 @@ class ArgoWorkflows(object):
                 "--max-user-code-retries %d" % user_code_retries,
                 "--input-paths %s" % input_paths,
             ]
-            if any(self.graph[n].type == "foreach" for n in node.in_funcs):
+            if node.parallel_step:
+                step.append(
+                    "--split-index ${MF_CONTROL_INDEX:-$((MF_WORKER_REPLICA_INDEX + 1))}"
+                )
+                # This is needed for setting the value of the UBF context in the CLI.
+                step.append("--ubf-context $UBF_CONTEXT")
+
+            elif any(self.graph[n].type == "foreach" for n in node.in_funcs):
                 # Pass split-index to a foreach task
                 step.append("--split-index {{inputs.parameters.split-index}}")
             if self.tags:
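The `--split-index` value above is resolved at runtime inside each pod from environment variables that the jobset sets later in this diff: only the control replica exports MF_CONTROL_INDEX (as "0"), while workers get MF_WORKER_REPLICA_INDEX from the jobset job-index annotation, so control ends up with split index 0 and worker i with i + 1. A small Python mirror of the shell expansion, for illustration only:

    import os

    def resolve_split_index() -> int:
        # Mirrors ${MF_CONTROL_INDEX:-$((MF_WORKER_REPLICA_INDEX + 1))}:
        # use MF_CONTROL_INDEX when set and non-empty, else worker replica index + 1.
        control_index = os.environ.get("MF_CONTROL_INDEX")
        if control_index:
            return int(control_index)
        return int(os.environ["MF_WORKER_REPLICA_INDEX"]) + 1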
@@ -1481,17 +1698,47 @@ class ArgoWorkflows(object):
             # join task deterministically inside the join task without resorting to
             # passing a rather long list of (albiet compressed)
             inputs = []
-            if node.name != "start":
+            # To set the input-paths as a parameter, we need to ensure that the node
+            # is not (a start node or a parallel join node). Start nodes will have no
+            # input paths and parallel join will derive input paths based on a
+            # formulaic approach.
+            if not (
+                node.name == "start"
+                or (node.type == "join" and self.graph[node.in_funcs[0]].parallel_step)
+            ):
                 inputs.append(Parameter("input-paths"))
             if any(self.graph[n].type == "foreach" for n in node.in_funcs):
                 # Fetch split-index from parent
                 inputs.append(Parameter("split-index"))
+
             if (
                 node.type == "join"
                 and self.graph[node.split_parents[-1]].type == "foreach"
             ):
-                # append this only for joins of foreaches, not static splits
-                inputs.append(Parameter("split-cardinality"))
+                # @parallel join tasks require `num-parallel` and `task-id-entropy`
+                # to construct the input paths, so we pass them down as input parameters.
+                if self.graph[node.split_parents[-1]].parallel_foreach:
+                    inputs.extend(
+                        [Parameter("num-parallel"), Parameter("task-id-entropy")]
+                    )
+                else:
+                    # append this only for joins of foreaches, not static splits
+                    inputs.append(Parameter("split-cardinality"))
+            # We can use an `elif` condition because the first `if` condition validates if its
+            # a foreach join node, hence we can safely assume that if that condition fails then
+            # we can check if the node is a @parallel node.
+            elif node.parallel_step:
+                inputs.extend(
+                    [
+                        Parameter("num-parallel"),
+                        Parameter("task-id-entropy"),
+                        Parameter("jobset-name"),
+                        Parameter("workerCount"),
+                    ]
+                )
+                if any(d.name == "retry" for d in node.decorators):
+                    inputs.append(Parameter("retryCount"))
+
             if node.is_inside_foreach and self.graph[node.out_funcs[0]].type == "join":
                 if any(
                     self.graph[parent].matching_join
@@ -1508,7 +1755,9 @@ class ArgoWorkflows(object):
                     inputs.append(Parameter("root-input-path"))
 
             outputs = []
-            if node.name != "end":
+            # @parallel steps will not have a task-id as an output parameter since task-ids
+            # are derived at runtime.
+            if not (node.name == "end" or node.parallel_step):
                 outputs = [Parameter("task-id").valueFrom({"path": "/mnt/out/task_id"})]
             if node.type == "foreach":
                 # Emit split cardinality from foreach task
@@ -1521,6 +1770,19 @@ class ArgoWorkflows(object):
                     )
                 )
 
+            if node.parallel_foreach:
+                outputs.extend(
+                    [
+                        Parameter("num-parallel").valueFrom(
+                            {"path": "/mnt/out/num_parallel"}
+                        ),
+                        Parameter("task-id-entropy").valueFrom(
+                            {"path": "/mnt/out/task_id_entropy"}
+                        ),
+                    ]
+                )
+                # Outputs should be defined over here, Not in the _dag_template for the `num_parallel` stuff.
+
             # It makes no sense to set env vars to None (shows up as "None" string)
             # Also we skip some env vars (e.g. in case we want to pull them from KUBERNETES_SECRETS)
             env = {
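These output parameters are read back from files under /mnt/out, so the @parallel foreach task's container has to write num_parallel and task_id_entropy there before exiting (that writer lives outside this hunk). A minimal sketch of what the writer side would look like, assuming that file convention:

    import os

    def emit_parallel_outputs(num_parallel: int, task_id_entropy: str, out_dir="/mnt/out"):
        # Hypothetical writer side for the valueFrom {"path": "/mnt/out/..."} parameters above.
        os.makedirs(out_dir, exist_ok=True)
        with open(os.path.join(out_dir, "num_parallel"), "w") as f:
            f.write(str(num_parallel))
        with open(os.path.join(out_dir, "task_id_entropy"), "w") as f:
            f.write(task_id_entropy)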
@@ -1550,6 +1812,156 @@ class ArgoWorkflows(object):
             # liked to inline this ContainerTemplate and avoid scanning the workflow
             # twice, but due to issues with variable substitution, we will have to
             # live with this routine.
+            if node.parallel_step:
+
+                # Explicitly add the task-id-hint label. This is important because this label
+                # is returned as an Output parameter of this step and is used subsequently an
+                # an input in the join step. Even the num_parallel is used as an output parameter
+                kubernetes_labels = self.kubernetes_labels.copy()
+                jobset_name = "{{inputs.parameters.jobset-name}}"
+                kubernetes_labels[
+                    "task_id_entropy"
+                ] = "{{inputs.parameters.task-id-entropy}}"
+                kubernetes_labels["num_parallel"] = "{{inputs.parameters.num-parallel}}"
+                jobset = KubernetesArgoJobSet(
+                    kubernetes_sdk=kubernetes_sdk,
+                    name=jobset_name,
+                    flow_name=self.flow.name,
+                    run_id=run_id,
+                    step_name=self._sanitize(node.name),
+                    task_id=task_id,
+                    attempt=retry_count,
+                    user=self.username,
+                    subdomain=jobset_name,
+                    command=cmds,
+                    namespace=resources["namespace"],
+                    image=resources["image"],
+                    image_pull_policy=resources["image_pull_policy"],
+                    service_account=resources["service_account"],
+                    secrets=(
+                        [
+                            k
+                            for k in (
+                                list(
+                                    []
+                                    if not resources.get("secrets")
+                                    else [resources.get("secrets")]
+                                    if isinstance(resources.get("secrets"), str)
+                                    else resources.get("secrets")
+                                )
+                                + KUBERNETES_SECRETS.split(",")
+                                + ARGO_WORKFLOWS_KUBERNETES_SECRETS.split(",")
+                            )
+                            if k
+                        ]
+                    ),
+                    node_selector=resources.get("node_selector"),
+                    cpu=str(resources["cpu"]),
+                    memory=str(resources["memory"]),
+                    disk=str(resources["disk"]),
+                    gpu=resources["gpu"],
+                    gpu_vendor=str(resources["gpu_vendor"]),
+                    tolerations=resources["tolerations"],
+                    use_tmpfs=use_tmpfs,
+                    tmpfs_tempdir=tmpfs_tempdir,
+                    tmpfs_size=tmpfs_size,
+                    tmpfs_path=tmpfs_path,
+                    timeout_in_seconds=run_time_limit,
+                    persistent_volume_claims=resources["persistent_volume_claims"],
+                    shared_memory=shared_memory,
+                    port=port,
+                )
+
+                for k, v in env.items():
+                    jobset.environment_variable(k, v)
+
+                for k, v in kubernetes_labels.items():
+                    jobset.label(k, v)
+
+                ## -----Jobset specific env vars START here-----
+                jobset.environment_variable(
+                    "MF_MASTER_ADDR", jobset.jobset_control_addr
+                )
+                jobset.environment_variable("MF_MASTER_PORT", str(port))
+                jobset.environment_variable(
+                    "MF_WORLD_SIZE", "{{inputs.parameters.num-parallel}}"
+                )
+                # for k, v in .items():
+                jobset.environment_variables_from_selectors(
+                    {
+                        "MF_WORKER_REPLICA_INDEX": "metadata.annotations['jobset.sigs.k8s.io/job-index']",
+                        "JOBSET_RESTART_ATTEMPT": "metadata.annotations['jobset.sigs.k8s.io/restart-attempt']",
+                        "METAFLOW_KUBERNETES_JOBSET_NAME": "metadata.annotations['jobset.sigs.k8s.io/jobset-name']",
+                        "METAFLOW_KUBERNETES_POD_NAMESPACE": "metadata.namespace",
+                        "METAFLOW_KUBERNETES_POD_NAME": "metadata.name",
+                        "METAFLOW_KUBERNETES_POD_ID": "metadata.uid",
+                        "METAFLOW_KUBERNETES_SERVICE_ACCOUNT_NAME": "spec.serviceAccountName",
+                        "METAFLOW_KUBERNETES_NODE_IP": "status.hostIP",
+                        # `TASK_ID_SUFFIX` is needed for the construction of the task-ids
+                        "TASK_ID_SUFFIX": "metadata.annotations['jobset.sigs.k8s.io/job-index']",
+                    }
+                )
+                annotations = {
+                    # setting annotations explicitly as they wont be
+                    # passed down from WorkflowTemplate level
+                    "metaflow/step_name": node.name,
+                    "metaflow/attempt": str(retry_count),
+                    "metaflow/run_id": run_id,
+                    "metaflow/production_token": self.production_token,
+                    "metaflow/owner": self.username,
+                    "metaflow/user": "argo-workflows",
+                    "metaflow/flow_name": self.flow.name,
+                }
+                if current.get("project_name"):
+                    annotations.update(
+                        {
+                            "metaflow/project_name": current.project_name,
+                            "metaflow/branch_name": current.branch_name,
+                            "metaflow/project_flow_name": current.project_flow_name,
+                        }
+                    )
+                for k, v in annotations.items():
+                    jobset.annotation(k, v)
+                ## -----Jobset specific env vars END here-----
+                ## ---- Jobset control/workers specific vars START here ----
+                jobset.control.replicas(1)
+                jobset.worker.replicas("{{=asInt(inputs.parameters.workerCount)}}")
+                jobset.control.environment_variable("UBF_CONTEXT", UBF_CONTROL)
+                jobset.worker.environment_variable("UBF_CONTEXT", UBF_TASK)
+                jobset.control.environment_variable("MF_CONTROL_INDEX", "0")
+                # `TASK_ID_PREFIX` needs to explicitly be `control` or `worker`
+                # because the join task uses a formulaic approach to infer the task-ids
+                jobset.control.environment_variable("TASK_ID_PREFIX", "control")
+                jobset.worker.environment_variable("TASK_ID_PREFIX", "worker")
+
+                ## ---- Jobset control/workers specific vars END here ----
+                yield (
+                    Template(ArgoWorkflows._sanitize(node.name))
+                    .resource(
+                        "create",
+                        jobset.dump(),
+                        "status.terminalState == Completed",
+                        "status.terminalState == Failed",
+                    )
+                    .inputs(Inputs().parameters(inputs))
+                    .outputs(
+                        Outputs().parameters(
+                            [
+                                Parameter("task-id-entropy").valueFrom(
+                                    {"jsonPath": "{.metadata.labels.task_id_entropy}"}
+                                ),
+                                Parameter("num-parallel").valueFrom(
+                                    {"jsonPath": "{.metadata.labels.num_parallel}"}
+                                ),
+                            ]
+                        )
+                    )
+                    .retry_strategy(
+                        times=total_retries,
+                        minutes_between_retries=minutes_between_retries,
+                    )
+                )
+                continue
             yield (
                 Template(self._sanitize(node.name))
                 # Set @timeout values
@@ -1847,7 +2259,7 @@ class ArgoWorkflows(object):
                     "fields": [
                         {
                             "type": "mrkdwn",
-                            "text": "*Project:* %s" % current.project_name
+                            "text": "*Project:* %s" % current.project_name
                         },
                         {
                             "type": "mrkdwn",
@@ -2621,6 +3033,15 @@ class Template(object):
     def to_json(self):
         return self.payload
 
+    def resource(self, action, manifest, success_criteria, failure_criteria):
+        self.payload["resource"] = {}
+        self.payload["resource"]["action"] = action
+        self.payload["setOwnerReference"] = True
+        self.payload["resource"]["successCondition"] = success_criteria
+        self.payload["resource"]["failureCondition"] = failure_criteria
+        self.payload["resource"]["manifest"] = manifest
+        return self
+
     def __str__(self):
         return json.dumps(self.payload, indent=4)
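The new `resource()` builder is what the @parallel branch above uses to submit the Jobset manifest as an Argo resource template. A short usage sketch of the payload it produces, based directly on the method body ("my-parallel-step" and jobset_manifest_yaml are placeholders, not values from the diff):

    Template("my-parallel-step").resource(
        "create",
        jobset_manifest_yaml,                  # jobset.dump() in the caller
        "status.terminalState == Completed",   # successCondition
        "status.terminalState == Failed",      # failureCondition
    )
    # The template payload now carries, in addition to its name:
    # {
    #     "setOwnerReference": True,
    #     "resource": {
    #         "action": "create",
    #         "successCondition": "status.terminalState == Completed",
    #         "failureCondition": "status.terminalState == Failed",
    #         "manifest": jobset_manifest_yaml,
    #     },
    # }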