@flyteorg/flyteidl 1.2.9 → 1.2.10
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/gen/pb-js/flyteidl.d.ts +1106 -2
- package/gen/pb-js/flyteidl.js +2723 -196
- package/package.json +1 -1
- package/protos/flyteidl/admin/common.proto +17 -0
- package/protos/flyteidl/admin/execution.proto +20 -0
- package/protos/flyteidl/admin/launch_plan.proto +3 -0
- package/protos/flyteidl/admin/matchable_resource.proto +3 -0
- package/protos/flyteidl/admin/node_execution.proto +11 -1
- package/protos/flyteidl/admin/task_execution.proto +17 -0
- package/protos/flyteidl/core/metrics.proto +36 -0
- package/protos/flyteidl/core/security.proto +3 -0
- package/protos/flyteidl/core/tasks.proto +21 -13
- package/protos/flyteidl/event/event.proto +16 -0
- package/protos/flyteidl/plugins/kubeflow/common.proto +33 -0
- package/protos/flyteidl/plugins/kubeflow/mpi.proto +43 -0
- package/protos/flyteidl/plugins/kubeflow/pytorch.proto +49 -0
- package/protos/flyteidl/plugins/kubeflow/tensorflow.proto +39 -0
- package/protos/flyteidl/plugins/pytorch.proto +14 -0
- package/protos/flyteidl/service/admin.proto +10 -1
- package/protos/flyteidl/service/dataproxy.proto +40 -0
- package/protos/flyteidl/service/external_plugin_service.proto +74 -0
package/package.json
CHANGED
|
@@ -5,6 +5,8 @@ option go_package = "github.com/flyteorg/flyteidl/gen/pb-go/flyteidl/admin";
|
|
|
5
5
|
|
|
6
6
|
import "flyteidl/core/execution.proto";
|
|
7
7
|
import "flyteidl/core/identifier.proto";
|
|
8
|
+
import "flyteidl/core/literals.proto";
|
|
9
|
+
import "google/protobuf/timestamp.proto";
|
|
8
10
|
|
|
9
11
|
// Encapsulation of fields that identifies a Flyte resource.
|
|
10
12
|
// A Flyte resource can be a task, workflow or launch plan.
|
|
@@ -279,6 +281,14 @@ message Annotations {
|
|
|
279
281
|
map<string, string> values = 1;
|
|
280
282
|
}
|
|
281
283
|
|
|
284
|
+
// Environment variable values to be applied to an execution resource.
|
|
285
|
+
// In the future a mode (e.g. OVERRIDE, APPEND, etc) can be defined
|
|
286
|
+
// to specify how to merge environment variables defined at registration and execution time.
|
|
287
|
+
message Envs {
|
|
288
|
+
// Map of custom environment variables to be applied to the execution resource.
|
|
289
|
+
repeated flyteidl.core.KeyValuePair values = 1;
|
|
290
|
+
}
|
|
291
|
+
|
|
282
292
|
// Defines permissions associated with executions created by this launch plan spec.
|
|
283
293
|
// Use either of these roles when they have permissions required by your workflow execution.
|
|
284
294
|
// Deprecated.
|
|
@@ -300,3 +310,10 @@ message RawOutputDataConfig {
|
|
|
300
310
|
// e.g. s3://bucket/key or s3://bucket/
|
|
301
311
|
string output_location_prefix = 1;
|
|
302
312
|
}
|
|
313
|
+
|
|
314
|
+
// These URLs are returned as part of node and task execution data requests.
|
|
315
|
+
message FlyteURLs {
|
|
316
|
+
string inputs = 1;
|
|
317
|
+
string outputs = 2;
|
|
318
|
+
string deck = 3;
|
|
319
|
+
}
|
|
@@ -8,6 +8,7 @@ import "flyteidl/admin/common.proto";
|
|
|
8
8
|
import "flyteidl/core/literals.proto";
|
|
9
9
|
import "flyteidl/core/execution.proto";
|
|
10
10
|
import "flyteidl/core/identifier.proto";
|
|
11
|
+
import "flyteidl/core/metrics.proto";
|
|
11
12
|
import "flyteidl/core/security.proto";
|
|
12
13
|
import "google/protobuf/duration.proto";
|
|
13
14
|
import "google/protobuf/timestamp.proto";
|
|
@@ -309,6 +310,9 @@ message ExecutionSpec {
|
|
|
309
310
|
// If enabled, all calculations are performed even if cached results would be available, overwriting the stored
|
|
310
311
|
// data once execution finishes successfully.
|
|
311
312
|
bool overwrite_cache = 22;
|
|
313
|
+
|
|
314
|
+
// Environment variables to be set for the execution.
|
|
315
|
+
Envs envs = 23;
|
|
312
316
|
}
|
|
313
317
|
|
|
314
318
|
// Request to terminate an in-progress execution. This action is irreversible.
|
|
@@ -381,3 +385,19 @@ message ExecutionStateChangeDetails {
|
|
|
381
385
|
}
|
|
382
386
|
|
|
383
387
|
message ExecutionUpdateResponse {}
|
|
388
|
+
|
|
389
|
+
// WorkflowExecutionGetMetricsRequest represents a request to retrieve metrics for the specified workflow execution.
|
|
390
|
+
message WorkflowExecutionGetMetricsRequest {
|
|
391
|
+
// id defines the workflow execution to query for.
|
|
392
|
+
core.WorkflowExecutionIdentifier id = 1;
|
|
393
|
+
|
|
394
|
+
// depth defines the number of Flyte entity levels to traverse when breaking down execution details.
|
|
395
|
+
int32 depth = 2;
|
|
396
|
+
}
|
|
397
|
+
|
|
398
|
+
// WorkflowExecutionGetMetricsResponse represents the response containing metrics for the specified workflow execution.
|
|
399
|
+
message WorkflowExecutionGetMetricsResponse {
|
|
400
|
+
// Span defines the top-level breakdown of the workflow's execution. More precise information is nested in a
|
|
401
|
+
// hierarchical structure using Flyte entity references.
|
|
402
|
+
core.Span span = 1;
|
|
403
|
+
}
|
|
@@ -130,6 +130,9 @@ message LaunchPlanSpec {
|
|
|
130
130
|
// If enabled, all calculations are performed even if cached results would be available, overwriting the stored
|
|
131
131
|
// data once execution finishes successfully.
|
|
132
132
|
bool overwrite_cache = 20;
|
|
133
|
+
|
|
134
|
+
// Environment variables to be set for the execution.
|
|
135
|
+
Envs envs = 21;
|
|
133
136
|
}
|
|
134
137
|
|
|
135
138
|
// Values computed by the flyte platform after launch plan registration.
|
|
@@ -128,6 +128,9 @@ message WorkflowExecutionConfig {
|
|
|
128
128
|
// If enabled, all calculations are performed even if cached results would be available, overwriting the stored
|
|
129
129
|
// data once execution finishes successfully.
|
|
130
130
|
bool overwrite_cache = 7;
|
|
131
|
+
|
|
132
|
+
// Environment variables to be set for the execution.
|
|
133
|
+
Envs envs = 8;
|
|
131
134
|
}
|
|
132
135
|
|
|
133
136
|
// Generic container for encapsulating all types of the above attributes messages.
|
|
@@ -167,6 +167,10 @@ message NodeExecutionClosure {
|
|
|
167
167
|
// String location uniquely identifying where the deck HTML file is.
|
|
168
168
|
// NativeUrl specifies the url in the format of the configured storage provider (e.g. s3://my-bucket/randomstring/suffix.tar)
|
|
169
169
|
string deck_uri = 11;
|
|
170
|
+
|
|
171
|
+
// dynamic_job_spec_uri is the location of the DynamicJobSpec proto message for a DynamicWorkflow. This is required
|
|
172
|
+
// to correctly recover partially completed executions where the subworkflow has already been compiled.
|
|
173
|
+
string dynamic_job_spec_uri = 12;
|
|
170
174
|
}
|
|
171
175
|
|
|
172
176
|
// Metadata for a WorkflowNode
|
|
@@ -192,6 +196,10 @@ message DynamicWorkflowNodeMetadata {
|
|
|
192
196
|
|
|
193
197
|
// Represents the compiled representation of the embedded dynamic workflow.
|
|
194
198
|
core.CompiledWorkflowClosure compiled_workflow = 2;
|
|
199
|
+
|
|
200
|
+
// dynamic_job_spec_uri is the location of the DynamicJobSpec proto message for this DynamicWorkflow. This is
|
|
201
|
+
// required to correctly recover partially completed executions where the subworkflow has already been compiled.
|
|
202
|
+
string dynamic_job_spec_uri = 3;
|
|
195
203
|
}
|
|
196
204
|
|
|
197
205
|
// Request structure to fetch inputs and output for a node execution.
|
|
@@ -219,5 +227,7 @@ message NodeExecutionGetDataResponse {
|
|
|
219
227
|
|
|
220
228
|
// Optional Workflow closure for a dynamically generated workflow, in the case this node yields a dynamic workflow we return its structure here.
|
|
221
229
|
DynamicWorkflowNodeMetadata dynamic_workflow = 16;
|
|
222
|
-
}
|
|
223
230
|
|
|
231
|
+
FlyteURLs flyte_urls = 17;
|
|
232
|
+
|
|
233
|
+
}
|
|
@@ -123,6 +123,19 @@ message TaskExecutionClosure {
|
|
|
123
123
|
// TaskExecutionMetadata ExternalResourceInfo fields for each subtask rather than the TaskLog
|
|
124
124
|
// in this message.
|
|
125
125
|
int32 event_version = 17;
|
|
126
|
+
|
|
127
|
+
// A time-series of the phase transition or update explanations. This, when compared to storing a singular reason
|
|
128
|
+
// as previously done, is much more valuable in visualizing and understanding historical evaluations.
|
|
129
|
+
repeated Reason reasons = 18;
|
|
130
|
+
}
|
|
131
|
+
|
|
132
|
+
// Reason is a single message annotated with a timestamp to indicate the instant the reason occurred.
|
|
133
|
+
message Reason {
|
|
134
|
+
// occurred_at is the timestamp indicating the instant that this reason happened.
|
|
135
|
+
google.protobuf.Timestamp occurred_at = 1;
|
|
136
|
+
|
|
137
|
+
// message is the explanation for the most recent phase transition or status update.
|
|
138
|
+
string message = 2;
|
|
126
139
|
}
|
|
127
140
|
|
|
128
141
|
// Request structure to fetch inputs and output for a task execution.
|
|
@@ -148,4 +161,8 @@ message TaskExecutionGetDataResponse {
|
|
|
148
161
|
|
|
149
162
|
// Full_outputs will only be populated if they are under a configured size threshold.
|
|
150
163
|
core.LiteralMap full_outputs = 4;
|
|
164
|
+
|
|
165
|
+
// flyte tiny url to fetch a core.LiteralMap of task execution's IO
|
|
166
|
+
// Deck will be empty for task
|
|
167
|
+
FlyteURLs flyte_urls = 5;
|
|
151
168
|
}
|
|
@@ -0,0 +1,36 @@
|
|
|
1
|
+
syntax = "proto3";
|
|
2
|
+
|
|
3
|
+
package flyteidl.core;
|
|
4
|
+
|
|
5
|
+
option go_package = "github.com/flyteorg/flyteidl/gen/pb-go/flyteidl/core";
|
|
6
|
+
|
|
7
|
+
import "flyteidl/core/identifier.proto";
|
|
8
|
+
import "google/protobuf/timestamp.proto";
|
|
9
|
+
|
|
10
|
+
// Span represents a duration trace of Flyte execution. The id field denotes a Flyte execution entity or an operation
|
|
11
|
+
// which uniquely identifies the Span. The spans attribute allows this Span to be further broken down into more
|
|
12
|
+
// precise definitions.
|
|
13
|
+
message Span {
|
|
14
|
+
// start_time defines the instant this span began.
|
|
15
|
+
google.protobuf.Timestamp start_time = 1;
|
|
16
|
+
|
|
17
|
+
// end_time defines the instant this span completed.
|
|
18
|
+
google.protobuf.Timestamp end_time = 2;
|
|
19
|
+
|
|
20
|
+
oneof id {
|
|
21
|
+
// workflow_id is the id of the workflow execution this Span represents.
|
|
22
|
+
flyteidl.core.WorkflowExecutionIdentifier workflow_id = 3;
|
|
23
|
+
|
|
24
|
+
// node_id is the id of the node execution this Span represents.
|
|
25
|
+
flyteidl.core.NodeExecutionIdentifier node_id = 4;
|
|
26
|
+
|
|
27
|
+
// task_id is the id of the task execution this Span represents.
|
|
28
|
+
flyteidl.core.TaskExecutionIdentifier task_id = 5;
|
|
29
|
+
|
|
30
|
+
// operation_id is the id of a unique operation that this Span represents.
|
|
31
|
+
string operation_id = 6;
|
|
32
|
+
}
|
|
33
|
+
|
|
34
|
+
// spans defines a collection of Spans that break down this execution.
|
|
35
|
+
repeated Span spans = 7;
|
|
36
|
+
}
|
|
@@ -69,6 +69,9 @@ message Identity {
|
|
|
69
69
|
// oauth2_client references an oauth2 client. Backend plugins can use this information to impersonate the client when
|
|
70
70
|
// making external calls.
|
|
71
71
|
OAuth2Client oauth2_client = 3;
|
|
72
|
+
|
|
73
|
+
// execution_identity references the subject who makes the execution
|
|
74
|
+
string execution_identity = 4;
|
|
72
75
|
}
|
|
73
76
|
|
|
74
77
|
// OAuth2TokenRequest encapsulates information needed to request an OAuth2 token.
|
|
@@ -268,24 +268,32 @@ message DataLoadingConfig {
|
|
|
268
268
|
|
|
269
269
|
// Defines a pod spec and additional pod metadata that is created when a task is executed.
|
|
270
270
|
message K8sPod {
|
|
271
|
-
|
|
272
|
-
|
|
273
|
-
|
|
274
|
-
|
|
275
|
-
|
|
276
|
-
|
|
277
|
-
|
|
278
|
-
|
|
271
|
+
// Contains additional metadata for building a kubernetes pod.
|
|
272
|
+
K8sObjectMetadata metadata = 1;
|
|
273
|
+
|
|
274
|
+
// Defines the primary pod spec created when a task is executed.
|
|
275
|
+
// This should be a JSON-marshalled pod spec, which can be defined in
|
|
276
|
+
// - go, using: https://github.com/kubernetes/api/blob/release-1.21/core/v1/types.go#L2936
|
|
277
|
+
// - python: using https://github.com/kubernetes-client/python/blob/release-19.0/kubernetes/client/models/v1_pod_spec.py
|
|
278
|
+
google.protobuf.Struct pod_spec = 2;
|
|
279
|
+
|
|
280
|
+
// BETA: Optional configuration for DataLoading. If not specified, then default values are used.
|
|
281
|
+
// This makes it possible to run a completely portable container, that uses inputs and outputs
|
|
282
|
+
// only from the local file-system and without having any reference to flytekit. This is supported only on K8s at the moment.
|
|
283
|
+
// If data loading is enabled, then data will be mounted in accompanying directories specified in the DataLoadingConfig. If the directories
|
|
284
|
+
// are not specified, inputs will be mounted onto and outputs will be uploaded from a pre-determined file-system path. Refer to the documentation
|
|
285
|
+
// to understand the default paths.
|
|
286
|
+
// Only K8s
|
|
287
|
+
DataLoadingConfig data_config = 3;
|
|
279
288
|
}
|
|
280
289
|
|
|
281
290
|
// Metadata for building a kubernetes object when a task is executed.
|
|
282
291
|
message K8sObjectMetadata {
|
|
292
|
+
// Optional labels to add to the pod definition.
|
|
293
|
+
map<string, string> labels = 1;
|
|
283
294
|
|
|
284
|
-
|
|
285
|
-
|
|
286
|
-
|
|
287
|
-
// Optional annotations to add to the pod definition.
|
|
288
|
-
map<string, string> annotations = 2;
|
|
295
|
+
// Optional annotations to add to the pod definition.
|
|
296
|
+
map<string, string> annotations = 2;
|
|
289
297
|
}
|
|
290
298
|
|
|
291
299
|
// Sql represents a generic sql workload with a statement and dialect.
|
|
@@ -104,6 +104,12 @@ message NodeExecutionEvent {
|
|
|
104
104
|
// String location uniquely identifying where the deck HTML file is
|
|
105
105
|
// NativeUrl specifies the url in the format of the configured storage provider (e.g. s3://my-bucket/randomstring/suffix.tar)
|
|
106
106
|
string deck_uri = 19;
|
|
107
|
+
|
|
108
|
+
// This timestamp represents the instant when the event was reported by the executing framework. For example,
|
|
109
|
+
// when first processing a node the `occurred_at` timestamp should be the instant propeller makes progress, so when
|
|
110
|
+
// literal inputs are initially copied. The event however will not be sent until after the copy completes.
|
|
111
|
+
// Extracting both of these timestamps facilitates a more accurate portrayal of the evaluation time-series.
|
|
112
|
+
google.protobuf.Timestamp reported_at = 21;
|
|
107
113
|
}
|
|
108
114
|
|
|
109
115
|
// For Workflow Nodes we need to send information about the workflow that's launched
|
|
@@ -132,6 +138,10 @@ message DynamicWorkflowNodeMetadata {
|
|
|
132
138
|
|
|
133
139
|
// Represents the compiled representation of the embedded dynamic workflow.
|
|
134
140
|
core.CompiledWorkflowClosure compiled_workflow = 2;
|
|
141
|
+
|
|
142
|
+
// dynamic_job_spec_uri is the location of the DynamicJobSpec proto message for this DynamicWorkflow. This is
|
|
143
|
+
// required to correctly recover partially completed executions where the workflow has already been compiled.
|
|
144
|
+
string dynamic_job_spec_uri = 3;
|
|
135
145
|
}
|
|
136
146
|
|
|
137
147
|
message ParentTaskExecutionMetadata {
|
|
@@ -217,6 +227,12 @@ message TaskExecutionEvent {
|
|
|
217
227
|
// TaskExecutionMetadata ExternalResourceInfo fields for each subtask rather than the TaskLog
|
|
218
228
|
// in this message.
|
|
219
229
|
int32 event_version = 18;
|
|
230
|
+
|
|
231
|
+
// This timestamp represents the instant when the event was reported by the executing framework. For example, a k8s
|
|
232
|
+
// pod task may be marked completed at (ie. `occurred_at`) the instant the container running user code completes,
|
|
233
|
+
// but this event will not be reported until the pod is marked as completed. Extracting both of these timestamps
|
|
234
|
+
// facilitates a more accurate portrayal of the evaluation time-series.
|
|
235
|
+
google.protobuf.Timestamp reported_at = 20;
|
|
220
236
|
}
|
|
221
237
|
|
|
222
238
|
// This message contains metadata about external resources produced or used by a specific task execution.
|
|
@@ -0,0 +1,33 @@
|
|
|
1
|
+
syntax = "proto3";
|
|
2
|
+
|
|
3
|
+
package flyteidl.plugins.kubeflow;
|
|
4
|
+
|
|
5
|
+
option go_package = "github.com/flyteorg/flyteidl/gen/pb-go/flyteidl/plugins";
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
enum RestartPolicy {
|
|
9
|
+
RESTART_POLICY_NEVER = 0;
|
|
10
|
+
RESTART_POLICY_ON_FAILURE = 1;
|
|
11
|
+
RESTART_POLICY_ALWAYS = 2;
|
|
12
|
+
}
|
|
13
|
+
|
|
14
|
+
enum CleanPodPolicy {
|
|
15
|
+
CLEANPOD_POLICY_NONE = 0;
|
|
16
|
+
CLEANPOD_POLICY_RUNNING = 1;
|
|
17
|
+
CLEANPOD_POLICY_ALL = 2;
|
|
18
|
+
}
|
|
19
|
+
|
|
20
|
+
message RunPolicy {
|
|
21
|
+
// Defines the policy to kill pods after the job completes. Default to None.
|
|
22
|
+
CleanPodPolicy clean_pod_policy = 1;
|
|
23
|
+
|
|
24
|
+
// TTL to clean up jobs. Default to infinite.
|
|
25
|
+
int32 ttl_seconds_after_finished = 2;
|
|
26
|
+
|
|
27
|
+
// Specifies the duration in seconds relative to the startTime that the job may be active
|
|
28
|
+
// before the system tries to terminate it; value must be positive integer.
|
|
29
|
+
int32 active_deadline_seconds = 3;
|
|
30
|
+
|
|
31
|
+
// Number of retries before marking this job failed.
|
|
32
|
+
int32 backoff_limit = 4;
|
|
33
|
+
}
|
|
@@ -0,0 +1,43 @@
|
|
|
1
|
+
syntax = "proto3";
|
|
2
|
+
|
|
3
|
+
package flyteidl.plugins.kubeflow;
|
|
4
|
+
|
|
5
|
+
option go_package = "github.com/flyteorg/flyteidl/gen/pb-go/flyteidl/plugins";
|
|
6
|
+
|
|
7
|
+
import "flyteidl/core/tasks.proto";
|
|
8
|
+
import "flyteidl/plugins/kubeflow/common.proto";
|
|
9
|
+
|
|
10
|
+
// Proto for plugin that enables distributed training using https://github.com/kubeflow/mpi-operator
|
|
11
|
+
message DistributedMPITrainingTask {
|
|
12
|
+
// Worker replicas spec
|
|
13
|
+
DistributedMPITrainingReplicaSpec worker_replicas = 1;
|
|
14
|
+
|
|
15
|
+
// Launcher replicas spec
|
|
16
|
+
DistributedMPITrainingReplicaSpec launcher_replicas = 2;
|
|
17
|
+
|
|
18
|
+
// RunPolicy encapsulates various runtime policies of the distributed training
|
|
19
|
+
// job, for example how to clean up resources and how long the job can stay
|
|
20
|
+
// active.
|
|
21
|
+
RunPolicy run_policy = 3;
|
|
22
|
+
|
|
23
|
+
// Number of slots per worker
|
|
24
|
+
int32 slots = 4;
|
|
25
|
+
}
|
|
26
|
+
|
|
27
|
+
// Replica specification for distributed MPI training
|
|
28
|
+
message DistributedMPITrainingReplicaSpec {
|
|
29
|
+
// Number of replicas
|
|
30
|
+
int32 replicas = 1;
|
|
31
|
+
|
|
32
|
+
// Image used for the replica group
|
|
33
|
+
string image = 2;
|
|
34
|
+
|
|
35
|
+
// Resources required for the replica group
|
|
36
|
+
core.Resources resources = 3;
|
|
37
|
+
|
|
38
|
+
// Restart policy determines whether pods will be restarted when they exit
|
|
39
|
+
RestartPolicy restart_policy = 4;
|
|
40
|
+
|
|
41
|
+
// MPI sometimes requires different command set for different replica groups
|
|
42
|
+
repeated string command = 5;
|
|
43
|
+
}
|
|
@@ -0,0 +1,49 @@
|
|
|
1
|
+
syntax = "proto3";
|
|
2
|
+
|
|
3
|
+
package flyteidl.plugins.kubeflow;
|
|
4
|
+
|
|
5
|
+
option go_package = "github.com/flyteorg/flyteidl/gen/pb-go/flyteidl/plugins";
|
|
6
|
+
|
|
7
|
+
import "flyteidl/core/tasks.proto";
|
|
8
|
+
import "flyteidl/plugins/kubeflow/common.proto";
|
|
9
|
+
|
|
10
|
+
// Custom proto for torch elastic config for distributed training using
|
|
11
|
+
// https://github.com/kubeflow/training-operator/blob/master/pkg/apis/kubeflow.org/v1/pytorch_types.go
|
|
12
|
+
message ElasticConfig {
|
|
13
|
+
string rdzv_backend = 1;
|
|
14
|
+
int32 min_replicas = 2;
|
|
15
|
+
int32 max_replicas = 3;
|
|
16
|
+
int32 nproc_per_node = 4;
|
|
17
|
+
int32 max_restarts = 5;
|
|
18
|
+
}
|
|
19
|
+
|
|
20
|
+
// Proto for plugin that enables distributed training using https://github.com/kubeflow/pytorch-operator
|
|
21
|
+
message DistributedPyTorchTrainingTask {
|
|
22
|
+
// Worker replicas spec
|
|
23
|
+
DistributedPyTorchTrainingReplicaSpec worker_replicas = 1;
|
|
24
|
+
|
|
25
|
+
// Master replicas spec, master replicas can only have 1 replica
|
|
26
|
+
DistributedPyTorchTrainingReplicaSpec master_replicas = 2;
|
|
27
|
+
|
|
28
|
+
// RunPolicy encapsulates various runtime policies of the distributed training
|
|
29
|
+
// job, for example how to clean up resources and how long the job can stay
|
|
30
|
+
// active.
|
|
31
|
+
RunPolicy run_policy = 3;
|
|
32
|
+
|
|
33
|
+
// config for an elastic pytorch job
|
|
34
|
+
ElasticConfig elastic_config = 4;
|
|
35
|
+
}
|
|
36
|
+
|
|
37
|
+
message DistributedPyTorchTrainingReplicaSpec {
|
|
38
|
+
// Number of replicas
|
|
39
|
+
int32 replicas = 1;
|
|
40
|
+
|
|
41
|
+
// Image used for the replica group
|
|
42
|
+
string image = 2;
|
|
43
|
+
|
|
44
|
+
// Resources required for the replica group
|
|
45
|
+
core.Resources resources = 3;
|
|
46
|
+
|
|
47
|
+
// RestartPolicy determines whether pods will be restarted when they exit
|
|
48
|
+
RestartPolicy restart_policy = 4;
|
|
49
|
+
}
|
|
@@ -0,0 +1,39 @@
|
|
|
1
|
+
syntax = "proto3";
|
|
2
|
+
|
|
3
|
+
package flyteidl.plugins.kubeflow;
|
|
4
|
+
|
|
5
|
+
option go_package = "github.com/flyteorg/flyteidl/gen/pb-go/flyteidl/plugins";
|
|
6
|
+
|
|
7
|
+
import "flyteidl/core/tasks.proto";
|
|
8
|
+
import "flyteidl/plugins/kubeflow/common.proto";
|
|
9
|
+
|
|
10
|
+
// Proto for plugin that enables distributed training using https://github.com/kubeflow/tf-operator
|
|
11
|
+
message DistributedTensorflowTrainingTask {
|
|
12
|
+
// Worker replicas spec
|
|
13
|
+
DistributedTensorflowTrainingReplicaSpec worker_replicas = 1;
|
|
14
|
+
|
|
15
|
+
// Parameter server replicas spec
|
|
16
|
+
DistributedTensorflowTrainingReplicaSpec ps_replicas = 2;
|
|
17
|
+
|
|
18
|
+
// Chief replicas spec
|
|
19
|
+
DistributedTensorflowTrainingReplicaSpec chief_replicas = 3;
|
|
20
|
+
|
|
21
|
+
// RunPolicy encapsulates various runtime policies of the distributed training
|
|
22
|
+
// job, for example how to clean up resources and how long the job can stay
|
|
23
|
+
// active.
|
|
24
|
+
RunPolicy run_policy = 4;
|
|
25
|
+
}
|
|
26
|
+
|
|
27
|
+
message DistributedTensorflowTrainingReplicaSpec {
|
|
28
|
+
// Number of replicas
|
|
29
|
+
int32 replicas = 1;
|
|
30
|
+
|
|
31
|
+
// Image used for the replica group
|
|
32
|
+
string image = 2;
|
|
33
|
+
|
|
34
|
+
// Resources required for the replica group
|
|
35
|
+
core.Resources resources = 3;
|
|
36
|
+
|
|
37
|
+
// RestartPolicy determines whether pods will be restarted when they exit
|
|
38
|
+
RestartPolicy restart_policy = 4;
|
|
39
|
+
}
|
|
@@ -4,8 +4,22 @@ package flyteidl.plugins;
|
|
|
4
4
|
|
|
5
5
|
option go_package = "github.com/flyteorg/flyteidl/gen/pb-go/flyteidl/plugins";
|
|
6
6
|
|
|
7
|
+
// Custom proto for torch elastic config for distributed training using
|
|
8
|
+
// https://github.com/kubeflow/training-operator/blob/master/pkg/apis/kubeflow.org/v1/pytorch_types.go
|
|
9
|
+
message ElasticConfig {
|
|
10
|
+
string rdzv_backend = 1;
|
|
11
|
+
int32 min_replicas = 2;
|
|
12
|
+
int32 max_replicas = 3;
|
|
13
|
+
int32 nproc_per_node = 4;
|
|
14
|
+
int32 max_restarts = 5;
|
|
15
|
+
}
|
|
16
|
+
|
|
7
17
|
// Custom proto for plugin that enables distributed training using https://github.com/kubeflow/pytorch-operator
|
|
8
18
|
message DistributedPyTorchTrainingTask {
|
|
9
19
|
// number of worker replicas spawned in the cluster for this job
|
|
10
20
|
int32 workers = 1;
|
|
21
|
+
|
|
22
|
+
// config for an elastic pytorch job
|
|
23
|
+
//
|
|
24
|
+
ElasticConfig elastic_config = 2;
|
|
11
25
|
}
|
|
@@ -19,7 +19,6 @@ import "flyteidl/admin/task_execution.proto";
|
|
|
19
19
|
import "flyteidl/admin/version.proto";
|
|
20
20
|
import "flyteidl/admin/common.proto";
|
|
21
21
|
import "flyteidl/admin/description_entity.proto";
|
|
22
|
-
import "flyteidl/core/identifier.proto";
|
|
23
22
|
// import "protoc-gen-swagger/options/annotations.proto";
|
|
24
23
|
|
|
25
24
|
// The following defines an RPC service that is also served over HTTP via grpc-gateway.
|
|
@@ -627,4 +626,14 @@ service AdminService {
|
|
|
627
626
|
// description: "Fetch existing description entity definitions matching input filters."
|
|
628
627
|
// };
|
|
629
628
|
}
|
|
629
|
+
|
|
630
|
+
// Fetches runtime metrics for a :ref:`ref_flyteidl.admin.Execution`.
|
|
631
|
+
rpc GetExecutionMetrics (flyteidl.admin.WorkflowExecutionGetMetricsRequest) returns (flyteidl.admin.WorkflowExecutionGetMetricsResponse) {
|
|
632
|
+
option (google.api.http) = {
|
|
633
|
+
get: "/api/v1/metrics/executions/{id.project}/{id.domain}/{id.name}"
|
|
634
|
+
};
|
|
635
|
+
// option (grpc.gateway.protoc_gen_swagger.options.openapiv2_operation) = {
|
|
636
|
+
// description: "Retrieve metrics from an existing workflow execution."
|
|
637
|
+
// };
|
|
638
|
+
};
|
|
630
639
|
}
|
|
@@ -8,6 +8,8 @@ import "google/api/annotations.proto";
|
|
|
8
8
|
import "google/protobuf/duration.proto";
|
|
9
9
|
import "google/protobuf/timestamp.proto";
|
|
10
10
|
import "flyteidl/core/identifier.proto";
|
|
11
|
+
import "flyteidl/core/literals.proto";
|
|
12
|
+
|
|
11
13
|
|
|
12
14
|
message CreateUploadLocationResponse {
|
|
13
15
|
// SignedUrl specifies the url to use to upload content to (e.g. https://my-bucket.s3.amazonaws.com/randomstring/suffix.tar?X-...)
|
|
@@ -95,6 +97,18 @@ message CreateDownloadLinkRequest {
|
|
|
95
97
|
|
|
96
98
|
// CreateDownloadLinkResponse defines the response for the generated links
|
|
97
99
|
message CreateDownloadLinkResponse {
|
|
100
|
+
// SignedUrl specifies the url to use to download content from (e.g. https://my-bucket.s3.amazonaws.com/randomstring/suffix.tar?X-...)
|
|
101
|
+
repeated string signed_url = 1 [deprecated = true];
|
|
102
|
+
|
|
103
|
+
// ExpiresAt defines when the signed URL will expire.
|
|
104
|
+
google.protobuf.Timestamp expires_at = 2 [deprecated = true];
|
|
105
|
+
|
|
106
|
+
// New wrapper object containing the signed urls and expiration time
|
|
107
|
+
PreSignedURLs pre_signed_urls = 3;
|
|
108
|
+
}
|
|
109
|
+
|
|
110
|
+
// Wrapper object since the message is shared across this and the GetDataResponse
|
|
111
|
+
message PreSignedURLs {
|
|
98
112
|
// SignedUrl specifies the url to use to download content from (e.g. https://my-bucket.s3.amazonaws.com/randomstring/suffix.tar?X-...)
|
|
99
113
|
repeated string signed_url = 1;
|
|
100
114
|
|
|
@@ -102,6 +116,25 @@ message CreateDownloadLinkResponse {
|
|
|
102
116
|
google.protobuf.Timestamp expires_at = 2;
|
|
103
117
|
}
|
|
104
118
|
|
|
119
|
+
// General request to retrieve data from a Flyte artifact URL.
|
|
120
|
+
message GetDataRequest {
|
|
121
|
+
// A unique identifier in the form of flyte://<something> that uniquely, for a given Flyte
|
|
122
|
+
// backend, identifies a Flyte artifact ([i]nput, [o]utput, flyte [d]eck, etc.).
|
|
123
|
+
// e.g. flyte://v1/proj/development/execid/n2/0/i (for 0th task execution attempt input)
|
|
124
|
+
// flyte://v1/proj/development/execid/n2/i (for node execution input)
|
|
125
|
+
string flyte_url = 1;
|
|
126
|
+
}
|
|
127
|
+
|
|
128
|
+
message GetDataResponse {
|
|
129
|
+
oneof data {
|
|
130
|
+
// literal map data will be returned
|
|
131
|
+
core.LiteralMap literal_map = 1;
|
|
132
|
+
|
|
133
|
+
// Flyte deck html will be returned as a signed url users can download
|
|
134
|
+
PreSignedURLs pre_signed_urls = 2;
|
|
135
|
+
}
|
|
136
|
+
}
|
|
137
|
+
|
|
105
138
|
// DataProxyService defines an RPC Service that allows access to user-data in a controlled manner.
|
|
106
139
|
service DataProxyService {
|
|
107
140
|
// CreateUploadLocation creates a signed url to upload artifacts to for a given project/domain.
|
|
@@ -136,4 +169,11 @@ service DataProxyService {
|
|
|
136
169
|
// description: "Creates a read-only http location that is accessible for tasks at runtime."
|
|
137
170
|
// };
|
|
138
171
|
}
|
|
172
|
+
|
|
173
|
+
rpc GetData (GetDataRequest) returns (GetDataResponse) {
|
|
174
|
+
// Takes an address like flyte://v1/proj/development/execid/n2/0/i and returns the actual data
|
|
175
|
+
option (google.api.http) = {
|
|
176
|
+
get: "/api/v1/data"
|
|
177
|
+
};
|
|
178
|
+
}
|
|
139
179
|
}
|
|
@@ -0,0 +1,74 @@
|
|
|
1
|
+
syntax = "proto3";
|
|
2
|
+
package flyteidl.service;
|
|
3
|
+
|
|
4
|
+
option go_package = "github.com/flyteorg/flyteidl/gen/pb-go/flyteidl/service";
|
|
5
|
+
import "flyteidl/core/literals.proto";
|
|
6
|
+
import "flyteidl/core/tasks.proto";
|
|
7
|
+
import "flyteidl/core/interface.proto";
|
|
8
|
+
|
|
9
|
+
// ExternalPluginService defines an RPC Service that allows propeller to send the request to the backend plugin server.
|
|
10
|
+
service ExternalPluginService {
|
|
11
|
+
// Send a task create request to the backend plugin server.
|
|
12
|
+
rpc CreateTask (TaskCreateRequest) returns (TaskCreateResponse){};
|
|
13
|
+
// Get job status.
|
|
14
|
+
rpc GetTask (TaskGetRequest) returns (TaskGetResponse){};
|
|
15
|
+
// Delete the task resource.
|
|
16
|
+
rpc DeleteTask (TaskDeleteRequest) returns (TaskDeleteResponse){};
|
|
17
|
+
}
|
|
18
|
+
|
|
19
|
+
// The state of the execution is used to control its visibility in the UI/CLI.
|
|
20
|
+
enum State {
|
|
21
|
+
RETRYABLE_FAILURE = 0;
|
|
22
|
+
PERMANENT_FAILURE = 1;
|
|
23
|
+
PENDING = 2;
|
|
24
|
+
RUNNING = 3;
|
|
25
|
+
SUCCEEDED = 4;
|
|
26
|
+
}
|
|
27
|
+
|
|
28
|
+
// Represents a request structure to create task.
|
|
29
|
+
message TaskCreateRequest {
|
|
30
|
+
// The inputs required to start the execution. All required inputs must be
|
|
31
|
+
// included in this map. If not required and not provided, defaults apply.
|
|
32
|
+
// +optional
|
|
33
|
+
core.LiteralMap inputs = 1;
|
|
34
|
+
// Template of the task that encapsulates all the metadata of the task.
|
|
35
|
+
core.TaskTemplate template = 2;
|
|
36
|
+
// Prefix for where task output data will be written. (e.g. s3://my-bucket/randomstring)
|
|
37
|
+
string output_prefix = 3;
|
|
38
|
+
}
|
|
39
|
+
|
|
40
|
+
// Represents a create response structure.
|
|
41
|
+
message TaskCreateResponse {
|
|
42
|
+
string job_id = 1;
|
|
43
|
+
}
|
|
44
|
+
|
|
45
|
+
// A message used to fetch a job state from backend plugin server.
|
|
46
|
+
message TaskGetRequest {
|
|
47
|
+
// A predefined yet extensible Task type identifier.
|
|
48
|
+
string task_type = 1;
|
|
49
|
+
// The unique id identifying the job.
|
|
50
|
+
string job_id = 2;
|
|
51
|
+
}
|
|
52
|
+
|
|
53
|
+
// Response to get an individual task state.
|
|
54
|
+
message TaskGetResponse {
|
|
55
|
+
// The state of the execution is used to control its visibility in the UI/CLI.
|
|
56
|
+
State state = 1;
|
|
57
|
+
// The outputs of the execution. It's typically used by sql task. Flyteplugins service will create a
|
|
58
|
+
// Structured dataset pointing to the query result table.
|
|
59
|
+
// +optional
|
|
60
|
+
core.LiteralMap outputs = 2;
|
|
61
|
+
}
|
|
62
|
+
|
|
63
|
+
// A message used to delete a task.
|
|
64
|
+
message TaskDeleteRequest {
|
|
65
|
+
// A predefined yet extensible Task type identifier.
|
|
66
|
+
string task_type = 1;
|
|
67
|
+
// The unique id identifying the job.
|
|
68
|
+
string job_id = 2;
|
|
69
|
+
}
|
|
70
|
+
|
|
71
|
+
// Response to delete a task.
|
|
72
|
+
message TaskDeleteResponse {
|
|
73
|
+
}
|
|
74
|
+
|