@flyteorg/flyteidl 1.5.0 → 1.5.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@flyteorg/flyteidl",
3
- "version": "1.5.0",
3
+ "version": "1.5.1",
4
4
  "description": "Compiled protocol buffers and gRPC service clients/servers for Flyte IDLs",
5
5
  "repository": {
6
6
  "type": "git",
package/protos/flyteidl/plugins/kubeflow/common.proto ADDED (filename inferred from the `import "flyteidl/plugins/kubeflow/common.proto"` statements in the hunks below)
@@ -0,0 +1,33 @@
1
+ syntax = "proto3";
2
+
3
+ package flyteidl.plugins.kubeflow;
4
+
5
+ option go_package = "github.com/flyteorg/flyteidl/gen/pb-go/flyteidl/plugins";
6
+
7
+
8
+ enum RestartPolicy {
9
+ RESTART_POLICY_NEVER = 0;
10
+ RESTART_POLICY_ON_FAILURE = 1;
11
+ RESTART_POLICY_ALWAYS = 2;
12
+ }
13
+
14
+ enum CleanPodPolicy {
15
+ CLEANPOD_POLICY_NONE = 0;
16
+ CLEANPOD_POLICY_RUNNING = 1;
17
+ CLEANPOD_POLICY_ALL = 2;
18
+ }
19
+
20
+ message RunPolicy {
21
+ // Defines the policy to kill pods after the job completes. Default to None.
22
+ CleanPodPolicy clean_pod_policy = 1;
23
+
24
+ // TTL to clean up jobs. Default to infinite.
25
+ int32 ttl_seconds_after_finished = 2;
26
+
27
+ // Specifies the duration in seconds relative to the startTime that the job may be active
28
+ // before the system tries to terminate it; value must be positive integer.
29
+ int32 active_deadline_seconds = 3;
30
+
31
+ // Number of retries before marking this job failed.
32
+ int32 backoff_limit = 4;
33
+ }
package/protos/flyteidl/plugins/kubeflow/mpi.proto ADDED (filename inferred from message content: DistributedMPITrainingTask)
@@ -0,0 +1,43 @@
1
+ syntax = "proto3";
2
+
3
+ package flyteidl.plugins.kubeflow;
4
+
5
+ option go_package = "github.com/flyteorg/flyteidl/gen/pb-go/flyteidl/plugins";
6
+
7
+ import "flyteidl/core/tasks.proto";
8
+ import "flyteidl/plugins/kubeflow/common.proto";
9
+
10
+ // Proto for plugin that enables distributed training using https://github.com/kubeflow/mpi-operator
11
+ message DistributedMPITrainingTask {
12
+ // Worker replicas spec
13
+ DistributedMPITrainingReplicaSpec worker_replicas = 1;
14
+
15
+ // Master replicas spec
16
+ DistributedMPITrainingReplicaSpec launcher_replicas = 2;
17
+
18
+ // RunPolicy encapsulates various runtime policies of the distributed training
19
+ // job, for example how to clean up resources and how long the job can stay
20
+ // active.
21
+ RunPolicy run_policy = 3;
22
+
23
+ // Number of slots per worker
24
+ int32 slots = 4;
25
+ }
26
+
27
+ // Replica specification for distributed MPI training
28
+ message DistributedMPITrainingReplicaSpec {
29
+ // Number of replicas
30
+ int32 replicas = 1;
31
+
32
+ // Image used for the replica group
33
+ string image = 2;
34
+
35
+ // Resources required for the replica group
36
+ core.Resources resources = 3;
37
+
38
+ // Restart policy determines whether pods will be restarted when they exit
39
+ RestartPolicy restart_policy = 4;
40
+
41
+ // MPI sometimes requires different command set for different replica groups
42
+ repeated string command = 5;
43
+ }
package/protos/flyteidl/plugins/kubeflow/pytorch.proto ADDED (filename inferred from message content: DistributedPyTorchTrainingTask)
@@ -0,0 +1,49 @@
1
+ syntax = "proto3";
2
+
3
+ package flyteidl.plugins.kubeflow;
4
+
5
+ option go_package = "github.com/flyteorg/flyteidl/gen/pb-go/flyteidl/plugins";
6
+
7
+ import "flyteidl/core/tasks.proto";
8
+ import "flyteidl/plugins/kubeflow/common.proto";
9
+
10
+ // Custom proto for torch elastic config for distributed training using
11
+ // https://github.com/kubeflow/training-operator/blob/master/pkg/apis/kubeflow.org/v1/pytorch_types.go
12
+ message ElasticConfig {
13
+ string rdzv_backend = 1;
14
+ int32 min_replicas = 2;
15
+ int32 max_replicas = 3;
16
+ int32 nproc_per_node = 4;
17
+ int32 max_restarts = 5;
18
+ }
19
+
20
+ // Proto for plugin that enables distributed training using https://github.com/kubeflow/pytorch-operator
21
+ message DistributedPyTorchTrainingTask {
22
+ // Worker replicas spec
23
+ DistributedPyTorchTrainingReplicaSpec worker_replicas = 1;
24
+
25
+ // Master replicas spec, master replicas can only have 1 replica
26
+ DistributedPyTorchTrainingReplicaSpec master_replicas = 2;
27
+
28
+ // RunPolicy encapsulates various runtime policies of the distributed training
29
+ // job, for example how to clean up resources and how long the job can stay
30
+ // active.
31
+ RunPolicy run_policy = 3;
32
+
33
+ // config for an elastic pytorch job
34
+ ElasticConfig elastic_config = 4;
35
+ }
36
+
37
+ message DistributedPyTorchTrainingReplicaSpec {
38
+ // Number of replicas
39
+ int32 replicas = 1;
40
+
41
+ // Image used for the replica group
42
+ string image = 2;
43
+
44
+ // Resources required for the replica group
45
+ core.Resources resources = 3;
46
+
47
+ // RestartPolicy determines whether pods will be restarted when they exit
48
+ RestartPolicy restart_policy = 4;
49
+ }
package/protos/flyteidl/plugins/kubeflow/tensorflow.proto ADDED (filename inferred from message content: DistributedTensorflowTrainingTask)
@@ -0,0 +1,39 @@
1
+ syntax = "proto3";
2
+
3
+ package flyteidl.plugins.kubeflow;
4
+
5
+ option go_package = "github.com/flyteorg/flyteidl/gen/pb-go/flyteidl/plugins";
6
+
7
+ import "flyteidl/core/tasks.proto";
8
+ import "flyteidl/plugins/kubeflow/common.proto";
9
+
10
+ // Proto for plugin that enables distributed training using https://github.com/kubeflow/tf-operator
11
+ message DistributedTensorflowTrainingTask {
12
+ // Worker replicas spec
13
+ DistributedTensorflowTrainingReplicaSpec worker_replicas = 1;
14
+
15
+ // Parameter server replicas spec
16
+ DistributedTensorflowTrainingReplicaSpec ps_replicas = 2;
17
+
18
+ // Chief replicas spec
19
+ DistributedTensorflowTrainingReplicaSpec chief_replicas = 3;
20
+
21
+ // RunPolicy encapsulates various runtime policies of the distributed training
22
+ // job, for example how to clean up resources and how long the job can stay
23
+ // active.
24
+ RunPolicy run_policy = 4;
25
+ }
26
+
27
+ message DistributedTensorflowTrainingReplicaSpec {
28
+ // Number of replicas
29
+ int32 replicas = 1;
30
+
31
+ // Image used for the replica group
32
+ string image = 2;
33
+
34
+ // Resources required for the replica group
35
+ core.Resources resources = 3;
36
+
37
+ // RestartPolicy Determines whether pods will be restarted when they exit
38
+ RestartPolicy restart_policy = 4;
39
+ }