@flyteorg/flyteidl 1.5.0 → 1.5.1
This diff shows the changes between publicly released versions of the package as they appear in their respective public registries. It is provided for informational purposes only.
package/package.json CHANGED
@@ -0,0 +1,33 @@
+syntax = "proto3";
+
+package flyteidl.plugins.kubeflow;
+
+option go_package = "github.com/flyteorg/flyteidl/gen/pb-go/flyteidl/plugins";
+
+
+enum RestartPolicy {
+    RESTART_POLICY_NEVER = 0;
+    RESTART_POLICY_ON_FAILURE = 1;
+    RESTART_POLICY_ALWAYS = 2;
+}
+
+enum CleanPodPolicy {
+    CLEANPOD_POLICY_NONE = 0;
+    CLEANPOD_POLICY_RUNNING = 1;
+    CLEANPOD_POLICY_ALL = 2;
+}
+
+message RunPolicy {
+    // Defines the policy to kill pods after the job completes. Default to None.
+    CleanPodPolicy clean_pod_policy = 1;
+
+    // TTL to clean up jobs. Default to infinite.
+    int32 ttl_seconds_after_finished = 2;
+
+    // Specifies the duration in seconds relative to the startTime that the job may be active
+    // before the system tries to terminate it; value must be positive integer.
+    int32 active_deadline_seconds = 3;
+
+    // Number of retries before marking this job failed.
+    int32 backoff_limit = 4;
+}
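The hunk above adds the shared types (RestartPolicy, CleanPodPolicy, RunPolicy) used by the MPI, PyTorch, and TensorFlow task protos in the hunks that follow; judging by the imports below, this file appears to be flyteidl/plugins/kubeflow/common.proto. As a rough illustration of how a consumer might populate RunPolicy, here is a minimal Python sketch; the module path and the *_pb2 naming follow standard protoc output conventions and are assumptions, not something shown in this diff.

# Hypothetical usage sketch: module path and *_pb2 naming are assumptions.
from flyteidl.plugins.kubeflow import common_pb2

# Clean up all pods once the job finishes, keep the finished job for an hour,
# cap active time at two hours, and allow up to three retries.
run_policy = common_pb2.RunPolicy(
    clean_pod_policy=common_pb2.CLEANPOD_POLICY_ALL,
    ttl_seconds_after_finished=3600,
    active_deadline_seconds=7200,
    backoff_limit=3,
)

print(run_policy)  # text-format dump of the populated fields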
@@ -0,0 +1,43 @@
+syntax = "proto3";
+
+package flyteidl.plugins.kubeflow;
+
+option go_package = "github.com/flyteorg/flyteidl/gen/pb-go/flyteidl/plugins";
+
+import "flyteidl/core/tasks.proto";
+import "flyteidl/plugins/kubeflow/common.proto";
+
+// Proto for plugin that enables distributed training using https://github.com/kubeflow/mpi-operator
+message DistributedMPITrainingTask {
+    // Worker replicas spec
+    DistributedMPITrainingReplicaSpec worker_replicas = 1;
+
+    // Master replicas spec
+    DistributedMPITrainingReplicaSpec launcher_replicas = 2;
+
+    // RunPolicy encapsulates various runtime policies of the distributed training
+    // job, for example how to clean up resources and how long the job can stay
+    // active.
+    RunPolicy run_policy = 3;
+
+    // Number of slots per worker
+    int32 slots = 4;
+}
+
+// Replica specification for distributed MPI training
+message DistributedMPITrainingReplicaSpec {
+    // Number of replicas
+    int32 replicas = 1;
+
+    // Image used for the replica group
+    string image = 2;
+
+    // Resources required for the replica group
+    core.Resources resources = 3;
+
+    // Restart policy determines whether pods will be restarted when they exit
+    RestartPolicy restart_policy = 4;
+
+    // MPI sometimes requires different command set for different replica groups
+    repeated string command = 5;
+}
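This hunk adds the MPI task proto (presumably flyteidl/plugins/kubeflow/mpi.proto), with a launcher replica group, a worker replica group, and a per-group command override. A minimal Python construction sketch follows; the mpi_pb2/common_pb2 module names, the image, and the command are illustrative assumptions, not values from this diff.

# Hypothetical usage sketch: module names and all concrete values are assumptions.
from flyteidl.plugins.kubeflow import common_pb2, mpi_pb2

task = mpi_pb2.DistributedMPITrainingTask(
    worker_replicas=mpi_pb2.DistributedMPITrainingReplicaSpec(
        replicas=4,
        image="example.com/mpi-worker:latest",  # placeholder image
        restart_policy=common_pb2.RESTART_POLICY_ON_FAILURE,
        command=["mpirun", "-np", "4", "python", "train.py"],  # placeholder command
    ),
    launcher_replicas=mpi_pb2.DistributedMPITrainingReplicaSpec(replicas=1),
    run_policy=common_pb2.RunPolicy(
        clean_pod_policy=common_pb2.CLEANPOD_POLICY_RUNNING,
    ),
    slots=1,
)

payload = task.SerializeToString()  # wire-format bytes of the task config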
@@ -0,0 +1,49 @@
+syntax = "proto3";
+
+package flyteidl.plugins.kubeflow;
+
+option go_package = "github.com/flyteorg/flyteidl/gen/pb-go/flyteidl/plugins";
+
+import "flyteidl/core/tasks.proto";
+import "flyteidl/plugins/kubeflow/common.proto";
+
+// Custom proto for torch elastic config for distributed training using
+// https://github.com/kubeflow/training-operator/blob/master/pkg/apis/kubeflow.org/v1/pytorch_types.go
+message ElasticConfig {
+    string rdzv_backend = 1;
+    int32 min_replicas = 2;
+    int32 max_replicas = 3;
+    int32 nproc_per_node = 4;
+    int32 max_restarts = 5;
+}
+
+// Proto for plugin that enables distributed training using https://github.com/kubeflow/pytorch-operator
+message DistributedPyTorchTrainingTask {
+    // Worker replicas spec
+    DistributedPyTorchTrainingReplicaSpec worker_replicas = 1;
+
+    // Master replicas spec, master replicas can only have 1 replica
+    DistributedPyTorchTrainingReplicaSpec master_replicas = 2;
+
+    // RunPolicy encapsulates various runtime policies of the distributed training
+    // job, for example how to clean up resources and how long the job can stay
+    // active.
+    RunPolicy run_policy = 3;
+
+    // config for an elastic pytorch job
+    ElasticConfig elastic_config = 4;
+}
+
+message DistributedPyTorchTrainingReplicaSpec {
+    // Number of replicas
+    int32 replicas = 1;
+
+    // Image used for the replica group
+    string image = 2;
+
+    // Resources required for the replica group
+    core.Resources resources = 3;
+
+    // RestartPolicy determines whether pods will be restarted when they exit
+    RestartPolicy restart_policy = 4;
+}
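This hunk adds the PyTorch task proto (presumably flyteidl/plugins/kubeflow/pytorch.proto), including the new ElasticConfig for torch elastic jobs. The sketch below shows one way the messages might be assembled; the pytorch_pb2/common_pb2 module names, the "c10d" rendezvous backend, and the image are assumptions for illustration only.

# Hypothetical usage sketch: module names and all concrete values are assumptions.
from flyteidl.plugins.kubeflow import common_pb2, pytorch_pb2

elastic = pytorch_pb2.ElasticConfig(
    rdzv_backend="c10d",  # assumed rendezvous backend, not specified by this diff
    min_replicas=2,
    max_replicas=4,
    nproc_per_node=8,
    max_restarts=3,
)

task = pytorch_pb2.DistributedPyTorchTrainingTask(
    worker_replicas=pytorch_pb2.DistributedPyTorchTrainingReplicaSpec(
        replicas=4,
        image="example.com/pytorch-worker:latest",  # placeholder image
        restart_policy=common_pb2.RESTART_POLICY_ON_FAILURE,
    ),
    master_replicas=pytorch_pb2.DistributedPyTorchTrainingReplicaSpec(replicas=1),
    run_policy=common_pb2.RunPolicy(clean_pod_policy=common_pb2.CLEANPOD_POLICY_ALL),
    elastic_config=elastic,
)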
@@ -0,0 +1,39 @@
+syntax = "proto3";
+
+package flyteidl.plugins.kubeflow;
+
+option go_package = "github.com/flyteorg/flyteidl/gen/pb-go/flyteidl/plugins";
+
+import "flyteidl/core/tasks.proto";
+import "flyteidl/plugins/kubeflow/common.proto";
+
+// Proto for plugin that enables distributed training using https://github.com/kubeflow/tf-operator
+message DistributedTensorflowTrainingTask {
+    // Worker replicas spec
+    DistributedTensorflowTrainingReplicaSpec worker_replicas = 1;
+
+    // Parameter server replicas spec
+    DistributedTensorflowTrainingReplicaSpec ps_replicas = 2;
+
+    // Chief replicas spec
+    DistributedTensorflowTrainingReplicaSpec chief_replicas = 3;
+
+    // RunPolicy encapsulates various runtime policies of the distributed training
+    // job, for example how to clean up resources and how long the job can stay
+    // active.
+    RunPolicy run_policy = 4;
+}
+
+message DistributedTensorflowTrainingReplicaSpec {
+    // Number of replicas
+    int32 replicas = 1;
+
+    // Image used for the replica group
+    string image = 2;
+
+    // Resources required for the replica group
+    core.Resources resources = 3;
+
+    // RestartPolicy Determines whether pods will be restarted when they exit
+    RestartPolicy restart_policy = 4;
+}
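This last hunk adds the TensorFlow task proto (presumably flyteidl/plugins/kubeflow/tensorflow.proto), which splits replicas into worker, parameter-server, and chief groups. A short Python sketch under the same assumptions as above (tensorflow_pb2/common_pb2 module names and placeholder values) shows how the three groups might be wired together.

# Hypothetical usage sketch: module names and all concrete values are assumptions.
from flyteidl.plugins.kubeflow import common_pb2, tensorflow_pb2

Spec = tensorflow_pb2.DistributedTensorflowTrainingReplicaSpec
task = tensorflow_pb2.DistributedTensorflowTrainingTask(
    worker_replicas=Spec(replicas=2, image="example.com/tf-worker:latest"),  # placeholder image
    ps_replicas=Spec(replicas=1),     # parameter servers
    chief_replicas=Spec(replicas=1),  # chief coordinates training
    run_policy=common_pb2.RunPolicy(
        clean_pod_policy=common_pb2.CLEANPOD_POLICY_RUNNING,
        backoff_limit=2,
    ),
)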