lightning-sdk 0.1.39__py3-none-any.whl → 0.1.41__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- lightning_sdk/__init__.py +1 -1
- lightning_sdk/_mmt/__init__.py +3 -0
- lightning_sdk/_mmt/base.py +180 -0
- lightning_sdk/_mmt/mmt.py +161 -0
- lightning_sdk/_mmt/v1.py +69 -0
- lightning_sdk/_mmt/v2.py +141 -0
- lightning_sdk/api/mmt_api.py +148 -0
- lightning_sdk/api/teamspace_api.py +0 -9
- lightning_sdk/api/utils.py +2 -1
- lightning_sdk/cli/mmt.py +137 -0
- lightning_sdk/job/base.py +2 -3
- lightning_sdk/job/v1.py +2 -1
- lightning_sdk/job/v2.py +6 -9
- lightning_sdk/lightning_cloud/openapi/__init__.py +8 -3
- lightning_sdk/lightning_cloud/openapi/api/assistants_service_api.py +90 -284
- lightning_sdk/lightning_cloud/openapi/api/models_store_api.py +118 -1
- lightning_sdk/lightning_cloud/openapi/api/secret_service_api.py +5 -1
- lightning_sdk/lightning_cloud/openapi/models/__init__.py +8 -3
- lightning_sdk/lightning_cloud/openapi/models/create_deployment_request_defines_a_spec_for_the_job_that_allows_for_autoscaling_jobs.py +27 -1
- lightning_sdk/lightning_cloud/openapi/models/deployments_id_body.py +27 -1
- lightning_sdk/lightning_cloud/openapi/models/model_id_visibility_body.py +123 -0
- lightning_sdk/lightning_cloud/openapi/models/v1_aws_direct_v1.py +27 -1
- lightning_sdk/lightning_cloud/openapi/models/{project_id_agentmanagedmodels_body.py → v1_body.py} +21 -47
- lightning_sdk/lightning_cloud/openapi/models/v1_deployment.py +27 -1
- lightning_sdk/lightning_cloud/openapi/models/v1_deployment_api.py +227 -0
- lightning_sdk/lightning_cloud/openapi/models/v1_deployment_spec.py +27 -1
- lightning_sdk/lightning_cloud/openapi/models/v1_get_model_files_response.py +27 -1
- lightning_sdk/lightning_cloud/openapi/models/v1_header.py +175 -0
- lightning_sdk/lightning_cloud/openapi/models/v1_job_spec.py +27 -1
- lightning_sdk/lightning_cloud/openapi/models/v1_managed_model.py +55 -3
- lightning_sdk/lightning_cloud/openapi/models/v1_managed_model_abilities.py +175 -0
- lightning_sdk/lightning_cloud/openapi/models/v1_model.py +29 -3
- lightning_sdk/lightning_cloud/openapi/models/v1_multi_machine_job.py +27 -1
- lightning_sdk/lightning_cloud/openapi/models/v1_multi_machine_job_state.py +1 -0
- lightning_sdk/lightning_cloud/openapi/models/v1_query_param.py +175 -0
- lightning_sdk/lightning_cloud/openapi/models/{v1_list_managed_models_response.py → v1_resource_visibility.py} +23 -23
- lightning_sdk/lightning_cloud/openapi/models/v1_rule_resource.py +1 -0
- lightning_sdk/lightning_cloud/openapi/models/v1_secret_type.py +1 -0
- lightning_sdk/lightning_cloud/openapi/models/{v1_delete_managed_model_response.py → v1_update_model_visibility_response.py} +6 -6
- lightning_sdk/lightning_cloud/openapi/models/v1_user_features.py +41 -15
- lightning_sdk/models.py +29 -8
- lightning_sdk/teamspace.py +17 -15
- {lightning_sdk-0.1.39.dist-info → lightning_sdk-0.1.41.dist-info}/METADATA +1 -1
- {lightning_sdk-0.1.39.dist-info → lightning_sdk-0.1.41.dist-info}/RECORD +48 -36
- {lightning_sdk-0.1.39.dist-info → lightning_sdk-0.1.41.dist-info}/entry_points.txt +1 -0
- {lightning_sdk-0.1.39.dist-info → lightning_sdk-0.1.41.dist-info}/LICENSE +0 -0
- {lightning_sdk-0.1.39.dist-info → lightning_sdk-0.1.41.dist-info}/WHEEL +0 -0
- {lightning_sdk-0.1.39.dist-info → lightning_sdk-0.1.41.dist-info}/top_level.txt +0 -0
lightning_sdk/__init__.py
CHANGED
|
lightning_sdk/_mmt/base.py
ADDED
|
@@ -0,0 +1,180 @@
|
|
|
1
|
+
from abc import abstractmethod
|
|
2
|
+
from typing import TYPE_CHECKING, Dict, Optional, Tuple, Union
|
|
3
|
+
|
|
4
|
+
if TYPE_CHECKING:
|
|
5
|
+
from lightning_sdk.machine import Machine
|
|
6
|
+
from lightning_sdk.organization import Organization
|
|
7
|
+
from lightning_sdk.status import Status
|
|
8
|
+
from lightning_sdk.studio import Studio
|
|
9
|
+
from lightning_sdk.teamspace import Teamspace
|
|
10
|
+
from lightning_sdk.user import User
|
|
11
|
+
|
|
12
|
+
from lightning_sdk.job.base import _BaseJob
|
|
13
|
+
from lightning_sdk.job.job import Job
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
class _BaseMMT(_BaseJob):
    """Common interface and argument validation for multi-machine training (MMT) jobs.

    ``run`` validates the user-facing arguments and delegates the actual submission to ``_submit``,
    which concrete subclasses implement together with the remaining abstract members.
    """

    @classmethod
    def run(
        cls,
        name: str,
        machine: "Machine",
        num_machines: int,
        command: Optional[str] = None,
        studio: Union["Studio", str, None] = None,
        image: Optional[str] = None,
        teamspace: Union[str, "Teamspace", None] = None,
        org: Union[str, "Organization", None] = None,
        user: Union[str, "User", None] = None,
        cluster: Optional[str] = None,
        env: Optional[Dict[str, str]] = None,
        interruptible: bool = False,
        image_credentials: Optional[str] = None,
        cluster_auth: bool = False,
        artifacts_local: Optional[str] = None,
        artifacts_remote: Optional[str] = None,
    ) -> "_BaseMMT":
        """Validate the arguments, then create an instance of ``cls`` and submit the job.

        Args:
            name: Name of the job; must be non-empty.
            machine: Forwarded unchanged to ``_submit``.
            num_machines: Number of machines; must be at least 2.
            command: Forwarded unchanged to ``_submit``.
            studio: A ``Studio`` instance, or a value passed as ``name`` to ``Studio(...)``.
                Only consulted when ``image`` is None; mutually exclusive with ``image``.
            image: Custom image to run in. Mutually exclusive with ``studio``.
            teamspace: Teamspace (or its name). When a studio is used and this is None, the
                studio's teamspace is used; otherwise the names have to match.
            org: Forwarded to the ``Studio`` lookup and to the ``cls`` constructor.
            user: Forwarded to the ``Studio`` lookup and to the ``cls`` constructor.
            cluster: When a studio is used and this is None, the studio's cluster is used;
                otherwise it has to equal the studio's cluster.
            env: Forwarded unchanged to ``_submit``.
            interruptible: Forwarded unchanged to ``_submit``.
            image_credentials: Only accepted together with ``image``.
            cluster_auth: Only accepted together with ``image``.
            artifacts_local: Only accepted together with ``image``; requires ``artifacts_remote``.
            artifacts_remote: Only accepted together with ``image``; requires ``artifacts_local`` and
                must consist of exactly three colon-separated parts.

        Returns:
            The newly created instance after ``_submit`` has been called on it.

        Raises:
            ValueError: If any of the validations described above fails.
            RuntimeError: If both ``image`` and ``studio`` are provided.
        """
        # imported here rather than at module level, where it is only imported for type checking
        from lightning_sdk.studio import Studio

        if num_machines <= 1:
            raise ValueError("Multi-Machine training cannot be run with less than 2 Machines")

        if not name:
            raise ValueError("A job needs to have a name!")

        if image is None:
            if not isinstance(studio, Studio):
                studio = Studio(name=studio, teamspace=teamspace, org=org, user=user, cluster=cluster, create_ok=False)

            # studio is a Studio instance at this point
            if teamspace is None:
                teamspace = studio.teamspace
            else:
                teamspace_name = teamspace if isinstance(teamspace, str) else teamspace.name

                if studio.teamspace.name != teamspace_name:
                    raise ValueError(
                        "Studio teamspace does not match provided teamspace. "
                        "Can only run jobs with Studio envs in the teamspace of that Studio."
                    )

            if cluster is None:
                cluster = studio.cluster

            if cluster != studio.cluster:
                raise ValueError(
                    "Studio cluster does not match provided cluster. "
                    "Can only run jobs with Studio envs in the same cluster."
                )

            if image_credentials is not None:
                raise ValueError("image_credentials is only supported when using a custom image")

            if cluster_auth:
                raise ValueError("cluster_auth is only supported when using a custom image")

            if artifacts_local is not None or artifacts_remote is not None:
                raise ValueError(
                    "Specifying artifacts persistence is supported for docker images only. "
                    "Other jobs will automatically persist artifacts to the teamspace distributed filesystem."
                )

        else:
            if studio is not None:
                raise RuntimeError(
                    "image and studio are mutually exclusive as both define the environment to run the job in"
                )

            # either both of them need to be specified or none of them
            if bool(artifacts_local) != bool(artifacts_remote):
                raise ValueError("Artifact persistence requires both artifacts_local and artifacts_remote to be set")

            if artifacts_remote and len(artifacts_remote.split(":")) != 3:
                # report the value that was actually validated (artifacts_remote, not artifacts_local)
                raise ValueError(
                    "Artifact persistence requires exactly three arguments separated by colon of kind "
                    f"<CONNECTION_TYPE>:<CONNECTION_NAME>:<PATH_WITHIN_CONNECTION>, got {artifacts_remote}"
                )

        # _fetch_job=False: the job is created by the _submit call below
        inst = cls(name=name, teamspace=teamspace, org=org, user=user, _fetch_job=False)
        inst._submit(
            num_machines=num_machines,
            machine=machine,
            cluster=cluster,
            command=command,
            studio=studio,
            image=image,
            env=env,
            interruptible=interruptible,
            image_credentials=image_credentials,
            cluster_auth=cluster_auth,
            artifacts_local=artifacts_local,
            artifacts_remote=artifacts_remote,
        )
        return inst

    @abstractmethod
    def _submit(
        self,
        num_machines: int,
        machine: "Machine",
        command: Optional[str] = None,
        studio: Optional["Studio"] = None,
        image: Optional[str] = None,
        env: Optional[Dict[str, str]] = None,
        interruptible: bool = False,
        cluster: Optional[str] = None,
        image_credentials: Optional[str] = None,
        cluster_auth: bool = False,
        artifacts_local: Optional[str] = None,
        artifacts_remote: Optional[str] = None,
    ) -> None:
        """Submits a job and updates the internal _job attribute as well as the _name attribute."""

    @property
    @abstractmethod
    def machines(self) -> Tuple["Job", ...]:
        """Return the ``Job`` objects of this multi-machine job as a tuple."""

    @property
    @abstractmethod
    def machine(self) -> "Machine":
        """Return the ``Machine`` of this job."""

    @abstractmethod
    def stop(self) -> None:
        """Stop the job."""

    @abstractmethod
    def delete(self) -> None:
        """Delete the job."""

    @property
    @abstractmethod
    def status(self) -> "Status":
        """Return the ``Status`` of the job."""

    @property
    @abstractmethod
    def artifact_path(self) -> Optional[str]:
        """Return the artifact path of the job, or None."""

    @property
    @abstractmethod
    def snapshot_path(self) -> Optional[str]:
        """Return the snapshot path of the job, or None."""

    @property
    def share_path(self) -> Optional[str]:
        """Return None; this base class provides no share path."""
        return None

    @abstractmethod
    def _update_internal_job(self) -> None:
        """Refresh the internally stored job."""

    @property
    def name(self) -> str:
        """Return the stored job name (``_name``)."""
        return self._name

    @property
    def teamspace(self) -> "Teamspace":
        """Return the stored teamspace (``_teamspace``)."""
        return self._teamspace
|
|
lightning_sdk/_mmt/mmt.py
ADDED
|
@@ -0,0 +1,161 @@
|
|
|
1
|
+
from typing import TYPE_CHECKING, Any, Dict, Optional, Tuple, Union
|
|
2
|
+
|
|
3
|
+
from lightning_sdk._mmt.base import _BaseMMT
|
|
4
|
+
from lightning_sdk._mmt.v1 import _MMTV1
|
|
5
|
+
from lightning_sdk._mmt.v2 import _MMTV2
|
|
6
|
+
from lightning_sdk.job.job import _has_jobs_v2
|
|
7
|
+
|
|
8
|
+
if TYPE_CHECKING:
|
|
9
|
+
from lightning_sdk.job import Job
|
|
10
|
+
from lightning_sdk.machine import Machine
|
|
11
|
+
from lightning_sdk.organization import Organization
|
|
12
|
+
from lightning_sdk.status import Status
|
|
13
|
+
from lightning_sdk.studio import Studio
|
|
14
|
+
from lightning_sdk.teamspace import Teamspace
|
|
15
|
+
from lightning_sdk.user import User
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
class MMT(_BaseMMT):
    """User-facing multi-machine training job.

    Wraps a version-specific implementation (``_MMTV2`` when ``_has_jobs_v2()`` is true, ``_MMTV1``
    otherwise) in ``_internal_mmt`` and forwards its operations and attribute lookups to it.
    """

    def __init__(
        self,
        name: str,
        teamspace: Union[str, "Teamspace", None] = None,
        org: Union[str, "Organization", None] = None,
        user: Union[str, "User", None] = None,
        *,
        _fetch_job: bool = True,
    ) -> None:
        """Create the wrapped implementation; all arguments are passed through to it unchanged."""
        internal_mmt_cls = _MMTV2 if _has_jobs_v2() else _MMTV1

        self._internal_mmt = internal_mmt_cls(
            name=name,
            teamspace=teamspace,
            org=org,
            user=user,
            _fetch_job=_fetch_job,
        )

    @classmethod
    def run(
        cls,
        name: str,
        num_machines: int,
        machine: "Machine",
        command: Optional[str] = None,
        studio: Union["Studio", str, None] = None,
        image: Union[str, None] = None,
        teamspace: Union[str, "Teamspace", None] = None,
        org: Union[str, "Organization", None] = None,
        user: Union[str, "User", None] = None,
        cluster: Optional[str] = None,
        env: Optional[Dict[str, str]] = None,
        interruptible: bool = False,
        image_credentials: Optional[str] = None,
        cluster_auth: bool = False,
        artifacts_local: Optional[str] = None,
        artifacts_remote: Optional[str] = None,
    ) -> "MMT":
        """Run a multi-machine job by delegating to the parent class' ``run`` with identical arguments.

        Returns:
            The result of the parent ``run``, narrowed to this class.
        """
        ret_val = super().run(
            name=name,
            num_machines=num_machines,
            machine=machine,
            command=command,
            studio=studio,
            image=image,
            teamspace=teamspace,
            org=org,
            user=user,
            cluster=cluster,
            env=env,
            interruptible=interruptible,
            image_credentials=image_credentials,
            cluster_auth=cluster_auth,
            artifacts_local=artifacts_local,
            artifacts_remote=artifacts_remote,
        )
        # required for typing with "Job"
        assert isinstance(ret_val, cls)
        return ret_val

    def _submit(
        self,
        num_machines: int,
        machine: "Machine",
        command: Optional[str] = None,
        studio: Optional["Studio"] = None,
        image: Optional[str] = None,
        env: Optional[Dict[str, str]] = None,
        interruptible: bool = False,
        cluster: Optional[str] = None,
        image_credentials: Optional[str] = None,
        cluster_auth: bool = False,
        artifacts_local: Optional[str] = None,
        artifacts_remote: Optional[str] = None,
    ) -> "MMT":
        """Submit through the wrapped implementation, store its return value in ``_job``, return self."""
        self._job = self._internal_mmt._submit(
            num_machines=num_machines,
            machine=machine,
            cluster=cluster,
            command=command,
            studio=studio,
            image=image,
            env=env,
            interruptible=interruptible,
            image_credentials=image_credentials,
            cluster_auth=cluster_auth,
            artifacts_local=artifacts_local,
            artifacts_remote=artifacts_remote,
        )
        return self

    def stop(self) -> None:
        """Forward to the wrapped implementation's ``stop``."""
        return self._internal_mmt.stop()

    def delete(self) -> None:
        """Forward to the wrapped implementation's ``delete``."""
        return self._internal_mmt.delete()

    @property
    def status(self) -> "Status":
        """Return the wrapped implementation's ``status``."""
        return self._internal_mmt.status

    @property
    def machines(self) -> Tuple["Job", ...]:
        """Return the wrapped implementation's ``machines``."""
        return self._internal_mmt.machines

    @property
    def machine(self) -> "Machine":
        """Return the wrapped implementation's ``machine``."""
        return self._internal_mmt.machine

    @property
    def artifact_path(self) -> Optional[str]:
        """Return the wrapped implementation's ``artifact_path``."""
        return self._internal_mmt.artifact_path

    @property
    def snapshot_path(self) -> Optional[str]:
        """Return the wrapped implementation's ``snapshot_path``."""
        return self._internal_mmt.snapshot_path

    @property
    def share_path(self) -> Optional[str]:
        """Return None; no share path is provided."""
        return None

    def _update_internal_job(self) -> None:
        """Forward to the wrapped implementation's ``_update_internal_job``."""
        return self._internal_mmt._update_internal_job()

    @property
    def name(self) -> str:
        """Return the wrapped implementation's ``name``."""
        return self._internal_mmt.name

    @property
    def teamspace(self) -> "Teamspace":
        """Return the wrapped implementation's ``_teamspace``."""
        return self._internal_mmt._teamspace

    @property
    def cluster(self) -> Optional[str]:
        """Return the wrapped implementation's ``cluster``."""
        return self._internal_mmt.cluster

    def __getattr__(self, key: str) -> Any:
        """Forward the attribute lookup to the internal job implementation.

        Raises:
            AttributeError: If neither the parent class nor the wrapped implementation has ``key``,
                or if ``key`` is ``_internal_mmt`` itself and it has not been set.
        """
        # __getattr__ only runs once regular lookup has failed. If the wrapped instance itself is
        # missing (e.g. the object was created without running __init__), the fallback below would
        # look up self._internal_mmt, re-enter this method and recurse without bound.
        if key == "_internal_mmt":
            raise AttributeError(key)
        try:
            return getattr(super(), key)
        except AttributeError:
            return getattr(self._internal_mmt, key)
|
lightning_sdk/_mmt/v1.py
ADDED
|
@@ -0,0 +1,69 @@
|
|
|
1
|
+
from typing import TYPE_CHECKING, Any, Dict, Optional, Tuple
|
|
2
|
+
|
|
3
|
+
if TYPE_CHECKING:
|
|
4
|
+
from lightning_sdk.job.job import Job
|
|
5
|
+
from lightning_sdk.machine import Machine
|
|
6
|
+
from lightning_sdk.status import Status
|
|
7
|
+
from lightning_sdk.studio import Studio
|
|
8
|
+
from lightning_sdk.teamspace import Teamspace
|
|
9
|
+
|
|
10
|
+
from lightning_sdk._mmt.base import _BaseMMT
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
class _MMTV1(_BaseMMT):
    """Placeholder V1 implementation: every operation raises ``NotImplementedError``.

    Construction itself raises, so only ``name`` and ``teamspace`` have real bodies.
    """

    def __init__(self, *args: Any, **kwargs: Any) -> None:
        """Always raises ``NotImplementedError``; the arguments are ignored."""
        raise NotImplementedError

    # --- operations -------------------------------------------------------------------------------

    def _submit(
        self,
        num_machines: int,
        machine: "Machine",
        command: Optional[str] = None,
        studio: Optional["Studio"] = None,
        image: Optional[str] = None,
        env: Optional[Dict[str, str]] = None,
        interruptible: bool = False,
        cluster: Optional[str] = None,
        image_credentials: Optional[str] = None,
        cluster_auth: bool = False,
        artifacts_local: Optional[str] = None,
        artifacts_remote: Optional[str] = None,
    ) -> None:
        """Not implemented."""
        raise NotImplementedError

    def stop(self) -> None:
        """Not implemented."""
        raise NotImplementedError

    def delete(self) -> None:
        """Not implemented."""
        raise NotImplementedError

    def _update_internal_job(self) -> None:
        """Not implemented."""
        raise NotImplementedError

    # --- read-only properties ---------------------------------------------------------------------

    @property
    def machines(self) -> Tuple["Job", ...]:
        """Not implemented."""
        raise NotImplementedError

    @property
    def machine(self) -> "Machine":
        """Not implemented."""
        raise NotImplementedError

    @property
    def status(self) -> "Status":
        """Not implemented."""
        raise NotImplementedError

    @property
    def artifact_path(self) -> Optional[str]:
        """Not implemented."""
        raise NotImplementedError

    @property
    def snapshot_path(self) -> Optional[str]:
        """Not implemented."""
        raise NotImplementedError

    @property
    def name(self) -> str:
        """Return the stored job name (``_name``)."""
        return self._name

    @property
    def teamspace(self) -> "Teamspace":
        """Return the stored teamspace (``_teamspace``)."""
        return self._teamspace
|
lightning_sdk/_mmt/v2.py
ADDED
|
@@ -0,0 +1,141 @@
|
|
|
1
|
+
from typing import TYPE_CHECKING, Any, Dict, Optional, Tuple, Union
|
|
2
|
+
|
|
3
|
+
from lightning_sdk.api.mmt_api import MMTApi
|
|
4
|
+
|
|
5
|
+
if TYPE_CHECKING:
|
|
6
|
+
from lightning_sdk.job.job import Job
|
|
7
|
+
from lightning_sdk.machine import Machine
|
|
8
|
+
from lightning_sdk.organization import Organization
|
|
9
|
+
from lightning_sdk.status import Status
|
|
10
|
+
from lightning_sdk.studio import Studio
|
|
11
|
+
from lightning_sdk.teamspace import Teamspace
|
|
12
|
+
from lightning_sdk.user import User
|
|
13
|
+
|
|
14
|
+
from lightning_sdk._mmt.base import _BaseMMT
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
class _MMTV2(_BaseMMT):
    """V2 multi-machine training job that talks to the backend through ``MMTApi``."""

    def __init__(
        self,
        name: str,
        teamspace: Union[str, "Teamspace", None] = None,
        org: Union[str, "Organization", None] = None,
        user: Union[str, "User", None] = None,
        *,
        _fetch_job: bool = True,
    ) -> None:
        """Create the API client, then initialise the parent class with the given arguments."""
        # the client is assigned before the parent initialiser runs
        self._job_api = MMTApi()
        super().__init__(name=name, teamspace=teamspace, org=org, user=user, _fetch_job=_fetch_job)

    def _submit(
        self,
        num_machines: int,
        machine: "Machine",
        command: Optional[str] = None,
        studio: Optional["Studio"] = None,
        image: Optional[str] = None,
        env: Optional[Dict[str, str]] = None,
        interruptible: bool = False,
        cluster: Optional[str] = None,
        image_credentials: Optional[str] = None,
        cluster_auth: bool = False,
        artifacts_local: Optional[str] = None,
        artifacts_remote: Optional[str] = None,
    ) -> "_MMTV2":
        """Submit the job via the API and record the returned job and its name.

        Rules enforced before submitting:
          * exactly one of ``studio`` and ``image`` has to be provided;
          * ``command`` is mandatory with a studio and optional with an image.

        Returns:
            This instance, with ``_job`` and ``_name`` updated from the API response.

        Raises:
            ValueError: If one of the rules above is violated.
        """
        if studio is None:
            if image is None:
                raise ValueError("either image or studio must be provided")
            studio_id = None
        else:
            studio_id = studio._studio.id
            if image is not None:
                raise ValueError(
                    "image and studio are mutually exclusive as both define the environment to run the job in"
                )
            if command is None:
                raise ValueError("command is required when using a studio")

        job = self._job_api.submit_job(
            name=self.name,
            num_machines=num_machines,
            command=command,
            cluster_id=cluster,
            teamspace_id=self._teamspace.id,
            studio_id=studio_id,
            image=image,
            machine=machine,
            interruptible=interruptible,
            env=env,
            image_credentials=image_credentials,
            cluster_auth=cluster_auth,
            artifacts_local=artifacts_local,
            artifacts_remote=artifacts_remote,
        )
        self._job = job
        self._name = job.name
        return self

    @property
    def machines(self) -> Tuple["Job", ...]:
        """Not implemented."""
        raise NotImplementedError

    def stop(self) -> None:
        """Ask the API to stop this job."""
        self._job_api.stop_job(
            job_id=self._guaranteed_job.id,
            teamspace_id=self._teamspace.id,
        )

    def delete(self) -> None:
        """Ask the API to delete this job."""
        self._job_api.delete_job(job_id=self._guaranteed_job.id, teamspace_id=self._teamspace.id)

    @property
    def _latest_job(self) -> Any:
        """Refresh the job through ``_update_internal_job`` and return it."""
        self._update_internal_job()
        return self._job

    @property
    def _guaranteed_job(self) -> Any:
        """Return the job, fetching it first only if it has never been fetched.

        The returned job is not necessarily the latest version; use ``_latest_job`` for that.
        """
        job = getattr(self, "_job", None)
        if job is None:
            self._update_internal_job()
            job = self._job
        return job

    @property
    def status(self) -> "Status":
        """Return the latest job's ``desired_state`` converted by the API client."""
        # TODO: Should this rather be a list of states from the individual machines?
        to_external = self._job_api._job_state_to_external
        return to_external(self._latest_job.desired_state)

    @property
    def artifact_path(self) -> Optional[str]:
        """Not implemented."""
        raise NotImplementedError

    @property
    def snapshot_path(self) -> Optional[str]:
        """Not implemented."""
        raise NotImplementedError

    @property
    def machine(self) -> "Machine":
        """Return the machine the API client derives from the job's ``spec``."""
        return self._job_api._get_job_machine_from_spec(self._guaranteed_job.spec)

    def _update_internal_job(self) -> None:
        """Fetch the job by name the first time and by id on every later call."""
        if getattr(self, "_job", None) is not None:
            self._job = self._job_api.get_job(job_id=self._job.id, teamspace_id=self._teamspace.id)
        else:
            self._job = self._job_api.get_job_by_name(name=self._name, teamspace_id=self._teamspace.id)

    @property
    def name(self) -> str:
        """Return the stored job name (``_name``)."""
        return self._name

    @property
    def teamspace(self) -> "Teamspace":
        """Return the stored teamspace (``_teamspace``)."""
        return self._teamspace
|