lightning-sdk 0.1.41__py3-none-any.whl → 0.1.43__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- lightning_sdk/__init__.py +1 -1
- lightning_sdk/ai_hub.py +8 -3
- lightning_sdk/api/ai_hub_api.py +3 -3
- lightning_sdk/api/deployment_api.py +6 -6
- lightning_sdk/api/job_api.py +32 -6
- lightning_sdk/api/mmt_api.py +59 -19
- lightning_sdk/api/studio_api.py +37 -19
- lightning_sdk/api/teamspace_api.py +34 -29
- lightning_sdk/api/utils.py +46 -34
- lightning_sdk/cli/ai_hub.py +3 -3
- lightning_sdk/cli/entrypoint.py +3 -1
- lightning_sdk/cli/run.py +122 -12
- lightning_sdk/cli/serve.py +218 -0
- lightning_sdk/deployment/deployment.py +18 -12
- lightning_sdk/job/base.py +118 -24
- lightning_sdk/job/job.py +98 -9
- lightning_sdk/job/v1.py +75 -18
- lightning_sdk/job/v2.py +51 -15
- lightning_sdk/job/work.py +36 -7
- lightning_sdk/lightning_cloud/openapi/__init__.py +12 -0
- lightning_sdk/lightning_cloud/openapi/api/jobs_service_api.py +215 -5
- lightning_sdk/lightning_cloud/openapi/api/lit_logger_service_api.py +218 -0
- lightning_sdk/lightning_cloud/openapi/api/models_store_api.py +226 -0
- lightning_sdk/lightning_cloud/openapi/api/snowflake_service_api.py +21 -1
- lightning_sdk/lightning_cloud/openapi/models/__init__.py +12 -0
- lightning_sdk/lightning_cloud/openapi/models/deploymenttemplates_id_body.py +27 -1
- lightning_sdk/lightning_cloud/openapi/models/id_visibility_body.py +123 -0
- lightning_sdk/lightning_cloud/openapi/models/model_id_versions_body.py +29 -3
- lightning_sdk/lightning_cloud/openapi/models/project_id_multimachinejobs_body.py +27 -1
- lightning_sdk/lightning_cloud/openapi/models/project_id_snowflake_body.py +15 -67
- lightning_sdk/lightning_cloud/openapi/models/query_query_id_body.py +17 -69
- lightning_sdk/lightning_cloud/openapi/models/snowflake_export_body.py +29 -81
- lightning_sdk/lightning_cloud/openapi/models/snowflake_query_body.py +17 -69
- lightning_sdk/lightning_cloud/openapi/models/v1_get_model_file_url_response.py +27 -1
- lightning_sdk/lightning_cloud/openapi/models/v1_get_model_files_response.py +17 -17
- lightning_sdk/lightning_cloud/openapi/models/v1_get_model_files_url_response.py +149 -0
- lightning_sdk/lightning_cloud/openapi/models/v1_get_project_balance_response.py +27 -1
- lightning_sdk/lightning_cloud/openapi/models/v1_list_multi_machine_job_events_response.py +123 -0
- lightning_sdk/lightning_cloud/openapi/models/v1_metrics_stream.py +27 -1
- lightning_sdk/lightning_cloud/openapi/models/v1_model_file.py +175 -0
- lightning_sdk/lightning_cloud/openapi/models/v1_multi_machine_job.py +27 -1
- lightning_sdk/lightning_cloud/openapi/models/v1_multi_machine_job_event.py +331 -0
- lightning_sdk/lightning_cloud/openapi/models/v1_multi_machine_job_event_type.py +104 -0
- lightning_sdk/lightning_cloud/openapi/models/v1_multi_machine_job_fault_tolerance.py +149 -0
- lightning_sdk/lightning_cloud/openapi/models/v1_multi_machine_job_fault_tolerance_strategy.py +105 -0
- lightning_sdk/lightning_cloud/openapi/models/v1_multi_machine_job_status.py +27 -1
- lightning_sdk/lightning_cloud/openapi/models/v1_rule_resource.py +1 -0
- lightning_sdk/lightning_cloud/openapi/models/v1_snowflake_data_connection.py +29 -81
- lightning_sdk/lightning_cloud/openapi/models/v1_system_metrics.py +29 -3
- lightning_sdk/lightning_cloud/openapi/models/v1_trainium_system_metrics.py +175 -0
- lightning_sdk/lightning_cloud/openapi/models/v1_update_metrics_stream_visibility_response.py +97 -0
- lightning_sdk/lightning_cloud/openapi/models/v1_user_features.py +27 -53
- lightning_sdk/lightning_cloud/openapi/models/v1_validate_deployment_image_request.py +149 -0
- lightning_sdk/lightning_cloud/openapi/models/v1_validate_deployment_image_response.py +97 -0
- lightning_sdk/lightning_cloud/rest_client.py +2 -0
- lightning_sdk/mmt/__init__.py +4 -0
- lightning_sdk/mmt/base.py +278 -0
- lightning_sdk/mmt/mmt.py +267 -0
- lightning_sdk/mmt/v1.py +181 -0
- lightning_sdk/mmt/v2.py +188 -0
- lightning_sdk/plugin.py +43 -16
- lightning_sdk/services/file_endpoint.py +11 -5
- lightning_sdk/studio.py +16 -9
- lightning_sdk/teamspace.py +21 -8
- lightning_sdk/utils/resolve.py +18 -0
- {lightning_sdk-0.1.41.dist-info → lightning_sdk-0.1.43.dist-info}/METADATA +4 -1
- {lightning_sdk-0.1.41.dist-info → lightning_sdk-0.1.43.dist-info}/RECORD +71 -59
- {lightning_sdk-0.1.41.dist-info → lightning_sdk-0.1.43.dist-info}/entry_points.txt +0 -1
- lightning_sdk/_mmt/__init__.py +0 -3
- lightning_sdk/_mmt/base.py +0 -180
- lightning_sdk/_mmt/mmt.py +0 -161
- lightning_sdk/_mmt/v1.py +0 -69
- lightning_sdk/_mmt/v2.py +0 -141
- lightning_sdk/cli/mmt.py +0 -137
- {lightning_sdk-0.1.41.dist-info → lightning_sdk-0.1.43.dist-info}/LICENSE +0 -0
- {lightning_sdk-0.1.41.dist-info → lightning_sdk-0.1.43.dist-info}/WHEEL +0 -0
- {lightning_sdk-0.1.41.dist-info → lightning_sdk-0.1.43.dist-info}/top_level.txt +0 -0
lightning_sdk/_mmt/mmt.py
DELETED
|
@@ -1,161 +0,0 @@
|
|
|
1
|
-
from typing import TYPE_CHECKING, Any, Dict, Optional, Tuple, Union
|
|
2
|
-
|
|
3
|
-
from lightning_sdk._mmt.base import _BaseMMT
|
|
4
|
-
from lightning_sdk._mmt.v1 import _MMTV1
|
|
5
|
-
from lightning_sdk._mmt.v2 import _MMTV2
|
|
6
|
-
from lightning_sdk.job.job import _has_jobs_v2
|
|
7
|
-
|
|
8
|
-
if TYPE_CHECKING:
|
|
9
|
-
from lightning_sdk.job import Job
|
|
10
|
-
from lightning_sdk.machine import Machine
|
|
11
|
-
from lightning_sdk.organization import Organization
|
|
12
|
-
from lightning_sdk.status import Status
|
|
13
|
-
from lightning_sdk.studio import Studio
|
|
14
|
-
from lightning_sdk.teamspace import Teamspace
|
|
15
|
-
from lightning_sdk.user import User
|
|
16
|
-
|
|
17
|
-
|
|
18
|
-
class MMT(_BaseMMT):
    """Public handle for a multi-machine training (MMT) job.

    This class is a thin facade: on construction it selects a backend implementation
    (``_MMTV2`` when ``_has_jobs_v2()`` is true, ``_MMTV1`` otherwise) and forwards
    every operation to it.
    """

    def __init__(
        self,
        name: str,
        teamspace: Union[str, "Teamspace", None] = None,
        org: Union[str, "Organization", None] = None,
        user: Union[str, "User", None] = None,
        *,
        _fetch_job: bool = True,
    ) -> None:
        """Create the handle and the backend implementation it delegates to.

        Args:
            name: The name of the job.
            teamspace: The teamspace (name or object) the job is associated with.
            org: The organization (name or object) owning the teamspace, if any.
            user: The user (name or object) owning the teamspace, if any.
            _fetch_job: Forwarded unchanged to the backend implementation.
        """
        internal_mmt_cls = _MMTV2 if _has_jobs_v2() else _MMTV1

        self._internal_mmt = internal_mmt_cls(
            name=name,
            teamspace=teamspace,
            org=org,
            user=user,
            _fetch_job=_fetch_job,
        )

    @classmethod
    def run(
        cls,
        name: str,
        num_machines: int,
        machine: "Machine",
        command: Optional[str] = None,
        studio: Union["Studio", str, None] = None,
        image: Union[str, None] = None,
        teamspace: Union[str, "Teamspace", None] = None,
        org: Union[str, "Organization", None] = None,
        user: Union[str, "User", None] = None,
        cluster: Optional[str] = None,
        env: Optional[Dict[str, str]] = None,
        interruptible: bool = False,
        image_credentials: Optional[str] = None,
        cluster_auth: bool = False,
        artifacts_local: Optional[str] = None,
        artifacts_remote: Optional[str] = None,
    ) -> "MMT":
        """Submit a new multi-machine job through the base-class ``run`` and return its handle.

        All arguments are passed through unchanged to ``_BaseMMT.run``.

        Args:
            name: The name of the job.
            num_machines: The number of machines to run on.
            machine: The machine type to run the job on.
            command: The command to run inside the job.
            studio: The studio env to run the job with.
            image: The docker image to run the job with.
            teamspace: The teamspace the job should be associated with.
            org: The organization owning the teamspace (if any).
            user: The user owning the teamspace (if any).
            cluster: The cluster to run the job on.
            env: Environment variables to set inside the job.
            interruptible: Whether the job should run on interruptible instances.
            image_credentials: The credentials used to pull the image.
            cluster_auth: Whether to authenticate with the cluster to pull the image.
            artifacts_local: The path inside the docker container to persist artifacts from.
            artifacts_remote: The remote storage to persist the artifacts to.

        Returns:
            The value returned by the base-class ``run``, asserted to be an instance of ``cls``.
        """
        ret_val = super().run(
            name=name,
            num_machines=num_machines,
            machine=machine,
            command=command,
            studio=studio,
            image=image,
            teamspace=teamspace,
            org=org,
            user=user,
            cluster=cluster,
            env=env,
            interruptible=interruptible,
            image_credentials=image_credentials,
            cluster_auth=cluster_auth,
            artifacts_local=artifacts_local,
            artifacts_remote=artifacts_remote,
        )
        # required for typing with "Job"
        assert isinstance(ret_val, cls)
        return ret_val

    def _submit(
        self,
        num_machines: int,
        machine: "Machine",
        command: Optional[str] = None,
        studio: Optional["Studio"] = None,
        image: Optional[str] = None,
        env: Optional[Dict[str, str]] = None,
        interruptible: bool = False,
        cluster: Optional[str] = None,
        image_credentials: Optional[str] = None,
        cluster_auth: bool = False,
        artifacts_local: Optional[str] = None,
        artifacts_remote: Optional[str] = None,
    ) -> "MMT":
        """Forward the submission to the backend and return this handle.

        Whatever the backend's ``_submit`` returns is stored on ``self._job``.
        """
        self._job = self._internal_mmt._submit(
            num_machines=num_machines,
            machine=machine,
            cluster=cluster,
            command=command,
            studio=studio,
            image=image,
            env=env,
            interruptible=interruptible,
            image_credentials=image_credentials,
            cluster_auth=cluster_auth,
            artifacts_local=artifacts_local,
            artifacts_remote=artifacts_remote,
        )
        return self

    def stop(self) -> None:
        """Stop the job via the backend implementation."""
        return self._internal_mmt.stop()

    def delete(self) -> None:
        """Delete the job via the backend implementation."""
        return self._internal_mmt.delete()

    @property
    def status(self) -> "Status":
        """The job status as reported by the backend implementation."""
        return self._internal_mmt.status

    @property
    def machines(self) -> Tuple["Job", ...]:
        """The backend's ``machines`` value."""
        return self._internal_mmt.machines

    @property
    def machine(self) -> "Machine":
        """The backend's ``machine`` value."""
        return self._internal_mmt.machine

    @property
    def artifact_path(self) -> Optional[str]:
        """The backend's ``artifact_path`` value."""
        return self._internal_mmt.artifact_path

    @property
    def snapshot_path(self) -> Optional[str]:
        """The backend's ``snapshot_path`` value."""
        return self._internal_mmt.snapshot_path

    @property
    def share_path(self) -> Optional[str]:
        """Always ``None`` for this class."""
        return None

    def _update_internal_job(self) -> None:
        """Ask the backend implementation to refresh its internal job object."""
        return self._internal_mmt._update_internal_job()

    @property
    def name(self) -> str:
        """The job name held by the backend implementation."""
        return self._internal_mmt.name

    @property
    def teamspace(self) -> "Teamspace":
        """The backend's ``_teamspace`` attribute."""
        return self._internal_mmt._teamspace

    @property
    def cluster(self) -> Optional[str]:
        """The backend's ``cluster`` value."""
        return self._internal_mmt.cluster

    def __getattr__(self, key: str) -> Any:
        """Forward the attribute lookup to the internal job implementation.

        Raises:
            AttributeError: If neither the parent class nor the backend provides ``key``,
                or if ``key`` is ``_internal_mmt`` itself and the backend was never set.
        """
        try:
            return getattr(super(), key)
        except AttributeError:
            if key == "_internal_mmt":
                # The backend is missing (e.g. its constructor raised before the assignment in
                # ``__init__``, or the instance was created without running ``__init__``).
                # Reading ``self._internal_mmt`` below would re-enter ``__getattr__`` and
                # recurse until ``RecursionError``, so fail with a regular ``AttributeError``.
                raise AttributeError(f"{type(self).__name__!r} object has no attribute {key!r}") from None
            return getattr(self._internal_mmt, key)
|
lightning_sdk/_mmt/v1.py
DELETED
|
@@ -1,69 +0,0 @@
|
|
|
1
|
-
from typing import TYPE_CHECKING, Any, Dict, Optional, Tuple
|
|
2
|
-
|
|
3
|
-
if TYPE_CHECKING:
|
|
4
|
-
from lightning_sdk.job.job import Job
|
|
5
|
-
from lightning_sdk.machine import Machine
|
|
6
|
-
from lightning_sdk.status import Status
|
|
7
|
-
from lightning_sdk.studio import Studio
|
|
8
|
-
from lightning_sdk.teamspace import Teamspace
|
|
9
|
-
|
|
10
|
-
from lightning_sdk._mmt.base import _BaseMMT
|
|
11
|
-
|
|
12
|
-
|
|
13
|
-
class _MMTV1(_BaseMMT):
    """Placeholder multi-machine backend for the V1 jobs system.

    Nothing is implemented: construction and every operation other than the
    ``name`` and ``teamspace`` accessors raise ``NotImplementedError``.
    """

    def __init__(self, *args: Any, **kwargs: Any) -> None:
        """Reject construction; this backend has no implementation."""
        raise NotImplementedError

    # ------------------------------------------------------------------ actions

    def _submit(
        self,
        num_machines: int,
        machine: "Machine",
        command: Optional[str] = None,
        studio: Optional["Studio"] = None,
        image: Optional[str] = None,
        env: Optional[Dict[str, str]] = None,
        interruptible: bool = False,
        cluster: Optional[str] = None,
        image_credentials: Optional[str] = None,
        cluster_auth: bool = False,
        artifacts_local: Optional[str] = None,
        artifacts_remote: Optional[str] = None,
    ) -> None:
        """Not implemented for this backend."""
        raise NotImplementedError

    def stop(self) -> None:
        """Not implemented for this backend."""
        raise NotImplementedError

    def delete(self) -> None:
        """Not implemented for this backend."""
        raise NotImplementedError

    def _update_internal_job(self) -> None:
        """Not implemented for this backend."""
        raise NotImplementedError

    # --------------------------------------------------------------- properties

    @property
    def machines(self) -> Tuple["Job", ...]:
        """Not implemented for this backend."""
        raise NotImplementedError

    @property
    def machine(self) -> "Machine":
        """Not implemented for this backend."""
        raise NotImplementedError

    @property
    def status(self) -> "Status":
        """Not implemented for this backend."""
        raise NotImplementedError

    @property
    def artifact_path(self) -> Optional[str]:
        """Not implemented for this backend."""
        raise NotImplementedError

    @property
    def snapshot_path(self) -> Optional[str]:
        """Not implemented for this backend."""
        raise NotImplementedError

    @property
    def name(self) -> str:
        """The value stored in ``self._name``."""
        return self._name

    @property
    def teamspace(self) -> "Teamspace":
        """The value stored in ``self._teamspace``."""
        return self._teamspace
|
lightning_sdk/_mmt/v2.py
DELETED
|
@@ -1,141 +0,0 @@
|
|
|
1
|
-
from typing import TYPE_CHECKING, Any, Dict, Optional, Tuple, Union
|
|
2
|
-
|
|
3
|
-
from lightning_sdk.api.mmt_api import MMTApi
|
|
4
|
-
|
|
5
|
-
if TYPE_CHECKING:
|
|
6
|
-
from lightning_sdk.job.job import Job
|
|
7
|
-
from lightning_sdk.machine import Machine
|
|
8
|
-
from lightning_sdk.organization import Organization
|
|
9
|
-
from lightning_sdk.status import Status
|
|
10
|
-
from lightning_sdk.studio import Studio
|
|
11
|
-
from lightning_sdk.teamspace import Teamspace
|
|
12
|
-
from lightning_sdk.user import User
|
|
13
|
-
|
|
14
|
-
from lightning_sdk._mmt.base import _BaseMMT
|
|
15
|
-
|
|
16
|
-
|
|
17
|
-
class _MMTV2(_BaseMMT):
    """Multi-machine backend that talks to the platform through ``MMTApi``."""

    def __init__(
        self,
        name: str,
        teamspace: Union[str, "Teamspace", None] = None,
        org: Union[str, "Organization", None] = None,
        user: Union[str, "User", None] = None,
        *,
        _fetch_job: bool = True,
    ) -> None:
        """Create the API client, then hand all arguments to the base-class constructor."""
        # The client is created before the base constructor runs.
        # NOTE(review): presumably because the base constructor may already use it - confirm.
        self._job_api = MMTApi()
        super().__init__(name=name, teamspace=teamspace, org=org, user=user, _fetch_job=_fetch_job)

    def _submit(
        self,
        num_machines: int,
        machine: "Machine",
        command: Optional[str] = None,
        studio: Optional["Studio"] = None,
        image: Optional[str] = None,
        env: Optional[Dict[str, str]] = None,
        interruptible: bool = False,
        cluster: Optional[str] = None,
        image_credentials: Optional[str] = None,
        cluster_auth: bool = False,
        artifacts_local: Optional[str] = None,
        artifacts_remote: Optional[str] = None,
    ) -> "_MMTV2":
        """Validate the compute environment and submit the job through the API.

        Rules enforced before submitting:
            * either ``image`` or ``studio`` must be provided,
            * ``image`` and ``studio`` are mutually exclusive,
            * ``command`` is required with a studio and optional with an image.

        Returns:
            This instance, with ``_job`` and ``_name`` taken from the submitted job.

        Raises:
            ValueError: If one of the rules above is violated.
        """
        studio_id = None
        if studio is None:
            if image is None:
                raise ValueError("either image or studio must be provided")
        else:
            studio_id = studio._studio.id
            if image is not None:
                raise ValueError(
                    "image and studio are mutually exclusive as both define the environment to run the job in"
                )
            if command is None:
                raise ValueError("command is required when using a studio")

        job = self._job_api.submit_job(
            name=self.name,
            num_machines=num_machines,
            command=command,
            cluster_id=cluster,
            teamspace_id=self._teamspace.id,
            studio_id=studio_id,
            image=image,
            machine=machine,
            interruptible=interruptible,
            env=env,
            image_credentials=image_credentials,
            cluster_auth=cluster_auth,
            artifacts_local=artifacts_local,
            artifacts_remote=artifacts_remote,
        )
        self._job = job
        self._name = job.name
        return self

    @property
    def machines(self) -> Tuple["Job", ...]:
        """Not implemented for this backend."""
        raise NotImplementedError

    def stop(self) -> None:
        """Stop the job through the API, fetching it first if it was never fetched."""
        self._job_api.stop_job(job_id=self._guaranteed_job.id, teamspace_id=self._teamspace.id)

    def delete(self) -> None:
        """Delete the job through the API, fetching it first if it was never fetched."""
        self._job_api.delete_job(
            job_id=self._guaranteed_job.id,
            teamspace_id=self._teamspace.id,
        )

    @property
    def _latest_job(self) -> Any:
        """Refresh the job from the API and return the refreshed object."""
        self._update_internal_job()
        return self._job

    @property
    def _guaranteed_job(self) -> Any:
        """Return the job, fetching it only if it has not been fetched before.

        The result may be stale; use ``_latest_job`` when the latest version is needed.
        """
        already_fetched = getattr(self, "_job", None) is not None
        if not already_fetched:
            self._update_internal_job()

        return self._job

    @property
    def status(self) -> "Status":
        """The external status derived from the refreshed job's ``desired_state``."""
        # TODO: Should this rather be a list of states from the individual machines?
        latest = self._latest_job
        return self._job_api._job_state_to_external(latest.desired_state)

    @property
    def artifact_path(self) -> Optional[str]:
        """Not implemented for this backend."""
        raise NotImplementedError

    @property
    def snapshot_path(self) -> Optional[str]:
        """Not implemented for this backend."""
        raise NotImplementedError

    @property
    def machine(self) -> "Machine":
        """The machine type derived from the (possibly cached) job's spec."""
        job = self._guaranteed_job
        return self._job_api._get_job_machine_from_spec(job.spec)

    def _update_internal_job(self) -> None:
        """Fetch the job by name the first time, and by id on every later call."""
        current = getattr(self, "_job", None)
        if current is not None:
            self._job = self._job_api.get_job(job_id=current.id, teamspace_id=self._teamspace.id)
        else:
            self._job = self._job_api.get_job_by_name(name=self._name, teamspace_id=self._teamspace.id)

    @property
    def name(self) -> str:
        """The value stored in ``self._name``."""
        return self._name

    @property
    def teamspace(self) -> "Teamspace":
        """The value stored in ``self._teamspace``."""
        return self._teamspace
|
lightning_sdk/cli/mmt.py
DELETED
|
@@ -1,137 +0,0 @@
|
|
|
1
|
-
from typing import Dict, Optional
|
|
2
|
-
|
|
3
|
-
from fire import Fire
|
|
4
|
-
|
|
5
|
-
from lightning_sdk._mmt import MMT
|
|
6
|
-
from lightning_sdk.api.studio_api import _cloud_url
|
|
7
|
-
from lightning_sdk.lightning_cloud.login import Auth
|
|
8
|
-
from lightning_sdk.machine import Machine
|
|
9
|
-
from lightning_sdk.teamspace import Teamspace
|
|
10
|
-
|
|
11
|
-
# Every ``Machine`` member's value; interpolated into the CLI help text.
_MACHINE_VALUES = tuple(member.value for member in Machine)
|
|
12
|
-
|
|
13
|
-
|
|
14
|
-
class MMTCLI:
    """Command line interface (CLI) to interact with/manage Lightning AI MMT."""

    def __init__(self) -> None:
        """Attach the generated help text to ``run``."""
        # Need to set the docstring here for f-strings to work.
        # Sadly this is the only way to really show options as f-strings are not allowed as docstrings directly
        # and fire does not show values for literals, just that it is a literal.
        docstr = f"""Run async workloads on multiple machines using a docker image.

        Args:
            name: The name of the job. Needs to be unique within the teamspace.
            num_machines: The number of Machines to run on. Defaults to 2 Machines
            machine: The machine type to run the job on. One of {", ".join(_MACHINE_VALUES)}. Defaults to CPU
            command: The command to run inside your job. Required if using a studio. Optional if using an image.
                If not provided for images, will run the container entrypoint and default command.
            studio: The studio env to run the job with. Mutually exclusive with image.
            image: The docker image to run the job with. Mutually exclusive with studio.
            teamspace: The teamspace the job should be associated with. Defaults to the current teamspace.
            org: The organization owning the teamspace (if any). Defaults to the current organization.
            user: The user owning the teamspace (if any). Defaults to the current user.
            cluster: The cluster to run the job on. Defaults to the studio cluster if running with studio compute env.
                If not provided will fall back to the teamspaces default cluster.
            env: Environment variables to set inside the job.
            interruptible: Whether the job should run on interruptible instances. They are cheaper but can be preempted.
            image_credentials: The credentials used to pull the image. Required if the image is private.
                This should be the name of the respective credentials secret created on the Lightning AI platform.
            cluster_auth: Whether to authenticate with the cluster to pull the image.
                Required if the registry is part of a cluster provider (e.g. ECR).
            artifacts_local: The path of inside the docker container, you want to persist images from.
                CAUTION: When setting this to "/", it will effectively erase your container.
                Only supported for jobs with a docker image compute environment.
            artifacts_remote: The remote storage to persist your artifacts to.
                Should be of format <CONNECTION_TYPE>:<CONNECTION_NAME>:<PATH_WITHIN_CONNECTION>.
                PATH_WITHIN_CONNECTION hereby is a path relative to the connection's root.
                E.g. efs:data:some-path would result in an EFS connection named `data` and to the path `some-path`
                within it.
                Note that the connection needs to be added to the teamspace already in order for it to be found.
                Only supported for jobs with a docker image compute environment.
        """
        # TODO: the docstrings from artifacts_local and artifacts_remote don't show up completely,
        #   might need to switch to explicit cli definition
        self.run.__func__.__doc__ = docstr

    def login(self) -> None:
        """Login to Lightning AI Studios."""
        auth = Auth()
        # Drop any previously stored credentials before authenticating again.
        auth.clear()

        try:
            auth.authenticate()
        except ConnectionError:
            raise RuntimeError(f"Unable to connect to {_cloud_url()}. Please check your internet connection.") from None

    def logout(self) -> None:
        """Logout from Lightning AI Studios."""
        auth = Auth()
        auth.clear()

    # TODO: sadly, fire displays both Optional[type] and Union[type, None] as Optional[Optional]
    # see https://github.com/google/python-fire/pull/513
    # might need to move to different cli library
    def run(
        self,
        name: Optional[str] = None,
        num_machines: int = 2,
        machine: Optional[str] = None,
        command: Optional[str] = None,
        studio: Optional[str] = None,
        image: Optional[str] = None,
        teamspace: Optional[str] = None,
        org: Optional[str] = None,
        user: Optional[str] = None,
        cluster: Optional[str] = None,
        env: Optional[Dict[str, str]] = None,
        interruptible: bool = False,
        image_credentials: Optional[str] = None,
        cluster_auth: bool = False,
        artifacts_local: Optional[str] = None,
        artifacts_remote: Optional[str] = None,
    ) -> None:
        # NOTE: no docstring literal here on purpose; ``__init__`` assigns the help text to ``run.__doc__``.

        # Fail fast: a missing image can never succeed, so reject it before the teamspace is
        # constructed and its default cluster is looked up.
        if image is None:
            raise RuntimeError("Currently only docker images are supported")

        if name is None:
            from datetime import datetime

            # Default to a timestamp-based name, e.g. ``mmt-2024-01-31-12-00-00``.
            timestr = datetime.now().strftime("%Y-%m-%d-%H-%M-%S")
            name = f"mmt-{timestr}"

        if machine is None:
            # TODO: infer from studio
            machine = "CPU"
        # The machine name is upper-cased before the ``Machine`` lookup.
        machine_enum = Machine(machine.upper())

        resolved_teamspace = Teamspace(name=teamspace, org=org, user=user)
        if cluster is None:
            cluster = resolved_teamspace.default_cluster

        MMT.run(
            name=name,
            num_machines=num_machines,
            machine=machine_enum,
            command=command,
            studio=studio,
            image=image,
            teamspace=resolved_teamspace,
            org=org,
            user=user,
            cluster=cluster,
            env=env,
            interruptible=interruptible,
            image_credentials=image_credentials,
            cluster_auth=cluster_auth,
            artifacts_local=artifacts_local,
            artifacts_remote=artifacts_remote,
        )
|
|
129
|
-
|
|
130
|
-
|
|
131
|
-
def main_cli() -> None:
    """Build the MMT command group and expose it through ``fire`` under the name ``_mmt``."""
    cli = MMTCLI()
    Fire(cli, name="_mmt")


# Allow running this module directly as a script.
if __name__ == "__main__":
    main_cli()
|
|
File without changes
|
|
File without changes
|
|
File without changes
|