lightning-sdk 0.1.41__py3-none-any.whl → 0.1.43__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (77) hide show
  1. lightning_sdk/__init__.py +1 -1
  2. lightning_sdk/ai_hub.py +8 -3
  3. lightning_sdk/api/ai_hub_api.py +3 -3
  4. lightning_sdk/api/deployment_api.py +6 -6
  5. lightning_sdk/api/job_api.py +32 -6
  6. lightning_sdk/api/mmt_api.py +59 -19
  7. lightning_sdk/api/studio_api.py +37 -19
  8. lightning_sdk/api/teamspace_api.py +34 -29
  9. lightning_sdk/api/utils.py +46 -34
  10. lightning_sdk/cli/ai_hub.py +3 -3
  11. lightning_sdk/cli/entrypoint.py +3 -1
  12. lightning_sdk/cli/run.py +122 -12
  13. lightning_sdk/cli/serve.py +218 -0
  14. lightning_sdk/deployment/deployment.py +18 -12
  15. lightning_sdk/job/base.py +118 -24
  16. lightning_sdk/job/job.py +98 -9
  17. lightning_sdk/job/v1.py +75 -18
  18. lightning_sdk/job/v2.py +51 -15
  19. lightning_sdk/job/work.py +36 -7
  20. lightning_sdk/lightning_cloud/openapi/__init__.py +12 -0
  21. lightning_sdk/lightning_cloud/openapi/api/jobs_service_api.py +215 -5
  22. lightning_sdk/lightning_cloud/openapi/api/lit_logger_service_api.py +218 -0
  23. lightning_sdk/lightning_cloud/openapi/api/models_store_api.py +226 -0
  24. lightning_sdk/lightning_cloud/openapi/api/snowflake_service_api.py +21 -1
  25. lightning_sdk/lightning_cloud/openapi/models/__init__.py +12 -0
  26. lightning_sdk/lightning_cloud/openapi/models/deploymenttemplates_id_body.py +27 -1
  27. lightning_sdk/lightning_cloud/openapi/models/id_visibility_body.py +123 -0
  28. lightning_sdk/lightning_cloud/openapi/models/model_id_versions_body.py +29 -3
  29. lightning_sdk/lightning_cloud/openapi/models/project_id_multimachinejobs_body.py +27 -1
  30. lightning_sdk/lightning_cloud/openapi/models/project_id_snowflake_body.py +15 -67
  31. lightning_sdk/lightning_cloud/openapi/models/query_query_id_body.py +17 -69
  32. lightning_sdk/lightning_cloud/openapi/models/snowflake_export_body.py +29 -81
  33. lightning_sdk/lightning_cloud/openapi/models/snowflake_query_body.py +17 -69
  34. lightning_sdk/lightning_cloud/openapi/models/v1_get_model_file_url_response.py +27 -1
  35. lightning_sdk/lightning_cloud/openapi/models/v1_get_model_files_response.py +17 -17
  36. lightning_sdk/lightning_cloud/openapi/models/v1_get_model_files_url_response.py +149 -0
  37. lightning_sdk/lightning_cloud/openapi/models/v1_get_project_balance_response.py +27 -1
  38. lightning_sdk/lightning_cloud/openapi/models/v1_list_multi_machine_job_events_response.py +123 -0
  39. lightning_sdk/lightning_cloud/openapi/models/v1_metrics_stream.py +27 -1
  40. lightning_sdk/lightning_cloud/openapi/models/v1_model_file.py +175 -0
  41. lightning_sdk/lightning_cloud/openapi/models/v1_multi_machine_job.py +27 -1
  42. lightning_sdk/lightning_cloud/openapi/models/v1_multi_machine_job_event.py +331 -0
  43. lightning_sdk/lightning_cloud/openapi/models/v1_multi_machine_job_event_type.py +104 -0
  44. lightning_sdk/lightning_cloud/openapi/models/v1_multi_machine_job_fault_tolerance.py +149 -0
  45. lightning_sdk/lightning_cloud/openapi/models/v1_multi_machine_job_fault_tolerance_strategy.py +105 -0
  46. lightning_sdk/lightning_cloud/openapi/models/v1_multi_machine_job_status.py +27 -1
  47. lightning_sdk/lightning_cloud/openapi/models/v1_rule_resource.py +1 -0
  48. lightning_sdk/lightning_cloud/openapi/models/v1_snowflake_data_connection.py +29 -81
  49. lightning_sdk/lightning_cloud/openapi/models/v1_system_metrics.py +29 -3
  50. lightning_sdk/lightning_cloud/openapi/models/v1_trainium_system_metrics.py +175 -0
  51. lightning_sdk/lightning_cloud/openapi/models/v1_update_metrics_stream_visibility_response.py +97 -0
  52. lightning_sdk/lightning_cloud/openapi/models/v1_user_features.py +27 -53
  53. lightning_sdk/lightning_cloud/openapi/models/v1_validate_deployment_image_request.py +149 -0
  54. lightning_sdk/lightning_cloud/openapi/models/v1_validate_deployment_image_response.py +97 -0
  55. lightning_sdk/lightning_cloud/rest_client.py +2 -0
  56. lightning_sdk/mmt/__init__.py +4 -0
  57. lightning_sdk/mmt/base.py +278 -0
  58. lightning_sdk/mmt/mmt.py +267 -0
  59. lightning_sdk/mmt/v1.py +181 -0
  60. lightning_sdk/mmt/v2.py +188 -0
  61. lightning_sdk/plugin.py +43 -16
  62. lightning_sdk/services/file_endpoint.py +11 -5
  63. lightning_sdk/studio.py +16 -9
  64. lightning_sdk/teamspace.py +21 -8
  65. lightning_sdk/utils/resolve.py +18 -0
  66. {lightning_sdk-0.1.41.dist-info → lightning_sdk-0.1.43.dist-info}/METADATA +4 -1
  67. {lightning_sdk-0.1.41.dist-info → lightning_sdk-0.1.43.dist-info}/RECORD +71 -59
  68. {lightning_sdk-0.1.41.dist-info → lightning_sdk-0.1.43.dist-info}/entry_points.txt +0 -1
  69. lightning_sdk/_mmt/__init__.py +0 -3
  70. lightning_sdk/_mmt/base.py +0 -180
  71. lightning_sdk/_mmt/mmt.py +0 -161
  72. lightning_sdk/_mmt/v1.py +0 -69
  73. lightning_sdk/_mmt/v2.py +0 -141
  74. lightning_sdk/cli/mmt.py +0 -137
  75. {lightning_sdk-0.1.41.dist-info → lightning_sdk-0.1.43.dist-info}/LICENSE +0 -0
  76. {lightning_sdk-0.1.41.dist-info → lightning_sdk-0.1.43.dist-info}/WHEEL +0 -0
  77. {lightning_sdk-0.1.41.dist-info → lightning_sdk-0.1.43.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,181 @@
1
from typing import TYPE_CHECKING, Dict, Optional, Tuple, Union

from lightning_sdk.api.mmt_api import MMTApiV1
from lightning_sdk.job.v1 import _internal_status_to_external_status
from lightning_sdk.job.work import Work

if TYPE_CHECKING:
    from lightning_sdk.machine import Machine
    from lightning_sdk.organization import Organization
    from lightning_sdk.status import Status
    from lightning_sdk.studio import Studio
    from lightning_sdk.teamspace import Teamspace
    from lightning_sdk.user import User

from lightning_sdk.mmt.base import _BaseMMT


class _MMTV1(_BaseMMT):
    """V1 Implementation of Multi-Machine Training."""

    def __init__(
        self,
        name: str,
        teamspace: Union[str, "Teamspace", None] = None,
        org: Union[str, "Organization", None] = None,
        user: Union[str, "User", None] = None,
        *,
        _fetch_job: bool = True,
    ) -> None:
        """Fetch already existing jobs.

        Args:
            name: the name of the job
            teamspace: the teamspace the job is part of
            org: the name of the organization owning the :param`teamspace` in case it is owned by an org
            user: the name of the user owning the :param`teamspace`
                in case it is owned directly by a user instead of an org.
        """
        self._job_api = MMTApiV1()
        super().__init__(name=name, teamspace=teamspace, org=org, user=user, _fetch_job=_fetch_job)

    def _submit(
        self,
        num_machines: int,
        machine: "Machine",
        command: Optional[str] = None,
        studio: Optional["Studio"] = None,
        image: Optional[str] = None,
        env: Optional[Dict[str, str]] = None,
        interruptible: bool = False,
        cloud_account: Optional[str] = None,
        image_credentials: Optional[str] = None,
        cloud_account_auth: bool = False,
        artifacts_local: Optional[str] = None,
        artifacts_remote: Optional[str] = None,
    ) -> "_MMTV1":
        """Submit a new multi-machine job to the Lightning AI platform.

        Args:
            num_machines: The number of machines to run on.
            machine: The machine type to run the job on.
            command: The command to run inside your job. Required if using a studio. Optional if using an image.
                If not provided for images, will run the container entrypoint and default command.
            studio: The studio env to run the job with. Mutually exclusive with image.
            image: The docker image to run the job with. Mutually exclusive with studio.
            env: Environment variables to set inside the job.
            interruptible: Whether the job should run on interruptible instances. They are cheaper but can be preempted.
            cloud_account: The cloud account to run the job on.
                Defaults to the studio cloud account if running with studio compute env.
                If not provided will fall back to the teamspaces default cloud account.
            image_credentials: The credentials used to pull the image. Required if the image is private.
                This should be the name of the respective credentials secret created on the Lightning AI platform.
            cloud_account_auth: Whether to authenticate with the cloud account to pull the image.
                Required if the registry is part of a cloud provider (e.g. ECR).
            artifacts_local: The path inside the docker container you want to persist artifacts from.
                CAUTION: When setting this to "/", it will effectively erase your container.
                Only supported for jobs with a docker image compute environment.
            artifacts_remote: The remote storage to persist your artifacts to.
                Should be of format <CONNECTION_TYPE>:<CONNECTION_NAME>:<PATH_WITHIN_CONNECTION>.
                PATH_WITHIN_CONNECTION hereby is a path relative to the connection's root.
                E.g. efs:data:some-path would result in an EFS connection named `data` and to the path `some-path`
                within it.
                Note that the connection needs to be added to the teamspace already in order for it to be found.
                Only supported for jobs with a docker image compute environment.

        Raises:
            ValueError: if an option unsupported by the V1 backend (image, env, artifact
                persistence) is given, or a required option (studio, command) is missing.
        """
        # The V1 backend only supports studio-based submissions; reject every
        # image-/artifact-related option explicitly so the caller gets a clear error.
        if studio is None:
            raise ValueError("Studio is required for submitting jobs")
        if image is not None or image_credentials is not None or cloud_account_auth:
            raise ValueError("Image is not supported for submitting jobs")

        if artifacts_local is not None or artifacts_remote is not None:
            raise ValueError("Specifying how to persist artifacts is not yet supported with jobs")

        if env is not None:
            raise ValueError("Environment variables are not supported for submitting jobs")
        if command is None:
            raise ValueError("Command is required for submitting multi-machine jobs")

        _submitted = self._job_api.submit_job(
            name=self._name,
            num_machines=num_machines,
            command=command,
            studio_id=studio._studio.id,
            teamspace_id=self._teamspace.id,
            cloud_account=cloud_account or "",
            machine=machine,
            interruptible=interruptible,
            strategy="parallel",
        )

        # Keep the name returned by the backend (it may differ from the requested one).
        self._name = _submitted.name
        self._job = _submitted
        return self

    def _update_internal_job(self) -> None:
        try:
            self._job = self._job_api.get_job(self._name, self.teamspace.id)
        except ValueError as e:
            raise ValueError(f"Job {self._name} does not exist in Teamspace {self.teamspace.name}") from e

    @property
    def machines(self) -> Tuple["Work", ...]:
        """Returns the sub-jobs for each individual instance."""
        works = self._job_api.list_works(self._guaranteed_job.id, self.teamspace.id)

        return tuple(Work(w.id, self, self.teamspace) for w in works)

    def stop(self) -> None:
        """Stops the job."""
        self._job_api.stop_job(self._guaranteed_job.id, self.teamspace.id)

    def delete(self) -> None:
        """Deletes the job.

        Caution: This also deletes all artifacts and snapshots associated with the job.
        """
        self._job_api.delete_job(self._guaranteed_job.id, self.teamspace.id)

    @property
    def status(self) -> "Status":
        """The current status of the job."""
        try:
            # Use _guaranteed_job (consistent with stop/delete/machines) so the job is
            # fetched first when the instance was created with _fetch_job=False;
            # self._job may not be populated yet at this point.
            status = self._job_api.get_job_status(self._guaranteed_job.id, self.teamspace.id)
            return _internal_status_to_external_status(status)
        except Exception:
            raise RuntimeError(
                f"MMT {self._name} does not exist in Teamspace {self.teamspace.name}. Did you delete it?"
            ) from None

    @property
    def artifact_path(self) -> Optional[str]:
        """Path to the artifacts created by the job within the distributed teamspace filesystem."""
        return f"/teamspace/jobs/{self.name}"

    @property
    def snapshot_path(self) -> Optional[str]:
        """Path to the studio snapshot used to create the job within the distributed teamspace filesystem."""
        return f"/teamspace/jobs/{self.name}/snapshot"

    @property
    def machine(self) -> "Machine":
        """Returns the machine type this job is running on."""
        return self.machines[0].machine

    @property
    def name(self) -> str:
        """The job's name."""
        return self._name

    @property
    def teamspace(self) -> "Teamspace":
        """The teamspace the job is part of."""
        return self._teamspace

    # the following properties and functions are solely to make the Work class function
    @property
    def _id(self) -> str:
        return self._guaranteed_job.id

    def _name_filter(self, name: str) -> str:
        # V1 work names are prefixed with "root."; strip it for display purposes.
        return name.replace("root.", "")
@@ -0,0 +1,188 @@
1
from typing import TYPE_CHECKING, Any, Dict, Optional, Tuple, Union

from lightning_sdk.api.mmt_api import MMTApiV2

if TYPE_CHECKING:
    from lightning_sdk.job.job import Job
    from lightning_sdk.machine import Machine
    from lightning_sdk.organization import Organization
    from lightning_sdk.status import Status
    from lightning_sdk.studio import Studio
    from lightning_sdk.teamspace import Teamspace
    from lightning_sdk.user import User

from lightning_sdk.mmt.base import _BaseMMT


class _MMTV2(_BaseMMT):
    """New implementation of Multi-Machine Training."""

    def __init__(
        self,
        name: str,
        teamspace: Union[str, "Teamspace", None] = None,
        org: Union[str, "Organization", None] = None,
        user: Union[str, "User", None] = None,
        *,
        _fetch_job: bool = True,
    ) -> None:
        """Fetch an already existing multi-machine job.

        Args:
            name: the name of the job
            teamspace: the teamspace the job is part of
            org: the name of the organization owning the :param`teamspace` in case it is owned by an org
            user: the name of the user owning the :param`teamspace`
                in case it is owned directly by a user instead of an org.
        """
        self._job_api = MMTApiV2()
        super().__init__(name=name, teamspace=teamspace, org=org, user=user, _fetch_job=_fetch_job)

    def _submit(
        self,
        num_machines: int,
        machine: "Machine",
        command: Optional[str] = None,
        studio: Optional["Studio"] = None,
        image: Optional[str] = None,
        env: Optional[Dict[str, str]] = None,
        interruptible: bool = False,
        cloud_account: Optional[str] = None,
        image_credentials: Optional[str] = None,
        cloud_account_auth: bool = False,
        artifacts_local: Optional[str] = None,
        artifacts_remote: Optional[str] = None,
    ) -> "_MMTV2":
        """Submit a new multi-machine job to the Lightning AI platform.

        Exactly one of ``studio`` and ``image`` must be given; ``command`` is
        mandatory with a studio and optional with an image (the container
        entrypoint runs by default).

        Args:
            num_machines: The number of machines to run on.
            machine: The machine type to run the job on.
            command: The command to run inside your job.
            studio: The studio env to run the job with. Mutually exclusive with image.
            image: The docker image to run the job with. Mutually exclusive with studio.
            env: Environment variables to set inside the job.
            interruptible: Whether the job should run on interruptible instances. They are cheaper but can be preempted.
            cloud_account: The cloud account to run the job on.
                Defaults to the studio cloud account if running with studio compute env.
                If not provided will fall back to the teamspaces default cloud account.
            image_credentials: The credentials used to pull the image. Required if the image is private.
                This should be the name of the respective credentials secret created on the Lightning AI platform.
            cloud_account_auth: Whether to authenticate with the cloud account to pull the image.
                Required if the registry is part of a cloud provider (e.g. ECR).
            artifacts_local: The path inside the docker container you want to persist artifacts from.
                CAUTION: When setting this to "/", it will effectively erase your container.
                Only supported for jobs with a docker image compute environment.
            artifacts_remote: The remote storage to persist your artifacts to.
                Should be of format <CONNECTION_TYPE>:<CONNECTION_NAME>:<PATH_WITHIN_CONNECTION>.
                PATH_WITHIN_CONNECTION hereby is a path relative to the connection's root.
                E.g. efs:data:some-path would result in an EFS connection named `data` and to the path `some-path`
                within it.
                Note that the connection needs to be added to the teamspace already in order for it to be found.
                Only supported for jobs with a docker image compute environment.
        """
        # Validate the compute-environment combination before talking to the API.
        studio_id: Optional[str] = None
        if studio is not None:
            if image is not None:
                raise ValueError(
                    "image and studio are mutually exclusive as both define the environment to run the job in"
                )
            if command is None:
                raise ValueError("command is required when using a studio")
            studio_id = studio._studio.id
        elif image is None:
            raise ValueError("either image or studio must be provided")

        submitted = self._job_api.submit_job(
            name=self.name,
            num_machines=num_machines,
            command=command,
            cloud_account=cloud_account,
            teamspace_id=self._teamspace.id,
            studio_id=studio_id,
            image=image,
            machine=machine,
            interruptible=interruptible,
            env=env,
            image_credentials=image_credentials,
            cloud_account_auth=cloud_account_auth,
            artifacts_local=artifacts_local,
            artifacts_remote=artifacts_remote,
        )
        # Keep the name returned by the backend (it may differ from the requested one).
        self._job = submitted
        self._name = submitted.name
        return self

    @property
    def machines(self) -> Tuple["Job", ...]:
        """Returns the sub-jobs for each individual instance."""
        from lightning_sdk.job import Job

        subjobs = self._job_api.list_mmt_subjobs(self._guaranteed_job.id, self.teamspace.id)
        return tuple(Job(name=subjob.name, teamspace=self.teamspace) for subjob in subjobs)

    def stop(self) -> None:
        """Stops the job."""
        self._job_api.stop_job(job_id=self._guaranteed_job.id, teamspace_id=self._teamspace.id)

    def delete(self) -> None:
        """Deletes the job.

        Caution: This also deletes all artifacts and snapshots associated with the job.
        """
        self._job_api.delete_job(
            job_id=self._guaranteed_job.id,
            teamspace_id=self._teamspace.id,
        )

    @property
    def _latest_job(self) -> Any:
        """Guarantees to fetch the latest version of a job before returning it."""
        self._update_internal_job()
        return self._job

    @property
    def status(self) -> "Status":
        """The current status of the job."""
        return self._job_api._job_state_to_external(self._latest_job.state)

    @property
    def artifact_path(self) -> Optional[str]:
        """Path to the artifacts created by the job within the distributed teamspace filesystem."""
        # TODO: Since grouping for those is not done yet on the BE, we cannot yet have a unified link here
        raise NotImplementedError

    @property
    def snapshot_path(self) -> Optional[str]:
        """Path to the studio snapshot used to create the job within the distributed teamspace filesystem."""
        # TODO: Since grouping for those is not done yet on the BE, we cannot yet have a unified link here
        raise NotImplementedError

    @property
    def machine(self) -> "Machine":
        """Returns the machine type this job is running on."""
        return self._job_api._get_job_machine_from_spec(self._guaranteed_job.spec)

    def _update_internal_job(self) -> None:
        # First fetch is by name (no id known yet); afterwards refresh by id.
        current = getattr(self, "_job", None)
        if current is None:
            self._job = self._job_api.get_job_by_name(name=self._name, teamspace_id=self._teamspace.id)
        else:
            self._job = self._job_api.get_job(job_id=current.id, teamspace_id=self._teamspace.id)

    @property
    def name(self) -> str:
        """The job's name."""
        return self._name

    @property
    def teamspace(self) -> "Teamspace":
        """The teamspace the job is part of."""
        return self._teamspace
lightning_sdk/plugin.py CHANGED
@@ -127,7 +127,15 @@ class JobsPlugin(_Plugin):
127
127
  cloud_compute: Optional[Machine] = None,
128
128
  interruptible: bool = False,
129
129
  ) -> Job:
130
- """Launches an asynchronous job."""
130
+ """Launches an asynchronous job.
131
+
132
+ Args:
133
+ command: The command to be executed.
134
+ name: The name of the job.
135
+ machine: The machine to run the job on.
136
+ interruptible: Whether to run the job on an interruptible machine.
137
+ These are cheaper but can be preempted at any time.
138
+ """
131
139
  if not name:
132
140
  name = _run_name("job")
133
141
 
@@ -139,7 +147,7 @@ class JobsPlugin(_Plugin):
139
147
  command=command,
140
148
  studio=self._studio,
141
149
  teamspace=self._studio.teamspace,
142
- cluster=self._studio._cluster,
150
+ cloud_account=self._studio.cloud_account,
143
151
  interruptible=interruptible,
144
152
  )
145
153
 
@@ -161,29 +169,39 @@ class MultiMachineTrainingPlugin(_Plugin):
161
169
  machine: Machine = Machine.CPU,
162
170
  cloud_compute: Optional[Machine] = None,
163
171
  num_instances: int = 2,
164
- strategy: str = "parallel",
165
172
  interruptible: bool = False,
166
173
  ) -> Job:
167
- """Launches an asynchronous multi-machine-training."""
174
+ """Launches an asynchronous multi-machine-training.
175
+
176
+ Args:
177
+ command: The command to be executed.
178
+ name: The name of the job.
179
+ machine: The machine to run the job on.
180
+ num_instances: The number of instances to run the job on.
181
+ interruptible: Whether to run the job on an interruptible machine.
182
+ These are cheaper but can be preempted at any time.
183
+ """
184
+ from lightning_sdk.mmt import MMT
185
+
168
186
  if not name:
169
187
  name = _run_name("dist-run")
170
188
 
171
189
  machine = _resolve_deprecated_cloud_compute(machine, cloud_compute)
172
190
 
173
- # TODO: assert num_instances >=2
174
- resp = self._studio._studio_api.create_multi_machine_job(
175
- entrypoint=command,
191
+ MMT._force_v1 = True
192
+
193
+ mmt = MMT.run(
176
194
  name=name,
177
- num_instances=num_instances,
195
+ num_machines=num_instances,
178
196
  machine=machine,
179
- strategy=strategy,
180
- studio_id=self._studio._studio.id,
181
- teamspace_id=self._studio._teamspace.id,
182
- cluster_id=self._studio._studio.cluster_id,
197
+ command=command,
198
+ studio=self._studio,
199
+ teamspace=self._studio.teamspace,
183
200
  interruptible=interruptible,
184
201
  )
185
202
 
186
- return Job(resp.name, self._studio.teamspace)
203
+ MMT._force_v1 = False
204
+ return mmt
187
205
 
188
206
 
189
207
  class MultiMachineDataPrepPlugin(_Plugin):
@@ -201,7 +219,16 @@ class MultiMachineDataPrepPlugin(_Plugin):
201
219
  num_instances: int = 2,
202
220
  interruptible: bool = False,
203
221
  ) -> Job:
204
- """Launches an asynchronous multi-machine-processing-job."""
222
+ """Launches an asynchronous multi-machine-data-processing job.
223
+
224
+ Args:
225
+ command: The command to be executed.
226
+ name: The name of the job.
227
+ machine: The machine to run the job on.
228
+ num_instances: The number of instances to run the job on.
229
+ interruptible: Whether to run the job on an interruptible machine.
230
+ These are cheaper but can be preempted at any time.
231
+ """
205
232
  if not name:
206
233
  name = _run_name("data-prep")
207
234
 
@@ -214,7 +241,7 @@ class MultiMachineDataPrepPlugin(_Plugin):
214
241
  machine=machine,
215
242
  studio_id=self._studio._studio.id,
216
243
  teamspace_id=self._studio._teamspace.id,
217
- cluster_id=self._studio._studio.cluster_id,
244
+ cloud_account=self._studio.cloud_account,
218
245
  interruptible=interruptible,
219
246
  )
220
247
 
@@ -261,7 +288,7 @@ class InferenceServerPlugin(_Plugin):
261
288
  endpoint=endpoint,
262
289
  studio_id=self._studio._studio.id,
263
290
  teamspace_id=self._studio._teamspace.id,
264
- cluster_id=self._studio._studio.cluster_id,
291
+ cloud_account=self._studio.cloud_account,
265
292
  interruptible=interruptible,
266
293
  )
267
294
 
@@ -11,6 +11,7 @@ from lightning_sdk.lightning_cloud.login import Auth
11
11
  from lightning_sdk.lightning_cloud.openapi import CommandArgumentCommandArgumentType
12
12
  from lightning_sdk.lightning_cloud.rest_client import LightningClient
13
13
  from lightning_sdk.services.utilities import _get_cluster, _get_project, _get_service_url
14
+ from lightning_sdk.utils.resolve import _resolve_deprecated_cluster
14
15
 
15
16
  urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
16
17
 
@@ -25,16 +26,19 @@ class Client:
25
26
  self,
26
27
  name: str,
27
28
  teamspace: Optional[str],
28
- cluster_id: Optional[str] = None,
29
+ cloud_account: Optional[str] = None,
30
+ cluster_id: Optional[str] = None, # deprecated in favor of cloud_account
29
31
  ) -> None:
30
32
  """Constructor of the Client.
31
33
 
32
34
  Args:
33
35
  name: The name of the Studio File Endpoint Service.
34
36
  teamspace: The name of the teamspace you want to attach the upload data and artifacts to be.
35
- cluster_id: The name of the cluster on which to upload the data.
37
+ cloud_account: The name of the cloud account on which to upload the data.
36
38
 
37
39
  """
40
+ cloud_account = _resolve_deprecated_cluster(cloud_account, cluster_id)
41
+
38
42
  self._auth = Auth()
39
43
 
40
44
  try:
@@ -46,7 +50,9 @@ class Client:
46
50
  self._teamspace = teamspace
47
51
  self._client = LightningClient()
48
52
  self._project = _get_project(client=self._client, project_name=teamspace)
49
- self._cluster = _get_cluster(client=self._client, project_id=self._project.project_id, cluster_id=cluster_id)
53
+ self._cloud_account = _get_cluster(
54
+ client=self._client, project_id=self._project.project_id, cluster_id=cloud_account
55
+ )
50
56
  self._file_endpoint = self._client.endpoint_service_get_file_endpoint_by_name(
51
57
  project_id=self._project.project_id, name=self._name
52
58
  )
@@ -101,7 +107,7 @@ class Client:
101
107
  _FileUploader(
102
108
  client=self._client,
103
109
  teamspace_id=self._project.project_id,
104
- cluster_id=self._cluster.cluster_id,
110
+ cloud_account=self._cloud_account.cluster_id,
105
111
  file_path=argument.value,
106
112
  progress_bar=True,
107
113
  remote_path=_sanitize_uploads_remote_path(argument.value),
@@ -109,7 +115,7 @@ class Client:
109
115
 
110
116
  json = {
111
117
  "teamspace_id": self._project.project_id,
112
- "cluster_id": self._cluster.cluster_id,
118
+ "cluster_id": self._cloud_account.cluster_id,
113
119
  "input": {},
114
120
  }
115
121
  for argument in self._arguments:
lightning_sdk/studio.py CHANGED
@@ -11,7 +11,7 @@ from lightning_sdk.owner import Owner
11
11
  from lightning_sdk.status import Status
12
12
  from lightning_sdk.teamspace import Teamspace
13
13
  from lightning_sdk.user import User
14
- from lightning_sdk.utils.resolve import _resolve_teamspace, _setup_logger
14
+ from lightning_sdk.utils.resolve import _resolve_deprecated_cluster, _resolve_teamspace, _setup_logger
15
15
 
16
16
  if TYPE_CHECKING:
17
17
  from lightning_sdk.plugin import Plugin
@@ -30,10 +30,9 @@ class Studio:
30
30
  teamspace: the name of the teamspace the studio is contained by
31
31
  org: the name of the organization owning the :param`teamspace` in case it is owned by an org
32
32
  user: the name of the user owning the :param`teamspace` in case it is owned directly by a user instead of an org
33
- cluster: the name of the cluster, the studio should be created on.
33
+ cloud_account: the name of the cloud account, the studio should be created on.
34
34
  Doesn't matter when the studio already exists.
35
35
  create_ok: whether the studio will be created if it does not yet exist. Defaults to True
36
-
37
36
  Note:
38
37
  Since a teamspace can either be owned by an org or by a user directly,
39
38
  only one of the arguments can be provided.
@@ -49,13 +48,14 @@ class Studio:
49
48
  teamspace: Optional[Union[str, Teamspace]] = None,
50
49
  org: Optional[Union[str, Organization]] = None,
51
50
  user: Optional[Union[str, User]] = None,
52
- cluster: Optional[str] = None,
51
+ cloud_account: Optional[str] = None,
53
52
  create_ok: bool = True,
53
+ cluster: Optional[str] = None, # deprecated in favor of cloud_account
54
54
  ) -> None:
55
55
  self._studio_api = StudioApi()
56
56
 
57
57
  self._teamspace = _resolve_teamspace(teamspace=teamspace, org=org, user=user)
58
- self._cluster = cluster
58
+ self._cloud_account = _resolve_deprecated_cluster(cloud_account, cluster)
59
59
  self._setup_done = False
60
60
 
61
61
  self._plugins = {}
@@ -70,7 +70,9 @@ class Studio:
70
70
  self._studio = self._studio_api.get_studio(name, self._teamspace.id)
71
71
  except ValueError as e:
72
72
  if create_ok:
73
- self._studio = self._studio_api.create_studio(name, self._teamspace.id, cluster=self._cluster)
73
+ self._studio = self._studio_api.create_studio(
74
+ name, self._teamspace.id, cloud_account=self._cloud_account
75
+ )
74
76
  else:
75
77
  raise ValueError(f"Studio {name} does not exist.") from e
76
78
 
@@ -144,6 +146,11 @@ class Studio:
144
146
  @property
145
147
  def cluster(self) -> str:
146
148
  """Returns the cluster the Studio is running on."""
149
+ warnings.warn("Studio.cluster is deprecated. Use Studio.cloud_account instead", DeprecationWarning)
150
+ return self.cloud_account
151
+
152
+ @property
153
+ def cloud_account(self) -> str:
147
154
  return self._studio.cluster_id
148
155
 
149
156
  def start(self, machine: Union[Machine, str] = Machine.CPU, interruptible: bool = False) -> None:
@@ -242,7 +249,7 @@ class Studio:
242
249
  self._studio_api.upload_file(
243
250
  studio_id=self._studio.id,
244
251
  teamspace_id=self._teamspace.id,
245
- cluster_id=self._studio.cluster_id,
252
+ cloud_account=self._studio.cluster_id,
246
253
  file_path=file_path,
247
254
  remote_path=os.path.normpath(remote_path),
248
255
  progress_bar=progress_bar,
@@ -258,7 +265,7 @@ class Studio:
258
265
  target_path=file_path,
259
266
  studio_id=self._studio.id,
260
267
  teamspace_id=self._teamspace.id,
261
- cluster_id=self._studio.cluster_id,
268
+ cloud_account=self._studio.cluster_id,
262
269
  )
263
270
 
264
271
  def download_folder(self, remote_path: str, target_path: Optional[str] = None) -> None:
@@ -271,7 +278,7 @@ class Studio:
271
278
  target_path=target_path,
272
279
  studio_id=self._studio.id,
273
280
  teamspace_id=self._teamspace.id,
274
- cluster_id=self._studio.cluster_id,
281
+ cloud_account=self._studio.cluster_id,
275
282
  )
276
283
 
277
284
  @property