lightning-sdk 0.1.40__py3-none-any.whl → 0.1.42__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (82) hide show
  1. lightning_sdk/__init__.py +1 -1
  2. lightning_sdk/ai_hub.py +8 -3
  3. lightning_sdk/api/ai_hub_api.py +3 -3
  4. lightning_sdk/api/deployment_api.py +6 -6
  5. lightning_sdk/api/job_api.py +32 -6
  6. lightning_sdk/api/mmt_api.py +60 -19
  7. lightning_sdk/api/studio_api.py +37 -19
  8. lightning_sdk/api/teamspace_api.py +34 -29
  9. lightning_sdk/api/utils.py +48 -35
  10. lightning_sdk/cli/ai_hub.py +3 -3
  11. lightning_sdk/cli/entrypoint.py +3 -1
  12. lightning_sdk/cli/mmt.py +11 -10
  13. lightning_sdk/cli/run.py +9 -8
  14. lightning_sdk/cli/serve.py +130 -0
  15. lightning_sdk/deployment/deployment.py +18 -12
  16. lightning_sdk/job/base.py +118 -24
  17. lightning_sdk/job/job.py +87 -9
  18. lightning_sdk/job/v1.py +75 -18
  19. lightning_sdk/job/v2.py +51 -15
  20. lightning_sdk/job/work.py +36 -7
  21. lightning_sdk/lightning_cloud/openapi/__init__.py +13 -0
  22. lightning_sdk/lightning_cloud/openapi/api/jobs_service_api.py +215 -5
  23. lightning_sdk/lightning_cloud/openapi/api/lit_logger_service_api.py +218 -0
  24. lightning_sdk/lightning_cloud/openapi/api/models_store_api.py +226 -0
  25. lightning_sdk/lightning_cloud/openapi/api/secret_service_api.py +5 -1
  26. lightning_sdk/lightning_cloud/openapi/api/snowflake_service_api.py +21 -1
  27. lightning_sdk/lightning_cloud/openapi/models/__init__.py +13 -0
  28. lightning_sdk/lightning_cloud/openapi/models/create_deployment_request_defines_a_spec_for_the_job_that_allows_for_autoscaling_jobs.py +27 -1
  29. lightning_sdk/lightning_cloud/openapi/models/deploymenttemplates_id_body.py +27 -1
  30. lightning_sdk/lightning_cloud/openapi/models/id_visibility_body.py +123 -0
  31. lightning_sdk/lightning_cloud/openapi/models/model_id_versions_body.py +29 -3
  32. lightning_sdk/lightning_cloud/openapi/models/project_id_multimachinejobs_body.py +27 -1
  33. lightning_sdk/lightning_cloud/openapi/models/project_id_snowflake_body.py +15 -67
  34. lightning_sdk/lightning_cloud/openapi/models/query_query_id_body.py +17 -69
  35. lightning_sdk/lightning_cloud/openapi/models/snowflake_export_body.py +29 -81
  36. lightning_sdk/lightning_cloud/openapi/models/snowflake_query_body.py +17 -69
  37. lightning_sdk/lightning_cloud/openapi/models/v1_deployment_api.py +27 -1
  38. lightning_sdk/lightning_cloud/openapi/models/v1_deployment_spec.py +27 -1
  39. lightning_sdk/lightning_cloud/openapi/models/v1_get_model_file_url_response.py +27 -1
  40. lightning_sdk/lightning_cloud/openapi/models/v1_get_model_files_response.py +17 -17
  41. lightning_sdk/lightning_cloud/openapi/models/v1_get_model_files_url_response.py +149 -0
  42. lightning_sdk/lightning_cloud/openapi/models/v1_get_project_balance_response.py +27 -1
  43. lightning_sdk/lightning_cloud/openapi/models/v1_header.py +175 -0
  44. lightning_sdk/lightning_cloud/openapi/models/v1_job_spec.py +27 -1
  45. lightning_sdk/lightning_cloud/openapi/models/v1_list_multi_machine_job_events_response.py +123 -0
  46. lightning_sdk/lightning_cloud/openapi/models/v1_managed_model.py +29 -3
  47. lightning_sdk/lightning_cloud/openapi/models/v1_metrics_stream.py +27 -1
  48. lightning_sdk/lightning_cloud/openapi/models/v1_model_file.py +175 -0
  49. lightning_sdk/lightning_cloud/openapi/models/v1_multi_machine_job.py +27 -1
  50. lightning_sdk/lightning_cloud/openapi/models/v1_multi_machine_job_event.py +331 -0
  51. lightning_sdk/lightning_cloud/openapi/models/v1_multi_machine_job_event_type.py +104 -0
  52. lightning_sdk/lightning_cloud/openapi/models/v1_multi_machine_job_fault_tolerance.py +149 -0
  53. lightning_sdk/lightning_cloud/openapi/models/v1_multi_machine_job_fault_tolerance_strategy.py +105 -0
  54. lightning_sdk/lightning_cloud/openapi/models/v1_multi_machine_job_status.py +27 -1
  55. lightning_sdk/lightning_cloud/openapi/models/v1_rule_resource.py +2 -0
  56. lightning_sdk/lightning_cloud/openapi/models/v1_secret_type.py +1 -0
  57. lightning_sdk/lightning_cloud/openapi/models/v1_snowflake_data_connection.py +29 -81
  58. lightning_sdk/lightning_cloud/openapi/models/v1_system_metrics.py +29 -3
  59. lightning_sdk/lightning_cloud/openapi/models/v1_trainium_system_metrics.py +175 -0
  60. lightning_sdk/lightning_cloud/openapi/models/v1_update_metrics_stream_visibility_response.py +97 -0
  61. lightning_sdk/lightning_cloud/openapi/models/v1_user_features.py +41 -67
  62. lightning_sdk/lightning_cloud/openapi/models/v1_validate_deployment_image_request.py +149 -0
  63. lightning_sdk/lightning_cloud/openapi/models/v1_validate_deployment_image_response.py +97 -0
  64. lightning_sdk/lightning_cloud/rest_client.py +2 -0
  65. lightning_sdk/mmt/__init__.py +3 -0
  66. lightning_sdk/{_mmt → mmt}/base.py +20 -14
  67. lightning_sdk/{_mmt → mmt}/mmt.py +46 -17
  68. lightning_sdk/mmt/v1.py +129 -0
  69. lightning_sdk/{_mmt → mmt}/v2.py +16 -21
  70. lightning_sdk/plugin.py +43 -16
  71. lightning_sdk/services/file_endpoint.py +11 -5
  72. lightning_sdk/studio.py +16 -9
  73. lightning_sdk/teamspace.py +26 -14
  74. lightning_sdk/utils/resolve.py +18 -0
  75. {lightning_sdk-0.1.40.dist-info → lightning_sdk-0.1.42.dist-info}/METADATA +3 -1
  76. {lightning_sdk-0.1.40.dist-info → lightning_sdk-0.1.42.dist-info}/RECORD +80 -66
  77. lightning_sdk/_mmt/__init__.py +0 -3
  78. lightning_sdk/_mmt/v1.py +0 -69
  79. {lightning_sdk-0.1.40.dist-info → lightning_sdk-0.1.42.dist-info}/LICENSE +0 -0
  80. {lightning_sdk-0.1.40.dist-info → lightning_sdk-0.1.42.dist-info}/WHEEL +0 -0
  81. {lightning_sdk-0.1.40.dist-info → lightning_sdk-0.1.42.dist-info}/entry_points.txt +0 -0
  82. {lightning_sdk-0.1.40.dist-info → lightning_sdk-0.1.42.dist-info}/top_level.txt +0 -0
lightning_sdk/job/base.py CHANGED
@@ -1,7 +1,7 @@
1
1
  from abc import ABC, abstractmethod
2
- from typing import TYPE_CHECKING, Dict, Optional, Union
2
+ from typing import TYPE_CHECKING, Any, Dict, Optional, Union
3
3
 
4
- from lightning_sdk.utils.resolve import _resolve_teamspace
4
+ from lightning_sdk.utils.resolve import _resolve_deprecated_cluster, _resolve_teamspace
5
5
 
6
6
  if TYPE_CHECKING:
7
7
  from lightning_sdk.machine import Machine
@@ -13,6 +13,8 @@ if TYPE_CHECKING:
13
13
 
14
14
 
15
15
  class _BaseJob(ABC):
16
+ """Base interface to all job types."""
17
+
16
18
  def __init__(
17
19
  self,
18
20
  name: str,
@@ -22,6 +24,15 @@ class _BaseJob(ABC):
22
24
  *,
23
25
  _fetch_job: bool = True,
24
26
  ) -> None:
27
+ """Fetch already existing jobs.
28
+
29
+ Args:
30
+ name: the name of the job
31
+ teamspace: the teamspace the job is part of
32
+ org: the name of the organization owning the :param`teamspace` in case it is owned by an org
33
+ user: the name of the user owning the :param`teamspace`
34
+ in case it is owned directly by a user instead of an org.
35
+ """
25
36
  _teamspace = _resolve_teamspace(teamspace=teamspace, org=org, user=user)
26
37
  if _teamspace is None:
27
38
  raise ValueError(
@@ -47,22 +58,59 @@ class _BaseJob(ABC):
47
58
  teamspace: Union[str, "Teamspace", None] = None,
48
59
  org: Union[str, "Organization", None] = None,
49
60
  user: Union[str, "User", None] = None,
50
- cluster: Optional[str] = None,
61
+ cloud_account: Optional[str] = None,
51
62
  env: Optional[Dict[str, str]] = None,
52
63
  interruptible: bool = False,
53
64
  image_credentials: Optional[str] = None,
54
- cluster_auth: bool = False,
65
+ cloud_account_auth: bool = False,
55
66
  artifacts_local: Optional[str] = None,
56
67
  artifacts_remote: Optional[str] = None,
68
+ cluster: Optional[str] = None, # deprecated in favor of cloud_account
57
69
  ) -> "_BaseJob":
70
+ """Run async workloads using a docker image or a compute environment from your studio.
71
+
72
+ Args:
73
+ name: The name of the job. Needs to be unique within the teamspace.
74
+ machine: The machine type to run the job on. One of {", ".join(_MACHINE_VALUES)}.
75
+ command: The command to run inside your job. Required if using a studio. Optional if using an image.
76
+ If not provided for images, will run the container entrypoint and default command.
77
+ studio: The studio env to run the job with. Mutually exclusive with image.
78
+ image: The docker image to run the job with. Mutually exclusive with studio.
79
+ teamspace: The teamspace the job should be associated with. Defaults to the current teamspace.
80
+ org: The organization owning the teamspace (if any). Defaults to the current organization.
81
+ user: The user owning the teamspace (if any). Defaults to the current user.
82
+ cloud_account: The cloud account to run the job on.
83
+ Defaults to the studio cloud account if running with studio compute env.
84
+ If not provided will fall back to the teamspaces default cloud account.
85
+ env: Environment variables to set inside the job.
86
+ interruptible: Whether the job should run on interruptible instances. They are cheaper but can be preempted.
87
+ image_credentials: The credentials used to pull the image. Required if the image is private.
88
+ This should be the name of the respective credentials secret created on the Lightning AI platform.
89
+ cloud_account_auth: Whether to authenticate with the cloud account to pull the image.
90
+ Required if the registry is part of a cloud provider (e.g. ECR).
91
+ artifacts_local: The path inside the docker container you want to persist artifacts from.
92
+ CAUTION: When setting this to "/", it will effectively erase your container.
93
+ Only supported for jobs with a docker image compute environment.
94
+ artifacts_remote: The remote storage to persist your artifacts to.
95
+ Should be of format <CONNECTION_TYPE>:<CONNECTION_NAME>:<PATH_WITHIN_CONNECTION>.
96
+ PATH_WITHIN_CONNECTION hereby is a path relative to the connection's root.
97
+ E.g. efs:data:some-path would result in an EFS connection named `data` and to the path `some-path`
98
+ within it.
99
+ Note that the connection needs to be added to the teamspace already in order for it to be found.
100
+ Only supported for jobs with a docker image compute environment.
101
+ """
58
102
  from lightning_sdk.studio import Studio
59
103
 
104
+ cloud_account = _resolve_deprecated_cluster(cloud_account, cluster)
105
+
60
106
  if not name:
61
107
  raise ValueError("A job needs to have a name!")
62
108
 
63
109
  if image is None:
64
110
  if not isinstance(studio, Studio):
65
- studio = Studio(name=studio, teamspace=teamspace, org=org, user=user, cluster=cluster, create_ok=False)
111
+ studio = Studio(
112
+ name=studio, teamspace=teamspace, org=org, user=user, cloud_account=cloud_account, create_ok=False
113
+ )
66
114
 
67
115
  # studio is a Studio instance at this point
68
116
  if teamspace is None:
@@ -76,20 +124,20 @@ class _BaseJob(ABC):
76
124
  "Can only run jobs with Studio envs in the teamspace of that Studio."
77
125
  )
78
126
 
79
- if cluster is None:
80
- cluster = studio.cluster
127
+ if cloud_account is None:
128
+ cloud_account = studio.cloud_account
81
129
 
82
- if cluster != studio.cluster:
130
+ if cloud_account != studio.cloud_account:
83
131
  raise ValueError(
84
- "Studio cluster does not match provided cluster. "
85
- "Can only run jobs with Studio envs in the same cluster."
132
+ "Studio cloud account does not match provided cloud account. "
133
+ "Can only run jobs with Studio envs in the same cloud account."
86
134
  )
87
135
 
88
136
  if image_credentials is not None:
89
137
  raise ValueError("image_credentials is only supported when using a custom image")
90
138
 
91
- if cluster_auth:
92
- raise ValueError("cluster_auth is only supported when using a custom image")
139
+ if cloud_account_auth:
140
+ raise ValueError("cloud_account_auth is only supported when using a custom image")
93
141
 
94
142
  if artifacts_local is not None or artifacts_remote is not None:
95
143
  raise ValueError(
@@ -116,14 +164,14 @@ class _BaseJob(ABC):
116
164
  inst = cls(name=name, teamspace=teamspace, org=org, user=user, _fetch_job=False)
117
165
  return inst._submit(
118
166
  machine=machine,
119
- cluster=cluster,
167
+ cloud_account=cloud_account,
120
168
  command=command,
121
169
  studio=studio,
122
170
  image=image,
123
171
  env=env,
124
172
  interruptible=interruptible,
125
173
  image_credentials=image_credentials,
126
- cluster_auth=cluster_auth,
174
+ cloud_account_auth=cloud_account_auth,
127
175
  artifacts_local=artifacts_local,
128
176
  artifacts_remote=artifacts_remote,
129
177
  )
@@ -137,46 +185,79 @@ class _BaseJob(ABC):
137
185
  image: Optional[str] = None,
138
186
  env: Optional[Dict[str, str]] = None,
139
187
  interruptible: bool = False,
140
- cluster: Optional[str] = None,
188
+ cloud_account: Optional[str] = None,
141
189
  image_credentials: Optional[str] = None,
142
- cluster_auth: bool = False,
190
+ cloud_account_auth: bool = False,
143
191
  artifacts_local: Optional[str] = None,
144
192
  artifacts_remote: Optional[str] = None,
145
193
  ) -> "_BaseJob":
146
- """Submits a job and updates the internal _job attribute as well as the _name attribute."""
194
+ """Submit a new job to the Lightning AI platform.
195
+
196
+ Args:
197
+ machine: The machine type to run the job on. One of {", ".join(_MACHINE_VALUES)}.
198
+ command: The command to run inside your job. Required if using a studio. Optional if using an image.
199
+ If not provided for images, will run the container entrypoint and default command.
200
+ studio: The studio env to run the job with. Mutually exclusive with image.
201
+ image: The docker image to run the job with. Mutually exclusive with studio.
202
+ env: Environment variables to set inside the job.
203
+ interruptible: Whether the job should run on interruptible instances. They are cheaper but can be preempted.
204
+ cloud_account: The cloud account to run the job on.
205
+ Defaults to the studio cloud account if running with studio compute env.
206
+ If not provided will fall back to the teamspaces default cloud account.
207
+ image_credentials: The credentials used to pull the image. Required if the image is private.
208
+ This should be the name of the respective credentials secret created on the Lightning AI platform.
209
+ cloud_account_auth: Whether to authenticate with the cloud account to pull the image.
210
+ Required if the registry is part of a cloud provider (e.g. ECR).
211
+ artifacts_local: The path of inside the docker container, you want to persist images from.
212
+ CAUTION: When setting this to "/", it will effectively erase your container.
213
+ Only supported for jobs with a docker image compute environment.
214
+ artifacts_remote: The remote storage to persist your artifacts to.
215
+ Should be of format <CONNECTION_TYPE>:<CONNECTION_NAME>:<PATH_WITHIN_CONNECTION>.
216
+ PATH_WITHIN_CONNECTION hereby is a path relative to the connection's root.
217
+ E.g. efs:data:some-path would result in an EFS connection named `data` and to the path `some-path`
218
+ within it.
219
+ Note that the connection needs to be added to the teamspace already in order for it to be found.
220
+ Only supported for jobs with a docker image compute environment.
221
+ """
147
222
 
148
223
  @abstractmethod
149
224
  def stop(self) -> None:
150
- pass
225
+ """Stops the job.
226
+
227
+ This is blocking until the job is stopped.
228
+ """
151
229
 
152
230
  @abstractmethod
153
231
  def delete(self) -> None:
154
- pass
232
+ """Deletes the job.
233
+
234
+ Caution: This also deletes all artifacts and snapshots associated with the job.
235
+ """
155
236
 
156
237
  @property
157
238
  @abstractmethod
158
239
  def status(self) -> "Status":
159
- pass
240
+ """The current status of the job."""
160
241
 
161
242
  @property
162
243
  @abstractmethod
163
244
  def machine(self) -> "Machine":
164
- pass
245
+ """The machine type the job is running on."""
165
246
 
166
247
  @property
167
248
  @abstractmethod
168
249
  def artifact_path(self) -> Optional[str]:
169
- pass
250
+ """Path to the artifacts created by the job within the distributed teamspace filesystem."""
170
251
 
171
252
  @property
172
253
  @abstractmethod
173
254
  def snapshot_path(self) -> Optional[str]:
174
- pass
255
+ """Path to the studio snapshot used to create the job within the distributed teamspace filesystem."""
175
256
 
176
257
  @property
177
258
  @abstractmethod
178
259
  def share_path(self) -> Optional[str]:
179
- pass
260
+ """Path to the jobs share path."""
180
261
 
181
262
  @abstractmethod
182
263
  def _update_internal_job(self) -> None:
@@ -184,8 +265,21 @@ class _BaseJob(ABC):
184
265
 
185
266
  @property
186
267
  def name(self) -> str:
268
+ """The job's name."""
187
269
  return self._name
188
270
 
189
271
  @property
190
272
  def teamspace(self) -> "Teamspace":
273
+ """The teamspace the job is part of."""
191
274
  return self._teamspace
275
+
276
+ @property
277
+ def _guaranteed_job(self) -> Any:
278
+ """Guarantees that the job was fetched at some point before returning it.
279
+
280
+ Doesn't guarantee to have the latest version of the job. Use _latest_job for that.
281
+ """
282
+ if getattr(self, "_job", None) is None:
283
+ self._update_internal_job()
284
+
285
+ return self._job
lightning_sdk/job/job.py CHANGED
@@ -55,14 +55,47 @@ class Job(_BaseJob):
55
55
  teamspace: Union[str, "Teamspace", None] = None,
56
56
  org: Union[str, "Organization", None] = None,
57
57
  user: Union[str, "User", None] = None,
58
- cluster: Optional[str] = None,
58
+ cloud_account: Optional[str] = None,
59
59
  env: Optional[Dict[str, str]] = None,
60
60
  interruptible: bool = False,
61
61
  image_credentials: Optional[str] = None,
62
- cluster_auth: bool = False,
62
+ cloud_account_auth: bool = False,
63
63
  artifacts_local: Optional[str] = None,
64
64
  artifacts_remote: Optional[str] = None,
65
+ cluster: Optional[str] = None, # deprecated in favor of cloud_account
65
66
  ) -> "Job":
67
+ """Run async workloads using a docker image or a compute environment from your studio.
68
+
69
+ Args:
70
+ name: The name of the job. Needs to be unique within the teamspace.
71
+ machine: The machine type to run the job on. One of {", ".join(_MACHINE_VALUES)}.
72
+ command: The command to run inside your job. Required if using a studio. Optional if using an image.
73
+ If not provided for images, will run the container entrypoint and default command.
74
+ studio: The studio env to run the job with. Mutually exclusive with image.
75
+ image: The docker image to run the job with. Mutually exclusive with studio.
76
+ teamspace: The teamspace the job should be associated with. Defaults to the current teamspace.
77
+ org: The organization owning the teamspace (if any). Defaults to the current organization.
78
+ user: The user owning the teamspace (if any). Defaults to the current user.
79
+ cloud_account: The cloud account to run the job on.
80
+ Defaults to the studio cloud account if running with studio compute env.
81
+ If not provided will fall back to the teamspaces default cloud account.
82
+ env: Environment variables to set inside the job.
83
+ interruptible: Whether the job should run on interruptible instances. They are cheaper but can be preempted.
84
+ image_credentials: The credentials used to pull the image. Required if the image is private.
85
+ This should be the name of the respective credentials secret created on the Lightning AI platform.
86
+ cloud_account_auth: Whether to authenticate with the cloud account to pull the image.
87
+ Required if the registry is part of a cloud provider (e.g. ECR).
88
+ artifacts_local: The path inside the docker container you want to persist artifacts from.
89
+ CAUTION: When setting this to "/", it will effectively erase your container.
90
+ Only supported for jobs with a docker image compute environment.
91
+ artifacts_remote: The remote storage to persist your artifacts to.
92
+ Should be of format <CONNECTION_TYPE>:<CONNECTION_NAME>:<PATH_WITHIN_CONNECTION>.
93
+ PATH_WITHIN_CONNECTION hereby is a path relative to the connection's root.
94
+ E.g. efs:data:some-path would result in an EFS connection named `data` and to the path `some-path`
95
+ within it.
96
+ Note that the connection needs to be added to the teamspace already in order for it to be found.
97
+ Only supported for jobs with a docker image compute environment.
98
+ """
66
99
  ret_val = super().run(
67
100
  name=name,
68
101
  machine=machine,
@@ -72,13 +105,14 @@ class Job(_BaseJob):
72
105
  teamspace=teamspace,
73
106
  org=org,
74
107
  user=user,
75
- cluster=cluster,
108
+ cloud_account=cloud_account,
76
109
  env=env,
77
110
  interruptible=interruptible,
78
111
  image_credentials=image_credentials,
79
- cluster_auth=cluster_auth,
112
+ cloud_account_auth=cloud_account_auth,
80
113
  artifacts_local=artifacts_local,
81
114
  artifacts_remote=artifacts_remote,
115
+ cluster=cluster,
82
116
  )
83
117
  # required for typing with "Job"
84
118
  assert isinstance(ret_val, cls)
@@ -92,51 +126,92 @@ class Job(_BaseJob):
92
126
  image: Optional[str] = None,
93
127
  env: Optional[Dict[str, str]] = None,
94
128
  interruptible: bool = False,
95
- cluster: Optional[str] = None,
129
+ cloud_account: Optional[str] = None,
96
130
  image_credentials: Optional[str] = None,
97
- cluster_auth: bool = False,
131
+ cloud_account_auth: bool = False,
98
132
  artifacts_local: Optional[str] = None,
99
133
  artifacts_remote: Optional[str] = None,
100
- ) -> None:
134
+ ) -> "Job":
135
+ """Submit a new job to the Lightning AI platform.
136
+
137
+ Args:
138
+ machine: The machine type to run the job on. One of {", ".join(_MACHINE_VALUES)}.
139
+ command: The command to run inside your job. Required if using a studio. Optional if using an image.
140
+ If not provided for images, will run the container entrypoint and default command.
141
+ studio: The studio env to run the job with. Mutually exclusive with image.
142
+ image: The docker image to run the job with. Mutually exclusive with studio.
143
+ env: Environment variables to set inside the job.
144
+ interruptible: Whether the job should run on interruptible instances. They are cheaper but can be preempted.
145
+ cloud_account: The cloud account to run the job on.
146
+ Defaults to the studio cloud account if running with studio compute env.
147
+ If not provided will fall back to the teamspaces default cloud account.
148
+ image_credentials: The credentials used to pull the image. Required if the image is private.
149
+ This should be the name of the respective credentials secret created on the Lightning AI platform.
150
+ cloud_account_auth: Whether to authenticate with the cloud account to pull the image.
151
+ Required if the registry is part of a cloud provider (e.g. ECR).
152
+ artifacts_local: The path inside the docker container you want to persist artifacts from.
153
+ CAUTION: When setting this to "/", it will effectively erase your container.
154
+ Only supported for jobs with a docker image compute environment.
155
+ artifacts_remote: The remote storage to persist your artifacts to.
156
+ Should be of format <CONNECTION_TYPE>:<CONNECTION_NAME>:<PATH_WITHIN_CONNECTION>.
157
+ PATH_WITHIN_CONNECTION hereby is a path relative to the connection's root.
158
+ E.g. efs:data:some-path would result in an EFS connection named `data` and to the path `some-path`
159
+ within it.
160
+ Note that the connection needs to be added to the teamspace already in order for it to be found.
161
+ Only supported for jobs with a docker image compute environment.
162
+ """
101
163
  self._job = self._internal_job._submit(
102
164
  machine=machine,
103
- cluster=cluster,
165
+ cloud_account=cloud_account,
104
166
  command=command,
105
167
  studio=studio,
106
168
  image=image,
107
169
  env=env,
108
170
  interruptible=interruptible,
109
171
  image_credentials=image_credentials,
110
- cluster_auth=cluster_auth,
172
+ cloud_account_auth=cloud_account_auth,
111
173
  artifacts_local=artifacts_local,
112
174
  artifacts_remote=artifacts_remote,
113
175
  )
114
176
  return self
115
177
 
116
178
  def stop(self) -> None:
179
+ """Stops the job.
180
+
181
+ This is blocking until the job is stopped.
182
+ """
117
183
  return self._internal_job.stop()
118
184
 
119
185
  def delete(self) -> None:
186
+ """Deletes the job.
187
+
188
+ Caution: This also deletes all artifacts and snapshots associated with the job.
189
+ """
120
190
  return self._internal_job.delete()
121
191
 
122
192
  @property
123
193
  def status(self) -> "Status":
194
+ """The current status of the job."""
124
195
  return self._internal_job.status
125
196
 
126
197
  @property
127
198
  def machine(self) -> "Machine":
199
+ """The machine type the job is running on."""
128
200
  return self._internal_job.machine
129
201
 
130
202
  @property
131
203
  def artifact_path(self) -> Optional[str]:
204
+ """Path to the artifacts created by the job within the distributed teamspace filesystem."""
132
205
  return self._internal_job.artifact_path
133
206
 
134
207
  @property
135
208
  def snapshot_path(self) -> Optional[str]:
209
+ """Path to the studio snapshot used to create the job within the distributed teamspace filesystem."""
136
210
  return self._internal_job.snapshot_path
137
211
 
138
212
  @property
139
213
  def share_path(self) -> Optional[str]:
214
+ """Path to the jobs share path."""
140
215
  return self._internal_job.share_path
141
216
 
142
217
  def _update_internal_job(self) -> None:
@@ -144,14 +219,17 @@ class Job(_BaseJob):
144
219
 
145
220
  @property
146
221
  def name(self) -> str:
222
+ """The job's name."""
147
223
  return self._internal_job.name
148
224
 
149
225
  @property
150
226
  def teamspace(self) -> "Teamspace":
227
+ """The teamspace the job is part of."""
151
228
  return self._internal_job._teamspace
152
229
 
153
230
  @property
154
231
  def cluster(self) -> Optional[str]:
232
+ """The cluster the job is running on."""
155
233
  return self._internal_job.cluster
156
234
 
157
235
  def __getattr__(self, key: str) -> Any:
lightning_sdk/job/v1.py CHANGED
@@ -17,6 +17,8 @@ from lightning_sdk.job.work import Work
17
17
 
18
18
 
19
19
  class _JobV1(_BaseJob):
20
+ """Implementation to run async workloads from your Studio."""
21
+
20
22
  def __init__(
21
23
  self,
22
24
  name: str,
@@ -26,6 +28,15 @@ class _JobV1(_BaseJob):
26
28
  *,
27
29
  _fetch_job: bool = True,
28
30
  ) -> None:
31
+ """Fetch already existing jobs.
32
+
33
+ Args:
34
+ name: the name of the job
35
+ teamspace: the teamspace the job is part of
36
+ org: the name of the organization owning the :param`teamspace` in case it is owned by an org
37
+ user: the name of the user owning the :param`teamspace`
38
+ in case it is owned directly by a user instead of an org
39
+ """
29
40
  self._job_api = JobApiV1()
30
41
  super().__init__(name=name, teamspace=teamspace, org=org, user=user, _fetch_job=_fetch_job)
31
42
 
@@ -39,9 +50,26 @@ class _JobV1(_BaseJob):
39
50
  teamspace: Union[str, "Teamspace", None] = None,
40
51
  org: Union[str, "Organization", None] = None,
41
52
  user: Union[str, "User", None] = None,
42
- cluster: Optional[str] = None,
53
+ cloud_account: Optional[str] = None,
43
54
  interruptible: bool = False,
55
+ cluster: Optional[str] = None, # deprecated in favor of cloud_account
44
56
  ) -> "_BaseJob":
57
+ """Start a new async workload from your studio.
58
+
59
+ Args:
60
+ name: the name of the job
61
+ machine: the machine to run the workload on
62
+ command: the command to execute
63
+ studio: the studio the job belongs to
64
+ teamspace: the teamspace the job is part of
65
+ org: the organization owning the teamspace (if applicable)
66
+ user: the user owning the teamspace (if applicable)
67
+ cloud_account: the cloud account to run the workload on
68
+ interruptible: whether the workload can be interrupted
69
+
70
+ Returns:
71
+ the created job
72
+ """
45
73
  return super().run(
46
74
  name=name,
47
75
  machine=machine,
@@ -51,11 +79,12 @@ class _JobV1(_BaseJob):
51
79
  teamspace=teamspace,
52
80
  org=org,
53
81
  user=user,
54
- cluster=cluster,
82
+ cloud_account=cloud_account,
55
83
  env=None,
56
84
  interruptible=interruptible,
57
85
  image_credentials=None,
58
- cluster_auth=False,
86
+ cloud_account_auth=False,
87
+ cluster=cluster,
59
88
  )
60
89
 
61
90
  def _submit(
@@ -66,16 +95,34 @@ class _JobV1(_BaseJob):
66
95
  image: Optional[str] = None,
67
96
  env: Optional[Dict[str, str]] = None,
68
97
  interruptible: bool = False,
69
- cluster: Optional[str] = None,
98
+ cloud_account: Optional[str] = None,
70
99
  image_credentials: Optional[str] = None,
71
- cluster_auth: bool = False,
100
+ cloud_account_auth: bool = False,
72
101
  artifacts_local: Optional[str] = None,
73
102
  artifacts_remote: Optional[str] = None,
74
103
  ) -> "_JobV1":
104
+ """Submit a job to run on a machine.
105
+
106
+ Args:
107
+ machine: The machine to run the job on.
108
+ command: The command to execute.
109
+ studio: The studio the job belongs to.
110
+ image: The image to use for the job (not supported).
111
+ env: The environment variables for the job (not supported).
112
+ interruptible: Whether the job can be interrupted.
113
+ cloud_account: The cloud account to run the job on.
114
+ image_credentials: The image credentials for the job (not supported).
115
+ cloud_account_auth: Whether to use cloud account authentication for the job (not supported).
116
+ artifacts_local: The local path for persisting artifacts (not supported).
117
+ artifacts_remote: The remote path for persisting artifacts (not supported).
118
+
119
+ Returns:
120
+ The submitted job.
121
+
122
+ """
75
123
  if studio is None:
76
124
  raise ValueError("Studio is required for submitting jobs")
77
-
78
- if image is not None or image_credentials is not None or cluster_auth:
125
+ if image is not None or image_credentials is not None or cloud_account_auth:
79
126
  raise ValueError("Image is not supported for submitting jobs")
80
127
 
81
128
  if artifacts_local is not None or artifacts_remote is not None:
@@ -83,18 +130,15 @@ class _JobV1(_BaseJob):
83
130
 
84
131
  if env is not None:
85
132
  raise ValueError("Environment variables are not supported for submitting jobs")
86
-
87
133
  if command is None:
88
134
  raise ValueError("Command is required for submitting jobs")
89
-
90
135
  # TODO: add support for empty names (will give an empty string)
91
-
92
136
  _submitted = self._job_api.submit_job(
93
137
  name=self._name,
94
138
  command=command,
95
139
  studio_id=studio._studio.id,
96
140
  teamspace_id=self._teamspace.id,
97
- cluster_id=cluster,
141
+ cloud_account=cloud_account or "",
98
142
  machine=machine,
99
143
  interruptible=interruptible,
100
144
  )
@@ -110,6 +154,7 @@ class _JobV1(_BaseJob):
110
154
 
111
155
  @property
112
156
  def status(self) -> "Status":
157
+ """Returns the status of the job."""
113
158
  try:
114
159
  status = self._job_api.get_job_status(self._job.id, self.teamspace.id)
115
160
  return _internal_status_to_external_status(status)
@@ -119,19 +164,22 @@ class _JobV1(_BaseJob):
119
164
  ) from None
120
165
 
121
166
  def stop(self) -> None:
167
+ """Stops the job. is blocking until the ob is stopped."""
122
168
  if self.status in (Status.Stopped, Status.Failed):
123
169
  return None
124
170
 
125
171
  return self._job_api.stop_job(self._job.id, self.teamspace.id)
126
172
 
127
173
  def delete(self) -> None:
128
- self._job_api.delete_job(self._job.id, self.teamspace.id)
174
+ """Deletes the job.
129
175
 
130
- def _name_filter(self, orig_name: str) -> str:
131
- return orig_name.replace("root.", "")
176
+ Caution: this also deletes all artifacts created by the job.
177
+ """
178
+ self._job_api.delete_job(self._job.id, self.teamspace.id)
132
179
 
133
180
  @cached_property
134
181
  def work(self) -> Work:
182
+ """Get the work associated with the job."""
135
183
  _work = self._job_api.list_works(self._job.id, self.teamspace.id)
136
184
  if len(_work) == 0:
137
185
  raise ValueError("No works found for job")
@@ -139,28 +187,37 @@ class _JobV1(_BaseJob):
139
187
 
140
188
  @property
141
189
  def machine(self) -> "Machine":
190
+ """Get the machine the job is running on."""
142
191
  return self.work.machine
143
192
 
144
- @property
145
- def id(self) -> str:
146
- return self._job.id
147
-
148
193
  @property
149
194
  def name(self) -> str:
195
+ """The name of the job."""
150
196
  return self._job.name
151
197
 
152
198
  @property
153
199
  def artifact_path(self) -> Optional[str]:
200
+ """The path to the artifacts of the job in the distributed teamspace filesystem."""
154
201
  return self.work.artifact_path
155
202
 
156
203
  @property
157
204
  def snapshot_path(self) -> Optional[str]:
205
+ """The path to the snapshot of the job in the distributed teamspace filesystem."""
158
206
  return f"/teamspace/jobs/{self.name}/snapshot"
159
207
 
160
208
  @property
161
209
  def share_path(self) -> Optional[str]:
210
+ """The path to the share of the job in the distributed teamspace filesystem."""
162
211
  return f"/teamspace/jobs/{self.name}/share"
163
212
 
213
+ # the following and functions are solely to make the Work class function
214
+ @property
215
+ def _id(self) -> str:
216
+ return self._guaranteed_job.id
217
+
218
+ def _name_filter(self, name: str) -> str:
219
+ return name.replace("root.", "")
220
+
164
221
 
165
222
  def _internal_status_to_external_status(internal_status: str) -> "Status":
166
223
  """Converts internal status strings from HTTP requests to external enums."""