lightning-sdk 0.1.41__py3-none-any.whl → 0.1.43__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (77)
  1. lightning_sdk/__init__.py +1 -1
  2. lightning_sdk/ai_hub.py +8 -3
  3. lightning_sdk/api/ai_hub_api.py +3 -3
  4. lightning_sdk/api/deployment_api.py +6 -6
  5. lightning_sdk/api/job_api.py +32 -6
  6. lightning_sdk/api/mmt_api.py +59 -19
  7. lightning_sdk/api/studio_api.py +37 -19
  8. lightning_sdk/api/teamspace_api.py +34 -29
  9. lightning_sdk/api/utils.py +46 -34
  10. lightning_sdk/cli/ai_hub.py +3 -3
  11. lightning_sdk/cli/entrypoint.py +3 -1
  12. lightning_sdk/cli/run.py +122 -12
  13. lightning_sdk/cli/serve.py +218 -0
  14. lightning_sdk/deployment/deployment.py +18 -12
  15. lightning_sdk/job/base.py +118 -24
  16. lightning_sdk/job/job.py +98 -9
  17. lightning_sdk/job/v1.py +75 -18
  18. lightning_sdk/job/v2.py +51 -15
  19. lightning_sdk/job/work.py +36 -7
  20. lightning_sdk/lightning_cloud/openapi/__init__.py +12 -0
  21. lightning_sdk/lightning_cloud/openapi/api/jobs_service_api.py +215 -5
  22. lightning_sdk/lightning_cloud/openapi/api/lit_logger_service_api.py +218 -0
  23. lightning_sdk/lightning_cloud/openapi/api/models_store_api.py +226 -0
  24. lightning_sdk/lightning_cloud/openapi/api/snowflake_service_api.py +21 -1
  25. lightning_sdk/lightning_cloud/openapi/models/__init__.py +12 -0
  26. lightning_sdk/lightning_cloud/openapi/models/deploymenttemplates_id_body.py +27 -1
  27. lightning_sdk/lightning_cloud/openapi/models/id_visibility_body.py +123 -0
  28. lightning_sdk/lightning_cloud/openapi/models/model_id_versions_body.py +29 -3
  29. lightning_sdk/lightning_cloud/openapi/models/project_id_multimachinejobs_body.py +27 -1
  30. lightning_sdk/lightning_cloud/openapi/models/project_id_snowflake_body.py +15 -67
  31. lightning_sdk/lightning_cloud/openapi/models/query_query_id_body.py +17 -69
  32. lightning_sdk/lightning_cloud/openapi/models/snowflake_export_body.py +29 -81
  33. lightning_sdk/lightning_cloud/openapi/models/snowflake_query_body.py +17 -69
  34. lightning_sdk/lightning_cloud/openapi/models/v1_get_model_file_url_response.py +27 -1
  35. lightning_sdk/lightning_cloud/openapi/models/v1_get_model_files_response.py +17 -17
  36. lightning_sdk/lightning_cloud/openapi/models/v1_get_model_files_url_response.py +149 -0
  37. lightning_sdk/lightning_cloud/openapi/models/v1_get_project_balance_response.py +27 -1
  38. lightning_sdk/lightning_cloud/openapi/models/v1_list_multi_machine_job_events_response.py +123 -0
  39. lightning_sdk/lightning_cloud/openapi/models/v1_metrics_stream.py +27 -1
  40. lightning_sdk/lightning_cloud/openapi/models/v1_model_file.py +175 -0
  41. lightning_sdk/lightning_cloud/openapi/models/v1_multi_machine_job.py +27 -1
  42. lightning_sdk/lightning_cloud/openapi/models/v1_multi_machine_job_event.py +331 -0
  43. lightning_sdk/lightning_cloud/openapi/models/v1_multi_machine_job_event_type.py +104 -0
  44. lightning_sdk/lightning_cloud/openapi/models/v1_multi_machine_job_fault_tolerance.py +149 -0
  45. lightning_sdk/lightning_cloud/openapi/models/v1_multi_machine_job_fault_tolerance_strategy.py +105 -0
  46. lightning_sdk/lightning_cloud/openapi/models/v1_multi_machine_job_status.py +27 -1
  47. lightning_sdk/lightning_cloud/openapi/models/v1_rule_resource.py +1 -0
  48. lightning_sdk/lightning_cloud/openapi/models/v1_snowflake_data_connection.py +29 -81
  49. lightning_sdk/lightning_cloud/openapi/models/v1_system_metrics.py +29 -3
  50. lightning_sdk/lightning_cloud/openapi/models/v1_trainium_system_metrics.py +175 -0
  51. lightning_sdk/lightning_cloud/openapi/models/v1_update_metrics_stream_visibility_response.py +97 -0
  52. lightning_sdk/lightning_cloud/openapi/models/v1_user_features.py +27 -53
  53. lightning_sdk/lightning_cloud/openapi/models/v1_validate_deployment_image_request.py +149 -0
  54. lightning_sdk/lightning_cloud/openapi/models/v1_validate_deployment_image_response.py +97 -0
  55. lightning_sdk/lightning_cloud/rest_client.py +2 -0
  56. lightning_sdk/mmt/__init__.py +4 -0
  57. lightning_sdk/mmt/base.py +278 -0
  58. lightning_sdk/mmt/mmt.py +267 -0
  59. lightning_sdk/mmt/v1.py +181 -0
  60. lightning_sdk/mmt/v2.py +188 -0
  61. lightning_sdk/plugin.py +43 -16
  62. lightning_sdk/services/file_endpoint.py +11 -5
  63. lightning_sdk/studio.py +16 -9
  64. lightning_sdk/teamspace.py +21 -8
  65. lightning_sdk/utils/resolve.py +18 -0
  66. {lightning_sdk-0.1.41.dist-info → lightning_sdk-0.1.43.dist-info}/METADATA +4 -1
  67. {lightning_sdk-0.1.41.dist-info → lightning_sdk-0.1.43.dist-info}/RECORD +71 -59
  68. {lightning_sdk-0.1.41.dist-info → lightning_sdk-0.1.43.dist-info}/entry_points.txt +0 -1
  69. lightning_sdk/_mmt/__init__.py +0 -3
  70. lightning_sdk/_mmt/base.py +0 -180
  71. lightning_sdk/_mmt/mmt.py +0 -161
  72. lightning_sdk/_mmt/v1.py +0 -69
  73. lightning_sdk/_mmt/v2.py +0 -141
  74. lightning_sdk/cli/mmt.py +0 -137
  75. {lightning_sdk-0.1.41.dist-info → lightning_sdk-0.1.43.dist-info}/LICENSE +0 -0
  76. {lightning_sdk-0.1.41.dist-info → lightning_sdk-0.1.43.dist-info}/WHEEL +0 -0
  77. {lightning_sdk-0.1.41.dist-info → lightning_sdk-0.1.43.dist-info}/top_level.txt +0 -0
@@ -1,3 +1,4 @@
1
+ import concurrent.futures
1
2
  import errno
2
3
  import math
3
4
  import os
@@ -8,7 +9,7 @@ from typing import Any, Dict, List, Optional, Tuple, Union
8
9
 
9
10
  import backoff
10
11
  import requests
11
- from tqdm import tqdm
12
+ from tqdm.auto import tqdm
12
13
 
13
14
  from lightning_sdk.constants import __GLOBAL_LIGHTNING_UNIQUE_IDS_STORE__, _LIGHTNING_DEBUG
14
15
  from lightning_sdk.lightning_cloud.openapi import (
@@ -66,14 +67,14 @@ class _FileUploader:
66
67
  self,
67
68
  client: LightningClient,
68
69
  teamspace_id: str,
69
- cluster_id: str,
70
+ cloud_account: str,
70
71
  file_path: str,
71
72
  remote_path: str,
72
73
  progress_bar: bool,
73
74
  ) -> None:
74
75
  self.client = client
75
76
  self.teamspace_id = teamspace_id
76
- self.cluster_id = cluster_id
77
+ self.cloud_account = cloud_account
77
78
 
78
79
  self.local_path = file_path
79
80
 
@@ -107,7 +108,7 @@ class _FileUploader:
107
108
 
108
109
  def _multipart_upload(self, count: int) -> None:
109
110
  """Does a parallel multipart upload."""
110
- body = ProjectIdStorageBody(cluster_id=self.cluster_id, filename=self.remote_path)
111
+ body = ProjectIdStorageBody(cluster_id=self.cloud_account, filename=self.remote_path)
111
112
  resp: V1UploadProjectArtifactResponse = self.client.storage_service_upload_project_artifact(
112
113
  body=body, project_id=self.teamspace_id
113
114
  )
@@ -123,7 +124,7 @@ class _FileUploader:
123
124
  completed.extend(self._process_upload_batch(executor=p, batch=batch, upload_id=resp.upload_id))
124
125
 
125
126
  completed_body = StorageCompleteBody(
126
- cluster_id=self.cluster_id, filename=self.remote_path, parts=completed, upload_id=resp.upload_id
127
+ cluster_id=self.cloud_account, filename=self.remote_path, parts=completed, upload_id=resp.upload_id
127
128
  )
128
129
  self.client.storage_service_complete_upload_project_artifact(body=completed_body, project_id=self.teamspace_id)
129
130
 
@@ -135,7 +136,7 @@ class _FileUploader:
135
136
 
136
137
  def _request_urls(self, parts: List[int], upload_id: str) -> List[V1PresignedUrl]:
137
138
  """Requests urls for a batch of parts."""
138
- body = UploadsUploadIdBody(cluster_id=self.cluster_id, filename=self.remote_path, parts=parts)
139
+ body = UploadsUploadIdBody(cluster_id=self.cloud_account, filename=self.remote_path, parts=parts)
139
140
  resp: V1UploadProjectArtifactPartsResponse = self.client.storage_service_upload_project_artifact_parts(
140
141
  body, self.teamspace_id, upload_id
141
142
  )
@@ -192,7 +193,7 @@ class _ModelFileUploader:
192
193
  model_id: str,
193
194
  version: str,
194
195
  teamspace_id: str,
195
- cluster_id: str,
196
+ cloud_account: str,
196
197
  file_path: str,
197
198
  remote_path: str,
198
199
  progress_bar: bool,
@@ -201,7 +202,6 @@ class _ModelFileUploader:
201
202
  self.model_id = model_id
202
203
  self.version = version
203
204
  self.teamspace_id = teamspace_id
204
- self.cluster_id = cluster_id
205
205
  self.local_path = file_path
206
206
  self.remote_path = remote_path
207
207
 
@@ -215,6 +215,8 @@ class _ModelFileUploader:
215
215
  unit="B",
216
216
  unit_scale=True,
217
217
  unit_divisor=1000,
218
+ position=1,
219
+ leave=False,
218
220
  )
219
221
  else:
220
222
  self.progress_bar = None
@@ -376,6 +378,7 @@ class _FileDownloader:
376
378
  teamspace_id: str,
377
379
  remote_path: str,
378
380
  file_path: str,
381
+ executor: ThreadPoolExecutor,
379
382
  num_workers: int = 20,
380
383
  progress_bar: Optional[tqdm] = None,
381
384
  ) -> None:
@@ -389,7 +392,7 @@ class _FileDownloader:
389
392
  self.num_workers = num_workers
390
393
  self._url = ""
391
394
  self._size = 0
392
- self.refresh()
395
+ self.executor = executor
393
396
 
394
397
  @backoff.on_exception(backoff.expo, ApiException, max_tries=10)
395
398
  def refresh(self) -> None:
@@ -445,26 +448,26 @@ class _FileDownloader:
445
448
  if remaining_size > 0:
446
449
  f.write(b"\x00" * remaining_size)
447
450
 
448
- def _multipart_download(self, filename: str, max_workers: int) -> None:
449
- num_chunks = max_workers
451
+ def _multipart_download(self, filename: str, num_workers: int) -> None:
452
+ num_chunks = num_workers
450
453
  chunk_size = math.ceil(self.size / num_chunks)
451
454
 
452
455
  if chunk_size < _DOWNLOAD_MIN_CHUNK_SIZE:
453
456
  num_chunks = math.ceil(self.size / _DOWNLOAD_MIN_CHUNK_SIZE)
454
457
  chunk_size = _DOWNLOAD_MIN_CHUNK_SIZE
455
458
 
456
- num_workers = min(max_workers, num_chunks)
457
-
458
459
  ranges = []
459
460
  for part_number in range(num_chunks):
460
461
  start = part_number * chunk_size
461
462
  end = min(start + chunk_size - 1, self.size - 1)
462
463
  ranges.append((start, end))
463
464
 
464
- with ThreadPoolExecutor(max_workers=num_workers) as executor:
465
- executor.map(partial(self._download_chunk, filename), ranges)
465
+ futures = [self.executor.submit(self._download_chunk, filename, r) for r in ranges]
466
+ concurrent.futures.wait(futures)
466
467
 
467
468
  def download(self) -> None:
469
+ self.refresh()
470
+
468
471
  tmp_filename = f"{self.local_path}.download"
469
472
 
470
473
  try:
@@ -536,31 +539,40 @@ def _download_model_files(
536
539
  unit_divisor=1000,
537
540
  )
538
541
 
539
- for filepath in response.filepaths:
540
- local_file = download_dir / filepath
541
- local_file.parent.mkdir(parents=True, exist_ok=True)
542
-
543
- file_downloader = _FileDownloader(
544
- client=client,
545
- model_id=response.model_id,
546
- version=response.version,
547
- teamspace_id=response.project_id,
548
- remote_path=filepath,
549
- file_path=str(local_file),
550
- num_workers=num_workers,
551
- progress_bar=pbar,
552
- )
542
+ with ThreadPoolExecutor(max_workers=min(num_workers, len(response.filepaths))) as file_executor, ThreadPoolExecutor(
543
+ max_workers=num_workers
544
+ ) as part_executor:
545
+ futures = []
546
+
547
+ for filepath in response.filepaths:
548
+ local_file = download_dir / filepath
549
+ local_file.parent.mkdir(parents=True, exist_ok=True)
550
+
551
+ file_downloader = _FileDownloader(
552
+ client=client,
553
+ model_id=response.model_id,
554
+ version=response.version,
555
+ teamspace_id=response.project_id,
556
+ remote_path=filepath,
557
+ file_path=str(local_file),
558
+ num_workers=num_workers,
559
+ progress_bar=pbar,
560
+ executor=part_executor,
561
+ )
562
+
563
+ futures.append(file_executor.submit(file_downloader.download))
553
564
 
554
- file_downloader.download()
565
+ # wait for all threads
566
+ concurrent.futures.wait(futures)
555
567
 
556
- return response.filepaths
568
+ return response.filepaths
557
569
 
558
570
 
559
571
  def _create_app(
560
572
  client: CloudSpaceServiceApi,
561
573
  studio_id: str,
562
574
  teamspace_id: str,
563
- cluster_id: str,
575
+ cloud_account: str,
564
576
  plugin_type: str,
565
577
  **other_arguments: Any,
566
578
  ) -> Externalv1LightningappInstance:
@@ -573,7 +585,7 @@ def _create_app(
573
585
  del other_arguments["interruptible"]
574
586
 
575
587
  body = AppsIdBody(
576
- cluster_id=cluster_id,
588
+ cluster_id=cloud_account,
577
589
  plugin_arguments=other_arguments,
578
590
  service_id=os.getenv(_LIGHTNING_SERVICE_EXECUTION_ID_KEY),
579
591
  unique_id=__GLOBAL_LIGHTNING_UNIQUE_IDS_STORE__[studio_id],
@@ -584,6 +596,6 @@ def _create_app(
584
596
  ).lightningappinstance
585
597
 
586
598
  if _LIGHTNING_DEBUG:
587
- print(f"Create App: {resp.id=} {teamspace_id=} {studio_id=} {cluster_id=}")
599
+ print(f"Create App: {resp.id=} {teamspace_id=} {studio_id=} {cloud_account=}")
588
600
 
589
601
  return resp
@@ -32,7 +32,7 @@ class _AIHub(_StudiosMenu):
32
32
  def deploy(
33
33
  self,
34
34
  api_id: str,
35
- cluster: Optional[str] = None,
35
+ cloud_account: Optional[str] = None,
36
36
  name: Optional[str] = None,
37
37
  teamspace: Optional[str] = None,
38
38
  org: Optional[str] = None,
@@ -41,9 +41,9 @@ class _AIHub(_StudiosMenu):
41
41
 
42
42
  Args:
43
43
  api_id: API template ID.
44
- cluster: Cluster to deploy the API to. Defaults to user's default cluster.
44
+ cloud_account: Cloud Account to deploy the API to. Defaults to user's default cloud account.
45
45
  name: Name of the deployed API. Defaults to the name of the API template.
46
46
  teamspace: Teamspace to deploy the API to. Defaults to user's default teamspace.
47
47
  org: Organization to deploy the API to. Defaults to user's default organization.
48
48
  """
49
- return self._hub.run(api_id, cluster=cluster, name=name, teamspace=teamspace, org=org)
49
+ return self._hub.run(api_id, cloud_account=cloud_account, name=name, teamspace=teamspace, org=org)
@@ -6,6 +6,7 @@ from lightning_sdk.cli.ai_hub import _AIHub
6
6
  from lightning_sdk.cli.download import _Downloads
7
7
  from lightning_sdk.cli.legacy import _LegacyLightningCLI
8
8
  from lightning_sdk.cli.run import _Run
9
+ from lightning_sdk.cli.serve import _Docker, _LitServe
9
10
  from lightning_sdk.cli.upload import _Uploads
10
11
  from lightning_sdk.lightning_cloud.login import Auth
11
12
 
@@ -19,8 +20,9 @@ class StudioCLI:
19
20
  self.download = _Downloads()
20
21
  self.upload = _Uploads()
21
22
  self.aihub = _AIHub()
22
-
23
23
  self.run = _Run(legacy_run=_LegacyLightningCLI() if _LIGHTNING_AVAILABLE else None)
24
+ self.serve = _LitServe()
25
+ self.dockerize = _Docker()
24
26
 
25
27
  def login(self) -> None:
26
28
  """Login to Lightning AI Studios."""
lightning_sdk/cli/run.py CHANGED
@@ -2,6 +2,8 @@ from typing import TYPE_CHECKING, Dict, Optional
2
2
 
3
3
  from lightning_sdk.job import Job
4
4
  from lightning_sdk.machine import Machine
5
+ from lightning_sdk.mmt import MMT
6
+ from lightning_sdk.teamspace import Teamspace
5
7
 
6
8
  if TYPE_CHECKING:
7
9
  from lightning_sdk.cli.legacy import _LegacyLightningCLI
@@ -20,7 +22,7 @@ class _Run:
20
22
  # Need to set the docstring here for f-strings to work.
21
23
  # Sadly this is the only way to really show options as f-strings are not allowed as docstrings directly
22
24
  # and fire does not show values for literals, just that it is a literal.
23
- docstr = f"""Run async workloads using a docker image or a compute environment from your studio.
25
+ docstr_job = f"""Run async workloads using a docker image or a compute environment from your studio.
24
26
 
25
27
  Args:
26
28
  name: The name of the job. Needs to be unique within the teamspace.
@@ -32,14 +34,15 @@ class _Run:
32
34
  teamspace: The teamspace the job should be associated with. Defaults to the current teamspace.
33
35
  org: The organization owning the teamspace (if any). Defaults to the current organization.
34
36
  user: The user owning the teamspace (if any). Defaults to the current user.
35
- cluster: The cluster to run the job on. Defaults to the studio cluster if running with studio compute env.
36
- If not provided will fall back to the teamspaces default cluster.
37
+ cloud_account: The cloud account to run the job on.
38
+ Defaults to the studio cloud account if running with studio compute env.
39
+ If not provided will fall back to the teamspace's default cloud account.
37
40
  env: Environment variables to set inside the job.
38
41
  interruptible: Whether the job should run on interruptible instances. They are cheaper but can be preempted.
39
42
  image_credentials: The credentials used to pull the image. Required if the image is private.
40
43
  This should be the name of the respective credentials secret created on the Lightning AI platform.
41
- cluster_auth: Whether to authenticate with the cluster to pull the image.
42
- Required if the registry is part of a cluster provider (e.g. ECR).
44
+ cloud_account_auth: Whether to authenticate with the cloud account to pull the image.
45
+ Required if the registry is part of a cloud provider (e.g. ECR).
43
46
  artifacts_local: The path inside the docker container that you want to persist artifacts from.
44
47
  CAUTION: When setting this to "/", it will effectively erase your container.
45
48
  Only supported for jobs with a docker image compute environment.
@@ -53,7 +56,47 @@ class _Run:
53
56
  """
54
57
  # TODO: the docstrings from artifacts_local and artifacts_remote don't show up completely,
55
58
  # might need to switch to explicit cli definition
56
- self.job.__func__.__doc__ = docstr
59
+ self.job.__func__.__doc__ = docstr_job
60
+
61
+ # Need to set the docstring here for f-strings to work.
62
+ # Sadly this is the only way to really show options as f-strings are not allowed as docstrings directly
63
+ # and fire does not show values for literals, just that it is a literal.
64
+ docstr_mmt = f"""Run async workloads on multiple machines using a docker image.
65
+
66
+ Args:
67
+ name: The name of the job. Needs to be unique within the teamspace.
68
+ num_machines: The number of machines to run on. Defaults to 2 machines.
69
+ machine: The machine type to run the job on. One of {", ".join(_MACHINE_VALUES)}. Defaults to CPU
70
+ command: The command to run inside your job. Required if using a studio. Optional if using an image.
71
+ If not provided for images, will run the container entrypoint and default command.
72
+ studio: The studio env to run the job with. Mutually exclusive with image.
73
+ image: The docker image to run the job with. Mutually exclusive with studio.
74
+ teamspace: The teamspace the job should be associated with. Defaults to the current teamspace.
75
+ org: The organization owning the teamspace (if any). Defaults to the current organization.
76
+ user: The user owning the teamspace (if any). Defaults to the current user.
77
+ cloud_account: The cloud account to run the job on.
78
+ Defaults to the studio cloud account if running with studio compute env.
79
+ If not provided will fall back to the teamspace's default cloud account.
80
+ env: Environment variables to set inside the job.
81
+ interruptible: Whether the job should run on interruptible instances. They are cheaper but can be preempted.
82
+ image_credentials: The credentials used to pull the image. Required if the image is private.
83
+ This should be the name of the respective credentials secret created on the Lightning AI platform.
84
+ cloud_account_auth: Whether to authenticate with the cloud account to pull the image.
85
+ Required if the registry is part of a cloud provider (e.g. ECR).
86
+ artifacts_local: The path inside the docker container that you want to persist artifacts from.
87
+ CAUTION: When setting this to "/", it will effectively erase your container.
88
+ Only supported for jobs with a docker image compute environment.
89
+ artifacts_remote: The remote storage to persist your artifacts to.
90
+ Should be of format <CONNECTION_TYPE>:<CONNECTION_NAME>:<PATH_WITHIN_CONNECTION>.
91
+ PATH_WITHIN_CONNECTION is a path relative to the connection's root.
92
+ E.g. efs:data:some-path would result in an EFS connection named `data` and to the path `some-path`
93
+ within it.
94
+ Note that the connection needs to be added to the teamspace already in order for it to be found.
95
+ Only supported for jobs with a docker image compute environment.
96
+ """
97
+ # TODO: the docstrings from artifacts_local and artifacts_remote don't show up completely,
98
+ # might need to switch to explicit cli definition
99
+ self.mmt.__func__.__doc__ = docstr_mmt
57
100
 
58
101
  # TODO: sadly, fire displays both Optional[type] and Union[type, None] as Optional[Optional]
59
102
  # see https://github.com/google/python-fire/pull/513
@@ -61,21 +104,30 @@ class _Run:
61
104
  def job(
62
105
  self,
63
106
  name: str,
64
- machine: str,
107
+ machine: Optional[str] = None,
65
108
  command: Optional[str] = None,
66
109
  studio: Optional[str] = None,
67
110
  image: Optional[str] = None,
68
111
  teamspace: Optional[str] = None,
69
112
  org: Optional[str] = None,
70
113
  user: Optional[str] = None,
71
- cluster: Optional[str] = None,
114
+ cloud_account: Optional[str] = None,
72
115
  env: Optional[Dict[str, str]] = None,
73
116
  interruptible: bool = False,
74
117
  image_credentials: Optional[str] = None,
75
- cluster_auth: bool = False,
118
+ cloud_account_auth: bool = False,
76
119
  artifacts_local: Optional[str] = None,
77
120
  artifacts_remote: Optional[str] = None,
78
121
  ) -> None:
122
+ if machine is None:
123
+ # TODO: infer from studio
124
+ machine = "CPU"
125
+ machine_enum = Machine(machine.upper())
126
+
127
+ resolved_teamspace = Teamspace(name=teamspace, org=org, user=user)
128
+
129
+ if cloud_account is None:
130
+ cloud_account = resolved_teamspace.default_cloud_account
79
131
  machine_enum = Machine(machine.upper())
80
132
  Job.run(
81
133
  name=name,
@@ -83,14 +135,72 @@ class _Run:
83
135
  command=command,
84
136
  studio=studio,
85
137
  image=image,
86
- teamspace=teamspace,
138
+ teamspace=resolved_teamspace,
139
+ org=org,
140
+ user=user,
141
+ cloud_account=cloud_account,
142
+ env=env,
143
+ interruptible=interruptible,
144
+ image_credentials=image_credentials,
145
+ cloud_account_auth=cloud_account_auth,
146
+ artifacts_local=artifacts_local,
147
+ artifacts_remote=artifacts_remote,
148
+ )
149
+
150
+ # TODO: sadly, fire displays both Optional[type] and Union[type, None] as Optional[Optional]
151
+ # see https://github.com/google/python-fire/pull/513
152
+ # might need to move to different cli library
153
+ def mmt(
154
+ self,
155
+ name: Optional[str] = None,
156
+ num_machines: int = 2,
157
+ machine: Optional[str] = None,
158
+ command: Optional[str] = None,
159
+ image: Optional[str] = None,
160
+ teamspace: Optional[str] = None,
161
+ org: Optional[str] = None,
162
+ user: Optional[str] = None,
163
+ cloud_account: Optional[str] = None,
164
+ env: Optional[Dict[str, str]] = None,
165
+ interruptible: bool = False,
166
+ image_credentials: Optional[str] = None,
167
+ cloud_account_auth: bool = False,
168
+ artifacts_local: Optional[str] = None,
169
+ artifacts_remote: Optional[str] = None,
170
+ ) -> None:
171
+ if name is None:
172
+ from datetime import datetime
173
+
174
+ timestr = datetime.now().strftime("%Y-%m-%d-%H-%M-%S")
175
+ name = f"mmt-{timestr}"
176
+
177
+ if machine is None:
178
+ # TODO: infer from studio
179
+ machine = "CPU"
180
+ machine_enum = Machine(machine.upper())
181
+
182
+ resolved_teamspace = Teamspace(name=teamspace, org=org, user=user)
183
+ if cloud_account is None:
184
+ cloud_account = resolved_teamspace.default_cloud_account
185
+
186
+ if image is None:
187
+ raise RuntimeError("Image needs to be specified to run a multi-machine job")
188
+
189
+ MMT.run(
190
+ name=name,
191
+ num_machines=num_machines,
192
+ machine=machine_enum,
193
+ command=command,
194
+ studio=None,
195
+ image=image,
196
+ teamspace=resolved_teamspace,
87
197
  org=org,
88
198
  user=user,
89
- cluster=cluster,
199
+ cloud_account=cloud_account,
90
200
  env=env,
91
201
  interruptible=interruptible,
92
202
  image_credentials=image_credentials,
93
- cluster_auth=cluster_auth,
203
+ cloud_account_auth=cloud_account_auth,
94
204
  artifacts_local=artifacts_local,
95
205
  artifacts_remote=artifacts_remote,
96
206
  )
@@ -0,0 +1,218 @@
1
+ import os
2
+ import subprocess
3
+ import warnings
4
+ from pathlib import Path
5
+ from typing import Optional, Union
6
+
7
+ from rich.console import Console
8
+ from rich.progress import Progress, SpinnerColumn, TextColumn, TimeElapsedColumn
9
+ from rich.prompt import Confirm
10
+
11
+
12
+ class _LitServe:
13
+ """Serve a LitServe model.
14
+
15
+ Example:
16
+ lightning serve api server.py # serve locally
17
+ lightning serve api server.py --cloud # deploy to the cloud
18
+
19
+ You can deploy the API to the cloud by running `lightning serve api server.py --cloud`.
20
+ This will generate a Dockerfile, build the image, and push it to the image registry.
21
+ Deploying to the cloud requires pre-login to the docker registry.
22
+ """
23
+
24
+ def api(
25
+ self,
26
+ script_path: Union[str, Path],
27
+ easy: bool = False,
28
+ cloud: bool = False,
29
+ repository: Optional[str] = None,
30
+ non_interactive: bool = False,
31
+ ) -> None:
32
+ """Deploy a LitServe model script.
33
+
34
+ Args:
35
+ script_path: Path to the script to serve
36
+ easy: If True, generates a client for the model
37
+ cloud: If True, deploy the model to the Lightning Studio
38
+ repository: Optional Docker repository name (e.g., 'username/model-name')
39
+ non_interactive: If True, do not prompt for confirmation
40
+ Raises:
41
+ FileNotFoundError: If script_path doesn't exist
42
+ ImportError: If litserve is not installed
43
+ subprocess.CalledProcessError: If the script fails to run
44
+ IOError: If client.py generation fails
45
+ """
46
+ console = Console()
47
+ script_path = Path(script_path)
48
+ if not script_path.exists():
49
+ raise FileNotFoundError(f"Script not found: {script_path}")
50
+ if not script_path.is_file():
51
+ raise ValueError(f"Path is not a file: {script_path}")
52
+
53
+ try:
54
+ from litserve.python_client import client_template
55
+ except ImportError:
56
+ raise ImportError(
57
+ "litserve is not installed. Please install it with `pip install lightning_sdk[serve]`"
58
+ ) from None
59
+
60
+ if easy:
61
+ client_path = Path("client.py")
62
+ if client_path.exists():
63
+ console.print("Skipping client generation: client.py already exists", style="blue")
64
+ else:
65
+ try:
66
+ client_path.write_text(client_template)
67
+ console.print("✅ Client generated at client.py", style="bold green")
68
+ except OSError as e:
69
+ raise OSError(f"Failed to generate client.py: {e!s}") from None
70
+
71
+ if cloud:
72
+ tag = repository if repository else "litserve-model"
73
+ return self._handle_cloud(script_path, console, tag=tag, non_interactive=non_interactive)
74
+
75
+ try:
76
+ subprocess.run(
77
+ ["python", str(script_path)],
78
+ check=True,
79
+ text=True,
80
+ )
81
+ except subprocess.CalledProcessError as e:
82
+ error_msg = f"Script execution failed with exit code {e.returncode}\nstdout: {e.stdout}\nstderr: {e.stderr}"
83
+ raise RuntimeError(error_msg) from None
84
+
85
+ def _handle_cloud(
86
+ self,
87
+ script_path: Union[str, Path],
88
+ console: Console,
89
+ tag: str = "litserve-model",
90
+ non_interactive: bool = False,
91
+ ) -> None:
92
+ try:
93
+ import docker
94
+ except ImportError:
95
+ raise ImportError("docker-py is not installed. Please install it with `pip install docker`") from None
96
+
97
+ try:
98
+ client = docker.from_env()
99
+ client.ping()
100
+ except docker.errors.DockerException as e:
101
+ raise RuntimeError(f"Failed to connect to Docker daemon: {e!s}. Is Docker running?") from None
102
+
103
+ dockerizer = _Docker()
104
+ path = dockerizer.api(script_path, port=8000, gpu=False, tag=tag)
105
+
106
+ console.clear()
107
+ if non_interactive:
108
+ console.print("[italic]non-interactive[/italic] mode enabled, skipping confirmation prompts", style="blue")
109
+
110
+ console.print(f"\nPlease review the Dockerfile at [u]{path}[/u] and make sure it is correct.", style="bold")
111
+ correct_dockerfile = True if non_interactive else Confirm.ask("Is the Dockerfile correct?", default=True)
112
+ if not correct_dockerfile:
113
+ console.print("Please fix the Dockerfile and try again.", style="red")
114
+ return
115
+
116
+ with Progress(
117
+ SpinnerColumn(),
118
+ TextColumn("[progress.description]{task.description}"),
119
+ TimeElapsedColumn(),
120
+ console=console,
121
+ transient=False,
122
+ ) as progress:
123
+ build_task = progress.add_task("Building Docker image", total=None)
124
+ build_status = client.api.build(
125
+ path=os.path.dirname(path), dockerfile=path, tag=tag, decode=True, quiet=False
126
+ )
127
+ for line in build_status:
128
+ if "error" in line:
129
+ progress.stop()
130
+ console.print(f"\n[red]{line}[/red]")
131
+ return
132
+ if "stream" in line and line["stream"].strip():
133
+ console.print(line["stream"].strip(), style="bright_black")
134
+ progress.update(build_task, description="Building Docker image")
135
+
136
+ progress.update(build_task, description="[green]Build completed![/green]")
137
+
138
+ push_task = progress.add_task("Pushing to registry", total=None)
139
+ console.print("\nPushing image...", style="bold blue")
140
+ push_status = client.api.push(tag, stream=True, decode=True)
141
+ for line in push_status:
142
+ if "error" in line:
143
+ progress.stop()
144
+ console.print(f"\n[red]{line}[/red]")
145
+ return
146
+ if "status" in line:
147
+ console.print(line["status"], style="bright_black")
148
+ progress.update(push_task, description="Pushing to registry")
149
+
150
+ progress.update(push_task, description="[green]Push completed![/green]")
151
+
152
+ console.print(f"\n✅ Image pushed to {tag}", style="bold green")
153
+ console.print(
154
+ "Soon you will be able to deploy this model to the Lightning Studio!",
155
+ )
156
+ # TODO: Deploy to the cloud
157
+
158
+
159
+ class _Docker:
160
+ """Generate a Dockerfile for a LitServe model."""
161
+
162
+ def api(self, server_filename: str, port: int = 8000, gpu: bool = False, tag: str = "litserve-model") -> str:
163
+ """Generate a Dockerfile for the given server code.
164
+
165
+ Args:
166
+ server_filename: The path to the server file. Example: server.py or app.py.
167
+ port: The port to expose in the Docker container.
168
+ gpu: Whether to use a GPU-enabled Docker image.
169
+ tag: Docker image tag to use in examples.
170
+ """
171
+ import litserve as ls
172
+ from litserve import docker_builder
173
+
174
+ console = Console()
175
+ requirements = ""
176
+ if os.path.exists("requirements.txt"):
177
+ requirements = "-r requirements.txt"
178
+ else:
179
+ warnings.warn(
180
+ f"requirements.txt not found at {os.getcwd()}. "
181
+ f"Make sure to install the required packages in the Dockerfile.",
182
+ UserWarning,
183
+ )
184
+
185
+ current_dir = Path.cwd()
186
+ if not (current_dir / server_filename).is_file():
187
+ raise FileNotFoundError(f"Server file `{server_filename}` must be in the current directory: {os.getcwd()}")
188
+
189
+ version = ls.__version__
190
+ if gpu:
191
+ run_cmd = f"docker run --gpus all -p {port}:{port} {tag}:latest"
192
+ docker_template = docker_builder.CUDA_DOCKER_TEMPLATE
193
+ else:
194
+ run_cmd = f"docker run -p {port}:{port} {tag}:latest"
195
+ docker_template = docker_builder.DOCKERFILE_TEMPLATE
196
+ dockerfile_content = docker_template.format(
197
+ server_filename=server_filename,
198
+ port=port,
199
+ version=version,
200
+ requirements=requirements,
201
+ )
202
+ with open("Dockerfile", "w") as f:
203
+ f.write(dockerfile_content)
204
+
205
+ success_msg = f"""[bold]Dockerfile created successfully[/bold]
206
+ Update [underline]{os.path.abspath("Dockerfile")}[/underline] to add any additional dependencies or commands.
207
+
208
+ [bold]Build the container with:[/bold]
209
+ > [underline]docker build -t {tag} .[/underline]
210
+
211
+ [bold]To run the Docker container on the machine:[/bold]
212
+ > [underline]{run_cmd}[/underline]
213
+
214
+ [bold]To push the container to a registry:[/bold]
215
+ > [underline]docker push {tag}[/underline]
216
+ """
217
+ console.print(success_msg)
218
+ return os.path.abspath("Dockerfile")