lightning-sdk 0.1.42__py3-none-any.whl → 0.1.43__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
lightning_sdk/__init__.py CHANGED
@@ -27,5 +27,5 @@ __all__ = [
27
27
  "AIHub",
28
28
  ]
29
29
 
30
- __version__ = "0.1.42"
30
+ __version__ = "0.1.43"
31
31
  _check_version_and_prompt_upgrade(__version__)
lightning_sdk/cli/run.py CHANGED
@@ -2,6 +2,8 @@ from typing import TYPE_CHECKING, Dict, Optional
2
2
 
3
3
  from lightning_sdk.job import Job
4
4
  from lightning_sdk.machine import Machine
5
+ from lightning_sdk.mmt import MMT
6
+ from lightning_sdk.teamspace import Teamspace
5
7
 
6
8
  if TYPE_CHECKING:
7
9
  from lightning_sdk.cli.legacy import _LegacyLightningCLI
@@ -20,7 +22,7 @@ class _Run:
20
22
  # Need to set the docstring here for f-strings to work.
21
23
  # Sadly this is the only way to really show options as f-strings are not allowed as docstrings directly
22
24
  # and fire does not show values for literals, just that it is a literal.
23
- docstr = f"""Run async workloads using a docker image or a compute environment from your studio.
25
+ docstr_job = f"""Run async workloads using a docker image or a compute environment from your studio.
24
26
 
25
27
  Args:
26
28
  name: The name of the job. Needs to be unique within the teamspace.
@@ -54,7 +56,47 @@ class _Run:
54
56
  """
55
57
  # TODO: the docstrings from artifacts_local and artifacts_remote don't show up completely,
56
58
  # might need to switch to explicit cli definition
57
- self.job.__func__.__doc__ = docstr
59
+ self.job.__func__.__doc__ = docstr_job
60
+
61
+ # Need to set the docstring here for f-strings to work.
62
+ # Sadly this is the only way to really show options as f-strings are not allowed as docstrings directly
63
+ # and fire does not show values for literals, just that it is a literal.
64
+ docstr_mmt = f"""Run async workloads on multiple machines using a docker image.
65
+
66
+ Args:
67
+ name: The name of the job. Needs to be unique within the teamspace.
68
+ num_machines: The number of machines to run on. Defaults to 2 machines.
69
+ machine: The machine type to run the job on. One of {", ".join(_MACHINE_VALUES)}. Defaults to CPU
70
+ command: The command to run inside your job. Required if using a studio. Optional if using an image.
71
+ If not provided for images, will run the container entrypoint and default command.
72
+ studio: The studio env to run the job with. Mutually exclusive with image.
73
+ image: The docker image to run the job with. Mutually exclusive with studio.
74
+ teamspace: The teamspace the job should be associated with. Defaults to the current teamspace.
75
+ org: The organization owning the teamspace (if any). Defaults to the current organization.
76
+ user: The user owning the teamspace (if any). Defaults to the current user.
77
+ cloud_account: The cloud account to run the job on.
78
+ Defaults to the studio cloud account if running with studio compute env.
79
+ If not provided will fall back to the teamspaces default cloud account.
80
+ env: Environment variables to set inside the job.
81
+ interruptible: Whether the job should run on interruptible instances. They are cheaper but can be preempted.
82
+ image_credentials: The credentials used to pull the image. Required if the image is private.
83
+ This should be the name of the respective credentials secret created on the Lightning AI platform.
84
+ cloud_account_auth: Whether to authenticate with the cloud account to pull the image.
85
+ Required if the registry is part of a cloud provider (e.g. ECR).
86
+ artifacts_local: The path inside the docker container that you want to persist artifacts from.
87
+ CAUTION: When setting this to "/", it will effectively erase your container.
88
+ Only supported for jobs with a docker image compute environment.
89
+ artifacts_remote: The remote storage to persist your artifacts to.
90
+ Should be of format <CONNECTION_TYPE>:<CONNECTION_NAME>:<PATH_WITHIN_CONNECTION>.
91
+ PATH_WITHIN_CONNECTION hereby is a path relative to the connection's root.
92
+ E.g. efs:data:some-path would result in an EFS connection named `data` and to the path `some-path`
93
+ within it.
94
+ Note that the connection needs to be added to the teamspace already in order for it to be found.
95
+ Only supported for jobs with a docker image compute environment.
96
+ """
97
+ # TODO: the docstrings from artifacts_local and artifacts_remote don't show up completely,
98
+ # might need to switch to explicit cli definition
99
+ self.mmt.__func__.__doc__ = docstr_mmt
58
100
 
59
101
  # TODO: sadly, fire displays both Optional[type] and Union[type, None] as Optional[Optional]
60
102
  # see https://github.com/google/python-fire/pull/513
@@ -62,7 +104,7 @@ class _Run:
62
104
  def job(
63
105
  self,
64
106
  name: str,
65
- machine: str,
107
+ machine: Optional[str] = None,
66
108
  command: Optional[str] = None,
67
109
  studio: Optional[str] = None,
68
110
  image: Optional[str] = None,
@@ -77,6 +119,15 @@ class _Run:
77
119
  artifacts_local: Optional[str] = None,
78
120
  artifacts_remote: Optional[str] = None,
79
121
  ) -> None:
122
+ if machine is None:
123
+ # TODO: infer from studio
124
+ machine = "CPU"
125
+ machine_enum = Machine(machine.upper())
126
+
127
+ resolved_teamspace = Teamspace(name=teamspace, org=org, user=user)
128
+
129
+ if cloud_account is None:
130
+ cloud_account = resolved_teamspace.default_cloud_account
80
131
  machine_enum = Machine(machine.upper())
81
132
  Job.run(
82
133
  name=name,
@@ -84,7 +135,65 @@ class _Run:
84
135
  command=command,
85
136
  studio=studio,
86
137
  image=image,
87
- teamspace=teamspace,
138
+ teamspace=resolved_teamspace,
139
+ org=org,
140
+ user=user,
141
+ cloud_account=cloud_account,
142
+ env=env,
143
+ interruptible=interruptible,
144
+ image_credentials=image_credentials,
145
+ cloud_account_auth=cloud_account_auth,
146
+ artifacts_local=artifacts_local,
147
+ artifacts_remote=artifacts_remote,
148
+ )
149
+
150
+ # TODO: sadly, fire displays both Optional[type] and Union[type, None] as Optional[Optional]
151
+ # see https://github.com/google/python-fire/pull/513
152
+ # might need to move to different cli library
153
+ def mmt(
154
+ self,
155
+ name: Optional[str] = None,
156
+ num_machines: int = 2,
157
+ machine: Optional[str] = None,
158
+ command: Optional[str] = None,
159
+ image: Optional[str] = None,
160
+ teamspace: Optional[str] = None,
161
+ org: Optional[str] = None,
162
+ user: Optional[str] = None,
163
+ cloud_account: Optional[str] = None,
164
+ env: Optional[Dict[str, str]] = None,
165
+ interruptible: bool = False,
166
+ image_credentials: Optional[str] = None,
167
+ cloud_account_auth: bool = False,
168
+ artifacts_local: Optional[str] = None,
169
+ artifacts_remote: Optional[str] = None,
170
+ ) -> None:
171
+ if name is None:
172
+ from datetime import datetime
173
+
174
+ timestr = datetime.now().strftime("%Y-%m-%d-%H-%M-%S")
175
+ name = f"mmt-{timestr}"
176
+
177
+ if machine is None:
178
+ # TODO: infer from studio
179
+ machine = "CPU"
180
+ machine_enum = Machine(machine.upper())
181
+
182
+ resolved_teamspace = Teamspace(name=teamspace, org=org, user=user)
183
+ if cloud_account is None:
184
+ cloud_account = resolved_teamspace.default_cloud_account
185
+
186
+ if image is None:
187
+ raise RuntimeError("Image needs to be specified to run a multi-machine job")
188
+
189
+ MMT.run(
190
+ name=name,
191
+ num_machines=num_machines,
192
+ machine=machine_enum,
193
+ command=command,
194
+ studio=None,
195
+ image=image,
196
+ teamspace=resolved_teamspace,
88
197
  org=org,
89
198
  user=user,
90
199
  cloud_account=cloud_account,
@@ -2,29 +2,41 @@ import os
2
2
  import subprocess
3
3
  import warnings
4
4
  from pathlib import Path
5
- from typing import Union
5
+ from typing import Optional, Union
6
6
 
7
7
  from rich.console import Console
8
+ from rich.progress import Progress, SpinnerColumn, TextColumn, TimeElapsedColumn
9
+ from rich.prompt import Confirm
8
10
 
9
11
 
10
12
  class _LitServe:
11
13
  """Serve a LitServe model.
12
14
 
13
15
  Example:
14
- lightning serve api server.py
16
+ lightning serve api server.py # serve locally
17
+ lightning serve api server.py --cloud # deploy to the cloud
18
+
19
+ You can deploy the API to the cloud by running `lightning serve api server.py --cloud`.
20
+ This will generate a Dockerfile, build the image, and push it to the image registry.
21
+ Deploying to the cloud requires pre-login to the docker registry.
15
22
  """
16
23
 
17
24
  def api(
18
25
  self,
19
26
  script_path: Union[str, Path],
20
27
  easy: bool = False,
28
+ cloud: bool = False,
29
+ repository: Optional[str] = None,
30
+ non_interactive: bool = False,
21
31
  ) -> None:
22
32
  """Deploy a LitServe model script.
23
33
 
24
34
  Args:
25
35
  script_path: Path to the script to serve
26
36
  easy: If True, generates a client for the model
27
-
37
+ cloud: If True, deploy the model to the Lightning Studio
38
+ repository: Optional Docker repository name (e.g., 'username/model-name')
39
+ non_interactive: If True, do not prompt for confirmation
28
40
  Raises:
29
41
  FileNotFoundError: If script_path doesn't exist
30
42
  ImportError: If litserve is not installed
@@ -56,6 +68,10 @@ class _LitServe:
56
68
  except OSError as e:
57
69
  raise OSError(f"Failed to generate client.py: {e!s}") from None
58
70
 
71
+ if cloud:
72
+ tag = repository if repository else "litserve-model"
73
+ return self._handle_cloud(script_path, console, tag=tag, non_interactive=non_interactive)
74
+
59
75
  try:
60
76
  subprocess.run(
61
77
  ["python", str(script_path)],
@@ -66,20 +82,91 @@ class _LitServe:
66
82
  error_msg = f"Script execution failed with exit code {e.returncode}\nstdout: {e.stdout}\nstderr: {e.stderr}"
67
83
  raise RuntimeError(error_msg) from None
68
84
 
85
+ def _handle_cloud(
86
+ self,
87
+ script_path: Union[str, Path],
88
+ console: Console,
89
+ tag: str = "litserve-model",
90
+ non_interactive: bool = False,
91
+ ) -> None:
92
+ try:
93
+ import docker
94
+ except ImportError:
95
+ raise ImportError("docker-py is not installed. Please install it with `pip install docker`") from None
96
+
97
+ try:
98
+ client = docker.from_env()
99
+ client.ping()
100
+ except docker.errors.DockerException as e:
101
+ raise RuntimeError(f"Failed to connect to Docker daemon: {e!s}. Is Docker running?") from None
102
+
103
+ dockerizer = _Docker()
104
+ path = dockerizer.api(script_path, port=8000, gpu=False, tag=tag)
105
+
106
+ console.clear()
107
+ if non_interactive:
108
+ console.print("[italic]non-interactive[/italic] mode enabled, skipping confirmation prompts", style="blue")
109
+
110
+ console.print(f"\nPlease review the Dockerfile at [u]{path}[/u] and make sure it is correct.", style="bold")
111
+ correct_dockerfile = True if non_interactive else Confirm.ask("Is the Dockerfile correct?", default=True)
112
+ if not correct_dockerfile:
113
+ console.print("Please fix the Dockerfile and try again.", style="red")
114
+ return
115
+
116
+ with Progress(
117
+ SpinnerColumn(),
118
+ TextColumn("[progress.description]{task.description}"),
119
+ TimeElapsedColumn(),
120
+ console=console,
121
+ transient=False,
122
+ ) as progress:
123
+ build_task = progress.add_task("Building Docker image", total=None)
124
+ build_status = client.api.build(
125
+ path=os.path.dirname(path), dockerfile=path, tag=tag, decode=True, quiet=False
126
+ )
127
+ for line in build_status:
128
+ if "error" in line:
129
+ progress.stop()
130
+ console.print(f"\n[red]{line}[/red]")
131
+ return
132
+ if "stream" in line and line["stream"].strip():
133
+ console.print(line["stream"].strip(), style="bright_black")
134
+ progress.update(build_task, description="Building Docker image")
135
+
136
+ progress.update(build_task, description="[green]Build completed![/green]")
137
+
138
+ push_task = progress.add_task("Pushing to registry", total=None)
139
+ console.print("\nPushing image...", style="bold blue")
140
+ push_status = client.api.push(tag, stream=True, decode=True)
141
+ for line in push_status:
142
+ if "error" in line:
143
+ progress.stop()
144
+ console.print(f"\n[red]{line}[/red]")
145
+ return
146
+ if "status" in line:
147
+ console.print(line["status"], style="bright_black")
148
+ progress.update(push_task, description="Pushing to registry")
149
+
150
+ progress.update(push_task, description="[green]Push completed![/green]")
151
+
152
+ console.print(f"\n✅ Image pushed to {tag}", style="bold green")
153
+ console.print(
154
+ "Soon you will be able to deploy this model to the Lightning Studio!",
155
+ )
156
+ # TODO: Deploy to the cloud
157
+
69
158
 
70
159
  class _Docker:
71
160
  """Generate a Dockerfile for a LitServe model."""
72
161
 
73
- def api(self, server_filename: str, port: int = 8000, gpu: bool = False) -> None:
162
+ def api(self, server_filename: str, port: int = 8000, gpu: bool = False, tag: str = "litserve-model") -> str:
74
163
  """Generate a Dockerfile for the given server code.
75
164
 
76
- Example:
77
- lightning litserve dockerize server.py --port 8000 --gpu
78
-
79
165
  Args:
80
- server_filename (str): The path to the server file. Example sever.py or app.py.
81
- port (int, optional): The port to expose in the Docker container.
82
- gpu (bool, optional): Whether to use a GPU-enabled Docker image.
166
+ server_filename: The path to the server file. Example server.py or app.py.
167
+ port: The port to expose in the Docker container.
168
+ gpu: Whether to use a GPU-enabled Docker image.
169
+ tag: Docker image tag to use in examples.
83
170
  """
84
171
  import litserve as ls
85
172
  from litserve import docker_builder
@@ -101,10 +188,10 @@ class _Docker:
101
188
 
102
189
  version = ls.__version__
103
190
  if gpu:
104
- run_cmd = f"docker run --gpus all -p {port}:{port} litserve-model:latest"
191
+ run_cmd = f"docker run --gpus all -p {port}:{port} {tag}:latest"
105
192
  docker_template = docker_builder.CUDA_DOCKER_TEMPLATE
106
193
  else:
107
- run_cmd = f"docker run -p {port}:{port} litserve-model:latest"
194
+ run_cmd = f"docker run -p {port}:{port} {tag}:latest"
108
195
  docker_template = docker_builder.DOCKERFILE_TEMPLATE
109
196
  dockerfile_content = docker_template.format(
110
197
  server_filename=server_filename,
@@ -119,12 +206,13 @@ class _Docker:
119
206
  Update [underline]{os.path.abspath("Dockerfile")}[/underline] to add any additional dependencies or commands.
120
207
 
121
208
  [bold]Build the container with:[/bold]
122
- > [underline]docker build -t litserve-model .[/underline]
209
+ > [underline]docker build -t {tag} .[/underline]
123
210
 
124
211
  [bold]To run the Docker container on the machine:[/bold]
125
212
  > [underline]{run_cmd}[/underline]
126
213
 
127
214
  [bold]To push the container to a registry:[/bold]
128
- > [underline]docker push litserve-model[/underline]
215
+ > [underline]docker push {tag}[/underline]
129
216
  """
130
217
  console.print(success_msg)
218
+ return os.path.abspath("Dockerfile")
lightning_sdk/job/job.py CHANGED
@@ -25,6 +25,8 @@ def _has_jobs_v2() -> bool:
25
25
 
26
26
 
27
27
  class Job(_BaseJob):
28
+ """Class to submit and manage single-machine jobs on the Lightning AI Platform."""
29
+
28
30
  def __init__(
29
31
  self,
30
32
  name: str,
@@ -34,6 +36,15 @@ class Job(_BaseJob):
34
36
  *,
35
37
  _fetch_job: bool = True,
36
38
  ) -> None:
39
+ """Fetch already existing jobs.
40
+
41
+ Args:
42
+ name: the name of the job
43
+ teamspace: the teamspace the job is part of
44
+ org: the name of the organization owning the :param`teamspace` in case it is owned by an org
45
+ user: the name of the user owning the :param`teamspace`
46
+ in case it is owned directly by a user instead of an org.
47
+ """
37
48
  internal_job_cls = _JobV2 if _has_jobs_v2() else _JobV1
38
49
 
39
50
  self._internal_job = internal_job_cls(
@@ -1,3 +1,4 @@
1
+ from lightning_sdk.mmt.base import MMTMachine
1
2
  from lightning_sdk.mmt.mmt import MMT
2
3
 
3
- __all__ = ["MMT"]
4
+ __all__ = ["MMT", "MMTMachine"]
lightning_sdk/mmt/base.py CHANGED
@@ -1,5 +1,5 @@
1
1
  from abc import abstractmethod
2
- from typing import TYPE_CHECKING, Dict, Optional, Tuple, Union
2
+ from typing import TYPE_CHECKING, Dict, Optional, Protocol, Tuple, Union
3
3
 
4
4
  if TYPE_CHECKING:
5
5
  from lightning_sdk.machine import Machine
@@ -10,11 +10,36 @@ if TYPE_CHECKING:
10
10
  from lightning_sdk.user import User
11
11
 
12
12
  from lightning_sdk.job.base import _BaseJob
13
- from lightning_sdk.job.job import Job
14
13
  from lightning_sdk.utils.resolve import _resolve_deprecated_cluster
15
14
 
16
15
 
16
+ class MMTMachine(Protocol):
17
+ """A single machine in multi-machine training."""
18
+
19
+ @property
20
+ def name(self) -> str:
21
+ """The Name of the individual machine. Usually corresponds to the rank."""
22
+ ...
23
+
24
+ @property
25
+ def machine(self) -> "Machine":
26
+ """The actual machine type this node is running on."""
27
+ ...
28
+
29
+ @property
30
+ def artifact_path(self) -> Optional[str]:
31
+ """The path to the artifacts of this job."""
32
+ ...
33
+
34
+ @property
35
+ def status(self) -> "Status":
36
+ """The status of this job."""
37
+ ...
38
+
39
+
17
40
  class _BaseMMT(_BaseJob):
41
+ """Base interface to all job types."""
42
+
18
43
  @classmethod
19
44
  def run(
20
45
  cls,
@@ -36,6 +61,39 @@ class _BaseMMT(_BaseJob):
36
61
  artifacts_remote: Optional[str] = None,
37
62
  cluster: Optional[str] = None, # deprecated in favor of cloud_account
38
63
  ) -> "_BaseMMT":
64
+ """Run async workloads using a docker image across multiple machines.
65
+
66
+ Args:
67
+ name: The name of the job. Needs to be unique within the teamspace.
68
+ machine: The machine type to run the job on. One of {", ".join(_MACHINE_VALUES)}.
69
+ num_machines: The number of machines to run on.
70
+ command: The command to run inside your job. Required if using a studio. Optional if using an image.
71
+ If not provided for images, will run the container entrypoint and default command.
72
+ studio: The studio env to run the job with. Mutually exclusive with image.
73
+ image: The docker image to run the job with. Mutually exclusive with studio.
74
+ teamspace: The teamspace the job should be associated with. Defaults to the current teamspace.
75
+ org: The organization owning the teamspace (if any). Defaults to the current organization.
76
+ user: The user owning the teamspace (if any). Defaults to the current user.
77
+ cloud_account: The cloud account to run the job on.
78
+ Defaults to the studio cloud account if running with studio compute env.
79
+ If not provided will fall back to the teamspaces default cloud account.
80
+ env: Environment variables to set inside the job.
81
+ interruptible: Whether the job should run on interruptible instances. They are cheaper but can be preempted.
82
+ image_credentials: The credentials used to pull the image. Required if the image is private.
83
+ This should be the name of the respective credentials secret created on the Lightning AI platform.
84
+ cloud_account_auth: Whether to authenticate with the cloud account to pull the image.
85
+ Required if the registry is part of a cloud provider (e.g. ECR).
86
+ artifacts_local: The path inside the docker container that you want to persist artifacts from.
87
+ CAUTION: When setting this to "/", it will effectively erase your container.
88
+ Only supported for jobs with a docker image compute environment.
89
+ artifacts_remote: The remote storage to persist your artifacts to.
90
+ Should be of format <CONNECTION_TYPE>:<CONNECTION_NAME>:<PATH_WITHIN_CONNECTION>.
91
+ PATH_WITHIN_CONNECTION hereby is a path relative to the connection's root.
92
+ E.g. efs:data:some-path would result in an EFS connection named `data` and to the path `some-path`
93
+ within it.
94
+ Note that the connection needs to be added to the teamspace already in order for it to be found.
95
+ Only supported for jobs with a docker image compute environment.
96
+ """
39
97
  from lightning_sdk.studio import Studio
40
98
 
41
99
  cloud_account = _resolve_deprecated_cluster(cloud_account, cluster)
@@ -134,53 +192,87 @@ class _BaseMMT(_BaseJob):
134
192
  artifacts_local: Optional[str] = None,
135
193
  artifacts_remote: Optional[str] = None,
136
194
  ) -> None:
137
- """Submits a job and updates the internal _job attribute as well as the _name attribute."""
195
+ """Submit a new multi-machine job to the Lightning AI platform.
196
+
197
+ Args:
198
+ num_machines: The number of machines to run on.
199
+ machine: The machine type to run the job on. One of {", ".join(_MACHINE_VALUES)}.
200
+ command: The command to run inside your job. Required if using a studio. Optional if using an image.
201
+ If not provided for images, will run the container entrypoint and default command.
202
+ studio: The studio env to run the job with. Mutually exclusive with image.
203
+ image: The docker image to run the job with. Mutually exclusive with studio.
204
+ env: Environment variables to set inside the job.
205
+ interruptible: Whether the job should run on interruptible instances. They are cheaper but can be preempted.
206
+ cloud_account: The cloud account to run the job on.
207
+ Defaults to the studio cloud account if running with studio compute env.
208
+ If not provided will fall back to the teamspaces default cloud account.
209
+ image_credentials: The credentials used to pull the image. Required if the image is private.
210
+ This should be the name of the respective credentials secret created on the Lightning AI platform.
211
+ cloud_account_auth: Whether to authenticate with the cloud account to pull the image.
212
+ Required if the registry is part of a cloud provider (e.g. ECR).
213
+ artifacts_local: The path inside the docker container that you want to persist artifacts from.
214
+ CAUTION: When setting this to "/", it will effectively erase your container.
215
+ Only supported for jobs with a docker image compute environment.
216
+ artifacts_remote: The remote storage to persist your artifacts to.
217
+ Should be of format <CONNECTION_TYPE>:<CONNECTION_NAME>:<PATH_WITHIN_CONNECTION>.
218
+ PATH_WITHIN_CONNECTION hereby is a path relative to the connection's root.
219
+ E.g. efs:data:some-path would result in an EFS connection named `data` and to the path `some-path`
220
+ within it.
221
+ Note that the connection needs to be added to the teamspace already in order for it to be found.
222
+ Only supported for jobs with a docker image compute environment.
223
+ """
138
224
 
139
225
  @property
140
226
  @abstractmethod
141
- def machines(self) -> Tuple["Job", ...]:
142
- pass
227
+ def machines(self) -> Tuple[MMTMachine, ...]:
228
+ """Returns the sub-jobs for each individual instance."""
143
229
 
144
230
  @property
145
231
  @abstractmethod
146
232
  def machine(self) -> "Machine":
147
- pass
233
+ """Returns the machine type this job is running on."""
148
234
 
149
235
  @abstractmethod
150
236
  def stop(self) -> None:
151
- pass
237
+ """Stops the job."""
152
238
 
153
239
  @abstractmethod
154
240
  def delete(self) -> None:
155
- pass
241
+ """Deletes the job.
242
+
243
+ Caution: This also deletes all artifacts and snapshots associated with the job.
244
+ """
156
245
 
157
246
  @property
158
247
  @abstractmethod
159
248
  def status(self) -> "Status":
160
- pass
249
+ """The current status of the job."""
161
250
 
162
251
  @property
163
252
  @abstractmethod
164
253
  def artifact_path(self) -> Optional[str]:
165
- pass
254
+ """Path to the artifacts created by the job within the distributed teamspace filesystem."""
166
255
 
167
256
  @property
168
257
  @abstractmethod
169
258
  def snapshot_path(self) -> Optional[str]:
170
- pass
259
+ """Path to the studio snapshot used to create the job within the distributed teamspace filesystem."""
171
260
 
172
261
  @property
173
262
  def share_path(self) -> Optional[str]:
263
+ """Path to the jobs share path."""
174
264
  return None
175
265
 
176
- @abstractmethod
177
- def _update_internal_job(self) -> None:
178
- pass
179
-
180
266
  @property
181
267
  def name(self) -> str:
268
+ """The job's name."""
182
269
  return self._name
183
270
 
184
271
  @property
185
272
  def teamspace(self) -> "Teamspace":
273
+ """The teamspace the job is part of."""
186
274
  return self._teamspace
275
+
276
+ @abstractmethod
277
+ def _update_internal_job(self) -> None:
278
+ pass
lightning_sdk/mmt/mmt.py CHANGED
@@ -1,7 +1,9 @@
1
- from typing import TYPE_CHECKING, Any, Dict, Optional, Protocol, Tuple, Union
1
+ from functools import lru_cache
2
+ from typing import TYPE_CHECKING, Any, Dict, Optional, Tuple, Union
2
3
 
4
+ from lightning_sdk.api.user_api import UserApi
3
5
  from lightning_sdk.job.job import _has_jobs_v2
4
- from lightning_sdk.mmt.base import _BaseMMT
6
+ from lightning_sdk.mmt.base import MMTMachine, _BaseMMT
5
7
  from lightning_sdk.mmt.v1 import _MMTV1
6
8
  from lightning_sdk.mmt.v2 import _MMTV2
7
9
 
@@ -14,27 +16,22 @@ if TYPE_CHECKING:
14
16
  from lightning_sdk.user import User
15
17
 
16
18
 
17
- class MMTMachine(Protocol):
18
- """A single machine in multi-machine training."""
19
+ @lru_cache(maxsize=None)
20
+ def _has_mmt_v2() -> bool:
21
+ # users need both mmtv2 and jobsv2 flags in order for mmtv2 to work correctly
22
+ if not _has_jobs_v2():
23
+ return False
19
24
 
20
- @property
21
- def name(self) -> str:
22
- ...
23
-
24
- @property
25
- def machine(self) -> "Machine":
26
- ...
27
-
28
- @property
29
- def artifact_path(self) -> Optional[str]:
30
- ...
31
-
32
- @property
33
- def status(self) -> "Status":
34
- ...
25
+ api = UserApi()
26
+ try:
27
+ return api._get_feature_flags().mmt_v2
28
+ except Exception:
29
+ return False
35
30
 
36
31
 
37
32
  class MMT(_BaseMMT):
33
+ """Class to submit and manage multi-machine jobs on the Lightning AI Platform."""
34
+
38
35
  _force_v1: (
39
36
  bool
40
37
  ) = False # required for studio plugin still working correctly as v2 currently does not support the studio env
@@ -48,7 +45,16 @@ class MMT(_BaseMMT):
48
45
  *,
49
46
  _fetch_job: bool = True,
50
47
  ) -> None:
51
- internal_mmt_cls = _MMTV2 if _has_jobs_v2() and not self._force_v1 else _MMTV1
48
+ """Fetch already existing jobs.
49
+
50
+ Args:
51
+ name: the name of the job
52
+ teamspace: the teamspace the job is part of
53
+ org: the name of the organization owning the :param`teamspace` in case it is owned by an org
54
+ user: the name of the user owning the :param`teamspace`
55
+ in case it is owned directly by a user instead of an org.
56
+ """
57
+ internal_mmt_cls = _MMTV2 if _has_mmt_v2() and not self._force_v1 else _MMTV1
52
58
 
53
59
  self._internal_mmt = internal_mmt_cls(
54
60
  name=name,
@@ -79,6 +85,39 @@ class MMT(_BaseMMT):
79
85
  artifacts_remote: Optional[str] = None,
80
86
  cluster: Optional[str] = None, # deprecated in favor of cloud_account
81
87
  ) -> "MMT":
88
+ """Run async workloads using a docker image across multiple machines.
89
+
90
+ Args:
91
+ name: The name of the job. Needs to be unique within the teamspace.
92
+ machine: The machine type to run the job on. One of {", ".join(_MACHINE_VALUES)}.
93
+ num_machines: The number of machines to run on.
94
+ command: The command to run inside your job. Required if using a studio. Optional if using an image.
95
+ If not provided for images, will run the container entrypoint and default command.
96
+ studio: The studio env to run the job with. Mutually exclusive with image.
97
+ image: The docker image to run the job with. Mutually exclusive with studio.
98
+ teamspace: The teamspace the job should be associated with. Defaults to the current teamspace.
99
+ org: The organization owning the teamspace (if any). Defaults to the current organization.
100
+ user: The user owning the teamspace (if any). Defaults to the current user.
101
+ cloud_account: The cloud account to run the job on.
102
+ Defaults to the studio cloud account if running with studio compute env.
103
+ If not provided will fall back to the teamspaces default cloud account.
104
+ env: Environment variables to set inside the job.
105
+ interruptible: Whether the job should run on interruptible instances. They are cheaper but can be preempted.
106
+ image_credentials: The credentials used to pull the image. Required if the image is private.
107
+ This should be the name of the respective credentials secret created on the Lightning AI platform.
108
+ cloud_account_auth: Whether to authenticate with the cloud account to pull the image.
109
+ Required if the registry is part of a cloud provider (e.g. ECR).
110
+ artifacts_local: The path inside the docker container that you want to persist artifacts from.
111
+ CAUTION: When setting this to "/", it will effectively erase your container.
112
+ Only supported for jobs with a docker image compute environment.
113
+ artifacts_remote: The remote storage to persist your artifacts to.
114
+ Should be of format <CONNECTION_TYPE>:<CONNECTION_NAME>:<PATH_WITHIN_CONNECTION>.
115
+ PATH_WITHIN_CONNECTION hereby is a path relative to the connection's root.
116
+ E.g. efs:data:some-path would result in an EFS connection named `data` and to the path `some-path`
117
+ within it.
118
+ Note that the connection needs to be added to the teamspace already in order for it to be found.
119
+ Only supported for jobs with a docker image compute environment.
120
+ """
82
121
  ret_val = super().run(
83
122
  name=name,
84
123
  num_machines=num_machines,
@@ -117,6 +156,35 @@ class MMT(_BaseMMT):
117
156
  artifacts_local: Optional[str] = None,
118
157
  artifacts_remote: Optional[str] = None,
119
158
  ) -> "MMT":
159
+ """Submit a new multi-machine job to the Lightning AI platform.
160
+
161
+ Args:
162
+ num_machines: The number of machines to run on.
163
+ machine: The machine type to run the job on. One of {", ".join(_MACHINE_VALUES)}.
164
+ command: The command to run inside your job. Required if using a studio. Optional if using an image.
165
+ If not provided for images, will run the container entrypoint and default command.
166
+ studio: The studio env to run the job with. Mutually exclusive with image.
167
+ image: The docker image to run the job with. Mutually exclusive with studio.
168
+ env: Environment variables to set inside the job.
169
+ interruptible: Whether the job should run on interruptible instances. They are cheaper but can be preempted.
170
+ cloud_account: The cloud account to run the job on.
171
+ Defaults to the studio cloud account if running with studio compute env.
172
+ If not provided will fall back to the teamspaces default cloud account.
173
+ image_credentials: The credentials used to pull the image. Required if the image is private.
174
+ This should be the name of the respective credentials secret created on the Lightning AI platform.
175
+ cloud_account_auth: Whether to authenticate with the cloud account to pull the image.
176
+ Required if the registry is part of a cloud provider (e.g. ECR).
177
+ artifacts_local: The path inside the docker container that you want to persist artifacts from.
178
+ CAUTION: When setting this to "/", it will effectively erase your container.
179
+ Only supported for jobs with a docker image compute environment.
180
+ artifacts_remote: The remote storage to persist your artifacts to.
181
+ Should be of format <CONNECTION_TYPE>:<CONNECTION_NAME>:<PATH_WITHIN_CONNECTION>.
182
+ PATH_WITHIN_CONNECTION hereby is a path relative to the connection's root.
183
+ E.g. efs:data:some-path would result in an EFS connection named `data` and to the path `some-path`
184
+ within it.
185
+ Note that the connection needs to be added to the teamspace already in order for it to be found.
186
+ Only supported for jobs with a docker image compute environment.
187
+ """
120
188
  self._job = self._internal_mmt._submit(
121
189
  num_machines=num_machines,
122
190
  machine=machine,
@@ -134,33 +202,44 @@ class MMT(_BaseMMT):
134
202
  return self
135
203
 
136
204
  def stop(self) -> None:
205
+ """Stops the job."""
137
206
  return self._internal_mmt.stop()
138
207
 
139
208
  def delete(self) -> None:
209
+ """Deletes the job.
210
+
211
+ Caution: This also deletes all artifacts and snapshots associated with the job.
212
+ """
140
213
  return self._internal_mmt.delete()
141
214
 
142
215
  @property
143
216
  def status(self) -> "Status":
217
+ """The current status of the job (accumulated over all machines)."""
144
218
  return self._internal_mmt.status
145
219
 
146
220
  @property
147
221
  def machines(self) -> Tuple[MMTMachine, ...]:
222
+ """Returns the sub-jobs for each individual instance."""
148
223
  return self._internal_mmt.machines
149
224
 
150
225
  @property
151
226
  def machine(self) -> "Machine":
227
+ """Returns the machine type this job is running on."""
152
228
  return self._internal_mmt.machine
153
229
 
154
230
  @property
155
231
  def artifact_path(self) -> Optional[str]:
232
+ """Path to the artifacts created by the job within the distributed teamspace filesystem."""
156
233
  return self._internal_mmt.artifact_path
157
234
 
158
235
  @property
159
236
  def snapshot_path(self) -> Optional[str]:
237
+ """Path to the studio snapshot used to create the job within the distributed teamspace filesystem."""
160
238
  return self._internal_mmt.snapshot_path
161
239
 
162
240
  @property
163
241
  def share_path(self) -> Optional[str]:
242
+ """Path to the jobs share path."""
164
243
  return None
165
244
 
166
245
  def _update_internal_job(self) -> None:
@@ -168,16 +247,14 @@ class MMT(_BaseMMT):
168
247
 
169
248
  @property
170
249
  def name(self) -> str:
250
+ """The job's name."""
171
251
  return self._internal_mmt.name
172
252
 
173
253
  @property
174
254
  def teamspace(self) -> "Teamspace":
255
+ """The teamspace the job is part of."""
175
256
  return self._internal_mmt._teamspace
176
257
 
177
- @property
178
- def cloud_account(self) -> Optional[str]:
179
- return self._internal_mmt.cloud_account
180
-
181
258
  def __getattr__(self, key: str) -> Any:
182
259
  """Forward the attribute lookup to the internal job implementation."""
183
260
  try:
lightning_sdk/mmt/v1.py CHANGED
@@ -16,6 +16,8 @@ from lightning_sdk.mmt.base import _BaseMMT
16
16
 
17
17
 
18
18
  class _MMTV1(_BaseMMT):
19
+ """V1 Implementation of Multi-Machine Training."""
20
+
19
21
  def __init__(
20
22
  self,
21
23
  name: str,
@@ -25,6 +27,15 @@ class _MMTV1(_BaseMMT):
25
27
  *,
26
28
  _fetch_job: bool = True,
27
29
  ) -> None:
30
+ """Fetch already existing jobs.
31
+
32
+ Args:
33
+ name: the name of the job
34
+ teamspace: the teamspace the job is part of
35
+ org: the name of the organization owning the :param`teamspace` in case it is owned by an org
36
+ user: the name of the user owning the :param`teamspace`
37
+ in case it is owned directly by a user instead of an org.
38
+ """
28
39
  self._job_api = MMTApiV1()
29
40
  super().__init__(name=name, teamspace=teamspace, org=org, user=user, _fetch_job=_fetch_job)
30
41
 
@@ -43,6 +54,35 @@ class _MMTV1(_BaseMMT):
43
54
  artifacts_local: Optional[str] = None,
44
55
  artifacts_remote: Optional[str] = None,
45
56
  ) -> "_MMTV1":
57
+ """Submit a new multi-machine job to the Lightning AI platform.
58
+
59
+ Args:
60
+ num_machines: The number of machines to run on.
61
+ machine: The machine type to run the job on. One of {", ".join(_MACHINE_VALUES)}.
62
+ command: The command to run inside your job. Required if using a studio. Optional if using an image.
63
+ If not provided for images, will run the container entrypoint and default command.
64
+ studio: The studio env to run the job with. Mutually exclusive with image.
65
+ image: The docker image to run the job with. Mutually exclusive with studio.
66
+ env: Environment variables to set inside the job.
67
+ interruptible: Whether the job should run on interruptible instances. They are cheaper but can be preempted.
68
+ cloud_account: The cloud account to run the job on.
69
+ Defaults to the studio cloud account if running with studio compute env.
70
+ If not provided will fall back to the teamspaces default cloud account.
71
+ image_credentials: The credentials used to pull the image. Required if the image is private.
72
+ This should be the name of the respective credentials secret created on the Lightning AI platform.
73
+ cloud_account_auth: Whether to authenticate with the cloud account to pull the image.
74
+ Required if the registry is part of a cloud provider (e.g. ECR).
75
+ artifacts_local: The path of inside the docker container, you want to persist images from.
76
+ CAUTION: When setting this to "/", it will effectively erase your container.
77
+ Only supported for jobs with a docker image compute environment.
78
+ artifacts_remote: The remote storage to persist your artifacts to.
79
+ Should be of format <CONNECTION_TYPE>:<CONNECTION_NAME>:<PATH_WITHIN_CONNECTION>.
80
+ PATH_WITHIN_CONNECTION hereby is a path relative to the connection's root.
81
+ E.g. efs:data:some-path would result in an EFS connection named `data` and to the path `some-path`
82
+ within it.
83
+ Note that the connection needs to be added to the teamspace already in order for it to be found.
84
+ Only supported for jobs with a docker image compute environment.
85
+ """
46
86
  if studio is None:
47
87
  raise ValueError("Studio is required for submitting jobs")
48
88
  if image is not None or image_credentials is not None or cloud_account_auth:
@@ -80,18 +120,25 @@ class _MMTV1(_BaseMMT):
80
120
 
81
121
  @property
82
122
  def machines(self) -> Tuple["Work", ...]:
123
+ """Returns the sub-jobs for each individual instance."""
83
124
  works = self._job_api.list_works(self._guaranteed_job.id, self.teamspace.id)
84
125
 
85
126
  return tuple(Work(w.id, self, self.teamspace) for w in works)
86
127
 
87
128
  def stop(self) -> None:
129
+ """Stops the job."""
88
130
  self._job_api.stop_job(self._guaranteed_job.id, self.teamspace.id)
89
131
 
90
132
  def delete(self) -> None:
133
+ """Deletes the job.
134
+
135
+ Caution: This also deletes all artifacts and snapshots associated with the job.
136
+ """
91
137
  self._job_api.delete_job(self._guaranteed_job.id, self.teamspace.id)
92
138
 
93
139
  @property
94
140
  def status(self) -> "Status":
141
+ """The current status of the job."""
95
142
  try:
96
143
  status = self._job_api.get_job_status(self._job.id, self.teamspace.id)
97
144
  return _internal_status_to_external_status(status)
@@ -102,22 +149,27 @@ class _MMTV1(_BaseMMT):
102
149
 
103
150
  @property
104
151
  def artifact_path(self) -> Optional[str]:
152
+ """Path to the artifacts created by the job within the distributed teamspace filesystem."""
105
153
  return f"/teamspace/jobs/{self.name}"
106
154
 
107
155
  @property
108
156
  def snapshot_path(self) -> Optional[str]:
157
+ """Path to the studio snapshot used to create the job within the distributed teamspace filesystem."""
109
158
  return f"/teamspace/jobs/{self.name}/snapshot"
110
159
 
111
160
  @property
112
161
  def machine(self) -> "Machine":
162
+ """Returns the machine type this job is running on."""
113
163
  return self.machines[0].machine
114
164
 
115
165
  @property
116
166
  def name(self) -> str:
167
+ """The job's name."""
117
168
  return self._name
118
169
 
119
170
  @property
120
171
  def teamspace(self) -> "Teamspace":
172
+ """The teamspace the job is part of."""
121
173
  return self._teamspace
122
174
 
123
175
  # the following and functions are solely to make the Work class function
lightning_sdk/mmt/v2.py CHANGED
@@ -15,6 +15,8 @@ from lightning_sdk.mmt.base import _BaseMMT
15
15
 
16
16
 
17
17
  class _MMTV2(_BaseMMT):
18
+ """New implementation of Multi-Machine Training."""
19
+
18
20
  def __init__(
19
21
  self,
20
22
  name: str,
@@ -24,6 +26,15 @@ class _MMTV2(_BaseMMT):
24
26
  *,
25
27
  _fetch_job: bool = True,
26
28
  ) -> None:
29
+ """Fetch already existing jobs.
30
+
31
+ Args:
32
+ name: the name of the job
33
+ teamspace: the teamspace the job is part of
34
+ org: the name of the organization owning the :param`teamspace` in case it is owned by an org
35
+ user: the name of the user owning the :param`teamspace`
36
+ in case it is owned directly by a user instead of an org.
37
+ """
27
38
  self._job_api = MMTApiV2()
28
39
  super().__init__(name=name, teamspace=teamspace, org=org, user=user, _fetch_job=_fetch_job)
29
40
 
@@ -42,6 +53,35 @@ class _MMTV2(_BaseMMT):
42
53
  artifacts_local: Optional[str] = None,
43
54
  artifacts_remote: Optional[str] = None,
44
55
  ) -> "_MMTV2":
56
+ """Submit a new multi-machine job to the Lightning AI platform.
57
+
58
+ Args:
59
+ num_machines: The number of machines to run on.
60
+ machine: The machine type to run the job on. One of {", ".join(_MACHINE_VALUES)}.
61
+ command: The command to run inside your job. Required if using a studio. Optional if using an image.
62
+ If not provided for images, will run the container entrypoint and default command.
63
+ studio: The studio env to run the job with. Mutually exclusive with image.
64
+ image: The docker image to run the job with. Mutually exclusive with studio.
65
+ env: Environment variables to set inside the job.
66
+ interruptible: Whether the job should run on interruptible instances. They are cheaper but can be preempted.
67
+ cloud_account: The cloud account to run the job on.
68
+ Defaults to the studio cloud account if running with studio compute env.
69
+ If not provided will fall back to the teamspaces default cloud account.
70
+ image_credentials: The credentials used to pull the image. Required if the image is private.
71
+ This should be the name of the respective credentials secret created on the Lightning AI platform.
72
+ cloud_account_auth: Whether to authenticate with the cloud account to pull the image.
73
+ Required if the registry is part of a cloud provider (e.g. ECR).
74
+ artifacts_local: The path of inside the docker container, you want to persist images from.
75
+ CAUTION: When setting this to "/", it will effectively erase your container.
76
+ Only supported for jobs with a docker image compute environment.
77
+ artifacts_remote: The remote storage to persist your artifacts to.
78
+ Should be of format <CONNECTION_TYPE>:<CONNECTION_NAME>:<PATH_WITHIN_CONNECTION>.
79
+ PATH_WITHIN_CONNECTION hereby is a path relative to the connection's root.
80
+ E.g. efs:data:some-path would result in an EFS connection named `data` and to the path `some-path`
81
+ within it.
82
+ Note that the connection needs to be added to the teamspace already in order for it to be found.
83
+ Only supported for jobs with a docker image compute environment.
84
+ """
45
85
  # Command is required if Studio is provided to know what to run
46
86
  # Image is mutually exclusive with Studio
47
87
  # Command is optional for Image
@@ -80,6 +120,7 @@ class _MMTV2(_BaseMMT):
80
120
 
81
121
  @property
82
122
  def machines(self) -> Tuple["Job", ...]:
123
+ """Returns the sub-jobs for each individual instance."""
83
124
  from lightning_sdk.job import Job
84
125
 
85
126
  return tuple(
@@ -88,9 +129,14 @@ class _MMTV2(_BaseMMT):
88
129
  )
89
130
 
90
131
  def stop(self) -> None:
132
+ """Stops the job."""
91
133
  self._job_api.stop_job(job_id=self._guaranteed_job.id, teamspace_id=self._teamspace.id)
92
134
 
93
135
  def delete(self) -> None:
136
+ """Deletes the job.
137
+
138
+ Caution: This also deletes all artifacts and snapshots associated with the job.
139
+ """
94
140
  self._job_api.delete_job(
95
141
  job_id=self._guaranteed_job.id,
96
142
  teamspace_id=self._teamspace.id,
@@ -104,20 +150,24 @@ class _MMTV2(_BaseMMT):
104
150
 
105
151
  @property
106
152
  def status(self) -> "Status":
153
+ """The current status of the job."""
107
154
  return self._job_api._job_state_to_external(self._latest_job.state)
108
155
 
109
156
  @property
110
157
  def artifact_path(self) -> Optional[str]:
158
+ """Path to the artifacts created by the job within the distributed teamspace filesystem."""
111
159
  # TODO: Since grouping for those is not done yet on the BE, we cannot yet have a unified link here
112
160
  raise NotImplementedError
113
161
 
114
162
  @property
115
163
  def snapshot_path(self) -> Optional[str]:
164
+ """Path to the studio snapshot used to create the job within the distributed teamspace filesystem."""
116
165
  # TODO: Since grouping for those is not done yet on the BE, we cannot yet have a unified link here
117
166
  raise NotImplementedError
118
167
 
119
168
  @property
120
169
  def machine(self) -> "Machine":
170
+ """Returns the machine type this job is running on."""
121
171
  return self._job_api._get_job_machine_from_spec(self._guaranteed_job.spec)
122
172
 
123
173
  def _update_internal_job(self) -> None:
@@ -129,8 +179,10 @@ class _MMTV2(_BaseMMT):
129
179
 
130
180
  @property
131
181
  def name(self) -> str:
182
+ """The job's name."""
132
183
  return self._name
133
184
 
134
185
  @property
135
186
  def teamspace(self) -> "Teamspace":
187
+ """The teamspace the job is part of."""
136
188
  return self._teamspace
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: lightning_sdk
3
- Version: 0.1.42
3
+ Version: 0.1.43
4
4
  Summary: SDK to develop using Lightning AI Studios
5
5
  Author-email: Lightning-AI <justus@lightning.ai>
6
6
  License: MIT License
@@ -47,6 +47,7 @@ Requires-Dist: simple-term-menu
47
47
  Requires-Dist: lightning-utilities
48
48
  Provides-Extra: serve
49
49
  Requires-Dist: litserve>=0.2.5; extra == "serve"
50
+ Requires-Dist: docker; extra == "serve"
50
51
 
51
52
  # Lightning SDK
52
53
 
@@ -1,5 +1,5 @@
1
1
  docs/source/conf.py,sha256=r8yX20eC-4mHhMTd0SbQb5TlSWHhO6wnJ0VJ_FBFpag,13249
2
- lightning_sdk/__init__.py,sha256=Tndl6PUCxWaYJY0gViZXePoopC4k4r3QXpC0MR7r9ZY,925
2
+ lightning_sdk/__init__.py,sha256=je3-DFVBfoAqqPLrFLZmVZGWPNda7886O9-BrY6dvlQ,925
3
3
  lightning_sdk/agents.py,sha256=ly6Ma1j0ZgGPFyvPvMN28JWiB9dATIstFa5XM8pMi6I,1577
4
4
  lightning_sdk/ai_hub.py,sha256=kBjtmrzVHPCgqtV_TrSNkuf4oT2DLm8SYRTz4iTQmmY,6624
5
5
  lightning_sdk/constants.py,sha256=ztl1PTUBULnqTf3DyKUSJaV_O20hNtUYT6XvAYIrmIk,749
@@ -30,16 +30,15 @@ lightning_sdk/cli/download.py,sha256=nyQN3q1vZ0fg4_cfit8cKaokQ9VUd46l_TNcAQWkLwU
30
30
  lightning_sdk/cli/entrypoint.py,sha256=Hl2Lm7-OS0kx_pyJyGe7Nii0Soc6HYe4r4xXKeJuC_o,1507
31
31
  lightning_sdk/cli/exceptions.py,sha256=QUF3OMAMZwBikvlusimSHSBjb6ywvHpfAumJBEaodSw,169
32
32
  lightning_sdk/cli/legacy.py,sha256=ocTVNwlsLRS5aMjbMkwFPjT3uEYvS8C40CJ0PeRRv8g,4707
33
- lightning_sdk/cli/mmt.py,sha256=sRCr9q3XOggaScOeM0O30mZvO6sNQ2hvL3_kuXB5eLY,6327
34
- lightning_sdk/cli/run.py,sha256=K467Do3AekbjHFR9KLIOEfuMrUdyx-JW1Rd4Pvd-KRs,5112
35
- lightning_sdk/cli/serve.py,sha256=dfhbxNscaDJijJSXxpqRKZoI-eGvaIVWKoqTsg_xZWk,4619
33
+ lightning_sdk/cli/run.py,sha256=B6ttd9SKg373ngug-lj74CEcuEoxwz-P6nUBVnQeijI,10836
34
+ lightning_sdk/cli/serve.py,sha256=UaXhGHU6nbAzrnVigSKOTrMjLwSs-sjyhuJCdVUBwzc,8722
36
35
  lightning_sdk/cli/studios_menu.py,sha256=0kQGqGel8gAbpdJtjOM1a6NEat_TnIqRNprNn8QiK58,3236
37
36
  lightning_sdk/cli/upload.py,sha256=H9OyipYTYAQ9Mzy2e8jtoaa-B34-uXHbTQTzY2Vmhv4,9078
38
37
  lightning_sdk/deployment/__init__.py,sha256=BLu7_cVLp97TYxe6qe-J1zKUSZXAVcvCjgcA7plV2k4,497
39
38
  lightning_sdk/deployment/deployment.py,sha256=Dp15pn8rFAfMfaDhKn0v3bphFuvLgkPFs3KSNxW6eyc,15472
40
39
  lightning_sdk/job/__init__.py,sha256=1MxjQ6rHkyUHCypSW9RuXuVMVH11WiqhIXcU2LCFMwE,64
41
40
  lightning_sdk/job/base.py,sha256=I4-iWyiKp1KUkxDy97zJYbwbdQ_7cu6FqCayKwXDloQ,13000
42
- lightning_sdk/job/job.py,sha256=Qh7BHoyNYVn4QotT8suceCsyBtJolYhbUZdTj-zpmns,10574
41
+ lightning_sdk/job/job.py,sha256=pT9rkSsK5BHu6dSyHpAlYfGXvaF0s_XtrjjhOSAVFWU,11070
43
42
  lightning_sdk/job/v1.py,sha256=zIcngaM2_c-2thcKiCOPoWGUuIKc4tasJEwEFJv6bAA,8953
44
43
  lightning_sdk/job/v2.py,sha256=oq54VFInuVV_L-nUO_dnBbn4TxPWiBuIqmdFpNN1LmU,8057
45
44
  lightning_sdk/job/work.py,sha256=PYopS_6c556I2o8ouSXmzb4FGQflzCe06GpqJiCdedw,1604
@@ -834,11 +833,11 @@ lightning_sdk/lightning_cloud/utils/data_connection.py,sha256=VN-Gs0a4g3tA9TQCwP
834
833
  lightning_sdk/lightning_cloud/utils/dataset.py,sha256=4nUspe8iAaRPgSYpXA2uAQCgydm78kJzhOIx3C9qKls,2011
835
834
  lightning_sdk/lightning_cloud/utils/name_generator.py,sha256=MkciuA10332V0mcE2PxLIiwWomWE0Fm_gNGK01vwRr4,58046
836
835
  lightning_sdk/lightning_cloud/utils/network.py,sha256=axPgl8rhyPcPjxiztDxyksfxax3VNg2OXL5F5Uc81b4,406
837
- lightning_sdk/mmt/__init__.py,sha256=-qAR2-NihgWNY2tfi0z7atwDLOLwY86tvmgyHA2tfr8,57
838
- lightning_sdk/mmt/base.py,sha256=9_Lk2aX7HEflDN9ntosDSO5JHRSVVx7w_0EQAjLqKSQ,6411
839
- lightning_sdk/mmt/mmt.py,sha256=xZUuwgxh_6LSrnNQs5K9Rd5htQ9ZSaKnXbJYO979UM8,5720
840
- lightning_sdk/mmt/v1.py,sha256=E0zCwUq628xfiygdQEU10dGT4z1Kp1b39Sg212mJKdM,4489
841
- lightning_sdk/mmt/v2.py,sha256=Mga_LaTr8BM1_6YwnErAE3iq3hJh8r-88hSjN_l9UjE,4772
836
+ lightning_sdk/mmt/__init__.py,sha256=ExMu90-96bGBnyp5h0CErQszUGB1-PcjC4-R8_NYbeY,117
837
+ lightning_sdk/mmt/base.py,sha256=lUB8pAKXTjn_WD6vcJIUMxtZQwrzwxYnaPJbaiPPtMw,12548
838
+ lightning_sdk/mmt/mmt.py,sha256=vMAoR3qjb0VRwCXpKpAMxeTEMALmQB9_RK064fTHM8M,11922
839
+ lightning_sdk/mmt/v1.py,sha256=8LjZnMSGgsGLeajuPto3gknJwVRvsGfkYVoo5A_UlO8,7917
840
+ lightning_sdk/mmt/v2.py,sha256=TsJ8PNluyU0WnNdHvEKWkpJxMwjmyFFJZIaJKCgugIM,8201
842
841
  lightning_sdk/services/__init__.py,sha256=gSWUjccEhMI9CIWL_nbrFHUK2S6TM2725mEzrLMfK1Y,225
843
842
  lightning_sdk/services/file_endpoint.py,sha256=we5HC_o74J4Y6fSP_31jIizi_I_1FO_Rb2qblspD9eE,7855
844
843
  lightning_sdk/services/utilities.py,sha256=IeOx8hc3F8ZevHeKBysh08BXhJliTNzvKp1gwpEfdik,4087
@@ -847,9 +846,9 @@ lightning_sdk/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSu
847
846
  lightning_sdk/utils/dynamic.py,sha256=glUTO1JC9APtQ6Gr9SO02a3zr56-sPAXM5C3NrTpgyQ,1959
848
847
  lightning_sdk/utils/enum.py,sha256=h2JRzqoBcSlUdanFHmkj_j5DleBHAu1esQYUsdNI-hU,4106
849
848
  lightning_sdk/utils/resolve.py,sha256=RWvlOWLHjaHhR0W0zT3mN719cbzhFfYCKBss38zfv3k,5783
850
- lightning_sdk-0.1.42.dist-info/LICENSE,sha256=uFIuZwj5z-4TeF2UuacPZ1o17HkvKObT8fY50qN84sg,1064
851
- lightning_sdk-0.1.42.dist-info/METADATA,sha256=Jj7QwfnaNqn5u_4muQcqPYx-h2LCwhG4w2aRTHGVeQY,3991
852
- lightning_sdk-0.1.42.dist-info/WHEEL,sha256=P9jw-gEje8ByB7_hXoICnHtVCrEwMQh-630tKvQWehc,91
853
- lightning_sdk-0.1.42.dist-info/entry_points.txt,sha256=ye4ni8VbtyMXt6e0f5xIL6Liucg4Wrl02DEfLjiMte0,106
854
- lightning_sdk-0.1.42.dist-info/top_level.txt,sha256=ps8doKILFXmN7F1mHncShmnQoTxKBRPIcchC8TpoBw4,19
855
- lightning_sdk-0.1.42.dist-info/RECORD,,
849
+ lightning_sdk-0.1.43.dist-info/LICENSE,sha256=uFIuZwj5z-4TeF2UuacPZ1o17HkvKObT8fY50qN84sg,1064
850
+ lightning_sdk-0.1.43.dist-info/METADATA,sha256=qsepNbeCeaApc3hDy1vhjhXs-sIoKUwYfMVK_MaKVUA,4031
851
+ lightning_sdk-0.1.43.dist-info/WHEEL,sha256=P9jw-gEje8ByB7_hXoICnHtVCrEwMQh-630tKvQWehc,91
852
+ lightning_sdk-0.1.43.dist-info/entry_points.txt,sha256=msB9PJWIJ784dX-OP8by51d4IbKYH3Fj1vCuA9oXjHY,68
853
+ lightning_sdk-0.1.43.dist-info/top_level.txt,sha256=ps8doKILFXmN7F1mHncShmnQoTxKBRPIcchC8TpoBw4,19
854
+ lightning_sdk-0.1.43.dist-info/RECORD,,
@@ -1,3 +1,2 @@
1
1
  [console_scripts]
2
- _mmt = lightning_sdk.cli.mmt:main_cli
3
2
  lightning = lightning_sdk.cli.entrypoint:main_cli
lightning_sdk/cli/mmt.py DELETED
@@ -1,138 +0,0 @@
1
- from typing import Dict, Optional
2
-
3
- from fire import Fire
4
-
5
- from lightning_sdk._mmt import MMT
6
- from lightning_sdk.api.studio_api import _cloud_url
7
- from lightning_sdk.lightning_cloud.login import Auth
8
- from lightning_sdk.machine import Machine
9
- from lightning_sdk.teamspace import Teamspace
10
-
11
- _MACHINE_VALUES = tuple([machine.value for machine in Machine])
12
-
13
-
14
- class MMTCLI:
15
- """Command line interface (CLI) to interact with/manage Lightning AI MMT."""
16
-
17
- def __init__(self) -> None:
18
- # Need to set the docstring here for f-strings to work.
19
- # Sadly this is the only way to really show options as f-strings are not allowed as docstrings directly
20
- # and fire does not show values for literals, just that it is a literal.
21
- docstr = f"""Run async workloads on multiple machines using a docker image.
22
-
23
- Args:
24
- name: The name of the job. Needs to be unique within the teamspace.
25
- num_machines: The number of Machines to run on. Defaults to 2 Machines
26
- machine: The machine type to run the job on. One of {", ".join(_MACHINE_VALUES)}. Defaults to CPU
27
- command: The command to run inside your job. Required if using a studio. Optional if using an image.
28
- If not provided for images, will run the container entrypoint and default command.
29
- studio: The studio env to run the job with. Mutually exclusive with image.
30
- image: The docker image to run the job with. Mutually exclusive with studio.
31
- teamspace: The teamspace the job should be associated with. Defaults to the current teamspace.
32
- org: The organization owning the teamspace (if any). Defaults to the current organization.
33
- user: The user owning the teamspace (if any). Defaults to the current user.
34
- cloud_account: The cloud account to run the job on.
35
- Defaults to the studio cloud account if running with studio compute env.
36
- If not provided will fall back to the teamspaces default cloud account.
37
- env: Environment variables to set inside the job.
38
- interruptible: Whether the job should run on interruptible instances. They are cheaper but can be preempted.
39
- image_credentials: The credentials used to pull the image. Required if the image is private.
40
- This should be the name of the respective credentials secret created on the Lightning AI platform.
41
- cloud_account_auth: Whether to authenticate with the cloud account to pull the image.
42
- Required if the registry is part of a cloud provider (e.g. ECR).
43
- artifacts_local: The path of inside the docker container, you want to persist images from.
44
- CAUTION: When setting this to "/", it will effectively erase your container.
45
- Only supported for jobs with a docker image compute environment.
46
- artifacts_remote: The remote storage to persist your artifacts to.
47
- Should be of format <CONNECTION_TYPE>:<CONNECTION_NAME>:<PATH_WITHIN_CONNECTION>.
48
- PATH_WITHIN_CONNECTION hereby is a path relative to the connection's root.
49
- E.g. efs:data:some-path would result in an EFS connection named `data` and to the path `some-path`
50
- within it.
51
- Note that the connection needs to be added to the teamspace already in order for it to be found.
52
- Only supported for jobs with a docker image compute environment.
53
- """
54
- # TODO: the docstrings from artifacts_local and artifacts_remote don't show up completely,
55
- # might need to switch to explicit cli definition
56
- self.run.__func__.__doc__ = docstr
57
-
58
- def login(self) -> None:
59
- """Login to Lightning AI Studios."""
60
- auth = Auth()
61
- auth.clear()
62
-
63
- try:
64
- auth.authenticate()
65
- except ConnectionError:
66
- raise RuntimeError(f"Unable to connect to {_cloud_url()}. Please check your internet connection.") from None
67
-
68
- def logout(self) -> None:
69
- """Logout from Lightning AI Studios."""
70
- auth = Auth()
71
- auth.clear()
72
-
73
- # TODO: sadly, fire displays both Optional[type] and Union[type, None] as Optional[Optional]
74
- # see https://github.com/google/python-fire/pull/513
75
- # might need to move to different cli library
76
- def run(
77
- self,
78
- name: Optional[str] = None,
79
- num_machines: int = 2,
80
- machine: Optional[str] = None,
81
- command: Optional[str] = None,
82
- studio: Optional[str] = None,
83
- image: Optional[str] = None,
84
- teamspace: Optional[str] = None,
85
- org: Optional[str] = None,
86
- user: Optional[str] = None,
87
- cloud_account: Optional[str] = None,
88
- env: Optional[Dict[str, str]] = None,
89
- interruptible: bool = False,
90
- image_credentials: Optional[str] = None,
91
- cloud_account_auth: bool = False,
92
- artifacts_local: Optional[str] = None,
93
- artifacts_remote: Optional[str] = None,
94
- ) -> None:
95
- if name is None:
96
- from datetime import datetime
97
-
98
- timestr = datetime.now().strftime("%Y-%m-%d-%H-%M-%S")
99
- name = f"mmt-{timestr}"
100
-
101
- if machine is None:
102
- # TODO: infer from studio
103
- machine = "CPU"
104
- machine_enum = Machine(machine.upper())
105
-
106
- teamspace = Teamspace(name=teamspace, org=org, user=user)
107
- if cloud_account is None:
108
- cloud_account = teamspace.default_cloud_account
109
-
110
- if image is None:
111
- raise RuntimeError("Currently only docker images are specified")
112
- MMT.run(
113
- name=name,
114
- num_machines=num_machines,
115
- machine=machine_enum,
116
- command=command,
117
- studio=studio,
118
- image=image,
119
- teamspace=teamspace,
120
- org=org,
121
- user=user,
122
- cloud_account=cloud_account,
123
- env=env,
124
- interruptible=interruptible,
125
- image_credentials=image_credentials,
126
- cloud_account_auth=cloud_account_auth,
127
- artifacts_local=artifacts_local,
128
- artifacts_remote=artifacts_remote,
129
- )
130
-
131
-
132
- def main_cli() -> None:
133
- """CLI entrypoint."""
134
- Fire(MMTCLI(), name="_mmt")
135
-
136
-
137
- if __name__ == "__main__":
138
- main_cli()