lightning-sdk 0.1.42__py3-none-any.whl → 0.1.43__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- lightning_sdk/__init__.py +1 -1
- lightning_sdk/cli/run.py +113 -4
- lightning_sdk/cli/serve.py +102 -14
- lightning_sdk/job/job.py +11 -0
- lightning_sdk/mmt/__init__.py +2 -1
- lightning_sdk/mmt/base.py +107 -15
- lightning_sdk/mmt/mmt.py +101 -24
- lightning_sdk/mmt/v1.py +52 -0
- lightning_sdk/mmt/v2.py +52 -0
- {lightning_sdk-0.1.42.dist-info → lightning_sdk-0.1.43.dist-info}/METADATA +2 -1
- {lightning_sdk-0.1.42.dist-info → lightning_sdk-0.1.43.dist-info}/RECORD +15 -16
- {lightning_sdk-0.1.42.dist-info → lightning_sdk-0.1.43.dist-info}/entry_points.txt +0 -1
- lightning_sdk/cli/mmt.py +0 -138
- {lightning_sdk-0.1.42.dist-info → lightning_sdk-0.1.43.dist-info}/LICENSE +0 -0
- {lightning_sdk-0.1.42.dist-info → lightning_sdk-0.1.43.dist-info}/WHEEL +0 -0
- {lightning_sdk-0.1.42.dist-info → lightning_sdk-0.1.43.dist-info}/top_level.txt +0 -0
lightning_sdk/__init__.py
CHANGED
lightning_sdk/cli/run.py
CHANGED
|
@@ -2,6 +2,8 @@ from typing import TYPE_CHECKING, Dict, Optional
|
|
|
2
2
|
|
|
3
3
|
from lightning_sdk.job import Job
|
|
4
4
|
from lightning_sdk.machine import Machine
|
|
5
|
+
from lightning_sdk.mmt import MMT
|
|
6
|
+
from lightning_sdk.teamspace import Teamspace
|
|
5
7
|
|
|
6
8
|
if TYPE_CHECKING:
|
|
7
9
|
from lightning_sdk.cli.legacy import _LegacyLightningCLI
|
|
@@ -20,7 +22,7 @@ class _Run:
|
|
|
20
22
|
# Need to set the docstring here for f-strings to work.
|
|
21
23
|
# Sadly this is the only way to really show options as f-strings are not allowed as docstrings directly
|
|
22
24
|
# and fire does not show values for literals, just that it is a literal.
|
|
23
|
-
|
|
25
|
+
docstr_job = f"""Run async workloads using a docker image or a compute environment from your studio.
|
|
24
26
|
|
|
25
27
|
Args:
|
|
26
28
|
name: The name of the job. Needs to be unique within the teamspace.
|
|
@@ -54,7 +56,47 @@ class _Run:
|
|
|
54
56
|
"""
|
|
55
57
|
# TODO: the docstrings from artifacts_local and artifacts_remote don't show up completely,
|
|
56
58
|
# might need to switch to explicit cli definition
|
|
57
|
-
self.job.__func__.__doc__ =
|
|
59
|
+
self.job.__func__.__doc__ = docstr_job
|
|
60
|
+
|
|
61
|
+
# Need to set the docstring here for f-strings to work.
|
|
62
|
+
# Sadly this is the only way to really show options as f-strings are not allowed as docstrings directly
|
|
63
|
+
# and fire does not show values for literals, just that it is a literal.
|
|
64
|
+
docstr_mmt = f"""Run async workloads on multiple machines using a docker image.
|
|
65
|
+
|
|
66
|
+
Args:
|
|
67
|
+
name: The name of the job. Needs to be unique within the teamspace.
|
|
68
|
+
num_machines: The number of Machines to run on. Defaults to 2 Machines
|
|
69
|
+
machine: The machine type to run the job on. One of {", ".join(_MACHINE_VALUES)}. Defaults to CPU
|
|
70
|
+
command: The command to run inside your job. Required if using a studio. Optional if using an image.
|
|
71
|
+
If not provided for images, will run the container entrypoint and default command.
|
|
72
|
+
studio: The studio env to run the job with. Mutually exclusive with image.
|
|
73
|
+
image: The docker image to run the job with. Mutually exclusive with studio.
|
|
74
|
+
teamspace: The teamspace the job should be associated with. Defaults to the current teamspace.
|
|
75
|
+
org: The organization owning the teamspace (if any). Defaults to the current organization.
|
|
76
|
+
user: The user owning the teamspace (if any). Defaults to the current user.
|
|
77
|
+
cloud_account: The cloud account to run the job on.
|
|
78
|
+
Defaults to the studio cloud account if running with studio compute env.
|
|
79
|
+
If not provided will fall back to the teamspaces default cloud account.
|
|
80
|
+
env: Environment variables to set inside the job.
|
|
81
|
+
interruptible: Whether the job should run on interruptible instances. They are cheaper but can be preempted.
|
|
82
|
+
image_credentials: The credentials used to pull the image. Required if the image is private.
|
|
83
|
+
This should be the name of the respective credentials secret created on the Lightning AI platform.
|
|
84
|
+
cloud_account_auth: Whether to authenticate with the cloud account to pull the image.
|
|
85
|
+
Required if the registry is part of a cloud provider (e.g. ECR).
|
|
86
|
+
artifacts_local: The path inside the docker container that you want to persist images from.
|
|
87
|
+
CAUTION: When setting this to "/", it will effectively erase your container.
|
|
88
|
+
Only supported for jobs with a docker image compute environment.
|
|
89
|
+
artifacts_remote: The remote storage to persist your artifacts to.
|
|
90
|
+
Should be of format <CONNECTION_TYPE>:<CONNECTION_NAME>:<PATH_WITHIN_CONNECTION>.
|
|
91
|
+
PATH_WITHIN_CONNECTION hereby is a path relative to the connection's root.
|
|
92
|
+
E.g. efs:data:some-path would result in an EFS connection named `data` and to the path `some-path`
|
|
93
|
+
within it.
|
|
94
|
+
Note that the connection needs to be added to the teamspace already in order for it to be found.
|
|
95
|
+
Only supported for jobs with a docker image compute environment.
|
|
96
|
+
"""
|
|
97
|
+
# TODO: the docstrings from artifacts_local and artifacts_remote don't show up completely,
|
|
98
|
+
# might need to switch to explicit cli definition
|
|
99
|
+
self.mmt.__func__.__doc__ = docstr_mmt
|
|
58
100
|
|
|
59
101
|
# TODO: sadly, fire displays both Optional[type] and Union[type, None] as Optional[Optional]
|
|
60
102
|
# see https://github.com/google/python-fire/pull/513
|
|
@@ -62,7 +104,7 @@ class _Run:
|
|
|
62
104
|
def job(
|
|
63
105
|
self,
|
|
64
106
|
name: str,
|
|
65
|
-
machine: str,
|
|
107
|
+
machine: Optional[str] = None,
|
|
66
108
|
command: Optional[str] = None,
|
|
67
109
|
studio: Optional[str] = None,
|
|
68
110
|
image: Optional[str] = None,
|
|
@@ -77,6 +119,15 @@ class _Run:
|
|
|
77
119
|
artifacts_local: Optional[str] = None,
|
|
78
120
|
artifacts_remote: Optional[str] = None,
|
|
79
121
|
) -> None:
|
|
122
|
+
if machine is None:
|
|
123
|
+
# TODO: infer from studio
|
|
124
|
+
machine = "CPU"
|
|
125
|
+
machine_enum = Machine(machine.upper())
|
|
126
|
+
|
|
127
|
+
resolved_teamspace = Teamspace(name=teamspace, org=org, user=user)
|
|
128
|
+
|
|
129
|
+
if cloud_account is None:
|
|
130
|
+
cloud_account = resolved_teamspace.default_cloud_account
|
|
80
131
|
machine_enum = Machine(machine.upper())
|
|
81
132
|
Job.run(
|
|
82
133
|
name=name,
|
|
@@ -84,7 +135,65 @@ class _Run:
|
|
|
84
135
|
command=command,
|
|
85
136
|
studio=studio,
|
|
86
137
|
image=image,
|
|
87
|
-
teamspace=
|
|
138
|
+
teamspace=resolved_teamspace,
|
|
139
|
+
org=org,
|
|
140
|
+
user=user,
|
|
141
|
+
cloud_account=cloud_account,
|
|
142
|
+
env=env,
|
|
143
|
+
interruptible=interruptible,
|
|
144
|
+
image_credentials=image_credentials,
|
|
145
|
+
cloud_account_auth=cloud_account_auth,
|
|
146
|
+
artifacts_local=artifacts_local,
|
|
147
|
+
artifacts_remote=artifacts_remote,
|
|
148
|
+
)
|
|
149
|
+
|
|
150
|
+
# TODO: sadly, fire displays both Optional[type] and Union[type, None] as Optional[Optional]
|
|
151
|
+
# see https://github.com/google/python-fire/pull/513
|
|
152
|
+
# might need to move to different cli library
|
|
153
|
+
def mmt(
|
|
154
|
+
self,
|
|
155
|
+
name: Optional[str] = None,
|
|
156
|
+
num_machines: int = 2,
|
|
157
|
+
machine: Optional[str] = None,
|
|
158
|
+
command: Optional[str] = None,
|
|
159
|
+
image: Optional[str] = None,
|
|
160
|
+
teamspace: Optional[str] = None,
|
|
161
|
+
org: Optional[str] = None,
|
|
162
|
+
user: Optional[str] = None,
|
|
163
|
+
cloud_account: Optional[str] = None,
|
|
164
|
+
env: Optional[Dict[str, str]] = None,
|
|
165
|
+
interruptible: bool = False,
|
|
166
|
+
image_credentials: Optional[str] = None,
|
|
167
|
+
cloud_account_auth: bool = False,
|
|
168
|
+
artifacts_local: Optional[str] = None,
|
|
169
|
+
artifacts_remote: Optional[str] = None,
|
|
170
|
+
) -> None:
|
|
171
|
+
if name is None:
|
|
172
|
+
from datetime import datetime
|
|
173
|
+
|
|
174
|
+
timestr = datetime.now().strftime("%Y-%m-%d-%H-%M-%S")
|
|
175
|
+
name = f"mmt-{timestr}"
|
|
176
|
+
|
|
177
|
+
if machine is None:
|
|
178
|
+
# TODO: infer from studio
|
|
179
|
+
machine = "CPU"
|
|
180
|
+
machine_enum = Machine(machine.upper())
|
|
181
|
+
|
|
182
|
+
resolved_teamspace = Teamspace(name=teamspace, org=org, user=user)
|
|
183
|
+
if cloud_account is None:
|
|
184
|
+
cloud_account = resolved_teamspace.default_cloud_account
|
|
185
|
+
|
|
186
|
+
if image is None:
|
|
187
|
+
raise RuntimeError("Image needs to be specified to run a multi-machine job")
|
|
188
|
+
|
|
189
|
+
MMT.run(
|
|
190
|
+
name=name,
|
|
191
|
+
num_machines=num_machines,
|
|
192
|
+
machine=machine_enum,
|
|
193
|
+
command=command,
|
|
194
|
+
studio=None,
|
|
195
|
+
image=image,
|
|
196
|
+
teamspace=resolved_teamspace,
|
|
88
197
|
org=org,
|
|
89
198
|
user=user,
|
|
90
199
|
cloud_account=cloud_account,
|
lightning_sdk/cli/serve.py
CHANGED
|
@@ -2,29 +2,41 @@ import os
|
|
|
2
2
|
import subprocess
|
|
3
3
|
import warnings
|
|
4
4
|
from pathlib import Path
|
|
5
|
-
from typing import Union
|
|
5
|
+
from typing import Optional, Union
|
|
6
6
|
|
|
7
7
|
from rich.console import Console
|
|
8
|
+
from rich.progress import Progress, SpinnerColumn, TextColumn, TimeElapsedColumn
|
|
9
|
+
from rich.prompt import Confirm
|
|
8
10
|
|
|
9
11
|
|
|
10
12
|
class _LitServe:
|
|
11
13
|
"""Serve a LitServe model.
|
|
12
14
|
|
|
13
15
|
Example:
|
|
14
|
-
lightning serve api server.py
|
|
16
|
+
lightning serve api server.py # serve locally
|
|
17
|
+
lightning serve api server.py --cloud # deploy to the cloud
|
|
18
|
+
|
|
19
|
+
You can deploy the API to the cloud by running `lightning serve api server.py --cloud`.
|
|
20
|
+
This will generate a Dockerfile, build the image, and push it to the image registry.
|
|
21
|
+
Deploying to the cloud requires pre-login to the docker registry.
|
|
15
22
|
"""
|
|
16
23
|
|
|
17
24
|
def api(
|
|
18
25
|
self,
|
|
19
26
|
script_path: Union[str, Path],
|
|
20
27
|
easy: bool = False,
|
|
28
|
+
cloud: bool = False,
|
|
29
|
+
repository: Optional[str] = None,
|
|
30
|
+
non_interactive: bool = False,
|
|
21
31
|
) -> None:
|
|
22
32
|
"""Deploy a LitServe model script.
|
|
23
33
|
|
|
24
34
|
Args:
|
|
25
35
|
script_path: Path to the script to serve
|
|
26
36
|
easy: If True, generates a client for the model
|
|
27
|
-
|
|
37
|
+
cloud: If True, deploy the model to the Lightning Studio
|
|
38
|
+
repository: Optional Docker repository name (e.g., 'username/model-name')
|
|
39
|
+
non_interactive: If True, do not prompt for confirmation
|
|
28
40
|
Raises:
|
|
29
41
|
FileNotFoundError: If script_path doesn't exist
|
|
30
42
|
ImportError: If litserve is not installed
|
|
@@ -56,6 +68,10 @@ class _LitServe:
|
|
|
56
68
|
except OSError as e:
|
|
57
69
|
raise OSError(f"Failed to generate client.py: {e!s}") from None
|
|
58
70
|
|
|
71
|
+
if cloud:
|
|
72
|
+
tag = repository if repository else "litserve-model"
|
|
73
|
+
return self._handle_cloud(script_path, console, tag=tag, non_interactive=non_interactive)
|
|
74
|
+
|
|
59
75
|
try:
|
|
60
76
|
subprocess.run(
|
|
61
77
|
["python", str(script_path)],
|
|
@@ -66,20 +82,91 @@ class _LitServe:
|
|
|
66
82
|
error_msg = f"Script execution failed with exit code {e.returncode}\nstdout: {e.stdout}\nstderr: {e.stderr}"
|
|
67
83
|
raise RuntimeError(error_msg) from None
|
|
68
84
|
|
|
85
|
+
def _handle_cloud(
|
|
86
|
+
self,
|
|
87
|
+
script_path: Union[str, Path],
|
|
88
|
+
console: Console,
|
|
89
|
+
tag: str = "litserve-model",
|
|
90
|
+
non_interactive: bool = False,
|
|
91
|
+
) -> None:
|
|
92
|
+
try:
|
|
93
|
+
import docker
|
|
94
|
+
except ImportError:
|
|
95
|
+
raise ImportError("docker-py is not installed. Please install it with `pip install docker`") from None
|
|
96
|
+
|
|
97
|
+
try:
|
|
98
|
+
client = docker.from_env()
|
|
99
|
+
client.ping()
|
|
100
|
+
except docker.errors.DockerException as e:
|
|
101
|
+
raise RuntimeError(f"Failed to connect to Docker daemon: {e!s}. Is Docker running?") from None
|
|
102
|
+
|
|
103
|
+
dockerizer = _Docker()
|
|
104
|
+
path = dockerizer.api(script_path, port=8000, gpu=False, tag=tag)
|
|
105
|
+
|
|
106
|
+
console.clear()
|
|
107
|
+
if non_interactive:
|
|
108
|
+
console.print("[italic]non-interactive[/italic] mode enabled, skipping confirmation prompts", style="blue")
|
|
109
|
+
|
|
110
|
+
console.print(f"\nPlease review the Dockerfile at [u]{path}[/u] and make sure it is correct.", style="bold")
|
|
111
|
+
correct_dockerfile = True if non_interactive else Confirm.ask("Is the Dockerfile correct?", default=True)
|
|
112
|
+
if not correct_dockerfile:
|
|
113
|
+
console.print("Please fix the Dockerfile and try again.", style="red")
|
|
114
|
+
return
|
|
115
|
+
|
|
116
|
+
with Progress(
|
|
117
|
+
SpinnerColumn(),
|
|
118
|
+
TextColumn("[progress.description]{task.description}"),
|
|
119
|
+
TimeElapsedColumn(),
|
|
120
|
+
console=console,
|
|
121
|
+
transient=False,
|
|
122
|
+
) as progress:
|
|
123
|
+
build_task = progress.add_task("Building Docker image", total=None)
|
|
124
|
+
build_status = client.api.build(
|
|
125
|
+
path=os.path.dirname(path), dockerfile=path, tag=tag, decode=True, quiet=False
|
|
126
|
+
)
|
|
127
|
+
for line in build_status:
|
|
128
|
+
if "error" in line:
|
|
129
|
+
progress.stop()
|
|
130
|
+
console.print(f"\n[red]{line}[/red]")
|
|
131
|
+
return
|
|
132
|
+
if "stream" in line and line["stream"].strip():
|
|
133
|
+
console.print(line["stream"].strip(), style="bright_black")
|
|
134
|
+
progress.update(build_task, description="Building Docker image")
|
|
135
|
+
|
|
136
|
+
progress.update(build_task, description="[green]Build completed![/green]")
|
|
137
|
+
|
|
138
|
+
push_task = progress.add_task("Pushing to registry", total=None)
|
|
139
|
+
console.print("\nPushing image...", style="bold blue")
|
|
140
|
+
push_status = client.api.push(tag, stream=True, decode=True)
|
|
141
|
+
for line in push_status:
|
|
142
|
+
if "error" in line:
|
|
143
|
+
progress.stop()
|
|
144
|
+
console.print(f"\n[red]{line}[/red]")
|
|
145
|
+
return
|
|
146
|
+
if "status" in line:
|
|
147
|
+
console.print(line["status"], style="bright_black")
|
|
148
|
+
progress.update(push_task, description="Pushing to registry")
|
|
149
|
+
|
|
150
|
+
progress.update(push_task, description="[green]Push completed![/green]")
|
|
151
|
+
|
|
152
|
+
console.print(f"\n✅ Image pushed to {tag}", style="bold green")
|
|
153
|
+
console.print(
|
|
154
|
+
"Soon you will be able to deploy this model to the Lightning Studio!",
|
|
155
|
+
)
|
|
156
|
+
# TODO: Deploy to the cloud
|
|
157
|
+
|
|
69
158
|
|
|
70
159
|
class _Docker:
|
|
71
160
|
"""Generate a Dockerfile for a LitServe model."""
|
|
72
161
|
|
|
73
|
-
def api(self, server_filename: str, port: int = 8000, gpu: bool = False) ->
|
|
162
|
+
def api(self, server_filename: str, port: int = 8000, gpu: bool = False, tag: str = "litserve-model") -> str:
|
|
74
163
|
"""Generate a Dockerfile for the given server code.
|
|
75
164
|
|
|
76
|
-
Example:
|
|
77
|
-
lightning litserve dockerize server.py --port 8000 --gpu
|
|
78
|
-
|
|
79
165
|
Args:
|
|
80
|
-
server_filename
|
|
81
|
-
port
|
|
82
|
-
gpu
|
|
166
|
+
server_filename: The path to the server file, e.g. server.py or app.py.
|
|
167
|
+
port: The port to expose in the Docker container.
|
|
168
|
+
gpu: Whether to use a GPU-enabled Docker image.
|
|
169
|
+
tag: Docker image tag to use in examples.
|
|
83
170
|
"""
|
|
84
171
|
import litserve as ls
|
|
85
172
|
from litserve import docker_builder
|
|
@@ -101,10 +188,10 @@ class _Docker:
|
|
|
101
188
|
|
|
102
189
|
version = ls.__version__
|
|
103
190
|
if gpu:
|
|
104
|
-
run_cmd = f"docker run --gpus all -p {port}:{port}
|
|
191
|
+
run_cmd = f"docker run --gpus all -p {port}:{port} {tag}:latest"
|
|
105
192
|
docker_template = docker_builder.CUDA_DOCKER_TEMPLATE
|
|
106
193
|
else:
|
|
107
|
-
run_cmd = f"docker run -p {port}:{port}
|
|
194
|
+
run_cmd = f"docker run -p {port}:{port} {tag}:latest"
|
|
108
195
|
docker_template = docker_builder.DOCKERFILE_TEMPLATE
|
|
109
196
|
dockerfile_content = docker_template.format(
|
|
110
197
|
server_filename=server_filename,
|
|
@@ -119,12 +206,13 @@ class _Docker:
|
|
|
119
206
|
Update [underline]{os.path.abspath("Dockerfile")}[/underline] to add any additional dependencies or commands.
|
|
120
207
|
|
|
121
208
|
[bold]Build the container with:[/bold]
|
|
122
|
-
> [underline]docker build -t
|
|
209
|
+
> [underline]docker build -t {tag} .[/underline]
|
|
123
210
|
|
|
124
211
|
[bold]To run the Docker container on the machine:[/bold]
|
|
125
212
|
> [underline]{run_cmd}[/underline]
|
|
126
213
|
|
|
127
214
|
[bold]To push the container to a registry:[/bold]
|
|
128
|
-
> [underline]docker push
|
|
215
|
+
> [underline]docker push {tag}[/underline]
|
|
129
216
|
"""
|
|
130
217
|
console.print(success_msg)
|
|
218
|
+
return os.path.abspath("Dockerfile")
|
lightning_sdk/job/job.py
CHANGED
|
@@ -25,6 +25,8 @@ def _has_jobs_v2() -> bool:
|
|
|
25
25
|
|
|
26
26
|
|
|
27
27
|
class Job(_BaseJob):
|
|
28
|
+
"""Class to submit and manage single-machine jobs on the Lightning AI Platform."""
|
|
29
|
+
|
|
28
30
|
def __init__(
|
|
29
31
|
self,
|
|
30
32
|
name: str,
|
|
@@ -34,6 +36,15 @@ class Job(_BaseJob):
|
|
|
34
36
|
*,
|
|
35
37
|
_fetch_job: bool = True,
|
|
36
38
|
) -> None:
|
|
39
|
+
"""Fetch already existing jobs.
|
|
40
|
+
|
|
41
|
+
Args:
|
|
42
|
+
name: the name of the job
|
|
43
|
+
teamspace: the teamspace the job is part of
|
|
44
|
+
org: the name of the organization owning the `teamspace` in case it is owned by an org
|
|
45
|
+
user: the name of the user owning the `teamspace`
|
|
46
|
+
in case it is owned directly by a user instead of an org.
|
|
47
|
+
"""
|
|
37
48
|
internal_job_cls = _JobV2 if _has_jobs_v2() else _JobV1
|
|
38
49
|
|
|
39
50
|
self._internal_job = internal_job_cls(
|
lightning_sdk/mmt/__init__.py
CHANGED
lightning_sdk/mmt/base.py
CHANGED
|
@@ -1,5 +1,5 @@
|
|
|
1
1
|
from abc import abstractmethod
|
|
2
|
-
from typing import TYPE_CHECKING, Dict, Optional, Tuple, Union
|
|
2
|
+
from typing import TYPE_CHECKING, Dict, Optional, Protocol, Tuple, Union
|
|
3
3
|
|
|
4
4
|
if TYPE_CHECKING:
|
|
5
5
|
from lightning_sdk.machine import Machine
|
|
@@ -10,11 +10,36 @@ if TYPE_CHECKING:
|
|
|
10
10
|
from lightning_sdk.user import User
|
|
11
11
|
|
|
12
12
|
from lightning_sdk.job.base import _BaseJob
|
|
13
|
-
from lightning_sdk.job.job import Job
|
|
14
13
|
from lightning_sdk.utils.resolve import _resolve_deprecated_cluster
|
|
15
14
|
|
|
16
15
|
|
|
16
|
+
class MMTMachine(Protocol):
|
|
17
|
+
"""A single machine in multi-machine training."""
|
|
18
|
+
|
|
19
|
+
@property
|
|
20
|
+
def name(self) -> str:
|
|
21
|
+
"""The Name of the individual machine. Usually corresponds to the rank."""
|
|
22
|
+
...
|
|
23
|
+
|
|
24
|
+
@property
|
|
25
|
+
def machine(self) -> "Machine":
|
|
26
|
+
"""The actual machine type this node is running on."""
|
|
27
|
+
...
|
|
28
|
+
|
|
29
|
+
@property
|
|
30
|
+
def artifact_path(self) -> Optional[str]:
|
|
31
|
+
"""The path to the artifacts of this job."""
|
|
32
|
+
...
|
|
33
|
+
|
|
34
|
+
@property
|
|
35
|
+
def status(self) -> "Status":
|
|
36
|
+
"""The status of this job."""
|
|
37
|
+
...
|
|
38
|
+
|
|
39
|
+
|
|
17
40
|
class _BaseMMT(_BaseJob):
|
|
41
|
+
"""Base interface to all job types."""
|
|
42
|
+
|
|
18
43
|
@classmethod
|
|
19
44
|
def run(
|
|
20
45
|
cls,
|
|
@@ -36,6 +61,39 @@ class _BaseMMT(_BaseJob):
|
|
|
36
61
|
artifacts_remote: Optional[str] = None,
|
|
37
62
|
cluster: Optional[str] = None, # deprecated in favor of cloud_account
|
|
38
63
|
) -> "_BaseMMT":
|
|
64
|
+
"""Run async workloads using a docker image across multiple machines.
|
|
65
|
+
|
|
66
|
+
Args:
|
|
67
|
+
name: The name of the job. Needs to be unique within the teamspace.
|
|
68
|
+
machine: The machine type to run the job on. One of {", ".join(_MACHINE_VALUES)}.
|
|
69
|
+
num_machines: The number of machines to run on.
|
|
70
|
+
command: The command to run inside your job. Required if using a studio. Optional if using an image.
|
|
71
|
+
If not provided for images, will run the container entrypoint and default command.
|
|
72
|
+
studio: The studio env to run the job with. Mutually exclusive with image.
|
|
73
|
+
image: The docker image to run the job with. Mutually exclusive with studio.
|
|
74
|
+
teamspace: The teamspace the job should be associated with. Defaults to the current teamspace.
|
|
75
|
+
org: The organization owning the teamspace (if any). Defaults to the current organization.
|
|
76
|
+
user: The user owning the teamspace (if any). Defaults to the current user.
|
|
77
|
+
cloud_account: The cloud account to run the job on.
|
|
78
|
+
Defaults to the studio cloud account if running with studio compute env.
|
|
79
|
+
If not provided will fall back to the teamspaces default cloud account.
|
|
80
|
+
env: Environment variables to set inside the job.
|
|
81
|
+
interruptible: Whether the job should run on interruptible instances. They are cheaper but can be preempted.
|
|
82
|
+
image_credentials: The credentials used to pull the image. Required if the image is private.
|
|
83
|
+
This should be the name of the respective credentials secret created on the Lightning AI platform.
|
|
84
|
+
cloud_account_auth: Whether to authenticate with the cloud account to pull the image.
|
|
85
|
+
Required if the registry is part of a cloud provider (e.g. ECR).
|
|
86
|
+
artifacts_local: The path inside the docker container that you want to persist images from.
|
|
87
|
+
CAUTION: When setting this to "/", it will effectively erase your container.
|
|
88
|
+
Only supported for jobs with a docker image compute environment.
|
|
89
|
+
artifacts_remote: The remote storage to persist your artifacts to.
|
|
90
|
+
Should be of format <CONNECTION_TYPE>:<CONNECTION_NAME>:<PATH_WITHIN_CONNECTION>.
|
|
91
|
+
PATH_WITHIN_CONNECTION hereby is a path relative to the connection's root.
|
|
92
|
+
E.g. efs:data:some-path would result in an EFS connection named `data` and to the path `some-path`
|
|
93
|
+
within it.
|
|
94
|
+
Note that the connection needs to be added to the teamspace already in order for it to be found.
|
|
95
|
+
Only supported for jobs with a docker image compute environment.
|
|
96
|
+
"""
|
|
39
97
|
from lightning_sdk.studio import Studio
|
|
40
98
|
|
|
41
99
|
cloud_account = _resolve_deprecated_cluster(cloud_account, cluster)
|
|
@@ -134,53 +192,87 @@ class _BaseMMT(_BaseJob):
|
|
|
134
192
|
artifacts_local: Optional[str] = None,
|
|
135
193
|
artifacts_remote: Optional[str] = None,
|
|
136
194
|
) -> None:
|
|
137
|
-
"""
|
|
195
|
+
"""Submit a new multi-machine job to the Lightning AI platform.
|
|
196
|
+
|
|
197
|
+
Args:
|
|
198
|
+
num_machines: The number of machines to run on.
|
|
199
|
+
machine: The machine type to run the job on. One of {", ".join(_MACHINE_VALUES)}.
|
|
200
|
+
command: The command to run inside your job. Required if using a studio. Optional if using an image.
|
|
201
|
+
If not provided for images, will run the container entrypoint and default command.
|
|
202
|
+
studio: The studio env to run the job with. Mutually exclusive with image.
|
|
203
|
+
image: The docker image to run the job with. Mutually exclusive with studio.
|
|
204
|
+
env: Environment variables to set inside the job.
|
|
205
|
+
interruptible: Whether the job should run on interruptible instances. They are cheaper but can be preempted.
|
|
206
|
+
cloud_account: The cloud account to run the job on.
|
|
207
|
+
Defaults to the studio cloud account if running with studio compute env.
|
|
208
|
+
If not provided will fall back to the teamspaces default cloud account.
|
|
209
|
+
image_credentials: The credentials used to pull the image. Required if the image is private.
|
|
210
|
+
This should be the name of the respective credentials secret created on the Lightning AI platform.
|
|
211
|
+
cloud_account_auth: Whether to authenticate with the cloud account to pull the image.
|
|
212
|
+
Required if the registry is part of a cloud provider (e.g. ECR).
|
|
213
|
+
artifacts_local: The path inside the docker container that you want to persist images from.
|
|
214
|
+
CAUTION: When setting this to "/", it will effectively erase your container.
|
|
215
|
+
Only supported for jobs with a docker image compute environment.
|
|
216
|
+
artifacts_remote: The remote storage to persist your artifacts to.
|
|
217
|
+
Should be of format <CONNECTION_TYPE>:<CONNECTION_NAME>:<PATH_WITHIN_CONNECTION>.
|
|
218
|
+
PATH_WITHIN_CONNECTION hereby is a path relative to the connection's root.
|
|
219
|
+
E.g. efs:data:some-path would result in an EFS connection named `data` and to the path `some-path`
|
|
220
|
+
within it.
|
|
221
|
+
Note that the connection needs to be added to the teamspace already in order for it to be found.
|
|
222
|
+
Only supported for jobs with a docker image compute environment.
|
|
223
|
+
"""
|
|
138
224
|
|
|
139
225
|
@property
|
|
140
226
|
@abstractmethod
|
|
141
|
-
def machines(self) -> Tuple[
|
|
142
|
-
|
|
227
|
+
def machines(self) -> Tuple[MMTMachine, ...]:
|
|
228
|
+
"""Returns the sub-jobs for each individual instance."""
|
|
143
229
|
|
|
144
230
|
@property
|
|
145
231
|
@abstractmethod
|
|
146
232
|
def machine(self) -> "Machine":
|
|
147
|
-
|
|
233
|
+
"""Returns the machine type this job is running on."""
|
|
148
234
|
|
|
149
235
|
@abstractmethod
|
|
150
236
|
def stop(self) -> None:
|
|
151
|
-
|
|
237
|
+
"""Stops the job."""
|
|
152
238
|
|
|
153
239
|
@abstractmethod
|
|
154
240
|
def delete(self) -> None:
|
|
155
|
-
|
|
241
|
+
"""Deletes the job.
|
|
242
|
+
|
|
243
|
+
Caution: This also deletes all artifacts and snapshots associated with the job.
|
|
244
|
+
"""
|
|
156
245
|
|
|
157
246
|
@property
|
|
158
247
|
@abstractmethod
|
|
159
248
|
def status(self) -> "Status":
|
|
160
|
-
|
|
249
|
+
"""The current status of the job."""
|
|
161
250
|
|
|
162
251
|
@property
|
|
163
252
|
@abstractmethod
|
|
164
253
|
def artifact_path(self) -> Optional[str]:
|
|
165
|
-
|
|
254
|
+
"""Path to the artifacts created by the job within the distributed teamspace filesystem."""
|
|
166
255
|
|
|
167
256
|
@property
|
|
168
257
|
@abstractmethod
|
|
169
258
|
def snapshot_path(self) -> Optional[str]:
|
|
170
|
-
|
|
259
|
+
"""Path to the studio snapshot used to create the job within the distributed teamspace filesystem."""
|
|
171
260
|
|
|
172
261
|
@property
|
|
173
262
|
def share_path(self) -> Optional[str]:
|
|
263
|
+
"""Path to the jobs share path."""
|
|
174
264
|
return None
|
|
175
265
|
|
|
176
|
-
@abstractmethod
|
|
177
|
-
def _update_internal_job(self) -> None:
|
|
178
|
-
pass
|
|
179
|
-
|
|
180
266
|
@property
|
|
181
267
|
def name(self) -> str:
|
|
268
|
+
"""The job's name."""
|
|
182
269
|
return self._name
|
|
183
270
|
|
|
184
271
|
@property
|
|
185
272
|
def teamspace(self) -> "Teamspace":
|
|
273
|
+
"""The teamspace the job is part of."""
|
|
186
274
|
return self._teamspace
|
|
275
|
+
|
|
276
|
+
@abstractmethod
|
|
277
|
+
def _update_internal_job(self) -> None:
|
|
278
|
+
pass
|
lightning_sdk/mmt/mmt.py
CHANGED
|
@@ -1,7 +1,9 @@
|
|
|
1
|
-
from
|
|
1
|
+
from functools import lru_cache
|
|
2
|
+
from typing import TYPE_CHECKING, Any, Dict, Optional, Tuple, Union
|
|
2
3
|
|
|
4
|
+
from lightning_sdk.api.user_api import UserApi
|
|
3
5
|
from lightning_sdk.job.job import _has_jobs_v2
|
|
4
|
-
from lightning_sdk.mmt.base import _BaseMMT
|
|
6
|
+
from lightning_sdk.mmt.base import MMTMachine, _BaseMMT
|
|
5
7
|
from lightning_sdk.mmt.v1 import _MMTV1
|
|
6
8
|
from lightning_sdk.mmt.v2 import _MMTV2
|
|
7
9
|
|
|
@@ -14,27 +16,22 @@ if TYPE_CHECKING:
|
|
|
14
16
|
from lightning_sdk.user import User
|
|
15
17
|
|
|
16
18
|
|
|
17
|
-
|
|
18
|
-
|
|
19
|
+
@lru_cache(maxsize=None)
|
|
20
|
+
def _has_mmt_v2() -> bool:
|
|
21
|
+
# users need both mmtv2 and jobsv2 flags in order for mmtv2 to work correctly
|
|
22
|
+
if not _has_jobs_v2():
|
|
23
|
+
return False
|
|
19
24
|
|
|
20
|
-
|
|
21
|
-
|
|
22
|
-
|
|
23
|
-
|
|
24
|
-
|
|
25
|
-
def machine(self) -> "Machine":
|
|
26
|
-
...
|
|
27
|
-
|
|
28
|
-
@property
|
|
29
|
-
def artifact_path(self) -> Optional[str]:
|
|
30
|
-
...
|
|
31
|
-
|
|
32
|
-
@property
|
|
33
|
-
def status(self) -> "Status":
|
|
34
|
-
...
|
|
25
|
+
api = UserApi()
|
|
26
|
+
try:
|
|
27
|
+
return api._get_feature_flags().mmt_v2
|
|
28
|
+
except Exception:
|
|
29
|
+
return False
|
|
35
30
|
|
|
36
31
|
|
|
37
32
|
class MMT(_BaseMMT):
|
|
33
|
+
"""Class to submit and manage multi-machine jobs on the Lightning AI Platform."""
|
|
34
|
+
|
|
38
35
|
_force_v1: (
|
|
39
36
|
bool
|
|
40
37
|
) = False # required for studio plugin still working correctly as v2 currently does not support the studio env
|
|
@@ -48,7 +45,16 @@ class MMT(_BaseMMT):
|
|
|
48
45
|
*,
|
|
49
46
|
_fetch_job: bool = True,
|
|
50
47
|
) -> None:
|
|
51
|
-
|
|
48
|
+
"""Fetch already existing jobs.
|
|
49
|
+
|
|
50
|
+
Args:
|
|
51
|
+
name: the name of the job
|
|
52
|
+
teamspace: the teamspace the job is part of
|
|
53
|
+
org: the name of the organization owning the `teamspace` in case it is owned by an org
|
|
54
|
+
user: the name of the user owning the `teamspace`
|
|
55
|
+
in case it is owned directly by a user instead of an org.
|
|
56
|
+
"""
|
|
57
|
+
internal_mmt_cls = _MMTV2 if _has_mmt_v2() and not self._force_v1 else _MMTV1
|
|
52
58
|
|
|
53
59
|
self._internal_mmt = internal_mmt_cls(
|
|
54
60
|
name=name,
|
|
@@ -79,6 +85,39 @@ class MMT(_BaseMMT):
|
|
|
79
85
|
artifacts_remote: Optional[str] = None,
|
|
80
86
|
cluster: Optional[str] = None, # deprecated in favor of cloud_account
|
|
81
87
|
) -> "MMT":
|
|
88
|
+
"""Run async workloads using a docker image across multiple machines.
|
|
89
|
+
|
|
90
|
+
Args:
|
|
91
|
+
name: The name of the job. Needs to be unique within the teamspace.
|
|
92
|
+
machine: The machine type to run the job on. One of {", ".join(_MACHINE_VALUES)}.
|
|
93
|
+
num_machines: The number of machines to run on.
|
|
94
|
+
command: The command to run inside your job. Required if using a studio. Optional if using an image.
|
|
95
|
+
If not provided for images, will run the container entrypoint and default command.
|
|
96
|
+
studio: The studio env to run the job with. Mutually exclusive with image.
|
|
97
|
+
image: The docker image to run the job with. Mutually exclusive with studio.
|
|
98
|
+
teamspace: The teamspace the job should be associated with. Defaults to the current teamspace.
|
|
99
|
+
org: The organization owning the teamspace (if any). Defaults to the current organization.
|
|
100
|
+
user: The user owning the teamspace (if any). Defaults to the current user.
|
|
101
|
+
cloud_account: The cloud account to run the job on.
|
|
102
|
+
Defaults to the studio cloud account if running with studio compute env.
|
|
103
|
+
If not provided will fall back to the teamspaces default cloud account.
|
|
104
|
+
env: Environment variables to set inside the job.
|
|
105
|
+
interruptible: Whether the job should run on interruptible instances. They are cheaper but can be preempted.
|
|
106
|
+
image_credentials: The credentials used to pull the image. Required if the image is private.
|
|
107
|
+
This should be the name of the respective credentials secret created on the Lightning AI platform.
|
|
108
|
+
cloud_account_auth: Whether to authenticate with the cloud account to pull the image.
|
|
109
|
+
Required if the registry is part of a cloud provider (e.g. ECR).
|
|
110
|
+
artifacts_local: The path inside the docker container that you want to persist images from.
|
|
111
|
+
CAUTION: When setting this to "/", it will effectively erase your container.
|
|
112
|
+
Only supported for jobs with a docker image compute environment.
|
|
113
|
+
artifacts_remote: The remote storage to persist your artifacts to.
|
|
114
|
+
Should be of format <CONNECTION_TYPE>:<CONNECTION_NAME>:<PATH_WITHIN_CONNECTION>.
|
|
115
|
+
PATH_WITHIN_CONNECTION hereby is a path relative to the connection's root.
|
|
116
|
+
E.g. efs:data:some-path would result in an EFS connection named `data` and to the path `some-path`
|
|
117
|
+
within it.
|
|
118
|
+
Note that the connection needs to be added to the teamspace already in order for it to be found.
|
|
119
|
+
Only supported for jobs with a docker image compute environment.
|
|
120
|
+
"""
|
|
82
121
|
ret_val = super().run(
|
|
83
122
|
name=name,
|
|
84
123
|
num_machines=num_machines,
|
|
@@ -117,6 +156,35 @@ class MMT(_BaseMMT):
|
|
|
117
156
|
artifacts_local: Optional[str] = None,
|
|
118
157
|
artifacts_remote: Optional[str] = None,
|
|
119
158
|
) -> "MMT":
|
|
159
|
+
"""Submit a new multi-machine job to the Lightning AI platform.
|
|
160
|
+
|
|
161
|
+
Args:
|
|
162
|
+
num_machines: The number of machines to run on.
|
|
163
|
+
machine: The machine type to run the job on. One of {", ".join(_MACHINE_VALUES)}.
|
|
164
|
+
command: The command to run inside your job. Required if using a studio. Optional if using an image.
|
|
165
|
+
If not provided for images, will run the container entrypoint and default command.
|
|
166
|
+
studio: The studio env to run the job with. Mutually exclusive with image.
|
|
167
|
+
image: The docker image to run the job with. Mutually exclusive with studio.
|
|
168
|
+
env: Environment variables to set inside the job.
|
|
169
|
+
interruptible: Whether the job should run on interruptible instances. They are cheaper but can be preempted.
|
|
170
|
+
cloud_account: The cloud account to run the job on.
|
|
171
|
+
Defaults to the studio cloud account if running with studio compute env.
|
|
172
|
+
If not provided, will fall back to the teamspace's default cloud account.
|
|
173
|
+
image_credentials: The credentials used to pull the image. Required if the image is private.
|
|
174
|
+
This should be the name of the respective credentials secret created on the Lightning AI platform.
|
|
175
|
+
cloud_account_auth: Whether to authenticate with the cloud account to pull the image.
|
|
176
|
+
Required if the registry is part of a cloud provider (e.g. ECR).
|
|
177
|
+
artifacts_local: The path inside the docker container that you want to persist artifacts from.
|
|
178
|
+
CAUTION: When setting this to "/", it will effectively erase your container.
|
|
179
|
+
Only supported for jobs with a docker image compute environment.
|
|
180
|
+
artifacts_remote: The remote storage to persist your artifacts to.
|
|
181
|
+
Should be of format <CONNECTION_TYPE>:<CONNECTION_NAME>:<PATH_WITHIN_CONNECTION>.
|
|
182
|
+
PATH_WITHIN_CONNECTION hereby is a path relative to the connection's root.
|
|
183
|
+
E.g. efs:data:some-path would resolve to an EFS connection named `data` and to the path `some-path`
|
|
184
|
+
within it.
|
|
185
|
+
Note that the connection needs to be added to the teamspace already in order for it to be found.
|
|
186
|
+
Only supported for jobs with a docker image compute environment.
|
|
187
|
+
"""
|
|
120
188
|
self._job = self._internal_mmt._submit(
|
|
121
189
|
num_machines=num_machines,
|
|
122
190
|
machine=machine,
|
|
@@ -134,33 +202,44 @@ class MMT(_BaseMMT):
|
|
|
134
202
|
return self
|
|
135
203
|
|
|
136
204
|
def stop(self) -> None:
|
|
205
|
+
"""Stops the job."""
|
|
137
206
|
return self._internal_mmt.stop()
|
|
138
207
|
|
|
139
208
|
def delete(self) -> None:
|
|
209
|
+
"""Deletes the job.
|
|
210
|
+
|
|
211
|
+
Caution: This also deletes all artifacts and snapshots associated with the job.
|
|
212
|
+
"""
|
|
140
213
|
return self._internal_mmt.delete()
|
|
141
214
|
|
|
142
215
|
@property
|
|
143
216
|
def status(self) -> "Status":
|
|
217
|
+
"""The current status of the job (accumulated over all machines)."""
|
|
144
218
|
return self._internal_mmt.status
|
|
145
219
|
|
|
146
220
|
@property
|
|
147
221
|
def machines(self) -> Tuple[MMTMachine, ...]:
|
|
222
|
+
"""Returns the sub-jobs for each individual instance."""
|
|
148
223
|
return self._internal_mmt.machines
|
|
149
224
|
|
|
150
225
|
@property
|
|
151
226
|
def machine(self) -> "Machine":
|
|
227
|
+
"""Returns the machine type this job is running on."""
|
|
152
228
|
return self._internal_mmt.machine
|
|
153
229
|
|
|
154
230
|
@property
|
|
155
231
|
def artifact_path(self) -> Optional[str]:
|
|
232
|
+
"""Path to the artifacts created by the job within the distributed teamspace filesystem."""
|
|
156
233
|
return self._internal_mmt.artifact_path
|
|
157
234
|
|
|
158
235
|
@property
|
|
159
236
|
def snapshot_path(self) -> Optional[str]:
|
|
237
|
+
"""Path to the studio snapshot used to create the job within the distributed teamspace filesystem."""
|
|
160
238
|
return self._internal_mmt.snapshot_path
|
|
161
239
|
|
|
162
240
|
@property
|
|
163
241
|
def share_path(self) -> Optional[str]:
|
|
242
|
+
"""Path to the jobs share path."""
|
|
164
243
|
return None
|
|
165
244
|
|
|
166
245
|
def _update_internal_job(self) -> None:
|
|
@@ -168,16 +247,14 @@ class MMT(_BaseMMT):
|
|
|
168
247
|
|
|
169
248
|
@property
|
|
170
249
|
def name(self) -> str:
|
|
250
|
+
"""The job's name."""
|
|
171
251
|
return self._internal_mmt.name
|
|
172
252
|
|
|
173
253
|
@property
|
|
174
254
|
def teamspace(self) -> "Teamspace":
|
|
255
|
+
"""The teamspace the job is part of."""
|
|
175
256
|
return self._internal_mmt._teamspace
|
|
176
257
|
|
|
177
|
-
@property
|
|
178
|
-
def cloud_account(self) -> Optional[str]:
|
|
179
|
-
return self._internal_mmt.cloud_account
|
|
180
|
-
|
|
181
258
|
def __getattr__(self, key: str) -> Any:
|
|
182
259
|
"""Forward the attribute lookup to the internal job implementation."""
|
|
183
260
|
try:
|
lightning_sdk/mmt/v1.py
CHANGED
|
@@ -16,6 +16,8 @@ from lightning_sdk.mmt.base import _BaseMMT
|
|
|
16
16
|
|
|
17
17
|
|
|
18
18
|
class _MMTV1(_BaseMMT):
|
|
19
|
+
"""V1 Implementation of Multi-Machine Training."""
|
|
20
|
+
|
|
19
21
|
def __init__(
|
|
20
22
|
self,
|
|
21
23
|
name: str,
|
|
@@ -25,6 +27,15 @@ class _MMTV1(_BaseMMT):
|
|
|
25
27
|
*,
|
|
26
28
|
_fetch_job: bool = True,
|
|
27
29
|
) -> None:
|
|
30
|
+
"""Fetch already existing jobs.
|
|
31
|
+
|
|
32
|
+
Args:
|
|
33
|
+
name: the name of the job
|
|
34
|
+
teamspace: the teamspace the job is part of
|
|
35
|
+
org: the name of the organization owning the `teamspace` in case it is owned by an org
|
|
36
|
+
user: the name of the user owning the `teamspace`
|
|
37
|
+
in case it is owned directly by a user instead of an org.
|
|
38
|
+
"""
|
|
28
39
|
self._job_api = MMTApiV1()
|
|
29
40
|
super().__init__(name=name, teamspace=teamspace, org=org, user=user, _fetch_job=_fetch_job)
|
|
30
41
|
|
|
@@ -43,6 +54,35 @@ class _MMTV1(_BaseMMT):
|
|
|
43
54
|
artifacts_local: Optional[str] = None,
|
|
44
55
|
artifacts_remote: Optional[str] = None,
|
|
45
56
|
) -> "_MMTV1":
|
|
57
|
+
"""Submit a new multi-machine job to the Lightning AI platform.
|
|
58
|
+
|
|
59
|
+
Args:
|
|
60
|
+
num_machines: The number of machines to run on.
|
|
61
|
+
machine: The machine type to run the job on. One of {", ".join(_MACHINE_VALUES)}.
|
|
62
|
+
command: The command to run inside your job. Required if using a studio. Optional if using an image.
|
|
63
|
+
If not provided for images, will run the container entrypoint and default command.
|
|
64
|
+
studio: The studio env to run the job with. Mutually exclusive with image.
|
|
65
|
+
image: The docker image to run the job with. Mutually exclusive with studio.
|
|
66
|
+
env: Environment variables to set inside the job.
|
|
67
|
+
interruptible: Whether the job should run on interruptible instances. They are cheaper but can be preempted.
|
|
68
|
+
cloud_account: The cloud account to run the job on.
|
|
69
|
+
Defaults to the studio cloud account if running with studio compute env.
|
|
70
|
+
If not provided, will fall back to the teamspace's default cloud account.
|
|
71
|
+
image_credentials: The credentials used to pull the image. Required if the image is private.
|
|
72
|
+
This should be the name of the respective credentials secret created on the Lightning AI platform.
|
|
73
|
+
cloud_account_auth: Whether to authenticate with the cloud account to pull the image.
|
|
74
|
+
Required if the registry is part of a cloud provider (e.g. ECR).
|
|
75
|
+
artifacts_local: The path inside the docker container that you want to persist artifacts from.
|
|
76
|
+
CAUTION: When setting this to "/", it will effectively erase your container.
|
|
77
|
+
Only supported for jobs with a docker image compute environment.
|
|
78
|
+
artifacts_remote: The remote storage to persist your artifacts to.
|
|
79
|
+
Should be of format <CONNECTION_TYPE>:<CONNECTION_NAME>:<PATH_WITHIN_CONNECTION>.
|
|
80
|
+
PATH_WITHIN_CONNECTION hereby is a path relative to the connection's root.
|
|
81
|
+
E.g. efs:data:some-path would resolve to an EFS connection named `data` and to the path `some-path`
|
|
82
|
+
within it.
|
|
83
|
+
Note that the connection needs to be added to the teamspace already in order for it to be found.
|
|
84
|
+
Only supported for jobs with a docker image compute environment.
|
|
85
|
+
"""
|
|
46
86
|
if studio is None:
|
|
47
87
|
raise ValueError("Studio is required for submitting jobs")
|
|
48
88
|
if image is not None or image_credentials is not None or cloud_account_auth:
|
|
@@ -80,18 +120,25 @@ class _MMTV1(_BaseMMT):
|
|
|
80
120
|
|
|
81
121
|
@property
|
|
82
122
|
def machines(self) -> Tuple["Work", ...]:
|
|
123
|
+
"""Returns the sub-jobs for each individual instance."""
|
|
83
124
|
works = self._job_api.list_works(self._guaranteed_job.id, self.teamspace.id)
|
|
84
125
|
|
|
85
126
|
return tuple(Work(w.id, self, self.teamspace) for w in works)
|
|
86
127
|
|
|
87
128
|
def stop(self) -> None:
|
|
129
|
+
"""Stops the job."""
|
|
88
130
|
self._job_api.stop_job(self._guaranteed_job.id, self.teamspace.id)
|
|
89
131
|
|
|
90
132
|
def delete(self) -> None:
|
|
133
|
+
"""Deletes the job.
|
|
134
|
+
|
|
135
|
+
Caution: This also deletes all artifacts and snapshots associated with the job.
|
|
136
|
+
"""
|
|
91
137
|
self._job_api.delete_job(self._guaranteed_job.id, self.teamspace.id)
|
|
92
138
|
|
|
93
139
|
@property
|
|
94
140
|
def status(self) -> "Status":
|
|
141
|
+
"""The current status of the job."""
|
|
95
142
|
try:
|
|
96
143
|
status = self._job_api.get_job_status(self._job.id, self.teamspace.id)
|
|
97
144
|
return _internal_status_to_external_status(status)
|
|
@@ -102,22 +149,27 @@ class _MMTV1(_BaseMMT):
|
|
|
102
149
|
|
|
103
150
|
@property
|
|
104
151
|
def artifact_path(self) -> Optional[str]:
|
|
152
|
+
"""Path to the artifacts created by the job within the distributed teamspace filesystem."""
|
|
105
153
|
return f"/teamspace/jobs/{self.name}"
|
|
106
154
|
|
|
107
155
|
@property
|
|
108
156
|
def snapshot_path(self) -> Optional[str]:
|
|
157
|
+
"""Path to the studio snapshot used to create the job within the distributed teamspace filesystem."""
|
|
109
158
|
return f"/teamspace/jobs/{self.name}/snapshot"
|
|
110
159
|
|
|
111
160
|
@property
|
|
112
161
|
def machine(self) -> "Machine":
|
|
162
|
+
"""Returns the machine type this job is running on."""
|
|
113
163
|
return self.machines[0].machine
|
|
114
164
|
|
|
115
165
|
@property
|
|
116
166
|
def name(self) -> str:
|
|
167
|
+
"""The job's name."""
|
|
117
168
|
return self._name
|
|
118
169
|
|
|
119
170
|
@property
|
|
120
171
|
def teamspace(self) -> "Teamspace":
|
|
172
|
+
"""The teamspace the job is part of."""
|
|
121
173
|
return self._teamspace
|
|
122
174
|
|
|
123
175
|
# the following properties and functions exist solely to make the Work class function
|
lightning_sdk/mmt/v2.py
CHANGED
|
@@ -15,6 +15,8 @@ from lightning_sdk.mmt.base import _BaseMMT
|
|
|
15
15
|
|
|
16
16
|
|
|
17
17
|
class _MMTV2(_BaseMMT):
|
|
18
|
+
"""New implementation of Multi-Machine Training."""
|
|
19
|
+
|
|
18
20
|
def __init__(
|
|
19
21
|
self,
|
|
20
22
|
name: str,
|
|
@@ -24,6 +26,15 @@ class _MMTV2(_BaseMMT):
|
|
|
24
26
|
*,
|
|
25
27
|
_fetch_job: bool = True,
|
|
26
28
|
) -> None:
|
|
29
|
+
"""Fetch already existing jobs.
|
|
30
|
+
|
|
31
|
+
Args:
|
|
32
|
+
name: the name of the job
|
|
33
|
+
teamspace: the teamspace the job is part of
|
|
34
|
+
org: the name of the organization owning the `teamspace` in case it is owned by an org
|
|
35
|
+
user: the name of the user owning the `teamspace`
|
|
36
|
+
in case it is owned directly by a user instead of an org.
|
|
37
|
+
"""
|
|
27
38
|
self._job_api = MMTApiV2()
|
|
28
39
|
super().__init__(name=name, teamspace=teamspace, org=org, user=user, _fetch_job=_fetch_job)
|
|
29
40
|
|
|
@@ -42,6 +53,35 @@ class _MMTV2(_BaseMMT):
|
|
|
42
53
|
artifacts_local: Optional[str] = None,
|
|
43
54
|
artifacts_remote: Optional[str] = None,
|
|
44
55
|
) -> "_MMTV2":
|
|
56
|
+
"""Submit a new multi-machine job to the Lightning AI platform.
|
|
57
|
+
|
|
58
|
+
Args:
|
|
59
|
+
num_machines: The number of machines to run on.
|
|
60
|
+
machine: The machine type to run the job on. One of {", ".join(_MACHINE_VALUES)}.
|
|
61
|
+
command: The command to run inside your job. Required if using a studio. Optional if using an image.
|
|
62
|
+
If not provided for images, will run the container entrypoint and default command.
|
|
63
|
+
studio: The studio env to run the job with. Mutually exclusive with image.
|
|
64
|
+
image: The docker image to run the job with. Mutually exclusive with studio.
|
|
65
|
+
env: Environment variables to set inside the job.
|
|
66
|
+
interruptible: Whether the job should run on interruptible instances. They are cheaper but can be preempted.
|
|
67
|
+
cloud_account: The cloud account to run the job on.
|
|
68
|
+
Defaults to the studio cloud account if running with studio compute env.
|
|
69
|
+
If not provided, will fall back to the teamspace's default cloud account.
|
|
70
|
+
image_credentials: The credentials used to pull the image. Required if the image is private.
|
|
71
|
+
This should be the name of the respective credentials secret created on the Lightning AI platform.
|
|
72
|
+
cloud_account_auth: Whether to authenticate with the cloud account to pull the image.
|
|
73
|
+
Required if the registry is part of a cloud provider (e.g. ECR).
|
|
74
|
+
artifacts_local: The path inside the docker container that you want to persist artifacts from.
|
|
75
|
+
CAUTION: When setting this to "/", it will effectively erase your container.
|
|
76
|
+
Only supported for jobs with a docker image compute environment.
|
|
77
|
+
artifacts_remote: The remote storage to persist your artifacts to.
|
|
78
|
+
Should be of format <CONNECTION_TYPE>:<CONNECTION_NAME>:<PATH_WITHIN_CONNECTION>.
|
|
79
|
+
PATH_WITHIN_CONNECTION hereby is a path relative to the connection's root.
|
|
80
|
+
E.g. efs:data:some-path would resolve to an EFS connection named `data` and to the path `some-path`
|
|
81
|
+
within it.
|
|
82
|
+
Note that the connection needs to be added to the teamspace already in order for it to be found.
|
|
83
|
+
Only supported for jobs with a docker image compute environment.
|
|
84
|
+
"""
|
|
45
85
|
# Command is required if Studio is provided to know what to run
|
|
46
86
|
# Image is mutually exclusive with Studio
|
|
47
87
|
# Command is optional for Image
|
|
@@ -80,6 +120,7 @@ class _MMTV2(_BaseMMT):
|
|
|
80
120
|
|
|
81
121
|
@property
|
|
82
122
|
def machines(self) -> Tuple["Job", ...]:
|
|
123
|
+
"""Returns the sub-jobs for each individual instance."""
|
|
83
124
|
from lightning_sdk.job import Job
|
|
84
125
|
|
|
85
126
|
return tuple(
|
|
@@ -88,9 +129,14 @@ class _MMTV2(_BaseMMT):
|
|
|
88
129
|
)
|
|
89
130
|
|
|
90
131
|
def stop(self) -> None:
|
|
132
|
+
"""Stops the job."""
|
|
91
133
|
self._job_api.stop_job(job_id=self._guaranteed_job.id, teamspace_id=self._teamspace.id)
|
|
92
134
|
|
|
93
135
|
def delete(self) -> None:
|
|
136
|
+
"""Deletes the job.
|
|
137
|
+
|
|
138
|
+
Caution: This also deletes all artifacts and snapshots associated with the job.
|
|
139
|
+
"""
|
|
94
140
|
self._job_api.delete_job(
|
|
95
141
|
job_id=self._guaranteed_job.id,
|
|
96
142
|
teamspace_id=self._teamspace.id,
|
|
@@ -104,20 +150,24 @@ class _MMTV2(_BaseMMT):
|
|
|
104
150
|
|
|
105
151
|
@property
|
|
106
152
|
def status(self) -> "Status":
|
|
153
|
+
"""The current status of the job."""
|
|
107
154
|
return self._job_api._job_state_to_external(self._latest_job.state)
|
|
108
155
|
|
|
109
156
|
@property
|
|
110
157
|
def artifact_path(self) -> Optional[str]:
|
|
158
|
+
"""Path to the artifacts created by the job within the distributed teamspace filesystem."""
|
|
111
159
|
# TODO: Since grouping for those is not done yet on the BE, we cannot yet have a unified link here
|
|
112
160
|
raise NotImplementedError
|
|
113
161
|
|
|
114
162
|
@property
|
|
115
163
|
def snapshot_path(self) -> Optional[str]:
|
|
164
|
+
"""Path to the studio snapshot used to create the job within the distributed teamspace filesystem."""
|
|
116
165
|
# TODO: Since grouping for those is not done yet on the BE, we cannot yet have a unified link here
|
|
117
166
|
raise NotImplementedError
|
|
118
167
|
|
|
119
168
|
@property
|
|
120
169
|
def machine(self) -> "Machine":
|
|
170
|
+
"""Returns the machine type this job is running on."""
|
|
121
171
|
return self._job_api._get_job_machine_from_spec(self._guaranteed_job.spec)
|
|
122
172
|
|
|
123
173
|
def _update_internal_job(self) -> None:
|
|
@@ -129,8 +179,10 @@ class _MMTV2(_BaseMMT):
|
|
|
129
179
|
|
|
130
180
|
@property
|
|
131
181
|
def name(self) -> str:
|
|
182
|
+
"""The job's name."""
|
|
132
183
|
return self._name
|
|
133
184
|
|
|
134
185
|
@property
|
|
135
186
|
def teamspace(self) -> "Teamspace":
|
|
187
|
+
"""The teamspace the job is part of."""
|
|
136
188
|
return self._teamspace
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.1
|
|
2
2
|
Name: lightning_sdk
|
|
3
|
-
Version: 0.1.42
|
|
3
|
+
Version: 0.1.43
|
|
4
4
|
Summary: SDK to develop using Lightning AI Studios
|
|
5
5
|
Author-email: Lightning-AI <justus@lightning.ai>
|
|
6
6
|
License: MIT License
|
|
@@ -47,6 +47,7 @@ Requires-Dist: simple-term-menu
|
|
|
47
47
|
Requires-Dist: lightning-utilities
|
|
48
48
|
Provides-Extra: serve
|
|
49
49
|
Requires-Dist: litserve>=0.2.5; extra == "serve"
|
|
50
|
+
Requires-Dist: docker; extra == "serve"
|
|
50
51
|
|
|
51
52
|
# Lightning SDK
|
|
52
53
|
|
|
@@ -1,5 +1,5 @@
|
|
|
1
1
|
docs/source/conf.py,sha256=r8yX20eC-4mHhMTd0SbQb5TlSWHhO6wnJ0VJ_FBFpag,13249
|
|
2
|
-
lightning_sdk/__init__.py,sha256=
|
|
2
|
+
lightning_sdk/__init__.py,sha256=je3-DFVBfoAqqPLrFLZmVZGWPNda7886O9-BrY6dvlQ,925
|
|
3
3
|
lightning_sdk/agents.py,sha256=ly6Ma1j0ZgGPFyvPvMN28JWiB9dATIstFa5XM8pMi6I,1577
|
|
4
4
|
lightning_sdk/ai_hub.py,sha256=kBjtmrzVHPCgqtV_TrSNkuf4oT2DLm8SYRTz4iTQmmY,6624
|
|
5
5
|
lightning_sdk/constants.py,sha256=ztl1PTUBULnqTf3DyKUSJaV_O20hNtUYT6XvAYIrmIk,749
|
|
@@ -30,16 +30,15 @@ lightning_sdk/cli/download.py,sha256=nyQN3q1vZ0fg4_cfit8cKaokQ9VUd46l_TNcAQWkLwU
|
|
|
30
30
|
lightning_sdk/cli/entrypoint.py,sha256=Hl2Lm7-OS0kx_pyJyGe7Nii0Soc6HYe4r4xXKeJuC_o,1507
|
|
31
31
|
lightning_sdk/cli/exceptions.py,sha256=QUF3OMAMZwBikvlusimSHSBjb6ywvHpfAumJBEaodSw,169
|
|
32
32
|
lightning_sdk/cli/legacy.py,sha256=ocTVNwlsLRS5aMjbMkwFPjT3uEYvS8C40CJ0PeRRv8g,4707
|
|
33
|
-
lightning_sdk/cli/
|
|
34
|
-
lightning_sdk/cli/
|
|
35
|
-
lightning_sdk/cli/serve.py,sha256=dfhbxNscaDJijJSXxpqRKZoI-eGvaIVWKoqTsg_xZWk,4619
|
|
33
|
+
lightning_sdk/cli/run.py,sha256=B6ttd9SKg373ngug-lj74CEcuEoxwz-P6nUBVnQeijI,10836
|
|
34
|
+
lightning_sdk/cli/serve.py,sha256=UaXhGHU6nbAzrnVigSKOTrMjLwSs-sjyhuJCdVUBwzc,8722
|
|
36
35
|
lightning_sdk/cli/studios_menu.py,sha256=0kQGqGel8gAbpdJtjOM1a6NEat_TnIqRNprNn8QiK58,3236
|
|
37
36
|
lightning_sdk/cli/upload.py,sha256=H9OyipYTYAQ9Mzy2e8jtoaa-B34-uXHbTQTzY2Vmhv4,9078
|
|
38
37
|
lightning_sdk/deployment/__init__.py,sha256=BLu7_cVLp97TYxe6qe-J1zKUSZXAVcvCjgcA7plV2k4,497
|
|
39
38
|
lightning_sdk/deployment/deployment.py,sha256=Dp15pn8rFAfMfaDhKn0v3bphFuvLgkPFs3KSNxW6eyc,15472
|
|
40
39
|
lightning_sdk/job/__init__.py,sha256=1MxjQ6rHkyUHCypSW9RuXuVMVH11WiqhIXcU2LCFMwE,64
|
|
41
40
|
lightning_sdk/job/base.py,sha256=I4-iWyiKp1KUkxDy97zJYbwbdQ_7cu6FqCayKwXDloQ,13000
|
|
42
|
-
lightning_sdk/job/job.py,sha256=
|
|
41
|
+
lightning_sdk/job/job.py,sha256=pT9rkSsK5BHu6dSyHpAlYfGXvaF0s_XtrjjhOSAVFWU,11070
|
|
43
42
|
lightning_sdk/job/v1.py,sha256=zIcngaM2_c-2thcKiCOPoWGUuIKc4tasJEwEFJv6bAA,8953
|
|
44
43
|
lightning_sdk/job/v2.py,sha256=oq54VFInuVV_L-nUO_dnBbn4TxPWiBuIqmdFpNN1LmU,8057
|
|
45
44
|
lightning_sdk/job/work.py,sha256=PYopS_6c556I2o8ouSXmzb4FGQflzCe06GpqJiCdedw,1604
|
|
@@ -834,11 +833,11 @@ lightning_sdk/lightning_cloud/utils/data_connection.py,sha256=VN-Gs0a4g3tA9TQCwP
|
|
|
834
833
|
lightning_sdk/lightning_cloud/utils/dataset.py,sha256=4nUspe8iAaRPgSYpXA2uAQCgydm78kJzhOIx3C9qKls,2011
|
|
835
834
|
lightning_sdk/lightning_cloud/utils/name_generator.py,sha256=MkciuA10332V0mcE2PxLIiwWomWE0Fm_gNGK01vwRr4,58046
|
|
836
835
|
lightning_sdk/lightning_cloud/utils/network.py,sha256=axPgl8rhyPcPjxiztDxyksfxax3VNg2OXL5F5Uc81b4,406
|
|
837
|
-
lightning_sdk/mmt/__init__.py,sha256
|
|
838
|
-
lightning_sdk/mmt/base.py,sha256=
|
|
839
|
-
lightning_sdk/mmt/mmt.py,sha256=
|
|
840
|
-
lightning_sdk/mmt/v1.py,sha256=
|
|
841
|
-
lightning_sdk/mmt/v2.py,sha256=
|
|
836
|
+
lightning_sdk/mmt/__init__.py,sha256=ExMu90-96bGBnyp5h0CErQszUGB1-PcjC4-R8_NYbeY,117
|
|
837
|
+
lightning_sdk/mmt/base.py,sha256=lUB8pAKXTjn_WD6vcJIUMxtZQwrzwxYnaPJbaiPPtMw,12548
|
|
838
|
+
lightning_sdk/mmt/mmt.py,sha256=vMAoR3qjb0VRwCXpKpAMxeTEMALmQB9_RK064fTHM8M,11922
|
|
839
|
+
lightning_sdk/mmt/v1.py,sha256=8LjZnMSGgsGLeajuPto3gknJwVRvsGfkYVoo5A_UlO8,7917
|
|
840
|
+
lightning_sdk/mmt/v2.py,sha256=TsJ8PNluyU0WnNdHvEKWkpJxMwjmyFFJZIaJKCgugIM,8201
|
|
842
841
|
lightning_sdk/services/__init__.py,sha256=gSWUjccEhMI9CIWL_nbrFHUK2S6TM2725mEzrLMfK1Y,225
|
|
843
842
|
lightning_sdk/services/file_endpoint.py,sha256=we5HC_o74J4Y6fSP_31jIizi_I_1FO_Rb2qblspD9eE,7855
|
|
844
843
|
lightning_sdk/services/utilities.py,sha256=IeOx8hc3F8ZevHeKBysh08BXhJliTNzvKp1gwpEfdik,4087
|
|
@@ -847,9 +846,9 @@ lightning_sdk/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSu
|
|
|
847
846
|
lightning_sdk/utils/dynamic.py,sha256=glUTO1JC9APtQ6Gr9SO02a3zr56-sPAXM5C3NrTpgyQ,1959
|
|
848
847
|
lightning_sdk/utils/enum.py,sha256=h2JRzqoBcSlUdanFHmkj_j5DleBHAu1esQYUsdNI-hU,4106
|
|
849
848
|
lightning_sdk/utils/resolve.py,sha256=RWvlOWLHjaHhR0W0zT3mN719cbzhFfYCKBss38zfv3k,5783
|
|
850
|
-
lightning_sdk-0.1.
|
|
851
|
-
lightning_sdk-0.1.
|
|
852
|
-
lightning_sdk-0.1.
|
|
853
|
-
lightning_sdk-0.1.
|
|
854
|
-
lightning_sdk-0.1.
|
|
855
|
-
lightning_sdk-0.1.
|
|
849
|
+
lightning_sdk-0.1.43.dist-info/LICENSE,sha256=uFIuZwj5z-4TeF2UuacPZ1o17HkvKObT8fY50qN84sg,1064
|
|
850
|
+
lightning_sdk-0.1.43.dist-info/METADATA,sha256=qsepNbeCeaApc3hDy1vhjhXs-sIoKUwYfMVK_MaKVUA,4031
|
|
851
|
+
lightning_sdk-0.1.43.dist-info/WHEEL,sha256=P9jw-gEje8ByB7_hXoICnHtVCrEwMQh-630tKvQWehc,91
|
|
852
|
+
lightning_sdk-0.1.43.dist-info/entry_points.txt,sha256=msB9PJWIJ784dX-OP8by51d4IbKYH3Fj1vCuA9oXjHY,68
|
|
853
|
+
lightning_sdk-0.1.43.dist-info/top_level.txt,sha256=ps8doKILFXmN7F1mHncShmnQoTxKBRPIcchC8TpoBw4,19
|
|
854
|
+
lightning_sdk-0.1.43.dist-info/RECORD,,
|
lightning_sdk/cli/mmt.py
DELETED
|
@@ -1,138 +0,0 @@
|
|
|
1
|
-
from typing import Dict, Optional
|
|
2
|
-
|
|
3
|
-
from fire import Fire
|
|
4
|
-
|
|
5
|
-
from lightning_sdk._mmt import MMT
|
|
6
|
-
from lightning_sdk.api.studio_api import _cloud_url
|
|
7
|
-
from lightning_sdk.lightning_cloud.login import Auth
|
|
8
|
-
from lightning_sdk.machine import Machine
|
|
9
|
-
from lightning_sdk.teamspace import Teamspace
|
|
10
|
-
|
|
11
|
-
_MACHINE_VALUES = tuple([machine.value for machine in Machine])
|
|
12
|
-
|
|
13
|
-
|
|
14
|
-
class MMTCLI:
|
|
15
|
-
"""Command line interface (CLI) to interact with/manage Lightning AI MMT."""
|
|
16
|
-
|
|
17
|
-
def __init__(self) -> None:
|
|
18
|
-
# Need to set the docstring here for f-strings to work.
|
|
19
|
-
# Sadly this is the only way to really show options as f-strings are not allowed as docstrings directly
|
|
20
|
-
# and fire does not show values for literals, just that it is a literal.
|
|
21
|
-
docstr = f"""Run async workloads on multiple machines using a docker image.
|
|
22
|
-
|
|
23
|
-
Args:
|
|
24
|
-
name: The name of the job. Needs to be unique within the teamspace.
|
|
25
|
-
num_machines: The number of Machines to run on. Defaults to 2 Machines
|
|
26
|
-
machine: The machine type to run the job on. One of {", ".join(_MACHINE_VALUES)}. Defaults to CPU
|
|
27
|
-
command: The command to run inside your job. Required if using a studio. Optional if using an image.
|
|
28
|
-
If not provided for images, will run the container entrypoint and default command.
|
|
29
|
-
studio: The studio env to run the job with. Mutually exclusive with image.
|
|
30
|
-
image: The docker image to run the job with. Mutually exclusive with studio.
|
|
31
|
-
teamspace: The teamspace the job should be associated with. Defaults to the current teamspace.
|
|
32
|
-
org: The organization owning the teamspace (if any). Defaults to the current organization.
|
|
33
|
-
user: The user owning the teamspace (if any). Defaults to the current user.
|
|
34
|
-
cloud_account: The cloud account to run the job on.
|
|
35
|
-
Defaults to the studio cloud account if running with studio compute env.
|
|
36
|
-
If not provided will fall back to the teamspaces default cloud account.
|
|
37
|
-
env: Environment variables to set inside the job.
|
|
38
|
-
interruptible: Whether the job should run on interruptible instances. They are cheaper but can be preempted.
|
|
39
|
-
image_credentials: The credentials used to pull the image. Required if the image is private.
|
|
40
|
-
This should be the name of the respective credentials secret created on the Lightning AI platform.
|
|
41
|
-
cloud_account_auth: Whether to authenticate with the cloud account to pull the image.
|
|
42
|
-
Required if the registry is part of a cloud provider (e.g. ECR).
|
|
43
|
-
artifacts_local: The path of inside the docker container, you want to persist images from.
|
|
44
|
-
CAUTION: When setting this to "/", it will effectively erase your container.
|
|
45
|
-
Only supported for jobs with a docker image compute environment.
|
|
46
|
-
artifacts_remote: The remote storage to persist your artifacts to.
|
|
47
|
-
Should be of format <CONNECTION_TYPE>:<CONNECTION_NAME>:<PATH_WITHIN_CONNECTION>.
|
|
48
|
-
PATH_WITHIN_CONNECTION hereby is a path relative to the connection's root.
|
|
49
|
-
E.g. efs:data:some-path would result in an EFS connection named `data` and to the path `some-path`
|
|
50
|
-
within it.
|
|
51
|
-
Note that the connection needs to be added to the teamspace already in order for it to be found.
|
|
52
|
-
Only supported for jobs with a docker image compute environment.
|
|
53
|
-
"""
|
|
54
|
-
# TODO: the docstrings from artifacts_local and artifacts_remote don't show up completely,
|
|
55
|
-
# might need to switch to explicit cli definition
|
|
56
|
-
self.run.__func__.__doc__ = docstr
|
|
57
|
-
|
|
58
|
-
def login(self) -> None:
|
|
59
|
-
"""Login to Lightning AI Studios."""
|
|
60
|
-
auth = Auth()
|
|
61
|
-
auth.clear()
|
|
62
|
-
|
|
63
|
-
try:
|
|
64
|
-
auth.authenticate()
|
|
65
|
-
except ConnectionError:
|
|
66
|
-
raise RuntimeError(f"Unable to connect to {_cloud_url()}. Please check your internet connection.") from None
|
|
67
|
-
|
|
68
|
-
def logout(self) -> None:
|
|
69
|
-
"""Logout from Lightning AI Studios."""
|
|
70
|
-
auth = Auth()
|
|
71
|
-
auth.clear()
|
|
72
|
-
|
|
73
|
-
# TODO: sadly, fire displays both Optional[type] and Union[type, None] as Optional[Optional]
|
|
74
|
-
# see https://github.com/google/python-fire/pull/513
|
|
75
|
-
# might need to move to different cli library
|
|
76
|
-
def run(
|
|
77
|
-
self,
|
|
78
|
-
name: Optional[str] = None,
|
|
79
|
-
num_machines: int = 2,
|
|
80
|
-
machine: Optional[str] = None,
|
|
81
|
-
command: Optional[str] = None,
|
|
82
|
-
studio: Optional[str] = None,
|
|
83
|
-
image: Optional[str] = None,
|
|
84
|
-
teamspace: Optional[str] = None,
|
|
85
|
-
org: Optional[str] = None,
|
|
86
|
-
user: Optional[str] = None,
|
|
87
|
-
cloud_account: Optional[str] = None,
|
|
88
|
-
env: Optional[Dict[str, str]] = None,
|
|
89
|
-
interruptible: bool = False,
|
|
90
|
-
image_credentials: Optional[str] = None,
|
|
91
|
-
cloud_account_auth: bool = False,
|
|
92
|
-
artifacts_local: Optional[str] = None,
|
|
93
|
-
artifacts_remote: Optional[str] = None,
|
|
94
|
-
) -> None:
|
|
95
|
-
if name is None:
|
|
96
|
-
from datetime import datetime
|
|
97
|
-
|
|
98
|
-
timestr = datetime.now().strftime("%Y-%m-%d-%H-%M-%S")
|
|
99
|
-
name = f"mmt-{timestr}"
|
|
100
|
-
|
|
101
|
-
if machine is None:
|
|
102
|
-
# TODO: infer from studio
|
|
103
|
-
machine = "CPU"
|
|
104
|
-
machine_enum = Machine(machine.upper())
|
|
105
|
-
|
|
106
|
-
teamspace = Teamspace(name=teamspace, org=org, user=user)
|
|
107
|
-
if cloud_account is None:
|
|
108
|
-
cloud_account = teamspace.default_cloud_account
|
|
109
|
-
|
|
110
|
-
if image is None:
|
|
111
|
-
raise RuntimeError("Currently only docker images are specified")
|
|
112
|
-
MMT.run(
|
|
113
|
-
name=name,
|
|
114
|
-
num_machines=num_machines,
|
|
115
|
-
machine=machine_enum,
|
|
116
|
-
command=command,
|
|
117
|
-
studio=studio,
|
|
118
|
-
image=image,
|
|
119
|
-
teamspace=teamspace,
|
|
120
|
-
org=org,
|
|
121
|
-
user=user,
|
|
122
|
-
cloud_account=cloud_account,
|
|
123
|
-
env=env,
|
|
124
|
-
interruptible=interruptible,
|
|
125
|
-
image_credentials=image_credentials,
|
|
126
|
-
cloud_account_auth=cloud_account_auth,
|
|
127
|
-
artifacts_local=artifacts_local,
|
|
128
|
-
artifacts_remote=artifacts_remote,
|
|
129
|
-
)
|
|
130
|
-
|
|
131
|
-
|
|
132
|
-
def main_cli() -> None:
|
|
133
|
-
"""CLI entrypoint."""
|
|
134
|
-
Fire(MMTCLI(), name="_mmt")
|
|
135
|
-
|
|
136
|
-
|
|
137
|
-
if __name__ == "__main__":
|
|
138
|
-
main_cli()
|
|
File without changes
|
|
File without changes
|
|
File without changes
|