lightning-sdk 0.1.58__py3-none-any.whl → 0.2.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (121)
  1. lightning_sdk/__init__.py +5 -3
  2. lightning_sdk/api/deployment_api.py +23 -11
  3. lightning_sdk/api/job_api.py +42 -7
  4. lightning_sdk/api/lit_container_api.py +23 -3
  5. lightning_sdk/api/mmt_api.py +46 -8
  6. lightning_sdk/api/pipeline_api.py +50 -0
  7. lightning_sdk/api/teamspace_api.py +2 -2
  8. lightning_sdk/api/utils.py +15 -5
  9. lightning_sdk/cli/ai_hub.py +30 -65
  10. lightning_sdk/cli/coloring.py +60 -0
  11. lightning_sdk/cli/configure.py +25 -40
  12. lightning_sdk/cli/connect.py +7 -20
  13. lightning_sdk/cli/create.py +83 -0
  14. lightning_sdk/cli/delete.py +72 -75
  15. lightning_sdk/cli/docker.py +22 -0
  16. lightning_sdk/cli/download.py +78 -113
  17. lightning_sdk/cli/entrypoint.py +44 -65
  18. lightning_sdk/cli/generate.py +28 -43
  19. lightning_sdk/cli/inspect.py +22 -50
  20. lightning_sdk/cli/list.py +281 -222
  21. lightning_sdk/cli/mmts_menu.py +1 -1
  22. lightning_sdk/cli/open.py +62 -0
  23. lightning_sdk/cli/run.py +430 -263
  24. lightning_sdk/cli/serve.py +128 -191
  25. lightning_sdk/cli/start.py +55 -36
  26. lightning_sdk/cli/stop.py +97 -55
  27. lightning_sdk/cli/switch.py +53 -36
  28. lightning_sdk/cli/upload.py +318 -255
  29. lightning_sdk/deployment/__init__.py +2 -0
  30. lightning_sdk/deployment/deployment.py +33 -8
  31. lightning_sdk/lightning_cloud/openapi/__init__.py +23 -0
  32. lightning_sdk/lightning_cloud/openapi/api/__init__.py +1 -0
  33. lightning_sdk/lightning_cloud/openapi/api/assistants_service_api.py +10 -6
  34. lightning_sdk/lightning_cloud/openapi/api/jobs_service_api.py +355 -4
  35. lightning_sdk/lightning_cloud/openapi/api/lit_logger_service_api.py +4 -4
  36. lightning_sdk/lightning_cloud/openapi/api/lit_registry_service_api.py +14 -2
  37. lightning_sdk/lightning_cloud/openapi/api/pipelines_service_api.py +674 -0
  38. lightning_sdk/lightning_cloud/openapi/api/storage_service_api.py +303 -4
  39. lightning_sdk/lightning_cloud/openapi/models/__init__.py +22 -0
  40. lightning_sdk/lightning_cloud/openapi/models/agents_id_body.py +17 -69
  41. lightning_sdk/lightning_cloud/openapi/models/cluster_id_capacityreservations_body.py +27 -1
  42. lightning_sdk/lightning_cloud/openapi/models/create.py +27 -1
  43. lightning_sdk/lightning_cloud/openapi/models/create_deployment_request_defines_a_spec_for_the_job_that_allows_for_autoscaling_jobs.py +53 -1
  44. lightning_sdk/lightning_cloud/openapi/models/deployments_id_body.py +105 -1
  45. lightning_sdk/lightning_cloud/openapi/models/id_visibility_body1.py +1 -27
  46. lightning_sdk/lightning_cloud/openapi/models/id_visibility_body2.py +149 -0
  47. lightning_sdk/lightning_cloud/openapi/models/org_id_memberships_body.py +27 -1
  48. lightning_sdk/lightning_cloud/openapi/models/orgs_id_body.py +157 -1
  49. lightning_sdk/lightning_cloud/openapi/models/pipelines_id_body.py +461 -0
  50. lightning_sdk/lightning_cloud/openapi/models/project_id_pipelines_body.py +227 -0
  51. lightning_sdk/lightning_cloud/openapi/models/projects_id_body.py +157 -1
  52. lightning_sdk/lightning_cloud/openapi/models/slurm_jobs_body.py +79 -1
  53. lightning_sdk/lightning_cloud/openapi/models/uploads_upload_id_body.py +1 -27
  54. lightning_sdk/lightning_cloud/openapi/models/uploads_upload_id_body1.py +175 -0
  55. lightning_sdk/lightning_cloud/openapi/models/v1_agent_job.py +79 -1
  56. lightning_sdk/lightning_cloud/openapi/models/v1_assistant.py +17 -69
  57. lightning_sdk/lightning_cloud/openapi/models/v1_capacity_block_offering.py +27 -1
  58. lightning_sdk/lightning_cloud/openapi/models/v1_cloud_space_artifact_event_type.py +1 -1
  59. lightning_sdk/lightning_cloud/openapi/models/v1_cluster_accelerator.py +131 -1
  60. lightning_sdk/lightning_cloud/openapi/models/v1_cluster_capacity_reservation.py +79 -1
  61. lightning_sdk/lightning_cloud/openapi/models/v1_cluster_security_options.py +27 -1
  62. lightning_sdk/lightning_cloud/openapi/models/v1_complete_upload_temporary_artifact_request.py +175 -0
  63. lightning_sdk/lightning_cloud/openapi/models/v1_create_deployment_request.py +461 -0
  64. lightning_sdk/lightning_cloud/openapi/models/v1_create_deployment_template_request.py +27 -1
  65. lightning_sdk/lightning_cloud/openapi/models/v1_create_job_request.py +201 -0
  66. lightning_sdk/lightning_cloud/openapi/models/v1_create_managed_endpoint_response.py +149 -0
  67. lightning_sdk/lightning_cloud/openapi/models/v1_create_multi_machine_job_request.py +253 -0
  68. lightning_sdk/lightning_cloud/openapi/models/v1_data_connection.py +27 -1
  69. lightning_sdk/lightning_cloud/openapi/models/v1_delete_pipeline_response.py +149 -0
  70. lightning_sdk/lightning_cloud/openapi/models/v1_deployment.py +105 -1
  71. lightning_sdk/lightning_cloud/openapi/models/v1_deployment_details.py +175 -0
  72. lightning_sdk/lightning_cloud/openapi/models/v1_deployment_template.py +53 -1
  73. lightning_sdk/lightning_cloud/openapi/models/v1_filestore_data_connection.py +201 -0
  74. lightning_sdk/lightning_cloud/openapi/models/v1_filesystem_job.py +53 -1
  75. lightning_sdk/lightning_cloud/openapi/models/v1_filesystem_mmt.py +53 -1
  76. lightning_sdk/lightning_cloud/openapi/models/v1_find_capacity_block_offering_response.py +29 -3
  77. lightning_sdk/lightning_cloud/openapi/models/v1_job.py +133 -3
  78. lightning_sdk/lightning_cloud/openapi/models/v1_job_artifacts_type.py +103 -0
  79. lightning_sdk/lightning_cloud/openapi/models/v1_job_spec.py +53 -1
  80. lightning_sdk/lightning_cloud/openapi/models/v1_job_timing.py +27 -1
  81. lightning_sdk/lightning_cloud/openapi/models/v1_list_pipelines_response.py +123 -0
  82. lightning_sdk/lightning_cloud/openapi/models/v1_lit_registry_artifact.py +27 -1
  83. lightning_sdk/lightning_cloud/openapi/models/v1_lit_repository.py +29 -1
  84. lightning_sdk/lightning_cloud/openapi/models/v1_managed_model.py +27 -1
  85. lightning_sdk/lightning_cloud/openapi/models/v1_multi_machine_job.py +27 -1
  86. lightning_sdk/lightning_cloud/openapi/models/v1_multi_machine_job_state.py +2 -0
  87. lightning_sdk/lightning_cloud/openapi/models/v1_organization.py +209 -1
  88. lightning_sdk/lightning_cloud/openapi/models/v1_pipeline.py +513 -0
  89. lightning_sdk/lightning_cloud/openapi/models/v1_pipeline_schedule.py +149 -0
  90. lightning_sdk/lightning_cloud/openapi/models/v1_pipeline_step.py +253 -0
  91. lightning_sdk/lightning_cloud/openapi/models/v1_pipeline_step_status.py +331 -0
  92. lightning_sdk/lightning_cloud/openapi/models/v1_pipeline_step_type.py +104 -0
  93. lightning_sdk/lightning_cloud/openapi/models/v1_project_settings.py +157 -1
  94. lightning_sdk/lightning_cloud/openapi/models/v1_restart_timing.py +27 -1
  95. lightning_sdk/lightning_cloud/openapi/models/v1_rule_resource.py +1 -0
  96. lightning_sdk/lightning_cloud/openapi/models/v1_shared_filesystem.py +201 -0
  97. lightning_sdk/lightning_cloud/openapi/models/v1_slurm_job.py +27 -1
  98. lightning_sdk/lightning_cloud/openapi/models/v1_update_job_visibility_response.py +97 -0
  99. lightning_sdk/lightning_cloud/openapi/models/v1_upload_temporary_artifact_request.py +123 -0
  100. lightning_sdk/lightning_cloud/openapi/models/v1_user_features.py +95 -355
  101. lightning_sdk/lightning_cloud/openapi/models/validate.py +27 -1
  102. lightning_sdk/lightning_cloud/rest_client.py +4 -2
  103. lightning_sdk/machine.py +25 -1
  104. lightning_sdk/models.py +18 -12
  105. lightning_sdk/pipeline/__init__.py +4 -0
  106. lightning_sdk/pipeline/pipeline.py +109 -0
  107. lightning_sdk/pipeline/types.py +268 -0
  108. lightning_sdk/pipeline/utils.py +69 -0
  109. lightning_sdk/plugin.py +9 -10
  110. lightning_sdk/serve.py +134 -0
  111. lightning_sdk/services/utilities.py +2 -2
  112. lightning_sdk/studio.py +5 -1
  113. lightning_sdk/teamspace.py +1 -1
  114. lightning_sdk/utils/resolve.py +12 -1
  115. {lightning_sdk-0.1.58.dist-info → lightning_sdk-0.2.1.dist-info}/METADATA +6 -8
  116. {lightning_sdk-0.1.58.dist-info → lightning_sdk-0.2.1.dist-info}/RECORD +120 -88
  117. lightning_sdk/cli/legacy.py +0 -135
  118. {lightning_sdk-0.1.58.dist-info → lightning_sdk-0.2.1.dist-info}/LICENSE +0 -0
  119. {lightning_sdk-0.1.58.dist-info → lightning_sdk-0.2.1.dist-info}/WHEEL +0 -0
  120. {lightning_sdk-0.1.58.dist-info → lightning_sdk-0.2.1.dist-info}/entry_points.txt +0 -0
  121. {lightning_sdk-0.1.58.dist-info → lightning_sdk-0.2.1.dist-info}/top_level.txt +0 -0
lightning_sdk/cli/run.py CHANGED
@@ -1,274 +1,441 @@
1
- from typing import TYPE_CHECKING, Dict, Optional, Union
1
+ import json
2
+ from typing import Dict, Mapping, Optional, Sequence, Union
3
+
4
+ import click
2
5
 
3
6
  from lightning_sdk.job import Job
4
7
  from lightning_sdk.machine import Machine
5
8
  from lightning_sdk.mmt import MMT
6
9
  from lightning_sdk.teamspace import Teamspace
7
10
 
8
- if TYPE_CHECKING:
9
- from lightning_sdk.cli.legacy import _LegacyLightningCLI
10
-
11
11
  _MACHINE_VALUES = tuple([machine.name for machine in Machine.__dict__.values() if isinstance(machine, Machine)])
12
12
 
13
13
 
14
- class _Run:
14
+ @click.group(name="run")
15
+ def run() -> None:
15
16
  """Run async workloads on the Lightning AI platform."""
16
17
 
17
- def __init__(self, legacy_run: Optional["_LegacyLightningCLI"] = None) -> None:
18
- if legacy_run is not None:
19
- self.app = legacy_run.app
20
- self.model = legacy_run.model
21
-
22
- # Need to set the docstring here for f-strings to work.
23
- # Sadly this is the only way to really show options as f-strings are not allowed as docstrings directly
24
- # and fire does not show values for literals, just that it is a literal.
25
- docstr_job = f"""Run async workloads using a docker image or a compute environment from your studio.
26
-
27
- Args:
28
- name: The name of the job. Needs to be unique within the teamspace.
29
- machine: The machine type to run the job on. One of {", ".join(_MACHINE_VALUES)}.
30
- command: The command to run inside your job. Required if using a studio. Optional if using an image.
31
- If not provided for images, will run the container entrypoint and default command.
32
- studio: The studio env to run the job with. Mutually exclusive with image.
33
- image: The docker image to run the job with. Mutually exclusive with studio.
34
- teamspace: The teamspace the job should be associated with. Defaults to the current teamspace.
35
- org: The organization owning the teamspace (if any). Defaults to the current organization.
36
- user: The user owning the teamspace (if any). Defaults to the current user.
37
- cloud_account: The cloud account to run the job on.
38
- Defaults to the studio cloud account if running with studio compute env.
39
- If not provided will fall back to the teamspaces default cloud account.
40
- env: Environment variables to set inside the job.
41
- interruptible: Whether the job should run on interruptible instances. They are cheaper but can be preempted.
42
- image_credentials: The credentials used to pull the image. Required if the image is private.
43
- This should be the name of the respective credentials secret created on the Lightning AI platform.
44
- cloud_account_auth: Whether to authenticate with the cloud account to pull the image.
45
- Required if the registry is part of a cloud provider (e.g. ECR).
46
- entrypoint: The entrypoint of your docker container. Defaults to `sh -c` which
47
- just runs the provided command in a standard shell.
48
- To use the pre-defined entrypoint of the provided image, set this to an empty string.
49
- Only applicable when submitting docker jobs.
50
- path_mappings: Maps path inside of containers to paths inside data-connections.
51
- Should be a comma separated list of form:
52
- <MAPPING_1>,<MAPPING_2>,...
53
- where each mapping is of the form
54
- <CONTAINER_PATH_1>:<CONNECTION_NAME_1>:<PATH_WITHIN_CONNECTION_1> and
55
- omitting the path inside the connection defaults to the connections root.
56
- artifacts_local: Deprecated in favor of path_mappings.
57
- The path of inside the docker container, you want to persist images from.
58
- CAUTION: When setting this to "/", it will effectively erase your container.
59
- Only supported for jobs with a docker image compute environment.
60
- artifacts_remote: Deprecated in favor of path_mappings.
61
- The remote storage to persist your artifacts to.
62
- Should be of format <CONNECTION_TYPE>:<CONNECTION_NAME>:<PATH_WITHIN_CONNECTION>.
63
- PATH_WITHIN_CONNECTION hereby is a path relative to the connection's root.
64
- E.g. efs:data:some-path would result in an EFS connection named `data` and to the path `some-path`
65
- within it.
66
- Note that the connection needs to be added to the teamspace already in order for it to be found.
67
- Only supported for jobs with a docker image compute environment.
68
- """
69
- # TODO: the docstrings from artifacts_local and artifacts_remote don't show up completely,
70
- # might need to switch to explicit cli definition
71
- self.job.__func__.__doc__ = docstr_job
72
-
73
- # Need to set the docstring here for f-strings to work.
74
- # Sadly this is the only way to really show options as f-strings are not allowed as docstrings directly
75
- # and fire does not show values for literals, just that it is a literal.
76
- docstr_mmt = f"""Run async workloads on multiple machines using a docker image.
77
-
78
- Args:
79
- name: The name of the job. Needs to be unique within the teamspace.
80
- num_machines: The number of Machines to run on. Defaults to 2 Machines
81
- machine: The machine type to run the job on. One of {", ".join(_MACHINE_VALUES)}. Defaults to CPU
82
- command: The command to run inside your job. Required if using a studio. Optional if using an image.
83
- If not provided for images, will run the container entrypoint and default command.
84
- studio: The studio env to run the job with. Mutually exclusive with image.
85
- image: The docker image to run the job with. Mutually exclusive with studio.
86
- teamspace: The teamspace the job should be associated with. Defaults to the current teamspace.
87
- org: The organization owning the teamspace (if any). Defaults to the current organization.
88
- user: The user owning the teamspace (if any). Defaults to the current user.
89
- cloud_account: The cloud account to run the job on.
90
- Defaults to the studio cloud account if running with studio compute env.
91
- If not provided will fall back to the teamspaces default cloud account.
92
- env: Environment variables to set inside the job.
93
- interruptible: Whether the job should run on interruptible instances. They are cheaper but can be preempted.
94
- image_credentials: The credentials used to pull the image. Required if the image is private.
95
- This should be the name of the respective credentials secret created on the Lightning AI platform.
96
- cloud_account_auth: Whether to authenticate with the cloud account to pull the image.
97
- Required if the registry is part of a cloud provider (e.g. ECR).
98
- entrypoint: The entrypoint of your docker container. Defaults to `sh -c` which
99
- just runs the provided command in a standard shell.
100
- To use the pre-defined entrypoint of the provided image, set this to an empty string.
101
- Only applicable when submitting docker jobs.
102
- path_mappings: Maps path inside of containers to paths inside data-connections.
103
- Should be a comma separated list of form:
104
- <MAPPING_1>,<MAPPING_2>,...
105
- where each mapping is of the form
106
- <CONTAINER_PATH_1>:<CONNECTION_NAME_1>:<PATH_WITHIN_CONNECTION_1> and
107
- omitting the path inside the connection defaults to the connections root.
108
- artifacts_local: Deprecated in favor of path_mappings.
109
- The path of inside the docker container, you want to persist images from.
110
- CAUTION: When setting this to "/", it will effectively erase your container.
111
- Only supported for jobs with a docker image compute environment.
112
- artifacts_remote: Deprecated in favor of path_mappings.
113
- The remote storage to persist your artifacts to.
114
- Should be of format <CONNECTION_TYPE>:<CONNECTION_NAME>:<PATH_WITHIN_CONNECTION>.
115
- PATH_WITHIN_CONNECTION hereby is a path relative to the connection's root.
116
- E.g. efs:data:some-path would result in an EFS connection named `data` and to the path `some-path`
117
- within it.
118
- Note that the connection needs to be added to the teamspace already in order for it to be found.
119
- Only supported for jobs with a docker image compute environment.
120
- """
121
- # TODO: the docstrings from artifacts_local and artifacts_remote don't show up completely,
122
- # might need to switch to explicit cli definition
123
- self.mmt.__func__.__doc__ = docstr_mmt
124
-
125
- # TODO: sadly, fire displays both Optional[type] and Union[type, None] as Optional[Optional]
126
- # see https://github.com/google/python-fire/pull/513
127
- # might need to move to different cli library
128
- def job(
129
- self,
130
- name: Optional[str] = None,
131
- machine: Optional[str] = None,
132
- command: Optional[str] = None,
133
- studio: Optional[str] = None,
134
- image: Optional[str] = None,
135
- teamspace: Optional[str] = None,
136
- org: Optional[str] = None,
137
- user: Optional[str] = None,
138
- cloud_account: Optional[str] = None,
139
- env: Optional[Dict[str, str]] = None,
140
- interruptible: bool = False,
141
- image_credentials: Optional[str] = None,
142
- cloud_account_auth: bool = False,
143
- entrypoint: str = "sh -c",
144
- path_mappings: str = "",
145
- artifacts_local: Optional[str] = None,
146
- artifacts_remote: Optional[str] = None,
147
- ) -> None:
148
- if not name:
149
- from datetime import datetime
150
-
151
- timestr = datetime.now().strftime("%b-%d-%H_%M")
152
- name = f"job-{timestr}"
153
-
154
- if machine is None:
155
- # TODO: infer from studio
156
- machine = "CPU"
157
- machine_enum: Union[str, Machine]
158
- try:
159
- machine_enum = getattr(Machine, machine.upper(), Machine(machine, machine))
160
- except KeyError:
161
- machine_enum = machine
162
-
163
- resolved_teamspace = Teamspace(name=teamspace, org=org, user=user)
164
-
165
- path_mappings_dict = self._resolve_path_mapping(path_mappings=path_mappings)
166
-
167
- Job.run(
168
- name=name,
169
- machine=machine_enum,
170
- command=command,
171
- studio=studio,
172
- image=image,
173
- teamspace=resolved_teamspace,
174
- org=org,
175
- user=user,
176
- cloud_account=cloud_account,
177
- env=env,
178
- interruptible=interruptible,
179
- image_credentials=image_credentials,
180
- cloud_account_auth=cloud_account_auth,
181
- entrypoint=entrypoint,
182
- path_mappings=path_mappings_dict,
183
- artifacts_local=artifacts_local,
184
- artifacts_remote=artifacts_remote,
185
- )
186
-
187
- # TODO: sadly, fire displays both Optional[type] and Union[type, None] as Optional[Optional]
188
- # see https://github.com/google/python-fire/pull/513
189
- # might need to move to different cli library
190
- def mmt(
191
- self,
192
- name: Optional[str] = None,
193
- num_machines: int = 2,
194
- machine: Optional[str] = None,
195
- command: Optional[str] = None,
196
- image: Optional[str] = None,
197
- teamspace: Optional[str] = None,
198
- org: Optional[str] = None,
199
- user: Optional[str] = None,
200
- cloud_account: Optional[str] = None,
201
- env: Optional[Dict[str, str]] = None,
202
- interruptible: bool = False,
203
- image_credentials: Optional[str] = None,
204
- cloud_account_auth: bool = False,
205
- entrypoint: str = "sh -c",
206
- path_mappings: str = "",
207
- artifacts_local: Optional[str] = None,
208
- artifacts_remote: Optional[str] = None,
209
- ) -> None:
210
- if name is None:
211
- from datetime import datetime
212
-
213
- timestr = datetime.now().strftime("%b-%d-%H_%M")
214
- name = f"mmt-{timestr}"
215
-
216
- if machine is None:
217
- # TODO: infer from studio
218
- machine = "CPU"
219
- machine_enum: Union[str, Machine]
220
- try:
221
- machine_enum = getattr(Machine, machine.upper(), Machine(machine, machine))
222
- except KeyError:
223
- machine_enum = machine
224
-
225
- resolved_teamspace = Teamspace(name=teamspace, org=org, user=user)
226
-
227
- if image is None:
228
- raise RuntimeError("Image needs to be specified to run a multi-machine job")
229
-
230
- path_mappings_dict = self._resolve_path_mapping(path_mappings=path_mappings)
231
-
232
- MMT.run(
233
- name=name,
234
- num_machines=num_machines,
235
- machine=machine_enum,
236
- command=command,
237
- studio=None,
238
- image=image,
239
- teamspace=resolved_teamspace,
240
- org=org,
241
- user=user,
242
- cloud_account=cloud_account,
243
- env=env,
244
- interruptible=interruptible,
245
- image_credentials=image_credentials,
246
- cloud_account_auth=cloud_account_auth,
247
- entrypoint=entrypoint,
248
- path_mappings=path_mappings_dict,
249
- artifacts_local=artifacts_local,
250
- artifacts_remote=artifacts_remote,
251
- )
252
-
253
- @staticmethod
254
- def _resolve_path_mapping(path_mappings: str) -> Dict[str, str]:
255
- path_mappings = path_mappings.strip()
256
-
257
- if not path_mappings:
258
- return {}
259
-
260
- path_mappings_dict = {}
261
- for mapping in path_mappings.split(","):
262
- if not mapping.strip():
263
- continue
264
-
265
- splits = str(mapping).split(":", 1)
266
- if len(splits) != 2:
267
- raise RuntimeError(
268
- "Mapping needs to be of form <CONTAINER_PATH>:<CONNECTION_NAME>[:<PATH_WITHIN_CONNECTION>], "
269
- f"but got {mapping}"
270
- )
271
-
272
- path_mappings_dict[splits[0].strip()] = splits[1].strip()
273
-
274
- return path_mappings_dict
18
+
19
+ @run.command("job")
20
+ @click.option("--name", default=None, help="The name of the job. Needs to be unique within the teamspace.")
21
+ @click.option(
22
+ "--machine",
23
+ default="CPU",
24
+ show_default=True,
25
+ type=click.Choice(_MACHINE_VALUES),
26
+ help="The machine type to run the job on.",
27
+ )
28
+ @click.option(
29
+ "--command",
30
+ default=None,
31
+ help=(
32
+ "The command to run inside your job. "
33
+ "Required if using a studio. "
34
+ "Optional if using an image. "
35
+ "If not provided for images, will run the container entrypoint and default command."
36
+ ),
37
+ )
38
+ @click.option("--studio", default=None, help="The studio env to run the job with. Mutually exclusive with image.")
39
+ @click.option("--image", default=None, help="The docker image to run the job with. Mutually exclusive with studio.")
40
+ @click.option(
41
+ "--teamspace",
42
+ default=None,
43
+ help="The teamspace the job should be associated with. Defaults to the current teamspace.",
44
+ )
45
+ @click.option(
46
+ "--org",
47
+ default=None,
48
+ help="The organization owning the teamspace (if any). Defaults to the current organization.",
49
+ )
50
+ @click.option("--user", default=None, help="The user owning the teamspace (if any). Defaults to the current user.")
51
+ @click.option(
52
+ "--cloud-account",
53
+ "--cloud_account",
54
+ default=None,
55
+ help=(
56
+ "The cloud account to run the job on. "
57
+ "Defaults to the studio cloud account if running with studio compute env. "
58
+ "If not provided will fall back to the teamspaces default cloud account."
59
+ ),
60
+ )
61
+ @click.option(
62
+ "--env",
63
+ "-e",
64
+ default=[""],
65
+ help=("Environment variable to set inside the job. Should be of format KEY=VALUE"),
66
+ multiple=True,
67
+ )
68
+ @click.option(
69
+ "--interruptible",
70
+ is_flag=True,
71
+ flag_value=True,
72
+ default=False,
73
+ help="Whether the job should run on interruptible instances. They are cheaper but can be preempted.",
74
+ )
75
+ @click.option(
76
+ "--image-credentials",
77
+ "--image_credentials",
78
+ default=None,
79
+ help=(
80
+ "The credentials used to pull the image. "
81
+ "Required if the image is private. "
82
+ "This should be the name of the respective credentials secret created on the Lightning AI platform."
83
+ ),
84
+ )
85
+ @click.option(
86
+ "--cloud-account-auth",
87
+ "--cloud_account_auth",
88
+ is_flag=True,
89
+ default=False,
90
+ help=(
91
+ "Whether to authenticate with the cloud account to pull the image. "
92
+ "Required if the registry is part of a cloud provider (e.g. ECR)."
93
+ ),
94
+ )
95
+ @click.option(
96
+ "--entrypoint",
97
+ default="sh -c",
98
+ show_default=True,
99
+ help=(
100
+ "The entrypoint of your docker container. "
101
+ "Default runs the provided command in a standard shell. "
102
+ "To use the pre-defined entrypoint of the provided image, set this to an empty string. "
103
+ "Only applicable when submitting docker jobs."
104
+ ),
105
+ )
106
+ @click.option(
107
+ "--path-mapping",
108
+ "--path_mapping",
109
+ default=[""],
110
+ help=(
111
+ "Maps path inside of containers to paths inside data-connections. "
112
+ "Should be of form <CONTAINER_PATH_1>:<CONNECTION_NAME_1>:<PATH_WITHIN_CONNECTION_1> and "
113
+ "omitting the path inside the connection defaults to the connections root. "
114
+ "Can be specified multiple times for multiple mappings"
115
+ ),
116
+ multiple=True,
117
+ )
118
+ # this is for backwards compatibility only
119
+ @click.option(
120
+ "--path-mappings",
121
+ "--path_mappings",
122
+ default="",
123
+ help=(
124
+ "Maps path inside of containers to paths inside data-connections. "
125
+ "Should be a comma separated list of form: "
126
+ "<MAPPING_1>,<MAPPING_2>,... "
127
+ "where each mapping is of the form "
128
+ "<CONTAINER_PATH_1>:<CONNECTION_NAME_1>:<PATH_WITHIN_CONNECTION_1> and "
129
+ "omitting the path inside the connection defaults to the connections root. "
130
+ "Instead of a comma-separated list, consider passing --path-mapping multiple times."
131
+ ),
132
+ )
133
+ def job(
134
+ name: Optional[str] = None,
135
+ machine: str = "CPU",
136
+ command: Optional[str] = None,
137
+ studio: Optional[str] = None,
138
+ image: Optional[str] = None,
139
+ teamspace: Optional[str] = None,
140
+ org: Optional[str] = None,
141
+ user: Optional[str] = None,
142
+ cloud_account: Optional[str] = None,
143
+ env: Sequence[str] = (),
144
+ interruptible: bool = False,
145
+ image_credentials: Optional[str] = None,
146
+ cloud_account_auth: bool = False,
147
+ entrypoint: str = "sh -c",
148
+ path_mapping: Sequence[str] = (),
149
+ path_mappings: str = "",
150
+ artifacts_local: Optional[str] = None,
151
+ artifacts_remote: Optional[str] = None,
152
+ ) -> None:
153
+ """Run async workloads using a docker image or studio."""
154
+ if not name:
155
+ from datetime import datetime
156
+
157
+ timestr = datetime.now().strftime("%b-%d-%H_%M")
158
+ name = f"job-{timestr}"
159
+
160
+ machine_enum: Union[str, Machine]
161
+ try:
162
+ machine_enum = getattr(Machine, machine.upper(), Machine(machine, machine))
163
+ except KeyError:
164
+ machine_enum = machine
165
+
166
+ resolved_teamspace = Teamspace(name=teamspace, org=org, user=user)
167
+
168
+ path_mappings_dict = _resolve_path_mapping(path_mappings=path_mappings)
169
+ for mapping in path_mapping:
170
+ path_mappings_dict.update(_resolve_path_mapping(path_mappings=mapping))
171
+
172
+ env_dict = {}
173
+ for e in env:
174
+ env_dict.update(_resolve_envs(e))
175
+
176
+ Job.run(
177
+ name=name,
178
+ machine=machine_enum,
179
+ command=command,
180
+ studio=studio,
181
+ image=image,
182
+ teamspace=resolved_teamspace,
183
+ org=org,
184
+ user=user,
185
+ cloud_account=cloud_account,
186
+ env=env_dict,
187
+ interruptible=interruptible,
188
+ image_credentials=image_credentials,
189
+ cloud_account_auth=cloud_account_auth,
190
+ entrypoint=entrypoint,
191
+ path_mappings=path_mappings_dict,
192
+ artifacts_local=artifacts_local,
193
+ artifacts_remote=artifacts_remote,
194
+ )
195
+
196
+
197
+ @run.command("mmt")
198
+ @click.option("--name", default=None, help="The name of the job. Needs to be unique within the teamspace.")
199
+ @click.option(
200
+ "--num-machines",
201
+ "--num_machines",
202
+ default=2,
203
+ show_default=True,
204
+ help="The number of Machines to run on.",
205
+ )
206
+ @click.option(
207
+ "--machine",
208
+ default="CPU",
209
+ show_default=True,
210
+ type=click.Choice(_MACHINE_VALUES),
211
+ help="The machine type to run the job on.",
212
+ )
213
+ @click.option(
214
+ "--command",
215
+ default=None,
216
+ help=(
217
+ "The command to run inside your job. "
218
+ "Required if using a studio. "
219
+ "Optional if using an image. "
220
+ "If not provided for images, will run the container entrypoint and default command."
221
+ ),
222
+ )
223
+ @click.option(
224
+ "--studio",
225
+ default=None,
226
+ help="The studio env to run the multi-machine job with. Mutually exclusive with image.",
227
+ )
228
+ @click.option(
229
+ "--image",
230
+ default=None,
231
+ help="The docker image to run the multi-machine job with. Mutually exclusive with studio.",
232
+ )
233
+ @click.option(
234
+ "--teamspace",
235
+ default=None,
236
+ help="The teamspace the job should be associated with. Defaults to the current teamspace.",
237
+ )
238
+ @click.option(
239
+ "--org",
240
+ default=None,
241
+ help="The organization owning the teamspace (if any). Defaults to the current organization.",
242
+ )
243
+ @click.option("--user", default=None, help="The user owning the teamspace (if any). Defaults to the current user.")
244
+ @click.option(
245
+ "--cloud-account",
246
+ "--cloud_account",
247
+ default=None,
248
+ help=(
249
+ "The cloud account to run the job on. "
250
+ "Defaults to the studio cloud account if running with studio compute env. "
251
+ "If not provided will fall back to the teamspaces default cloud account."
252
+ ),
253
+ )
254
+ @click.option(
255
+ "--env",
256
+ "-e",
257
+ default=[""],
258
+ help=("Environment variable to set inside the job. Should be of format KEY=VALUE"),
259
+ multiple=True,
260
+ )
261
+ @click.option(
262
+ "--interruptible",
263
+ is_flag=True,
264
+ flag_value=True,
265
+ default=False,
266
+ help="Whether the job should run on interruptible instances. They are cheaper but can be preempted.",
267
+ )
268
+ @click.option(
269
+ "--image-credentials",
270
+ "--image_credentials",
271
+ default=None,
272
+ help=(
273
+ "The credentials used to pull the image. "
274
+ "Required if the image is private. "
275
+ "This should be the name of the respective credentials secret created on the Lightning AI platform."
276
+ ),
277
+ )
278
+ @click.option(
279
+ "--cloud-account-auth",
280
+ "--cloud_account_auth",
281
+ is_flag=True,
282
+ default=False,
283
+ help=(
284
+ "Whether to authenticate with the cloud account to pull the image. "
285
+ "Required if the registry is part of a cloud provider (e.g. ECR)."
286
+ ),
287
+ )
288
+ @click.option(
289
+ "--entrypoint",
290
+ default="sh -c",
291
+ show_default=True,
292
+ help=(
293
+ "The entrypoint of your docker container. "
294
+ "Default runs the provided command in a standard shell. "
295
+ "To use the pre-defined entrypoint of the provided image, set this to an empty string. "
296
+ "Only applicable when submitting docker jobs."
297
+ ),
298
+ )
299
+ @click.option(
300
+ "--path-mapping",
301
+ "--path_mapping",
302
+ default=[""],
303
+ help=(
304
+ "Maps path inside of containers to paths inside data-connections. "
305
+ "Should be of form <CONTAINER_PATH_1>:<CONNECTION_NAME_1>:<PATH_WITHIN_CONNECTION_1> and "
306
+ "omitting the path inside the connection defaults to the connections root. "
307
+ "Can be specified multiple times for multiple mappings"
308
+ ),
309
+ multiple=True,
310
+ )
311
+ # this is for backwards compatibility only
312
+ @click.option(
313
+ "--path-mappings",
314
+ "--path_mappings",
315
+ default="",
316
+ help=(
317
+ "Maps path inside of containers to paths inside data-connections. "
318
+ "Should be a comma separated list of form: "
319
+ "<MAPPING_1>,<MAPPING_2>,... "
320
+ "where each mapping is of the form "
321
+ "<CONTAINER_PATH_1>:<CONNECTION_NAME_1>:<PATH_WITHIN_CONNECTION_1> and "
322
+ "omitting the path inside the connection defaults to the connections root. "
323
+ "Instead of a comma-separated list, consider passing --path-mapping multiple times."
324
+ ),
325
+ )
326
+ def mmt(
327
+ name: Optional[str] = None,
328
+ num_machines: int = 2,
329
+ machine: str = "CPU",
330
+ command: Optional[str] = None,
331
+ studio: Optional[str] = None,
332
+ image: Optional[str] = None,
333
+ teamspace: Optional[str] = None,
334
+ org: Optional[str] = None,
335
+ user: Optional[str] = None,
336
+ cloud_account: Optional[str] = None,
337
+ env: Sequence[str] = (),
338
+ interruptible: bool = False,
339
+ image_credentials: Optional[str] = None,
340
+ cloud_account_auth: bool = False,
341
+ entrypoint: str = "sh -c",
342
+ path_mapping: Sequence[str] = (),
343
+ path_mappings: str = "",
344
+ artifacts_local: Optional[str] = None,
345
+ artifacts_remote: Optional[str] = None,
346
+ ) -> None:
347
+ """Run async workloads on multiple machines using a docker image."""
348
+ if name is None:
349
+ from datetime import datetime
350
+
351
+ timestr = datetime.now().strftime("%b-%d-%H_%M")
352
+ name = f"mmt-{timestr}"
353
+
354
+ if machine is None:
355
+ # TODO: infer from studio
356
+ machine = "CPU"
357
+ machine_enum: Union[str, Machine]
358
+ try:
359
+ machine_enum = getattr(Machine, machine.upper(), Machine(machine, machine))
360
+ except KeyError:
361
+ machine_enum = machine
362
+
363
+ resolved_teamspace = Teamspace(name=teamspace, org=org, user=user)
364
+
365
+ path_mappings_dict = _resolve_path_mapping(path_mappings=path_mappings)
366
+ for mapping in path_mapping:
367
+ path_mappings_dict.update(_resolve_path_mapping(path_mappings=mapping))
368
+
369
+ env_dict = {}
370
+ for e in env:
371
+ env_dict.update(_resolve_envs(e))
372
+
373
+ MMT.run(
374
+ name=name,
375
+ num_machines=num_machines,
376
+ machine=machine_enum,
377
+ command=command,
378
+ studio=studio,
379
+ image=image,
380
+ teamspace=resolved_teamspace,
381
+ org=org,
382
+ user=user,
383
+ cloud_account=cloud_account,
384
+ env=env_dict,
385
+ interruptible=interruptible,
386
+ image_credentials=image_credentials,
387
+ cloud_account_auth=cloud_account_auth,
388
+ entrypoint=entrypoint,
389
+ path_mappings=path_mappings_dict,
390
+ artifacts_local=artifacts_local,
391
+ artifacts_remote=artifacts_remote,
392
+ )
393
+
394
+
395
+ def _resolve_path_mapping(path_mappings: str) -> Dict[str, str]:
396
+ path_mappings = path_mappings.strip()
397
+
398
+ if not path_mappings:
399
+ return {}
400
+
401
+ path_mappings_dict = {}
402
+ for mapping in path_mappings.split(","):
403
+ if not mapping.strip():
404
+ continue
405
+
406
+ splits = str(mapping).split(":", 1)
407
+ if len(splits) != 2:
408
+ raise RuntimeError(
409
+ "Mapping needs to be of form <CONTAINER_PATH>:<CONNECTION_NAME>[:<PATH_WITHIN_CONNECTION>], "
410
+ f"but got {mapping}"
411
+ )
412
+
413
+ path_mappings_dict[splits[0].strip()] = splits[1].strip()
414
+
415
+ return path_mappings_dict
416
+
417
+
418
+ def _resolve_envs(envs: str) -> Dict[str, str]:
419
+ if not envs:
420
+ return {}
421
+
422
+ # backwards compatibility for supporting env as json dict
423
+ try:
424
+ env_dict = json.loads(envs)
425
+ if isinstance(env_dict, Mapping):
426
+ return dict(env_dict)
427
+
428
+ raise ValueError(f"Env {envs} cannot be parsed as environment variable")
429
+ except json.decoder.JSONDecodeError as e:
430
+ # resolve individual env vars
431
+ env_dict = {}
432
+ splits = envs.split("=", 1)
433
+ if len(splits) == 2:
434
+ key, value = splits
435
+ env_dict.update({key: value})
436
+
437
+ return env_dict
438
+
439
+ raise ValueError(f"Env {envs} cannot be parsed as environment variable: {e!s}") from e
440
+
441
+ return {}