zenml-nightly 0.82.1.dev20250528__py3-none-any.whl → 0.83.0.dev20250529__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- zenml/VERSION +1 -1
- zenml/integrations/skypilot/flavors/skypilot_orchestrator_base_vm_config.py +45 -4
- zenml/integrations/skypilot/orchestrators/skypilot_base_vm_orchestrator.py +92 -84
- zenml/integrations/skypilot/orchestrators/skypilot_orchestrator_entrypoint.py +207 -179
- zenml/integrations/skypilot/utils.py +273 -0
- zenml/integrations/skypilot_aws/__init__.py +1 -2
- zenml/integrations/skypilot_azure/__init__.py +1 -2
- zenml/integrations/skypilot_gcp/__init__.py +9 -1
- zenml/integrations/skypilot_kubernetes/__init__.py +2 -3
- zenml/integrations/skypilot_lambda/__init__.py +1 -2
- zenml/zen_stores/migrations/versions/0.83.0_release.py +23 -0
- {zenml_nightly-0.82.1.dev20250528.dist-info → zenml_nightly-0.83.0.dev20250529.dist-info}/METADATA +2 -2
- {zenml_nightly-0.82.1.dev20250528.dist-info → zenml_nightly-0.83.0.dev20250529.dist-info}/RECORD +16 -14
- {zenml_nightly-0.82.1.dev20250528.dist-info → zenml_nightly-0.83.0.dev20250529.dist-info}/LICENSE +0 -0
- {zenml_nightly-0.82.1.dev20250528.dist-info → zenml_nightly-0.83.0.dev20250529.dist-info}/WHEEL +0 -0
- {zenml_nightly-0.82.1.dev20250528.dist-info → zenml_nightly-0.83.0.dev20250529.dist-info}/entry_points.txt +0 -0
zenml/VERSION
CHANGED
@@ -1 +1 @@
-0.82.1.dev20250528
+0.83.0.dev20250529
zenml/integrations/skypilot/flavors/skypilot_orchestrator_base_vm_config.py
CHANGED
@@ -13,7 +13,7 @@
 # permissions and limitations under the License.
 """Skypilot orchestrator base config and settings."""
 
-from typing import Dict, List, Literal, Optional, Union
+from typing import Any, Dict, List, Literal, Optional, Union
 
 from pydantic import Field
 
@@ -67,6 +67,14 @@ class SkypilotBaseOrchestratorSettings(BaseSettings):
         disk_size: the size of the OS disk in GiB.
         disk_tier: the disk performance tier to use. If None, defaults to
             ``'medium'``.
+        ports: Ports to expose. Could be an integer, a range, or a list of
+            integers and ranges. All ports will be exposed to the public internet.
+        labels: Labels to apply to instances as key-value pairs. These are
+            mapped to cloud-specific implementations (instance tags in AWS,
+            instance labels in GCP, etc.)
+        any_of: List of candidate resources to try in order of preference based on
+            cost (determined by the optimizer).
+        ordered: List of candidate resources to try in the specified order.
 
         cluster_name: name of the cluster to create/reuse. If None,
             auto-generate a name.
@@ -88,6 +96,19 @@ class SkypilotBaseOrchestratorSettings(BaseSettings):
         stream_logs: if True, show the logs in the terminal.
         docker_run_args: Optional arguments to pass to the `docker run` command
             running inside the VM.
+        workdir: Working directory to sync to the VM. Synced to ~/sky_workdir.
+        task_name: Task name used for display purposes.
+        file_mounts: File and storage mounts configuration for remote cluster.
+        envs: Environment variables for the task.
+        task_settings: Dictionary of arbitrary settings to pass to sky.Task().
+            This allows passing future parameters added by SkyPilot without
+            requiring updates to ZenML.
+        resources_settings: Dictionary of arbitrary settings to pass to
+            sky.Resources(). This allows passing future parameters added
+            by SkyPilot without requiring updates to ZenML.
+        launch_settings: Dictionary of arbitrary settings to pass to
+            sky.launch(). This allows passing future parameters added
+            by SkyPilot without requiring updates to ZenML.
     """
 
     # Resources
@@ -103,14 +124,18 @@ class SkypilotBaseOrchestratorSettings(BaseSettings):
     )
     accelerator_args: Optional[Dict[str, str]] = None
     use_spot: Optional[bool] = None
-    job_recovery:
+    job_recovery: Union[None, str, Dict[str, Any]] = Field(
+        default=None, union_mode="left_to_right"
+    )
     region: Optional[str] = None
     zone: Optional[str] = None
     image_id: Union[Dict[str, str], str, None] = Field(
         default=None, union_mode="left_to_right"
     )
     disk_size: Optional[int] = None
-    disk_tier: Optional[Literal["high", "medium", "low"]] =
+    disk_tier: Optional[Literal["high", "medium", "low", "ultra", "best"]] = (
+        None
+    )
 
     # Run settings
     cluster_name: Optional[str] = None
@@ -118,9 +143,25 @@ class SkypilotBaseOrchestratorSettings(BaseSettings):
     idle_minutes_to_autostop: Optional[int] = 30
     down: bool = True
     stream_logs: bool = True
-
     docker_run_args: List[str] = []
 
+    # Additional SkyPilot features
+    ports: Union[None, int, str, List[Union[int, str]]] = Field(
+        default=None, union_mode="left_to_right"
+    )
+    labels: Optional[Dict[str, str]] = None
+    any_of: Optional[List[Dict[str, Any]]] = None
+    ordered: Optional[List[Dict[str, Any]]] = None
+    workdir: Optional[str] = None
+    task_name: Optional[str] = None
+    file_mounts: Optional[Dict[str, Any]] = None
+    envs: Optional[Dict[str, str]] = None
+
+    # Future-proofing settings dictionaries
+    task_settings: Dict[str, Any] = {}
+    resources_settings: Dict[str, Any] = {}
+    launch_settings: Dict[str, Any] = {}
+
 
 class SkypilotBaseOrchestratorConfig(
     BaseOrchestratorConfig, SkypilotBaseOrchestratorSettings
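For orientation, here is a minimal, illustrative sketch of how the new settings fields could be supplied from user code. The pipeline name, field values, and the generic "orchestrator" settings key are assumptions for the example, not part of this diff:

    # Illustrative only: exercises the new fields added in this release.
    from zenml import pipeline
    from zenml.integrations.skypilot.flavors.skypilot_orchestrator_base_vm_config import (
        SkypilotBaseOrchestratorSettings,
    )

    skypilot_settings = SkypilotBaseOrchestratorSettings(
        ports=8080,                            # new: exposed to the public internet
        labels={"team": "ml-platform"},        # new: mapped to instance tags/labels
        workdir="./src",                       # new: synced to ~/sky_workdir
        envs={"MY_ENV_VAR": "value"},          # new: task environment variables
        # Future-proofing dictionaries: forwarded verbatim to sky.Task(),
        # sky.Resources() and sky.launch() respectively, so keys must be valid
        # kwargs for the installed SkyPilot version.
        task_settings={"num_nodes": 1},
        launch_settings={"retry_until_up": True},
    )

    @pipeline(settings={"orchestrator": skypilot_settings})
    def my_pipeline() -> None:
        ...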
zenml/integrations/skypilot/orchestrators/skypilot_base_vm_orchestrator.py
CHANGED
@@ -14,12 +14,12 @@
 """Implementation of the Skypilot base VM orchestrator."""
 
 import os
-import re
 from abc import abstractmethod
 from typing import TYPE_CHECKING, Any, Dict, Optional, Tuple, cast
 from uuid import uuid4
 
 import sky
+from sky import StatusRefreshMode
 
 from zenml.entrypoints import PipelineEntrypointConfiguration
 from zenml.enums import StackComponentType
@@ -31,6 +31,15 @@ from zenml.integrations.skypilot.flavors.skypilot_orchestrator_base_vm_config im
 from zenml.integrations.skypilot.orchestrators.skypilot_orchestrator_entrypoint_configuration import (
     SkypilotOrchestratorEntrypointConfiguration,
 )
+from zenml.integrations.skypilot.utils import (
+    create_docker_run_command,
+    prepare_docker_setup,
+    prepare_launch_kwargs,
+    prepare_resources_kwargs,
+    prepare_task_kwargs,
+    sanitize_cluster_name,
+    sky_job_get,
+)
 from zenml.logger import get_logger
 from zenml.orchestrators import (
     ContainerizedOrchestrator,
@@ -252,32 +261,21 @@ class SkypilotBaseOrchestrator(ContainerizedOrchestrator):
         entrypoint_str = " ".join(command)
         arguments_str = " ".join(args)
 
-        task_envs = environment
-        docker_environment_str = " ".join(
-            f"-e {k}={v}" for k, v in environment.items()
-        )
-        custom_run_args = " ".join(settings.docker_run_args)
-        if custom_run_args:
-            custom_run_args += " "
-
-        instance_type = settings.instance_type or self.DEFAULT_INSTANCE_TYPE
+        task_envs = environment.copy()
 
         # Set up credentials
         self.setup_credentials()
 
-        #
-
+        # Prepare Docker setup
+        setup, docker_creds_envs = prepare_docker_setup(
+            container_registry_uri=stack.container_registry.config.uri,
+            credentials=stack.container_registry.credentials,
+            use_sudo=True,  # Base orchestrator uses sudo
+        )
 
-
-
-
-            f"sudo docker login --username $DOCKER_USERNAME --password "
-            f"$DOCKER_PASSWORD {stack.container_registry.config.uri}"
-        )
-            task_envs["DOCKER_USERNAME"] = docker_username
-            task_envs["DOCKER_PASSWORD"] = docker_password
-        else:
-            setup = None
+        # Update task_envs with Docker credentials
+        if docker_creds_envs:
+            task_envs.update(docker_creds_envs)
 
         # Run the entire pipeline
 
@@ -291,45 +289,49 @@ class SkypilotBaseOrchestrator(ContainerizedOrchestrator):
                 down = False
                 idle_minutes_to_autostop = None
             else:
-                run_command =
+                run_command = create_docker_run_command(
+                    image=image,
+                    entrypoint_str=entrypoint_str,
+                    arguments_str=arguments_str,
+                    environment=task_envs,
+                    docker_run_args=settings.docker_run_args,
+                    use_sudo=True,  # Base orchestrator uses sudo
+                )
                 down = settings.down
                 idle_minutes_to_autostop = settings.idle_minutes_to_autostop
-
-
+
+            # Create the Task with all parameters and task settings
+            task_kwargs = prepare_task_kwargs(
+                settings=settings,
+                run_command=run_command,
                 setup=setup,
-
+                task_envs=task_envs,
+                task_name=f"{orchestrator_run_name}",
             )
+
+            task = sky.Task(**task_kwargs)
            logger.debug(f"Running run: {run_command}")
 
-
-
-
-
-
-
-
-
-                use_spot=settings.use_spot,
-                job_recovery=settings.job_recovery,
-                region=settings.region,
-                zone=settings.zone,
-                image_id=image
-                if isinstance(self.cloud, sky.clouds.Kubernetes)
-                else settings.image_id,
-                disk_size=settings.disk_size,
-                disk_tier=settings.disk_tier,
-            )
+            # Set resources with all parameters and resource settings
+            resources_kwargs = prepare_resources_kwargs(
+                cloud=self.cloud,
+                settings=settings,
+                default_instance_type=self.DEFAULT_INSTANCE_TYPE,
+                kubernetes_image=image
+                if isinstance(self.cloud, sky.clouds.Kubernetes)
+                else None,
            )
-
-
-            # Could also be a parameter in the settings to control this behavior
-            detach_run = not settings.stream_logs
+
+            task = task.set_resources(sky.Resources(**resources_kwargs))
 
             launch_new_cluster = True
             if settings.cluster_name:
-
-                    refresh=
+                status_request_id = sky.status(
+                    refresh=StatusRefreshMode.AUTO,
+                    cluster_names=[settings.cluster_name],
                )
+                cluster_info = sky.stream_and_get(status_request_id)
+
                if cluster_info:
                    logger.info(
                        f"Found existing cluster {settings.cluster_name}. Reusing..."
@@ -342,7 +344,7 @@ class SkypilotBaseOrchestrator(ContainerizedOrchestrator):
                     )
                     cluster_name = settings.cluster_name
             else:
-                cluster_name =
+                cluster_name = sanitize_cluster_name(
                     f"{orchestrator_run_name}"
                 )
                 logger.info(
@@ -350,33 +352,55 @@ class SkypilotBaseOrchestrator(ContainerizedOrchestrator):
                 )
 
             if launch_new_cluster:
-
+                # Prepare launch parameters with additional launch settings
+                launch_kwargs = prepare_launch_kwargs(
+                    settings=settings,
+                    down=down,
+                    idle_minutes_to_autostop=idle_minutes_to_autostop,
+                )
+                logger.info(
+                    f"Launching the task on a new cluster: {cluster_name}"
+                )
+                launch_job_id = sky.launch(
                     task,
                     cluster_name,
-
-                    idle_minutes_to_autostop=idle_minutes_to_autostop,
-                    down=down,
-                    stream_logs=settings.stream_logs,
-                    backend=None,
-                    detach_setup=True,
-                    detach_run=detach_run,
+                    **launch_kwargs,
                 )
+                sky_job_get(launch_job_id, settings.stream_logs, cluster_name)
+
             else:
-                #
-
-
+                # Prepare exec parameters with additional launch settings
+                exec_kwargs = {
+                    "down": down,
+                    "backend": None,
+                    **settings.launch_settings,  # Can reuse same settings for exec
+                }
+
+                # Remove None values to avoid overriding SkyPilot defaults
+                exec_kwargs = {
+                    k: v for k, v in exec_kwargs.items() if v is not None
+                }
+
+                # Make sure the cluster is up
+                start_request_id = sky.start(
                     settings.cluster_name,
                     down=down,
                     idle_minutes_to_autostop=idle_minutes_to_autostop,
                     retry_until_up=settings.retry_until_up,
                 )
-                sky.
+                sky.stream_and_get(start_request_id)
+
+                logger.info(
+                    f"Executing the task on the cluster: {settings.cluster_name}"
+                )
+                exec_job_id = sky.exec(
                     task,
-                    settings.cluster_name,
-
-
-
-
+                    cluster_name=settings.cluster_name,
+                    **exec_kwargs,
+                )
+                assert settings.cluster_name is not None
+                sky_job_get(
+                    exec_job_id, settings.stream_logs, settings.cluster_name
                 )
 
         except Exception as e:
@@ -386,19 +410,3 @@ class SkypilotBaseOrchestrator(ContainerizedOrchestrator):
         finally:
             # Unset the service connector AWS profile ENV variable
             self.prepare_environment_variable(set=False)
-
-    def sanitize_cluster_name(self, name: str) -> str:
-        """Sanitize the value to be used in a cluster name.
-
-        Args:
-            name: Arbitrary input cluster name.
-
-        Returns:
-            Sanitized cluster name.
-        """
-        name = re.sub(
-            r"[^a-z0-9-]", "-", name.lower()
-        )  # replaces any character that is not a lowercase letter, digit, or hyphen with a hyphen
-        name = re.sub(r"^[-]+", "", name)  # trim leading hyphens
-        name = re.sub(r"[-]+$", "", name)  # trim trailing hyphens
-        return name
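The core of this change is the move from the old blocking SkyPilot calls to the asynchronous client API, in which sky.launch, sky.exec, and sky.status return a request ID that must be resolved explicitly. A minimal sketch of that pattern, assuming SkyPilot ~0.9; the cluster name and task definition are placeholders:

    # Minimal sketch of the async request-ID pattern used above.
    import sky
    from sky import StatusRefreshMode

    task = sky.Task(run="echo hello", name="demo-task")
    task = task.set_resources(sky.Resources(cpus="2"))

    request_id = sky.launch(task, "my-cluster", down=True)   # returns immediately
    job_id, _ = sky.stream_and_get(request_id)               # block and stream provisioning logs
    exit_code = sky.tail_logs(cluster_name="my-cluster", job_id=job_id, follow=True)
    if exit_code != 0:
        raise RuntimeError(f"SkyPilot job {job_id} failed with status {exit_code}")

    # Cluster status checks follow the same pattern:
    status_request = sky.status(
        refresh=StatusRefreshMode.AUTO, cluster_names=["my-cluster"]
    )
    cluster_records = sky.stream_and_get(status_request)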
zenml/integrations/skypilot/orchestrators/skypilot_orchestrator_entrypoint.py
CHANGED
@@ -32,8 +32,20 @@ from zenml.integrations.skypilot.orchestrators.skypilot_base_vm_orchestrator imp
     ENV_ZENML_SKYPILOT_ORCHESTRATOR_RUN_ID,
     SkypilotBaseOrchestrator,
 )
+from zenml.integrations.skypilot.utils import (
+    create_docker_run_command,
+    prepare_docker_setup,
+    prepare_launch_kwargs,
+    prepare_resources_kwargs,
+    prepare_task_kwargs,
+    sanitize_cluster_name,
+    sky_job_get,
+)
 from zenml.logger import get_logger
-from zenml.orchestrators.dag_runner import ThreadedDagRunner
+from zenml.orchestrators.dag_runner import NodeStatus, ThreadedDagRunner
+from zenml.orchestrators.publish_utils import (
+    publish_failed_pipeline_run,
+)
 from zenml.orchestrators.utils import get_config_environment_vars
 
 logger = get_logger(__name__)
@@ -65,212 +77,228 @@ def main() -> None:
         TypeError: If the active stack's orchestrator is not an instance of
             SkypilotBaseOrchestrator.
         ValueError: If the active stack's container registry is None.
+        Exception: If the orchestration or one of the steps fails.
     """
     # Log to the container's stdout so it can be streamed by the client.
     logger.info("Skypilot orchestrator VM started.")
 
     # Parse / extract args.
     args = parse_args()
-
     orchestrator_run_id = socket.gethostname()
 
-
-
-    pipeline_dag = {
-        step_name: step.spec.upstream_steps
-        for step_name, step in deployment.step_configurations.items()
-    }
-    step_command = StepEntrypointConfiguration.get_entrypoint_command()
-    entrypoint_str = " ".join(step_command)
-
-    active_stack = Client().active_stack
-
-    orchestrator = active_stack.orchestrator
-    if not isinstance(orchestrator, SkypilotBaseOrchestrator):
-        raise TypeError(
-            "The active stack's orchestrator is not an instance of SkypilotBaseOrchestrator."
-        )
-
-    # Set up credentials
-    orchestrator.setup_credentials()
-
-    # Set the service connector AWS profile ENV variable
-    orchestrator.prepare_environment_variable(set=True)
+    run = None
 
-
-
-    if container_registry is None:
-        raise ValueError("Container registry cannot be None.")
+    try:
+        deployment = Client().get_deployment(args.deployment_id)
 
-
-
-
-            f"docker login --username $DOCKER_USERNAME --password "
-            f"$DOCKER_PASSWORD {container_registry.config.uri}"
-        )
-        task_envs = {
-            "DOCKER_USERNAME": docker_username,
-            "DOCKER_PASSWORD": docker_password,
+        pipeline_dag = {
+            step_name: step.spec.upstream_steps
+            for step_name, step in deployment.step_configurations.items()
         }
-
-
-        task_envs = None
-
-    unique_resource_configs: Dict[str, str] = {}
-    for step_name, step in deployment.step_configurations.items():
-        settings = cast(
-            SkypilotBaseOrchestratorSettings,
-            orchestrator.get_settings(step),
-        )
-        # Handle both str and Dict[str, int] types for accelerators
-        if isinstance(settings.accelerators, dict):
-            accelerators_hashable = frozenset(settings.accelerators.items())
-        elif isinstance(settings.accelerators, str):
-            accelerators_hashable = frozenset({(settings.accelerators, 1)})
-        else:
-            accelerators_hashable = None
-        resource_config = (
-            settings.instance_type,
-            settings.cpus,
-            settings.memory,
-            settings.disk_size,  # Assuming disk_size is part of the settings
-            settings.disk_tier,  # Assuming disk_tier is part of the settings
-            settings.use_spot,
-            settings.job_recovery,
-            settings.region,
-            settings.zone,
-            accelerators_hashable,
-        )
-        cluster_name_parts = [
-            orchestrator.sanitize_cluster_name(str(part))
-            for part in resource_config
-            if part is not None
-        ]
-        cluster_name = f"cluster-{orchestrator_run_id}" + "-".join(
-            cluster_name_parts
-        )
-        unique_resource_configs[step_name] = cluster_name
-
-    run = Client().list_pipeline_runs(
-        sort_by="asc:created",
-        size=1,
-        deployment_id=args.deployment_id,
-        status=ExecutionStatus.INITIALIZING,
-    )[0]
+        step_command = StepEntrypointConfiguration.get_entrypoint_command()
+        entrypoint_str = " ".join(step_command)
 
-
+        active_stack = Client().active_stack
 
-
-
+        orchestrator = active_stack.orchestrator
+        if not isinstance(orchestrator, SkypilotBaseOrchestrator):
+            raise TypeError(
+                "The active stack's orchestrator is not an instance of SkypilotBaseOrchestrator."
+            )
 
-
-
-    """
-        cluster_name = unique_resource_configs[step_name]
+        # Set up credentials
+        orchestrator.setup_credentials()
 
-
-
-        )
+        # Set the service connector AWS profile ENV variable
+        orchestrator.prepare_environment_variable(set=True)
 
-
-
-
-
+        # get active container registry
+        container_registry = active_stack.container_registry
+        if container_registry is None:
+            raise ValueError("Container registry cannot be None.")
 
-
-
-
-
+        # Prepare Docker setup
+        setup, task_envs = prepare_docker_setup(
+            container_registry_uri=container_registry.config.uri,
+            credentials=container_registry.credentials,
+            use_sudo=False,  # Entrypoint doesn't use sudo
        )
-        env = get_config_environment_vars()
-        env[ENV_ZENML_SKYPILOT_ORCHESTRATOR_RUN_ID] = orchestrator_run_id
 
-
-
-
-
-
-            custom_run_args += " "
-
-        # Set up the task
-        run_command = f"docker run --rm {custom_run_args}{docker_environment_str} {image} {entrypoint_str} {arguments_str}"
-        task_name = f"{deployment.id}-{step_name}-{time.time()}"
-        task = sky.Task(
-            run=run_command,
-            setup=setup,
-            envs=task_envs,
-            name=task_name,
-        )
-        task = task.set_resources(
-            sky.Resources(
-                cloud=orchestrator.cloud,
-                instance_type=settings.instance_type
-                or orchestrator.DEFAULT_INSTANCE_TYPE,
-                cpus=settings.cpus,
-                memory=settings.memory,
-                disk_size=settings.disk_size,
-                disk_tier=settings.disk_tier,
-                accelerators=settings.accelerators,
-                accelerator_args=settings.accelerator_args,
-                use_spot=settings.use_spot,
-                job_recovery=settings.job_recovery,
-                region=settings.region,
-                zone=settings.zone,
-                image_id=settings.image_id,
+        unique_resource_configs: Dict[str, str] = {}
+        for step_name, step in deployment.step_configurations.items():
+            settings = cast(
+                SkypilotBaseOrchestratorSettings,
+                orchestrator.get_settings(step),
            )
-
+            # Handle both str and Dict[str, int] types for accelerators
+            if isinstance(settings.accelerators, dict):
+                accelerators_hashable = frozenset(
+                    settings.accelerators.items()
+                )
+            elif isinstance(settings.accelerators, str):
+                accelerators_hashable = frozenset({(settings.accelerators, 1)})
+            else:
+                accelerators_hashable = None
+            resource_config = (
+                settings.instance_type,
+                settings.cpus,
+                settings.memory,
+                settings.disk_size,  # Assuming disk_size is part of the settings
+                settings.disk_tier,  # Assuming disk_tier is part of the settings
+                settings.use_spot,
+                settings.job_recovery,
+                settings.region,
+                settings.zone,
+                accelerators_hashable,
+            )
+            cluster_name_parts = [
+                sanitize_cluster_name(str(part))
+                for part in resource_config
+                if part is not None
+            ]
+            cluster_name = f"cluster-{orchestrator_run_id}" + "-".join(
+                cluster_name_parts
+            )
+            unique_resource_configs[step_name] = cluster_name
 
-
-
-
-
-
-
-            stream_logs=settings.stream_logs,
-            detach_setup=True,
-            detach_run=True,
-        )
+        run = Client().list_pipeline_runs(
+            sort_by="asc:created",
+            size=1,
+            deployment_id=args.deployment_id,
+            status=ExecutionStatus.INITIALIZING,
+        )[0]
 
-
-        logger.info(f"Waiting for pod of step `{step_name}` to start...")
+        logger.info("Fetching pipeline run: %s", run.id)
 
-
+        def run_step_on_skypilot_vm(step_name: str) -> None:
+            """Run a pipeline step in a separate Skypilot VM.
 
-
-
-
-
+            Args:
+                step_name: Name of the step.
+
+            Raises:
+                Exception: If the step execution fails.
+            """
+            logger.info(f"Running step `{step_name}` on a VM...")
             try:
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+                cluster_name = unique_resource_configs[step_name]
+
+                image = SkypilotBaseOrchestrator.get_image(
+                    deployment=deployment, step_name=step_name
+                )
+
+                step_args = (
+                    StepEntrypointConfiguration.get_entrypoint_arguments(
+                        step_name=step_name, deployment_id=deployment.id
+                    )
+                )
+                arguments_str = " ".join(step_args)
+
+                step = deployment.step_configurations[step_name]
+                settings = cast(
+                    SkypilotBaseOrchestratorSettings,
+                    orchestrator.get_settings(step),
+                )
+                env = get_config_environment_vars()
+                env[ENV_ZENML_SKYPILOT_ORCHESTRATOR_RUN_ID] = (
+                    orchestrator_run_id
+                )
+
+                # Create the Docker run command
+                run_command = create_docker_run_command(
+                    image=image,
+                    entrypoint_str=entrypoint_str,
+                    arguments_str=arguments_str,
+                    environment=env,
+                    docker_run_args=settings.docker_run_args,
+                    use_sudo=False,  # Entrypoint doesn't use sudo
+                )
+
+                task_name = f"{deployment.id}-{step_name}-{time.time()}"
+
+                # Create task kwargs
+                task_kwargs = prepare_task_kwargs(
+                    settings=settings,
+                    run_command=run_command,
+                    setup=setup,
+                    task_envs=task_envs,
+                    task_name=task_name,
+                )
+
+                task = sky.Task(**task_kwargs)
+
+                # Set resources
+                resources_kwargs = prepare_resources_kwargs(
+                    cloud=orchestrator.cloud,
+                    settings=settings,
+                    default_instance_type=orchestrator.DEFAULT_INSTANCE_TYPE,
+                )
+
+                task = task.set_resources(sky.Resources(**resources_kwargs))
+
+                # Prepare launch parameters
+                launch_kwargs = prepare_launch_kwargs(
+                    settings=settings,
+                )
+
+                # sky.launch now returns a request ID (async). Capture it so we can
+                # optionally stream logs and block until completion when desired.
+                launch_request_id = sky.launch(
+                    task,
+                    cluster_name,
+                    **launch_kwargs,
+                )
+                sky_job_get(launch_request_id, True, cluster_name)
+
+                # Pop the resource configuration for this step
+                unique_resource_configs.pop(step_name)
+
+                if cluster_name in unique_resource_configs.values():
+                    # If there are more steps using this configuration, skip deprovisioning the cluster
+                    logger.info(
+                        f"Resource configuration for cluster '{cluster_name}' "
+                        "is used by subsequent steps. Skipping the deprovisioning of "
+                        "the cluster."
+                    )
+                else:
+                    # If there are no more steps using this configuration, down the cluster
+                    logger.info(
+                        f"Resource configuration for cluster '{cluster_name}' "
+                        "is not used by subsequent steps. deprovisioning the cluster."
+                    )
+                    down_request_id = sky.down(cluster_name)
+                    # Wait for the cluster to be terminated
+                    sky.stream_and_get(down_request_id)
+
+                logger.info(
+                    f"Running step `{step_name}` on a VM is completed."
+                )
+
+            except Exception as e:
+                logger.error(f"Failed while launching step `{step_name}`: {e}")
+                raise
+
+        dag_runner = ThreadedDagRunner(
+            dag=pipeline_dag, run_fn=run_step_on_skypilot_vm
+        )
+        dag_runner.run()
+
+        failed_nodes = []
+        for node in dag_runner.nodes:
+            if dag_runner.node_states[node] == NodeStatus.FAILED:
+                failed_nodes.append(node)
 
-
+        if failed_nodes:
+            raise Exception(f"One or more steps failed: {failed_nodes}")
 
-
+    except Exception as e:
+        logger.error(f"Orchestrator failed: {e}")
 
-
+        # Try to mark the pipeline run as failed
+        if run:
+            publish_failed_pipeline_run(run.id)
+            logger.info("Marked pipeline run as failed in ZenML.")
+        raise
 
 
 if __name__ == "__main__":
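The entrypoint drives one VM per step from a plain adjacency mapping and then inspects the runner's node states to decide whether the run failed. A toy illustration of the ThreadedDagRunner contract it relies on; the step names and run function are made up:

    # Toy illustration of the DAG contract built from deployment.step_configurations:
    # {step_name: [upstream_step_names]}.
    from zenml.orchestrators.dag_runner import NodeStatus, ThreadedDagRunner

    pipeline_dag = {
        "load_data": [],
        "train": ["load_data"],
        "evaluate": ["train"],
    }

    def run_step(step_name: str) -> None:
        print(f"running {step_name} on its own SkyPilot VM...")

    dag_runner = ThreadedDagRunner(dag=pipeline_dag, run_fn=run_step)
    dag_runner.run()

    failed = [
        node
        for node in dag_runner.nodes
        if dag_runner.node_states[node] == NodeStatus.FAILED
    ]
    if failed:
        raise Exception(f"One or more steps failed: {failed}")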
zenml/integrations/skypilot/utils.py
ADDED
@@ -0,0 +1,273 @@
+"""Utility functions for Skypilot orchestrators."""
+
+import re
+from typing import TYPE_CHECKING, Any, Dict, List, Optional, Tuple
+
+import sky
+
+from zenml.integrations.skypilot.flavors.skypilot_orchestrator_base_vm_config import (
+    SkypilotBaseOrchestratorSettings,
+)
+from zenml.logger import get_logger
+
+logger = get_logger(__name__)
+
+if TYPE_CHECKING:
+    from sky.clouds.cloud import Cloud
+
+
+def sanitize_cluster_name(name: str) -> str:
+    """Sanitize the value to be used in a cluster name.
+
+    Args:
+        name: Arbitrary input cluster name.
+
+    Returns:
+        Sanitized cluster name.
+    """
+    name = re.sub(
+        r"[^a-z0-9-]", "-", name.lower()
+    )  # replaces any character that is not a lowercase letter, digit, or hyphen with a hyphen
+    name = re.sub(r"^[-]+", "", name)  # trim leading hyphens
+    name = re.sub(r"[-]+$", "", name)  # trim trailing hyphens
+    return name
+
+
+def prepare_docker_setup(
+    container_registry_uri: str,
+    credentials: Optional[Tuple[str, str]] = None,
+    use_sudo: bool = True,
+) -> Tuple[Optional[str], Dict[str, str]]:
+    """Prepare Docker login setup command and environment variables.
+
+    Args:
+        container_registry_uri: URI of the container registry.
+        credentials: Optional credentials (username, password) tuple.
+        use_sudo: Whether to use sudo prefix in docker commands.
+
+    Returns:
+        Tuple of (setup command, environment variables)
+    """
+    if credentials:
+        docker_username, docker_password = credentials
+        sudo_prefix = "sudo " if use_sudo else ""
+        setup = (
+            f"{sudo_prefix}docker login --username $DOCKER_USERNAME --password "
+            f"$DOCKER_PASSWORD {container_registry_uri}"
+        )
+        task_envs = {
+            "DOCKER_USERNAME": docker_username,
+            "DOCKER_PASSWORD": docker_password,
+        }
+    else:
+        setup = None
+        task_envs = {}
+
+    return setup, task_envs
+
+
+def create_docker_run_command(
+    image: str,
+    entrypoint_str: str,
+    arguments_str: str,
+    environment: Dict[str, str],
+    docker_run_args: List[str],
+    use_sudo: bool = True,
+) -> str:
+    """Create a Docker run command string.
+
+    Args:
+        image: Docker image to run.
+        entrypoint_str: Entrypoint command.
+        arguments_str: Command arguments.
+        environment: Environment variables.
+        docker_run_args: Additional Docker run arguments.
+        use_sudo: Whether to use sudo prefix in docker commands.
+
+    Returns:
+        Docker run command as string.
+    """
+    docker_environment_str = " ".join(
+        f"-e {k}={v}" for k, v in environment.items()
+    )
+    custom_run_args = " ".join(docker_run_args)
+    if custom_run_args:
+        custom_run_args += " "
+
+    sudo_prefix = "sudo " if use_sudo else ""
+    return f"{sudo_prefix}docker run --rm {custom_run_args}{docker_environment_str} {image} {entrypoint_str} {arguments_str}"
+
+
+def prepare_task_kwargs(
+    settings: SkypilotBaseOrchestratorSettings,
+    run_command: str,
+    setup: Optional[str],
+    task_envs: Dict[str, str],
+    task_name: str,
+) -> Dict[str, Any]:
+    """Prepare task keyword arguments for sky.Task.
+
+    Args:
+        settings: Skypilot orchestrator settings.
+        run_command: Command to run.
+        setup: Setup command.
+        task_envs: Task environment variables.
+        task_name: Task name.
+
+    Returns:
+        Task keyword arguments dictionary.
+    """
+    # Merge envs from settings with existing task_envs
+    merged_envs = {}
+
+    # First add user-provided envs
+    if settings.envs:
+        merged_envs.update(settings.envs)
+
+    # Then add task_envs which take precedence
+    if task_envs:
+        merged_envs.update(task_envs)
+
+    task_kwargs = {
+        "run": run_command,
+        "setup": setup,
+        "envs": merged_envs,
+        "name": settings.task_name or task_name,
+        "workdir": settings.workdir,
+        "file_mounts_mapping": settings.file_mounts,
+        **settings.task_settings,  # Add any arbitrary task settings
+    }
+
+    # Remove None values to avoid overriding SkyPilot defaults
+    return {k: v for k, v in task_kwargs.items() if v is not None}
+
+
+def prepare_resources_kwargs(
+    cloud: "Cloud",
+    settings: SkypilotBaseOrchestratorSettings,
+    default_instance_type: Optional[str] = None,
+    kubernetes_image: Optional[str] = None,
+) -> Dict[str, Any]:
+    """Prepare resources keyword arguments for sky.Resources.
+
+    Args:
+        cloud: Skypilot cloud.
+        settings: Skypilot orchestrator settings.
+        default_instance_type: Default instance type.
+        kubernetes_image: Image to use for Kubernetes (if applicable).
+
+    Returns:
+        Resources keyword arguments dictionary.
+    """
+    resources_kwargs = {
+        "cloud": cloud,
+        "instance_type": settings.instance_type or default_instance_type,
+        "cpus": settings.cpus,
+        "memory": settings.memory,
+        "accelerators": settings.accelerators,
+        "accelerator_args": settings.accelerator_args,
+        "use_spot": settings.use_spot,
+        "job_recovery": settings.job_recovery,
+        "region": settings.region,
+        "zone": settings.zone,
+        "image_id": kubernetes_image
+        if kubernetes_image
+        else settings.image_id,
+        "disk_size": settings.disk_size,
+        "disk_tier": settings.disk_tier,
+        "ports": settings.ports,
+        "labels": settings.labels,
+        "any_of": settings.any_of,
+        "ordered": settings.ordered,
+        **settings.resources_settings,  # Add any arbitrary resource settings
+    }
+
+    # Remove None values to avoid overriding SkyPilot defaults
+    return {k: v for k, v in resources_kwargs.items() if v is not None}
+
+
+def prepare_launch_kwargs(
+    settings: SkypilotBaseOrchestratorSettings,
+    down: Optional[bool] = None,
+    idle_minutes_to_autostop: Optional[int] = None,
+) -> Dict[str, Any]:
+    """Prepare launch keyword arguments for sky.launch.
+
+    Args:
+        settings: Skypilot orchestrator settings.
+        down: Whether to tear down the cluster after job completion.
+        idle_minutes_to_autostop: Minutes to autostop after idleness.
+
+    Returns:
+        Launch keyword arguments dictionary.
+    """
+    # Determine values falling back to settings where applicable
+    down_value = down if down is not None else settings.down
+    idle_value = (
+        idle_minutes_to_autostop
+        if idle_minutes_to_autostop is not None
+        else settings.idle_minutes_to_autostop
+    )
+
+    # The following parameters were removed from sky.launch in versions > 0.8.
+    # We therefore no longer include them in the kwargs passed to the call.
+    # • stream_logs – handled by explicitly calling sky.stream_and_get
+    # • detach_setup / detach_run – setup/run are now detached by default
+
+    launch_kwargs = {
+        "retry_until_up": settings.retry_until_up,
+        "idle_minutes_to_autostop": idle_value,
+        "down": down_value,
+        "backend": None,
+        **settings.launch_settings,  # Keep user-provided extras
+    }
+
+    # Remove keys that are no longer supported by sky.launch.
+    for _deprecated in (
+        "stream_logs",
+        "detach_setup",
+        "detach_run",
+        "num_nodes",
+    ):
+        launch_kwargs.pop(_deprecated, None)
+
+    # Remove None values to avoid overriding SkyPilot defaults
+    return {k: v for k, v in launch_kwargs.items() if v is not None}
+
+
+def sky_job_get(request_id: str, stream_logs: bool, cluster_name: str) -> Any:
+    """Handle SkyPilot request results based on stream_logs setting.
+
+    SkyPilot API exec and launch methods are asynchronous and return a request ID.
+    This method waits for the operation to complete and returns the result.
+    If stream_logs is True, it will also stream the logs and wait for the
+    job to complete.
+
+    Args:
+        request_id: The request ID returned from a SkyPilot operation.
+        stream_logs: Whether to stream logs while waiting for completion.
+        cluster_name: The name of the cluster to tail logs for.
+
+    Returns:
+        The result of the SkyPilot operation.
+
+    Raises:
+        Exception: If the SkyPilot job fails.
+    """
+    if stream_logs:
+        # Stream logs and wait for completion
+        job_id, _ = sky.stream_and_get(request_id)
+    else:
+        # Just wait for completion without streaming logs
+        job_id, _ = sky.get(request_id)
+
+    status = 0  # 0=Successful, 100=Failed
+    if stream_logs:
+        status = sky.tail_logs(
+            cluster_name=cluster_name, job_id=job_id, follow=True
+        )
+
+    if status != 0:
+        raise Exception(f"SkyPilot job {job_id} failed with status {status}")
+
+    return job_id
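Taken together, these helpers are chained the same way in both orchestrator code paths. A condensed, illustrative sketch of that chain; the registry URI, credentials, image, entrypoint string and cluster name are placeholders:

    # Condensed sketch mirroring the orchestrator code paths above.
    import sky

    from zenml.integrations.skypilot.flavors.skypilot_orchestrator_base_vm_config import (
        SkypilotBaseOrchestratorSettings,
    )
    from zenml.integrations.skypilot.utils import (
        create_docker_run_command,
        prepare_docker_setup,
        prepare_launch_kwargs,
        prepare_resources_kwargs,
        prepare_task_kwargs,
        sanitize_cluster_name,
        sky_job_get,
    )

    settings = SkypilotBaseOrchestratorSettings()

    setup, task_envs = prepare_docker_setup(
        container_registry_uri="registry.example.com",
        credentials=("user", "token"),
        use_sudo=False,
    )
    run_command = create_docker_run_command(
        image="registry.example.com/zenml-pipeline:latest",
        entrypoint_str="python -m my_zenml_entrypoint",  # placeholder
        arguments_str="--deployment_id 1234",            # placeholder
        environment=task_envs,
        docker_run_args=["--gpus=all"],
        use_sudo=False,
    )
    task = sky.Task(
        **prepare_task_kwargs(
            settings=settings,
            run_command=run_command,
            setup=setup,
            task_envs=task_envs,
            task_name="demo-task",
        )
    )
    task = task.set_resources(
        sky.Resources(
            **prepare_resources_kwargs(cloud=sky.clouds.AWS(), settings=settings)
        )
    )
    cluster_name = sanitize_cluster_name("My Pipeline Run 2025")  # "my-pipeline-run-2025"
    request_id = sky.launch(task, cluster_name, **prepare_launch_kwargs(settings=settings))
    sky_job_get(request_id, stream_logs=True, cluster_name=cluster_name)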
zenml/integrations/skypilot_aws/__init__.py
CHANGED
@@ -32,7 +32,7 @@ class SkypilotAWSIntegration(Integration):
 
     NAME = SKYPILOT_AWS
     # all 0.6.x versions of skypilot[aws] are compatible
-    REQUIREMENTS = ["skypilot[aws]
+    REQUIREMENTS = ["skypilot[aws]==0.9.3"]
     APT_PACKAGES = ["openssh-client", "rsync"]
 
     @classmethod
@@ -47,4 +47,3 @@ class SkypilotAWSIntegration(Integration):
         )
 
         return [SkypilotAWSOrchestratorFlavor]
-
zenml/integrations/skypilot_azure/__init__.py
CHANGED
@@ -31,7 +31,7 @@ class SkypilotAzureIntegration(Integration):
     """Definition of Skypilot (Azure) Integration for ZenML."""
 
     NAME = SKYPILOT_AZURE
-    REQUIREMENTS = ["skypilot[azure]
+    REQUIREMENTS = ["skypilot[azure]==0.9.3"]
     APT_PACKAGES = ["openssh-client", "rsync"]
 
     @classmethod
@@ -46,4 +46,3 @@ class SkypilotAzureIntegration(Integration):
         )
 
         return [SkypilotAzureOrchestratorFlavor]
-
zenml/integrations/skypilot_gcp/__init__.py
CHANGED
@@ -31,7 +31,15 @@ class SkypilotGCPIntegration(Integration):
     """Definition of Skypilot (GCP) Integration for ZenML."""
 
     NAME = SKYPILOT_GCP
-    REQUIREMENTS = [
+    REQUIREMENTS = [
+        "skypilot[gcp]==0.9.3",
+        # TODO: Remove this once the issue is fixed:
+        # Adding the dependencies of the GCP integration on top of the
+        # requirements of the skypilot integration results in a
+        # very long resolution time for pip. This is a workaround to
+        # speed up the resolution.
+        "protobuf>=4.25.0,<5.0.0",
+    ]
     APT_PACKAGES = ["openssh-client", "rsync"]
 
     @classmethod
zenml/integrations/skypilot_kubernetes/__init__.py
CHANGED
@@ -31,8 +31,8 @@ class SkypilotKubernetesIntegration(Integration):
     """Definition of Skypilot Kubernetes Integration for ZenML."""
 
     NAME = SKYPILOT_KUBERNETES
-
-    REQUIREMENTS = ["skypilot[kubernetes]
+
+    REQUIREMENTS = ["skypilot[kubernetes]==0.9.3"]
     APT_PACKAGES = ["openssh-client", "rsync"]
 
     @classmethod
@@ -47,4 +47,3 @@ class SkypilotKubernetesIntegration(Integration):
         )
 
         return [SkypilotKubernetesOrchestratorFlavor]
-
zenml/integrations/skypilot_lambda/__init__.py
CHANGED
@@ -31,7 +31,7 @@ class SkypilotLambdaIntegration(Integration):
     """Definition of Skypilot Lambda Integration for ZenML."""
 
     NAME = SKYPILOT_LAMBDA
-    REQUIREMENTS = ["skypilot[lambda]
+    REQUIREMENTS = ["skypilot[lambda]==0.9.3"]
 
     @classmethod
     def flavors(cls) -> List[Type[Flavor]]:
@@ -45,4 +45,3 @@ class SkypilotLambdaIntegration(Integration):
         )
 
         return [SkypilotLambdaOrchestratorFlavor]
-
zenml/zen_stores/migrations/versions/0.83.0_release.py
ADDED
@@ -0,0 +1,23 @@
+"""Release [0.83.0].
+
+Revision ID: 0.83.0
+Revises: 0.82.1
+Create Date: 2025-05-28 08:23:38.860697
+
+"""
+
+# revision identifiers, used by Alembic.
+revision = "0.83.0"
+down_revision = "0.82.1"
+branch_labels = None
+depends_on = None
+
+
+def upgrade() -> None:
+    """Upgrade database schema and/or data, creating a new revision."""
+    pass
+
+
+def downgrade() -> None:
+    """Downgrade database schema and/or data back to the previous revision."""
+    pass
{zenml_nightly-0.82.1.dev20250528.dist-info → zenml_nightly-0.83.0.dev20250529.dist-info}/METADATA
RENAMED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.3
 Name: zenml-nightly
-Version: 0.82.1.dev20250528
+Version: 0.83.0.dev20250529
 Summary: ZenML: Write production-ready ML code.
 License: Apache-2.0
 Keywords: machine learning,production,pipeline,mlops,devops
@@ -532,7 +532,7 @@ the Apache License Version 2.0.
   <a href="https://github.com/zenml-io/zenml-projects">Projects Showcase</a>
   <br />
   <br />
-  🎉 Version 0.
+  🎉 Version 0.83.0 is out. Check out the release notes
   <a href="https://github.com/zenml-io/zenml/releases">here</a>.
   <br />
   🖥️ Download our VS Code Extension <a href="https://marketplace.visualstudio.com/items?itemName=ZenML.zenml-vscode">here</a>.
{zenml_nightly-0.82.1.dev20250528.dist-info → zenml_nightly-0.83.0.dev20250529.dist-info}/RECORD
RENAMED
@@ -1,5 +1,5 @@
 zenml/README.md,sha256=827dekbOWAs1BpW7VF1a4d7EbwPbjwccX-2zdXBENZo,1777
-zenml/VERSION,sha256=
+zenml/VERSION,sha256=b8jbIhORl12qciUdvca3yIU9TzeD-GypE5YtYcFmLiM,19
 zenml/__init__.py,sha256=CKEyepFK-7akXYiMrNVh92Nb01Cjs23w4_YyI6sgdc8,2242
 zenml/actions/__init__.py,sha256=mrt6wPo73iKRxK754_NqsGyJ3buW7RnVeIGXr1xEw8Y,681
 zenml/actions/base_action.py,sha256=UcaHev6BTuLDwuswnyaPjdA8AgUqB5xPZ-lRtuvf2FU,25553
@@ -478,32 +478,33 @@ zenml/integrations/sklearn/materializers/__init__.py,sha256=vHIx7njuVcP_kGbyECgJ
 zenml/integrations/sklearn/materializers/sklearn_materializer.py,sha256=IaE8DS9L2gjfRVv8RT8p7ls7cJJ5qzt6hRgdpL6pp20,1562
 zenml/integrations/skypilot/__init__.py,sha256=PzNuJJykzfu34pTC4kSD18ARyGysuhzEW8oLTm4oHuo,612
 zenml/integrations/skypilot/flavors/__init__.py,sha256=PzNuJJykzfu34pTC4kSD18ARyGysuhzEW8oLTm4oHuo,612
-zenml/integrations/skypilot/flavors/skypilot_orchestrator_base_vm_config.py,sha256=
+zenml/integrations/skypilot/flavors/skypilot_orchestrator_base_vm_config.py,sha256=pYr_Wi2rjh6XTpHDLYBC7guPM23MzlUoIadzMDckKMI,8840
 zenml/integrations/skypilot/orchestrators/__init__.py,sha256=elo6QiImzys1m_bgu96U1HCVRepHQI1m7Jgqn6aEJ4Q,844
-zenml/integrations/skypilot/orchestrators/skypilot_base_vm_orchestrator.py,sha256=
-zenml/integrations/skypilot/orchestrators/skypilot_orchestrator_entrypoint.py,sha256=
+zenml/integrations/skypilot/orchestrators/skypilot_base_vm_orchestrator.py,sha256=hTLBB7_YPoDVwe9HobPPbz11fXM7w2jjk423qY5IGpc,15580
+zenml/integrations/skypilot/orchestrators/skypilot_orchestrator_entrypoint.py,sha256=L8ZSbJAwZfFOS0igxw1-LBr7UxspqVyvXrvMQ8AbJbk,11143
 zenml/integrations/skypilot/orchestrators/skypilot_orchestrator_entrypoint_configuration.py,sha256=wB0QKtDe6a3XyFmoFTwRRdztn7Y6PG-VSI8lHM5n47U,2243
-zenml/integrations/
+zenml/integrations/skypilot/utils.py,sha256=YUX9lPqEyPYDV36xEefoQGnyQwu9Kum-gen2LVIgqhs,8680
+zenml/integrations/skypilot_aws/__init__.py,sha256=U8G0PeQyGcKtQ8NR0YG3CI1K3SxD77_-3s-t4mvFZYA,1722
 zenml/integrations/skypilot_aws/flavors/__init__.py,sha256=UGL7NuZ2BkB5IxL9WUHmGtlsuUzYw2B-5NPc-ZqGVPY,1006
 zenml/integrations/skypilot_aws/flavors/skypilot_orchestrator_aws_vm_flavor.py,sha256=PqcUCXRyJoT7jmR15A5d7N5NJxXkgLKWm8BC3YW6Omw,3789
 zenml/integrations/skypilot_aws/orchestrators/__init__.py,sha256=xxFZ2iM9H9LVKfSB8N-yKGxuB-lJE3Y368lcj7MfhI0,1006
 zenml/integrations/skypilot_aws/orchestrators/skypilot_aws_vm_orchestrator.py,sha256=0uFfRa8lGsc7D_zdrGGMqKoL6eKUwuf2ImtDw_RRg6Q,3094
-zenml/integrations/skypilot_azure/__init__.py,sha256=
+zenml/integrations/skypilot_azure/__init__.py,sha256=WckxTBIZMH1UqRIcoUQCdRRNzABXbKqMO-Y-ty6VnSY,1691
 zenml/integrations/skypilot_azure/flavors/__init__.py,sha256=9JB07IZbIZhY6yL9TjkYFTCUUfxajMajsMbbjC1BZ_s,1024
 zenml/integrations/skypilot_azure/flavors/skypilot_orchestrator_azure_vm_flavor.py,sha256=aL8ibfQGo2IcyckaesjAFk6m8kazy5pq6ycArYRD5w0,3858
 zenml/integrations/skypilot_azure/orchestrators/__init__.py,sha256=gW7JOTLdGB542Cv1ydSqF4kVBAo7LUnfkrlj45kCWm8,1017
 zenml/integrations/skypilot_azure/orchestrators/skypilot_azure_vm_orchestrator.py,sha256=T0CfFOLRNJZJD_cOaucjckeONZORe_MYhwKOrVGMW3c,2418
-zenml/integrations/skypilot_gcp/__init__.py,sha256=
+zenml/integrations/skypilot_gcp/__init__.py,sha256=MhyhPrjy-gfHasyWEUif3RyDJFNGdh6e2Cxe5J5_RVM,2009
 zenml/integrations/skypilot_gcp/flavors/__init__.py,sha256=cOszr8-BlWpWua9uG-d0Vi3VkUxxq6jl03BD90wZMv4,1006
 zenml/integrations/skypilot_gcp/flavors/skypilot_orchestrator_gcp_vm_flavor.py,sha256=K2fe63T8zApDnZwuroTle9x_gPW55SUdeJa5JU7ihEo,3946
 zenml/integrations/skypilot_gcp/orchestrators/__init__.py,sha256=Z6ALEu4PixIo_dKa5Vyo039bNFIOT-3O0MlYFIm6CsY,1007
 zenml/integrations/skypilot_gcp/orchestrators/skypilot_gcp_vm_orchestrator.py,sha256=c7YQKMileo27mCSyHgSpJ_JU4wPckkXm9o5MUXmOQAA,2516
-zenml/integrations/skypilot_kubernetes/__init__.py,sha256=
+zenml/integrations/skypilot_kubernetes/__init__.py,sha256=vGX0w4xKV8TOkWsM1kaJqPGiaprRcKiIwf_2QOOyuJc,1750
 zenml/integrations/skypilot_kubernetes/flavors/__init__.py,sha256=KNKGT467zLLqFJCJl5uyQJU1738cGLt2iCiTC_e1hgA,1069
 zenml/integrations/skypilot_kubernetes/flavors/skypilot_orchestrator_kubernetes_vm_flavor.py,sha256=EkhYa7F_Eqk0BC7CAEsgPwzTVB95FXlkXyy2aSLERBE,3997
 zenml/integrations/skypilot_kubernetes/orchestrators/__init__.py,sha256=26Gf5CHZeT39B7UPt-dSyT9pVMz2SxuBihDAmQ9Pwgo,1041
 zenml/integrations/skypilot_kubernetes/orchestrators/skypilot_kubernetes_vm_orchestrator.py,sha256=evrkn8JVX87HAb5tuqaoBQXUH5GdK2Botqp7SqVTRbA,2428
-zenml/integrations/skypilot_lambda/__init__.py,sha256=
+zenml/integrations/skypilot_lambda/__init__.py,sha256=iH5KRgn8TS5Tw8MdDSQFF4oNR-S8bi2G932ivCS_oKM,1654
 zenml/integrations/skypilot_lambda/flavors/__init__.py,sha256=2Chuv2ViYTppU4GltvSU2DJ0C6E6IYgPhBor2YkOL5g,1033
 zenml/integrations/skypilot_lambda/flavors/skypilot_orchestrator_lambda_vm_flavor.py,sha256=INsSw8yP48v9TvTp_lRNV1TwXvK-fPhkLAlmUb6971o,4063
 zenml/integrations/skypilot_lambda/orchestrators/__init__.py,sha256=3S2SmBfCp-ID78dRXh1QoI1uSxpG0M63Ul8lhDOC3AU,1021
@@ -1170,6 +1171,7 @@ zenml/zen_stores/migrations/versions/0.80.2_release.py,sha256=fsM4kc-d82s0xXqUiD
 zenml/zen_stores/migrations/versions/0.81.0_release.py,sha256=iiNGlHCVt2IMoQTaN9YIJyHMX3XmGtQVHMxZmyjyD8A,462
 zenml/zen_stores/migrations/versions/0.82.0_release.py,sha256=xBZ8ah1X99l_vUZ8bHC7Pa8caiU8kUhs-SZnGEVAIYY,450
 zenml/zen_stores/migrations/versions/0.82.1_release.py,sha256=OBsyh5W2R-ZBie1_Gw789zSwOXhtgXJiu-_aphomo-E,450
+zenml/zen_stores/migrations/versions/0.83.0_release.py,sha256=Y3Pe9I_LJvUgTtehyDC6yLa9mG2QlD0A2ZdXIYF_y8M,450
 zenml/zen_stores/migrations/versions/026d4577b6a0_add_code_path.py,sha256=hXLzvQcylNrbCVD6vha52PFkSPNC2klW9kA0vuQX_cE,1091
 zenml/zen_stores/migrations/versions/03742aa7fdd7_add_secrets.py,sha256=gewKqu1AnzvNTjVvK1eaAwP0hVneWDUyDRSLTvRCdpg,1587
 zenml/zen_stores/migrations/versions/0392807467dc_add_build_duration.py,sha256=YlkDBlfBBv45FsrMO11YcdRn4Maqmlg77t8gWJO4DfA,982
@@ -1325,8 +1327,8 @@ zenml/zen_stores/secrets_stores/sql_secrets_store.py,sha256=LPFW757WCJLP1S8vrvjs
 zenml/zen_stores/sql_zen_store.py,sha256=TxqVUsE-2NlJ1EasyMfIOtLCPMCtbd19Jgye9b4PlVE,465963
 zenml/zen_stores/template_utils.py,sha256=GbJ7LgGVYHSCKPEA8RNTxPoVTWqpC77F_lGzjJ4O1Fw,9220
 zenml/zen_stores/zen_store_interface.py,sha256=_ap55L3_mrHgegsLkMRSmmNXVasYC53LwjcEeuS1YT4,92411
-zenml_nightly-0.
-zenml_nightly-0.
-zenml_nightly-0.
-zenml_nightly-0.
-zenml_nightly-0.
+zenml_nightly-0.83.0.dev20250529.dist-info/LICENSE,sha256=wbnfEnXnafPbqwANHkV6LUsPKOtdpsd-SNw37rogLtc,11359
+zenml_nightly-0.83.0.dev20250529.dist-info/METADATA,sha256=XocABXJAHZ-QogUVVDxsYR_9ElV7GKRgf39NToHbxhY,24317
+zenml_nightly-0.83.0.dev20250529.dist-info/WHEEL,sha256=b4K_helf-jlQoXBBETfwnf4B04YC67LOev0jo4fX5m8,88
+zenml_nightly-0.83.0.dev20250529.dist-info/entry_points.txt,sha256=QK3ETQE0YswAM2mWypNMOv8TLtr7EjnqAFq1br_jEFE,43
+zenml_nightly-0.83.0.dev20250529.dist-info/RECORD,,
{zenml_nightly-0.82.1.dev20250528.dist-info → zenml_nightly-0.83.0.dev20250529.dist-info}/LICENSE
RENAMED
File without changes
{zenml_nightly-0.82.1.dev20250528.dist-info → zenml_nightly-0.83.0.dev20250529.dist-info}/WHEEL
RENAMED
File without changes
{zenml_nightly-0.82.1.dev20250528.dist-info → zenml_nightly-0.83.0.dev20250529.dist-info}/entry_points.txt
RENAMED
File without changes