skypilot-nightly 1.0.0.dev20250818__py3-none-any.whl → 1.0.0.dev20250820__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of skypilot-nightly might be problematic. Click here for more details.
- sky/__init__.py +5 -3
- sky/backends/cloud_vm_ray_backend.py +6 -13
- sky/backends/wheel_utils.py +2 -1
- sky/catalog/data_fetchers/fetch_aws.py +2 -0
- sky/client/cli/command.py +20 -16
- sky/core.py +1 -1
- sky/dashboard/out/404.html +1 -1
- sky/dashboard/out/_next/static/{D7_ocVBIBwyxtvXYWggqV → 8ZscIHnvBWz3AXkxsJL6H}/_buildManifest.js +1 -1
- sky/dashboard/out/_next/static/chunks/3015-bf218e4973bf5c8f.js +1 -0
- sky/dashboard/out/_next/static/chunks/{8969-6cb1af4ec7fb1e19.js → 8969-23c8fbdb8b397d59.js} +1 -1
- sky/dashboard/out/_next/static/chunks/{webpack-a46c8b62df807ec1.js → webpack-008593a02784a2df.js} +1 -1
- sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
- sky/dashboard/out/clusters/[cluster].html +1 -1
- sky/dashboard/out/clusters.html +1 -1
- sky/dashboard/out/config.html +1 -1
- sky/dashboard/out/index.html +1 -1
- sky/dashboard/out/infra/[context].html +1 -1
- sky/dashboard/out/infra.html +1 -1
- sky/dashboard/out/jobs/[job].html +1 -1
- sky/dashboard/out/jobs/pools/[pool].html +1 -1
- sky/dashboard/out/jobs.html +1 -1
- sky/dashboard/out/users.html +1 -1
- sky/dashboard/out/volumes.html +1 -1
- sky/dashboard/out/workspace/new.html +1 -1
- sky/dashboard/out/workspaces/[name].html +1 -1
- sky/dashboard/out/workspaces.html +1 -1
- sky/exceptions.py +6 -1
- sky/global_user_state.py +18 -11
- sky/jobs/constants.py +1 -1
- sky/jobs/server/core.py +43 -34
- sky/jobs/server/utils.py +2 -1
- sky/jobs/utils.py +56 -9
- sky/models.py +1 -0
- sky/provision/aws/config.py +11 -11
- sky/provision/aws/instance.py +30 -27
- sky/provision/do/utils.py +2 -2
- sky/provision/kubernetes/network_utils.py +3 -3
- sky/provision/kubernetes/utils.py +2 -2
- sky/provision/kubernetes/volume.py +2 -0
- sky/provision/provisioner.py +10 -6
- sky/serve/replica_managers.py +7 -0
- sky/serve/server/impl.py +1 -1
- sky/server/requests/payloads.py +2 -0
- sky/server/requests/serializers/encoders.py +29 -5
- sky/server/server.py +37 -1
- sky/setup_files/MANIFEST.in +1 -0
- sky/setup_files/dependencies.py +17 -11
- sky/skylet/ray_patches/__init__.py +18 -4
- sky/skylet/ray_patches/autoscaler.py.diff +18 -0
- sky/skylet/ray_patches/cli.py.diff +19 -0
- sky/skylet/ray_patches/command_runner.py.diff +17 -0
- sky/skylet/ray_patches/log_monitor.py.diff +20 -0
- sky/skylet/ray_patches/resource_demand_scheduler.py.diff +32 -0
- sky/skylet/ray_patches/updater.py.diff +18 -0
- sky/skylet/ray_patches/worker.py.diff +41 -0
- sky/utils/common.py +27 -7
- sky/utils/common_utils.py +13 -9
- sky/utils/directory_utils.py +12 -0
- sky/utils/env_options.py +3 -0
- sky/utils/kubernetes/gpu_labeler.py +3 -3
- sky/utils/schemas.py +1 -0
- sky/utils/serialize_utils.py +16 -0
- sky/volumes/client/sdk.py +10 -7
- sky/volumes/server/core.py +12 -3
- sky/volumes/volume.py +17 -3
- {skypilot_nightly-1.0.0.dev20250818.dist-info → skypilot_nightly-1.0.0.dev20250820.dist-info}/METADATA +21 -13
- {skypilot_nightly-1.0.0.dev20250818.dist-info → skypilot_nightly-1.0.0.dev20250820.dist-info}/RECORD +72 -63
- sky/dashboard/out/_next/static/chunks/3015-471d67c9302d4027.js +0 -1
- /sky/dashboard/out/_next/static/{D7_ocVBIBwyxtvXYWggqV → 8ZscIHnvBWz3AXkxsJL6H}/_ssgManifest.js +0 -0
- {skypilot_nightly-1.0.0.dev20250818.dist-info → skypilot_nightly-1.0.0.dev20250820.dist-info}/WHEEL +0 -0
- {skypilot_nightly-1.0.0.dev20250818.dist-info → skypilot_nightly-1.0.0.dev20250820.dist-info}/entry_points.txt +0 -0
- {skypilot_nightly-1.0.0.dev20250818.dist-info → skypilot_nightly-1.0.0.dev20250820.dist-info}/licenses/LICENSE +0 -0
- {skypilot_nightly-1.0.0.dev20250818.dist-info → skypilot_nightly-1.0.0.dev20250820.dist-info}/top_level.txt +0 -0
|
@@ -0,0 +1,32 @@
|
|
|
1
|
+
--- a/resource_demand_scheduler.py
|
|
2
|
+
+++ b/resource_demand_scheduler.py
|
|
3
|
+
@@ -1,3 +1,8 @@
|
|
4
|
+
+# From https://github.com/ray-project/ray/blob/ray-2.9.3/python/ray/autoscaler/_private/resource_demand_scheduler.py
|
|
5
|
+
+# Sky patch changes:
|
|
6
|
+
+# - no new nodes are allowed to be launched when the upscaling_speed is 0
|
|
7
|
+
+# - comment out "assert not unfulfilled": this seems a buggy assert
|
|
8
|
+
+
|
|
9
|
+
"""Implements multi-node-type autoscaling.
|
|
10
|
+
|
|
11
|
+
This file implements an autoscaling algorithm that is aware of multiple node
|
|
12
|
+
@@ -448,7 +453,10 @@
|
|
13
|
+
+ placement_group_nodes.get(node_type, 0),
|
|
14
|
+
)
|
|
15
|
+
|
|
16
|
+
- if upper_bound > 0:
|
|
17
|
+
+ # NOTE(sky): do not autoscale when upscaling speed is 0.
|
|
18
|
+
+ if self.upscaling_speed == 0:
|
|
19
|
+
+ upper_bound = 0
|
|
20
|
+
+ if upper_bound >= 0:
|
|
21
|
+
updated_nodes_to_launch[node_type] = min(
|
|
22
|
+
upper_bound, to_launch[node_type]
|
|
23
|
+
)
|
|
24
|
+
@@ -592,7 +600,7 @@
|
|
25
|
+
unfulfilled, including_reserved = get_bin_pack_residual(
|
|
26
|
+
new_node_resources, unfulfilled, strict_spread=True
|
|
27
|
+
)
|
|
28
|
+
- assert not unfulfilled
|
|
29
|
+
+ # assert not unfulfilled # NOTE(sky): buggy assert.
|
|
30
|
+
node_resources += including_reserved
|
|
31
|
+
return to_add, node_resources, node_type_counts
|
|
32
|
+
|
|
@@ -0,0 +1,18 @@
|
|
|
1
|
+
--- a/updater.py
|
|
2
|
+
+++ b/updater.py
|
|
3
|
+
@@ -1,3 +1,7 @@
|
|
4
|
+
+# From https://github.com/ray-project/ray/blob/releases/2.9.3/python/ray/autoscaler/_private/updater.py
|
|
5
|
+
+# Sky patch changes:
|
|
6
|
+
+# - Ensure the node state is refreshed before checking the node is terminated.
|
|
7
|
+
+
|
|
8
|
+
import logging
|
|
9
|
+
import os
|
|
10
|
+
import subprocess
|
|
11
|
+
@@ -325,6 +329,7 @@
|
|
12
|
+
)
|
|
13
|
+
|
|
14
|
+
time.sleep(READY_CHECK_INTERVAL)
|
|
15
|
+
+ self.provider.non_terminated_nodes({})
|
|
16
|
+
|
|
17
|
+
def do_update(self):
|
|
18
|
+
self.provider.set_node_tags(
|
|
@@ -0,0 +1,41 @@
|
|
|
1
|
+
--- a/worker.py
|
|
2
|
+
+++ b/worker.py
|
|
3
|
+
@@ -1,3 +1,7 @@
|
|
4
|
+
+# Adapted from https://github.com/ray-project/ray/blob/ray-2.9.3/python/ray/_private/worker.py
|
|
5
|
+
+# Fixed the problem in ray's issue https://github.com/ray-project/ray/issues/9233
|
|
6
|
+
+# Tracked in PR https://github.com/ray-project/ray/pull/21977/files.
|
|
7
|
+
+
|
|
8
|
+
import atexit
|
|
9
|
+
import faulthandler
|
|
10
|
+
import functools
|
|
11
|
+
@@ -2020,6 +2024,14 @@
|
|
12
|
+
pid = data.get("pid")
|
|
13
|
+
lines = data.get("lines", [])
|
|
14
|
+
|
|
15
|
+
+ def end_for(line: str) -> str:
|
|
16
|
+
+ if sys.platform == "win32":
|
|
17
|
+
+ return "\n"
|
|
18
|
+
+ if line.endswith("\r"):
|
|
19
|
+
+ return ""
|
|
20
|
+
+ return "\n"
|
|
21
|
+
+
|
|
22
|
+
+
|
|
23
|
+
if data.get("ip") == data.get("localhost"):
|
|
24
|
+
for line in lines:
|
|
25
|
+
if RAY_TQDM_MAGIC in line:
|
|
26
|
+
@@ -2035,6 +2047,7 @@
|
|
27
|
+
message_for(data, line),
|
|
28
|
+
),
|
|
29
|
+
file=print_file,
|
|
30
|
+
+ end=end_for(line),
|
|
31
|
+
)
|
|
32
|
+
else:
|
|
33
|
+
for line in lines:
|
|
34
|
+
@@ -2052,6 +2065,7 @@
|
|
35
|
+
message_for(data, line),
|
|
36
|
+
),
|
|
37
|
+
file=print_file,
|
|
38
|
+
+ end=end_for(line),
|
|
39
|
+
)
|
|
40
|
+
# Restore once at end of batch to avoid excess hiding/unhiding of tqdm.
|
|
41
|
+
restore_tqdm()
|
sky/utils/common.py
CHANGED
|
@@ -11,18 +11,38 @@ from sky.utils import common_utils
|
|
|
11
11
|
|
|
12
12
|
SKY_SERVE_CONTROLLER_PREFIX: str = 'sky-serve-controller-'
|
|
13
13
|
JOB_CONTROLLER_PREFIX: str = 'sky-jobs-controller-'
|
|
14
|
+
|
|
14
15
|
# We use the user hash (machine-specific) for the controller name. It will be
|
|
15
16
|
# the same across the whole lifecycle of the server, including:
|
|
16
|
-
# 1. all requests, because
|
|
17
|
-
#
|
|
18
|
-
# 2. SkyPilot API server restarts,
|
|
19
|
-
#
|
|
17
|
+
# 1. all requests, because all the server processes share the same user hash
|
|
18
|
+
# cache file.
|
|
19
|
+
# 2. SkyPilot API server restarts, because the API server will restore the
|
|
20
|
+
# user hash from the global user state db on startup.
|
|
21
|
+
# 3. Potential multiple server replicas, because multiple server replicas of
|
|
22
|
+
# the same deployment will share the same global user state db.
|
|
20
23
|
# This behavior is the same for the local API server (where SERVER_ID is the
|
|
21
24
|
# same as the normal user hash). This ensures backwards-compatibility with jobs
|
|
22
25
|
# controllers from before #4660.
|
|
23
|
-
SERVER_ID
|
|
24
|
-
SKY_SERVE_CONTROLLER_NAME: str
|
|
25
|
-
JOB_CONTROLLER_NAME: str
|
|
26
|
+
SERVER_ID: str
|
|
27
|
+
SKY_SERVE_CONTROLLER_NAME: str
|
|
28
|
+
JOB_CONTROLLER_NAME: str
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
def refresh_server_id() -> None:
|
|
32
|
+
"""Refresh the server id.
|
|
33
|
+
|
|
34
|
+
This function is used to ensure the server id is read from the authoritative
|
|
35
|
+
source.
|
|
36
|
+
"""
|
|
37
|
+
global SERVER_ID
|
|
38
|
+
global SKY_SERVE_CONTROLLER_NAME
|
|
39
|
+
global JOB_CONTROLLER_NAME
|
|
40
|
+
SERVER_ID = common_utils.get_user_hash()
|
|
41
|
+
SKY_SERVE_CONTROLLER_NAME = f'{SKY_SERVE_CONTROLLER_PREFIX}{SERVER_ID}'
|
|
42
|
+
JOB_CONTROLLER_NAME = f'{JOB_CONTROLLER_PREFIX}{SERVER_ID}'
|
|
43
|
+
|
|
44
|
+
|
|
45
|
+
refresh_server_id()
|
|
26
46
|
|
|
27
47
|
|
|
28
48
|
@contextlib.contextmanager
|
sky/utils/common_utils.py
CHANGED
|
@@ -28,7 +28,6 @@ from sky.adaptors import common as adaptors_common
|
|
|
28
28
|
from sky.skylet import constants
|
|
29
29
|
from sky.usage import constants as usage_constants
|
|
30
30
|
from sky.utils import annotations
|
|
31
|
-
from sky.utils import common_utils
|
|
32
31
|
from sky.utils import ux_utils
|
|
33
32
|
from sky.utils import validator
|
|
34
33
|
|
|
@@ -41,7 +40,7 @@ else:
|
|
|
41
40
|
psutil = adaptors_common.LazyImport('psutil')
|
|
42
41
|
yaml = adaptors_common.LazyImport('yaml')
|
|
43
42
|
|
|
44
|
-
|
|
43
|
+
USER_HASH_FILE = os.path.expanduser('~/.sky/user_hash')
|
|
45
44
|
USER_HASH_LENGTH = 8
|
|
46
45
|
|
|
47
46
|
# We are using base36 to reduce the length of the hash. 2 chars -> 36^2 = 1296
|
|
@@ -131,21 +130,26 @@ def get_user_hash() -> str:
|
|
|
131
130
|
assert user_hash is not None
|
|
132
131
|
return user_hash
|
|
133
132
|
|
|
134
|
-
if os.path.exists(
|
|
133
|
+
if os.path.exists(USER_HASH_FILE):
|
|
135
134
|
# Read from cached user hash file.
|
|
136
|
-
with open(
|
|
135
|
+
with open(USER_HASH_FILE, 'r', encoding='utf-8') as f:
|
|
137
136
|
# Remove invalid characters.
|
|
138
137
|
user_hash = f.read().strip()
|
|
139
138
|
if is_valid_user_hash(user_hash):
|
|
140
139
|
return user_hash
|
|
141
140
|
|
|
142
141
|
user_hash = generate_user_hash()
|
|
143
|
-
|
|
144
|
-
with open(_USER_HASH_FILE, 'w', encoding='utf-8') as f:
|
|
145
|
-
f.write(user_hash)
|
|
142
|
+
set_user_hash_locally(user_hash)
|
|
146
143
|
return user_hash
|
|
147
144
|
|
|
148
145
|
|
|
146
|
+
def set_user_hash_locally(user_hash: str) -> None:
|
|
147
|
+
"""Sets the user hash to local file."""
|
|
148
|
+
os.makedirs(os.path.dirname(USER_HASH_FILE), exist_ok=True)
|
|
149
|
+
with open(USER_HASH_FILE, 'w', encoding='utf-8') as f:
|
|
150
|
+
f.write(user_hash)
|
|
151
|
+
|
|
152
|
+
|
|
149
153
|
def base36_encode(hex_str: str) -> str:
|
|
150
154
|
"""Converts a hex string to a base36 string."""
|
|
151
155
|
int_value = int(hex_str, 16)
|
|
@@ -343,7 +347,7 @@ def get_current_user() -> 'models.User':
|
|
|
343
347
|
|
|
344
348
|
def get_current_user_name() -> str:
|
|
345
349
|
"""Returns the current user name."""
|
|
346
|
-
name =
|
|
350
|
+
name = get_current_user().name
|
|
347
351
|
assert name is not None
|
|
348
352
|
return name
|
|
349
353
|
|
|
@@ -886,7 +890,7 @@ def get_cleaned_username(username: str = '') -> str:
|
|
|
886
890
|
Returns:
|
|
887
891
|
A cleaned username.
|
|
888
892
|
"""
|
|
889
|
-
username = username or
|
|
893
|
+
username = username or get_current_user_name()
|
|
890
894
|
username = username.lower()
|
|
891
895
|
username = re.sub(r'[^a-z0-9-_]', '', username)
|
|
892
896
|
username = re.sub(r'^[0-9-]+', '', username)
|
|
@@ -0,0 +1,12 @@
|
|
|
1
|
+
"""Directory utilities."""
|
|
2
|
+
|
|
3
|
+
import os
|
|
4
|
+
|
|
5
|
+
# This file is in '<project_root>/sky/utils/directory_utils.py'
|
|
6
|
+
# So we need to go up 2 levels to get to the '<project_root>/sky' directory
|
|
7
|
+
SKY_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
def get_sky_dir():
|
|
11
|
+
"""Get the sky root directory."""
|
|
12
|
+
return SKY_DIR
|
sky/utils/env_options.py
CHANGED
|
@@ -24,6 +24,9 @@ class Options(enum.Enum):
|
|
|
24
24
|
# running in a Buildkite container environment, which requires special
|
|
25
25
|
# handling for networking between containers.
|
|
26
26
|
RUNNING_IN_BUILDKITE = ('BUILDKITE', False)
|
|
27
|
+
# Internal: This is used for testing to enable grpc for communication
|
|
28
|
+
# between the API server and the Skylet.
|
|
29
|
+
ENABLE_GRPC = ('SKYPILOT_ENABLE_GRPC', False)
|
|
27
30
|
|
|
28
31
|
def __init__(self, env_var: str, default: bool) -> None:
|
|
29
32
|
super().__init__()
|
|
@@ -8,9 +8,9 @@ from typing import Dict, Optional, Tuple
|
|
|
8
8
|
import colorama
|
|
9
9
|
import yaml
|
|
10
10
|
|
|
11
|
-
import sky
|
|
12
11
|
from sky.adaptors import kubernetes
|
|
13
12
|
from sky.provision.kubernetes import utils as kubernetes_utils
|
|
13
|
+
from sky.utils import directory_utils
|
|
14
14
|
from sky.utils import rich_utils
|
|
15
15
|
|
|
16
16
|
|
|
@@ -71,8 +71,8 @@ def label(context: Optional[str] = None, wait_for_completion: bool = True):
|
|
|
71
71
|
f'Found {len(unlabeled_gpu_nodes)} '
|
|
72
72
|
'unlabeled GPU nodes in the cluster', colorama.Fore.YELLOW))
|
|
73
73
|
|
|
74
|
-
|
|
75
|
-
|
|
74
|
+
manifest_dir = os.path.join(directory_utils.get_sky_dir(),
|
|
75
|
+
'utils/kubernetes')
|
|
76
76
|
|
|
77
77
|
# Apply the RBAC manifest using kubectl since it contains multiple resources
|
|
78
78
|
with rich_utils.client_status('Setting up GPU labeling'):
|
sky/utils/schemas.py
CHANGED
|
@@ -0,0 +1,16 @@
|
|
|
1
|
+
"""Utilities for handling resource handles."""
|
|
2
|
+
import copy
|
|
3
|
+
import typing
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
def prepare_handle_for_backwards_compatibility(
|
|
7
|
+
handle: typing.Any) -> typing.Any:
|
|
8
|
+
"""Prepare a handle for backwards compatibility with older clients."""
|
|
9
|
+
# skylet_ssh_tunnel was causing backwards compatibility issues with older
|
|
10
|
+
# clients: AttributeError: Can't get attribute 'SSHTunnelInfo'
|
|
11
|
+
#
|
|
12
|
+
# But it is not needed on the client side, so we can just remove it.
|
|
13
|
+
if handle is not None and hasattr(handle, 'skylet_ssh_tunnel'):
|
|
14
|
+
handle = copy.deepcopy(handle)
|
|
15
|
+
handle.skylet_ssh_tunnel = None
|
|
16
|
+
return handle
|
sky/volumes/client/sdk.py
CHANGED
|
@@ -33,13 +33,16 @@ def apply(volume: volume_lib.Volume) -> server_common.RequestId[None]:
|
|
|
33
33
|
Returns:
|
|
34
34
|
The request ID of the apply request.
|
|
35
35
|
"""
|
|
36
|
-
body = payloads.VolumeApplyBody(
|
|
37
|
-
|
|
38
|
-
|
|
39
|
-
|
|
40
|
-
|
|
41
|
-
|
|
42
|
-
|
|
36
|
+
body = payloads.VolumeApplyBody(
|
|
37
|
+
name=volume.name,
|
|
38
|
+
volume_type=volume.type,
|
|
39
|
+
cloud=volume.cloud,
|
|
40
|
+
region=volume.region,
|
|
41
|
+
zone=volume.zone,
|
|
42
|
+
size=volume.size,
|
|
43
|
+
config=volume.config,
|
|
44
|
+
labels=volume.labels,
|
|
45
|
+
)
|
|
43
46
|
response = requests.post(f'{server_common.get_server_url()}/volumes/apply',
|
|
44
47
|
json=json.loads(body.model_dump_json()),
|
|
45
48
|
cookies=server_common.get_api_cookie_jar())
|
sky/volumes/server/core.py
CHANGED
|
@@ -162,9 +162,16 @@ def volume_delete(names: List[str]) -> None:
|
|
|
162
162
|
global_user_state.delete_volume(name)
|
|
163
163
|
|
|
164
164
|
|
|
165
|
-
def volume_apply(
|
|
166
|
-
|
|
167
|
-
|
|
165
|
+
def volume_apply(
|
|
166
|
+
name: str,
|
|
167
|
+
volume_type: str,
|
|
168
|
+
cloud: str,
|
|
169
|
+
region: Optional[str],
|
|
170
|
+
zone: Optional[str],
|
|
171
|
+
size: Optional[str],
|
|
172
|
+
config: Dict[str, Any],
|
|
173
|
+
labels: Optional[Dict[str, str]] = None,
|
|
174
|
+
) -> None:
|
|
168
175
|
"""Creates or registers a volume.
|
|
169
176
|
|
|
170
177
|
Args:
|
|
@@ -175,6 +182,7 @@ def volume_apply(name: str, volume_type: str, cloud: str, region: Optional[str],
|
|
|
175
182
|
zone: The zone of the volume.
|
|
176
183
|
size: The size of the volume.
|
|
177
184
|
config: The configuration of the volume.
|
|
185
|
+
labels: The labels of the volume.
|
|
178
186
|
|
|
179
187
|
"""
|
|
180
188
|
with rich_utils.safe_status(ux_utils.spinner_message('Creating volume')):
|
|
@@ -195,6 +203,7 @@ def volume_apply(name: str, volume_type: str, cloud: str, region: Optional[str],
|
|
|
195
203
|
size=size,
|
|
196
204
|
config=config,
|
|
197
205
|
name_on_cloud=name_on_cloud,
|
|
206
|
+
labels=labels,
|
|
198
207
|
)
|
|
199
208
|
logger.debug(
|
|
200
209
|
f'Creating volume {name} on cloud {cloud} with config {config}')
|
sky/volumes/volume.py
CHANGED
|
@@ -3,6 +3,7 @@ from typing import Any, Dict, Optional
|
|
|
3
3
|
|
|
4
4
|
from sky.utils import common_utils
|
|
5
5
|
from sky.utils import infra_utils
|
|
6
|
+
from sky.utils import registry
|
|
6
7
|
from sky.utils import resources_utils
|
|
7
8
|
from sky.utils import schemas
|
|
8
9
|
|
|
@@ -16,6 +17,7 @@ class Volume:
|
|
|
16
17
|
type: Optional[str] = None, # pylint: disable=redefined-builtin
|
|
17
18
|
infra: Optional[str] = None,
|
|
18
19
|
size: Optional[str] = None,
|
|
20
|
+
labels: Optional[Dict[str, str]] = None,
|
|
19
21
|
resource_name: Optional[str] = None,
|
|
20
22
|
config: Optional[Dict[str, Any]] = None):
|
|
21
23
|
"""Initialize a Volume instance.
|
|
@@ -25,12 +27,14 @@ class Volume:
|
|
|
25
27
|
type: Volume type (e.g., 'k8s-pvc')
|
|
26
28
|
infra: Infrastructure specification
|
|
27
29
|
size: Volume size
|
|
30
|
+
labels: Volume labels
|
|
28
31
|
config: Additional configuration
|
|
29
32
|
"""
|
|
30
33
|
self.name = name
|
|
31
34
|
self.type = type
|
|
32
35
|
self.infra = infra
|
|
33
36
|
self.size = size
|
|
37
|
+
self.labels = labels or {}
|
|
34
38
|
self.resource_name = resource_name
|
|
35
39
|
self.config = config or {}
|
|
36
40
|
|
|
@@ -45,6 +49,7 @@ class Volume:
|
|
|
45
49
|
type=config_dict.get('type'),
|
|
46
50
|
infra=config_dict.get('infra'),
|
|
47
51
|
size=config_dict.get('size'),
|
|
52
|
+
labels=config_dict.get('labels'),
|
|
48
53
|
resource_name=config_dict.get('resource_name'),
|
|
49
54
|
config=config_dict.get('config', {}))
|
|
50
55
|
|
|
@@ -55,6 +60,7 @@ class Volume:
|
|
|
55
60
|
'type': self.type,
|
|
56
61
|
'infra': self.infra,
|
|
57
62
|
'size': self.size,
|
|
63
|
+
'labels': self.labels,
|
|
58
64
|
'resource_name': self.resource_name,
|
|
59
65
|
'config': self.config,
|
|
60
66
|
'cloud': self.cloud,
|
|
@@ -94,15 +100,15 @@ class Volume:
|
|
|
94
100
|
# Adjust the volume config (e.g., parse size)
|
|
95
101
|
self._adjust_config()
|
|
96
102
|
|
|
97
|
-
# Validate the volume config
|
|
98
|
-
self._validate_config()
|
|
99
|
-
|
|
100
103
|
# Resolve the infrastructure options to cloud, region, zone
|
|
101
104
|
infra_info = infra_utils.InfraInfo.from_str(self.infra)
|
|
102
105
|
self.cloud = infra_info.cloud
|
|
103
106
|
self.region = infra_info.region
|
|
104
107
|
self.zone = infra_info.zone
|
|
105
108
|
|
|
109
|
+
# Validate the volume config
|
|
110
|
+
self._validate_config()
|
|
111
|
+
|
|
106
112
|
def _adjust_config(self) -> None:
|
|
107
113
|
"""Adjust the volume config (e.g., parse size)."""
|
|
108
114
|
if self.size is None:
|
|
@@ -123,3 +129,11 @@ class Volume:
|
|
|
123
129
|
raise ValueError('Size is required for new volumes. '
|
|
124
130
|
'Please specify the size in the YAML file or '
|
|
125
131
|
'use the --size flag.')
|
|
132
|
+
if self.labels:
|
|
133
|
+
assert self.cloud is not None, 'Cloud must be specified'
|
|
134
|
+
cloud_obj = registry.CLOUD_REGISTRY.from_str(self.cloud)
|
|
135
|
+
assert cloud_obj is not None
|
|
136
|
+
for key, value in self.labels.items():
|
|
137
|
+
valid, err_msg = cloud_obj.is_label_valid(key, value)
|
|
138
|
+
if not valid:
|
|
139
|
+
raise ValueError(f'{err_msg}')
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: skypilot-nightly
|
|
3
|
-
Version: 1.0.0.
|
|
3
|
+
Version: 1.0.0.dev20250820
|
|
4
4
|
Summary: SkyPilot: Run AI on Any Infra — Unified, Faster, Cheaper.
|
|
5
5
|
Author: SkyPilot Team
|
|
6
6
|
License: Apache 2.0
|
|
@@ -133,6 +133,8 @@ Requires-Dist: sqlalchemy_adapter; extra == "server"
|
|
|
133
133
|
Requires-Dist: passlib; extra == "server"
|
|
134
134
|
Requires-Dist: pyjwt; extra == "server"
|
|
135
135
|
Requires-Dist: aiohttp; extra == "server"
|
|
136
|
+
Requires-Dist: grpcio>=1.63.0; extra == "server"
|
|
137
|
+
Requires-Dist: protobuf<7.0.0,>=5.26.1; extra == "server"
|
|
136
138
|
Provides-Extra: all
|
|
137
139
|
Requires-Dist: awscli>=1.27.10; extra == "all"
|
|
138
140
|
Requires-Dist: botocore>=1.29.10; extra == "all"
|
|
@@ -187,6 +189,8 @@ Requires-Dist: sqlalchemy_adapter; extra == "all"
|
|
|
187
189
|
Requires-Dist: passlib; extra == "all"
|
|
188
190
|
Requires-Dist: pyjwt; extra == "all"
|
|
189
191
|
Requires-Dist: aiohttp; extra == "all"
|
|
192
|
+
Requires-Dist: grpcio>=1.63.0; extra == "all"
|
|
193
|
+
Requires-Dist: protobuf<7.0.0,>=5.26.1; extra == "all"
|
|
190
194
|
Dynamic: author
|
|
191
195
|
Dynamic: classifier
|
|
192
196
|
Dynamic: description
|
|
@@ -222,7 +226,7 @@ Dynamic: summary
|
|
|
222
226
|
</p>
|
|
223
227
|
|
|
224
228
|
<h3 align="center">
|
|
225
|
-
|
|
229
|
+
Simplify & scale any AI infrastructure
|
|
226
230
|
</h3>
|
|
227
231
|
|
|
228
232
|
<div align="center">
|
|
@@ -242,26 +246,28 @@ Dynamic: summary
|
|
|
242
246
|
- [Jul 2025] Finetune **Llama4** on any distributed cluster/cloud: [**example**](./llm/llama-4-finetuning/)
|
|
243
247
|
- [Jul 2025] Two-part blog series, `The Evolution of AI Job Orchestration`: (1) [Running AI jobs on GPU Neoclouds](https://blog.skypilot.co/ai-job-orchestration-pt1-gpu-neoclouds/), (2) [The AI-Native Control Plane & Orchestration that Finally Works for ML](https://blog.skypilot.co/ai-job-orchestration-pt2-ai-control-plane/)
|
|
244
248
|
- [Apr 2025] Spin up **Qwen3** on your cluster/cloud: [**example**](./llm/qwen/)
|
|
245
|
-
- [Mar 2025] Run and serve **Google Gemma 3** using SkyPilot [**example**](./llm/gemma3/)
|
|
246
249
|
- [Feb 2025] Prepare and serve **Retrieval Augmented Generation (RAG) with DeepSeek-R1**: [**blog post**](https://blog.skypilot.co/deepseek-rag), [**example**](./llm/rag/)
|
|
247
|
-
- [Feb 2025] Run and serve **DeepSeek-R1 671B** using SkyPilot and SGLang with high throughput: [**example**](./llm/deepseek-r1/)
|
|
248
|
-
- [Feb 2025] Prepare and serve large-scale image search with **vector databases**: [**blog post**](https://blog.skypilot.co/large-scale-vector-database/), [**example**](./examples/vector_database/)
|
|
249
|
-
- [Jan 2025] Launch and serve distilled models from **[DeepSeek-R1](https://github.com/deepseek-ai/DeepSeek-R1)** and **[Janus](https://github.com/deepseek-ai/DeepSeek-Janus)** on Kubernetes or any cloud: [**R1 example**](./llm/deepseek-r1-distilled/) and [**Janus example**](./llm/deepseek-janus/)
|
|
250
|
-
- [Oct 2024] :tada: **SkyPilot crossed 1M+ downloads** :tada:: Thank you to our community! [**Twitter/X**](https://x.com/skypilot_org/status/1844770841718067638)
|
|
251
250
|
|
|
252
251
|
|
|
253
252
|
**LLM Finetuning Cookbooks**: Finetuning Llama 2 / Llama 3.1 in your own cloud environment, privately: Llama 2 [**example**](./llm/vicuna-llama-2/) and [**blog**](https://blog.skypilot.co/finetuning-llama2-operational-guide/); Llama 3.1 [**example**](./llm/llama-3_1-finetuning/) and [**blog**](https://blog.skypilot.co/finetune-llama-3_1-on-your-infra/)
|
|
254
253
|
|
|
255
254
|
----
|
|
256
255
|
|
|
257
|
-
SkyPilot is
|
|
256
|
+
SkyPilot is a system for running, managing, and scaling AI workloads on any AI infrastructure.
|
|
257
|
+
|
|
258
|
+
SkyPilot gives **AI teams** a simple interface to run jobs on any infra.
|
|
259
|
+
**Infra teams** get a unified control plane to manage any AI compute — with advanced scheduling, scaling, and orchestration.
|
|
258
260
|
|
|
259
|
-
|
|
260
|
-
|
|
261
|
+

|
|
262
|
+
|
|
263
|
+
## Overview
|
|
264
|
+
|
|
265
|
+
SkyPilot **is easy to use for AI teams**:
|
|
266
|
+
- Quickly spin up compute on your own infra
|
|
261
267
|
- Environment and job as code — simple and portable
|
|
262
|
-
- Easy management: queue, run, and auto-recover many jobs
|
|
268
|
+
- Easy job management: queue, run, and auto-recover many jobs
|
|
263
269
|
|
|
264
|
-
SkyPilot **makes Kubernetes easy for AI teams**:
|
|
270
|
+
SkyPilot **makes Kubernetes easy for AI & Infra teams**:
|
|
265
271
|
|
|
266
272
|
- Slurm-like ease of use, cloud-native robustness
|
|
267
273
|
- Local dev experience on K8s: SSH into pods, sync code, or connect IDE
|
|
@@ -378,7 +384,9 @@ Source files can be found in [`llm/`](https://github.com/skypilot-org/skypilot/t
|
|
|
378
384
|
## More information
|
|
379
385
|
To learn more, see [SkyPilot Overview](https://docs.skypilot.co/en/latest/overview.html), [SkyPilot docs](https://docs.skypilot.co/en/latest/), and [SkyPilot blog](https://blog.skypilot.co/).
|
|
380
386
|
|
|
381
|
-
|
|
387
|
+
SkyPilot adopters: [Testimonials and Case Studies](https://blog.skypilot.co/case-studies/)
|
|
388
|
+
|
|
389
|
+
Partners and integrations: [Community Spotlights](https://blog.skypilot.co/community/)
|
|
382
390
|
|
|
383
391
|
Follow updates:
|
|
384
392
|
- [Slack](http://slack.skypilot.co)
|