skypilot-nightly 1.0.0.dev20250819__py3-none-any.whl → 1.0.0.dev20250821__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of skypilot-nightly might be problematic. Click here for more details.
- sky/__init__.py +5 -3
- sky/backends/cloud_vm_ray_backend.py +15 -14
- sky/backends/wheel_utils.py +2 -1
- sky/client/cli/command.py +20 -16
- sky/client/cli/flags.py +3 -3
- sky/core.py +1 -1
- sky/dashboard/out/404.html +1 -1
- sky/dashboard/out/_next/static/chunks/3015-6c9c09593b1e67b6.js +1 -0
- sky/dashboard/out/_next/static/chunks/{3785.bc5d2853355c9c47.js → 3785.d5b86f6ebc88e6e6.js} +1 -1
- sky/dashboard/out/_next/static/chunks/{9277.71481d5b2e606e33.js → 4783.c485f48348349f47.js} +8 -3
- sky/dashboard/out/_next/static/chunks/{6633-efe924b9b8136699.js → 7205-88191679e7988c57.js} +9 -4
- sky/dashboard/out/_next/static/chunks/8969-4a6f1a928fb6d370.js +1 -0
- sky/dashboard/out/_next/static/chunks/{8838.e7953f42af2b0544.js → 9946.3b7b43c217ff70ec.js} +9 -4
- sky/dashboard/out/_next/static/chunks/pages/clusters/{[cluster]-ec747e4f2dc39b57.js → [cluster]-a0527109c2fab467.js} +7 -2
- sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-dd64309c3fe67ed2.js +11 -0
- sky/dashboard/out/_next/static/chunks/pages/{jobs-4b3ba1792dc6f21d.js → jobs-7421e63ac35f8fce.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/workspaces/{[name]-65f72dee417237ef.js → [name]-de06e613e20bc977.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/{workspaces-338de9df523d883a.js → workspaces-be35b22e2046564c.js} +1 -1
- sky/dashboard/out/_next/static/chunks/webpack-6e76f636a048e145.js +1 -0
- sky/dashboard/out/_next/static/{tYn7R2be3cQPYJfTxxE09 → wN25tc2rkvOkO-qkzIhcD}/_buildManifest.js +1 -1
- sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
- sky/dashboard/out/clusters/[cluster].html +1 -1
- sky/dashboard/out/clusters.html +1 -1
- sky/dashboard/out/config.html +1 -1
- sky/dashboard/out/index.html +1 -1
- sky/dashboard/out/infra/[context].html +1 -1
- sky/dashboard/out/infra.html +1 -1
- sky/dashboard/out/jobs/[job].html +1 -1
- sky/dashboard/out/jobs/pools/[pool].html +1 -1
- sky/dashboard/out/jobs.html +1 -1
- sky/dashboard/out/users.html +1 -1
- sky/dashboard/out/volumes.html +1 -1
- sky/dashboard/out/workspace/new.html +1 -1
- sky/dashboard/out/workspaces/[name].html +1 -1
- sky/dashboard/out/workspaces.html +1 -1
- sky/exceptions.py +6 -1
- sky/global_user_state.py +18 -11
- sky/jobs/server/core.py +1 -1
- sky/models.py +1 -0
- sky/provision/aws/config.py +11 -11
- sky/provision/aws/instance.py +30 -27
- sky/provision/do/utils.py +2 -2
- sky/provision/docker_utils.py +20 -1
- sky/provision/kubernetes/network_utils.py +3 -3
- sky/provision/kubernetes/utils.py +2 -2
- sky/provision/kubernetes/volume.py +2 -0
- sky/resources.py +17 -7
- sky/serve/replica_managers.py +7 -0
- sky/serve/server/impl.py +1 -1
- sky/server/requests/payloads.py +1 -0
- sky/server/requests/serializers/encoders.py +14 -2
- sky/server/server.py +33 -0
- sky/setup_files/dependencies.py +17 -11
- sky/skypilot_config.py +4 -4
- sky/users/permission.py +2 -1
- sky/utils/common.py +27 -7
- sky/utils/common_utils.py +13 -9
- sky/utils/directory_utils.py +12 -0
- sky/utils/env_options.py +3 -0
- sky/utils/kubernetes/gpu_labeler.py +3 -3
- sky/utils/schemas.py +1 -0
- sky/utils/serialize_utils.py +16 -0
- sky/volumes/client/sdk.py +10 -7
- sky/volumes/server/core.py +12 -3
- sky/volumes/volume.py +17 -3
- {skypilot_nightly-1.0.0.dev20250819.dist-info → skypilot_nightly-1.0.0.dev20250821.dist-info}/METADATA +21 -13
- {skypilot_nightly-1.0.0.dev20250819.dist-info → skypilot_nightly-1.0.0.dev20250821.dist-info}/RECORD +76 -74
- sky/dashboard/out/_next/static/chunks/3015-bf218e4973bf5c8f.js +0 -1
- sky/dashboard/out/_next/static/chunks/8969-23c8fbdb8b397d59.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-ad2cd5aab787bc15.js +0 -6
- sky/dashboard/out/_next/static/chunks/webpack-008593a02784a2df.js +0 -1
- /sky/dashboard/out/_next/static/chunks/{1121-2edb8ab2ba080a76.js → 1121-8afcf719ea87debc.js} +0 -0
- /sky/dashboard/out/_next/static/chunks/{1141-2f60a90b7d76838e.js → 1141-943efc7aff0f0c06.js} +0 -0
- /sky/dashboard/out/_next/static/chunks/{6856-e6f350f567182e87.js → 6856-049014c6d43d127b.js} +0 -0
- /sky/dashboard/out/_next/static/chunks/pages/jobs/pools/{[pool]-7d4182df6625fe10.js → [pool]-07349868f7905d37.js} +0 -0
- /sky/dashboard/out/_next/static/{tYn7R2be3cQPYJfTxxE09 → wN25tc2rkvOkO-qkzIhcD}/_ssgManifest.js +0 -0
- {skypilot_nightly-1.0.0.dev20250819.dist-info → skypilot_nightly-1.0.0.dev20250821.dist-info}/WHEEL +0 -0
- {skypilot_nightly-1.0.0.dev20250819.dist-info → skypilot_nightly-1.0.0.dev20250821.dist-info}/entry_points.txt +0 -0
- {skypilot_nightly-1.0.0.dev20250819.dist-info → skypilot_nightly-1.0.0.dev20250821.dist-info}/licenses/LICENSE +0 -0
- {skypilot_nightly-1.0.0.dev20250819.dist-info → skypilot_nightly-1.0.0.dev20250821.dist-info}/top_level.txt +0 -0
|
@@ -10,6 +10,7 @@ from typing import Any, Dict, List, Optional, Tuple
|
|
|
10
10
|
|
|
11
11
|
from sky.schemas.api import responses
|
|
12
12
|
from sky.server import constants as server_constants
|
|
13
|
+
from sky.utils import serialize_utils
|
|
13
14
|
|
|
14
15
|
if typing.TYPE_CHECKING:
|
|
15
16
|
from sky import backends
|
|
@@ -22,6 +23,9 @@ handlers: Dict[str, Any] = {}
|
|
|
22
23
|
|
|
23
24
|
def pickle_and_encode(obj: Any) -> str:
|
|
24
25
|
try:
|
|
26
|
+
# Apply backwards compatibility processing at the lowest level
|
|
27
|
+
# to catch any handles that might have bypassed the encoders
|
|
28
|
+
obj = serialize_utils.prepare_handle_for_backwards_compatibility(obj)
|
|
25
29
|
return base64.b64encode(pickle.dumps(obj)).decode('utf-8')
|
|
26
30
|
except TypeError as e:
|
|
27
31
|
raise ValueError(f'Failed to pickle object: {obj}') from e
|
|
@@ -58,7 +62,9 @@ def encode_status(
|
|
|
58
62
|
for cluster in clusters:
|
|
59
63
|
response_cluster = cluster.model_dump()
|
|
60
64
|
response_cluster['status'] = cluster['status'].value
|
|
61
|
-
|
|
65
|
+
handle = serialize_utils.prepare_handle_for_backwards_compatibility(
|
|
66
|
+
cluster['handle'])
|
|
67
|
+
response_cluster['handle'] = pickle_and_encode(handle)
|
|
62
68
|
response_cluster['storage_mounts_metadata'] = pickle_and_encode(
|
|
63
69
|
response_cluster['storage_mounts_metadata'])
|
|
64
70
|
response.append(response_cluster)
|
|
@@ -70,6 +76,7 @@ def encode_launch(
|
|
|
70
76
|
job_id_handle: Tuple[Optional[int], Optional['backends.ResourceHandle']]
|
|
71
77
|
) -> Dict[str, Any]:
|
|
72
78
|
job_id, handle = job_id_handle
|
|
79
|
+
handle = serialize_utils.prepare_handle_for_backwards_compatibility(handle)
|
|
73
80
|
return {
|
|
74
81
|
'job_id': job_id,
|
|
75
82
|
'handle': pickle_and_encode(handle),
|
|
@@ -78,6 +85,9 @@ def encode_launch(
|
|
|
78
85
|
|
|
79
86
|
@register_encoder('start')
|
|
80
87
|
def encode_start(resource_handle: 'backends.CloudVmRayResourceHandle') -> str:
|
|
88
|
+
resource_handle = (
|
|
89
|
+
serialize_utils.prepare_handle_for_backwards_compatibility(
|
|
90
|
+
resource_handle))
|
|
81
91
|
return pickle_and_encode(resource_handle)
|
|
82
92
|
|
|
83
93
|
|
|
@@ -143,7 +153,9 @@ def _encode_serve_status(
|
|
|
143
153
|
service_status['status'] = service_status['status'].value
|
|
144
154
|
for replica_info in service_status.get('replica_info', []):
|
|
145
155
|
replica_info['status'] = replica_info['status'].value
|
|
146
|
-
|
|
156
|
+
handle = serialize_utils.prepare_handle_for_backwards_compatibility(
|
|
157
|
+
replica_info['handle'])
|
|
158
|
+
replica_info['handle'] = pickle_and_encode(handle)
|
|
147
159
|
return service_statuses
|
|
148
160
|
|
|
149
161
|
|
sky/server/server.py
CHANGED
|
@@ -83,6 +83,8 @@ else:
|
|
|
83
83
|
|
|
84
84
|
P = ParamSpec('P')
|
|
85
85
|
|
|
86
|
+
_SERVER_USER_HASH_KEY = 'server_user_hash'
|
|
87
|
+
|
|
86
88
|
|
|
87
89
|
def _add_timestamp_prefix_for_server_logs() -> None:
|
|
88
90
|
server_logger = sky_logging.init_logger('sky.server')
|
|
@@ -1821,6 +1823,35 @@ async def root():
|
|
|
1821
1823
|
return fastapi.responses.RedirectResponse(url='/dashboard/')
|
|
1822
1824
|
|
|
1823
1825
|
|
|
1826
|
+
def _init_or_restore_server_user_hash():
|
|
1827
|
+
"""Restores the server user hash from the global user state db.
|
|
1828
|
+
|
|
1829
|
+
The API server must have a stable user hash across restarts and potential
|
|
1830
|
+
multiple replicas. Thus we persist the user hash in db and restore it on
|
|
1831
|
+
startup. When upgrading from old version, the user hash will be read from
|
|
1832
|
+
the local file (if any) to keep the user hash consistent.
|
|
1833
|
+
"""
|
|
1834
|
+
|
|
1835
|
+
def apply_user_hash(user_hash: str) -> None:
|
|
1836
|
+
# For local API server, the user hash in db and local file should be
|
|
1837
|
+
# same so there is no harm to override here.
|
|
1838
|
+
common_utils.set_user_hash_locally(user_hash)
|
|
1839
|
+
# Refresh the server user hash for current process after restore or
|
|
1840
|
+
# initialize the user hash in db, child processes will get the correct
|
|
1841
|
+
# server id from the local cache file.
|
|
1842
|
+
common_lib.refresh_server_id()
|
|
1843
|
+
|
|
1844
|
+
user_hash = global_user_state.get_system_config(_SERVER_USER_HASH_KEY)
|
|
1845
|
+
if user_hash is not None:
|
|
1846
|
+
apply_user_hash(user_hash)
|
|
1847
|
+
return
|
|
1848
|
+
|
|
1849
|
+
# Initial deployment, generate a user hash and save it to the db.
|
|
1850
|
+
user_hash = common_utils.get_user_hash()
|
|
1851
|
+
global_user_state.set_system_config(_SERVER_USER_HASH_KEY, user_hash)
|
|
1852
|
+
apply_user_hash(user_hash)
|
|
1853
|
+
|
|
1854
|
+
|
|
1824
1855
|
if __name__ == '__main__':
|
|
1825
1856
|
import uvicorn
|
|
1826
1857
|
|
|
@@ -1830,6 +1861,8 @@ if __name__ == '__main__':
|
|
|
1830
1861
|
global_user_state.initialize_and_get_db()
|
|
1831
1862
|
# Initialize request db
|
|
1832
1863
|
requests_lib.reset_db_and_logs()
|
|
1864
|
+
# Restore the server user hash
|
|
1865
|
+
_init_or_restore_server_user_hash()
|
|
1833
1866
|
|
|
1834
1867
|
parser = argparse.ArgumentParser()
|
|
1835
1868
|
parser.add_argument('--host', default='127.0.0.1')
|
sky/setup_files/dependencies.py
CHANGED
|
@@ -72,12 +72,27 @@ install_requires = [
|
|
|
72
72
|
'aiohttp',
|
|
73
73
|
]
|
|
74
74
|
|
|
75
|
+
# See requirements-dev.txt for the version of grpc and protobuf
|
|
76
|
+
# used to generate the code during development.
|
|
77
|
+
|
|
78
|
+
# The grpc version at runtime must be at least as new as the version
|
|
79
|
+
# used to generate the code.
|
|
80
|
+
GRPC = 'grpcio>=1.63.0'
|
|
81
|
+
# >= 5.26.1 because the runtime version can't be older than the version
|
|
82
|
+
# used to generate the code.
|
|
83
|
+
# < 7.0.0 because code generated for a major version V will be supported by
|
|
84
|
+
# protobuf runtimes of version V and V+1.
|
|
85
|
+
# https://protobuf.dev/support/cross-version-runtime-guarantee
|
|
86
|
+
PROTOBUF = 'protobuf>=5.26.1, < 7.0.0'
|
|
87
|
+
|
|
75
88
|
server_dependencies = [
|
|
76
89
|
'casbin',
|
|
77
90
|
'sqlalchemy_adapter',
|
|
78
91
|
'passlib',
|
|
79
92
|
'pyjwt',
|
|
80
93
|
'aiohttp',
|
|
94
|
+
GRPC,
|
|
95
|
+
PROTOBUF,
|
|
81
96
|
]
|
|
82
97
|
|
|
83
98
|
local_ray = [
|
|
@@ -88,18 +103,9 @@ local_ray = [
|
|
|
88
103
|
'ray[default] >= 2.2.0, != 2.6.0',
|
|
89
104
|
]
|
|
90
105
|
|
|
91
|
-
# See requirements-dev.txt for the version of grpc and protobuf
|
|
92
|
-
# used to generate the code during development.
|
|
93
106
|
remote = [
|
|
94
|
-
|
|
95
|
-
|
|
96
|
-
'grpcio>=1.63.0',
|
|
97
|
-
# >= 5.26.1 because the runtime version can't be older than the version
|
|
98
|
-
# used to generate the code.
|
|
99
|
-
# < 7.0.0 because code generated for a major version V will be supported by
|
|
100
|
-
# protobuf runtimes of version V and V+1.
|
|
101
|
-
# https://protobuf.dev/support/cross-version-runtime-guarantee
|
|
102
|
-
'protobuf >= 5.26.1, < 7.0.0',
|
|
107
|
+
GRPC,
|
|
108
|
+
PROTOBUF,
|
|
103
109
|
]
|
|
104
110
|
|
|
105
111
|
# NOTE: Change the templates/jobs-controller.yaml.j2 file if any of the
|
sky/skypilot_config.py
CHANGED
|
@@ -514,10 +514,10 @@ def parse_and_validate_config_file(config_path: str) -> config_utils.Config:
|
|
|
514
514
|
|
|
515
515
|
|
|
516
516
|
def _parse_dotlist(dotlist: List[str]) -> config_utils.Config:
|
|
517
|
-
"""Parse a
|
|
517
|
+
"""Parse a single key-value pair into a dictionary.
|
|
518
518
|
|
|
519
519
|
Args:
|
|
520
|
-
dotlist: A
|
|
520
|
+
dotlist: A single key-value pair.
|
|
521
521
|
|
|
522
522
|
Returns:
|
|
523
523
|
A config_utils.Config object with the parsed key-value pairs.
|
|
@@ -788,7 +788,7 @@ def _compose_cli_config(cli_config: Optional[List[str]]) -> config_utils.Config:
|
|
|
788
788
|
"""Composes the skypilot CLI config.
|
|
789
789
|
CLI config can either be:
|
|
790
790
|
- A path to a config file
|
|
791
|
-
- A
|
|
791
|
+
- A single key-value pair
|
|
792
792
|
"""
|
|
793
793
|
|
|
794
794
|
if not cli_config:
|
|
@@ -804,7 +804,7 @@ def _compose_cli_config(cli_config: Optional[List[str]]) -> config_utils.Config:
|
|
|
804
804
|
config_source = maybe_config_path
|
|
805
805
|
# cli_config is a path to a config file
|
|
806
806
|
parsed_config = parse_and_validate_config_file(maybe_config_path)
|
|
807
|
-
else: # cli_config is a
|
|
807
|
+
else: # cli_config is a single key-value pair
|
|
808
808
|
parsed_config = _parse_dotlist(cli_config)
|
|
809
809
|
_validate_config(parsed_config, config_source)
|
|
810
810
|
except ValueError as e:
|
sky/users/permission.py
CHANGED
|
@@ -46,7 +46,8 @@ class PermissionService:
|
|
|
46
46
|
engine = global_user_state.initialize_and_get_db()
|
|
47
47
|
db_utils.add_all_tables_to_db_sqlalchemy(
|
|
48
48
|
sqlalchemy_adapter.Base.metadata, engine)
|
|
49
|
-
adapter = sqlalchemy_adapter.Adapter(
|
|
49
|
+
adapter = sqlalchemy_adapter.Adapter(
|
|
50
|
+
engine, db_class=sqlalchemy_adapter.CasbinRule)
|
|
50
51
|
model_path = os.path.join(os.path.dirname(__file__),
|
|
51
52
|
'model.conf')
|
|
52
53
|
enforcer = casbin.Enforcer(model_path, adapter)
|
sky/utils/common.py
CHANGED
|
@@ -11,18 +11,38 @@ from sky.utils import common_utils
|
|
|
11
11
|
|
|
12
12
|
SKY_SERVE_CONTROLLER_PREFIX: str = 'sky-serve-controller-'
|
|
13
13
|
JOB_CONTROLLER_PREFIX: str = 'sky-jobs-controller-'
|
|
14
|
+
|
|
14
15
|
# We use the user hash (machine-specific) for the controller name. It will be
|
|
15
16
|
# the same across the whole lifecycle of the server, including:
|
|
16
|
-
# 1. all requests, because
|
|
17
|
-
#
|
|
18
|
-
# 2. SkyPilot API server restarts,
|
|
19
|
-
#
|
|
17
|
+
# 1. all requests, because all the server processes share the same user hash
|
|
18
|
+
# cache file.
|
|
19
|
+
# 2. SkyPilot API server restarts, because the API server will restore the
|
|
20
|
+
# user hash from the global user state db on startup.
|
|
21
|
+
# 3. Potentially multiple server replicas, because multiple server replicas of
|
|
22
|
+
# the same deployment will share the same global user state db.
|
|
20
23
|
# This behavior is the same for the local API server (where SERVER_ID is the
|
|
21
24
|
# same as the normal user hash). This ensures backwards-compatibility with jobs
|
|
22
25
|
# controllers from before #4660.
|
|
23
|
-
SERVER_ID
|
|
24
|
-
SKY_SERVE_CONTROLLER_NAME: str
|
|
25
|
-
JOB_CONTROLLER_NAME: str
|
|
26
|
+
SERVER_ID: str
|
|
27
|
+
SKY_SERVE_CONTROLLER_NAME: str
|
|
28
|
+
JOB_CONTROLLER_NAME: str
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
def refresh_server_id() -> None:
|
|
32
|
+
"""Refresh the server id.
|
|
33
|
+
|
|
34
|
+
This function is used to ensure the server id is read from the authoritative
|
|
35
|
+
source.
|
|
36
|
+
"""
|
|
37
|
+
global SERVER_ID
|
|
38
|
+
global SKY_SERVE_CONTROLLER_NAME
|
|
39
|
+
global JOB_CONTROLLER_NAME
|
|
40
|
+
SERVER_ID = common_utils.get_user_hash()
|
|
41
|
+
SKY_SERVE_CONTROLLER_NAME = f'{SKY_SERVE_CONTROLLER_PREFIX}{SERVER_ID}'
|
|
42
|
+
JOB_CONTROLLER_NAME = f'{JOB_CONTROLLER_PREFIX}{SERVER_ID}'
|
|
43
|
+
|
|
44
|
+
|
|
45
|
+
refresh_server_id()
|
|
26
46
|
|
|
27
47
|
|
|
28
48
|
@contextlib.contextmanager
|
sky/utils/common_utils.py
CHANGED
|
@@ -28,7 +28,6 @@ from sky.adaptors import common as adaptors_common
|
|
|
28
28
|
from sky.skylet import constants
|
|
29
29
|
from sky.usage import constants as usage_constants
|
|
30
30
|
from sky.utils import annotations
|
|
31
|
-
from sky.utils import common_utils
|
|
32
31
|
from sky.utils import ux_utils
|
|
33
32
|
from sky.utils import validator
|
|
34
33
|
|
|
@@ -41,7 +40,7 @@ else:
|
|
|
41
40
|
psutil = adaptors_common.LazyImport('psutil')
|
|
42
41
|
yaml = adaptors_common.LazyImport('yaml')
|
|
43
42
|
|
|
44
|
-
|
|
43
|
+
USER_HASH_FILE = os.path.expanduser('~/.sky/user_hash')
|
|
45
44
|
USER_HASH_LENGTH = 8
|
|
46
45
|
|
|
47
46
|
# We are using base36 to reduce the length of the hash. 2 chars -> 36^2 = 1296
|
|
@@ -131,21 +130,26 @@ def get_user_hash() -> str:
|
|
|
131
130
|
assert user_hash is not None
|
|
132
131
|
return user_hash
|
|
133
132
|
|
|
134
|
-
if os.path.exists(
|
|
133
|
+
if os.path.exists(USER_HASH_FILE):
|
|
135
134
|
# Read from cached user hash file.
|
|
136
|
-
with open(
|
|
135
|
+
with open(USER_HASH_FILE, 'r', encoding='utf-8') as f:
|
|
137
136
|
# Remove invalid characters.
|
|
138
137
|
user_hash = f.read().strip()
|
|
139
138
|
if is_valid_user_hash(user_hash):
|
|
140
139
|
return user_hash
|
|
141
140
|
|
|
142
141
|
user_hash = generate_user_hash()
|
|
143
|
-
|
|
144
|
-
with open(_USER_HASH_FILE, 'w', encoding='utf-8') as f:
|
|
145
|
-
f.write(user_hash)
|
|
142
|
+
set_user_hash_locally(user_hash)
|
|
146
143
|
return user_hash
|
|
147
144
|
|
|
148
145
|
|
|
146
|
+
def set_user_hash_locally(user_hash: str) -> None:
|
|
147
|
+
"""Sets the user hash to local file."""
|
|
148
|
+
os.makedirs(os.path.dirname(USER_HASH_FILE), exist_ok=True)
|
|
149
|
+
with open(USER_HASH_FILE, 'w', encoding='utf-8') as f:
|
|
150
|
+
f.write(user_hash)
|
|
151
|
+
|
|
152
|
+
|
|
149
153
|
def base36_encode(hex_str: str) -> str:
|
|
150
154
|
"""Converts a hex string to a base36 string."""
|
|
151
155
|
int_value = int(hex_str, 16)
|
|
@@ -343,7 +347,7 @@ def get_current_user() -> 'models.User':
|
|
|
343
347
|
|
|
344
348
|
def get_current_user_name() -> str:
|
|
345
349
|
"""Returns the current user name."""
|
|
346
|
-
name =
|
|
350
|
+
name = get_current_user().name
|
|
347
351
|
assert name is not None
|
|
348
352
|
return name
|
|
349
353
|
|
|
@@ -886,7 +890,7 @@ def get_cleaned_username(username: str = '') -> str:
|
|
|
886
890
|
Returns:
|
|
887
891
|
A cleaned username.
|
|
888
892
|
"""
|
|
889
|
-
username = username or
|
|
893
|
+
username = username or get_current_user_name()
|
|
890
894
|
username = username.lower()
|
|
891
895
|
username = re.sub(r'[^a-z0-9-_]', '', username)
|
|
892
896
|
username = re.sub(r'^[0-9-]+', '', username)
|
|
@@ -0,0 +1,12 @@
|
|
|
1
|
+
"""Directory utilities."""
|
|
2
|
+
|
|
3
|
+
import os
|
|
4
|
+
|
|
5
|
+
# This file is in '<project_root>/sky/utils/directory_utils.py'
|
|
6
|
+
# So we need to go up 2 levels to get to the '<project_root>/sky' directory
|
|
7
|
+
SKY_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
def get_sky_dir():
|
|
11
|
+
"""Get the sky root directory."""
|
|
12
|
+
return SKY_DIR
|
sky/utils/env_options.py
CHANGED
|
@@ -24,6 +24,9 @@ class Options(enum.Enum):
|
|
|
24
24
|
# running in a Buildkite container environment, which requires special
|
|
25
25
|
# handling for networking between containers.
|
|
26
26
|
RUNNING_IN_BUILDKITE = ('BUILDKITE', False)
|
|
27
|
+
# Internal: This is used for testing to enable grpc for communication
|
|
28
|
+
# between the API server and the Skylet.
|
|
29
|
+
ENABLE_GRPC = ('SKYPILOT_ENABLE_GRPC', False)
|
|
27
30
|
|
|
28
31
|
def __init__(self, env_var: str, default: bool) -> None:
|
|
29
32
|
super().__init__()
|
|
@@ -8,9 +8,9 @@ from typing import Dict, Optional, Tuple
|
|
|
8
8
|
import colorama
|
|
9
9
|
import yaml
|
|
10
10
|
|
|
11
|
-
import sky
|
|
12
11
|
from sky.adaptors import kubernetes
|
|
13
12
|
from sky.provision.kubernetes import utils as kubernetes_utils
|
|
13
|
+
from sky.utils import directory_utils
|
|
14
14
|
from sky.utils import rich_utils
|
|
15
15
|
|
|
16
16
|
|
|
@@ -71,8 +71,8 @@ def label(context: Optional[str] = None, wait_for_completion: bool = True):
|
|
|
71
71
|
f'Found {len(unlabeled_gpu_nodes)} '
|
|
72
72
|
'unlabeled GPU nodes in the cluster', colorama.Fore.YELLOW))
|
|
73
73
|
|
|
74
|
-
|
|
75
|
-
|
|
74
|
+
manifest_dir = os.path.join(directory_utils.get_sky_dir(),
|
|
75
|
+
'utils/kubernetes')
|
|
76
76
|
|
|
77
77
|
# Apply the RBAC manifest using kubectl since it contains multiple resources
|
|
78
78
|
with rich_utils.client_status('Setting up GPU labeling'):
|
sky/utils/schemas.py
CHANGED
(diff not shown)
sky/utils/serialize_utils.py
ADDED
|
@@ -0,0 +1,16 @@
|
|
|
1
|
+
"""Utilities for handling resource handles."""
|
|
2
|
+
import copy
|
|
3
|
+
import typing
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
def prepare_handle_for_backwards_compatibility(
|
|
7
|
+
handle: typing.Any) -> typing.Any:
|
|
8
|
+
"""Prepare a handle for backwards compatibility with older clients."""
|
|
9
|
+
# skylet_ssh_tunnel was causing backwards compatibility issues with older
|
|
10
|
+
# clients: AttributeError: Can't get attribute 'SSHTunnelInfo'
|
|
11
|
+
#
|
|
12
|
+
# But it is not needed on the client side, so we can just remove it.
|
|
13
|
+
if handle is not None and hasattr(handle, 'skylet_ssh_tunnel'):
|
|
14
|
+
handle = copy.deepcopy(handle)
|
|
15
|
+
handle.skylet_ssh_tunnel = None
|
|
16
|
+
return handle
|
sky/volumes/client/sdk.py
CHANGED
|
@@ -33,13 +33,16 @@ def apply(volume: volume_lib.Volume) -> server_common.RequestId[None]:
|
|
|
33
33
|
Returns:
|
|
34
34
|
The request ID of the apply request.
|
|
35
35
|
"""
|
|
36
|
-
body = payloads.VolumeApplyBody(
|
|
37
|
-
|
|
38
|
-
|
|
39
|
-
|
|
40
|
-
|
|
41
|
-
|
|
42
|
-
|
|
36
|
+
body = payloads.VolumeApplyBody(
|
|
37
|
+
name=volume.name,
|
|
38
|
+
volume_type=volume.type,
|
|
39
|
+
cloud=volume.cloud,
|
|
40
|
+
region=volume.region,
|
|
41
|
+
zone=volume.zone,
|
|
42
|
+
size=volume.size,
|
|
43
|
+
config=volume.config,
|
|
44
|
+
labels=volume.labels,
|
|
45
|
+
)
|
|
43
46
|
response = requests.post(f'{server_common.get_server_url()}/volumes/apply',
|
|
44
47
|
json=json.loads(body.model_dump_json()),
|
|
45
48
|
cookies=server_common.get_api_cookie_jar())
|
sky/volumes/server/core.py
CHANGED
|
@@ -162,9 +162,16 @@ def volume_delete(names: List[str]) -> None:
|
|
|
162
162
|
global_user_state.delete_volume(name)
|
|
163
163
|
|
|
164
164
|
|
|
165
|
-
def volume_apply(
|
|
166
|
-
|
|
167
|
-
|
|
165
|
+
def volume_apply(
|
|
166
|
+
name: str,
|
|
167
|
+
volume_type: str,
|
|
168
|
+
cloud: str,
|
|
169
|
+
region: Optional[str],
|
|
170
|
+
zone: Optional[str],
|
|
171
|
+
size: Optional[str],
|
|
172
|
+
config: Dict[str, Any],
|
|
173
|
+
labels: Optional[Dict[str, str]] = None,
|
|
174
|
+
) -> None:
|
|
168
175
|
"""Creates or registers a volume.
|
|
169
176
|
|
|
170
177
|
Args:
|
|
@@ -175,6 +182,7 @@ def volume_apply(name: str, volume_type: str, cloud: str, region: Optional[str],
|
|
|
175
182
|
zone: The zone of the volume.
|
|
176
183
|
size: The size of the volume.
|
|
177
184
|
config: The configuration of the volume.
|
|
185
|
+
labels: The labels of the volume.
|
|
178
186
|
|
|
179
187
|
"""
|
|
180
188
|
with rich_utils.safe_status(ux_utils.spinner_message('Creating volume')):
|
|
@@ -195,6 +203,7 @@ def volume_apply(name: str, volume_type: str, cloud: str, region: Optional[str],
|
|
|
195
203
|
size=size,
|
|
196
204
|
config=config,
|
|
197
205
|
name_on_cloud=name_on_cloud,
|
|
206
|
+
labels=labels,
|
|
198
207
|
)
|
|
199
208
|
logger.debug(
|
|
200
209
|
f'Creating volume {name} on cloud {cloud} with config {config}')
|
sky/volumes/volume.py
CHANGED
|
@@ -3,6 +3,7 @@ from typing import Any, Dict, Optional
|
|
|
3
3
|
|
|
4
4
|
from sky.utils import common_utils
|
|
5
5
|
from sky.utils import infra_utils
|
|
6
|
+
from sky.utils import registry
|
|
6
7
|
from sky.utils import resources_utils
|
|
7
8
|
from sky.utils import schemas
|
|
8
9
|
|
|
@@ -16,6 +17,7 @@ class Volume:
|
|
|
16
17
|
type: Optional[str] = None, # pylint: disable=redefined-builtin
|
|
17
18
|
infra: Optional[str] = None,
|
|
18
19
|
size: Optional[str] = None,
|
|
20
|
+
labels: Optional[Dict[str, str]] = None,
|
|
19
21
|
resource_name: Optional[str] = None,
|
|
20
22
|
config: Optional[Dict[str, Any]] = None):
|
|
21
23
|
"""Initialize a Volume instance.
|
|
@@ -25,12 +27,14 @@ class Volume:
|
|
|
25
27
|
type: Volume type (e.g., 'k8s-pvc')
|
|
26
28
|
infra: Infrastructure specification
|
|
27
29
|
size: Volume size
|
|
30
|
+
labels: Volume labels
|
|
28
31
|
config: Additional configuration
|
|
29
32
|
"""
|
|
30
33
|
self.name = name
|
|
31
34
|
self.type = type
|
|
32
35
|
self.infra = infra
|
|
33
36
|
self.size = size
|
|
37
|
+
self.labels = labels or {}
|
|
34
38
|
self.resource_name = resource_name
|
|
35
39
|
self.config = config or {}
|
|
36
40
|
|
|
@@ -45,6 +49,7 @@ class Volume:
|
|
|
45
49
|
type=config_dict.get('type'),
|
|
46
50
|
infra=config_dict.get('infra'),
|
|
47
51
|
size=config_dict.get('size'),
|
|
52
|
+
labels=config_dict.get('labels'),
|
|
48
53
|
resource_name=config_dict.get('resource_name'),
|
|
49
54
|
config=config_dict.get('config', {}))
|
|
50
55
|
|
|
@@ -55,6 +60,7 @@ class Volume:
|
|
|
55
60
|
'type': self.type,
|
|
56
61
|
'infra': self.infra,
|
|
57
62
|
'size': self.size,
|
|
63
|
+
'labels': self.labels,
|
|
58
64
|
'resource_name': self.resource_name,
|
|
59
65
|
'config': self.config,
|
|
60
66
|
'cloud': self.cloud,
|
|
@@ -94,15 +100,15 @@ class Volume:
|
|
|
94
100
|
# Adjust the volume config (e.g., parse size)
|
|
95
101
|
self._adjust_config()
|
|
96
102
|
|
|
97
|
-
# Validate the volume config
|
|
98
|
-
self._validate_config()
|
|
99
|
-
|
|
100
103
|
# Resolve the infrastructure options to cloud, region, zone
|
|
101
104
|
infra_info = infra_utils.InfraInfo.from_str(self.infra)
|
|
102
105
|
self.cloud = infra_info.cloud
|
|
103
106
|
self.region = infra_info.region
|
|
104
107
|
self.zone = infra_info.zone
|
|
105
108
|
|
|
109
|
+
# Validate the volume config
|
|
110
|
+
self._validate_config()
|
|
111
|
+
|
|
106
112
|
def _adjust_config(self) -> None:
|
|
107
113
|
"""Adjust the volume config (e.g., parse size)."""
|
|
108
114
|
if self.size is None:
|
|
@@ -123,3 +129,11 @@ class Volume:
|
|
|
123
129
|
raise ValueError('Size is required for new volumes. '
|
|
124
130
|
'Please specify the size in the YAML file or '
|
|
125
131
|
'use the --size flag.')
|
|
132
|
+
if self.labels:
|
|
133
|
+
assert self.cloud is not None, 'Cloud must be specified'
|
|
134
|
+
cloud_obj = registry.CLOUD_REGISTRY.from_str(self.cloud)
|
|
135
|
+
assert cloud_obj is not None
|
|
136
|
+
for key, value in self.labels.items():
|
|
137
|
+
valid, err_msg = cloud_obj.is_label_valid(key, value)
|
|
138
|
+
if not valid:
|
|
139
|
+
raise ValueError(f'{err_msg}')
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: skypilot-nightly
|
|
3
|
-
Version: 1.0.0.
|
|
3
|
+
Version: 1.0.0.dev20250821
|
|
4
4
|
Summary: SkyPilot: Run AI on Any Infra — Unified, Faster, Cheaper.
|
|
5
5
|
Author: SkyPilot Team
|
|
6
6
|
License: Apache 2.0
|
|
@@ -133,6 +133,8 @@ Requires-Dist: sqlalchemy_adapter; extra == "server"
|
|
|
133
133
|
Requires-Dist: passlib; extra == "server"
|
|
134
134
|
Requires-Dist: pyjwt; extra == "server"
|
|
135
135
|
Requires-Dist: aiohttp; extra == "server"
|
|
136
|
+
Requires-Dist: grpcio>=1.63.0; extra == "server"
|
|
137
|
+
Requires-Dist: protobuf<7.0.0,>=5.26.1; extra == "server"
|
|
136
138
|
Provides-Extra: all
|
|
137
139
|
Requires-Dist: awscli>=1.27.10; extra == "all"
|
|
138
140
|
Requires-Dist: botocore>=1.29.10; extra == "all"
|
|
@@ -187,6 +189,8 @@ Requires-Dist: sqlalchemy_adapter; extra == "all"
|
|
|
187
189
|
Requires-Dist: passlib; extra == "all"
|
|
188
190
|
Requires-Dist: pyjwt; extra == "all"
|
|
189
191
|
Requires-Dist: aiohttp; extra == "all"
|
|
192
|
+
Requires-Dist: grpcio>=1.63.0; extra == "all"
|
|
193
|
+
Requires-Dist: protobuf<7.0.0,>=5.26.1; extra == "all"
|
|
190
194
|
Dynamic: author
|
|
191
195
|
Dynamic: classifier
|
|
192
196
|
Dynamic: description
|
|
@@ -222,7 +226,7 @@ Dynamic: summary
|
|
|
222
226
|
</p>
|
|
223
227
|
|
|
224
228
|
<h3 align="center">
|
|
225
|
-
|
|
229
|
+
Simplify & scale any AI infrastructure
|
|
226
230
|
</h3>
|
|
227
231
|
|
|
228
232
|
<div align="center">
|
|
@@ -242,26 +246,28 @@ Dynamic: summary
|
|
|
242
246
|
- [Jul 2025] Finetune **Llama4** on any distributed cluster/cloud: [**example**](./llm/llama-4-finetuning/)
|
|
243
247
|
- [Jul 2025] Two-part blog series, `The Evolution of AI Job Orchestration`: (1) [Running AI jobs on GPU Neoclouds](https://blog.skypilot.co/ai-job-orchestration-pt1-gpu-neoclouds/), (2) [The AI-Native Control Plane & Orchestration that Finally Works for ML](https://blog.skypilot.co/ai-job-orchestration-pt2-ai-control-plane/)
|
|
244
248
|
- [Apr 2025] Spin up **Qwen3** on your cluster/cloud: [**example**](./llm/qwen/)
|
|
245
|
-
- [Mar 2025] Run and serve **Google Gemma 3** using SkyPilot [**example**](./llm/gemma3/)
|
|
246
249
|
- [Feb 2025] Prepare and serve **Retrieval Augmented Generation (RAG) with DeepSeek-R1**: [**blog post**](https://blog.skypilot.co/deepseek-rag), [**example**](./llm/rag/)
|
|
247
|
-
- [Feb 2025] Run and serve **DeepSeek-R1 671B** using SkyPilot and SGLang with high throughput: [**example**](./llm/deepseek-r1/)
|
|
248
|
-
- [Feb 2025] Prepare and serve large-scale image search with **vector databases**: [**blog post**](https://blog.skypilot.co/large-scale-vector-database/), [**example**](./examples/vector_database/)
|
|
249
|
-
- [Jan 2025] Launch and serve distilled models from **[DeepSeek-R1](https://github.com/deepseek-ai/DeepSeek-R1)** and **[Janus](https://github.com/deepseek-ai/DeepSeek-Janus)** on Kubernetes or any cloud: [**R1 example**](./llm/deepseek-r1-distilled/) and [**Janus example**](./llm/deepseek-janus/)
|
|
250
|
-
- [Oct 2024] :tada: **SkyPilot crossed 1M+ downloads** :tada:: Thank you to our community! [**Twitter/X**](https://x.com/skypilot_org/status/1844770841718067638)
|
|
251
250
|
|
|
252
251
|
|
|
253
252
|
**LLM Finetuning Cookbooks**: Finetuning Llama 2 / Llama 3.1 in your own cloud environment, privately: Llama 2 [**example**](./llm/vicuna-llama-2/) and [**blog**](https://blog.skypilot.co/finetuning-llama2-operational-guide/); Llama 3.1 [**example**](./llm/llama-3_1-finetuning/) and [**blog**](https://blog.skypilot.co/finetune-llama-3_1-on-your-infra/)
|
|
254
253
|
|
|
255
254
|
----
|
|
256
255
|
|
|
257
|
-
SkyPilot is
|
|
256
|
+
SkyPilot is a system for running, managing, and scaling AI workloads on any AI infrastructure.
|
|
257
|
+
|
|
258
|
+
SkyPilot gives **AI teams** a simple interface to run jobs on any infra.
|
|
259
|
+
**Infra teams** get a unified control plane to manage any AI compute — with advanced scheduling, scaling, and orchestration.
|
|
258
260
|
|
|
259
|
-
|
|
260
|
-
|
|
261
|
+

|
|
262
|
+
|
|
263
|
+
## Overview
|
|
264
|
+
|
|
265
|
+
SkyPilot **is easy to use for AI teams**:
|
|
266
|
+
- Quickly spin up compute on your own infra
|
|
261
267
|
- Environment and job as code — simple and portable
|
|
262
|
-
- Easy management: queue, run, and auto-recover many jobs
|
|
268
|
+
- Easy job management: queue, run, and auto-recover many jobs
|
|
263
269
|
|
|
264
|
-
SkyPilot **makes Kubernetes easy for AI teams**:
|
|
270
|
+
SkyPilot **makes Kubernetes easy for AI & Infra teams**:
|
|
265
271
|
|
|
266
272
|
- Slurm-like ease of use, cloud-native robustness
|
|
267
273
|
- Local dev experience on K8s: SSH into pods, sync code, or connect IDE
|
|
@@ -378,7 +384,9 @@ Source files can be found in [`llm/`](https://github.com/skypilot-org/skypilot/t
|
|
|
378
384
|
## More information
|
|
379
385
|
To learn more, see [SkyPilot Overview](https://docs.skypilot.co/en/latest/overview.html), [SkyPilot docs](https://docs.skypilot.co/en/latest/), and [SkyPilot blog](https://blog.skypilot.co/).
|
|
380
386
|
|
|
381
|
-
|
|
387
|
+
SkyPilot adopters: [Testimonials and Case Studies](https://blog.skypilot.co/case-studies/)
|
|
388
|
+
|
|
389
|
+
Partners and integrations: [Community Spotlights](https://blog.skypilot.co/community/)
|
|
382
390
|
|
|
383
391
|
Follow updates:
|
|
384
392
|
- [Slack](http://slack.skypilot.co)
|