skypilot-nightly 1.0.0.dev20250716__py3-none-any.whl → 1.0.0.dev20250718__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sky/__init__.py +4 -2
- sky/backends/backend.py +8 -4
- sky/backends/cloud_vm_ray_backend.py +50 -1
- sky/backends/docker_utils.py +1 -1
- sky/backends/local_docker_backend.py +2 -1
- sky/catalog/common.py +60 -50
- sky/catalog/data_fetchers/fetch_gcp.py +1 -0
- sky/catalog/gcp_catalog.py +24 -7
- sky/catalog/kubernetes_catalog.py +5 -1
- sky/client/cli/command.py +180 -77
- sky/client/cli/git.py +549 -0
- sky/client/common.py +1 -1
- sky/client/sdk.py +1 -1
- sky/clouds/gcp.py +1 -1
- sky/dashboard/out/404.html +1 -1
- sky/dashboard/out/_next/static/{gVXjeFhvtWXyOsx9xYNvM → FUjweqdImyeYhMYFON-Se}/_buildManifest.js +1 -1
- sky/dashboard/out/_next/static/chunks/1043-734e57d2b27dfe5d.js +1 -0
- sky/dashboard/out/_next/static/chunks/4869.bdd42f14b51d1d6f.js +16 -0
- sky/dashboard/out/_next/static/chunks/8969-8e0b2055bf5dd499.js +1 -0
- sky/dashboard/out/_next/static/chunks/{9984.b56614f3c4c5961d.js → 9984.2b5e3fa69171bff9.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-fa406155b4223d0d.js +11 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs/{[job]-14d404b7dd28502a.js → [job]-c5b357bfd9502fbe.js} +1 -1
- sky/dashboard/out/_next/static/chunks/webpack-6b0575ea521af4f3.js +1 -0
- sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
- sky/dashboard/out/clusters/[cluster].html +1 -1
- sky/dashboard/out/clusters.html +1 -1
- sky/dashboard/out/config.html +1 -1
- sky/dashboard/out/index.html +1 -1
- sky/dashboard/out/infra/[context].html +1 -1
- sky/dashboard/out/infra.html +1 -1
- sky/dashboard/out/jobs/[job].html +1 -1
- sky/dashboard/out/jobs.html +1 -1
- sky/dashboard/out/users.html +1 -1
- sky/dashboard/out/volumes.html +1 -1
- sky/dashboard/out/workspace/new.html +1 -1
- sky/dashboard/out/workspaces/[name].html +1 -1
- sky/dashboard/out/workspaces.html +1 -1
- sky/exceptions.py +5 -0
- sky/execution.py +1 -1
- sky/jobs/client/sdk.py +1 -1
- sky/jobs/server/core.py +14 -0
- sky/provision/kubernetes/utils.py +6 -0
- sky/serve/client/sdk.py +1 -1
- sky/server/common.py +8 -3
- sky/server/rest.py +71 -26
- sky/setup_files/MANIFEST.in +1 -0
- sky/setup_files/dependencies.py +2 -0
- sky/task.py +12 -2
- sky/utils/command_runner.py +144 -35
- sky/utils/controller_utils.py +4 -3
- sky/utils/git.py +9 -0
- sky/utils/git_clone.sh +460 -0
- sky/utils/schemas.py +15 -1
- {skypilot_nightly-1.0.0.dev20250716.dist-info → skypilot_nightly-1.0.0.dev20250718.dist-info}/METADATA +3 -1
- {skypilot_nightly-1.0.0.dev20250716.dist-info → skypilot_nightly-1.0.0.dev20250718.dist-info}/RECORD +60 -57
- sky/dashboard/out/_next/static/chunks/1043-90a88c46f27b3df5.js +0 -1
- sky/dashboard/out/_next/static/chunks/4869.c139c0124e677fc8.js +0 -16
- sky/dashboard/out/_next/static/chunks/8969-743abf4bc86baf48.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-9096ea50b8e2cf9e.js +0 -6
- sky/dashboard/out/_next/static/chunks/webpack-3fad5d4a0541a02d.js +0 -1
- /sky/dashboard/out/_next/static/{gVXjeFhvtWXyOsx9xYNvM → FUjweqdImyeYhMYFON-Se}/_ssgManifest.js +0 -0
- {skypilot_nightly-1.0.0.dev20250716.dist-info → skypilot_nightly-1.0.0.dev20250718.dist-info}/WHEEL +0 -0
- {skypilot_nightly-1.0.0.dev20250716.dist-info → skypilot_nightly-1.0.0.dev20250718.dist-info}/entry_points.txt +0 -0
- {skypilot_nightly-1.0.0.dev20250716.dist-info → skypilot_nightly-1.0.0.dev20250718.dist-info}/licenses/LICENSE +0 -0
- {skypilot_nightly-1.0.0.dev20250716.dist-info → skypilot_nightly-1.0.0.dev20250718.dist-info}/top_level.txt +0 -0
sky/__init__.py
CHANGED
@@ -5,7 +5,7 @@ from typing import Optional
 import urllib.request
 
 # Replaced with the current commit when building the wheels.
-_SKYPILOT_COMMIT_SHA = '
+_SKYPILOT_COMMIT_SHA = '663a28261fc98dfa69214e1d4f1b0bb7b02664e0'
 
 
 def _get_git_commit():
@@ -35,7 +35,7 @@ def _get_git_commit():
 
 
 __commit__ = _get_git_commit()
-__version__ = '1.0.0.
+__version__ = '1.0.0.dev20250718'
 __root_dir__ = os.path.dirname(os.path.abspath(__file__))
 
 
@@ -88,6 +88,7 @@ from sky.admin_policy import UserRequest
 from sky.catalog import list_accelerators
 from sky.client.sdk import api_cancel
 from sky.client.sdk import api_info
+from sky.client.sdk import api_login
 from sky.client.sdk import api_server_logs
 from sky.client.sdk import api_start
 from sky.client.sdk import api_status
@@ -206,6 +207,7 @@ __all__ = [
     'api_status',
     'api_cancel',
     'api_info',
+    'api_login',
     'api_start',
     'api_stop',
     'api_server_logs',
sky/backends/backend.py
CHANGED
@@ -1,6 +1,6 @@
 """Sky backend interface."""
 import typing
-from typing import Dict, Generic, Optional, Tuple
+from typing import Any, Dict, Generic, Optional, Tuple, Union
 
 from sky.usage import usage_lib
 from sky.utils import cluster_utils
@@ -90,8 +90,10 @@ class Backend(Generic[_ResourceHandleType]):
 
     @timeline.event
     @usage_lib.messages.usage.update_runtime('sync_workdir')
-    def sync_workdir(self, handle: _ResourceHandleType,
-
+    def sync_workdir(self, handle: _ResourceHandleType,
+                     workdir: Union[Path, Dict[str, Any]],
+                     envs_and_secrets: Dict[str, str]) -> None:
+        return self._sync_workdir(handle, workdir, envs_and_secrets)
 
     @timeline.event
     @usage_lib.messages.usage.update_runtime('sync_file_mounts')
@@ -165,7 +167,9 @@ class Backend(Generic[_ResourceHandleType]):
     ) -> Tuple[Optional[_ResourceHandleType], bool]:
         raise NotImplementedError
 
-    def _sync_workdir(self, handle: _ResourceHandleType,
+    def _sync_workdir(self, handle: _ResourceHandleType,
+                      workdir: Union[Path, Dict[str, Any]],
+                      envs_and_secrets: Dict[str, str]) -> None:
         raise NotImplementedError
 
     def _sync_file_mounts(
sky/backends/cloud_vm_ray_backend.py
CHANGED
@@ -3240,10 +3240,59 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
             common_utils.remove_file_if_exists(lock_path)
 
     def _sync_workdir(self, handle: CloudVmRayResourceHandle,
-                      workdir: Path
+                      workdir: Union[Path, Dict[str, Any]],
+                      envs_and_secrets: Dict[str, str]) -> None:
         # Even though provision() takes care of it, there may be cases where
         # this function is called in isolation, without calling provision(),
         # e.g., in CLI. So we should rerun rsync_up.
+        if isinstance(workdir, dict):
+            self._sync_git_workdir(handle, envs_and_secrets)
+        else:
+            self._sync_path_workdir(handle, workdir)
+
+    def _sync_git_workdir(self, handle: CloudVmRayResourceHandle,
+                          envs_and_secrets: Dict[str, str]) -> None:
+        style = colorama.Style
+        ip_list = handle.external_ips()
+        assert ip_list is not None, 'external_ips is not cached in handle'
+
+        log_path = os.path.join(self.log_dir, 'workdir_sync.log')
+
+        # TODO(zhwu): refactor this with backend_utils.parallel_cmd_with_rsync
+        runners = handle.get_command_runners()
+
+        def _sync_git_workdir_node(
+                runner: command_runner.CommandRunner) -> None:
+            # Type assertion to help mypy understand the type
+            assert hasattr(
+                runner, 'git_clone'
+            ), f'CommandRunner should have git_clone method, ' \
+                f'got {type(runner)}'
+            runner.git_clone(
+                target_dir=SKY_REMOTE_WORKDIR,
+                log_path=log_path,
+                stream_logs=False,
+                max_retry=3,
+                envs_and_secrets=envs_and_secrets,
+            )
+
+        num_nodes = handle.launched_nodes
+        plural = 's' if num_nodes > 1 else ''
+        logger.info(
+            f' {style.DIM}Syncing workdir (to {num_nodes} node{plural}): '
+            f'{SKY_REMOTE_WORKDIR}{style.RESET_ALL}')
+        os.makedirs(os.path.expanduser(self.log_dir), exist_ok=True)
+        os.system(f'touch {log_path}')
+        num_threads = subprocess_utils.get_parallel_threads(
+            str(handle.launched_resources.cloud))
+        with rich_utils.safe_status(
+                ux_utils.spinner_message('Syncing workdir', log_path)):
+            subprocess_utils.run_in_parallel(_sync_git_workdir_node, runners,
+                                             num_threads)
+        logger.info(ux_utils.finishing_message('Synced workdir.', log_path))
+
+    def _sync_path_workdir(self, handle: CloudVmRayResourceHandle,
+                           workdir: Path) -> None:
         fore = colorama.Fore
         style = colorama.Style
         ip_list = handle.external_ips()
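The hunks above widen workdir from a filesystem path to Union[Path, Dict[str, Any]] and route dict-shaped (git) workdirs through a separate clone path, with credentials passed via environment variables. Below is a minimal, self-contained sketch of that dispatch pattern; the helper names and the dict fields (url, ref) are illustrative assumptions, not SkyPilot's API.

from pathlib import Path
from typing import Any, Dict, Union


def sync_workdir(workdir: Union[Path, str, Dict[str, Any]],
                 envs_and_secrets: Dict[str, str]) -> None:
    """Dispatch on the workdir type: a dict describes a git source,
    anything else is treated as a local directory to copy."""
    if isinstance(workdir, dict):
        _sync_git_workdir(workdir, envs_and_secrets)
    else:
        _sync_path_workdir(Path(workdir))


def _sync_git_workdir(spec: Dict[str, Any],
                      envs_and_secrets: Dict[str, str]) -> None:
    # Stand-in for runner.git_clone(...): credentials and refs travel as
    # environment variables rather than being baked into a command line.
    print(f'clone {spec.get("url")} at {spec.get("ref")} '
          f'with {len(envs_and_secrets)} env vars')


def _sync_path_workdir(path: Path) -> None:
    # Stand-in for the existing rsync-based path sync.
    print(f'rsync {path} to the remote workdir')


# Example calls with both workdir shapes (field names are assumptions).
sync_workdir('./my_project', {})
sync_workdir({'url': 'https://github.com/org/repo', 'ref': 'main'},
             {'GIT_TOKEN': '...'})

Keeping the dispatch at the public sync_workdir boundary means callers do not need to care whether a workdir is a directory or a repository.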
sky/backends/docker_utils.py
CHANGED
@@ -168,7 +168,7 @@ def build_dockerimage(task: task_mod.Task,
                              build_dir=temp_dir)
 
     dst = os.path.join(temp_dir, SKY_DOCKER_WORKDIR)
-    if task.workdir is not None:
+    if task.workdir is not None and isinstance(task.workdir, str):
         # Copy workdir contents to tempdir
         shutil.copytree(os.path.expanduser(task.workdir), dst)
     else:
sky/backends/local_docker_backend.py
CHANGED
@@ -178,7 +178,8 @@ class LocalDockerBackend(backends.Backend['LocalDockerResourceHandle']):
         return handle, False
 
     def _sync_workdir(self, handle: LocalDockerResourceHandle,
-                      workdir: Path
+                      workdir: Union[Path, Dict[str, Any]],
+                      envs_and_secrets: Dict[str, str]) -> None:
         """Workdir is sync'd by adding to the docker image.
 
         This happens in the execute step.
sky/catalog/common.py
CHANGED
@@ -13,6 +13,7 @@ from sky import sky_logging
 from sky.adaptors import common as adaptors_common
 from sky.clouds import cloud as cloud_lib
 from sky.skylet import constants
+from sky.utils import annotations
 from sky.utils import common_utils
 from sky.utils import registry
 from sky.utils import rich_utils
@@ -125,17 +126,21 @@ class LazyDataFrame:
 
     We don't need to load the catalog for every SkyPilot call, and this class
     allows us to load the catalog only when needed.
+
+    Use update_if_stale_func to pass in a function that decides whether to
+    update the catalog on disk, updates it if needed, and returns
+    a bool indicating whether the update was done.
     """
 
-    def __init__(self, filename: str,
+    def __init__(self, filename: str, update_if_stale_func: Callable[[], bool]):
         self._filename = filename
         self._df: Optional['pd.DataFrame'] = None
-        self.
+        self._update_if_stale_func = update_if_stale_func
 
+    @annotations.lru_cache(scope='request')
     def _load_df(self) -> 'pd.DataFrame':
-        if self._df is None:
+        if self._update_if_stale_func() or self._df is None:
             try:
-                self._update_func()
                 self._df = pd.read_csv(self._filename)
             except Exception as e:  # pylint: disable=broad-except
                 # As users can manually modify the catalog, read_csv can fail.
@@ -193,55 +198,60 @@ def read_catalog(filename: str,
         return last_update + pull_frequency_hours * 3600 < time.time()
 
     def _update_catalog():
+        # Fast path: Exit early to avoid lock contention.
+        if not _need_update():
+            return False
+
         # Atomic check, to avoid conflicts with other processes.
         with filelock.FileLock(meta_path + '.lock'):
-                         'connection.')
-            with ux_utils.print_exception_no_traceback():
-                raise e
+            # Double check after acquiring the lock.
+            if not _need_update():
+                return False
+
+            url = f'{constants.HOSTED_CATALOG_DIR_URL}/{constants.CATALOG_SCHEMA_VERSION}/{filename}'  # pylint: disable=line-too-long
+            url_fallback = f'{constants.HOSTED_CATALOG_DIR_URL_S3_MIRROR}/{constants.CATALOG_SCHEMA_VERSION}/{filename}'  # pylint: disable=line-too-long
+            headers = {'User-Agent': 'SkyPilot/0.7'}
+            update_frequency_str = ''
+            if pull_frequency_hours is not None:
+                update_frequency_str = (
+                    f' (every {pull_frequency_hours} hours)')
+            with rich_utils.safe_status(
+                    ux_utils.spinner_message(
+                        f'Updating {cloud} catalog: {filename}') +
+                    f'{update_frequency_str}'):
+                try:
+                    r = requests.get(url=url, headers=headers)
+                    if r.status_code == 429:
+                        # fallback to s3 mirror, github introduced rate
+                        # limit after 2025-05, see
+                        # https://github.com/skypilot-org/skypilot/issues/5438
+                        # for more details
+                        r = requests.get(url=url_fallback, headers=headers)
+                    r.raise_for_status()
+                except requests.exceptions.RequestException as e:
+                    error_str = (f'Failed to fetch {cloud} catalog '
+                                 f'{filename}. ')
+                    if os.path.exists(catalog_path):
+                        logger.warning(
+                            f'{error_str}Using cached catalog files.')
+                        # Update catalog file modification time.
+                        os.utime(catalog_path, None)  # Sets to current time
                     else:
+                        logger.error(f'{error_str}Please check your internet '
+                                     'connection.')
+                        with ux_utils.print_exception_no_traceback():
+                            raise e
+                else:
+                    # Download successful, save the catalog to a local file.
+                    os.makedirs(os.path.dirname(catalog_path), exist_ok=True)
+                    with open(catalog_path, 'w', encoding='utf-8') as f:
+                        f.write(r.text)
+                    with open(meta_path + '.md5', 'w', encoding='utf-8') as f:
+                        f.write(hashlib.md5(r.text.encode()).hexdigest())
+                    logger.debug(f'Updated {cloud} catalog {filename}.')
+                return True
+
+    return LazyDataFrame(catalog_path, update_if_stale_func=_update_catalog)
 
 
 def _get_instance_type(
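The catalog/common.py change above replaces an unconditional update hook with an update_if_stale_func callback that refreshes the on-disk catalog only when needed and reports whether it did, so the DataFrame is re-read only after an actual refresh (or on first access). Below is a stripped-down sketch of the same lazy-load-with-refresh pattern; class and helper names are illustrative, not the module's internals.

import os
import time
from typing import Callable, Optional

import pandas as pd


class LazyCSV:
    """Load a CSV only on first use; re-read it when the refresh callback
    reports that the file on disk was just updated."""

    def __init__(self, filename: str,
                 update_if_stale_func: Callable[[], bool]):
        self._filename = filename
        self._df: Optional[pd.DataFrame] = None
        self._update_if_stale_func = update_if_stale_func

    def load(self) -> pd.DataFrame:
        # The callback returns True only when it actually rewrote the file,
        # which (besides first access) is the only case forcing a re-read.
        if self._update_if_stale_func() or self._df is None:
            self._df = pd.read_csv(self._filename)
        return self._df


def make_refresher(path: str, max_age_s: float) -> Callable[[], bool]:
    """Build a callback that rewrites `path` when it is older than max_age_s."""

    def refresh() -> bool:
        if os.path.exists(path) and (time.time() -
                                     os.path.getmtime(path)) < max_age_s:
            return False  # Fresh enough; nothing to do.
        # A real implementation would download the new catalog here and
        # fall back to a mirror on HTTP 429, as the diff above does.
        with open(path, 'w', encoding='utf-8') as f:
            f.write('name,value\nplaceholder,0\n')
        return True

    return refresh

A caller would construct LazyCSV('catalog.csv', make_refresher('catalog.csv', 3600)) and call load() freely; the file is rewritten at most once per staleness window, and stale reads are avoided because the refresh decision and the re-read share the same boolean signal.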
sky/catalog/gcp_catalog.py
CHANGED
@@ -37,20 +37,37 @@ _image_df = common.read_catalog('gcp/images.csv',
 _quotas_df = common.read_catalog('gcp/accelerator_quota_mapping.csv',
                                  pull_frequency_hours=_PULL_FREQUENCY_HOURS)
 
-# We will select from the following
+# We will select from the following six CPU instance families:
 _DEFAULT_INSTANCE_FAMILY = [
-    # This is
-    # CPU: Intel Ice Lake
+    # This is a widely used general-purpose instance family as of July 2025.
+    # CPU: Primarily Intel Ice Lake (3rd Gen Intel Xeon Scalable Processors)
+    # or Cascade Lake (2nd Gen Intel Xeon Scalable Processors).
     # Memory: 4 GiB RAM per 1 vCPU;
     'n2-standard',
-    # This is
-    # CPU: Intel Ice Lake
+    # This is a memory-optimized instance family as of July 2025.
+    # CPU: Primarily Intel Ice Lake (3rd Gen Intel Xeon Scalable Processors)
+    # or Cascade Lake (2nd Gen Intel Xeon Scalable Processors).
     # Memory: 8 GiB RAM per 1 vCPU;
     'n2-highmem',
-    # This is
-    # CPU: Intel Ice Lake
+    # This is a compute-optimized instance family as of July 2025.
+    # CPU: Primarily Intel Ice Lake (3rd Gen Intel Xeon Scalable Processors)
+    # or Cascade Lake (2nd Gen Intel Xeon Scalable Processors).
     # Memory: 1 GiB RAM per 1 vCPU;
     'n2-highcpu',
+    # This is the latest general-purpose instance family as of July 2025.
+    # CPU: Intel 5th Gen Xeon Scalable processor (Emerald Rapids).
+    # Memory: 4 GiB RAM per 1 vCPU;
+    'n4-standard',
+    # This is the latest general-purpose instance family
+    # with a higher vCPU to memory ratio as of July 2025.
+    # CPU: Intel 5th Gen Xeon Scalable processor (Emerald Rapids).
+    # Memory: 2 GiB RAM per 1 vCPU;
+    'n4-highcpu',
+    # This is the latest general-purpose instance family
+    # with a lower vCPU to memory ratio as of July 2025.
+    # CPU: Intel 5th Gen Xeon Scalable processor (Emerald Rapids).
+    # Memory: 8 GiB RAM per 1 vCPU;
+    'n4-highmem',
 ]
 # n2 is not allowed for launching GPUs for now.
 _DEFAULT_HOST_VM_FAMILY = (
sky/catalog/kubernetes_catalog.py
CHANGED
@@ -195,6 +195,10 @@ def _list_accelerators(
             accelerator_name = lf.get_accelerator_from_label_value(
                 node.metadata.labels.get(key))
 
+            # Heterogenous cluster may have some nodes with empty labels.
+            if not accelerator_name:
+                continue
+
             # Exclude multi-host TPUs from being processed.
             # TODO(Doyoung): Remove the logic when adding support for
             # multi-host TPUs.
@@ -212,7 +216,7 @@ def _list_accelerators(
                 kubernetes_utils.get_node_accelerator_count(
                     node.status.allocatable))
 
-            if
+            if accelerator_count > 0:
                 # TPUs are counted in a different way compared to GPUs.
                 # Multi-node GPUs can be split into smaller units and be
                 # provisioned, but TPUs are considered as an atomic unit.
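The kubernetes_catalog.py hunks above make the accelerator listing tolerant of heterogeneous clusters: nodes whose accelerator label resolves to an empty value are skipped, and only positive accelerator counts are aggregated. A small sketch of that filtering over plain dict records follows; the record shape is a hypothetical stand-in for the Kubernetes node objects, not the client API.

from typing import Dict, List, Optional


def count_accelerators(nodes: List[Dict[str, Optional[str]]]) -> Dict[str, int]:
    """Aggregate accelerator counts, tolerating heterogeneous clusters
    where some nodes carry no (or empty) accelerator labels."""
    totals: Dict[str, int] = {}
    for node in nodes:
        name = node.get('accelerator_name')
        if not name:
            # Heterogeneous cluster: CPU-only nodes have empty labels.
            continue
        count = int(node.get('accelerator_count') or 0)
        if count > 0:
            totals[name] = totals.get(name, 0) + count
    return totals


# Example: one GPU node, one CPU-only node, one node with a zero count.
print(count_accelerators([
    {'accelerator_name': 'H100', 'accelerator_count': '8'},
    {'accelerator_name': '', 'accelerator_count': None},
    {'accelerator_name': 'tpu-v5e', 'accelerator_count': '0'},
]))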
|