skypilot-nightly 1.0.0.dev20250919__py3-none-any.whl → 1.0.0.dev20250925__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of skypilot-nightly might be problematic. Click here for more details.
- sky/__init__.py +2 -2
- sky/backends/backend.py +10 -0
- sky/backends/backend_utils.py +200 -78
- sky/backends/cloud_vm_ray_backend.py +37 -13
- sky/backends/local_docker_backend.py +9 -0
- sky/client/cli/command.py +104 -53
- sky/client/sdk.py +13 -5
- sky/client/sdk_async.py +4 -2
- sky/clouds/kubernetes.py +2 -1
- sky/clouds/runpod.py +20 -7
- sky/core.py +7 -53
- sky/dashboard/out/404.html +1 -1
- sky/dashboard/out/_next/static/{VvaUqYDvHOcHZRnvMBmax → bn-NHt5qTzeTN2PefXuDA}/_buildManifest.js +1 -1
- sky/dashboard/out/_next/static/chunks/1121-b911fc0a0b4742f0.js +1 -0
- sky/dashboard/out/_next/static/chunks/6856-2b3600ff2854d066.js +1 -0
- sky/dashboard/out/_next/static/chunks/8969-d8bc3a2b9cf839a9.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-2cb9b15e09cda628.js +16 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters/{[cluster]-9525660179df3605.js → [cluster]-e052384df65ef200.js} +1 -1
- sky/dashboard/out/_next/static/chunks/{webpack-b2a3938c22b6647b.js → webpack-16ba1d7187d2e3b1.js} +1 -1
- sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
- sky/dashboard/out/clusters/[cluster].html +1 -1
- sky/dashboard/out/clusters.html +1 -1
- sky/dashboard/out/config.html +1 -1
- sky/dashboard/out/index.html +1 -1
- sky/dashboard/out/infra/[context].html +1 -1
- sky/dashboard/out/infra.html +1 -1
- sky/dashboard/out/jobs/[job].html +1 -1
- sky/dashboard/out/jobs/pools/[pool].html +1 -1
- sky/dashboard/out/jobs.html +1 -1
- sky/dashboard/out/users.html +1 -1
- sky/dashboard/out/volumes.html +1 -1
- sky/dashboard/out/workspace/new.html +1 -1
- sky/dashboard/out/workspaces/[name].html +1 -1
- sky/dashboard/out/workspaces.html +1 -1
- sky/data/mounting_utils.py +19 -10
- sky/execution.py +4 -2
- sky/global_user_state.py +224 -38
- sky/jobs/client/sdk.py +10 -1
- sky/jobs/controller.py +7 -7
- sky/jobs/server/core.py +3 -3
- sky/jobs/server/server.py +15 -11
- sky/jobs/utils.py +1 -1
- sky/logs/agent.py +30 -3
- sky/logs/aws.py +9 -19
- sky/provision/__init__.py +2 -1
- sky/provision/aws/instance.py +2 -1
- sky/provision/azure/instance.py +2 -1
- sky/provision/cudo/instance.py +2 -2
- sky/provision/do/instance.py +2 -2
- sky/provision/docker_utils.py +41 -19
- sky/provision/fluidstack/instance.py +2 -2
- sky/provision/gcp/instance.py +2 -1
- sky/provision/hyperbolic/instance.py +2 -1
- sky/provision/instance_setup.py +1 -1
- sky/provision/kubernetes/instance.py +134 -8
- sky/provision/lambda_cloud/instance.py +2 -1
- sky/provision/nebius/instance.py +2 -1
- sky/provision/oci/instance.py +2 -1
- sky/provision/paperspace/instance.py +2 -2
- sky/provision/primeintellect/instance.py +2 -2
- sky/provision/provisioner.py +1 -0
- sky/provision/runpod/instance.py +2 -2
- sky/provision/scp/instance.py +2 -2
- sky/provision/seeweb/instance.py +2 -1
- sky/provision/vast/instance.py +2 -1
- sky/provision/vsphere/instance.py +6 -5
- sky/schemas/api/responses.py +2 -1
- sky/serve/autoscalers.py +2 -0
- sky/serve/client/impl.py +45 -19
- sky/serve/replica_managers.py +12 -5
- sky/serve/serve_utils.py +5 -11
- sky/serve/server/core.py +9 -6
- sky/serve/server/impl.py +78 -25
- sky/serve/server/server.py +4 -5
- sky/serve/service_spec.py +33 -0
- sky/server/auth/oauth2_proxy.py +2 -2
- sky/server/constants.py +1 -1
- sky/server/daemons.py +2 -3
- sky/server/requests/executor.py +56 -6
- sky/server/requests/payloads.py +31 -8
- sky/server/requests/preconditions.py +2 -3
- sky/server/rest.py +2 -0
- sky/server/server.py +28 -19
- sky/server/stream_utils.py +34 -12
- sky/setup_files/dependencies.py +12 -2
- sky/setup_files/setup.py +44 -44
- sky/skylet/constants.py +2 -3
- sky/templates/kubernetes-ray.yml.j2 +16 -15
- sky/usage/usage_lib.py +3 -0
- sky/utils/cli_utils/status_utils.py +4 -5
- sky/utils/context.py +104 -29
- sky/utils/controller_utils.py +7 -6
- sky/utils/kubernetes/create_cluster.sh +13 -28
- sky/utils/kubernetes/delete_cluster.sh +10 -7
- sky/utils/kubernetes/generate_kind_config.py +6 -66
- sky/utils/kubernetes/kubernetes_deploy_utils.py +170 -37
- sky/utils/kubernetes_enums.py +5 -0
- sky/utils/ux_utils.py +35 -1
- sky/utils/yaml_utils.py +9 -0
- sky/volumes/client/sdk.py +44 -8
- sky/volumes/server/server.py +33 -7
- sky/volumes/volume.py +22 -14
- {skypilot_nightly-1.0.0.dev20250919.dist-info → skypilot_nightly-1.0.0.dev20250925.dist-info}/METADATA +38 -33
- {skypilot_nightly-1.0.0.dev20250919.dist-info → skypilot_nightly-1.0.0.dev20250925.dist-info}/RECORD +109 -109
- sky/dashboard/out/_next/static/chunks/1121-4ff1ec0dbc5792ab.js +0 -1
- sky/dashboard/out/_next/static/chunks/6856-9a2538f38c004652.js +0 -1
- sky/dashboard/out/_next/static/chunks/8969-a39efbadcd9fde80.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-1e9248ddbddcd122.js +0 -16
- /sky/dashboard/out/_next/static/{VvaUqYDvHOcHZRnvMBmax → bn-NHt5qTzeTN2PefXuDA}/_ssgManifest.js +0 -0
- {skypilot_nightly-1.0.0.dev20250919.dist-info → skypilot_nightly-1.0.0.dev20250925.dist-info}/WHEEL +0 -0
- {skypilot_nightly-1.0.0.dev20250919.dist-info → skypilot_nightly-1.0.0.dev20250925.dist-info}/entry_points.txt +0 -0
- {skypilot_nightly-1.0.0.dev20250919.dist-info → skypilot_nightly-1.0.0.dev20250925.dist-info}/licenses/LICENSE +0 -0
- {skypilot_nightly-1.0.0.dev20250919.dist-info → skypilot_nightly-1.0.0.dev20250925.dist-info}/top_level.txt +0 -0
sky/server/requests/payloads.py
CHANGED
|
@@ -316,6 +316,9 @@ class StatusBody(RequestBody):
|
|
|
316
316
|
all_users: bool = True
|
|
317
317
|
# TODO (kyuds): default to False post 0.10.5
|
|
318
318
|
include_credentials: bool = True
|
|
319
|
+
# Only return fields that are needed for the
|
|
320
|
+
# dashboard / CLI summary response
|
|
321
|
+
summary_response: bool = False
|
|
319
322
|
|
|
320
323
|
|
|
321
324
|
class StartBody(RequestBody):
|
|
@@ -475,6 +478,17 @@ class VolumeListBody(RequestBody):
|
|
|
475
478
|
pass
|
|
476
479
|
|
|
477
480
|
|
|
481
|
+
class VolumeValidateBody(RequestBody):
|
|
482
|
+
"""The request body for the volume validate endpoint."""
|
|
483
|
+
name: Optional[str] = None
|
|
484
|
+
volume_type: Optional[str] = None
|
|
485
|
+
infra: Optional[str] = None
|
|
486
|
+
size: Optional[str] = None
|
|
487
|
+
labels: Optional[Dict[str, str]] = None
|
|
488
|
+
resource_name: Optional[str] = None
|
|
489
|
+
config: Optional[Dict[str, Any]] = None
|
|
490
|
+
|
|
491
|
+
|
|
478
492
|
class EndpointsBody(RequestBody):
|
|
479
493
|
"""The request body for the endpoint."""
|
|
480
494
|
cluster: str
|
|
@@ -669,9 +683,15 @@ class LocalUpBody(RequestBody):
|
|
|
669
683
|
ssh_key: Optional[str] = None
|
|
670
684
|
cleanup: bool = False
|
|
671
685
|
context_name: Optional[str] = None
|
|
686
|
+
name: Optional[str] = None
|
|
672
687
|
password: Optional[str] = None
|
|
673
688
|
|
|
674
689
|
|
|
690
|
+
class LocalDownBody(RequestBody):
|
|
691
|
+
"""The request body for the local down endpoint."""
|
|
692
|
+
name: Optional[str] = None
|
|
693
|
+
|
|
694
|
+
|
|
675
695
|
class SSHUpBody(RequestBody):
|
|
676
696
|
"""The request body for the SSH up/down endpoints."""
|
|
677
697
|
infra: Optional[str] = None
|
|
@@ -709,19 +729,22 @@ class JobsDownloadLogsBody(RequestBody):
|
|
|
709
729
|
|
|
710
730
|
class JobsPoolApplyBody(RequestBody):
|
|
711
731
|
"""The request body for the jobs pool apply endpoint."""
|
|
712
|
-
task: str
|
|
732
|
+
task: Optional[str] = None
|
|
733
|
+
workers: Optional[int] = None
|
|
713
734
|
pool_name: str
|
|
714
735
|
mode: serve.UpdateMode
|
|
715
736
|
|
|
716
737
|
def to_kwargs(self) -> Dict[str, Any]:
|
|
717
738
|
kwargs = super().to_kwargs()
|
|
718
|
-
|
|
719
|
-
|
|
720
|
-
|
|
721
|
-
|
|
722
|
-
|
|
723
|
-
|
|
724
|
-
|
|
739
|
+
if self.task is not None:
|
|
740
|
+
dag = common.process_mounts_in_task_on_api_server(
|
|
741
|
+
self.task, self.env_vars, workdir_only=False)
|
|
742
|
+
assert len(
|
|
743
|
+
dag.tasks) == 1, ('Must only specify one task in the DAG for '
|
|
744
|
+
'a pool.', dag)
|
|
745
|
+
kwargs['task'] = dag.tasks[0]
|
|
746
|
+
else:
|
|
747
|
+
kwargs['task'] = None
|
|
725
748
|
return kwargs
|
|
726
749
|
|
|
727
750
|
|
|
@@ -146,10 +146,9 @@ class ClusterStartCompletePrecondition(Precondition):
|
|
|
146
146
|
self.cluster_name = cluster_name
|
|
147
147
|
|
|
148
148
|
async def check(self) -> Tuple[bool, Optional[str]]:
|
|
149
|
-
|
|
149
|
+
cluster_status = global_user_state.get_status_from_cluster_name(
|
|
150
150
|
self.cluster_name)
|
|
151
|
-
if
|
|
152
|
-
cluster_record['status'] is status_lib.ClusterStatus.UP):
|
|
151
|
+
if cluster_status is status_lib.ClusterStatus.UP:
|
|
153
152
|
# Shortcut for started clusters, ignore cluster not found
|
|
154
153
|
# since the cluster record might not yet be created by the
|
|
155
154
|
# launch task.
|
sky/server/rest.py
CHANGED
|
@@ -9,6 +9,7 @@ import typing
|
|
|
9
9
|
from typing import Any, Callable, cast, Optional, TypeVar
|
|
10
10
|
|
|
11
11
|
import colorama
|
|
12
|
+
import urllib3.exceptions
|
|
12
13
|
|
|
13
14
|
from sky import exceptions
|
|
14
15
|
from sky import sky_logging
|
|
@@ -53,6 +54,7 @@ _session.headers[constants.VERSION_HEADER] = (
|
|
|
53
54
|
_transient_errors = [
|
|
54
55
|
requests.exceptions.RequestException,
|
|
55
56
|
ConnectionError,
|
|
57
|
+
urllib3.exceptions.HTTPError,
|
|
56
58
|
]
|
|
57
59
|
|
|
58
60
|
|
sky/server/server.py
CHANGED
|
@@ -445,6 +445,22 @@ async def loop_lag_monitor(loop: asyncio.AbstractEventLoop,
|
|
|
445
445
|
loop.call_at(target, tick)
|
|
446
446
|
|
|
447
447
|
|
|
448
|
+
def schedule_on_boot_check():
|
|
449
|
+
try:
|
|
450
|
+
executor.schedule_request(
|
|
451
|
+
request_id='skypilot-server-on-boot-check',
|
|
452
|
+
request_name='check',
|
|
453
|
+
request_body=payloads.CheckBody(),
|
|
454
|
+
func=sky_check.check,
|
|
455
|
+
schedule_type=requests_lib.ScheduleType.SHORT,
|
|
456
|
+
is_skypilot_system=True,
|
|
457
|
+
)
|
|
458
|
+
except exceptions.RequestAlreadyExistsError:
|
|
459
|
+
# Lifespan will be executed in each uvicorn worker process, we
|
|
460
|
+
# can safely ignore the error if the task is already scheduled.
|
|
461
|
+
logger.debug('Request skypilot-server-on-boot-check already exists.')
|
|
462
|
+
|
|
463
|
+
|
|
448
464
|
@contextlib.asynccontextmanager
|
|
449
465
|
async def lifespan(app: fastapi.FastAPI): # pylint: disable=redefined-outer-name
|
|
450
466
|
"""FastAPI lifespan context manager."""
|
|
@@ -469,6 +485,7 @@ async def lifespan(app: fastapi.FastAPI): # pylint: disable=redefined-outer-nam
|
|
|
469
485
|
# Lifespan will be executed in each uvicorn worker process, we
|
|
470
486
|
# can safely ignore the error if the task is already scheduled.
|
|
471
487
|
logger.debug(f'Request {event.id} already exists.')
|
|
488
|
+
schedule_on_boot_check()
|
|
472
489
|
asyncio.create_task(cleanup_upload_ids())
|
|
473
490
|
if metrics_utils.METRICS_ENABLED:
|
|
474
491
|
# Start monitoring the event loop lag in each server worker
|
|
@@ -1216,19 +1233,8 @@ async def logs(
|
|
|
1216
1233
|
schedule_type=requests_lib.ScheduleType.SHORT,
|
|
1217
1234
|
request_cluster_name=cluster_job_body.cluster_name,
|
|
1218
1235
|
)
|
|
1219
|
-
task =
|
|
1220
|
-
|
|
1221
|
-
async def cancel_task():
|
|
1222
|
-
try:
|
|
1223
|
-
logger.info('Client disconnected for request: '
|
|
1224
|
-
f'{request.state.request_id}')
|
|
1225
|
-
task.cancel()
|
|
1226
|
-
await task
|
|
1227
|
-
except asyncio.CancelledError:
|
|
1228
|
-
pass
|
|
1229
|
-
|
|
1230
|
-
# Cancel the task after the request is done or client disconnects
|
|
1231
|
-
background_tasks.add_task(cancel_task)
|
|
1236
|
+
task = executor.execute_request_in_coroutine(request_task)
|
|
1237
|
+
background_tasks.add_task(task.cancel)
|
|
1232
1238
|
# TODO(zhwu): This makes viewing logs in browser impossible. We should adopt
|
|
1233
1239
|
# the same approach as /stream.
|
|
1234
1240
|
return stream_utils.stream_response(
|
|
@@ -1354,10 +1360,12 @@ def provision_logs(cluster_body: payloads.ClusterNameBody,
|
|
|
1354
1360
|
effective_tail = None if tail is None or tail <= 0 else tail
|
|
1355
1361
|
|
|
1356
1362
|
return fastapi.responses.StreamingResponse(
|
|
1357
|
-
content=stream_utils.log_streamer(
|
|
1358
|
-
|
|
1359
|
-
|
|
1360
|
-
|
|
1363
|
+
content=stream_utils.log_streamer(
|
|
1364
|
+
None,
|
|
1365
|
+
log_path,
|
|
1366
|
+
tail=effective_tail,
|
|
1367
|
+
follow=follow,
|
|
1368
|
+
cluster_name=cluster_body.cluster_name),
|
|
1361
1369
|
media_type='text/plain',
|
|
1362
1370
|
headers={
|
|
1363
1371
|
'Cache-Control': 'no-cache, no-transform',
|
|
@@ -1419,12 +1427,13 @@ async def local_up(request: fastapi.Request,
|
|
|
1419
1427
|
|
|
1420
1428
|
|
|
1421
1429
|
@app.post('/local_down')
|
|
1422
|
-
async def local_down(request: fastapi.Request
|
|
1430
|
+
async def local_down(request: fastapi.Request,
|
|
1431
|
+
local_down_body: payloads.LocalDownBody) -> None:
|
|
1423
1432
|
"""Tears down the Kubernetes cluster started by local_up."""
|
|
1424
1433
|
executor.schedule_request(
|
|
1425
1434
|
request_id=request.state.request_id,
|
|
1426
1435
|
request_name='local_down',
|
|
1427
|
-
request_body=
|
|
1436
|
+
request_body=local_down_body,
|
|
1428
1437
|
func=core.local_down,
|
|
1429
1438
|
schedule_type=requests_lib.ScheduleType.LONG,
|
|
1430
1439
|
)
|
sky/server/stream_utils.py
CHANGED
|
@@ -8,10 +8,12 @@ from typing import AsyncGenerator, Deque, List, Optional
|
|
|
8
8
|
import aiofiles
|
|
9
9
|
import fastapi
|
|
10
10
|
|
|
11
|
+
from sky import global_user_state
|
|
11
12
|
from sky import sky_logging
|
|
12
13
|
from sky.server.requests import requests as requests_lib
|
|
13
14
|
from sky.utils import message_utils
|
|
14
15
|
from sky.utils import rich_utils
|
|
16
|
+
from sky.utils import status_lib
|
|
15
17
|
|
|
16
18
|
logger = sky_logging.init_logger(__name__)
|
|
17
19
|
|
|
@@ -22,6 +24,7 @@ logger = sky_logging.init_logger(__name__)
|
|
|
22
24
|
_BUFFER_SIZE = 8 * 1024 # 8KB
|
|
23
25
|
_BUFFER_TIMEOUT = 0.02 # 20ms
|
|
24
26
|
_HEARTBEAT_INTERVAL = 30
|
|
27
|
+
_CLUSTER_STATUS_INTERVAL = 1
|
|
25
28
|
|
|
26
29
|
|
|
27
30
|
async def _yield_log_file_with_payloads_skipped(
|
|
@@ -37,11 +40,13 @@ async def _yield_log_file_with_payloads_skipped(
|
|
|
37
40
|
yield line_str
|
|
38
41
|
|
|
39
42
|
|
|
40
|
-
async def log_streamer(
|
|
41
|
-
|
|
42
|
-
|
|
43
|
-
|
|
44
|
-
|
|
43
|
+
async def log_streamer(
|
|
44
|
+
request_id: Optional[str],
|
|
45
|
+
log_path: pathlib.Path,
|
|
46
|
+
plain_logs: bool = False,
|
|
47
|
+
tail: Optional[int] = None,
|
|
48
|
+
follow: bool = True,
|
|
49
|
+
cluster_name: Optional[str] = None) -> AsyncGenerator[str, None]:
|
|
45
50
|
"""Streams the logs of a request.
|
|
46
51
|
|
|
47
52
|
Args:
|
|
@@ -51,6 +56,8 @@ async def log_streamer(request_id: Optional[str],
|
|
|
51
56
|
plain_logs: Whether to show plain logs.
|
|
52
57
|
tail: The number of lines to tail. If None, tail the whole file.
|
|
53
58
|
follow: Whether to follow the log file.
|
|
59
|
+
cluster_name: The cluster name to check status for provision logs.
|
|
60
|
+
If provided, streaming terminates once the cluster is no longer in INIT status (or has no status record).
|
|
54
61
|
"""
|
|
55
62
|
|
|
56
63
|
if request_id is not None:
|
|
@@ -104,15 +111,17 @@ async def log_streamer(request_id: Optional[str],
|
|
|
104
111
|
|
|
105
112
|
async with aiofiles.open(log_path, 'rb') as f:
|
|
106
113
|
async for chunk in _tail_log_file(f, request_id, plain_logs, tail,
|
|
107
|
-
follow):
|
|
114
|
+
follow, cluster_name):
|
|
108
115
|
yield chunk
|
|
109
116
|
|
|
110
117
|
|
|
111
|
-
async def _tail_log_file(
|
|
112
|
-
|
|
113
|
-
|
|
114
|
-
|
|
115
|
-
|
|
118
|
+
async def _tail_log_file(
|
|
119
|
+
f: aiofiles.threadpool.binary.AsyncBufferedReader,
|
|
120
|
+
request_id: Optional[str] = None,
|
|
121
|
+
plain_logs: bool = False,
|
|
122
|
+
tail: Optional[int] = None,
|
|
123
|
+
follow: bool = True,
|
|
124
|
+
cluster_name: Optional[str] = None) -> AsyncGenerator[str, None]:
|
|
116
125
|
"""Tail the opened log file, buffer the lines and flush in chunks."""
|
|
117
126
|
|
|
118
127
|
if tail is not None:
|
|
@@ -128,6 +137,7 @@ async def _tail_log_file(f: aiofiles.threadpool.binary.AsyncBufferedReader,
|
|
|
128
137
|
yield line_str
|
|
129
138
|
|
|
130
139
|
last_heartbeat_time = asyncio.get_event_loop().time()
|
|
140
|
+
last_cluster_status_check_time = asyncio.get_event_loop().time()
|
|
131
141
|
|
|
132
142
|
# Buffer the lines in memory and flush them in chunks to improve log
|
|
133
143
|
# tailing throughput.
|
|
@@ -176,7 +186,19 @@ async def _tail_log_file(f: aiofiles.threadpool.binary.AsyncBufferedReader,
|
|
|
176
186
|
break
|
|
177
187
|
if not follow:
|
|
178
188
|
break
|
|
179
|
-
|
|
189
|
+
# Provision logs pass in cluster_name, check cluster status
|
|
190
|
+
# periodically to see if provisioning is done. We only
|
|
191
|
+
# check once a second to avoid overloading the DB.
|
|
192
|
+
check_status = (current_time - last_cluster_status_check_time
|
|
193
|
+
) >= _CLUSTER_STATUS_INTERVAL
|
|
194
|
+
if cluster_name is not None and check_status:
|
|
195
|
+
cluster_record = await (
|
|
196
|
+
global_user_state.get_status_from_cluster_name_async(
|
|
197
|
+
cluster_name))
|
|
198
|
+
if (cluster_record is None or
|
|
199
|
+
cluster_record != status_lib.ClusterStatus.INIT):
|
|
200
|
+
break
|
|
201
|
+
last_cluster_status_check_time = current_time
|
|
180
202
|
if current_time - last_heartbeat_time >= _HEARTBEAT_INTERVAL:
|
|
181
203
|
# Currently just used to keep the connection busy, refer to
|
|
182
204
|
# https://github.com/skypilot-org/skypilot/issues/5750 for
|
sky/setup_files/dependencies.py
CHANGED
|
@@ -49,8 +49,15 @@ install_requires = [
|
|
|
49
49
|
# <= 3.13 may encounter https://github.com/ultralytics/yolov5/issues/414
|
|
50
50
|
'pyyaml > 3.13, != 5.4.*',
|
|
51
51
|
'requests',
|
|
52
|
+
# SkyPilot inherits from uvicorn.Server to customize the behavior of
|
|
53
|
+
# uvicorn, so we need to pin uvicorn version to avoid potential breaking
|
|
54
|
+
# changes.
|
|
55
|
+
# Notes for current version check:
|
|
56
|
+
# - uvicorn 0.33.0 is the latest version that supports Python 3.8
|
|
57
|
+
# - uvicorn 0.36.0 removes setup_event_loop and thus breaks SkyPilot's custom
|
|
58
|
+
# behavior.
|
|
59
|
+
'uvicorn[standard] >=0.33.0, <0.36.0',
|
|
52
60
|
'fastapi',
|
|
53
|
-
'uvicorn[standard]',
|
|
54
61
|
# Some pydantic versions are not compatible with ray. Adopted from ray's
|
|
55
62
|
# setup.py:
|
|
56
63
|
# https://github.com/ray-project/ray/blob/ray-2.9.3/python/setup.py#L254
|
|
@@ -105,6 +112,7 @@ server_dependencies = [
|
|
|
105
112
|
GRPC,
|
|
106
113
|
PROTOBUF,
|
|
107
114
|
'aiosqlite',
|
|
115
|
+
'greenlet',
|
|
108
116
|
]
|
|
109
117
|
|
|
110
118
|
local_ray = [
|
|
@@ -185,7 +193,9 @@ extras_require: Dict[str, List[str]] = {
|
|
|
185
193
|
'remote': remote,
|
|
186
194
|
# For the container registry auth api. Reference:
|
|
187
195
|
# https://github.com/runpod/runpod-python/releases/tag/1.6.1
|
|
188
|
-
|
|
196
|
+
# RunPod needs a TOML parser to read ~/.runpod/config.toml. On Python 3.11+
|
|
197
|
+
# stdlib provides tomllib; on lower versions we depend on tomli explicitly.
|
|
198
|
+
'runpod': ['runpod>=1.6.1', 'tomli; python_version < "3.11"'],
|
|
189
199
|
'fluidstack': [], # No dependencies needed for fluidstack
|
|
190
200
|
'cudo': ['cudo-compute>=0.1.10'],
|
|
191
201
|
'paperspace': [], # No dependencies needed for paperspace
|
sky/setup_files/setup.py
CHANGED
|
@@ -148,47 +148,47 @@ if os.path.exists(readme_filepath):
|
|
|
148
148
|
long_description = io.open(readme_filepath, 'r', encoding='utf-8').read()
|
|
149
149
|
long_description = parse_readme(long_description)
|
|
150
150
|
|
|
151
|
-
|
|
152
|
-
|
|
153
|
-
|
|
154
|
-
setuptools.setup(
|
|
155
|
-
|
|
156
|
-
|
|
157
|
-
|
|
158
|
-
|
|
159
|
-
|
|
160
|
-
|
|
161
|
-
|
|
162
|
-
|
|
163
|
-
|
|
164
|
-
|
|
165
|
-
|
|
166
|
-
|
|
167
|
-
|
|
168
|
-
|
|
169
|
-
|
|
170
|
-
|
|
171
|
-
|
|
172
|
-
|
|
173
|
-
|
|
174
|
-
|
|
175
|
-
|
|
176
|
-
|
|
177
|
-
|
|
178
|
-
|
|
179
|
-
|
|
180
|
-
|
|
181
|
-
|
|
182
|
-
|
|
183
|
-
|
|
184
|
-
|
|
185
|
-
|
|
186
|
-
|
|
187
|
-
|
|
188
|
-
|
|
189
|
-
|
|
190
|
-
|
|
191
|
-
|
|
192
|
-
|
|
193
|
-
|
|
194
|
-
)
|
|
151
|
+
if __name__ == '__main__':
|
|
152
|
+
atexit.register(revert_commit_hash)
|
|
153
|
+
replace_commit_hash()
|
|
154
|
+
setuptools.setup(
|
|
155
|
+
# NOTE: this affects the package.whl wheel name. When changing this (if
|
|
156
|
+
# ever), you must grep for '.whl' and change all corresponding wheel paths
|
|
157
|
+
# (templates/*.j2 and wheel_utils.py).
|
|
158
|
+
name='skypilot-nightly',
|
|
159
|
+
version=find_version(),
|
|
160
|
+
packages=setuptools.find_packages(),
|
|
161
|
+
author='SkyPilot Team',
|
|
162
|
+
license='Apache 2.0',
|
|
163
|
+
readme='README.md',
|
|
164
|
+
description='SkyPilot: Run AI on Any Infra — Unified, Faster, Cheaper.',
|
|
165
|
+
long_description=long_description,
|
|
166
|
+
long_description_content_type='text/markdown',
|
|
167
|
+
setup_requires=['wheel'],
|
|
168
|
+
requires_python='>=3.7',
|
|
169
|
+
install_requires=dependencies['install_requires'],
|
|
170
|
+
extras_require=dependencies['extras_require'],
|
|
171
|
+
entry_points={
|
|
172
|
+
'console_scripts': ['sky = sky.cli:cli'],
|
|
173
|
+
},
|
|
174
|
+
include_package_data=True,
|
|
175
|
+
classifiers=[
|
|
176
|
+
'Programming Language :: Python :: 3.7',
|
|
177
|
+
'Programming Language :: Python :: 3.8',
|
|
178
|
+
'Programming Language :: Python :: 3.9',
|
|
179
|
+
'Programming Language :: Python :: 3.10',
|
|
180
|
+
'Programming Language :: Python :: 3.11',
|
|
181
|
+
'Programming Language :: Python :: 3.12',
|
|
182
|
+
'Programming Language :: Python :: 3.13',
|
|
183
|
+
'License :: OSI Approved :: Apache Software License',
|
|
184
|
+
'Operating System :: OS Independent',
|
|
185
|
+
'Topic :: Software Development :: Libraries :: Python Modules',
|
|
186
|
+
'Topic :: System :: Distributed Computing',
|
|
187
|
+
],
|
|
188
|
+
project_urls={
|
|
189
|
+
'Homepage': 'https://github.com/skypilot-org/skypilot',
|
|
190
|
+
'Issues': 'https://github.com/skypilot-org/skypilot/issues',
|
|
191
|
+
'Discussion': 'https://github.com/skypilot-org/skypilot/discussions',
|
|
192
|
+
'Documentation': 'https://docs.skypilot.co/',
|
|
193
|
+
},
|
|
194
|
+
)
|
sky/skylet/constants.py
CHANGED
|
@@ -64,9 +64,8 @@ SKY_UV_INSTALL_CMD = (f'{SKY_UV_CMD} -V >/dev/null 2>&1 || '
|
|
|
64
64
|
'curl -LsSf https://astral.sh/uv/install.sh '
|
|
65
65
|
f'| UV_INSTALL_DIR={SKY_UV_INSTALL_DIR} sh')
|
|
66
66
|
SKY_UV_PIP_CMD: str = (f'VIRTUAL_ENV={SKY_REMOTE_PYTHON_ENV} {SKY_UV_CMD} pip')
|
|
67
|
-
SKY_UV_RUN_CMD: str = (
|
|
68
|
-
|
|
69
|
-
'--no-project --no-config')
|
|
67
|
+
SKY_UV_RUN_CMD: str = (f'VIRTUAL_ENV={SKY_REMOTE_PYTHON_ENV} {SKY_UV_CMD} run '
|
|
68
|
+
'--no-project --no-config')
|
|
70
69
|
# Deleting the SKY_REMOTE_PYTHON_ENV_NAME from the PATH and unsetting relevant
|
|
71
70
|
# VIRTUAL_ENV envvars to deactivate the environment. `deactivate` command does
|
|
72
71
|
# not work when conda is used.
|
|
@@ -510,6 +510,16 @@ available_node_types:
|
|
|
510
510
|
valueFrom:
|
|
511
511
|
fieldRef:
|
|
512
512
|
fieldPath: metadata.labels['ray-node-type']
|
|
513
|
+
- name: SKYPILOT_POD_CPU_CORE_LIMIT
|
|
514
|
+
valueFrom:
|
|
515
|
+
resourceFieldRef:
|
|
516
|
+
containerName: ray-node
|
|
517
|
+
resource: requests.cpu
|
|
518
|
+
- name: SKYPILOT_POD_MEMORY_BYTES_LIMIT
|
|
519
|
+
valueFrom:
|
|
520
|
+
resourceFieldRef:
|
|
521
|
+
containerName: ray-node
|
|
522
|
+
resource: requests.memory
|
|
513
523
|
{% for key, value in k8s_env_vars.items() if k8s_env_vars is not none %}
|
|
514
524
|
- name: {{ key }}
|
|
515
525
|
value: {{ value }}
|
|
@@ -630,13 +640,6 @@ available_node_types:
|
|
|
630
640
|
command: ["/bin/bash", "-c", "--"]
|
|
631
641
|
args:
|
|
632
642
|
- |
|
|
633
|
-
# For backwards compatibility, we put a marker file in the pod
|
|
634
|
-
# to indicate that the pod is running with the changes introduced
|
|
635
|
-
# in project nimbus: https://github.com/skypilot-org/skypilot/pull/4393
|
|
636
|
-
# TODO: Remove this marker file and its usage in setup_commands
|
|
637
|
-
# after v0.10.0 release.
|
|
638
|
-
touch /tmp/skypilot_is_nimbus
|
|
639
|
-
|
|
640
643
|
# Helper function to conditionally use sudo
|
|
641
644
|
# TODO(zhwu): consolidate the two prefix_cmd and sudo replacements
|
|
642
645
|
prefix_cmd() { if [ $(id -u) -ne 0 ]; then echo "sudo"; else echo ""; fi; }
|
|
@@ -1333,18 +1336,16 @@ setup_commands:
|
|
|
1333
1336
|
# Wait for SSH setup to complete before proceeding
|
|
1334
1337
|
if [ -f /tmp/apt_ssh_setup_started ]; then
|
|
1335
1338
|
echo "=== Logs for asynchronous SSH setup ===";
|
|
1336
|
-
[ -f /tmp/apt_ssh_setup_complete ] && cat /tmp/${STEPS[0]}.log ||
|
|
1337
|
-
{ tail -f -n +1 /tmp/${STEPS[0]}.log & TAIL_PID=$!; echo "Tail PID: $TAIL_PID"; until [ -f /tmp/apt_ssh_setup_complete ] || [ -f /tmp/${STEPS[0]}.failed ]; do sleep 0.5; done; kill $TAIL_PID || true; };
|
|
1339
|
+
([ -f /tmp/apt_ssh_setup_complete ]|| [ -f /tmp/${STEPS[0]}.failed ]) && cat /tmp/${STEPS[0]}.log ||
|
|
1340
|
+
{ tail -f -n +1 /tmp/${STEPS[0]}.log & TAIL_PID=$!; echo "Tail PID: $TAIL_PID"; sleep 0.5; until [ -f /tmp/apt_ssh_setup_complete ] || [ -f /tmp/${STEPS[0]}.failed ]; do sleep 0.5; done; kill $TAIL_PID || true; };
|
|
1338
1341
|
[ -f /tmp/${STEPS[0]}.failed ] && { echo "Error: ${STEPS[0]} failed. Exiting."; exit 1; } || true;
|
|
1339
1342
|
fi
|
|
1340
1343
|
|
|
1341
1344
|
echo "=== Logs for asynchronous ray and skypilot installation ===";
|
|
1342
|
-
|
|
1343
|
-
|
|
1344
|
-
|
|
1345
|
-
|
|
1346
|
-
[ -f /tmp/${STEPS[1]}.failed ] && { echo "Error: ${STEPS[1]} failed. Exiting."; exit 1; } || true;
|
|
1347
|
-
fi
|
|
1345
|
+
([ -f /tmp/ray_skypilot_installation_complete ]|| [ -f /tmp/${STEPS[1]}.failed ]) && cat /tmp/${STEPS[1]}.log ||
|
|
1346
|
+
{ tail -f -n +1 /tmp/${STEPS[1]}.log & TAIL_PID=$!; echo "Tail PID: $TAIL_PID"; sleep 0.5; until [ -f /tmp/ray_skypilot_installation_complete ] || [ -f /tmp/${STEPS[1]}.failed ]; do sleep 0.5; done; kill $TAIL_PID || true; };
|
|
1347
|
+
[ -f /tmp/${STEPS[1]}.failed ] && { echo "Error: ${STEPS[1]} failed. Exiting."; exit 1; } || true;
|
|
1348
|
+
|
|
1348
1349
|
end_epoch=$(date +%s);
|
|
1349
1350
|
echo "=== Ray and skypilot dependencies installation completed in $(($end_epoch - $start_epoch)) secs ===";
|
|
1350
1351
|
start_epoch=$(date +%s);
|
sky/usage/usage_lib.py
CHANGED
|
@@ -14,6 +14,7 @@ from typing_extensions import ParamSpec
|
|
|
14
14
|
|
|
15
15
|
import sky
|
|
16
16
|
from sky import sky_logging
|
|
17
|
+
from sky import skypilot_config
|
|
17
18
|
from sky.adaptors import common as adaptors_common
|
|
18
19
|
from sky.usage import constants
|
|
19
20
|
from sky.utils import common_utils
|
|
@@ -167,6 +168,7 @@ class UsageMessageToReport(MessageToReport):
|
|
|
167
168
|
self.runtimes: Dict[str, float] = {} # update_runtime
|
|
168
169
|
self.exception: Optional[str] = None # entrypoint_context
|
|
169
170
|
self.stacktrace: Optional[str] = None # entrypoint_context
|
|
171
|
+
self.skypilot_config: Optional[Dict[str, Any]] = None
|
|
170
172
|
|
|
171
173
|
# Whether API server is deployed remotely.
|
|
172
174
|
self.using_remote_api_server: bool = (
|
|
@@ -177,6 +179,7 @@ class UsageMessageToReport(MessageToReport):
|
|
|
177
179
|
self.client_entrypoint = common_utils.get_current_client_entrypoint(
|
|
178
180
|
msg)
|
|
179
181
|
self.entrypoint = msg
|
|
182
|
+
self.skypilot_config = dict(skypilot_config.to_dict())
|
|
180
183
|
|
|
181
184
|
def set_internal(self):
|
|
182
185
|
self.internal = True
|
|
@@ -11,6 +11,7 @@ from sky.utils import common_utils
|
|
|
11
11
|
from sky.utils import log_utils
|
|
12
12
|
from sky.utils import resources_utils
|
|
13
13
|
from sky.utils import status_lib
|
|
14
|
+
from sky.utils import ux_utils
|
|
14
15
|
|
|
15
16
|
if typing.TYPE_CHECKING:
|
|
16
17
|
from sky.provision.kubernetes import utils as kubernetes_utils
|
|
@@ -105,11 +106,9 @@ def show_status_table(cluster_records: List[responses.StatusResponse],
|
|
|
105
106
|
|
|
106
107
|
if query_clusters:
|
|
107
108
|
cluster_names = {record['name'] for record in cluster_records}
|
|
108
|
-
not_found_clusters =
|
|
109
|
-
|
|
110
|
-
|
|
111
|
-
if cluster not in cluster_names
|
|
112
|
-
]
|
|
109
|
+
not_found_clusters = ux_utils.get_non_matched_query(
|
|
110
|
+
query_clusters, cluster_names)
|
|
111
|
+
not_found_clusters = [repr(cluster) for cluster in not_found_clusters]
|
|
113
112
|
if not_found_clusters:
|
|
114
113
|
cluster_str = 'Cluster'
|
|
115
114
|
if len(not_found_clusters) > 1:
|