skypilot-nightly 1.0.0.dev20251027__py3-none-any.whl → 1.0.0.dev20251101__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of skypilot-nightly might be problematic.
- sky/__init__.py +2 -2
- sky/adaptors/aws.py +25 -7
- sky/adaptors/coreweave.py +278 -0
- sky/backends/backend_utils.py +9 -6
- sky/backends/cloud_vm_ray_backend.py +2 -3
- sky/check.py +25 -13
- sky/client/cli/command.py +52 -24
- sky/cloud_stores.py +73 -0
- sky/clouds/aws.py +59 -11
- sky/core.py +7 -5
- sky/dashboard/out/404.html +1 -1
- sky/dashboard/out/_next/static/{YP5Vc3ROcDnTGta0XAhcs → 8ixeA0NVQJN8HUdijid8b}/_buildManifest.js +1 -1
- sky/dashboard/out/_next/static/chunks/{1141-d5204f35a3388bf4.js → 1141-c3c10e2c6ed71a8f.js} +1 -1
- sky/dashboard/out/_next/static/chunks/2755.d6dc6d530fed0b61.js +26 -0
- sky/dashboard/out/_next/static/chunks/3294.87a13fba0058865b.js +1 -0
- sky/dashboard/out/_next/static/chunks/{3785.538eb23a098fc304.js → 3785.170be320e0060eaf.js} +1 -1
- sky/dashboard/out/_next/static/chunks/4282-49b2065b7336e496.js +1 -0
- sky/dashboard/out/_next/static/chunks/7615-80aa7b09f45a86d2.js +1 -0
- sky/dashboard/out/_next/static/chunks/8969-4ed9236db997b42b.js +1 -0
- sky/dashboard/out/_next/static/chunks/9360.10a3aac7aad5e3aa.js +31 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-ac4a217f17b087cb.js +16 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters/{[cluster]-fbf2907ce2bb67e2.js → [cluster]-1704039ccaf997cf.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/{jobs-0dc34cf9a8710a9f.js → jobs-7eee823559e5cf9f.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/{users-96d6b8bb2dec055f.js → users-2b172f13f8538a7a.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/workspaces/{[name]-fb1b4d3bfb047cad.js → [name]-bbfe5860c93470fd.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/{workspaces-6fc994fa1ee6c6bf.js → workspaces-1891376c08050940.js} +1 -1
- sky/dashboard/out/_next/static/chunks/{webpack-585d805f693dbceb.js → webpack-e38d5319cd10a3a0.js} +1 -1
- sky/dashboard/out/_next/static/css/0748ce22df867032.css +3 -0
- sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
- sky/dashboard/out/clusters/[cluster].html +1 -1
- sky/dashboard/out/clusters.html +1 -1
- sky/dashboard/out/config.html +1 -1
- sky/dashboard/out/index.html +1 -1
- sky/dashboard/out/infra/[context].html +1 -1
- sky/dashboard/out/infra.html +1 -1
- sky/dashboard/out/jobs/[job].html +1 -1
- sky/dashboard/out/jobs/pools/[pool].html +1 -1
- sky/dashboard/out/jobs.html +1 -1
- sky/dashboard/out/users.html +1 -1
- sky/dashboard/out/volumes.html +1 -1
- sky/dashboard/out/workspace/new.html +1 -1
- sky/dashboard/out/workspaces/[name].html +1 -1
- sky/dashboard/out/workspaces.html +1 -1
- sky/data/data_utils.py +92 -1
- sky/data/mounting_utils.py +71 -2
- sky/data/storage.py +166 -9
- sky/global_user_state.py +14 -18
- sky/jobs/constants.py +2 -0
- sky/jobs/controller.py +62 -67
- sky/jobs/file_content_utils.py +80 -0
- sky/jobs/log_gc.py +201 -0
- sky/jobs/scheduler.py +15 -2
- sky/jobs/server/core.py +85 -13
- sky/jobs/server/server.py +14 -13
- sky/jobs/server/utils.py +28 -10
- sky/jobs/state.py +216 -40
- sky/jobs/utils.py +65 -28
- sky/metrics/utils.py +18 -0
- sky/optimizer.py +1 -1
- sky/provision/kubernetes/instance.py +88 -19
- sky/provision/kubernetes/volume.py +2 -2
- sky/schemas/api/responses.py +3 -5
- sky/schemas/db/spot_jobs/004_job_file_contents.py +42 -0
- sky/schemas/db/spot_jobs/005_logs_gc.py +38 -0
- sky/schemas/generated/managed_jobsv1_pb2.py +39 -35
- sky/schemas/generated/managed_jobsv1_pb2.pyi +21 -5
- sky/serve/replica_managers.py +2 -2
- sky/serve/serve_utils.py +9 -2
- sky/serve/server/server.py +8 -7
- sky/server/common.py +21 -15
- sky/server/constants.py +1 -1
- sky/server/daemons.py +23 -17
- sky/server/requests/executor.py +7 -3
- sky/server/requests/payloads.py +2 -0
- sky/server/requests/request_names.py +80 -0
- sky/server/requests/requests.py +137 -102
- sky/server/requests/serializers/decoders.py +0 -6
- sky/server/requests/serializers/encoders.py +33 -6
- sky/server/server.py +105 -36
- sky/server/stream_utils.py +56 -13
- sky/setup_files/dependencies.py +2 -0
- sky/skylet/constants.py +6 -1
- sky/skylet/events.py +7 -0
- sky/skylet/services.py +18 -7
- sky/ssh_node_pools/server.py +5 -4
- sky/task.py +14 -42
- sky/templates/kubernetes-ray.yml.j2 +1 -1
- sky/templates/nebius-ray.yml.j2 +1 -0
- sky/templates/websocket_proxy.py +140 -12
- sky/users/permission.py +4 -1
- sky/utils/cli_utils/status_utils.py +8 -2
- sky/utils/context_utils.py +13 -1
- sky/utils/db/migration_utils.py +1 -1
- sky/utils/resource_checker.py +4 -1
- sky/utils/resources_utils.py +53 -29
- sky/utils/schemas.py +23 -4
- sky/volumes/server/server.py +4 -3
- sky/workspaces/server.py +7 -6
- {skypilot_nightly-1.0.0.dev20251027.dist-info → skypilot_nightly-1.0.0.dev20251101.dist-info}/METADATA +53 -37
- {skypilot_nightly-1.0.0.dev20251027.dist-info → skypilot_nightly-1.0.0.dev20251101.dist-info}/RECORD +106 -100
- sky/dashboard/out/_next/static/chunks/2755.227c84f5adf75c6b.js +0 -26
- sky/dashboard/out/_next/static/chunks/3015-2dcace420c8939f4.js +0 -1
- sky/dashboard/out/_next/static/chunks/3294.6d5054a953a818cb.js +0 -1
- sky/dashboard/out/_next/static/chunks/4282-d2f3ef2fbf78e347.js +0 -1
- sky/dashboard/out/_next/static/chunks/8969-0389e2cb52412db3.js +0 -1
- sky/dashboard/out/_next/static/chunks/9360.07d78b8552bc9d17.js +0 -31
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-c815b90e296b8075.js +0 -16
- sky/dashboard/out/_next/static/css/4c052b4444e52a58.css +0 -3
- /sky/dashboard/out/_next/static/{YP5Vc3ROcDnTGta0XAhcs → 8ixeA0NVQJN8HUdijid8b}/_ssgManifest.js +0 -0
- /sky/dashboard/out/_next/static/chunks/pages/{_app-513d332313670f2a.js → _app-bde01e4a2beec258.js} +0 -0
- {skypilot_nightly-1.0.0.dev20251027.dist-info → skypilot_nightly-1.0.0.dev20251101.dist-info}/WHEEL +0 -0
- {skypilot_nightly-1.0.0.dev20251027.dist-info → skypilot_nightly-1.0.0.dev20251101.dist-info}/entry_points.txt +0 -0
- {skypilot_nightly-1.0.0.dev20251027.dist-info → skypilot_nightly-1.0.0.dev20251101.dist-info}/licenses/LICENSE +0 -0
- {skypilot_nightly-1.0.0.dev20251027.dist-info → skypilot_nightly-1.0.0.dev20251101.dist-info}/top_level.txt +0 -0
sky/ssh_node_pools/server.py
CHANGED
@@ -7,6 +7,7 @@ import fastapi
 from sky import core as sky_core
 from sky.server.requests import executor
 from sky.server.requests import payloads
+from sky.server.requests import request_names
 from sky.server.requests import requests as requests_lib
 from sky.ssh_node_pools import core as ssh_node_pools_core
 from sky.utils import common_utils
@@ -101,7 +102,7 @@ async def deploy_ssh_node_pool(request: fastapi.Request,
     ssh_up_body = payloads.SSHUpBody(infra=pool_name, cleanup=False)
     await executor.schedule_request_async(
         request_id=request.state.request_id,
-        request_name=
+        request_name=request_names.RequestName.SSH_NODE_POOLS_UP,
         request_body=ssh_up_body,
         func=sky_core.ssh_up,
         schedule_type=requests_lib.ScheduleType.LONG,
@@ -126,7 +127,7 @@ async def deploy_ssh_node_pool_general(
     try:
         await executor.schedule_request_async(
             request_id=request.state.request_id,
-            request_name=
+            request_name=request_names.RequestName.SSH_NODE_POOLS_UP,
             request_body=ssh_up_body,
             func=sky_core.ssh_up,
             schedule_type=requests_lib.ScheduleType.LONG,
@@ -152,7 +153,7 @@ async def down_ssh_node_pool(request: fastapi.Request,
     ssh_up_body = payloads.SSHUpBody(infra=pool_name, cleanup=True)
     await executor.schedule_request_async(
         request_id=request.state.request_id,
-        request_name=
+        request_name=request_names.RequestName.SSH_NODE_POOLS_DOWN,
         request_body=ssh_up_body,
         func=sky_core.ssh_up,  # Reuse ssh_up function with cleanup=True
         schedule_type=requests_lib.ScheduleType.LONG,
@@ -180,7 +181,7 @@ async def down_ssh_node_pool_general(
     ssh_up_body.cleanup = True
     await executor.schedule_request_async(
         request_id=request.state.request_id,
-        request_name=
+        request_name=request_names.RequestName.SSH_NODE_POOLS_DOWN,
         request_body=ssh_up_body,
         func=sky_core.ssh_up,  # Reuse ssh_up function with cleanup=True
         schedule_type=requests_lib.ScheduleType.LONG,
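Note: this change replaces free-form request-name strings with members of a shared enum from the new sky/server/requests/request_names.py module. The sketch below illustrates the pattern only; the enum body and the schedule() helper are hypothetical stand-ins (only the two members used in this diff are known from the source).

    import enum

    class RequestName(enum.Enum):
        # Hypothetical stand-in; only these two members appear in the diff.
        SSH_NODE_POOLS_UP = enum.auto()
        SSH_NODE_POOLS_DOWN = enum.auto()

    # Callers pass the member instead of a raw string, so a typo becomes an
    # AttributeError at import time rather than a silently mismatched name.
    def schedule(request_name: RequestName) -> None:
        print(f'scheduling request: {request_name.name.lower()}')

    schedule(RequestName.SSH_NODE_POOLS_UP)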
sky/task.py
CHANGED
@@ -1,6 +1,5 @@
 """Task: a coarse-grained stage in an application."""
 import collections
-import inspect
 import json
 import os
 import re
@@ -29,10 +28,6 @@ from sky.utils import yaml_utils
 
 logger = sky_logging.init_logger(__name__)
 
-# A lambda generating commands (node rank_i, node addrs -> cmd_i).
-CommandGen = Callable[[int, List[str]], Optional[str]]
-CommandOrCommandGen = Union[str, CommandGen]
-
 _VALID_NAME_REGEX = '[a-zA-Z0-9]+(?:[._-]{1,2}[a-zA-Z0-9]+)*'
 _VALID_NAME_DESCR = ('ASCII characters and may contain lowercase and'
                      ' uppercase letters, digits, underscores, periods,'
@@ -236,7 +231,7 @@ class Task:
         name: Optional[str] = None,
         *,
         setup: Optional[Union[str, List[str]]] = None,
-        run: Optional[Union[
+        run: Optional[Union[str, List[str]]] = None,
         envs: Optional[Dict[str, str]] = None,
         secrets: Optional[Dict[str, str]] = None,
         workdir: Optional[Union[str, Dict[str, Any]]] = None,
@@ -349,7 +344,7 @@ class Task:
         self._volumes = volumes or {}
 
         # concatenate commands if given as list
-        def _concat(commands):
+        def _concat(commands: Optional[Union[str, List[str]]]) -> Optional[str]:
             if isinstance(commands, list):
                 return '\n'.join(commands)
             return commands
@@ -447,42 +442,9 @@ class Task:
 
     def validate_run(self):
         """Validates if the run command is valid."""
-        if
-            run_sig = inspect.signature(self.run)
-            # Check that run is a function with 2 arguments.
-            if len(run_sig.parameters) != 2:
-                with ux_utils.print_exception_no_traceback():
-                    raise ValueError(_RUN_FN_CHECK_FAIL_MSG.format(run_sig))
-
-            type_list = [int, List[str]]
-            # Check annotations, if exists
-            for i, param in enumerate(run_sig.parameters.values()):
-                if param.annotation != inspect.Parameter.empty:
-                    if param.annotation != type_list[i]:
-                        with ux_utils.print_exception_no_traceback():
-                            raise ValueError(
-                                _RUN_FN_CHECK_FAIL_MSG.format(run_sig))
-
-            # Check self containedness.
-            run_closure = inspect.getclosurevars(self.run)
-            if run_closure.nonlocals:
-                with ux_utils.print_exception_no_traceback():
-                    raise ValueError(
-                        'run command generator must be self contained. '
-                        f'Found nonlocals: {run_closure.nonlocals}')
-            if run_closure.globals:
-                with ux_utils.print_exception_no_traceback():
-                    raise ValueError(
-                        'run command generator must be self contained. '
-                        f'Found globals: {run_closure.globals}')
-            if run_closure.unbound:
-                # Do not raise an error here. Import statements, which are
-                # allowed, will be considered as unbounded.
-                pass
-        elif self.run is not None and not isinstance(self.run, str):
+        if self.run is not None and not isinstance(self.run, str):
             with ux_utils.print_exception_no_traceback():
-                raise ValueError('run must be
-                    f'a command generator ({CommandGen}). '
+                raise ValueError('run must be a shell script (str). '
                                  f'Got {type(self.run)}')
 
     def expand_and_validate_file_mounts(self):
@@ -1552,6 +1514,16 @@ class Task:
                 self.update_file_mounts({
                     mnt_path: blob_path,
                 })
+            elif store_type is storage_lib.StoreType.COREWEAVE:
+                if storage.source is not None and not isinstance(
+                        storage.source,
+                        list) and storage.source.startswith('cw://'):
+                    blob_path = storage.source
+                else:
+                    blob_path = 'cw://' + storage.name
+                self.update_file_mounts({
+                    mnt_path: blob_path,
+                })
             else:
                 with ux_utils.print_exception_no_traceback():
                     raise ValueError(f'Storage Type {store_type} '
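Note: with this change, Task.run (like setup) accepts only a shell-script string or a list of strings that _concat joins with newlines; the callable "command generator" form is removed. A minimal usage sketch, assuming only the sky.Task constructor keyword arguments shown in this diff:

    import sky

    # The two forms below are equivalent after _concat joins the list with '\n'.
    task_from_str = sky.Task(run='echo start\npython train.py')
    task_from_list = sky.Task(run=['echo start', 'python train.py'])

    # Per the updated validate_run(), passing a callable now fails with:
    # "run must be a shell script (str). Got <class 'function'>"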
sky/templates/kubernetes-ray.yml.j2
CHANGED
@@ -1059,7 +1059,7 @@ available_node_types:
   # Also, skip the jobs that are waiting to be scheduled as those does not have a controller process running.
   # For SkyServe, this will be None and every service will be recovered. This is because SkyServe
   # will delete the service from the database after it is terminated so everything in the database is running.
-  ALL_IN_PROGRESS_JOBS=$({{sky_python_cmd}} -c "from sky.jobs import state; jobs = state.
+  ALL_IN_PROGRESS_JOBS=$({{sky_python_cmd}} -c "from sky.jobs import state; jobs, _ = state.get_managed_jobs_with_filters(fields=['job_id', 'schedule_state']); print(' '.join({str(job['job_id']) for job in jobs if job['schedule_state'] not in [state.ManagedJobScheduleState.DONE, state.ManagedJobScheduleState.WAITING]}) if jobs else None)")
   if [ "$ALL_IN_PROGRESS_JOBS" != "None" ]; then
     read -ra ALL_IN_PROGRESS_JOBS_SEQ <<< "$ALL_IN_PROGRESS_JOBS"
   fi
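Note: the dense one-liner above moves to the new state.get_managed_jobs_with_filters API. Unrolled for readability as a sketch of the same logic (run on a jobs controller; only the names that appear in the template line are taken from the source, the rest is illustrative):

    from sky.jobs import state

    # Per the template line, the call returns a 2-tuple whose first element is
    # the job records; only the two requested fields are fetched.
    jobs, _ = state.get_managed_jobs_with_filters(
        fields=['job_id', 'schedule_state'])

    # Keep only jobs that still have a controller process to recover:
    # not DONE (already finished) and not WAITING (never scheduled yet).
    in_progress = {
        str(job['job_id'])
        for job in jobs
        if job['schedule_state'] not in (state.ManagedJobScheduleState.DONE,
                                         state.ManagedJobScheduleState.WAITING)
    }
    print(' '.join(in_progress) if jobs else None)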
sky/templates/nebius-ray.yml.j2
CHANGED
@@ -156,6 +156,7 @@ setup_commands:
       echo '{{env_var}}={{env_value}}' | sudo tee -a /etc/environment;
       {%- endfor %}
       {%- endif %}
+      IP=$(hostname -I | awk '{print $1}'); echo "$IP $(hostname)" | sudo tee -a /etc/hosts;
       sudo bash -c 'rm -rf /etc/security/limits.d; echo "* soft nofile 1048576" >> /etc/security/limits.conf; echo "* hard nofile 1048576" >> /etc/security/limits.conf';
       sudo grep -e '^DefaultTasksMax' /etc/systemd/system.conf || (sudo bash -c 'echo "DefaultTasksMax=infinity" >> /etc/systemd/system.conf'); sudo systemctl set-property user-$(id -u $(whoami)).slice TasksMax=infinity; sudo systemctl daemon-reload;
       mkdir -p ~/.ssh; (grep -Pzo -q "Host \*\n  StrictHostKeyChecking no\n  IdentityFile ~/.ssh/sky-cluster-key\n  IdentityFile ~/.ssh/id_rsa" ~/.ssh/config) || printf "Host *\n  StrictHostKeyChecking no\n  IdentityFile ~/.ssh/sky-cluster-key\n  IdentityFile ~/.ssh/id_rsa\n" >> ~/.ssh/config;
sky/templates/websocket_proxy.py
CHANGED
@@ -11,15 +11,23 @@ This script is useful for users who do not have local Kubernetes credentials.
 import asyncio
 from http.cookiejar import MozillaCookieJar
 import os
+import struct
 import sys
-
+import time
+from typing import Dict, Optional
 from urllib.request import Request
 
+import requests
 import websockets
 from websockets.asyncio.client import ClientConnection
 from websockets.asyncio.client import connect
 
+from sky.server import constants
+from sky.server.server import KubernetesSSHMessageType
+from sky.skylet import constants as skylet_constants
+
 BUFFER_SIZE = 2**16  # 64KB
+HEARTBEAT_INTERVAL_SECONDS = 10
 
 # Environment variable for a file path to the API cookie file.
 # Keep in sync with server/constants.py
@@ -28,6 +36,8 @@ API_COOKIE_FILE_ENV_VAR = 'SKYPILOT_API_COOKIE_FILE'
 # Keep in sync with server/constants.py
 API_COOKIE_FILE_DEFAULT_LOCATION = '~/.sky/cookies.txt'
 
+MAX_UNANSWERED_PINGS = 100
+
 
 def _get_cookie_header(url: str) -> Dict[str, str]:
     """Extract Cookie header value from a cookie jar for a specific URL"""
@@ -49,7 +59,7 @@ def _get_cookie_header(url: str) -> Dict[str, str]:
     return {'Cookie': cookie_header}
 
 
-async def main(url: str) -> None:
+async def main(url: str, timestamps_supported: bool) -> None:
     cookie_header = _get_cookie_header(url)
     async with connect(url,
                        ping_interval=None,
@@ -75,45 +85,149 @@ async def main(url: str) -> None:
             asyncio.streams.FlowControlMixin, sys.stdout)  # type: ignore
         stdout_writer = asyncio.StreamWriter(transport, protocol, None,
                                              loop)
+        # Dictionary to store last ping time for latency measurement
+        last_ping_time_dict: Optional[Dict[int, float]] = None
+        if timestamps_supported:
+            last_ping_time_dict = {}
+
+        # Use an Event to signal when websocket is closed
+        websocket_closed_event = asyncio.Event()
+        websocket_lock = asyncio.Lock()
 
-        await asyncio.gather(
-
+        await asyncio.gather(
+            stdin_to_websocket(stdin_reader, websocket,
+                               timestamps_supported, websocket_closed_event,
+                               websocket_lock),
+            websocket_to_stdout(websocket, stdout_writer,
+                                timestamps_supported, last_ping_time_dict,
+                                websocket_closed_event, websocket_lock),
+            latency_monitor(websocket, last_ping_time_dict,
+                            websocket_closed_event, websocket_lock),
+            return_exceptions=True)
     finally:
         if old_settings:
             termios.tcsetattr(sys.stdin.fileno(), termios.TCSADRAIN,
                               old_settings)
 
 
+async def latency_monitor(websocket: ClientConnection,
+                          last_ping_time_dict: Optional[dict],
+                          websocket_closed_event: asyncio.Event,
+                          websocket_lock: asyncio.Lock):
+    """Periodically send PING messages (type 1) to measure latency."""
+    if last_ping_time_dict is None:
+        return
+    next_id = 0
+    while not websocket_closed_event.is_set():
+        try:
+            await asyncio.sleep(HEARTBEAT_INTERVAL_SECONDS)
+            if len(last_ping_time_dict) >= MAX_UNANSWERED_PINGS:
+                # We are not getting responses, clear the dictionary so
+                # as not to grow unbounded.
+                last_ping_time_dict.clear()
+            ping_time = time.time()
+            next_id += 1
+            last_ping_time_dict[next_id] = ping_time
+            message_header_bytes = struct.pack(
+                '!BI', KubernetesSSHMessageType.PINGPONG.value, next_id)
+            try:
+                async with websocket_lock:
+                    await websocket.send(message_header_bytes)
+            except websockets.exceptions.ConnectionClosed as e:
+                # Websocket is already closed.
+                print(f'Failed to send PING message: {e}', file=sys.stderr)
+                break
+        except Exception as e:
+            print(f'Error in latency_monitor: {e}', file=sys.stderr)
+            websocket_closed_event.set()
+            raise e
 
 
 async def stdin_to_websocket(reader: asyncio.StreamReader,
-                             websocket: ClientConnection
+                             websocket: ClientConnection,
+                             timestamps_supported: bool,
+                             websocket_closed_event: asyncio.Event,
+                             websocket_lock: asyncio.Lock):
     try:
-        while
+        while not websocket_closed_event.is_set():
             # Read at most BUFFER_SIZE bytes, this not affect
             # responsiveness since it will return as soon as
             # there is at least one byte.
             # The BUFFER_SIZE is chosen to be large enough to improve
             # throughput.
             data = await reader.read(BUFFER_SIZE)
+
             if not data:
                 break
-
+            if timestamps_supported:
+                # Send message with type 0 to indicate data.
+                message_type_bytes = struct.pack(
+                    '!B', KubernetesSSHMessageType.REGULAR_DATA.value)
+                data = message_type_bytes + data
+            async with websocket_lock:
+                await websocket.send(data)
+
     except Exception as e:  # pylint: disable=broad-except
         print(f'Error in stdin_to_websocket: {e}', file=sys.stderr)
     finally:
-
+        async with websocket_lock:
+            await websocket.close()
+        websocket_closed_event.set()
 
 
 async def websocket_to_stdout(websocket: ClientConnection,
-                              writer: asyncio.StreamWriter
+                              writer: asyncio.StreamWriter,
+                              timestamps_supported: bool,
+                              last_ping_time_dict: Optional[dict],
+                              websocket_closed_event: asyncio.Event,
+                              websocket_lock: asyncio.Lock):
     try:
-        while
+        while not websocket_closed_event.is_set():
             message = await websocket.recv()
+            if (timestamps_supported and len(message) > 0 and
+                    last_ping_time_dict is not None):
+                message_type = struct.unpack('!B', message[:1])[0]
+                if message_type == KubernetesSSHMessageType.REGULAR_DATA.value:
+                    # Regular data - strip type byte and write to stdout
+                    message = message[1:]
+                elif message_type == KubernetesSSHMessageType.PINGPONG.value:
+                    # PONG response - calculate latency and send measurement
+                    if not len(message) == struct.calcsize('!BI'):
+                        raise ValueError(
+                            f'Invalid PONG message length: {len(message)}')
+                    pong_id = struct.unpack('!I', message[1:5])[0]
+                    pong_time = time.time()
+
+                    ping_time = last_ping_time_dict.pop(pong_id, None)
+
+                    if ping_time is None:
+                        continue
+
+                    latency_seconds = pong_time - ping_time
+                    latency_ms = int(latency_seconds * 1000)
+
+                    # Send latency measurement (type 2)
+                    message_type_bytes = struct.pack(
+                        '!B',
+                        KubernetesSSHMessageType.LATENCY_MEASUREMENT.value)
+                    latency_bytes = struct.pack('!Q', latency_ms)
+                    message = message_type_bytes + latency_bytes
+                    # Send to server.
+                    async with websocket_lock:
+                        await websocket.send(message)
+                    continue
+            # No timestamps support, write directly
             writer.write(message)
             await writer.drain()
     except websockets.exceptions.ConnectionClosed:
         print('WebSocket connection closed', file=sys.stderr)
     except Exception as e:  # pylint: disable=broad-except
         print(f'Error in websocket_to_stdout: {e}', file=sys.stderr)
+        raise e
+    finally:
+        async with websocket_lock:
+            await websocket.close()
+        websocket_closed_event.set()
 
 
 if __name__ == '__main__':
@@ -123,11 +237,25 @@ if __name__ == '__main__':
         # TODO(aylei): Remove this after 0.10.0
         server_url = f'http://{server_url}'
 
+    health_url = f'{server_url}/api/health'
+    health_response = requests.get(health_url)
+    health_data = health_response.json()
+    timestamps_are_supported = int(health_data['api_version']) > 21
+    disable_latency_measurement = os.environ.get(
+        skylet_constants.SSH_DISABLE_LATENCY_MEASUREMENT_ENV_VAR, '0') == '1'
+    timestamps_are_supported = (timestamps_are_supported and
+                                not disable_latency_measurement)
+
     server_proto, server_fqdn = server_url.split('://')
     websocket_proto = 'ws'
     if server_proto == 'https':
         websocket_proto = 'wss'
     server_url = f'{websocket_proto}://{server_fqdn}'
+
+    client_version_str = (f'&client_version={constants.API_VERSION}'
+                          if timestamps_are_supported else '')
+
     websocket_url = (f'{server_url}/kubernetes-pod-ssh-proxy'
-                     f'?cluster_name={sys.argv[2]}'
-
+                     f'?cluster_name={sys.argv[2]}'
+                     f'{client_version_str}')
+    asyncio.run(main(websocket_url, timestamps_are_supported))
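Note: the proxy now prefixes every websocket frame with a 1-byte message type (per the comments above: type 0 regular data, type 1 ping/pong, type 2 latency measurement, via KubernetesSSHMessageType). A minimal sketch of the wire format using only the struct formats that appear in the diff; the concrete numeric enum values below are an assumption drawn from those comments:

    import struct

    # Assumed values, matching the "type 0/1/2" comments in the diff.
    REGULAR_DATA, PINGPONG, LATENCY_MEASUREMENT = 0, 1, 2

    # Client -> server PING: 1-byte type + 4-byte unsigned ping id ('!BI').
    ping = struct.pack('!BI', PINGPONG, 42)

    # The server echoes the frame; the client recovers the id and computes
    # round-trip latency from its recorded send time.
    msg_type = struct.unpack('!B', ping[:1])[0]
    pong_id = struct.unpack('!I', ping[1:5])[0]
    assert (msg_type, pong_id) == (PINGPONG, 42)

    # Client -> server latency report: 1-byte type + 8-byte latency in ms.
    latency_ms = 17
    report = struct.pack('!B', LATENCY_MEASUREMENT) + struct.pack('!Q', latency_ms)

    # Regular terminal data is prefixed with the REGULAR_DATA type byte.
    data_frame = struct.pack('!B', REGULAR_DATA) + b'ls -la\n'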
sky/users/permission.py
CHANGED
@@ -43,7 +43,6 @@ class PermissionService:
         with _policy_lock():
             global _enforcer_instance
             if _enforcer_instance is None:
-                _enforcer_instance = self
                 engine = global_user_state.initialize_and_get_db()
                 db_utils.add_all_tables_to_db_sqlalchemy(
                     sqlalchemy_adapter.Base.metadata, engine)
@@ -53,6 +52,10 @@ class PermissionService:
                     'model.conf')
                 enforcer = casbin.Enforcer(model_path, adapter)
                 self.enforcer = enforcer
+                # Only set the enforcer instance once the enforcer
+                # is successfully initialized, if we change it and then fail
+                # we will set it to None and all subsequent calls will fail.
+                _enforcer_instance = self
                 self._maybe_initialize_policies()
                 self._maybe_initialize_basic_auth_user()
             else:
sky/utils/cli_utils/status_utils.py
CHANGED
@@ -282,8 +282,14 @@ def _get_resources(cluster_record: _ClusterRecord,
         if resources_str_full is not None:
             resources_str = resources_str_full
         if resources_str is None:
-
-
+            resources_str_simple, resources_str_full = (
+                resources_utils.get_readable_resources_repr(
+                    handle, simplified_only=truncate))
+            if truncate:
+                resources_str = resources_str_simple
+            else:
+                assert resources_str_full is not None
+                resources_str = resources_str_full
 
         return resources_str
     return '-'
sky/utils/context_utils.py
CHANGED
@@ -8,6 +8,7 @@ import multiprocessing
 import os
 import subprocess
 import sys
+import time
 import typing
 from typing import Any, Callable, IO, Optional, Tuple, TypeVar
 
@@ -18,6 +19,7 @@ from sky.utils import context
 from sky.utils import subprocess_utils
 
 StreamHandler = Callable[[IO[Any], IO[Any]], str]
+PASSTHROUGH_FLUSH_INTERVAL_SECONDS = 0.5
 
 logger = sky_logging.init_logger(__name__)
 
@@ -46,6 +48,7 @@ def hijack_sys_attrs():
 
 def passthrough_stream_handler(in_stream: IO[Any], out_stream: IO[Any]) -> str:
     """Passthrough the stream from the process to the output stream"""
+    last_flush_time = time.time()
     wrapped = io.TextIOWrapper(in_stream,
                                encoding='utf-8',
                                newline='',
@@ -55,9 +58,18 @@ def passthrough_stream_handler(in_stream: IO[Any], out_stream: IO[Any]) -> str:
         line = wrapped.readline()
         if line:
             out_stream.write(line)
-
+
+            # Flush based on timeout instead of on every line
+            current_time = time.time()
+            if (current_time - last_flush_time >=
+                    PASSTHROUGH_FLUSH_INTERVAL_SECONDS):
+                out_stream.flush()
+                last_flush_time = current_time
         else:
             break
+
+    # Final flush to ensure all data is written
+    out_stream.flush()
     return ''
 
 
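Note: a minimal usage sketch of the handler above (the subprocess driving it is illustrative, not from the source). Lines are relayed as they arrive, but the output stream is flushed at most once per PASSTHROUGH_FLUSH_INTERVAL_SECONDS (0.5s), plus a final flush at EOF:

    import subprocess
    import sys

    from sky.utils import context_utils

    # Relay a child process's stdout to our own stdout with batched flushes.
    proc = subprocess.Popen(['echo', 'hello'], stdout=subprocess.PIPE)
    context_utils.passthrough_stream_handler(proc.stdout, sys.stdout)
    proc.wait()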
sky/utils/db/migration_utils.py
CHANGED
@@ -22,7 +22,7 @@ GLOBAL_USER_STATE_VERSION = '010'
 GLOBAL_USER_STATE_LOCK_PATH = f'~/.sky/locks/.{GLOBAL_USER_STATE_DB_NAME}.lock'
 
 SPOT_JOBS_DB_NAME = 'spot_jobs_db'
-SPOT_JOBS_VERSION = '
+SPOT_JOBS_VERSION = '005'
 SPOT_JOBS_LOCK_PATH = f'~/.sky/locks/.{SPOT_JOBS_DB_NAME}.lock'
 
 SERVE_DB_NAME = 'serve_db'
sky/utils/resource_checker.py
CHANGED
@@ -278,7 +278,10 @@ def _get_active_resources(
     from sky.jobs.server import core as managed_jobs_core
     try:
         filtered_jobs, _, _, _ = managed_jobs_core.queue_v2(
-            refresh=False,
+            refresh=False,
+            skip_finished=True,
+            all_users=True,
+            fields=['job_id', 'user_hash', 'workspace'])
         return filtered_jobs
     except exceptions.ClusterNotUpError:
         logger.warning('All jobs should be finished.')
sky/utils/resources_utils.py
CHANGED
@@ -181,57 +181,81 @@ def simplify_ports(ports: List[str]) -> List[str]:
 
 
 def format_resource(resource: 'resources_lib.Resources',
-
+                    simplified_only: bool = False) -> Tuple[str, Optional[str]]:
     resource = resource.assert_launchable()
-
-
+    is_k8s = str(resource.cloud).lower() == 'kubernetes'
+    if resource.accelerators is None or is_k8s or not simplified_only:
+        vcpu, mem = resource.cloud.get_vcpus_mem_from_instance_type(
+            resource.instance_type)
 
-
+    elements_simple = []
+    elements_full = []
 
     if resource.accelerators is not None:
         acc, count = list(resource.accelerators.items())[0]
-
+        elements_simple.append(f'gpus={acc}:{count}')
+        elements_full.append(f'gpus={acc}:{count}')
 
-
-
+    if (resource.accelerators is None or is_k8s):
+        if vcpu is not None:
+            elements_simple.append(f'cpus={int(vcpu)}')
+            elements_full.append(f'cpus={int(vcpu)}')
+        if mem is not None:
+            elements_simple.append(f'mem={int(mem)}')
+            elements_full.append(f'mem={int(mem)}')
+    elif not simplified_only:
         if vcpu is not None:
-
+            elements_full.append(f'cpus={int(vcpu)}')
         if mem is not None:
-
+            elements_full.append(f'mem={int(mem)}')
 
-    instance_type = resource.instance_type
-    if simplify:
-        instance_type = common_utils.truncate_long_string(instance_type, 15)
     if not is_k8s:
-
-
-
-
+        instance_type_full = resource.instance_type
+        instance_type_simple = common_utils.truncate_long_string(
+            instance_type_full, 15)
+        elements_simple.append(instance_type_simple)
+        elements_full.append(instance_type_full)
+    elements_simple.append('...')
+    if not simplified_only:
         image_id = resource.image_id
         if image_id is not None:
             if None in image_id:
-
+                elements_full.append(f'image_id={image_id[None]}')
             else:
-
-
+                elements_full.append(f'image_id={image_id}')
+        elements_full.append(f'disk={resource.disk_size}')
         disk_tier = resource.disk_tier
         if disk_tier is not None:
-
+            elements_full.append(f'disk_tier={disk_tier.value}')
        ports = resource.ports
        if ports is not None:
-
+            elements_full.append(f'ports={ports}')
 
     spot = '[spot]' if resource.use_spot else ''
-
-
-
-
-
+    resources_str_simple = (
+        f'{spot}({"" if not elements_simple else ", ".join(elements_simple)})')
+    if simplified_only:
+        return resources_str_simple, None
+    else:
+        resources_str_full = (
+            f'{spot}({"" if not elements_full else ", ".join(elements_full)})')
+        return resources_str_simple, resources_str_full
+
+
+def get_readable_resources_repr(
+        handle: 'backends.CloudVmRayResourceHandle',
+        simplified_only: bool = False) -> Tuple[str, Optional[str]]:
+    resource_str_simple, resource_str_full = format_resource(
+        handle.launched_resources, simplified_only)
+    if not simplified_only:
+        assert resource_str_full is not None
     if (handle.launched_nodes is not None and
             handle.launched_resources is not None):
-        return (f'{handle.launched_nodes}x'
-
-
+        return (f'{handle.launched_nodes}x{resource_str_simple}',
+                None if simplified_only else
+                f'{handle.launched_nodes}x{resource_str_full}')
+    return (_DEFAULT_MESSAGE_HANDLE_INITIALIZING,
+            _DEFAULT_MESSAGE_HANDLE_INITIALIZING)
 
 
 def make_ray_custom_resources_str(
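Note: format_resource now returns a (simple, full) pair instead of a single string, and get_readable_resources_repr forwards both so callers such as status_utils._get_resources (above) can pick one via truncate/simplified_only. A rough illustration of the two string shapes, mirroring the element-building logic in this diff; the instance and resource values are made up:

    # Hypothetical values for a launched GPU cluster on a non-Kubernetes cloud.
    spot = ''  # '[spot]' when use_spot is True
    elements_simple = ['gpus=V100:8', 'p3.16xlarge', '...']
    elements_full = ['gpus=V100:8', 'cpus=64', 'mem=488',
                     'p3.16xlarge', 'disk=256', 'ports=8888']

    simple = f'{spot}({", ".join(elements_simple)})'
    full = f'{spot}({", ".join(elements_full)})'
    print(simple)  # (gpus=V100:8, p3.16xlarge, ...)
    print(full)    # (gpus=V100:8, cpus=64, mem=488, p3.16xlarge, disk=256, ports=8888)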
sky/utils/schemas.py
CHANGED
@@ -1190,7 +1190,13 @@ def get_config_schema():
                 'consolidation_mode': {
                     'type': 'boolean',
                     'default': False,
-                }
+                },
+                'controller_logs_gc_retention_hours': {
+                    'type': 'integer',
+                },
+                'task_logs_gc_retention_hours': {
+                    'type': 'integer',
+                },
             },
         },
         'bucket': {
@@ -1592,10 +1598,10 @@ def get_config_schema():
 
     allowed_workspace_cloud_names = list(constants.ALL_CLOUDS) + ['cloudflare']
     # Create pattern for not supported clouds, i.e.
-    # all clouds except gcp, kubernetes, ssh
+    # all clouds except aws, gcp, kubernetes, ssh, nebius
     not_supported_clouds = [
         cloud for cloud in allowed_workspace_cloud_names
-        if cloud.lower() not in ['gcp', 'kubernetes', 'ssh', 'nebius']
+        if cloud.lower() not in ['aws', 'gcp', 'kubernetes', 'ssh', 'nebius']
     ]
     not_supported_cloud_regex = '|'.join(not_supported_clouds)
     workspaces_schema = {
@@ -1606,7 +1612,8 @@ def get_config_schema():
             'type': 'object',
             'additionalProperties': False,
             'patternProperties': {
-                # Pattern for
+                # Pattern for clouds with no workspace-specific config -
+                # only allow 'disabled' property.
                 f'^({not_supported_cloud_regex})$': {
                     'type': 'object',
                     'additionalProperties': False,
@@ -1641,6 +1648,18 @@ def get_config_schema():
                     },
                     'additionalProperties': False,
                 },
+                'aws': {
+                    'type': 'object',
+                    'properties': {
+                        'profile': {
+                            'type': 'string'
+                        },
+                        'disabled': {
+                            'type': 'boolean'
+                        },
+                    },
+                    'additionalProperties': False,
+                },
                 'ssh': {
                     'type': 'object',
                     'required': [],
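Note: the new per-workspace 'aws' entry only admits 'profile' and 'disabled'. A minimal validation sketch against just that fragment (the enclosing workspaces.<name> path is assumed from context, and jsonschema is used here as the obvious validator for this schema dialect):

    import jsonschema

    # The 'aws' sub-schema added above, reproduced verbatim.
    aws_workspace_schema = {
        'type': 'object',
        'properties': {
            'profile': {'type': 'string'},
            'disabled': {'type': 'boolean'},
        },
        'additionalProperties': False,
    }

    # Valid: only the two allowed keys.
    jsonschema.validate({'profile': 'research', 'disabled': False},
                        aws_workspace_schema)

    # Invalid: unknown keys are rejected by additionalProperties: False.
    try:
        jsonschema.validate({'region': 'us-east-1'}, aws_workspace_schema)
    except jsonschema.ValidationError as e:
        print(e.message)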