skypilot-nightly 1.0.0.dev20250908__py3-none-any.whl → 1.0.0.dev20250910__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sky/__init__.py +2 -2
- sky/authentication.py +19 -4
- sky/backends/backend_utils.py +35 -1
- sky/backends/cloud_vm_ray_backend.py +2 -2
- sky/client/sdk.py +20 -0
- sky/client/sdk_async.py +18 -16
- sky/clouds/aws.py +3 -1
- sky/dashboard/out/404.html +1 -1
- sky/dashboard/out/_next/static/chunks/{webpack-47c64cc05717f8a3.js → webpack-1d7e11230da3ca89.js} +1 -1
- sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
- sky/dashboard/out/clusters/[cluster].html +1 -1
- sky/dashboard/out/clusters.html +1 -1
- sky/dashboard/out/config.html +1 -1
- sky/dashboard/out/index.html +1 -1
- sky/dashboard/out/infra/[context].html +1 -1
- sky/dashboard/out/infra.html +1 -1
- sky/dashboard/out/jobs/[job].html +1 -1
- sky/dashboard/out/jobs/pools/[pool].html +1 -1
- sky/dashboard/out/jobs.html +1 -1
- sky/dashboard/out/users.html +1 -1
- sky/dashboard/out/volumes.html +1 -1
- sky/dashboard/out/workspace/new.html +1 -1
- sky/dashboard/out/workspaces/[name].html +1 -1
- sky/dashboard/out/workspaces.html +1 -1
- sky/data/storage.py +5 -1
- sky/execution.py +21 -14
- sky/jobs/constants.py +3 -0
- sky/jobs/controller.py +732 -310
- sky/jobs/recovery_strategy.py +251 -129
- sky/jobs/scheduler.py +247 -174
- sky/jobs/server/core.py +20 -4
- sky/jobs/server/utils.py +2 -2
- sky/jobs/state.py +702 -511
- sky/jobs/utils.py +94 -39
- sky/provision/aws/config.py +4 -1
- sky/provision/gcp/config.py +6 -1
- sky/provision/kubernetes/utils.py +17 -8
- sky/provision/provisioner.py +1 -0
- sky/serve/replica_managers.py +0 -7
- sky/serve/serve_utils.py +5 -0
- sky/serve/server/impl.py +1 -2
- sky/serve/service.py +0 -2
- sky/server/common.py +8 -3
- sky/server/config.py +43 -24
- sky/server/constants.py +1 -0
- sky/server/daemons.py +7 -11
- sky/server/metrics.py +60 -9
- sky/server/requests/executor.py +33 -32
- sky/server/requests/serializers/encoders.py +1 -1
- sky/server/server.py +57 -4
- sky/server/uvicorn.py +4 -0
- sky/setup_files/dependencies.py +4 -2
- sky/skylet/attempt_skylet.py +1 -0
- sky/skylet/constants.py +3 -1
- sky/skylet/events.py +2 -10
- sky/utils/command_runner.pyi +3 -3
- sky/utils/common_utils.py +19 -7
- sky/utils/controller_utils.py +5 -0
- sky/utils/db/db_utils.py +31 -2
- sky/utils/rich_utils.py +3 -1
- sky/utils/subprocess_utils.py +9 -0
- sky/volumes/volume.py +2 -0
- {skypilot_nightly-1.0.0.dev20250908.dist-info → skypilot_nightly-1.0.0.dev20250910.dist-info}/METADATA +37 -35
- {skypilot_nightly-1.0.0.dev20250908.dist-info → skypilot_nightly-1.0.0.dev20250910.dist-info}/RECORD +70 -70
- /sky/dashboard/out/_next/static/{qikl6zGuwe8BKavteei3r → 3SYxqNGnvvPS8h3gdD2T7}/_buildManifest.js +0 -0
- /sky/dashboard/out/_next/static/{qikl6zGuwe8BKavteei3r → 3SYxqNGnvvPS8h3gdD2T7}/_ssgManifest.js +0 -0
- {skypilot_nightly-1.0.0.dev20250908.dist-info → skypilot_nightly-1.0.0.dev20250910.dist-info}/WHEEL +0 -0
- {skypilot_nightly-1.0.0.dev20250908.dist-info → skypilot_nightly-1.0.0.dev20250910.dist-info}/entry_points.txt +0 -0
- {skypilot_nightly-1.0.0.dev20250908.dist-info → skypilot_nightly-1.0.0.dev20250910.dist-info}/licenses/LICENSE +0 -0
- {skypilot_nightly-1.0.0.dev20250908.dist-info → skypilot_nightly-1.0.0.dev20250910.dist-info}/top_level.txt +0 -0
sky/server/daemons.py
CHANGED
@@ -11,6 +11,7 @@ from sky.utils import annotations
 from sky.utils import common
 from sky.utils import common_utils
 from sky.utils import env_options
+from sky.utils import subprocess_utils
 from sky.utils import timeline
 from sky.utils import ux_utils

@@ -74,6 +75,10 @@ class InternalRequestDaemon:
                 # using too much memory.
                 annotations.clear_request_level_cache()
                 timeline.save_timeline()
+                # Kill all children processes related to this request.
+                # Each executor handles a single request, so we can safely
+                # kill all children processes related to this request.
+                subprocess_utils.kill_children_processes()
                 common_utils.release_memory()
             except Exception:  # pylint: disable=broad-except
                 # It is OK to fail to run the event, as the event is not

@@ -123,21 +128,16 @@ def managed_job_status_refresh_event():
     """Refresh the managed job status for controller consolidation mode."""
     # pylint: disable=import-outside-toplevel
     from sky.jobs import utils as managed_job_utils
-    from sky.utils import controller_utils

     # We run the recovery logic before starting the event loop as those two are
     # conflicting. Check PERSISTENT_RUN_RESTARTING_SIGNAL_FILE for details.
-    if controller_utils.high_availability_specified(
-            controller_utils.Controllers.JOBS_CONTROLLER.value.cluster_name):
-        managed_job_utils.ha_recovery_for_consolidation_mode()
+    managed_job_utils.ha_recovery_for_consolidation_mode()

     # After recovery, we start the event loop.
     from sky.skylet import events
     refresh_event = events.ManagedJobEvent()
-    scheduling_event = events.ManagedJobSchedulingEvent()
     logger.info('=== Running managed job event ===')
     refresh_event.run()
-    scheduling_event.run()
     time.sleep(events.EVENT_CHECKING_INTERVAL_SECONDS)

@@ -152,14 +152,10 @@ def _serve_status_refresh_event(pool: bool):
     """Refresh the sky serve status for controller consolidation mode."""
     # pylint: disable=import-outside-toplevel
     from sky.serve import serve_utils
-    from sky.utils import controller_utils

     # We run the recovery logic before starting the event loop as those two are
     # conflicting. Check PERSISTENT_RUN_RESTARTING_SIGNAL_FILE for details.
-
-    if controller_utils.high_availability_specified(
-            controller.value.cluster_name):
-        serve_utils.ha_recovery_for_consolidation_mode(pool=pool)
+    serve_utils.ha_recovery_for_consolidation_mode(pool=pool)

     # After recovery, we start the event loop.
     from sky.skylet import events

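
For context on the new cleanup call above: SkyPilot's subprocess_utils.kill_children_processes() is not shown in this diff, but the general shape of such a cleanup is a psutil walk over descendants. A generic, illustrative sketch only (hypothetical helper, not the actual SkyPilot implementation):

import psutil

def kill_children_of_current_process(timeout: float = 5.0) -> None:
    """Illustrative: terminate, then kill, all descendants of this process."""
    children = psutil.Process().children(recursive=True)
    for child in children:
        try:
            child.terminate()  # polite SIGTERM first
        except psutil.NoSuchProcess:
            pass
    # Give them a moment, then escalate to SIGKILL for stragglers.
    _, alive = psutil.wait_procs(children, timeout=timeout)
    for child in alive:
        child.kill()
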
sky/server/metrics.py
CHANGED
@@ -2,6 +2,7 @@

 import contextlib
 import functools
+import multiprocessing
 import os
 import time

@@ -9,6 +10,7 @@ import fastapi
 from prometheus_client import generate_latest
 from prometheus_client import multiprocess
 import prometheus_client as prom
+import psutil
 import starlette.middleware.base
 import uvicorn

@@ -38,15 +40,6 @@ SKY_APISERVER_REQUEST_DURATION_SECONDS = prom.Histogram(
             60.0, 120.0, float('inf')),
 )

-# Time spent processing requests in executor.
-SKY_APISERVER_REQUEST_EXECUTION_DURATION_SECONDS = prom.Histogram(
-    'sky_apiserver_request_execution_duration_seconds',
-    'Time spent executing requests in executor',
-    ['request', 'worker'],
-    buckets=(0.5, 1, 2.5, 5.0, 10.0, 15.0, 25.0, 40.0, 60.0, 90.0, 120.0, 180.0,
-             float('inf')),
-)
-
 # Time spent processing a piece of code, refer to time_it().
 SKY_APISERVER_CODE_DURATION_SECONDS = prom.Histogram(
     'sky_apiserver_code_duration_seconds',

@@ -64,6 +57,41 @@ SKY_APISERVER_EVENT_LOOP_LAG_SECONDS = prom.Histogram(
             60.0, float('inf')),
 )

+SKY_APISERVER_WEBSOCKET_CONNECTIONS = prom.Gauge(
+    'sky_apiserver_websocket_connections',
+    'Number of websocket connections',
+    ['pid'],
+    multiprocess_mode='livesum',
+)
+
+SKY_APISERVER_WEBSOCKET_CLOSED_TOTAL = prom.Counter(
+    'sky_apiserver_websocket_closed_total',
+    'Number of websocket closed',
+    ['pid', 'reason'],
+)
+
+# The number of execution starts in each worker process, we do not record
+# histogram here as the duration has been measured in
+# SKY_APISERVER_CODE_DURATION_SECONDS without the worker label (process id).
+# Recording histogram WITH worker label will cause high cardinality.
+SKY_APISERVER_PROCESS_EXECUTION_START_TOTAL = prom.Counter(
+    'sky_apiserver_process_execution_start_total',
+    'Total number of execution starts in each worker process',
+    ['request', 'pid'],
+)
+
+SKY_APISERVER_PROCESS_PEAK_RSS = prom.Gauge(
+    'sky_apiserver_process_peak_rss',
+    'Peak RSS we saw in each process in last 30 seconds',
+    ['pid', 'type'],
+)
+
+SKY_APISERVER_PROCESS_CPU_TOTAL = prom.Gauge(
+    'sky_apiserver_process_cpu_total',
+    'Total CPU times a worker process has been running',
+    ['pid', 'type', 'mode'],
+)
+
 metrics_app = fastapi.FastAPI()


@@ -178,3 +206,26 @@ def time_me_async(func):
         return await func(*args, **kwargs)

     return async_wrapper
+
+
+def process_monitor(process_type: str):
+    pid = multiprocessing.current_process().pid
+    proc = psutil.Process(pid)
+    peak_rss = 0
+    last_bucket_end = time.time()
+    while True:
+        if time.time() - last_bucket_end >= 30:
+            # Reset peak RSS every 30 seconds.
+            last_bucket_end = time.time()
+            peak_rss = 0
+        peak_rss = max(peak_rss, proc.memory_info().rss)
+        SKY_APISERVER_PROCESS_PEAK_RSS.labels(pid=pid,
+                                              type=process_type).set(peak_rss)
+        ctimes = proc.cpu_times()
+        SKY_APISERVER_PROCESS_CPU_TOTAL.labels(pid=pid,
+                                               type=process_type,
+                                               mode='user').set(ctimes.user)
+        SKY_APISERVER_PROCESS_CPU_TOTAL.labels(pid=pid,
+                                               type=process_type,
+                                               mode='system').set(ctimes.system)
+        time.sleep(1)

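
The new process_monitor() above is an infinite sampling loop, so callers are expected to run one instance per process on a daemon thread; the executor.py and uvicorn.py hunks below do exactly this. A minimal sketch of that usage (the 'server' string becomes the 'type' label on the RSS/CPU gauges):

import threading

from sky.server import metrics as metrics_lib

# Daemon thread so the sampler never blocks process shutdown.
threading.Thread(target=metrics_lib.process_monitor,
                 args=('server',),
                 daemon=True).start()
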
sky/server/requests/executor.py
CHANGED
@@ -130,6 +130,9 @@ queue_backend = server_config.QueueBackend.MULTIPROCESSING
 def executor_initializer(proc_group: str):
     setproctitle.setproctitle(f'SkyPilot:executor:{proc_group}:'
                               f'{multiprocessing.current_process().pid}')
+    threading.Thread(target=metrics_lib.process_monitor,
+                     args=(f'worker:{proc_group}',),
+                     daemon=True).start()


 class RequestWorker:

@@ -281,34 +284,34 @@ def override_request_env_and_config(
         request_id: str) -> Generator[None, None, None]:
     """Override the environment and SkyPilot config for a request."""
     original_env = os.environ.copy()
-    # Unset SKYPILOT_DEBUG by default, to avoid the value set on the API server
-    # affecting client requests. If set on the client side, it will be
-    # overridden by the request body.
-    os.environ.pop('SKYPILOT_DEBUG', None)
-    # Remove the db connection uri from client supplied env vars, as the
-    # client should not set the db string on server side.
-    request_body.env_vars.pop(constants.ENV_VAR_DB_CONNECTION_URI, None)
-    os.environ.update(request_body.env_vars)
-    # Note: may be overridden by AuthProxyMiddleware.
-    # TODO(zhwu): we need to make the entire request a context available to the
-    # entire request execution, so that we can access info like user through
-    # the execution.
-    user = models.User(id=request_body.env_vars[constants.USER_ID_ENV_VAR],
-                       name=request_body.env_vars[constants.USER_ENV_VAR])
-    global_user_state.add_or_update_user(user)
-    # Refetch the user to get the latest user info, including the created_at
-    # field.
-    user = global_user_state.get_user(user.id)
-
-    # Force color to be enabled.
-    os.environ['CLICOLOR_FORCE'] = '1'
-    server_common.reload_for_new_request(
-        client_entrypoint=request_body.entrypoint,
-        client_command=request_body.entrypoint_command,
-        using_remote_api_server=request_body.using_remote_api_server,
-        user=user,
-        request_id=request_id)
     try:
+        # Unset SKYPILOT_DEBUG by default, to avoid the value set on the API
+        # server affecting client requests. If set on the client side, it will
+        # be overridden by the request body.
+        os.environ.pop('SKYPILOT_DEBUG', None)
+        # Remove the db connection uri from client supplied env vars, as the
+        # client should not set the db string on server side.
+        request_body.env_vars.pop(constants.ENV_VAR_DB_CONNECTION_URI, None)
+        os.environ.update(request_body.env_vars)
+        # Note: may be overridden by AuthProxyMiddleware.
+        # TODO(zhwu): we need to make the entire request a context available to
+        # the entire request execution, so that we can access info like user
+        # through the execution.
+        user = models.User(id=request_body.env_vars[constants.USER_ID_ENV_VAR],
+                           name=request_body.env_vars[constants.USER_ENV_VAR])
+        global_user_state.add_or_update_user(user)
+        # Refetch the user to get the latest user info, including the created_at
+        # field.
+        user = global_user_state.get_user(user.id)
+
+        # Force color to be enabled.
+        os.environ['CLICOLOR_FORCE'] = '1'
+        server_common.reload_for_new_request(
+            client_entrypoint=request_body.entrypoint,
+            client_command=request_body.entrypoint_command,
+            using_remote_api_server=request_body.using_remote_api_server,
+            user=user,
+            request_id=request_id)
         logger.debug(
             f'override path: {request_body.override_skypilot_config_path}')
         with skypilot_config.override_skypilot_config(

@@ -401,6 +404,8 @@ def _request_execution_wrapper(request_id: str,
         config = skypilot_config.to_dict()
         logger.debug(f'request config: \n'
                      f'{yaml_utils.dump_yaml_str(dict(config))}')
+        metrics_lib.SKY_APISERVER_PROCESS_EXECUTION_START_TOTAL.labels(
+            request=request_name, pid=pid).inc()
         with metrics_lib.time_it(name=request_name,
                                  group='request_execution'):
             return_value = func(**request_body.to_kwargs())

@@ -439,11 +444,7 @@ def _request_execution_wrapper(request_id: str,
         logger.info(f'Request {request_id} finished')
     finally:
         with metrics_lib.time_it(name='release_memory', group='internal'):
-            try:
-                common_utils.release_memory()
-            except Exception as e:  # pylint: disable=broad-except
-                logger.error(f'Failed to release memory: '
-                             f'{common_utils.format_exception(e)}')
+            common_utils.release_memory()


 async def execute_request_coroutine(request: api_requests.Request):

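
The reindentation in override_request_env_and_config above moves all environment mutation inside the try block, so the cleanup path runs even when setup itself raises. A minimal sketch of that general pattern, as a hypothetical standalone helper rather than the actual function:

import contextlib
import os
from typing import Dict, Iterator

@contextlib.contextmanager
def override_env(env_vars: Dict[str, str]) -> Iterator[None]:
    original_env = os.environ.copy()
    try:
        # Mutations happen inside the try, so a failure here still reaches
        # the finally block and the original environment is restored.
        os.environ.update(env_vars)
        yield
    finally:
        os.environ.clear()
        os.environ.update(original_env)
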
sky/server/requests/serializers/encoders.py
CHANGED

@@ -131,7 +131,7 @@ def encode_jobs_queue(jobs: List[dict],) -> List[Dict[str, Any]]:
 def encode_jobs_queue_v2(
         jobs_or_tuple) -> Union[List[Dict[str, Any]], Dict[str, Any]]:
     # Support returning either a plain jobs list or a (jobs, total) tuple
-    status_counts = {}
+    status_counts: Dict[str, int] = {}
     if isinstance(jobs_or_tuple, tuple):
         if len(jobs_or_tuple) == 2:
             jobs, total = jobs_or_tuple

sky/server/server.py
CHANGED
@@ -625,6 +625,9 @@ app.include_router(volumes_rest.router, prefix='/volumes', tags=['volumes'])
 app.include_router(ssh_node_pools_rest.router,
                    prefix='/ssh_node_pools',
                    tags=['ssh_node_pools'])
+# increase the resource limit for the server
+soft, hard = resource.getrlimit(resource.RLIMIT_NOFILE)
+resource.setrlimit(resource.RLIMIT_NOFILE, (hard, hard))

 # Increase the limit of files we can open to our hard limit. This fixes bugs
 # where we can not aquire file locks or open enough logs and the API server

@@ -1734,7 +1737,12 @@ async def kubernetes_pod_ssh_proxy(websocket: fastapi.WebSocket,
         return

     logger.info(f'Starting port-forward to local port: {local_port}')
+    conn_gauge = metrics.SKY_APISERVER_WEBSOCKET_CONNECTIONS.labels(
+        pid=os.getpid())
+    ssh_failed = False
+    websocket_closed = False
     try:
+        conn_gauge.inc()
         # Connect to the local port
         reader, writer = await asyncio.open_connection('127.0.0.1', local_port)

@@ -1742,9 +1750,21 @@ async def kubernetes_pod_ssh_proxy(websocket: fastapi.WebSocket,
            try:
                async for message in websocket.iter_bytes():
                    writer.write(message)
-
+                    try:
+                        await writer.drain()
+                    except Exception as e:  # pylint: disable=broad-except
+                        # Typically we will not reach here, if the ssh to pod
+                        # is disconnected, ssh_to_websocket will exit first.
+                        # But just in case.
+                        logger.error('Failed to write to pod through '
+                                     f'port-forward connection: {e}')
+                        nonlocal ssh_failed
+                        ssh_failed = True
+                        break
            except fastapi.WebSocketDisconnect:
                pass
+            nonlocal websocket_closed
+            websocket_closed = True
            writer.close()

        async def ssh_to_websocket():

@@ -1752,15 +1772,44 @@ async def kubernetes_pod_ssh_proxy(websocket: fastapi.WebSocket,
                while True:
                    data = await reader.read(1024)
                    if not data:
+                        if not websocket_closed:
+                            logger.warning('SSH connection to pod is '
+                                           'disconnected before websocket '
+                                           'connection is closed')
+                            nonlocal ssh_failed
+                            ssh_failed = True
                        break
                    await websocket.send_bytes(data)
            except Exception:  # pylint: disable=broad-except
                pass
-
+            try:
+                await websocket.close()
+            except Exception:  # pylint: disable=broad-except
+                # The websocket might has been closed by the client.
+                pass

        await asyncio.gather(websocket_to_ssh(), ssh_to_websocket())
    finally:
-
+        conn_gauge.dec()
+        reason = ''
        try:
+            logger.info('Terminating kubectl port-forward process')
+            proc.terminate()
+        except ProcessLookupError:
+            stdout = await proc.stdout.read()
+            logger.error('kubectl port-forward was terminated before the '
+                         'ssh websocket connection was closed. Remaining '
+                         f'output: {str(stdout)}')
+            reason = 'KubectlPortForwardExit'
+            metrics.SKY_APISERVER_WEBSOCKET_CLOSED_TOTAL.labels(
+                pid=os.getpid(), reason='KubectlPortForwardExit').inc()
+        else:
+            if ssh_failed:
+                reason = 'SSHToPodDisconnected'
+            else:
+                reason = 'ClientClosed'
+            metrics.SKY_APISERVER_WEBSOCKET_CLOSED_TOTAL.labels(
+                pid=os.getpid(), reason=reason).inc()


 @app.get('/all_contexts')

@@ -1780,7 +1829,7 @@ async def all_contexts(request: fastapi.Request) -> None:
 async def gpu_metrics() -> fastapi.Response:
     """Gets the GPU metrics from multiple external k8s clusters"""
     contexts = core.get_all_contexts()
-    all_metrics = []
+    all_metrics: List[str] = []
     successful_contexts = 0

     tasks = [

@@ -1795,6 +1844,10 @@ async def gpu_metrics() -> fastapi.Response:
         if isinstance(result, Exception):
             logger.error(
                 f'Failed to get metrics for context {contexts[i]}: {result}')
+        elif isinstance(result, BaseException):
+            # Avoid changing behavior for non-Exception BaseExceptions
+            # like KeyboardInterrupt/SystemExit: re-raise them.
+            raise result
         else:
             metrics_text = result
             all_metrics.append(metrics_text)

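
The new BaseException branch in gpu_metrics matters because the per-context tasks are presumably gathered with return_exceptions=True, which hands raised exceptions back as results instead of propagating them, including BaseException subclasses such as asyncio.CancelledError. A standalone sketch of that pattern (not the server handler itself):

import asyncio

async def main() -> None:
    async def ok() -> str:
        return 'metrics'

    async def boom() -> str:
        raise RuntimeError('scrape failed')

    # Exceptions come back as list items rather than being raised here.
    results = await asyncio.gather(ok(), boom(), return_exceptions=True)
    for result in results:
        if isinstance(result, Exception):
            print('skipped:', result)
        elif isinstance(result, BaseException):
            raise result  # e.g. CancelledError: re-raise instead of swallowing
        else:
            print('collected:', result)

asyncio.run(main())
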
sky/server/uvicorn.py
CHANGED
@@ -19,6 +19,7 @@ from uvicorn.supervisors import multiprocess

 from sky import sky_logging
 from sky.server import daemons
+from sky.server import metrics as metrics_lib
 from sky.server import state
 from sky.server.requests import requests as requests_lib
 from sky.skylet import constants

@@ -212,6 +213,9 @@ class Server(uvicorn.Server):
         # Same as set PYTHONASYNCIODEBUG=1, but with custom threshold.
         event_loop.set_debug(True)
         event_loop.slow_callback_duration = lag_threshold
+        threading.Thread(target=metrics_lib.process_monitor,
+                         args=('server',),
+                         daemon=True).start()
         with self.capture_signals():
             asyncio.run(self.serve(*args, **kwargs))

sky/setup_files/dependencies.py
CHANGED
@@ -63,6 +63,8 @@ install_requires = [
     'setproctitle',
     'sqlalchemy',
     'psycopg2-binary',
+    'aiosqlite',
+    'asyncpg',
     # TODO(hailong): These three dependencies should be removed after we make
     # the client-side actually not importing them.
     'casbin',

@@ -108,9 +110,9 @@ server_dependencies = [
 local_ray = [
     # Lower version of ray will cause dependency conflict for
     # click/grpcio/protobuf.
-    #
+    # Ray 2.6.1+ resolved cluster launcher bugs and grpcio issues on Apple Silicon.
     # https://github.com/ray-project/ray/releases/tag/ray-2.6.1
-    'ray[default] >= 2.
+    'ray[default] >= 2.6.1',
 ]

 remote = [

sky/skylet/attempt_skylet.py
CHANGED
@@ -12,6 +12,7 @@ def restart_skylet():
     # Kills old skylet if it is running.
     # TODO(zhwu): make the killing graceful, e.g., use a signal to tell
     # skylet to exit, instead of directly killing it.
+
     subprocess.run(
         # We use -m to grep instead of {constants.SKY_PYTHON_CMD} -m to grep
         # because need to handle the backward compatibility of the old skylet

sky/skylet/constants.py
CHANGED
@@ -91,7 +91,7 @@ TASK_ID_LIST_ENV_VAR = f'{SKYPILOT_ENV_VAR_PREFIX}TASK_IDS'
 # cluster yaml is updated.
 #
 # TODO(zongheng,zhanghao): make the upgrading of skylet automatic?
-SKYLET_VERSION = '
+SKYLET_VERSION = '18'
 # The version of the lib files that skylet/jobs use. Whenever there is an API
 # change for the job_lib or log_lib, we need to bump this version, so that the
 # user can be notified to update their SkyPilot version on the remote cluster.

@@ -374,6 +374,7 @@ OVERRIDEABLE_CONFIG_KEYS_IN_TASK: List[Tuple[str, ...]] = [
     ('ssh', 'pod_config'),
     ('kubernetes', 'custom_metadata'),
     ('kubernetes', 'pod_config'),
+    ('kubernetes', 'context_configs'),
     ('kubernetes', 'provision_timeout'),
     ('kubernetes', 'dws'),
     ('kubernetes', 'kueue'),

@@ -421,6 +422,7 @@ SKY_USER_FILE_PATH = '~/.sky/generated'
 # TODO(cooperc): Update all env vars to begin with SKYPILOT_ or SKYPILOT_SERVER_
 # Environment variable that is set to 'true' if this is a skypilot server.
 ENV_VAR_IS_SKYPILOT_SERVER = 'IS_SKYPILOT_SERVER'
+OVERRIDE_CONSOLIDATION_MODE = 'IS_SKYPILOT_JOB_CONTROLLER'

 # Environment variable that is set to 'true' if metrics are enabled.
 ENV_VAR_SERVER_METRICS_ENABLED = 'SKY_API_SERVER_METRICS_ENABLED'

sky/skylet/events.py
CHANGED
@@ -11,7 +11,7 @@ import psutil
 from sky import clouds
 from sky import sky_logging
 from sky.backends import cloud_vm_ray_backend
-from sky.jobs import scheduler as managed_job_scheduler
+from sky.jobs import scheduler
 from sky.jobs import state as managed_job_state
 from sky.jobs import utils as managed_job_utils
 from sky.serve import serve_utils

@@ -76,15 +76,7 @@ class ManagedJobEvent(SkyletEvent):
     def _run(self):
         logger.info('=== Updating managed job status ===')
         managed_job_utils.update_managed_jobs_statuses()
-
-
-class ManagedJobSchedulingEvent(SkyletEvent):
-    """Skylet event for scheduling managed jobs."""
-    EVENT_INTERVAL_SECONDS = 20
-
-    def _run(self):
-        logger.info('=== Scheduling next jobs ===')
-        managed_job_scheduler.maybe_schedule_next_jobs()
+        scheduler.maybe_start_controllers()


 class ServiceUpdateEvent(SkyletEvent):

sky/utils/command_runner.pyi
CHANGED
sky/utils/common_utils.py
CHANGED
@@ -996,7 +996,17 @@ def get_mem_size_gb() -> float:
         except ValueError as e:
             with ux_utils.print_exception_no_traceback():
                 raise ValueError(
-                    f'Failed to parse the memory size from {mem_size}'
+                    f'Failed to parse the memory size from {mem_size} (GB)'
+                ) from e
+    mem_size = os.getenv('SKYPILOT_POD_MEMORY_BYTES_LIMIT')
+    if mem_size is not None:
+        try:
+            return float(mem_size) / (1024**3)
+        except ValueError as e:
+            with ux_utils.print_exception_no_traceback():
+                raise ValueError(
+                    f'Failed to parse the memory size from {mem_size} (bytes)'
+                ) from e
     return _mem_size_gb()


@@ -1098,13 +1108,15 @@ def release_memory():
     """Release the process memory"""
     # Do the best effort to release the python heap and let malloc_trim
     # be more efficient.
-
-
-
+    try:
+        gc.collect()
+        if sys.platform.startswith('linux'):
             # Will fail on musl (alpine), but at least it works on our
             # offical docker images.
             libc = ctypes.CDLL('libc.so.6')
             return libc.malloc_trim(0)
-
-
-
+        return 0
+    except Exception as e:  # pylint: disable=broad-except
+        logger.error(f'Failed to release memory: '
+                     f'{format_exception(e)}')
+        return 0

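
The new fallback in get_mem_size_gb() reads SKYPILOT_POD_MEMORY_BYTES_LIMIT as a raw byte count and converts it to GiB, which is why the two error messages are now tagged '(GB)' and '(bytes)'. A quick sketch of the conversion; the 8 GiB value is illustrative:

import os

os.environ['SKYPILOT_POD_MEMORY_BYTES_LIMIT'] = str(8 * 1024**3)  # 8 GiB limit

limit_bytes = os.getenv('SKYPILOT_POD_MEMORY_BYTES_LIMIT')
if limit_bytes is not None:
    print(float(limit_bytes) / (1024**3))  # -> 8.0
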
sky/utils/controller_utils.py
CHANGED
@@ -228,6 +228,11 @@ def get_controller_for_pool(pool: bool) -> Controllers:
 def high_availability_specified(cluster_name: Optional[str]) -> bool:
     """Check if the controller high availability is specified in user config.
     """
+    # pylint: disable=import-outside-toplevel
+    from sky.jobs import utils as managed_job_utils
+    if managed_job_utils.is_consolidation_mode():
+        return True
+
     controller = Controllers.from_name(cluster_name)
     if controller is None:
         return False

sky/utils/db/db_utils.py
CHANGED
@@ -7,12 +7,13 @@ import pathlib
 import sqlite3
 import threading
 import typing
-from typing import Any, Callable, Dict, Iterable, Optional
+from typing import Any, Callable, Dict, Iterable, Literal, Optional, Union

 import aiosqlite
 import aiosqlite.context
 import sqlalchemy
 from sqlalchemy import exc as sqlalchemy_exc
+from sqlalchemy.ext import asyncio as sqlalchemy_async

 from sky import sky_logging
 from sky.skylet import constants

@@ -375,11 +376,34 @@ def get_max_connections():
     return _max_connections


-def get_engine(db_name: str):
+@typing.overload
+def get_engine(
+        db_name: str,
+        async_engine: Literal[False] = False) -> sqlalchemy.engine.Engine:
+    ...
+
+
+@typing.overload
+def get_engine(db_name: str,
+               async_engine: Literal[True]) -> sqlalchemy_async.AsyncEngine:
+    ...
+
+
+def get_engine(
+    db_name: str,
+    async_engine: bool = False
+) -> Union[sqlalchemy.engine.Engine, sqlalchemy_async.AsyncEngine]:
     conn_string = None
     if os.environ.get(constants.ENV_VAR_IS_SKYPILOT_SERVER) is not None:
         conn_string = os.environ.get(constants.ENV_VAR_DB_CONNECTION_URI)
     if conn_string:
+        if async_engine:
+            conn_string = conn_string.replace('postgresql://',
+                                              'postgresql+asyncpg://')
+            # This is an AsyncEngine, instead of a (normal, synchronous) Engine,
+            # so we should not put it in the cache. Instead, just return.
+            return sqlalchemy_async.create_async_engine(
+                conn_string, poolclass=sqlalchemy.NullPool)
         with _db_creation_lock:
             if conn_string not in _postgres_engine_cache:
                 if _max_connections == 0:

@@ -401,6 +425,11 @@ def get_engine(db_name: str):
     else:
         db_path = os.path.expanduser(f'~/.sky/{db_name}.db')
         pathlib.Path(db_path).parents[0].mkdir(parents=True, exist_ok=True)
+        if async_engine:
+            # This is an AsyncEngine, instead of a (normal, synchronous) Engine,
+            # so we should not put it in the cache. Instead, just return.
+            return sqlalchemy_async.create_async_engine(
+                'sqlite+aiosqlite:///' + db_path, connect_args={'timeout': 30})
         if db_path not in _sqlite_engine_cache:
             _sqlite_engine_cache[db_path] = sqlalchemy.create_engine(
                 'sqlite:///' + db_path)

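
A sketch of how a caller might consume the new async_engine=True overload; the 'spot_jobs' database name is illustrative. Because async engines are not cached, the caller owns disposal:

import asyncio

import sqlalchemy

from sky.utils.db import db_utils

async def ping_db() -> None:
    engine = db_utils.get_engine('spot_jobs', async_engine=True)
    try:
        async with engine.connect() as conn:
            result = await conn.execute(sqlalchemy.text('SELECT 1'))
            print(result.scalar())
    finally:
        await engine.dispose()  # not cached, so dispose explicitly

asyncio.run(ping_db())
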
sky/utils/rich_utils.py
CHANGED
@@ -421,7 +421,7 @@ async def decode_rich_status_async(
     undecoded_buffer = b''

     # Iterate over the response content in chunks
-    async for chunk in response.content.
+    async for chunk, _ in response.content.iter_chunks():
         if chunk is None:
             return

@@ -481,6 +481,8 @@ async def decode_rich_status_async(
                 line = line[:-2] + '\n'
             is_payload, line = message_utils.decode_payload(
                 line, raise_for_mismatch=False)
+            if line is None:
+                continue
             control = None
             if is_payload:
                 control, encoded_status = Control.decode(line)

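
The 'chunk, _' unpacking above works because aiohttp's StreamReader.iter_chunks() yields (data, end_of_http_chunk) tuples rather than plain bytes. A small standalone sketch; the URL is illustrative:

import asyncio

import aiohttp

async def stream(url: str = 'http://localhost:8000/stream') -> None:
    async with aiohttp.ClientSession() as session:
        async with session.get(url) as response:
            # Each item is (bytes, end_of_http_chunk: bool).
            async for data, end_of_chunk in response.content.iter_chunks():
                print(len(data), end_of_chunk)

asyncio.run(stream())
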
sky/utils/subprocess_utils.py
CHANGED
@@ -437,3 +437,12 @@ def slow_start_processes(processes: List[Startable],
             break
         batch_size = min(batch_size * 2, max_batch_size)
         time.sleep(delay)
+
+
+def is_process_alive(pid: int) -> bool:
+    """Check if a process is alive."""
+    try:
+        process = psutil.Process(pid)
+        return process.is_running()
+    except psutil.NoSuchProcess:
+        return False

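
A quick usage sketch for the new helper. One caveat worth keeping in mind: psutil.Process.is_running() also returns True for zombie processes, so an exited-but-unreaped child still reports as alive:

import subprocess

from sky.utils import subprocess_utils

proc = subprocess.Popen(['sleep', '1'])
print(subprocess_utils.is_process_alive(proc.pid))  # True while running
proc.wait()                                         # reap the child
print(subprocess_utils.is_process_alive(proc.pid))  # False after exit
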
sky/volumes/volume.py
CHANGED
@@ -150,6 +150,8 @@ class Volume:
         self.region, self.zone = cloud_obj.validate_region_zone(
             self.region, self.zone)

+        # Name must be set by factory before validation.
+        assert self.name is not None
         valid, err_msg = cloud_obj.is_volume_name_valid(self.name)
         if not valid:
             raise ValueError(f'Invalid volume name: {err_msg}')
