skypilot-nightly 1.0.0.dev20250828__py3-none-any.whl → 1.0.0.dev20250829__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of skypilot-nightly might be problematic. Click here for more details.
- sky/__init__.py +2 -2
- sky/backends/backend_utils.py +130 -40
- sky/backends/cloud_vm_ray_backend.py +19 -3
- sky/backends/wheel_utils.py +35 -8
- sky/clouds/aws.py +118 -1
- sky/dashboard/out/404.html +1 -1
- sky/dashboard/out/_next/static/chunks/{webpack-6dae1cd599a34def.js → webpack-6e76f636a048e145.js} +1 -1
- sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
- sky/dashboard/out/clusters/[cluster].html +1 -1
- sky/dashboard/out/clusters.html +1 -1
- sky/dashboard/out/config.html +1 -1
- sky/dashboard/out/index.html +1 -1
- sky/dashboard/out/infra/[context].html +1 -1
- sky/dashboard/out/infra.html +1 -1
- sky/dashboard/out/jobs/[job].html +1 -1
- sky/dashboard/out/jobs/pools/[pool].html +1 -1
- sky/dashboard/out/jobs.html +1 -1
- sky/dashboard/out/users.html +1 -1
- sky/dashboard/out/volumes.html +1 -1
- sky/dashboard/out/workspace/new.html +1 -1
- sky/dashboard/out/workspaces/[name].html +1 -1
- sky/dashboard/out/workspaces.html +1 -1
- sky/global_user_state.py +58 -10
- sky/jobs/server/server.py +2 -1
- sky/provision/aws/config.py +78 -3
- sky/provision/aws/instance.py +45 -6
- sky/provision/kubernetes/utils.py +9 -0
- sky/schemas/db/global_user_state/007_cluster_event_request_id.py +34 -0
- sky/serve/server/server.py +2 -1
- sky/server/common.py +1 -2
- sky/server/daemons.py +6 -0
- sky/server/requests/executor.py +3 -2
- sky/server/requests/payloads.py +3 -1
- sky/server/requests/preconditions.py +3 -2
- sky/server/requests/requests.py +110 -29
- sky/server/server.py +70 -61
- sky/server/stream_utils.py +7 -5
- sky/setup_files/dependencies.py +6 -1
- sky/sky_logging.py +28 -0
- sky/skylet/constants.py +6 -0
- sky/templates/aws-ray.yml.j2 +1 -0
- sky/utils/annotations.py +8 -2
- sky/utils/cluster_utils.py +3 -3
- sky/utils/db/db_utils.py +11 -0
- sky/utils/db/migration_utils.py +1 -1
- sky/utils/kubernetes_enums.py +1 -0
- sky/utils/lock_events.py +94 -0
- sky/utils/timeline.py +24 -93
- {skypilot_nightly-1.0.0.dev20250828.dist-info → skypilot_nightly-1.0.0.dev20250829.dist-info}/METADATA +8 -2
- {skypilot_nightly-1.0.0.dev20250828.dist-info → skypilot_nightly-1.0.0.dev20250829.dist-info}/RECORD +56 -54
- /sky/dashboard/out/_next/static/{9DW6d9jaP2kZt0NcgIfFa → hYJYFIxp_ZFONR4wTIJqZ}/_buildManifest.js +0 -0
- /sky/dashboard/out/_next/static/{9DW6d9jaP2kZt0NcgIfFa → hYJYFIxp_ZFONR4wTIJqZ}/_ssgManifest.js +0 -0
- {skypilot_nightly-1.0.0.dev20250828.dist-info → skypilot_nightly-1.0.0.dev20250829.dist-info}/WHEEL +0 -0
- {skypilot_nightly-1.0.0.dev20250828.dist-info → skypilot_nightly-1.0.0.dev20250829.dist-info}/entry_points.txt +0 -0
- {skypilot_nightly-1.0.0.dev20250828.dist-info → skypilot_nightly-1.0.0.dev20250829.dist-info}/licenses/LICENSE +0 -0
- {skypilot_nightly-1.0.0.dev20250828.dist-info → skypilot_nightly-1.0.0.dev20250829.dist-info}/top_level.txt +0 -0
sky/server/daemons.py
CHANGED
|
@@ -7,8 +7,10 @@ from typing import Callable
|
|
|
7
7
|
from sky import sky_logging
|
|
8
8
|
from sky import skypilot_config
|
|
9
9
|
from sky.server import constants as server_constants
|
|
10
|
+
from sky.utils import annotations
|
|
10
11
|
from sky.utils import common
|
|
11
12
|
from sky.utils import env_options
|
|
13
|
+
from sky.utils import timeline
|
|
12
14
|
from sky.utils import ux_utils
|
|
13
15
|
|
|
14
16
|
logger = sky_logging.init_logger(__name__)
|
|
@@ -67,6 +69,10 @@ class InternalRequestDaemon:
|
|
|
67
69
|
sky_logging.reload_logger()
|
|
68
70
|
level = self.refresh_log_level()
|
|
69
71
|
self.event_fn()
|
|
72
|
+
# Clear request level cache after each run to avoid
|
|
73
|
+
# using too much memory.
|
|
74
|
+
annotations.clear_request_level_cache()
|
|
75
|
+
timeline.save_timeline()
|
|
70
76
|
except Exception: # pylint: disable=broad-except
|
|
71
77
|
# It is OK to fail to run the event, as the event is not
|
|
72
78
|
# critical, but we should log the error.
|
sky/server/requests/executor.py
CHANGED
|
@@ -383,7 +383,8 @@ def _request_execution_wrapper(request_id: str,
|
|
|
383
383
|
# config, as there can be some logs during override that needs to be
|
|
384
384
|
# captured in the log file.
|
|
385
385
|
try:
|
|
386
|
-
with
|
|
386
|
+
with sky_logging.add_debug_log_handler(request_id), \
|
|
387
|
+
override_request_env_and_config(request_body, request_id), \
|
|
387
388
|
tempstore.tempdir():
|
|
388
389
|
if sky_logging.logging_enabled(logger, sky_logging.DEBUG):
|
|
389
390
|
config = skypilot_config.to_dict()
|
|
@@ -452,7 +453,7 @@ async def execute_request_coroutine(request: api_requests.Request):
|
|
|
452
453
|
**request_body.to_kwargs())
|
|
453
454
|
|
|
454
455
|
async def poll_task(request_id: str) -> bool:
|
|
455
|
-
request = api_requests.
|
|
456
|
+
request = await api_requests.get_request_async(request_id)
|
|
456
457
|
if request is None:
|
|
457
458
|
raise RuntimeError('Request not found')
|
|
458
459
|
|
sky/server/requests/payloads.py
CHANGED
|
@@ -71,7 +71,9 @@ EXTERNAL_LOCAL_ENV_VARS = [
|
|
|
71
71
|
def request_body_env_vars() -> dict:
|
|
72
72
|
env_vars = {}
|
|
73
73
|
for env_var in os.environ:
|
|
74
|
-
if env_var.startswith(constants.SKYPILOT_ENV_VAR_PREFIX)
|
|
74
|
+
if (env_var.startswith(constants.SKYPILOT_ENV_VAR_PREFIX) and
|
|
75
|
+
not env_var.startswith(
|
|
76
|
+
constants.SKYPILOT_SERVER_ENV_VAR_PREFIX)):
|
|
75
77
|
env_vars[env_var] = os.environ[env_var]
|
|
76
78
|
if common.is_api_server_local() and env_var in EXTERNAL_LOCAL_ENV_VARS:
|
|
77
79
|
env_vars[env_var] = os.environ[env_var]
|
|
@@ -98,7 +98,7 @@ class Precondition(abc.ABC):
|
|
|
98
98
|
return False
|
|
99
99
|
|
|
100
100
|
# Check if the request has been cancelled
|
|
101
|
-
request = api_requests.
|
|
101
|
+
request = await api_requests.get_request_async(self.request_id)
|
|
102
102
|
if request is None:
|
|
103
103
|
logger.error(f'Request {self.request_id} not found')
|
|
104
104
|
return False
|
|
@@ -112,7 +112,8 @@ class Precondition(abc.ABC):
|
|
|
112
112
|
return True
|
|
113
113
|
if status_msg is not None and status_msg != last_status_msg:
|
|
114
114
|
# Update the status message if it has changed.
|
|
115
|
-
with api_requests.
|
|
115
|
+
async with api_requests.update_request_async(
|
|
116
|
+
self.request_id) as req:
|
|
116
117
|
assert req is not None, self.request_id
|
|
117
118
|
req.status_msg = status_msg
|
|
118
119
|
last_status_msg = status_msg
|
sky/server/requests/requests.py
CHANGED
|
@@ -13,7 +13,8 @@ import sqlite3
|
|
|
13
13
|
import threading
|
|
14
14
|
import time
|
|
15
15
|
import traceback
|
|
16
|
-
from typing import Any, Callable, Dict, Generator, List,
|
|
16
|
+
from typing import (Any, AsyncContextManager, Callable, Dict, Generator, List,
|
|
17
|
+
Optional, Tuple)
|
|
17
18
|
|
|
18
19
|
import colorama
|
|
19
20
|
import filelock
|
|
@@ -402,26 +403,46 @@ _DB = None
|
|
|
402
403
|
_init_db_lock = threading.Lock()
|
|
403
404
|
|
|
404
405
|
|
|
406
|
+
def _init_db_within_lock():
|
|
407
|
+
global _DB
|
|
408
|
+
if _DB is None:
|
|
409
|
+
db_path = os.path.expanduser(
|
|
410
|
+
server_constants.API_SERVER_REQUEST_DB_PATH)
|
|
411
|
+
pathlib.Path(db_path).parents[0].mkdir(parents=True, exist_ok=True)
|
|
412
|
+
_DB = db_utils.SQLiteConn(db_path, create_table)
|
|
413
|
+
|
|
414
|
+
|
|
405
415
|
def init_db(func):
|
|
406
416
|
"""Initialize the database."""
|
|
407
417
|
|
|
408
418
|
@functools.wraps(func)
|
|
409
419
|
def wrapper(*args, **kwargs):
|
|
410
|
-
global _DB
|
|
411
420
|
if _DB is not None:
|
|
412
421
|
return func(*args, **kwargs)
|
|
413
422
|
with _init_db_lock:
|
|
414
|
-
|
|
415
|
-
db_path = os.path.expanduser(
|
|
416
|
-
server_constants.API_SERVER_REQUEST_DB_PATH)
|
|
417
|
-
pathlib.Path(db_path).parents[0].mkdir(parents=True,
|
|
418
|
-
exist_ok=True)
|
|
419
|
-
_DB = db_utils.SQLiteConn(db_path, create_table)
|
|
423
|
+
_init_db_within_lock()
|
|
420
424
|
return func(*args, **kwargs)
|
|
421
425
|
|
|
422
426
|
return wrapper
|
|
423
427
|
|
|
424
428
|
|
|
429
|
+
def init_db_async(func):
|
|
430
|
+
"""Async version of init_db."""
|
|
431
|
+
|
|
432
|
+
@functools.wraps(func)
|
|
433
|
+
async def wrapper(*args, **kwargs):
|
|
434
|
+
if _DB is not None:
|
|
435
|
+
return await func(*args, **kwargs)
|
|
436
|
+
# If _DB is not initialized, init_db_async will be blocked if there
|
|
437
|
+
# is a thread initializing _DB; this is fine since it occurs on process
|
|
438
|
+
# startup.
|
|
439
|
+
with _init_db_lock:
|
|
440
|
+
_init_db_within_lock()
|
|
441
|
+
return await func(*args, **kwargs)
|
|
442
|
+
|
|
443
|
+
return wrapper
|
|
444
|
+
|
|
445
|
+
|
|
425
446
|
def reset_db_and_logs():
|
|
426
447
|
"""Create the database."""
|
|
427
448
|
server_common.clear_local_api_server_database()
|
|
@@ -440,28 +461,61 @@ def request_lock_path(request_id: str) -> str:
|
|
|
440
461
|
@contextlib.contextmanager
|
|
441
462
|
@init_db
|
|
442
463
|
def update_request(request_id: str) -> Generator[Optional[Request], None, None]:
|
|
443
|
-
"""Get a SkyPilot API request."""
|
|
464
|
+
"""Get and update a SkyPilot API request."""
|
|
444
465
|
request = _get_request_no_lock(request_id)
|
|
445
466
|
yield request
|
|
446
467
|
if request is not None:
|
|
447
468
|
_add_or_update_request_no_lock(request)
|
|
448
469
|
|
|
449
470
|
|
|
471
|
+
@init_db
|
|
472
|
+
def update_request_async(
|
|
473
|
+
request_id: str) -> AsyncContextManager[Optional[Request]]:
|
|
474
|
+
"""Async version of update_request.
|
|
475
|
+
|
|
476
|
+
Returns an async context manager that yields the request record and
|
|
477
|
+
persists any in-place updates upon exit.
|
|
478
|
+
"""
|
|
479
|
+
|
|
480
|
+
@contextlib.asynccontextmanager
|
|
481
|
+
async def _cm():
|
|
482
|
+
request = await _get_request_no_lock_async(request_id)
|
|
483
|
+
try:
|
|
484
|
+
yield request
|
|
485
|
+
finally:
|
|
486
|
+
if request is not None:
|
|
487
|
+
await _add_or_update_request_no_lock_async(request)
|
|
488
|
+
|
|
489
|
+
return _cm()
|
|
490
|
+
|
|
491
|
+
|
|
492
|
+
_get_request_sql = (f'SELECT {", ".join(REQUEST_COLUMNS)} FROM {REQUEST_TABLE} '
|
|
493
|
+
'WHERE request_id LIKE ?')
|
|
494
|
+
|
|
495
|
+
|
|
450
496
|
def _get_request_no_lock(request_id: str) -> Optional[Request]:
|
|
451
497
|
"""Get a SkyPilot API request."""
|
|
452
498
|
assert _DB is not None
|
|
453
|
-
columns_str = ', '.join(REQUEST_COLUMNS)
|
|
454
499
|
with _DB.conn:
|
|
455
500
|
cursor = _DB.conn.cursor()
|
|
456
|
-
cursor.execute(
|
|
457
|
-
f'SELECT {columns_str} FROM {REQUEST_TABLE} '
|
|
458
|
-
'WHERE request_id LIKE ?', (request_id + '%',))
|
|
501
|
+
cursor.execute(_get_request_sql, (request_id + '%',))
|
|
459
502
|
row = cursor.fetchone()
|
|
460
503
|
if row is None:
|
|
461
504
|
return None
|
|
462
505
|
return Request.from_row(row)
|
|
463
506
|
|
|
464
507
|
|
|
508
|
+
async def _get_request_no_lock_async(request_id: str) -> Optional[Request]:
|
|
509
|
+
"""Async version of _get_request_no_lock."""
|
|
510
|
+
assert _DB is not None
|
|
511
|
+
conn = await _DB.async_conn()
|
|
512
|
+
async with conn.execute(_get_request_sql, (request_id + '%',)) as cursor:
|
|
513
|
+
row = await cursor.fetchone()
|
|
514
|
+
if row is None:
|
|
515
|
+
return None
|
|
516
|
+
return Request.from_row(row)
|
|
517
|
+
|
|
518
|
+
|
|
465
519
|
@init_db
|
|
466
520
|
def get_latest_request_id() -> Optional[str]:
|
|
467
521
|
"""Get the latest request ID."""
|
|
@@ -481,6 +535,13 @@ def get_request(request_id: str) -> Optional[Request]:
|
|
|
481
535
|
return _get_request_no_lock(request_id)
|
|
482
536
|
|
|
483
537
|
|
|
538
|
+
@init_db_async
|
|
539
|
+
async def get_request_async(request_id: str) -> Optional[Request]:
|
|
540
|
+
"""Async version of get_request."""
|
|
541
|
+
async with filelock.AsyncFileLock(request_lock_path(request_id)):
|
|
542
|
+
return await _get_request_no_lock_async(request_id)
|
|
543
|
+
|
|
544
|
+
|
|
484
545
|
@init_db
|
|
485
546
|
def create_if_not_exists(request: Request) -> bool:
|
|
486
547
|
"""Create a SkyPilot API request if it does not exist."""
|
|
@@ -491,6 +552,16 @@ def create_if_not_exists(request: Request) -> bool:
|
|
|
491
552
|
return True
|
|
492
553
|
|
|
493
554
|
|
|
555
|
+
@init_db_async
|
|
556
|
+
async def create_if_not_exists_async(request: Request) -> bool:
|
|
557
|
+
"""Async version of create_if_not_exists."""
|
|
558
|
+
async with filelock.AsyncFileLock(request_lock_path(request.request_id)):
|
|
559
|
+
if await _get_request_no_lock_async(request.request_id) is not None:
|
|
560
|
+
return False
|
|
561
|
+
await _add_or_update_request_no_lock_async(request)
|
|
562
|
+
return True
|
|
563
|
+
|
|
564
|
+
|
|
494
565
|
@init_db
|
|
495
566
|
def get_request_tasks(
|
|
496
567
|
status: Optional[List[RequestStatus]] = None,
|
|
@@ -565,16 +636,15 @@ def get_request_tasks(
|
|
|
565
636
|
return requests
|
|
566
637
|
|
|
567
638
|
|
|
568
|
-
@
|
|
569
|
-
def get_api_request_ids_start_with(incomplete: str) -> List[str]:
|
|
639
|
+
@init_db_async
|
|
640
|
+
async def get_api_request_ids_start_with(incomplete: str) -> List[str]:
|
|
570
641
|
"""Get a list of API request ids for shell completion."""
|
|
571
642
|
assert _DB is not None
|
|
572
|
-
|
|
573
|
-
|
|
574
|
-
|
|
575
|
-
|
|
576
|
-
|
|
577
|
-
f"""SELECT request_id FROM {REQUEST_TABLE}
|
|
643
|
+
conn = await _DB.async_conn()
|
|
644
|
+
# Prioritize alive requests (PENDING, RUNNING) over finished ones,
|
|
645
|
+
# then order by creation time (newest first) within each category.
|
|
646
|
+
async with conn.execute(
|
|
647
|
+
f"""SELECT request_id FROM {REQUEST_TABLE}
|
|
578
648
|
WHERE request_id LIKE ?
|
|
579
649
|
ORDER BY
|
|
580
650
|
CASE
|
|
@@ -582,21 +652,32 @@ def get_api_request_ids_start_with(incomplete: str) -> List[str]:
|
|
|
582
652
|
ELSE 1
|
|
583
653
|
END,
|
|
584
654
|
created_at DESC
|
|
585
|
-
LIMIT 1000""", (f'{incomplete}%',))
|
|
586
|
-
|
|
655
|
+
LIMIT 1000""", (f'{incomplete}%',)) as cursor:
|
|
656
|
+
rows = await cursor.fetchall()
|
|
657
|
+
if rows is None:
|
|
658
|
+
return []
|
|
659
|
+
return [row[0] for row in rows]
|
|
660
|
+
|
|
661
|
+
|
|
662
|
+
_add_or_update_request_sql = (f'INSERT OR REPLACE INTO {REQUEST_TABLE} '
|
|
663
|
+
f'({", ".join(REQUEST_COLUMNS)}) VALUES '
|
|
664
|
+
f'({", ".join(["?"] * len(REQUEST_COLUMNS))})')
|
|
587
665
|
|
|
588
666
|
|
|
589
667
|
def _add_or_update_request_no_lock(request: Request):
|
|
590
668
|
"""Add or update a REST request into the database."""
|
|
591
|
-
row = request.to_row()
|
|
592
|
-
key_str = ', '.join(REQUEST_COLUMNS)
|
|
593
|
-
fill_str = ', '.join(['?'] * len(row))
|
|
594
669
|
assert _DB is not None
|
|
595
670
|
with _DB.conn:
|
|
596
671
|
cursor = _DB.conn.cursor()
|
|
597
|
-
cursor.execute(
|
|
598
|
-
|
|
599
|
-
|
|
672
|
+
cursor.execute(_add_or_update_request_sql, request.to_row())
|
|
673
|
+
|
|
674
|
+
|
|
675
|
+
async def _add_or_update_request_no_lock_async(request: Request):
|
|
676
|
+
"""Async version of _add_or_update_request_no_lock."""
|
|
677
|
+
assert _DB is not None
|
|
678
|
+
conn = await _DB.async_conn()
|
|
679
|
+
await conn.execute(_add_or_update_request_sql, request.to_row())
|
|
680
|
+
await conn.commit()
|
|
600
681
|
|
|
601
682
|
|
|
602
683
|
def set_request_failed(request_id: str, e: BaseException) -> None:
|
sky/server/server.py
CHANGED
|
@@ -21,6 +21,7 @@ import uuid
|
|
|
21
21
|
import zipfile
|
|
22
22
|
|
|
23
23
|
import aiofiles
|
|
24
|
+
import anyio
|
|
24
25
|
import fastapi
|
|
25
26
|
from fastapi.middleware import cors
|
|
26
27
|
import starlette.middleware.base
|
|
@@ -847,7 +848,7 @@ async def upload_zip_file(request: fastapi.Request, user_hash: str,
|
|
|
847
848
|
client_file_mounts_dir = (
|
|
848
849
|
common.API_SERVER_CLIENT_DIR.expanduser().resolve() / user_id /
|
|
849
850
|
'file_mounts')
|
|
850
|
-
client_file_mounts_dir.mkdir(parents=True, exist_ok=True)
|
|
851
|
+
await anyio.Path(client_file_mounts_dir).mkdir(parents=True, exist_ok=True)
|
|
851
852
|
|
|
852
853
|
# Check upload_id to be a valid SkyPilot run_timestamp appended with 8 hex
|
|
853
854
|
# characters, e.g. 'sky-2025-01-17-09-10-13-933602-35d31c22'.
|
|
@@ -870,7 +871,7 @@ async def upload_zip_file(request: fastapi.Request, user_hash: str,
|
|
|
870
871
|
zip_file_path = client_file_mounts_dir / f'{upload_id}.zip'
|
|
871
872
|
else:
|
|
872
873
|
chunk_dir = client_file_mounts_dir / upload_id
|
|
873
|
-
chunk_dir.mkdir(parents=True, exist_ok=True)
|
|
874
|
+
await anyio.Path(chunk_dir).mkdir(parents=True, exist_ok=True)
|
|
874
875
|
zip_file_path = chunk_dir / f'part{chunk_index}.incomplete'
|
|
875
876
|
|
|
876
877
|
try:
|
|
@@ -916,9 +917,9 @@ async def upload_zip_file(request: fastapi.Request, user_hash: str,
|
|
|
916
917
|
await zip_file.write(data)
|
|
917
918
|
|
|
918
919
|
logger.info(f'Uploaded zip file: {zip_file_path}')
|
|
919
|
-
unzip_file(zip_file_path, client_file_mounts_dir)
|
|
920
|
+
await unzip_file(zip_file_path, client_file_mounts_dir)
|
|
920
921
|
if total_chunks > 1:
|
|
921
|
-
shutil.rmtree
|
|
922
|
+
await context_utils.to_thread(shutil.rmtree, chunk_dir)
|
|
922
923
|
return payloads.UploadZipFileResponse(
|
|
923
924
|
status=responses.UploadStatus.COMPLETED.value)
|
|
924
925
|
|
|
@@ -933,61 +934,69 @@ def _is_relative_to(path: pathlib.Path, parent: pathlib.Path) -> bool:
|
|
|
933
934
|
return False
|
|
934
935
|
|
|
935
936
|
|
|
936
|
-
def unzip_file(zip_file_path: pathlib.Path,
|
|
937
|
-
|
|
938
|
-
"""Unzips a zip file."""
|
|
939
|
-
|
|
940
|
-
|
|
941
|
-
|
|
942
|
-
|
|
943
|
-
|
|
944
|
-
|
|
945
|
-
|
|
946
|
-
|
|
947
|
-
|
|
937
|
+
async def unzip_file(zip_file_path: pathlib.Path,
|
|
938
|
+
client_file_mounts_dir: pathlib.Path) -> None:
|
|
939
|
+
"""Unzips a zip file without blocking the event loop."""
|
|
940
|
+
|
|
941
|
+
def _do_unzip() -> None:
|
|
942
|
+
try:
|
|
943
|
+
with zipfile.ZipFile(zip_file_path, 'r') as zipf:
|
|
944
|
+
for member in zipf.infolist():
|
|
945
|
+
# Determine the new path
|
|
946
|
+
original_path = os.path.normpath(member.filename)
|
|
947
|
+
new_path = client_file_mounts_dir / original_path.lstrip(
|
|
948
|
+
'/')
|
|
949
|
+
|
|
950
|
+
if (member.external_attr >> 28) == 0xA:
|
|
951
|
+
# Symlink. Read the target path and create a symlink.
|
|
952
|
+
new_path.parent.mkdir(parents=True, exist_ok=True)
|
|
953
|
+
target = zipf.read(member).decode()
|
|
954
|
+
assert not os.path.isabs(target), target
|
|
955
|
+
# Since target is a relative path, we need to check that
|
|
956
|
+
# it is under `client_file_mounts_dir` for security.
|
|
957
|
+
full_target_path = (new_path.parent / target).resolve()
|
|
958
|
+
if not _is_relative_to(full_target_path,
|
|
959
|
+
client_file_mounts_dir):
|
|
960
|
+
raise ValueError(
|
|
961
|
+
f'Symlink target {target} leads to a '
|
|
962
|
+
'file not in userspace. Aborted.')
|
|
963
|
+
|
|
964
|
+
if new_path.exists() or new_path.is_symlink():
|
|
965
|
+
new_path.unlink(missing_ok=True)
|
|
966
|
+
new_path.symlink_to(
|
|
967
|
+
target,
|
|
968
|
+
target_is_directory=member.filename.endswith('/'))
|
|
969
|
+
continue
|
|
970
|
+
|
|
971
|
+
# Handle directories
|
|
972
|
+
if member.filename.endswith('/'):
|
|
973
|
+
new_path.mkdir(parents=True, exist_ok=True)
|
|
974
|
+
continue
|
|
975
|
+
|
|
976
|
+
# Handle files
|
|
948
977
|
new_path.parent.mkdir(parents=True, exist_ok=True)
|
|
949
|
-
|
|
950
|
-
|
|
951
|
-
|
|
952
|
-
|
|
953
|
-
|
|
954
|
-
|
|
955
|
-
|
|
956
|
-
|
|
957
|
-
|
|
958
|
-
|
|
959
|
-
|
|
960
|
-
|
|
961
|
-
|
|
962
|
-
|
|
963
|
-
|
|
964
|
-
|
|
965
|
-
|
|
966
|
-
|
|
967
|
-
|
|
968
|
-
|
|
969
|
-
continue
|
|
970
|
-
|
|
971
|
-
# Handle files
|
|
972
|
-
new_path.parent.mkdir(parents=True, exist_ok=True)
|
|
973
|
-
with zipf.open(member) as member_file, new_path.open('wb') as f:
|
|
974
|
-
# Use shutil.copyfileobj to copy files in chunks, so it does
|
|
975
|
-
# not load the entire file into memory.
|
|
976
|
-
shutil.copyfileobj(member_file, f)
|
|
977
|
-
except zipfile.BadZipFile as e:
|
|
978
|
-
logger.error(f'Bad zip file: {zip_file_path}')
|
|
979
|
-
raise fastapi.HTTPException(
|
|
980
|
-
status_code=400,
|
|
981
|
-
detail=f'Invalid zip file: {common_utils.format_exception(e)}')
|
|
982
|
-
except Exception as e:
|
|
983
|
-
logger.error(f'Error unzipping file: {zip_file_path}')
|
|
984
|
-
raise fastapi.HTTPException(
|
|
985
|
-
status_code=500,
|
|
986
|
-
detail=(f'Error unzipping file: '
|
|
987
|
-
f'{common_utils.format_exception(e)}'))
|
|
978
|
+
with zipf.open(member) as member_file, new_path.open(
|
|
979
|
+
'wb') as f:
|
|
980
|
+
# Use shutil.copyfileobj to copy files in chunks,
|
|
981
|
+
# so it does not load the entire file into memory.
|
|
982
|
+
shutil.copyfileobj(member_file, f)
|
|
983
|
+
except zipfile.BadZipFile as e:
|
|
984
|
+
logger.error(f'Bad zip file: {zip_file_path}')
|
|
985
|
+
raise fastapi.HTTPException(
|
|
986
|
+
status_code=400,
|
|
987
|
+
detail=f'Invalid zip file: {common_utils.format_exception(e)}')
|
|
988
|
+
except Exception as e:
|
|
989
|
+
logger.error(f'Error unzipping file: {zip_file_path}')
|
|
990
|
+
raise fastapi.HTTPException(
|
|
991
|
+
status_code=500,
|
|
992
|
+
detail=(f'Error unzipping file: '
|
|
993
|
+
f'{common_utils.format_exception(e)}'))
|
|
994
|
+
finally:
|
|
995
|
+
# Cleanup the temporary file regardless of
|
|
996
|
+
# success/failure handling above
|
|
997
|
+
zip_file_path.unlink(missing_ok=True)
|
|
988
998
|
|
|
989
|
-
|
|
990
|
-
zip_file_path.unlink()
|
|
999
|
+
await context_utils.to_thread(_do_unzip)
|
|
991
1000
|
|
|
992
1001
|
|
|
993
1002
|
@app.post('/launch')
|
|
@@ -1388,7 +1397,7 @@ async def local_down(request: fastapi.Request) -> None:
|
|
|
1388
1397
|
async def api_get(request_id: str) -> payloads.RequestPayload:
|
|
1389
1398
|
"""Gets a request with a given request ID prefix."""
|
|
1390
1399
|
while True:
|
|
1391
|
-
request_task = requests_lib.
|
|
1400
|
+
request_task = await requests_lib.get_request_async(request_id)
|
|
1392
1401
|
if request_task is None:
|
|
1393
1402
|
print(f'No task with request ID {request_id}', flush=True)
|
|
1394
1403
|
raise fastapi.HTTPException(
|
|
@@ -1477,7 +1486,7 @@ async def stream(
|
|
|
1477
1486
|
|
|
1478
1487
|
# Original plain text streaming logic
|
|
1479
1488
|
if request_id is not None:
|
|
1480
|
-
request_task = requests_lib.
|
|
1489
|
+
request_task = await requests_lib.get_request_async(request_id)
|
|
1481
1490
|
if request_task is None:
|
|
1482
1491
|
print(f'No task with request ID {request_id}')
|
|
1483
1492
|
raise fastapi.HTTPException(
|
|
@@ -1572,7 +1581,7 @@ async def api_status(
|
|
|
1572
1581
|
else:
|
|
1573
1582
|
encoded_request_tasks = []
|
|
1574
1583
|
for request_id in request_ids:
|
|
1575
|
-
request_task = requests_lib.
|
|
1584
|
+
request_task = await requests_lib.get_request_async(request_id)
|
|
1576
1585
|
if request_task is None:
|
|
1577
1586
|
continue
|
|
1578
1587
|
encoded_request_tasks.append(request_task.readable_encode())
|
|
@@ -1782,7 +1791,7 @@ async def complete_volume_name(incomplete: str,) -> List[str]:
|
|
|
1782
1791
|
|
|
1783
1792
|
@app.get('/api/completion/api_request')
|
|
1784
1793
|
async def complete_api_request(incomplete: str,) -> List[str]:
|
|
1785
|
-
return requests_lib.get_api_request_ids_start_with(incomplete)
|
|
1794
|
+
return await requests_lib.get_api_request_ids_start_with(incomplete)
|
|
1786
1795
|
|
|
1787
1796
|
|
|
1788
1797
|
@app.get('/dashboard/{full_path:path}')
|
sky/server/stream_utils.py
CHANGED
|
@@ -56,7 +56,7 @@ async def log_streamer(request_id: Optional[str],
|
|
|
56
56
|
if request_id is not None:
|
|
57
57
|
status_msg = rich_utils.EncodedStatusMessage(
|
|
58
58
|
f'[dim]Checking request: {request_id}[/dim]')
|
|
59
|
-
request_task = requests_lib.
|
|
59
|
+
request_task = await requests_lib.get_request_async(request_id)
|
|
60
60
|
|
|
61
61
|
if request_task is None:
|
|
62
62
|
raise fastapi.HTTPException(
|
|
@@ -86,10 +86,12 @@ async def log_streamer(request_id: Optional[str],
|
|
|
86
86
|
# Use smaller padding (4096 bytes) to force browser rendering
|
|
87
87
|
yield f'{waiting_msg}' + ' ' * 4096 + '\n'
|
|
88
88
|
# Sleep shortly to avoid storming the DB and CPU and allow other
|
|
89
|
-
# coroutines to run.
|
|
90
|
-
#
|
|
89
|
+
# coroutines to run.
|
|
90
|
+
# TODO(aylei): we should use a better mechanism to avoid busy
|
|
91
|
+
# polling the DB, which can be a bottleneck for high-concurrency
|
|
92
|
+
# requests.
|
|
91
93
|
await asyncio.sleep(0.1)
|
|
92
|
-
request_task = requests_lib.
|
|
94
|
+
request_task = await requests_lib.get_request_async(request_id)
|
|
93
95
|
if not follow:
|
|
94
96
|
break
|
|
95
97
|
if show_request_waiting_spinner:
|
|
@@ -151,7 +153,7 @@ async def _tail_log_file(f: aiofiles.threadpool.binary.AsyncBufferedReader,
|
|
|
151
153
|
line: Optional[bytes] = await f.readline()
|
|
152
154
|
if not line:
|
|
153
155
|
if request_id is not None:
|
|
154
|
-
request_task = requests_lib.
|
|
156
|
+
request_task = await requests_lib.get_request_async(request_id)
|
|
155
157
|
if request_task.status > requests_lib.RequestStatus.RUNNING:
|
|
156
158
|
if (request_task.status ==
|
|
157
159
|
requests_lib.RequestStatus.CANCELLED):
|
sky/setup_files/dependencies.py
CHANGED
|
@@ -35,7 +35,8 @@ install_requires = [
|
|
|
35
35
|
# Light weight requirement, can be replaced with "typing" once
|
|
36
36
|
# we deprecate Python 3.7 (this will take a while).
|
|
37
37
|
'typing_extensions',
|
|
38
|
-
|
|
38
|
+
# filelock 3.15.0 or higher is required for async file locking.
|
|
39
|
+
'filelock >= 3.15.0',
|
|
39
40
|
'packaging',
|
|
40
41
|
'psutil',
|
|
41
42
|
'pulp',
|
|
@@ -71,6 +72,8 @@ install_requires = [
|
|
|
71
72
|
'types-paramiko',
|
|
72
73
|
'alembic',
|
|
73
74
|
'aiohttp',
|
|
75
|
+
'aiosqlite',
|
|
76
|
+
'anyio',
|
|
74
77
|
]
|
|
75
78
|
|
|
76
79
|
# See requirements-dev.txt for the version of grpc and protobuf
|
|
@@ -92,8 +95,10 @@ server_dependencies = [
|
|
|
92
95
|
'passlib',
|
|
93
96
|
'pyjwt',
|
|
94
97
|
'aiohttp',
|
|
98
|
+
'anyio',
|
|
95
99
|
GRPC,
|
|
96
100
|
PROTOBUF,
|
|
101
|
+
'aiosqlite',
|
|
97
102
|
]
|
|
98
103
|
|
|
99
104
|
local_ray = [
|
sky/sky_logging.py
CHANGED
|
@@ -19,6 +19,9 @@ _FORMAT = '%(levelname).1s %(asctime)s %(filename)s:%(lineno)d] %(message)s'
|
|
|
19
19
|
_DATE_FORMAT = '%m-%d %H:%M:%S'
|
|
20
20
|
_SENSITIVE_LOGGER = ['sky.provisioner', 'sky.optimizer']
|
|
21
21
|
|
|
22
|
+
_DEBUG_LOG_DIR = os.path.expanduser(
|
|
23
|
+
os.path.join(constants.SKY_LOGS_DIRECTORY, 'request_debug'))
|
|
24
|
+
|
|
22
25
|
DEBUG = logging.DEBUG
|
|
23
26
|
INFO = logging.INFO
|
|
24
27
|
WARNING = logging.WARNING
|
|
@@ -254,3 +257,28 @@ def generate_tmp_logging_file_path(file_name: str) -> str:
|
|
|
254
257
|
log_path = os.path.expanduser(os.path.join(log_dir, file_name))
|
|
255
258
|
|
|
256
259
|
return log_path
|
|
260
|
+
|
|
261
|
+
|
|
262
|
+
@contextlib.contextmanager
|
|
263
|
+
def add_debug_log_handler(request_id: str):
|
|
264
|
+
if os.getenv(constants.ENV_VAR_ENABLE_REQUEST_DEBUG_LOGGING) != 'true':
|
|
265
|
+
yield
|
|
266
|
+
return
|
|
267
|
+
|
|
268
|
+
os.makedirs(_DEBUG_LOG_DIR, exist_ok=True)
|
|
269
|
+
log_path = os.path.join(_DEBUG_LOG_DIR, f'{request_id}.log')
|
|
270
|
+
try:
|
|
271
|
+
debug_log_handler = logging.FileHandler(log_path)
|
|
272
|
+
debug_log_handler.setFormatter(FORMATTER)
|
|
273
|
+
debug_log_handler.setLevel(logging.DEBUG)
|
|
274
|
+
_root_logger.addHandler(debug_log_handler)
|
|
275
|
+
# sky.provision sets up its own logger/handler with propagate=False,
|
|
276
|
+
# so add it there too.
|
|
277
|
+
provision_logger = logging.getLogger('sky.provision')
|
|
278
|
+
provision_logger.addHandler(debug_log_handler)
|
|
279
|
+
provision_logger.setLevel(logging.DEBUG)
|
|
280
|
+
yield
|
|
281
|
+
finally:
|
|
282
|
+
_root_logger.removeHandler(debug_log_handler)
|
|
283
|
+
provision_logger.removeHandler(debug_log_handler)
|
|
284
|
+
debug_log_handler.close()
|
sky/skylet/constants.py
CHANGED
|
@@ -70,6 +70,7 @@ DEACTIVATE_SKY_REMOTE_PYTHON_ENV = (
|
|
|
70
70
|
|
|
71
71
|
# Prefix for SkyPilot environment variables
|
|
72
72
|
SKYPILOT_ENV_VAR_PREFIX = 'SKYPILOT_'
|
|
73
|
+
SKYPILOT_SERVER_ENV_VAR_PREFIX = 'SKYPILOT_SERVER_'
|
|
73
74
|
|
|
74
75
|
# The name for the environment variable that stores the unique ID of the
|
|
75
76
|
# current task. This will stay the same across multiple recoveries of the
|
|
@@ -417,6 +418,7 @@ LOCAL_SKYPILOT_CONFIG_PATH_PLACEHOLDER = 'skypilot:local_skypilot_config_path'
|
|
|
417
418
|
# Path to the generated cluster config yamls and ssh configs.
|
|
418
419
|
SKY_USER_FILE_PATH = '~/.sky/generated'
|
|
419
420
|
|
|
421
|
+
# TODO(cooperc): Update all env vars to begin with SKYPILOT_ or SKYPILOT_SERVER_
|
|
420
422
|
# Environment variable that is set to 'true' if this is a skypilot server.
|
|
421
423
|
ENV_VAR_IS_SKYPILOT_SERVER = 'IS_SKYPILOT_SERVER'
|
|
422
424
|
|
|
@@ -436,6 +438,10 @@ ENV_VAR_ENABLE_BASIC_AUTH = 'ENABLE_BASIC_AUTH'
|
|
|
436
438
|
SKYPILOT_INITIAL_BASIC_AUTH = 'SKYPILOT_INITIAL_BASIC_AUTH'
|
|
437
439
|
ENV_VAR_ENABLE_SERVICE_ACCOUNTS = 'ENABLE_SERVICE_ACCOUNTS'
|
|
438
440
|
|
|
441
|
+
# Enable debug logging for requests.
|
|
442
|
+
ENV_VAR_ENABLE_REQUEST_DEBUG_LOGGING = (
|
|
443
|
+
f'{SKYPILOT_SERVER_ENV_VAR_PREFIX}ENABLE_REQUEST_DEBUG_LOGGING')
|
|
444
|
+
|
|
439
445
|
SKYPILOT_DEFAULT_WORKSPACE = 'default'
|
|
440
446
|
|
|
441
447
|
# BEGIN constants used for service catalog.
|
sky/templates/aws-ray.yml.j2
CHANGED
sky/utils/annotations.py
CHANGED
|
@@ -7,7 +7,7 @@ from typing_extensions import ParamSpec
|
|
|
7
7
|
|
|
8
8
|
# Whether the current process is a SkyPilot API server process.
|
|
9
9
|
is_on_api_server = True
|
|
10
|
-
|
|
10
|
+
_FUNCTIONS_NEED_RELOAD_CACHE = []
|
|
11
11
|
|
|
12
12
|
T = TypeVar('T')
|
|
13
13
|
P = ParamSpec('P')
|
|
@@ -50,7 +50,13 @@ def lru_cache(scope: Literal['global', 'request'], *lru_cache_args,
|
|
|
50
50
|
else:
|
|
51
51
|
cached_func = functools.lru_cache(*lru_cache_args,
|
|
52
52
|
**lru_cache_kwargs)(func)
|
|
53
|
-
|
|
53
|
+
_FUNCTIONS_NEED_RELOAD_CACHE.append(cached_func)
|
|
54
54
|
return cached_func
|
|
55
55
|
|
|
56
56
|
return decorator
|
|
57
|
+
|
|
58
|
+
|
|
59
|
+
def clear_request_level_cache():
|
|
60
|
+
"""Clear the request-level cache."""
|
|
61
|
+
for func in _FUNCTIONS_NEED_RELOAD_CACHE:
|
|
62
|
+
func.cache_clear()
|