skypilot-nightly 1.0.0.dev20250827__py3-none-any.whl → 1.0.0.dev20250829__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of skypilot-nightly might be problematic. Click here for more details.
- sky/__init__.py +2 -2
- sky/admin_policy.py +11 -10
- sky/authentication.py +1 -1
- sky/backends/backend.py +3 -5
- sky/backends/backend_utils.py +140 -52
- sky/backends/cloud_vm_ray_backend.py +30 -25
- sky/backends/local_docker_backend.py +3 -8
- sky/backends/wheel_utils.py +35 -8
- sky/client/cli/command.py +41 -9
- sky/client/sdk.py +23 -8
- sky/client/sdk_async.py +6 -2
- sky/clouds/aws.py +118 -1
- sky/core.py +1 -4
- sky/dashboard/out/404.html +1 -1
- sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
- sky/dashboard/out/clusters/[cluster].html +1 -1
- sky/dashboard/out/clusters.html +1 -1
- sky/dashboard/out/config.html +1 -1
- sky/dashboard/out/index.html +1 -1
- sky/dashboard/out/infra/[context].html +1 -1
- sky/dashboard/out/infra.html +1 -1
- sky/dashboard/out/jobs/[job].html +1 -1
- sky/dashboard/out/jobs/pools/[pool].html +1 -1
- sky/dashboard/out/jobs.html +1 -1
- sky/dashboard/out/users.html +1 -1
- sky/dashboard/out/volumes.html +1 -1
- sky/dashboard/out/workspace/new.html +1 -1
- sky/dashboard/out/workspaces/[name].html +1 -1
- sky/dashboard/out/workspaces.html +1 -1
- sky/global_user_state.py +82 -22
- sky/jobs/client/sdk.py +5 -2
- sky/jobs/recovery_strategy.py +9 -4
- sky/jobs/server/server.py +2 -1
- sky/logs/agent.py +2 -2
- sky/logs/aws.py +6 -3
- sky/provision/aws/config.py +78 -3
- sky/provision/aws/instance.py +45 -6
- sky/provision/do/utils.py +2 -1
- sky/provision/kubernetes/instance.py +55 -11
- sky/provision/kubernetes/utils.py +11 -2
- sky/provision/nebius/utils.py +36 -2
- sky/schemas/db/global_user_state/007_cluster_event_request_id.py +34 -0
- sky/serve/client/impl.py +5 -4
- sky/serve/replica_managers.py +4 -3
- sky/serve/serve_utils.py +2 -2
- sky/serve/server/impl.py +3 -2
- sky/serve/server/server.py +2 -1
- sky/server/auth/oauth2_proxy.py +10 -4
- sky/server/common.py +4 -4
- sky/server/daemons.py +16 -5
- sky/server/requests/executor.py +5 -3
- sky/server/requests/payloads.py +3 -1
- sky/server/requests/preconditions.py +3 -2
- sky/server/requests/requests.py +121 -19
- sky/server/server.py +85 -60
- sky/server/stream_utils.py +7 -5
- sky/setup_files/dependencies.py +6 -1
- sky/sky_logging.py +28 -0
- sky/skylet/constants.py +6 -0
- sky/skylet/events.py +2 -3
- sky/skypilot_config.py +10 -10
- sky/task.py +1 -1
- sky/templates/aws-ray.yml.j2 +1 -0
- sky/templates/nebius-ray.yml.j2 +4 -8
- sky/usage/usage_lib.py +3 -2
- sky/utils/annotations.py +8 -2
- sky/utils/cluster_utils.py +3 -3
- sky/utils/common_utils.py +0 -72
- sky/utils/controller_utils.py +4 -3
- sky/utils/dag_utils.py +4 -4
- sky/utils/db/db_utils.py +11 -0
- sky/utils/db/migration_utils.py +1 -1
- sky/utils/kubernetes/config_map_utils.py +3 -3
- sky/utils/kubernetes_enums.py +1 -0
- sky/utils/lock_events.py +94 -0
- sky/utils/schemas.py +3 -0
- sky/utils/timeline.py +24 -93
- sky/utils/yaml_utils.py +77 -10
- {skypilot_nightly-1.0.0.dev20250827.dist-info → skypilot_nightly-1.0.0.dev20250829.dist-info}/METADATA +8 -2
- {skypilot_nightly-1.0.0.dev20250827.dist-info → skypilot_nightly-1.0.0.dev20250829.dist-info}/RECORD +86 -84
- /sky/dashboard/out/_next/static/{-eL7Ky3bxVivzeLHNB9U6 → hYJYFIxp_ZFONR4wTIJqZ}/_buildManifest.js +0 -0
- /sky/dashboard/out/_next/static/{-eL7Ky3bxVivzeLHNB9U6 → hYJYFIxp_ZFONR4wTIJqZ}/_ssgManifest.js +0 -0
- {skypilot_nightly-1.0.0.dev20250827.dist-info → skypilot_nightly-1.0.0.dev20250829.dist-info}/WHEEL +0 -0
- {skypilot_nightly-1.0.0.dev20250827.dist-info → skypilot_nightly-1.0.0.dev20250829.dist-info}/entry_points.txt +0 -0
- {skypilot_nightly-1.0.0.dev20250827.dist-info → skypilot_nightly-1.0.0.dev20250829.dist-info}/licenses/LICENSE +0 -0
- {skypilot_nightly-1.0.0.dev20250827.dist-info → skypilot_nightly-1.0.0.dev20250829.dist-info}/top_level.txt +0 -0
sky/server/requests/requests.py
CHANGED
|
@@ -13,7 +13,8 @@ import sqlite3
|
|
|
13
13
|
import threading
|
|
14
14
|
import time
|
|
15
15
|
import traceback
|
|
16
|
-
from typing import Any, Callable, Dict, Generator, List,
|
|
16
|
+
from typing import (Any, AsyncContextManager, Callable, Dict, Generator, List,
|
|
17
|
+
Optional, Tuple)
|
|
17
18
|
|
|
18
19
|
import colorama
|
|
19
20
|
import filelock
|
|
@@ -402,26 +403,46 @@ _DB = None
|
|
|
402
403
|
_init_db_lock = threading.Lock()
|
|
403
404
|
|
|
404
405
|
|
|
406
|
+
def _init_db_within_lock():
|
|
407
|
+
global _DB
|
|
408
|
+
if _DB is None:
|
|
409
|
+
db_path = os.path.expanduser(
|
|
410
|
+
server_constants.API_SERVER_REQUEST_DB_PATH)
|
|
411
|
+
pathlib.Path(db_path).parents[0].mkdir(parents=True, exist_ok=True)
|
|
412
|
+
_DB = db_utils.SQLiteConn(db_path, create_table)
|
|
413
|
+
|
|
414
|
+
|
|
405
415
|
def init_db(func):
|
|
406
416
|
"""Initialize the database."""
|
|
407
417
|
|
|
408
418
|
@functools.wraps(func)
|
|
409
419
|
def wrapper(*args, **kwargs):
|
|
410
|
-
global _DB
|
|
411
420
|
if _DB is not None:
|
|
412
421
|
return func(*args, **kwargs)
|
|
413
422
|
with _init_db_lock:
|
|
414
|
-
|
|
415
|
-
db_path = os.path.expanduser(
|
|
416
|
-
server_constants.API_SERVER_REQUEST_DB_PATH)
|
|
417
|
-
pathlib.Path(db_path).parents[0].mkdir(parents=True,
|
|
418
|
-
exist_ok=True)
|
|
419
|
-
_DB = db_utils.SQLiteConn(db_path, create_table)
|
|
423
|
+
_init_db_within_lock()
|
|
420
424
|
return func(*args, **kwargs)
|
|
421
425
|
|
|
422
426
|
return wrapper
|
|
423
427
|
|
|
424
428
|
|
|
429
|
+
def init_db_async(func):
|
|
430
|
+
"""Async version of init_db."""
|
|
431
|
+
|
|
432
|
+
@functools.wraps(func)
|
|
433
|
+
async def wrapper(*args, **kwargs):
|
|
434
|
+
if _DB is not None:
|
|
435
|
+
return await func(*args, **kwargs)
|
|
436
|
+
# If _DB is not initialized, init_db_async will be blocked if there
|
|
437
|
+
# is a thread initializing _DB, this is fine since it occurs on process
|
|
438
|
+
# startup.
|
|
439
|
+
with _init_db_lock:
|
|
440
|
+
_init_db_within_lock()
|
|
441
|
+
return await func(*args, **kwargs)
|
|
442
|
+
|
|
443
|
+
return wrapper
|
|
444
|
+
|
|
445
|
+
|
|
425
446
|
def reset_db_and_logs():
|
|
426
447
|
"""Create the database."""
|
|
427
448
|
server_common.clear_local_api_server_database()
|
|
@@ -440,28 +461,61 @@ def request_lock_path(request_id: str) -> str:
|
|
|
440
461
|
@contextlib.contextmanager
|
|
441
462
|
@init_db
|
|
442
463
|
def update_request(request_id: str) -> Generator[Optional[Request], None, None]:
|
|
443
|
-
"""Get a SkyPilot API request."""
|
|
464
|
+
"""Get and update a SkyPilot API request."""
|
|
444
465
|
request = _get_request_no_lock(request_id)
|
|
445
466
|
yield request
|
|
446
467
|
if request is not None:
|
|
447
468
|
_add_or_update_request_no_lock(request)
|
|
448
469
|
|
|
449
470
|
|
|
471
|
+
@init_db
|
|
472
|
+
def update_request_async(
|
|
473
|
+
request_id: str) -> AsyncContextManager[Optional[Request]]:
|
|
474
|
+
"""Async version of update_request.
|
|
475
|
+
|
|
476
|
+
Returns an async context manager that yields the request record and
|
|
477
|
+
persists any in-place updates upon exit.
|
|
478
|
+
"""
|
|
479
|
+
|
|
480
|
+
@contextlib.asynccontextmanager
|
|
481
|
+
async def _cm():
|
|
482
|
+
request = await _get_request_no_lock_async(request_id)
|
|
483
|
+
try:
|
|
484
|
+
yield request
|
|
485
|
+
finally:
|
|
486
|
+
if request is not None:
|
|
487
|
+
await _add_or_update_request_no_lock_async(request)
|
|
488
|
+
|
|
489
|
+
return _cm()
|
|
490
|
+
|
|
491
|
+
|
|
492
|
+
_get_request_sql = (f'SELECT {", ".join(REQUEST_COLUMNS)} FROM {REQUEST_TABLE} '
|
|
493
|
+
'WHERE request_id LIKE ?')
|
|
494
|
+
|
|
495
|
+
|
|
450
496
|
def _get_request_no_lock(request_id: str) -> Optional[Request]:
|
|
451
497
|
"""Get a SkyPilot API request."""
|
|
452
498
|
assert _DB is not None
|
|
453
|
-
columns_str = ', '.join(REQUEST_COLUMNS)
|
|
454
499
|
with _DB.conn:
|
|
455
500
|
cursor = _DB.conn.cursor()
|
|
456
|
-
cursor.execute(
|
|
457
|
-
f'SELECT {columns_str} FROM {REQUEST_TABLE} '
|
|
458
|
-
'WHERE request_id LIKE ?', (request_id + '%',))
|
|
501
|
+
cursor.execute(_get_request_sql, (request_id + '%',))
|
|
459
502
|
row = cursor.fetchone()
|
|
460
503
|
if row is None:
|
|
461
504
|
return None
|
|
462
505
|
return Request.from_row(row)
|
|
463
506
|
|
|
464
507
|
|
|
508
|
+
async def _get_request_no_lock_async(request_id: str) -> Optional[Request]:
|
|
509
|
+
"""Async version of _get_request_no_lock."""
|
|
510
|
+
assert _DB is not None
|
|
511
|
+
conn = await _DB.async_conn()
|
|
512
|
+
async with conn.execute(_get_request_sql, (request_id + '%',)) as cursor:
|
|
513
|
+
row = await cursor.fetchone()
|
|
514
|
+
if row is None:
|
|
515
|
+
return None
|
|
516
|
+
return Request.from_row(row)
|
|
517
|
+
|
|
518
|
+
|
|
465
519
|
@init_db
|
|
466
520
|
def get_latest_request_id() -> Optional[str]:
|
|
467
521
|
"""Get the latest request ID."""
|
|
@@ -481,6 +535,13 @@ def get_request(request_id: str) -> Optional[Request]:
|
|
|
481
535
|
return _get_request_no_lock(request_id)
|
|
482
536
|
|
|
483
537
|
|
|
538
|
+
@init_db_async
|
|
539
|
+
async def get_request_async(request_id: str) -> Optional[Request]:
|
|
540
|
+
"""Async version of get_request."""
|
|
541
|
+
async with filelock.AsyncFileLock(request_lock_path(request_id)):
|
|
542
|
+
return await _get_request_no_lock_async(request_id)
|
|
543
|
+
|
|
544
|
+
|
|
484
545
|
@init_db
|
|
485
546
|
def create_if_not_exists(request: Request) -> bool:
|
|
486
547
|
"""Create a SkyPilot API request if it does not exist."""
|
|
@@ -491,6 +552,16 @@ def create_if_not_exists(request: Request) -> bool:
|
|
|
491
552
|
return True
|
|
492
553
|
|
|
493
554
|
|
|
555
|
+
@init_db_async
|
|
556
|
+
async def create_if_not_exists_async(request: Request) -> bool:
|
|
557
|
+
"""Async version of create_if_not_exists."""
|
|
558
|
+
async with filelock.AsyncFileLock(request_lock_path(request.request_id)):
|
|
559
|
+
if await _get_request_no_lock_async(request.request_id) is not None:
|
|
560
|
+
return False
|
|
561
|
+
await _add_or_update_request_no_lock_async(request)
|
|
562
|
+
return True
|
|
563
|
+
|
|
564
|
+
|
|
494
565
|
@init_db
|
|
495
566
|
def get_request_tasks(
|
|
496
567
|
status: Optional[List[RequestStatus]] = None,
|
|
@@ -565,17 +636,48 @@ def get_request_tasks(
|
|
|
565
636
|
return requests
|
|
566
637
|
|
|
567
638
|
|
|
639
|
+
@init_db_async
|
|
640
|
+
async def get_api_request_ids_start_with(incomplete: str) -> List[str]:
|
|
641
|
+
"""Get a list of API request ids for shell completion."""
|
|
642
|
+
assert _DB is not None
|
|
643
|
+
conn = await _DB.async_conn()
|
|
644
|
+
# Prioritize alive requests (PENDING, RUNNING) over finished ones,
|
|
645
|
+
# then order by creation time (newest first) within each category.
|
|
646
|
+
async with conn.execute(
|
|
647
|
+
f"""SELECT request_id FROM {REQUEST_TABLE}
|
|
648
|
+
WHERE request_id LIKE ?
|
|
649
|
+
ORDER BY
|
|
650
|
+
CASE
|
|
651
|
+
WHEN status IN ('PENDING', 'RUNNING') THEN 0
|
|
652
|
+
ELSE 1
|
|
653
|
+
END,
|
|
654
|
+
created_at DESC
|
|
655
|
+
LIMIT 1000""", (f'{incomplete}%',)) as cursor:
|
|
656
|
+
rows = await cursor.fetchall()
|
|
657
|
+
if rows is None:
|
|
658
|
+
return []
|
|
659
|
+
return [row[0] for row in rows]
|
|
660
|
+
|
|
661
|
+
|
|
662
|
+
_add_or_update_request_sql = (f'INSERT OR REPLACE INTO {REQUEST_TABLE} '
|
|
663
|
+
f'({", ".join(REQUEST_COLUMNS)}) VALUES '
|
|
664
|
+
f'({", ".join(["?"] * len(REQUEST_COLUMNS))})')
|
|
665
|
+
|
|
666
|
+
|
|
568
667
|
def _add_or_update_request_no_lock(request: Request):
|
|
569
668
|
"""Add or update a REST request into the database."""
|
|
570
|
-
row = request.to_row()
|
|
571
|
-
key_str = ', '.join(REQUEST_COLUMNS)
|
|
572
|
-
fill_str = ', '.join(['?'] * len(row))
|
|
573
669
|
assert _DB is not None
|
|
574
670
|
with _DB.conn:
|
|
575
671
|
cursor = _DB.conn.cursor()
|
|
576
|
-
cursor.execute(
|
|
577
|
-
|
|
578
|
-
|
|
672
|
+
cursor.execute(_add_or_update_request_sql, request.to_row())
|
|
673
|
+
|
|
674
|
+
|
|
675
|
+
async def _add_or_update_request_no_lock_async(request: Request):
|
|
676
|
+
"""Async version of _add_or_update_request_no_lock."""
|
|
677
|
+
assert _DB is not None
|
|
678
|
+
conn = await _DB.async_conn()
|
|
679
|
+
await conn.execute(_add_or_update_request_sql, request.to_row())
|
|
680
|
+
await conn.commit()
|
|
579
681
|
|
|
580
682
|
|
|
581
683
|
def set_request_failed(request_id: str, e: BaseException) -> None:
|
sky/server/server.py
CHANGED
|
@@ -21,6 +21,7 @@ import uuid
|
|
|
21
21
|
import zipfile
|
|
22
22
|
|
|
23
23
|
import aiofiles
|
|
24
|
+
import anyio
|
|
24
25
|
import fastapi
|
|
25
26
|
from fastapi.middleware import cors
|
|
26
27
|
import starlette.middleware.base
|
|
@@ -847,7 +848,7 @@ async def upload_zip_file(request: fastapi.Request, user_hash: str,
|
|
|
847
848
|
client_file_mounts_dir = (
|
|
848
849
|
common.API_SERVER_CLIENT_DIR.expanduser().resolve() / user_id /
|
|
849
850
|
'file_mounts')
|
|
850
|
-
client_file_mounts_dir.mkdir(parents=True, exist_ok=True)
|
|
851
|
+
await anyio.Path(client_file_mounts_dir).mkdir(parents=True, exist_ok=True)
|
|
851
852
|
|
|
852
853
|
# Check upload_id to be a valid SkyPilot run_timestamp appended with 8 hex
|
|
853
854
|
# characters, e.g. 'sky-2025-01-17-09-10-13-933602-35d31c22'.
|
|
@@ -870,7 +871,7 @@ async def upload_zip_file(request: fastapi.Request, user_hash: str,
|
|
|
870
871
|
zip_file_path = client_file_mounts_dir / f'{upload_id}.zip'
|
|
871
872
|
else:
|
|
872
873
|
chunk_dir = client_file_mounts_dir / upload_id
|
|
873
|
-
chunk_dir.mkdir(parents=True, exist_ok=True)
|
|
874
|
+
await anyio.Path(chunk_dir).mkdir(parents=True, exist_ok=True)
|
|
874
875
|
zip_file_path = chunk_dir / f'part{chunk_index}.incomplete'
|
|
875
876
|
|
|
876
877
|
try:
|
|
@@ -916,9 +917,9 @@ async def upload_zip_file(request: fastapi.Request, user_hash: str,
|
|
|
916
917
|
await zip_file.write(data)
|
|
917
918
|
|
|
918
919
|
logger.info(f'Uploaded zip file: {zip_file_path}')
|
|
919
|
-
unzip_file(zip_file_path, client_file_mounts_dir)
|
|
920
|
+
await unzip_file(zip_file_path, client_file_mounts_dir)
|
|
920
921
|
if total_chunks > 1:
|
|
921
|
-
shutil.rmtree(chunk_dir)
|
|
922
|
+
await context_utils.to_thread(shutil.rmtree, chunk_dir)
|
|
922
923
|
return payloads.UploadZipFileResponse(
|
|
923
924
|
status=responses.UploadStatus.COMPLETED.value)
|
|
924
925
|
|
|
@@ -933,61 +934,69 @@ def _is_relative_to(path: pathlib.Path, parent: pathlib.Path) -> bool:
|
|
|
933
934
|
return False
|
|
934
935
|
|
|
935
936
|
|
|
936
|
-
def unzip_file(zip_file_path: pathlib.Path,
|
|
937
|
-
|
|
938
|
-
"""Unzips a zip file."""
|
|
939
|
-
|
|
940
|
-
|
|
941
|
-
|
|
942
|
-
|
|
943
|
-
|
|
944
|
-
|
|
945
|
-
|
|
946
|
-
|
|
947
|
-
|
|
937
|
+
async def unzip_file(zip_file_path: pathlib.Path,
|
|
938
|
+
client_file_mounts_dir: pathlib.Path) -> None:
|
|
939
|
+
"""Unzips a zip file without blocking the event loop."""
|
|
940
|
+
|
|
941
|
+
def _do_unzip() -> None:
|
|
942
|
+
try:
|
|
943
|
+
with zipfile.ZipFile(zip_file_path, 'r') as zipf:
|
|
944
|
+
for member in zipf.infolist():
|
|
945
|
+
# Determine the new path
|
|
946
|
+
original_path = os.path.normpath(member.filename)
|
|
947
|
+
new_path = client_file_mounts_dir / original_path.lstrip(
|
|
948
|
+
'/')
|
|
949
|
+
|
|
950
|
+
if (member.external_attr >> 28) == 0xA:
|
|
951
|
+
# Symlink. Read the target path and create a symlink.
|
|
952
|
+
new_path.parent.mkdir(parents=True, exist_ok=True)
|
|
953
|
+
target = zipf.read(member).decode()
|
|
954
|
+
assert not os.path.isabs(target), target
|
|
955
|
+
# Since target is a relative path, we need to check that
|
|
956
|
+
# it is under `client_file_mounts_dir` for security.
|
|
957
|
+
full_target_path = (new_path.parent / target).resolve()
|
|
958
|
+
if not _is_relative_to(full_target_path,
|
|
959
|
+
client_file_mounts_dir):
|
|
960
|
+
raise ValueError(
|
|
961
|
+
f'Symlink target {target} leads to a '
|
|
962
|
+
'file not in userspace. Aborted.')
|
|
963
|
+
|
|
964
|
+
if new_path.exists() or new_path.is_symlink():
|
|
965
|
+
new_path.unlink(missing_ok=True)
|
|
966
|
+
new_path.symlink_to(
|
|
967
|
+
target,
|
|
968
|
+
target_is_directory=member.filename.endswith('/'))
|
|
969
|
+
continue
|
|
970
|
+
|
|
971
|
+
# Handle directories
|
|
972
|
+
if member.filename.endswith('/'):
|
|
973
|
+
new_path.mkdir(parents=True, exist_ok=True)
|
|
974
|
+
continue
|
|
975
|
+
|
|
976
|
+
# Handle files
|
|
948
977
|
new_path.parent.mkdir(parents=True, exist_ok=True)
|
|
949
|
-
|
|
950
|
-
|
|
951
|
-
|
|
952
|
-
|
|
953
|
-
|
|
954
|
-
|
|
955
|
-
|
|
956
|
-
|
|
957
|
-
|
|
958
|
-
|
|
959
|
-
|
|
960
|
-
|
|
961
|
-
|
|
962
|
-
|
|
963
|
-
|
|
964
|
-
|
|
965
|
-
|
|
966
|
-
|
|
967
|
-
|
|
968
|
-
|
|
969
|
-
continue
|
|
970
|
-
|
|
971
|
-
# Handle files
|
|
972
|
-
new_path.parent.mkdir(parents=True, exist_ok=True)
|
|
973
|
-
with zipf.open(member) as member_file, new_path.open('wb') as f:
|
|
974
|
-
# Use shutil.copyfileobj to copy files in chunks, so it does
|
|
975
|
-
# not load the entire file into memory.
|
|
976
|
-
shutil.copyfileobj(member_file, f)
|
|
977
|
-
except zipfile.BadZipFile as e:
|
|
978
|
-
logger.error(f'Bad zip file: {zip_file_path}')
|
|
979
|
-
raise fastapi.HTTPException(
|
|
980
|
-
status_code=400,
|
|
981
|
-
detail=f'Invalid zip file: {common_utils.format_exception(e)}')
|
|
982
|
-
except Exception as e:
|
|
983
|
-
logger.error(f'Error unzipping file: {zip_file_path}')
|
|
984
|
-
raise fastapi.HTTPException(
|
|
985
|
-
status_code=500,
|
|
986
|
-
detail=(f'Error unzipping file: '
|
|
987
|
-
f'{common_utils.format_exception(e)}'))
|
|
978
|
+
with zipf.open(member) as member_file, new_path.open(
|
|
979
|
+
'wb') as f:
|
|
980
|
+
# Use shutil.copyfileobj to copy files in chunks,
|
|
981
|
+
# so it does not load the entire file into memory.
|
|
982
|
+
shutil.copyfileobj(member_file, f)
|
|
983
|
+
except zipfile.BadZipFile as e:
|
|
984
|
+
logger.error(f'Bad zip file: {zip_file_path}')
|
|
985
|
+
raise fastapi.HTTPException(
|
|
986
|
+
status_code=400,
|
|
987
|
+
detail=f'Invalid zip file: {common_utils.format_exception(e)}')
|
|
988
|
+
except Exception as e:
|
|
989
|
+
logger.error(f'Error unzipping file: {zip_file_path}')
|
|
990
|
+
raise fastapi.HTTPException(
|
|
991
|
+
status_code=500,
|
|
992
|
+
detail=(f'Error unzipping file: '
|
|
993
|
+
f'{common_utils.format_exception(e)}'))
|
|
994
|
+
finally:
|
|
995
|
+
# Cleanup the temporary file regardless of
|
|
996
|
+
# success/failure handling above
|
|
997
|
+
zip_file_path.unlink(missing_ok=True)
|
|
988
998
|
|
|
989
|
-
|
|
990
|
-
zip_file_path.unlink()
|
|
999
|
+
await context_utils.to_thread(_do_unzip)
|
|
991
1000
|
|
|
992
1001
|
|
|
993
1002
|
@app.post('/launch')
|
|
@@ -1388,7 +1397,7 @@ async def local_down(request: fastapi.Request) -> None:
|
|
|
1388
1397
|
async def api_get(request_id: str) -> payloads.RequestPayload:
|
|
1389
1398
|
"""Gets a request with a given request ID prefix."""
|
|
1390
1399
|
while True:
|
|
1391
|
-
request_task = requests_lib.get_request(request_id)
|
|
1400
|
+
request_task = await requests_lib.get_request_async(request_id)
|
|
1392
1401
|
if request_task is None:
|
|
1393
1402
|
print(f'No task with request ID {request_id}', flush=True)
|
|
1394
1403
|
raise fastapi.HTTPException(
|
|
@@ -1403,6 +1412,9 @@ async def api_get(request_id: str) -> payloads.RequestPayload:
|
|
|
1403
1412
|
raise fastapi.HTTPException(
|
|
1404
1413
|
status_code=500, detail=request_task.encode().model_dump())
|
|
1405
1414
|
return request_task.encode()
|
|
1415
|
+
elif (request_task.status == requests_lib.RequestStatus.RUNNING and
|
|
1416
|
+
daemons.is_daemon_request_id(request_id)):
|
|
1417
|
+
return request_task.encode()
|
|
1406
1418
|
# yield control to allow other coroutines to run, sleep shortly
|
|
1407
1419
|
# to avoid storming the DB and CPU in the meantime
|
|
1408
1420
|
await asyncio.sleep(0.1)
|
|
@@ -1474,7 +1486,7 @@ async def stream(
|
|
|
1474
1486
|
|
|
1475
1487
|
# Original plain text streaming logic
|
|
1476
1488
|
if request_id is not None:
|
|
1477
|
-
request_task = requests_lib.get_request(request_id)
|
|
1489
|
+
request_task = await requests_lib.get_request_async(request_id)
|
|
1478
1490
|
if request_task is None:
|
|
1479
1491
|
print(f'No task with request ID {request_id}')
|
|
1480
1492
|
raise fastapi.HTTPException(
|
|
@@ -1491,6 +1503,14 @@ async def stream(
|
|
|
1491
1503
|
if log_path == constants.API_SERVER_LOGS:
|
|
1492
1504
|
resolved_log_path = pathlib.Path(
|
|
1493
1505
|
constants.API_SERVER_LOGS).expanduser()
|
|
1506
|
+
if not resolved_log_path.exists():
|
|
1507
|
+
raise fastapi.HTTPException(
|
|
1508
|
+
status_code=404,
|
|
1509
|
+
detail='Server log file does not exist. The API server may '
|
|
1510
|
+
'have been started with `--foreground` - check the '
|
|
1511
|
+
'stdout of API server process, such as: '
|
|
1512
|
+
'`kubectl logs -n api-server-namespace '
|
|
1513
|
+
'api-server-pod-name`')
|
|
1494
1514
|
else:
|
|
1495
1515
|
# This should be a log path under ~/sky_logs.
|
|
1496
1516
|
resolved_logs_directory = pathlib.Path(
|
|
@@ -1561,7 +1581,7 @@ async def api_status(
|
|
|
1561
1581
|
else:
|
|
1562
1582
|
encoded_request_tasks = []
|
|
1563
1583
|
for request_id in request_ids:
|
|
1564
|
-
request_task = requests_lib.get_request(request_id)
|
|
1584
|
+
request_task = await requests_lib.get_request_async(request_id)
|
|
1565
1585
|
if request_task is None:
|
|
1566
1586
|
continue
|
|
1567
1587
|
encoded_request_tasks.append(request_task.readable_encode())
|
|
@@ -1769,6 +1789,11 @@ async def complete_volume_name(incomplete: str,) -> List[str]:
|
|
|
1769
1789
|
return global_user_state.get_volume_names_start_with(incomplete)
|
|
1770
1790
|
|
|
1771
1791
|
|
|
1792
|
+
@app.get('/api/completion/api_request')
|
|
1793
|
+
async def complete_api_request(incomplete: str,) -> List[str]:
|
|
1794
|
+
return await requests_lib.get_api_request_ids_start_with(incomplete)
|
|
1795
|
+
|
|
1796
|
+
|
|
1772
1797
|
@app.get('/dashboard/{full_path:path}')
|
|
1773
1798
|
async def serve_dashboard(full_path: str):
|
|
1774
1799
|
"""Serves the Next.js dashboard application.
|
sky/server/stream_utils.py
CHANGED
|
@@ -56,7 +56,7 @@ async def log_streamer(request_id: Optional[str],
|
|
|
56
56
|
if request_id is not None:
|
|
57
57
|
status_msg = rich_utils.EncodedStatusMessage(
|
|
58
58
|
f'[dim]Checking request: {request_id}[/dim]')
|
|
59
|
-
request_task = requests_lib.get_request(request_id)
|
|
59
|
+
request_task = await requests_lib.get_request_async(request_id)
|
|
60
60
|
|
|
61
61
|
if request_task is None:
|
|
62
62
|
raise fastapi.HTTPException(
|
|
@@ -86,10 +86,12 @@ async def log_streamer(request_id: Optional[str],
|
|
|
86
86
|
# Use smaller padding (4096 bytes) to force browser rendering
|
|
87
87
|
yield f'{waiting_msg}' + ' ' * 4096 + '\n'
|
|
88
88
|
# Sleep shortly to avoid storming the DB and CPU and allow other
|
|
89
|
-
# coroutines to run.
|
|
90
|
-
#
|
|
89
|
+
# coroutines to run.
|
|
90
|
+
# TODO(aylei): we should use a better mechanism to avoid busy
|
|
91
|
+
# polling the DB, which can be a bottleneck for high-concurrency
|
|
92
|
+
# requests.
|
|
91
93
|
await asyncio.sleep(0.1)
|
|
92
|
-
request_task = requests_lib.get_request(request_id)
|
|
94
|
+
request_task = await requests_lib.get_request_async(request_id)
|
|
93
95
|
if not follow:
|
|
94
96
|
break
|
|
95
97
|
if show_request_waiting_spinner:
|
|
@@ -151,7 +153,7 @@ async def _tail_log_file(f: aiofiles.threadpool.binary.AsyncBufferedReader,
|
|
|
151
153
|
line: Optional[bytes] = await f.readline()
|
|
152
154
|
if not line:
|
|
153
155
|
if request_id is not None:
|
|
154
|
-
request_task = requests_lib.get_request(request_id)
|
|
156
|
+
request_task = await requests_lib.get_request_async(request_id)
|
|
155
157
|
if request_task.status > requests_lib.RequestStatus.RUNNING:
|
|
156
158
|
if (request_task.status ==
|
|
157
159
|
requests_lib.RequestStatus.CANCELLED):
|
sky/setup_files/dependencies.py
CHANGED
|
@@ -35,7 +35,8 @@ install_requires = [
|
|
|
35
35
|
# Light weight requirement, can be replaced with "typing" once
|
|
36
36
|
# we deprecate Python 3.7 (this will take a while).
|
|
37
37
|
'typing_extensions',
|
|
38
|
-
|
|
38
|
+
# filelock 3.15.0 or higher is required for async file locking.
|
|
39
|
+
'filelock >= 3.15.0',
|
|
39
40
|
'packaging',
|
|
40
41
|
'psutil',
|
|
41
42
|
'pulp',
|
|
@@ -71,6 +72,8 @@ install_requires = [
|
|
|
71
72
|
'types-paramiko',
|
|
72
73
|
'alembic',
|
|
73
74
|
'aiohttp',
|
|
75
|
+
'aiosqlite',
|
|
76
|
+
'anyio',
|
|
74
77
|
]
|
|
75
78
|
|
|
76
79
|
# See requirements-dev.txt for the version of grpc and protobuf
|
|
@@ -92,8 +95,10 @@ server_dependencies = [
|
|
|
92
95
|
'passlib',
|
|
93
96
|
'pyjwt',
|
|
94
97
|
'aiohttp',
|
|
98
|
+
'anyio',
|
|
95
99
|
GRPC,
|
|
96
100
|
PROTOBUF,
|
|
101
|
+
'aiosqlite',
|
|
97
102
|
]
|
|
98
103
|
|
|
99
104
|
local_ray = [
|
sky/sky_logging.py
CHANGED
|
@@ -19,6 +19,9 @@ _FORMAT = '%(levelname).1s %(asctime)s %(filename)s:%(lineno)d] %(message)s'
|
|
|
19
19
|
_DATE_FORMAT = '%m-%d %H:%M:%S'
|
|
20
20
|
_SENSITIVE_LOGGER = ['sky.provisioner', 'sky.optimizer']
|
|
21
21
|
|
|
22
|
+
_DEBUG_LOG_DIR = os.path.expanduser(
|
|
23
|
+
os.path.join(constants.SKY_LOGS_DIRECTORY, 'request_debug'))
|
|
24
|
+
|
|
22
25
|
DEBUG = logging.DEBUG
|
|
23
26
|
INFO = logging.INFO
|
|
24
27
|
WARNING = logging.WARNING
|
|
@@ -254,3 +257,28 @@ def generate_tmp_logging_file_path(file_name: str) -> str:
|
|
|
254
257
|
log_path = os.path.expanduser(os.path.join(log_dir, file_name))
|
|
255
258
|
|
|
256
259
|
return log_path
|
|
260
|
+
|
|
261
|
+
|
|
262
|
+
@contextlib.contextmanager
|
|
263
|
+
def add_debug_log_handler(request_id: str):
|
|
264
|
+
if os.getenv(constants.ENV_VAR_ENABLE_REQUEST_DEBUG_LOGGING) != 'true':
|
|
265
|
+
yield
|
|
266
|
+
return
|
|
267
|
+
|
|
268
|
+
os.makedirs(_DEBUG_LOG_DIR, exist_ok=True)
|
|
269
|
+
log_path = os.path.join(_DEBUG_LOG_DIR, f'{request_id}.log')
|
|
270
|
+
try:
|
|
271
|
+
debug_log_handler = logging.FileHandler(log_path)
|
|
272
|
+
debug_log_handler.setFormatter(FORMATTER)
|
|
273
|
+
debug_log_handler.setLevel(logging.DEBUG)
|
|
274
|
+
_root_logger.addHandler(debug_log_handler)
|
|
275
|
+
# sky.provision sets up its own logger/handler with propagate=False,
|
|
276
|
+
# so add it there too.
|
|
277
|
+
provision_logger = logging.getLogger('sky.provision')
|
|
278
|
+
provision_logger.addHandler(debug_log_handler)
|
|
279
|
+
provision_logger.setLevel(logging.DEBUG)
|
|
280
|
+
yield
|
|
281
|
+
finally:
|
|
282
|
+
_root_logger.removeHandler(debug_log_handler)
|
|
283
|
+
provision_logger.removeHandler(debug_log_handler)
|
|
284
|
+
debug_log_handler.close()
|
sky/skylet/constants.py
CHANGED
|
@@ -70,6 +70,7 @@ DEACTIVATE_SKY_REMOTE_PYTHON_ENV = (
|
|
|
70
70
|
|
|
71
71
|
# Prefix for SkyPilot environment variables
|
|
72
72
|
SKYPILOT_ENV_VAR_PREFIX = 'SKYPILOT_'
|
|
73
|
+
SKYPILOT_SERVER_ENV_VAR_PREFIX = 'SKYPILOT_SERVER_'
|
|
73
74
|
|
|
74
75
|
# The name for the environment variable that stores the unique ID of the
|
|
75
76
|
# current task. This will stay the same across multiple recoveries of the
|
|
@@ -417,6 +418,7 @@ LOCAL_SKYPILOT_CONFIG_PATH_PLACEHOLDER = 'skypilot:local_skypilot_config_path'
|
|
|
417
418
|
# Path to the generated cluster config yamls and ssh configs.
|
|
418
419
|
SKY_USER_FILE_PATH = '~/.sky/generated'
|
|
419
420
|
|
|
421
|
+
# TODO(cooperc): Update all env vars to begin with SKYPILOT_ or SKYPILOT_SERVER_
|
|
420
422
|
# Environment variable that is set to 'true' if this is a skypilot server.
|
|
421
423
|
ENV_VAR_IS_SKYPILOT_SERVER = 'IS_SKYPILOT_SERVER'
|
|
422
424
|
|
|
@@ -436,6 +438,10 @@ ENV_VAR_ENABLE_BASIC_AUTH = 'ENABLE_BASIC_AUTH'
|
|
|
436
438
|
SKYPILOT_INITIAL_BASIC_AUTH = 'SKYPILOT_INITIAL_BASIC_AUTH'
|
|
437
439
|
ENV_VAR_ENABLE_SERVICE_ACCOUNTS = 'ENABLE_SERVICE_ACCOUNTS'
|
|
438
440
|
|
|
441
|
+
# Enable debug logging for requests.
|
|
442
|
+
ENV_VAR_ENABLE_REQUEST_DEBUG_LOGGING = (
|
|
443
|
+
f'{SKYPILOT_SERVER_ENV_VAR_PREFIX}ENABLE_REQUEST_DEBUG_LOGGING')
|
|
444
|
+
|
|
439
445
|
SKYPILOT_DEFAULT_WORKSPACE = 'default'
|
|
440
446
|
|
|
441
447
|
# BEGIN constants used for service catalog.
|
sky/skylet/events.py
CHANGED
|
@@ -20,7 +20,6 @@ from sky.skylet import constants
|
|
|
20
20
|
from sky.skylet import job_lib
|
|
21
21
|
from sky.usage import usage_lib
|
|
22
22
|
from sky.utils import cluster_utils
|
|
23
|
-
from sky.utils import common_utils
|
|
24
23
|
from sky.utils import registry
|
|
25
24
|
from sky.utils import ux_utils
|
|
26
25
|
from sky.utils import yaml_utils
|
|
@@ -181,7 +180,7 @@ class AutostopEvent(SkyletEvent):
|
|
|
181
180
|
|
|
182
181
|
config_path = os.path.abspath(
|
|
183
182
|
os.path.expanduser(cluster_utils.SKY_CLUSTER_YAML_REMOTE_PATH))
|
|
184
|
-
config = common_utils.read_yaml(config_path)
|
|
183
|
+
config = yaml_utils.read_yaml(config_path)
|
|
185
184
|
provider_name = cluster_utils.get_provider_name(config)
|
|
186
185
|
cloud = registry.CLOUD_REGISTRY.from_str(provider_name)
|
|
187
186
|
assert cloud is not None, f'Unknown cloud: {provider_name}'
|
|
@@ -326,5 +325,5 @@ class AutostopEvent(SkyletEvent):
|
|
|
326
325
|
config['auth'].pop('ssh_proxy_command', None)
|
|
327
326
|
# Empty the file_mounts.
|
|
328
327
|
config['file_mounts'] = {}
|
|
329
|
-
|
|
328
|
+
yaml_utils.dump_yaml(yaml_path, config)
|
|
330
329
|
logger.debug('Replaced upscaling speed to 0.')
|