skypilot-nightly 1.0.0.dev20250831__py3-none-any.whl → 1.0.0.dev20250901__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sky/__init__.py +2 -2
- sky/dashboard/out/404.html +1 -1
- sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
- sky/dashboard/out/clusters/[cluster].html +1 -1
- sky/dashboard/out/clusters.html +1 -1
- sky/dashboard/out/config.html +1 -1
- sky/dashboard/out/index.html +1 -1
- sky/dashboard/out/infra/[context].html +1 -1
- sky/dashboard/out/infra.html +1 -1
- sky/dashboard/out/jobs/[job].html +1 -1
- sky/dashboard/out/jobs/pools/[pool].html +1 -1
- sky/dashboard/out/jobs.html +1 -1
- sky/dashboard/out/users.html +1 -1
- sky/dashboard/out/volumes.html +1 -1
- sky/dashboard/out/workspace/new.html +1 -1
- sky/dashboard/out/workspaces/[name].html +1 -1
- sky/dashboard/out/workspaces.html +1 -1
- sky/global_user_state.py +67 -0
- sky/jobs/server/server.py +2 -1
- sky/serve/server/server.py +2 -1
- sky/server/common.py +8 -6
- sky/server/metrics.py +82 -6
- sky/server/requests/executor.py +6 -2
- sky/server/requests/preconditions.py +3 -2
- sky/server/requests/requests.py +118 -29
- sky/server/server.py +50 -18
- sky/server/stream_utils.py +7 -5
- sky/server/uvicorn.py +7 -0
- sky/setup_files/dependencies.py +4 -1
- sky/skylet/constants.py +3 -0
- sky/utils/db/db_utils.py +64 -1
- sky/utils/perf_utils.py +22 -0
- {skypilot_nightly-1.0.0.dev20250831.dist-info → skypilot_nightly-1.0.0.dev20250901.dist-info}/METADATA +36 -33
- {skypilot_nightly-1.0.0.dev20250831.dist-info → skypilot_nightly-1.0.0.dev20250901.dist-info}/RECORD +40 -39
- /sky/dashboard/out/_next/static/{FtHzmn6BMJ5PzqHhEY51g → EqPZ0ygxa__3XPBVJ9dpy}/_buildManifest.js +0 -0
- /sky/dashboard/out/_next/static/{FtHzmn6BMJ5PzqHhEY51g → EqPZ0ygxa__3XPBVJ9dpy}/_ssgManifest.js +0 -0
- {skypilot_nightly-1.0.0.dev20250831.dist-info → skypilot_nightly-1.0.0.dev20250901.dist-info}/WHEEL +0 -0
- {skypilot_nightly-1.0.0.dev20250831.dist-info → skypilot_nightly-1.0.0.dev20250901.dist-info}/entry_points.txt +0 -0
- {skypilot_nightly-1.0.0.dev20250831.dist-info → skypilot_nightly-1.0.0.dev20250901.dist-info}/licenses/LICENSE +0 -0
- {skypilot_nightly-1.0.0.dev20250831.dist-info → skypilot_nightly-1.0.0.dev20250901.dist-info}/top_level.txt +0 -0
sky/server/requests/executor.py
CHANGED

@@ -41,6 +41,7 @@ from sky import skypilot_config
 from sky.server import common as server_common
 from sky.server import config as server_config
 from sky.server import constants as server_constants
+from sky.server import metrics as metrics_lib
 from sky.server.requests import payloads
 from sky.server.requests import preconditions
 from sky.server.requests import process

@@ -373,6 +374,7 @@ def _request_execution_wrapper(request_id: str,
         request_task.status = api_requests.RequestStatus.RUNNING
         func = request_task.entrypoint
         request_body = request_task.request_body
+        request_name = request_task.name

     # Append to the log file instead of overwriting it since there might be
     # logs from previous retries.

@@ -390,7 +392,9 @@ def _request_execution_wrapper(request_id: str,
             config = skypilot_config.to_dict()
             logger.debug(f'request config: \n'
                          f'{yaml_utils.dump_yaml_str(dict(config))}')
-            return_value = func(**request_body.to_kwargs())
+            with metrics_lib.time_it(name=request_name,
+                                     group='request_execution'):
+                return_value = func(**request_body.to_kwargs())
             f.flush()
     except KeyboardInterrupt:
         logger.info(f'Request {request_id} cancelled by user')

@@ -453,7 +457,7 @@ async def execute_request_coroutine(request: api_requests.Request):
                                      **request_body.to_kwargs())

     async def poll_task(request_id: str) -> bool:
-        request = api_requests.get_request(request_id)
+        request = await api_requests.get_request_async(request_id)
         if request is None:
             raise RuntimeError('Request not found')

sky/server/requests/preconditions.py
CHANGED

@@ -98,7 +98,7 @@ class Precondition(abc.ABC):
                 return False

             # Check if the request has been cancelled
-            request = api_requests.get_request(self.request_id)
+            request = await api_requests.get_request_async(self.request_id)
             if request is None:
                 logger.error(f'Request {self.request_id} not found')
                 return False

@@ -112,7 +112,8 @@ class Precondition(abc.ABC):
                 return True
             if status_msg is not None and status_msg != last_status_msg:
                 # Update the status message if it has changed.
-                with api_requests.update_request(self.request_id) as req:
+                async with api_requests.update_request_async(
+                        self.request_id) as req:
                     assert req is not None, self.request_id
                     req.status_msg = status_msg
                 last_status_msg = status_msg
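
The executor change above wraps the request entrypoint in metrics_lib.time_it(...), and the requests module below decorates its DB helpers with metrics_lib.time_me / time_me_async. This diff does not expand sky/server/metrics.py itself, so the following is only a minimal sketch of what such a timing pair could look like; the names time_it and time_me come from the diff, while the print-based sink and internal structure are assumptions.

# Hypothetical sketch in the spirit of metrics_lib.time_it / time_me; the real
# sky/server/metrics.py implementation (+82 -6 above) is not shown in this diff.
import contextlib
import functools
import time


@contextlib.contextmanager
def time_it(name: str, group: str = 'default'):
    """Measure the wall-clock duration of the enclosed block."""
    start = time.perf_counter()
    try:
        yield
    finally:
        duration = time.perf_counter() - start
        # A real implementation would presumably record this into a Prometheus
        # histogram labeled by (name, group) instead of printing it.
        print(f'[{group}] {name} took {duration:.3f}s')


def time_me(func):
    """Decorator form: time a synchronous function under its qualified name."""

    @functools.wraps(func)
    def wrapper(*args, **kwargs):
        with time_it(name=func.__qualname__, group='db'):
            return func(*args, **kwargs)

    return wrapper

A time_me_async counterpart would follow the same shape with an async wrapper, matching how the decorators are applied to the async DB helpers below.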
sky/server/requests/requests.py
CHANGED

@@ -13,7 +13,8 @@ import sqlite3
 import threading
 import time
 import traceback
-from typing import Any, Callable, Dict, Generator, List, Optional, Tuple
+from typing import (Any, AsyncContextManager, Callable, Dict, Generator, List,
+                    Optional, Tuple)

 import colorama
 import filelock

@@ -25,6 +26,7 @@ from sky import skypilot_config
 from sky.server import common as server_common
 from sky.server import constants as server_constants
 from sky.server import daemons
+from sky.server import metrics as metrics_lib
 from sky.server.requests import payloads
 from sky.server.requests.serializers import decoders
 from sky.server.requests.serializers import encoders

@@ -402,26 +404,46 @@ _DB = None
 _init_db_lock = threading.Lock()


+def _init_db_within_lock():
+    global _DB
+    if _DB is None:
+        db_path = os.path.expanduser(
+            server_constants.API_SERVER_REQUEST_DB_PATH)
+        pathlib.Path(db_path).parents[0].mkdir(parents=True, exist_ok=True)
+        _DB = db_utils.SQLiteConn(db_path, create_table)
+
+
 def init_db(func):
     """Initialize the database."""

     @functools.wraps(func)
     def wrapper(*args, **kwargs):
-        global _DB
         if _DB is not None:
             return func(*args, **kwargs)
         with _init_db_lock:
-            if _DB is None:
-                db_path = os.path.expanduser(
-                    server_constants.API_SERVER_REQUEST_DB_PATH)
-                pathlib.Path(db_path).parents[0].mkdir(parents=True,
-                                                       exist_ok=True)
-                _DB = db_utils.SQLiteConn(db_path, create_table)
+            _init_db_within_lock()
         return func(*args, **kwargs)

     return wrapper


+def init_db_async(func):
+    """Async version of init_db."""
+
+    @functools.wraps(func)
+    async def wrapper(*args, **kwargs):
+        if _DB is not None:
+            return await func(*args, **kwargs)
+        # If _DB is not initialized, init_db_async will be blocked if there
+        # is a thread initializing _DB, this is fine since it occurs on process
+        # startup.
+        with _init_db_lock:
+            _init_db_within_lock()
+        return await func(*args, **kwargs)
+
+    return wrapper
+
+
 def reset_db_and_logs():
     """Create the database."""
     server_common.clear_local_api_server_database()

@@ -439,30 +461,66 @@ def request_lock_path(request_id: str) -> str:

 @contextlib.contextmanager
 @init_db
+@metrics_lib.time_me
 def update_request(request_id: str) -> Generator[Optional[Request], None, None]:
-    """Get a SkyPilot API request."""
+    """Get and update a SkyPilot API request."""
     request = _get_request_no_lock(request_id)
     yield request
     if request is not None:
         _add_or_update_request_no_lock(request)


+@init_db
+@metrics_lib.time_me
+def update_request_async(
+        request_id: str) -> AsyncContextManager[Optional[Request]]:
+    """Async version of update_request.
+
+    Returns an async context manager that yields the request record and
+    persists any in-place updates upon exit.
+    """
+
+    @contextlib.asynccontextmanager
+    async def _cm():
+        request = await _get_request_no_lock_async(request_id)
+        try:
+            yield request
+        finally:
+            if request is not None:
+                await _add_or_update_request_no_lock_async(request)
+
+    return _cm()
+
+
+_get_request_sql = (f'SELECT {", ".join(REQUEST_COLUMNS)} FROM {REQUEST_TABLE} '
+                    'WHERE request_id LIKE ?')
+
+
 def _get_request_no_lock(request_id: str) -> Optional[Request]:
     """Get a SkyPilot API request."""
     assert _DB is not None
-    columns_str = ', '.join(REQUEST_COLUMNS)
     with _DB.conn:
         cursor = _DB.conn.cursor()
-        cursor.execute(
-            f'SELECT {columns_str} FROM {REQUEST_TABLE} '
-            'WHERE request_id LIKE ?', (request_id + '%',))
+        cursor.execute(_get_request_sql, (request_id + '%',))
         row = cursor.fetchone()
         if row is None:
             return None
         return Request.from_row(row)


+async def _get_request_no_lock_async(request_id: str) -> Optional[Request]:
+    """Async version of _get_request_no_lock."""
+    assert _DB is not None
+    async with _DB.execute_fetchall_async(_get_request_sql,
+                                          (request_id + '%',)) as rows:
+        row = rows[0] if rows else None
+        if row is None:
+            return None
+        return Request.from_row(row)
+
+
 @init_db
+@metrics_lib.time_me
 def get_latest_request_id() -> Optional[str]:
     """Get the latest request ID."""
     assert _DB is not None

@@ -475,13 +533,23 @@ def get_latest_request_id() -> Optional[str]:


 @init_db
+@metrics_lib.time_me
 def get_request(request_id: str) -> Optional[Request]:
     """Get a SkyPilot API request."""
     with filelock.FileLock(request_lock_path(request_id)):
         return _get_request_no_lock(request_id)


+@init_db_async
+@metrics_lib.time_me_async
+async def get_request_async(request_id: str) -> Optional[Request]:
+    """Async version of get_request."""
+    async with filelock.AsyncFileLock(request_lock_path(request_id)):
+        return await _get_request_no_lock_async(request_id)
+
+
 @init_db
+@metrics_lib.time_me
 def create_if_not_exists(request: Request) -> bool:
     """Create a SkyPilot API request if it does not exist."""
     with filelock.FileLock(request_lock_path(request.request_id)):

@@ -491,7 +559,19 @@ def create_if_not_exists(request: Request) -> bool:
         return True


+@init_db_async
+@metrics_lib.time_me_async
+async def create_if_not_exists_async(request: Request) -> bool:
+    """Async version of create_if_not_exists."""
+    async with filelock.AsyncFileLock(request_lock_path(request.request_id)):
+        if await _get_request_no_lock_async(request.request_id) is not None:
+            return False
+        await _add_or_update_request_no_lock_async(request)
+        return True
+
+
 @init_db
+@metrics_lib.time_me
 def get_request_tasks(
     status: Optional[List[RequestStatus]] = None,
     cluster_names: Optional[List[str]] = None,

@@ -565,16 +645,15 @@ def get_request_tasks(
     return requests


-@init_db
-def get_api_request_ids_start_with(incomplete: str) -> List[str]:
+@init_db_async
+@metrics_lib.time_me_async
+async def get_api_request_ids_start_with(incomplete: str) -> List[str]:
     """Get a list of API request ids for shell completion."""
     assert _DB is not None
-    ... (4 removed lines not shown)
-        cursor.execute(
-            f"""SELECT request_id FROM {REQUEST_TABLE}
+    # Prioritize alive requests (PENDING, RUNNING) over finished ones,
+    # then order by creation time (newest first) within each category.
+    async with _DB.execute_fetchall_async(
+            f"""SELECT request_id FROM {REQUEST_TABLE}
             WHERE request_id LIKE ?
             ORDER BY
             CASE

@@ -582,21 +661,30 @@ def get_api_request_ids_start_with(incomplete: str) -> List[str]:
                 ELSE 1
             END,
             created_at DESC
-            LIMIT 1000""", (f'{incomplete}%',))
-            ...
+            LIMIT 1000""", (f'{incomplete}%',)) as rows:
+        if not rows:
+            return []
+        return [row[0] for row in rows]
+
+
+_add_or_update_request_sql = (f'INSERT OR REPLACE INTO {REQUEST_TABLE} '
+                              f'({", ".join(REQUEST_COLUMNS)}) VALUES '
+                              f'({", ".join(["?"] * len(REQUEST_COLUMNS))})')


 def _add_or_update_request_no_lock(request: Request):
     """Add or update a REST request into the database."""
-    row = request.to_row()
-    key_str = ', '.join(REQUEST_COLUMNS)
-    fill_str = ', '.join(['?'] * len(row))
     assert _DB is not None
     with _DB.conn:
         cursor = _DB.conn.cursor()
-        cursor.execute(
-            ... (2 removed lines not shown)
+        cursor.execute(_add_or_update_request_sql, request.to_row())
+
+
+async def _add_or_update_request_no_lock_async(request: Request):
+    """Async version of _add_or_update_request_no_lock."""
+    assert _DB is not None
+    await _DB.execute_and_commit_async(_add_or_update_request_sql,
+                                       request.to_row())


 def set_request_failed(request_id: str, e: BaseException) -> None:

@@ -630,6 +718,7 @@ def set_request_cancelled(request_id: str) -> None:


 @init_db
+@metrics_lib.time_me
 def _delete_requests(requests: List[Request]):
     """Clean up requests by their IDs."""
     id_list_str = ','.join(repr(req.request_id) for req in requests)
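
The new async accessors (get_request_async, update_request_async, create_if_not_exists_async, and the now-async get_api_request_ids_start_with) mirror their synchronous counterparts so that API server coroutines no longer block the event loop on SQLite queries and file locks. A hedged usage sketch follows; the helper function below is illustrative and not part of the diff, while the accessors and the status_msg field are.

# Illustrative only: how a coroutine could consume the async accessors above.
from sky.server.requests import requests as requests_lib


async def mark_request_note(request_id: str, note: str) -> bool:
    request = await requests_lib.get_request_async(request_id)
    if request is None:
        return False
    # update_request_async returns an async context manager; edits made to the
    # yielded record are persisted when the block exits.
    async with requests_lib.update_request_async(request_id) as req:
        assert req is not None
        req.status_msg = note
    return True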
sky/server/server.py
CHANGED

@@ -68,6 +68,7 @@ from sky.utils import common_utils
 from sky.utils import context
 from sky.utils import context_utils
 from sky.utils import dag_utils
+from sky.utils import perf_utils
 from sky.utils import status_lib
 from sky.utils import subprocess_utils
 from sky.volumes.server import server as volumes_rest

@@ -421,6 +422,28 @@ async def cleanup_upload_ids():
             upload_ids_to_cleanup.pop((upload_id, user_hash))


+async def loop_lag_monitor(loop: asyncio.AbstractEventLoop,
+                           interval: float = 0.1) -> None:
+    target = loop.time() + interval
+
+    pid = str(os.getpid())
+    lag_threshold = perf_utils.get_loop_lag_threshold()
+
+    def tick():
+        nonlocal target
+        now = loop.time()
+        lag = max(0.0, now - target)
+        if lag_threshold is not None and lag > lag_threshold:
+            logger.warning(f'Event loop lag {lag} seconds exceeds threshold '
+                           f'{lag_threshold} seconds.')
+        metrics.SKY_APISERVER_EVENT_LOOP_LAG_SECONDS.labels(
+            pid=pid).observe(lag)
+        target = now + interval
+        loop.call_at(target, tick)
+
+    loop.call_at(target, tick)
+
+
 @contextlib.asynccontextmanager
 async def lifespan(app: fastapi.FastAPI):  # pylint: disable=redefined-outer-name
     """FastAPI lifespan context manager."""

@@ -446,6 +469,10 @@ async def lifespan(app: fastapi.FastAPI):  # pylint: disable=redefined-outer-nam
             # can safely ignore the error if the task is already scheduled.
             logger.debug(f'Request {event.id} already exists.')
     asyncio.create_task(cleanup_upload_ids())
+    if metrics.METRICS_ENABLED:
+        # Start monitoring the event loop lag in each server worker
+        # event loop (process).
+        asyncio.create_task(loop_lag_monitor(asyncio.get_event_loop()))
     yield
     # Shutdown: Add any cleanup code here if needed

@@ -1254,20 +1281,25 @@ async def download(download_body: payloads.DownloadBody,
         logs_dir_on_api_server).expanduser().resolve() / zip_filename

     try:
-        ... (14 removed lines not shown)
+
+        def _zip_files_and_folders(folder_paths, zip_path):
+            folders = [
+                str(folder_path.expanduser().resolve())
+                for folder_path in folder_paths
+            ]
+            # Check for optional query parameter to control zip entry structure
+            relative = request.query_params.get('relative', 'home')
+            if relative == 'items':
+                # Dashboard-friendly: entries relative to selected folders
+                storage_utils.zip_files_and_folders(folders,
+                                                    zip_path,
+                                                    relative_to_items=True)
+            else:
+                # CLI-friendly (default): entries with full paths for mapping
+                storage_utils.zip_files_and_folders(folders, zip_path)
+
+        await context_utils.to_thread(_zip_files_and_folders, folder_paths,
+                                      zip_path)

     # Add home path to the response headers, so that the client can replace
     # the remote path in the zip file to the local path.

@@ -1397,7 +1429,7 @@ async def local_down(request: fastapi.Request) -> None:
 async def api_get(request_id: str) -> payloads.RequestPayload:
     """Gets a request with a given request ID prefix."""
     while True:
-        request_task = requests_lib.get_request(request_id)
+        request_task = await requests_lib.get_request_async(request_id)
         if request_task is None:
             print(f'No task with request ID {request_id}', flush=True)
             raise fastapi.HTTPException(

@@ -1486,7 +1518,7 @@ async def stream(

     # Original plain text streaming logic
     if request_id is not None:
-        request_task = requests_lib.get_request(request_id)
+        request_task = await requests_lib.get_request_async(request_id)
         if request_task is None:
             print(f'No task with request ID {request_id}')
             raise fastapi.HTTPException(

@@ -1581,7 +1613,7 @@ async def api_status(
     else:
         encoded_request_tasks = []
         for request_id in request_ids:
-            request_task = requests_lib.get_request(request_id)
+            request_task = await requests_lib.get_request_async(request_id)
             if request_task is None:
                 continue
             encoded_request_tasks.append(request_task.readable_encode())

@@ -1791,7 +1823,7 @@ async def complete_volume_name(incomplete: str,) -> List[str]:

 @app.get('/api/completion/api_request')
 async def complete_api_request(incomplete: str,) -> List[str]:
-    return requests_lib.get_api_request_ids_start_with(incomplete)
+    return await requests_lib.get_api_request_ids_start_with(incomplete)


 @app.get('/dashboard/{full_path:path}')
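
loop_lag_monitor measures event-loop lag by scheduling a callback with loop.call_at and comparing loop.time() at execution against the intended wake-up time; any positive difference is time the loop spent unable to run callbacks. The standalone sketch below reproduces that mechanism outside SkyPilot; the interval, threshold, and the deliberate time.sleep stall are illustrative values.

# Standalone illustration of the call_at-based lag measurement used above.
import asyncio
import time


async def main() -> None:
    loop = asyncio.get_running_loop()
    interval = 0.1
    target = loop.time() + interval

    def tick() -> None:
        nonlocal target
        now = loop.time()
        lag = max(0.0, now - target)  # how late the callback actually ran
        if lag > 0.05:
            print(f'event loop lagged by {lag * 1000:.1f} ms')
        target = now + interval
        loop.call_at(target, tick)

    loop.call_at(target, tick)

    # A blocking call inside a coroutine stalls the loop and shows up as lag
    # the next time the scheduled tick gets to run.
    time.sleep(0.3)
    await asyncio.sleep(0.5)


asyncio.run(main())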
sky/server/stream_utils.py
CHANGED

@@ -56,7 +56,7 @@ async def log_streamer(request_id: Optional[str],
     if request_id is not None:
         status_msg = rich_utils.EncodedStatusMessage(
             f'[dim]Checking request: {request_id}[/dim]')
-        request_task = requests_lib.get_request(request_id)
+        request_task = await requests_lib.get_request_async(request_id)

         if request_task is None:
             raise fastapi.HTTPException(

@@ -86,10 +86,12 @@ async def log_streamer(request_id: Optional[str],
                 # Use smaller padding (1024 bytes) to force browser rendering
                 yield f'{waiting_msg}' + ' ' * 4096 + '\n'
             # Sleep shortly to avoid storming the DB and CPU and allow other
-            # coroutines to run.
-            # ...
+            # coroutines to run.
+            # TODO(aylei): we should use a better mechanism to avoid busy
+            # polling the DB, which can be a bottleneck for high-concurrency
+            # requests.
             await asyncio.sleep(0.1)
-            request_task = requests_lib.get_request(request_id)
+            request_task = await requests_lib.get_request_async(request_id)
             if not follow:
                 break
             if show_request_waiting_spinner:

@@ -151,7 +153,7 @@ async def _tail_log_file(f: aiofiles.threadpool.binary.AsyncBufferedReader,
         line: Optional[bytes] = await f.readline()
         if not line:
             if request_id is not None:
-                request_task = requests_lib.get_request(request_id)
+                request_task = await requests_lib.get_request_async(request_id)
                 if request_task.status > requests_lib.RequestStatus.RUNNING:
                     if (request_task.status ==
                             requests_lib.RequestStatus.CANCELLED):
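
log_streamer now re-fetches the request record with get_request_async every 0.1 s while waiting, which is exactly the busy polling the new TODO calls out. Distilled to its essentials, the wait loop follows the pattern sketched below; poll_until is an illustrative helper, not SkyPilot code.

# Illustrative poll-until-done loop in the style used by log_streamer.
import asyncio
from typing import Awaitable, Callable, Optional, TypeVar

T = TypeVar('T')


async def poll_until(fetch: Callable[[], Awaitable[Optional[T]]],
                     done: Callable[[T], bool],
                     interval: float = 0.1) -> T:
    """Repeatedly await fetch() until done(result) is true."""
    while True:
        result = await fetch()
        if result is not None and done(result):
            return result
        # Sleep briefly so other coroutines (and the DB) get a break; the TODO
        # in the diff notes this busy polling can become a bottleneck under
        # high concurrency.
        await asyncio.sleep(interval)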
sky/server/uvicorn.py
CHANGED

@@ -24,6 +24,7 @@ from sky.server.requests import requests as requests_lib
 from sky.skylet import constants
 from sky.utils import context_utils
 from sky.utils import env_options
+from sky.utils import perf_utils
 from sky.utils import subprocess_utils

 logger = sky_logging.init_logger(__name__)

@@ -198,6 +199,12 @@ class Server(uvicorn.Server):
         context_utils.hijack_sys_attrs()
         # Use default loop policy of uvicorn (use uvloop if available).
         self.config.setup_event_loop()
+        lag_threshold = perf_utils.get_loop_lag_threshold()
+        if lag_threshold is not None:
+            event_loop = asyncio.get_event_loop()
+            # Same as set PYTHONASYNCIODEBUG=1, but with custom threshold.
+            event_loop.set_debug(True)
+            event_loop.slow_callback_duration = lag_threshold
         with self.capture_signals():
             asyncio.run(self.serve(*args, **kwargs))
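
Enabling event_loop.set_debug(True) with a custom slow_callback_duration makes asyncio itself warn whenever a single callback or task step runs longer than the threshold, complementing the lag histogram recorded in server.py. A minimal demonstration; the 0.1 s threshold stands in for the parsed environment value.

# Demonstrates asyncio's built-in slow-callback logging, which the uvicorn
# change enables with a custom threshold instead of PYTHONASYNCIODEBUG=1.
import asyncio
import logging
import time

logging.basicConfig()  # asyncio reports slow steps via the 'asyncio' logger


async def main() -> None:
    loop = asyncio.get_running_loop()
    loop.set_debug(True)
    loop.slow_callback_duration = 0.1  # seconds, like the parsed threshold
    await asyncio.sleep(0)  # yield once so the next task step is measured
    time.sleep(0.2)  # blocks the loop; asyncio logs the slow step


asyncio.run(main())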
sky/setup_files/dependencies.py
CHANGED

@@ -39,7 +39,8 @@ install_requires = [
     # Light weight requirement, can be replaced with "typing" once
     # we deprecate Python 3.7 (this will take a while).
     'typing_extensions',
-    ...
+    # filelock 3.15.0 or higher is required for async file locking.
+    'filelock >= 3.15.0',
     'packaging',
     'psutil',
     'pulp',

@@ -75,6 +76,7 @@ install_requires = [
     'types-paramiko',
     'alembic',
     'aiohttp',
+    'aiosqlite',
     'anyio',
 ]

@@ -100,6 +102,7 @@ server_dependencies = [
     'anyio',
     GRPC,
     PROTOBUF,
+    'aiosqlite',
 ]

 local_ray = [
sky/skylet/constants.py
CHANGED
sky/utils/db/db_utils.py
CHANGED

@@ -1,11 +1,14 @@
 """Utils for sky databases."""
+import asyncio
 import contextlib
 import enum
 import sqlite3
 import threading
 import typing
-from typing import Any, Callable, Optional
+from typing import Any, Callable, Iterable, Optional

+import aiosqlite
+import aiosqlite.context
 import sqlalchemy
 from sqlalchemy import exc as sqlalchemy_exc

@@ -283,3 +286,63 @@ class SQLiteConn(threading.local):
         self.conn = sqlite3.connect(db_path, timeout=_DB_TIMEOUT_S)
         self.cursor = self.conn.cursor()
         create_table(self.cursor, self.conn)
+        self._async_conn: Optional[aiosqlite.Connection] = None
+        self._async_conn_lock: Optional[asyncio.Lock] = None
+
+    async def _get_async_conn(self) -> aiosqlite.Connection:
+        """Get the shared aiosqlite connection for current thread.
+
+        Typically, external caller should not get the connection directly,
+        instead, SQLiteConn.{operation}_async methods should be used. This
+        is to avoid txn interleaving on the shared aiosqlite connection.
+        E.g.
+        coroutine 1:
+            A: await write(row1)
+            B: cursor = await conn.execute(read_row1)
+            C: await cursor.fetchall()
+        coroutine 2:
+            D: await write(row2)
+            E: cursor = await conn.execute(read_row2)
+            F: await cursor.fetchall()
+        The A -> B -> D -> E -> C time sequence will cause B and D read at the
+        same snapshot point when B started, thus cause coroutine2 lost the
+        read-after-write consistency. When you are adding new async operations
+        to SQLiteConn, make sure the txn pattern does not cause this issue.
+        """
+        # Python 3.8 binds current event loop to asyncio.Lock(), which requires
+        # a loop available in current thread. Lazy-init the lock to avoid this
+        # dependency. The correctness is guranteed since SQLiteConn is
+        # thread-local so there is no race condition between check and init.
+        if self._async_conn_lock is None:
+            self._async_conn_lock = asyncio.Lock()
+        if self._async_conn is None:
+            async with self._async_conn_lock:
+                if self._async_conn is None:
+                    # Init logic like requests.init_db_within_lock will handle
+                    # initialization like setting the WAL mode, so we do not
+                    # duplicate that logic here.
+                    self._async_conn = await aiosqlite.connect(self.db_path)
+        return self._async_conn
+
+    async def execute_and_commit_async(self,
+                                       sql: str,
+                                       parameters: Optional[
+                                           Iterable[Any]] = None) -> None:
+        """Execute the sql and commit the transaction in a sync block."""
+        conn = await self._get_async_conn()
+
+        def exec_and_commit(sql: str, parameters: Optional[Iterable[Any]]):
+            # pylint: disable=protected-access
+            conn._conn.execute(sql, parameters)
+            conn._conn.commit()
+
+        # pylint: disable=protected-access
+        await conn._execute(exec_and_commit, sql, parameters)
+
+    @aiosqlite.context.contextmanager
+    async def execute_fetchall_async(self,
+                                     sql: str,
+                                     parameters: Optional[Iterable[Any]] = None
+                                    ) -> Iterable[sqlite3.Row]:
+        conn = await self._get_async_conn()
+        return await conn.execute_fetchall(sql, parameters)
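
The aiosqlite-backed helpers keep one lazily opened async connection per SQLiteConn (which is thread-local) and are consumed either awaited or as async context managers, as requests.py does above. A rough usage sketch against a throwaway database; the table schema, file path, and key/value contents are illustrative, while the SQLiteConn constructor and the two async methods come from the diff.

# Illustrative use of the async helpers added to SQLiteConn above.
import asyncio

from sky.utils.db import db_utils


def _create_table(cursor, conn):  # schema here is purely illustrative
    cursor.execute('CREATE TABLE IF NOT EXISTS kv (k TEXT PRIMARY KEY, v TEXT)')
    conn.commit()


async def main() -> None:
    conn = db_utils.SQLiteConn('/tmp/example.db', _create_table)
    await conn.execute_and_commit_async(
        'INSERT OR REPLACE INTO kv (k, v) VALUES (?, ?)', ('answer', '42'))
    # execute_fetchall_async is wrapped with aiosqlite's contextmanager helper,
    # so it can be used with `async with ... as rows`.
    async with conn.execute_fetchall_async('SELECT v FROM kv WHERE k = ?',
                                           ('answer',)) as rows:
        print([row[0] for row in rows])


asyncio.run(main())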
sky/utils/perf_utils.py
ADDED

@@ -0,0 +1,22 @@
+"""Utility functions for performance monitoring."""
+import os
+from typing import Optional
+
+from sky import sky_logging
+from sky.skylet import constants
+
+logger = sky_logging.init_logger(__name__)
+
+
+def get_loop_lag_threshold() -> Optional[float]:
+    """Get the loop lag threshold from the environment variable."""
+    lag_threshold = os.getenv(constants.ENV_VAR_LOOP_LAG_THRESHOLD_MS, None)
+    if lag_threshold is not None:
+        try:
+            return float(lag_threshold) / 1000.0
+        except ValueError:
+            logger.warning(
+                f'Invalid value for {constants.ENV_VAR_LOOP_LAG_THRESHOLD_MS}:'
+                f' {lag_threshold}')
+            return None
+    return None
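
get_loop_lag_threshold reads a millisecond value from the environment variable named by constants.ENV_VAR_LOOP_LAG_THRESHOLD_MS (the constant itself is added in sky/skylet/constants.py, whose hunk is collapsed above) and returns seconds, or None when the variable is unset or malformed. A small hedged check of that behaviour; the concrete variable name is not spelled out in this diff, so it is referenced only through the constant.

# Sketch: exercising get_loop_lag_threshold by setting the env var it reads.
import os

from sky.skylet import constants
from sky.utils import perf_utils

os.environ[constants.ENV_VAR_LOOP_LAG_THRESHOLD_MS] = '250'
assert perf_utils.get_loop_lag_threshold() == 0.25  # 250 ms -> 0.25 s

os.environ[constants.ENV_VAR_LOOP_LAG_THRESHOLD_MS] = 'not-a-number'
assert perf_utils.get_loop_lag_threshold() is None  # warns and returns None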