skypilot-nightly 1.0.0.dev20250831__py3-none-any.whl → 1.0.0.dev20250902__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of skypilot-nightly might be problematic. Click here for more details.

Files changed (45)
  1. sky/__init__.py +2 -2
  2. sky/dashboard/out/404.html +1 -1
  3. sky/dashboard/out/_next/static/chunks/3015-8089ed1e0b7e37fd.js +1 -0
  4. sky/dashboard/out/_next/static/chunks/webpack-0eaa6f7e63f51311.js +1 -0
  5. sky/dashboard/out/_next/static/{FtHzmn6BMJ5PzqHhEY51g → tio0QibqY2C0F2-rPy00p}/_buildManifest.js +1 -1
  6. sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
  7. sky/dashboard/out/clusters/[cluster].html +1 -1
  8. sky/dashboard/out/clusters.html +1 -1
  9. sky/dashboard/out/config.html +1 -1
  10. sky/dashboard/out/index.html +1 -1
  11. sky/dashboard/out/infra/[context].html +1 -1
  12. sky/dashboard/out/infra.html +1 -1
  13. sky/dashboard/out/jobs/[job].html +1 -1
  14. sky/dashboard/out/jobs/pools/[pool].html +1 -1
  15. sky/dashboard/out/jobs.html +1 -1
  16. sky/dashboard/out/users.html +1 -1
  17. sky/dashboard/out/volumes.html +1 -1
  18. sky/dashboard/out/workspace/new.html +1 -1
  19. sky/dashboard/out/workspaces/[name].html +1 -1
  20. sky/dashboard/out/workspaces.html +1 -1
  21. sky/global_user_state.py +67 -0
  22. sky/jobs/server/server.py +2 -1
  23. sky/serve/server/server.py +2 -1
  24. sky/server/auth/oauth2_proxy.py +6 -0
  25. sky/server/common.py +8 -6
  26. sky/server/metrics.py +82 -6
  27. sky/server/requests/executor.py +6 -2
  28. sky/server/requests/preconditions.py +3 -2
  29. sky/server/requests/requests.py +118 -29
  30. sky/server/server.py +50 -18
  31. sky/server/stream_utils.py +7 -5
  32. sky/server/uvicorn.py +7 -0
  33. sky/setup_files/dependencies.py +4 -1
  34. sky/skylet/constants.py +3 -0
  35. sky/utils/db/db_utils.py +64 -1
  36. sky/utils/perf_utils.py +22 -0
  37. {skypilot_nightly-1.0.0.dev20250831.dist-info → skypilot_nightly-1.0.0.dev20250902.dist-info}/METADATA +38 -35
  38. {skypilot_nightly-1.0.0.dev20250831.dist-info → skypilot_nightly-1.0.0.dev20250902.dist-info}/RECORD +43 -42
  39. sky/dashboard/out/_next/static/chunks/3015-6c9c09593b1e67b6.js +0 -1
  40. sky/dashboard/out/_next/static/chunks/webpack-6e76f636a048e145.js +0 -1
  41. sky/dashboard/out/_next/static/{FtHzmn6BMJ5PzqHhEY51g → tio0QibqY2C0F2-rPy00p}/_ssgManifest.js +0 -0
  42. {skypilot_nightly-1.0.0.dev20250831.dist-info → skypilot_nightly-1.0.0.dev20250902.dist-info}/WHEEL +0 -0
  43. {skypilot_nightly-1.0.0.dev20250831.dist-info → skypilot_nightly-1.0.0.dev20250902.dist-info}/entry_points.txt +0 -0
  44. {skypilot_nightly-1.0.0.dev20250831.dist-info → skypilot_nightly-1.0.0.dev20250902.dist-info}/licenses/LICENSE +0 -0
  45. {skypilot_nightly-1.0.0.dev20250831.dist-info → skypilot_nightly-1.0.0.dev20250902.dist-info}/top_level.txt +0 -0
sky/server/metrics.py CHANGED
@@ -1,5 +1,7 @@
1
1
  """Instrumentation for the API server."""
2
2
 
3
+ import contextlib
4
+ import functools
3
5
  import os
4
6
  import time
5
7
 
@@ -11,11 +13,16 @@ import starlette.middleware.base
11
13
  import uvicorn
12
14
 
13
15
  from sky import sky_logging
16
+ from sky.skylet import constants
17
+
18
+ # Whether the metrics are enabled, cannot be changed at runtime.
19
+ METRICS_ENABLED = os.environ.get(constants.ENV_VAR_SERVER_METRICS_ENABLED,
20
+ 'false').lower() == 'true'
14
21
 
15
22
  logger = sky_logging.init_logger(__name__)
16
23
 
17
24
  # Total number of API server requests, grouped by path, method, and status.
18
- sky_apiserver_requests_total = prom.Counter(
25
+ SKY_APISERVER_REQUESTS_TOTAL = prom.Counter(
19
26
  'sky_apiserver_requests_total',
20
27
  'Total number of API server requests',
21
28
  ['path', 'method', 'status'],
@@ -23,14 +30,40 @@ sky_apiserver_requests_total = prom.Counter(
23
30
 
24
31
  # Time spent processing API server requests, grouped by path, method, and
25
32
  # status.
26
- sky_apiserver_request_duration_seconds = prom.Histogram(
33
+ SKY_APISERVER_REQUEST_DURATION_SECONDS = prom.Histogram(
27
34
  'sky_apiserver_request_duration_seconds',
28
35
  'Time spent processing API server requests',
29
36
  ['path', 'method', 'status'],
30
- buckets=(0.01, 0.025, 0.05, 0.1, 0.25, 0.5, 1.0, 2.5, 5.0, 10.0,
37
+ buckets=(0.01, 0.025, 0.05, 0.1, 0.25, 0.5, 1.0, 2.5, 5.0, 10.0, 20.0, 30.0,
38
+ 60.0, 120.0, float('inf')),
39
+ )
40
+
41
+ # Time spent processing requests in executor.
42
+ SKY_APISERVER_REQUEST_EXECUTION_DURATION_SECONDS = prom.Histogram(
43
+ 'sky_apiserver_request_execution_duration_seconds',
44
+ 'Time spent executing requests in executor',
45
+ ['request', 'worker'],
46
+ buckets=(0.5, 1, 2.5, 5.0, 10.0, 15.0, 25.0, 40.0, 60.0, 90.0, 120.0, 180.0,
31
47
  float('inf')),
32
48
  )
33
49
 
50
+ # Time spent processing a piece of code, refer to time_it().
51
+ SKY_APISERVER_CODE_DURATION_SECONDS = prom.Histogram(
52
+ 'sky_apiserver_code_duration_seconds',
53
+ 'Time spent processing code',
54
+ ['name', 'group'],
55
+ buckets=(0.01, 0.025, 0.05, 0.1, 0.25, 0.5, 1.0, 2.5, 5.0, 10.0, 20.0, 30.0,
56
+ 60.0, 120.0, float('inf')),
57
+ )
58
+
59
+ SKY_APISERVER_EVENT_LOOP_LAG_SECONDS = prom.Histogram(
60
+ 'sky_apiserver_event_loop_lag_seconds',
61
+ 'Scheduling delay of the server event loop',
62
+ ['pid'],
63
+ buckets=(0.001, 0.005, 0.01, 0.025, 0.05, 0.1, 0.25, 0.5, 1, 2, 5, 20.0,
64
+ 60.0, float('inf')),
65
+ )
66
+
34
67
  metrics_app = fastapi.FastAPI()
35
68
 
36
69
 
@@ -76,7 +109,7 @@ class PrometheusMiddleware(starlette.middleware.base.BaseHTTPMiddleware):
76
109
 
77
110
  async def dispatch(self, request: fastapi.Request, call_next):
78
111
  path = request.url.path
79
- logger.info(f'PROM Middleware Request: {request}, {request.url.path}')
112
+ logger.debug(f'PROM Middleware Request: {request}, {request.url.path}')
80
113
  streaming = _is_streaming_api(path)
81
114
  if not streaming:
82
115
  # Exclude streaming APIs, the duration is not meaningful.
@@ -92,13 +125,56 @@ class PrometheusMiddleware(starlette.middleware.base.BaseHTTPMiddleware):
92
125
  status_code_group = '5xx'
93
126
  raise
94
127
  finally:
95
- sky_apiserver_requests_total.labels(path=path,
128
+ SKY_APISERVER_REQUESTS_TOTAL.labels(path=path,
96
129
  method=method,
97
130
  status=status_code_group).inc()
98
131
  if not streaming:
99
132
  duration = time.time() - start_time
100
- sky_apiserver_request_duration_seconds.labels(
133
+ SKY_APISERVER_REQUEST_DURATION_SECONDS.labels(
101
134
  path=path, method=method,
102
135
  status=status_code_group).observe(duration)
103
136
 
104
137
  return response
138
+
139
+
140
+ @contextlib.contextmanager
141
+ def time_it(name: str, group: str = 'default'):
142
+ """Context manager to measure and record code execution duration."""
143
+ if not METRICS_ENABLED:
144
+ yield
145
+ else:
146
+ start_time = time.time()
147
+ try:
148
+ yield
149
+ finally:
150
+ duration = time.time() - start_time
151
+ SKY_APISERVER_CODE_DURATION_SECONDS.labels(
152
+ name=name, group=group).observe(duration)
153
+
154
+
155
+ def time_me(func):
156
+ """Measure the duration of decorated function."""
157
+
158
+ @functools.wraps(func)
159
+ def wrapper(*args, **kwargs):
160
+ if not METRICS_ENABLED:
161
+ return func(*args, **kwargs)
162
+ name = f'{func.__module__}/{func.__name__}'
163
+ with time_it(name, group='function'):
164
+ return func(*args, **kwargs)
165
+
166
+ return wrapper
167
+
168
+
169
+ def time_me_async(func):
170
+ """Measure the duration of decorated async function."""
171
+
172
+ @functools.wraps(func)
173
+ async def async_wrapper(*args, **kwargs):
174
+ if not METRICS_ENABLED:
175
+ return await func(*args, **kwargs)
176
+ name = f'{func.__module__}/{func.__name__}'
177
+ with time_it(name, group='function'):
178
+ return await func(*args, **kwargs)
179
+
180
+ return async_wrapper
sky/server/requests/executor.py CHANGED
@@ -41,6 +41,7 @@ from sky import skypilot_config
41
41
  from sky.server import common as server_common
42
42
  from sky.server import config as server_config
43
43
  from sky.server import constants as server_constants
44
+ from sky.server import metrics as metrics_lib
44
45
  from sky.server.requests import payloads
45
46
  from sky.server.requests import preconditions
46
47
  from sky.server.requests import process
@@ -373,6 +374,7 @@ def _request_execution_wrapper(request_id: str,
373
374
  request_task.status = api_requests.RequestStatus.RUNNING
374
375
  func = request_task.entrypoint
375
376
  request_body = request_task.request_body
377
+ request_name = request_task.name
376
378
 
377
379
  # Append to the log file instead of overwriting it since there might be
378
380
  # logs from previous retries.
@@ -390,7 +392,9 @@ def _request_execution_wrapper(request_id: str,
390
392
  config = skypilot_config.to_dict()
391
393
  logger.debug(f'request config: \n'
392
394
  f'{yaml_utils.dump_yaml_str(dict(config))}')
393
- return_value = func(**request_body.to_kwargs())
395
+ with metrics_lib.time_it(name=request_name,
396
+ group='request_execution'):
397
+ return_value = func(**request_body.to_kwargs())
394
398
  f.flush()
395
399
  except KeyboardInterrupt:
396
400
  logger.info(f'Request {request_id} cancelled by user')
@@ -453,7 +457,7 @@ async def execute_request_coroutine(request: api_requests.Request):
453
457
  **request_body.to_kwargs())
454
458
 
455
459
  async def poll_task(request_id: str) -> bool:
456
- request = api_requests.get_request(request_id)
460
+ request = await api_requests.get_request_async(request_id)
457
461
  if request is None:
458
462
  raise RuntimeError('Request not found')
459
463
 
sky/server/requests/preconditions.py CHANGED
@@ -98,7 +98,7 @@ class Precondition(abc.ABC):
98
98
  return False
99
99
 
100
100
  # Check if the request has been cancelled
101
- request = api_requests.get_request(self.request_id)
101
+ request = await api_requests.get_request_async(self.request_id)
102
102
  if request is None:
103
103
  logger.error(f'Request {self.request_id} not found')
104
104
  return False
@@ -112,7 +112,8 @@ class Precondition(abc.ABC):
112
112
  return True
113
113
  if status_msg is not None and status_msg != last_status_msg:
114
114
  # Update the status message if it has changed.
115
- with api_requests.update_request(self.request_id) as req:
115
+ async with api_requests.update_request_async(
116
+ self.request_id) as req:
116
117
  assert req is not None, self.request_id
117
118
  req.status_msg = status_msg
118
119
  last_status_msg = status_msg
sky/server/requests/requests.py CHANGED
@@ -13,7 +13,8 @@ import sqlite3
13
13
  import threading
14
14
  import time
15
15
  import traceback
16
- from typing import Any, Callable, Dict, Generator, List, Optional, Tuple
16
+ from typing import (Any, AsyncContextManager, Callable, Dict, Generator, List,
17
+ Optional, Tuple)
17
18
 
18
19
  import colorama
19
20
  import filelock
@@ -25,6 +26,7 @@ from sky import skypilot_config
25
26
  from sky.server import common as server_common
26
27
  from sky.server import constants as server_constants
27
28
  from sky.server import daemons
29
+ from sky.server import metrics as metrics_lib
28
30
  from sky.server.requests import payloads
29
31
  from sky.server.requests.serializers import decoders
30
32
  from sky.server.requests.serializers import encoders
@@ -402,26 +404,46 @@ _DB = None
402
404
  _init_db_lock = threading.Lock()
403
405
 
404
406
 
407
+ def _init_db_within_lock():
408
+ global _DB
409
+ if _DB is None:
410
+ db_path = os.path.expanduser(
411
+ server_constants.API_SERVER_REQUEST_DB_PATH)
412
+ pathlib.Path(db_path).parents[0].mkdir(parents=True, exist_ok=True)
413
+ _DB = db_utils.SQLiteConn(db_path, create_table)
414
+
415
+
405
416
  def init_db(func):
406
417
  """Initialize the database."""
407
418
 
408
419
  @functools.wraps(func)
409
420
  def wrapper(*args, **kwargs):
410
- global _DB
411
421
  if _DB is not None:
412
422
  return func(*args, **kwargs)
413
423
  with _init_db_lock:
414
- if _DB is None:
415
- db_path = os.path.expanduser(
416
- server_constants.API_SERVER_REQUEST_DB_PATH)
417
- pathlib.Path(db_path).parents[0].mkdir(parents=True,
418
- exist_ok=True)
419
- _DB = db_utils.SQLiteConn(db_path, create_table)
424
+ _init_db_within_lock()
420
425
  return func(*args, **kwargs)
421
426
 
422
427
  return wrapper
423
428
 
424
429
 
430
+ def init_db_async(func):
431
+ """Async version of init_db."""
432
+
433
+ @functools.wraps(func)
434
+ async def wrapper(*args, **kwargs):
435
+ if _DB is not None:
436
+ return await func(*args, **kwargs)
437
+ # If _DB is not initialized, init_db_async will be blocked if there
438
+ # is a thread initializing _DB, this is fine since it occurs on process
439
+ # startup.
440
+ with _init_db_lock:
441
+ _init_db_within_lock()
442
+ return await func(*args, **kwargs)
443
+
444
+ return wrapper
445
+
446
+
425
447
  def reset_db_and_logs():
426
448
  """Create the database."""
427
449
  server_common.clear_local_api_server_database()
@@ -439,30 +461,66 @@ def request_lock_path(request_id: str) -> str:
439
461
 
440
462
  @contextlib.contextmanager
441
463
  @init_db
464
+ @metrics_lib.time_me
442
465
  def update_request(request_id: str) -> Generator[Optional[Request], None, None]:
443
- """Get a SkyPilot API request."""
466
+ """Get and update a SkyPilot API request."""
444
467
  request = _get_request_no_lock(request_id)
445
468
  yield request
446
469
  if request is not None:
447
470
  _add_or_update_request_no_lock(request)
448
471
 
449
472
 
473
+ @init_db
474
+ @metrics_lib.time_me
475
+ def update_request_async(
476
+ request_id: str) -> AsyncContextManager[Optional[Request]]:
477
+ """Async version of update_request.
478
+
479
+ Returns an async context manager that yields the request record and
480
+ persists any in-place updates upon exit.
481
+ """
482
+
483
+ @contextlib.asynccontextmanager
484
+ async def _cm():
485
+ request = await _get_request_no_lock_async(request_id)
486
+ try:
487
+ yield request
488
+ finally:
489
+ if request is not None:
490
+ await _add_or_update_request_no_lock_async(request)
491
+
492
+ return _cm()
493
+
494
+
495
+ _get_request_sql = (f'SELECT {", ".join(REQUEST_COLUMNS)} FROM {REQUEST_TABLE} '
496
+ 'WHERE request_id LIKE ?')
497
+
498
+
450
499
  def _get_request_no_lock(request_id: str) -> Optional[Request]:
451
500
  """Get a SkyPilot API request."""
452
501
  assert _DB is not None
453
- columns_str = ', '.join(REQUEST_COLUMNS)
454
502
  with _DB.conn:
455
503
  cursor = _DB.conn.cursor()
456
- cursor.execute(
457
- f'SELECT {columns_str} FROM {REQUEST_TABLE} '
458
- 'WHERE request_id LIKE ?', (request_id + '%',))
504
+ cursor.execute(_get_request_sql, (request_id + '%',))
459
505
  row = cursor.fetchone()
460
506
  if row is None:
461
507
  return None
462
508
  return Request.from_row(row)
463
509
 
464
510
 
511
+ async def _get_request_no_lock_async(request_id: str) -> Optional[Request]:
512
+ """Async version of _get_request_no_lock."""
513
+ assert _DB is not None
514
+ async with _DB.execute_fetchall_async(_get_request_sql,
515
+ (request_id + '%',)) as rows:
516
+ row = rows[0] if rows else None
517
+ if row is None:
518
+ return None
519
+ return Request.from_row(row)
520
+
521
+
465
522
  @init_db
523
+ @metrics_lib.time_me
466
524
  def get_latest_request_id() -> Optional[str]:
467
525
  """Get the latest request ID."""
468
526
  assert _DB is not None
@@ -475,13 +533,23 @@ def get_latest_request_id() -> Optional[str]:
475
533
 
476
534
 
477
535
  @init_db
536
+ @metrics_lib.time_me
478
537
  def get_request(request_id: str) -> Optional[Request]:
479
538
  """Get a SkyPilot API request."""
480
539
  with filelock.FileLock(request_lock_path(request_id)):
481
540
  return _get_request_no_lock(request_id)
482
541
 
483
542
 
543
+ @init_db_async
544
+ @metrics_lib.time_me_async
545
+ async def get_request_async(request_id: str) -> Optional[Request]:
546
+ """Async version of get_request."""
547
+ async with filelock.AsyncFileLock(request_lock_path(request_id)):
548
+ return await _get_request_no_lock_async(request_id)
549
+
550
+
484
551
  @init_db
552
+ @metrics_lib.time_me
485
553
  def create_if_not_exists(request: Request) -> bool:
486
554
  """Create a SkyPilot API request if it does not exist."""
487
555
  with filelock.FileLock(request_lock_path(request.request_id)):
@@ -491,7 +559,19 @@ def create_if_not_exists(request: Request) -> bool:
491
559
  return True
492
560
 
493
561
 
562
+ @init_db_async
563
+ @metrics_lib.time_me_async
564
+ async def create_if_not_exists_async(request: Request) -> bool:
565
+ """Async version of create_if_not_exists."""
566
+ async with filelock.AsyncFileLock(request_lock_path(request.request_id)):
567
+ if await _get_request_no_lock_async(request.request_id) is not None:
568
+ return False
569
+ await _add_or_update_request_no_lock_async(request)
570
+ return True
571
+
572
+
494
573
  @init_db
574
+ @metrics_lib.time_me
495
575
  def get_request_tasks(
496
576
  status: Optional[List[RequestStatus]] = None,
497
577
  cluster_names: Optional[List[str]] = None,
@@ -565,16 +645,15 @@ def get_request_tasks(
565
645
  return requests
566
646
 
567
647
 
568
- @init_db
569
- def get_api_request_ids_start_with(incomplete: str) -> List[str]:
648
+ @init_db_async
649
+ @metrics_lib.time_me_async
650
+ async def get_api_request_ids_start_with(incomplete: str) -> List[str]:
570
651
  """Get a list of API request ids for shell completion."""
571
652
  assert _DB is not None
572
- with _DB.conn:
573
- cursor = _DB.conn.cursor()
574
- # Prioritize alive requests (PENDING, RUNNING) over finished ones,
575
- # then order by creation time (newest first) within each category.
576
- cursor.execute(
577
- f"""SELECT request_id FROM {REQUEST_TABLE}
653
+ # Prioritize alive requests (PENDING, RUNNING) over finished ones,
654
+ # then order by creation time (newest first) within each category.
655
+ async with _DB.execute_fetchall_async(
656
+ f"""SELECT request_id FROM {REQUEST_TABLE}
578
657
  WHERE request_id LIKE ?
579
658
  ORDER BY
580
659
  CASE
@@ -582,21 +661,30 @@ def get_api_request_ids_start_with(incomplete: str) -> List[str]:
582
661
  ELSE 1
583
662
  END,
584
663
  created_at DESC
585
- LIMIT 1000""", (f'{incomplete}%',))
586
- return [row[0] for row in cursor.fetchall()]
664
+ LIMIT 1000""", (f'{incomplete}%',)) as rows:
665
+ if not rows:
666
+ return []
667
+ return [row[0] for row in rows]
668
+
669
+
670
+ _add_or_update_request_sql = (f'INSERT OR REPLACE INTO {REQUEST_TABLE} '
671
+ f'({", ".join(REQUEST_COLUMNS)}) VALUES '
672
+ f'({", ".join(["?"] * len(REQUEST_COLUMNS))})')
587
673
 
588
674
 
589
675
  def _add_or_update_request_no_lock(request: Request):
590
676
  """Add or update a REST request into the database."""
591
- row = request.to_row()
592
- key_str = ', '.join(REQUEST_COLUMNS)
593
- fill_str = ', '.join(['?'] * len(row))
594
677
  assert _DB is not None
595
678
  with _DB.conn:
596
679
  cursor = _DB.conn.cursor()
597
- cursor.execute(
598
- f'INSERT OR REPLACE INTO {REQUEST_TABLE} ({key_str}) '
599
- f'VALUES ({fill_str})', row)
680
+ cursor.execute(_add_or_update_request_sql, request.to_row())
681
+
682
+
683
+ async def _add_or_update_request_no_lock_async(request: Request):
684
+ """Async version of _add_or_update_request_no_lock."""
685
+ assert _DB is not None
686
+ await _DB.execute_and_commit_async(_add_or_update_request_sql,
687
+ request.to_row())
600
688
 
601
689
 
602
690
  def set_request_failed(request_id: str, e: BaseException) -> None:
@@ -630,6 +718,7 @@ def set_request_cancelled(request_id: str) -> None:
630
718
 
631
719
 
632
720
  @init_db
721
+ @metrics_lib.time_me
633
722
  def _delete_requests(requests: List[Request]):
634
723
  """Clean up requests by their IDs."""
635
724
  id_list_str = ','.join(repr(req.request_id) for req in requests)
sky/server/server.py CHANGED
@@ -68,6 +68,7 @@ from sky.utils import common_utils
68
68
  from sky.utils import context
69
69
  from sky.utils import context_utils
70
70
  from sky.utils import dag_utils
71
+ from sky.utils import perf_utils
71
72
  from sky.utils import status_lib
72
73
  from sky.utils import subprocess_utils
73
74
  from sky.volumes.server import server as volumes_rest
@@ -421,6 +422,28 @@ async def cleanup_upload_ids():
421
422
  upload_ids_to_cleanup.pop((upload_id, user_hash))
422
423
 
423
424
 
425
+ async def loop_lag_monitor(loop: asyncio.AbstractEventLoop,
426
+ interval: float = 0.1) -> None:
427
+ target = loop.time() + interval
428
+
429
+ pid = str(os.getpid())
430
+ lag_threshold = perf_utils.get_loop_lag_threshold()
431
+
432
+ def tick():
433
+ nonlocal target
434
+ now = loop.time()
435
+ lag = max(0.0, now - target)
436
+ if lag_threshold is not None and lag > lag_threshold:
437
+ logger.warning(f'Event loop lag {lag} seconds exceeds threshold '
438
+ f'{lag_threshold} seconds.')
439
+ metrics.SKY_APISERVER_EVENT_LOOP_LAG_SECONDS.labels(
440
+ pid=pid).observe(lag)
441
+ target = now + interval
442
+ loop.call_at(target, tick)
443
+
444
+ loop.call_at(target, tick)
445
+
446
+
424
447
  @contextlib.asynccontextmanager
425
448
  async def lifespan(app: fastapi.FastAPI): # pylint: disable=redefined-outer-name
426
449
  """FastAPI lifespan context manager."""
@@ -446,6 +469,10 @@ async def lifespan(app: fastapi.FastAPI): # pylint: disable=redefined-outer-nam
446
469
  # can safely ignore the error if the task is already scheduled.
447
470
  logger.debug(f'Request {event.id} already exists.')
448
471
  asyncio.create_task(cleanup_upload_ids())
472
+ if metrics.METRICS_ENABLED:
473
+ # Start monitoring the event loop lag in each server worker
474
+ # event loop (process).
475
+ asyncio.create_task(loop_lag_monitor(asyncio.get_event_loop()))
449
476
  yield
450
477
  # Shutdown: Add any cleanup code here if needed
451
478
 
@@ -1254,20 +1281,25 @@ async def download(download_body: payloads.DownloadBody,
1254
1281
  logs_dir_on_api_server).expanduser().resolve() / zip_filename
1255
1282
 
1256
1283
  try:
1257
- folders = [
1258
- str(folder_path.expanduser().resolve())
1259
- for folder_path in folder_paths
1260
- ]
1261
- # Check for optional query parameter to control zip entry structure
1262
- relative = request.query_params.get('relative', 'home')
1263
- if relative == 'items':
1264
- # Dashboard-friendly: entries relative to selected folders
1265
- storage_utils.zip_files_and_folders(folders,
1266
- zip_path,
1267
- relative_to_items=True)
1268
- else:
1269
- # CLI-friendly (default): entries with full paths for mapping
1270
- storage_utils.zip_files_and_folders(folders, zip_path)
1284
+
1285
+ def _zip_files_and_folders(folder_paths, zip_path):
1286
+ folders = [
1287
+ str(folder_path.expanduser().resolve())
1288
+ for folder_path in folder_paths
1289
+ ]
1290
+ # Check for optional query parameter to control zip entry structure
1291
+ relative = request.query_params.get('relative', 'home')
1292
+ if relative == 'items':
1293
+ # Dashboard-friendly: entries relative to selected folders
1294
+ storage_utils.zip_files_and_folders(folders,
1295
+ zip_path,
1296
+ relative_to_items=True)
1297
+ else:
1298
+ # CLI-friendly (default): entries with full paths for mapping
1299
+ storage_utils.zip_files_and_folders(folders, zip_path)
1300
+
1301
+ await context_utils.to_thread(_zip_files_and_folders, folder_paths,
1302
+ zip_path)
1271
1303
 
1272
1304
  # Add home path to the response headers, so that the client can replace
1273
1305
  # the remote path in the zip file to the local path.
@@ -1397,7 +1429,7 @@ async def local_down(request: fastapi.Request) -> None:
1397
1429
  async def api_get(request_id: str) -> payloads.RequestPayload:
1398
1430
  """Gets a request with a given request ID prefix."""
1399
1431
  while True:
1400
- request_task = requests_lib.get_request(request_id)
1432
+ request_task = await requests_lib.get_request_async(request_id)
1401
1433
  if request_task is None:
1402
1434
  print(f'No task with request ID {request_id}', flush=True)
1403
1435
  raise fastapi.HTTPException(
@@ -1486,7 +1518,7 @@ async def stream(
1486
1518
 
1487
1519
  # Original plain text streaming logic
1488
1520
  if request_id is not None:
1489
- request_task = requests_lib.get_request(request_id)
1521
+ request_task = await requests_lib.get_request_async(request_id)
1490
1522
  if request_task is None:
1491
1523
  print(f'No task with request ID {request_id}')
1492
1524
  raise fastapi.HTTPException(
@@ -1581,7 +1613,7 @@ async def api_status(
1581
1613
  else:
1582
1614
  encoded_request_tasks = []
1583
1615
  for request_id in request_ids:
1584
- request_task = requests_lib.get_request(request_id)
1616
+ request_task = await requests_lib.get_request_async(request_id)
1585
1617
  if request_task is None:
1586
1618
  continue
1587
1619
  encoded_request_tasks.append(request_task.readable_encode())
@@ -1791,7 +1823,7 @@ async def complete_volume_name(incomplete: str,) -> List[str]:
1791
1823
 
1792
1824
  @app.get('/api/completion/api_request')
1793
1825
  async def complete_api_request(incomplete: str,) -> List[str]:
1794
- return requests_lib.get_api_request_ids_start_with(incomplete)
1826
+ return await requests_lib.get_api_request_ids_start_with(incomplete)
1795
1827
 
1796
1828
 
1797
1829
  @app.get('/dashboard/{full_path:path}')
sky/server/stream_utils.py CHANGED
@@ -56,7 +56,7 @@ async def log_streamer(request_id: Optional[str],
56
56
  if request_id is not None:
57
57
  status_msg = rich_utils.EncodedStatusMessage(
58
58
  f'[dim]Checking request: {request_id}[/dim]')
59
- request_task = requests_lib.get_request(request_id)
59
+ request_task = await requests_lib.get_request_async(request_id)
60
60
 
61
61
  if request_task is None:
62
62
  raise fastapi.HTTPException(
@@ -86,10 +86,12 @@ async def log_streamer(request_id: Optional[str],
86
86
  # Use smaller padding (1024 bytes) to force browser rendering
87
87
  yield f'{waiting_msg}' + ' ' * 4096 + '\n'
88
88
  # Sleep shortly to avoid storming the DB and CPU and allow other
89
- # coroutines to run. This busy waiting loop is performance critical
90
- # for short-running requests, so we do not want to yield too long.
89
+ # coroutines to run.
90
+ # TODO(aylei): we should use a better mechanism to avoid busy
91
+ # polling the DB, which can be a bottleneck for high-concurrency
92
+ # requests.
91
93
  await asyncio.sleep(0.1)
92
- request_task = requests_lib.get_request(request_id)
94
+ request_task = await requests_lib.get_request_async(request_id)
93
95
  if not follow:
94
96
  break
95
97
  if show_request_waiting_spinner:
@@ -151,7 +153,7 @@ async def _tail_log_file(f: aiofiles.threadpool.binary.AsyncBufferedReader,
151
153
  line: Optional[bytes] = await f.readline()
152
154
  if not line:
153
155
  if request_id is not None:
154
- request_task = requests_lib.get_request(request_id)
156
+ request_task = await requests_lib.get_request_async(request_id)
155
157
  if request_task.status > requests_lib.RequestStatus.RUNNING:
156
158
  if (request_task.status ==
157
159
  requests_lib.RequestStatus.CANCELLED):
sky/server/uvicorn.py CHANGED
@@ -24,6 +24,7 @@ from sky.server.requests import requests as requests_lib
24
24
  from sky.skylet import constants
25
25
  from sky.utils import context_utils
26
26
  from sky.utils import env_options
27
+ from sky.utils import perf_utils
27
28
  from sky.utils import subprocess_utils
28
29
 
29
30
  logger = sky_logging.init_logger(__name__)
@@ -198,6 +199,12 @@ class Server(uvicorn.Server):
198
199
  context_utils.hijack_sys_attrs()
199
200
  # Use default loop policy of uvicorn (use uvloop if available).
200
201
  self.config.setup_event_loop()
202
+ lag_threshold = perf_utils.get_loop_lag_threshold()
203
+ if lag_threshold is not None:
204
+ event_loop = asyncio.get_event_loop()
205
+ # Same as set PYTHONASYNCIODEBUG=1, but with custom threshold.
206
+ event_loop.set_debug(True)
207
+ event_loop.slow_callback_duration = lag_threshold
201
208
  with self.capture_signals():
202
209
  asyncio.run(self.serve(*args, **kwargs))
203
210
 
sky/setup_files/dependencies.py CHANGED
@@ -39,7 +39,8 @@ install_requires = [
39
39
  # Light weight requirement, can be replaced with "typing" once
40
40
  # we deprecate Python 3.7 (this will take a while).
41
41
  'typing_extensions',
42
- 'filelock >= 3.6.0',
42
+ # filelock 3.15.0 or higher is required for async file locking.
43
+ 'filelock >= 3.15.0',
43
44
  'packaging',
44
45
  'psutil',
45
46
  'pulp',
@@ -75,6 +76,7 @@ install_requires = [
75
76
  'types-paramiko',
76
77
  'alembic',
77
78
  'aiohttp',
79
+ 'aiosqlite',
78
80
  'anyio',
79
81
  ]
80
82
 
@@ -100,6 +102,7 @@ server_dependencies = [
100
102
  'anyio',
101
103
  GRPC,
102
104
  PROTOBUF,
105
+ 'aiosqlite',
103
106
  ]
104
107
 
105
108
  local_ray = [
sky/skylet/constants.py CHANGED
@@ -505,3 +505,6 @@ COST_REPORT_DEFAULT_DAYS = 30
505
505
 
506
506
  # The directory for file locks.
507
507
  SKY_LOCKS_DIR = os.path.expanduser('~/.sky/locks')
508
+
509
+ ENV_VAR_LOOP_LAG_THRESHOLD_MS = (SKYPILOT_ENV_VAR_PREFIX +
510
+ 'DEBUG_LOOP_LAG_THRESHOLD_MS')