skypilot-nightly 1.0.0.dev20250828__py3-none-any.whl → 1.0.0.dev20250829__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of skypilot-nightly might be problematic. Click here for more details.

Files changed (56)
  1. sky/__init__.py +2 -2
  2. sky/backends/backend_utils.py +130 -40
  3. sky/backends/cloud_vm_ray_backend.py +19 -3
  4. sky/backends/wheel_utils.py +35 -8
  5. sky/clouds/aws.py +118 -1
  6. sky/dashboard/out/404.html +1 -1
  7. sky/dashboard/out/_next/static/chunks/{webpack-6dae1cd599a34def.js → webpack-6e76f636a048e145.js} +1 -1
  8. sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
  9. sky/dashboard/out/clusters/[cluster].html +1 -1
  10. sky/dashboard/out/clusters.html +1 -1
  11. sky/dashboard/out/config.html +1 -1
  12. sky/dashboard/out/index.html +1 -1
  13. sky/dashboard/out/infra/[context].html +1 -1
  14. sky/dashboard/out/infra.html +1 -1
  15. sky/dashboard/out/jobs/[job].html +1 -1
  16. sky/dashboard/out/jobs/pools/[pool].html +1 -1
  17. sky/dashboard/out/jobs.html +1 -1
  18. sky/dashboard/out/users.html +1 -1
  19. sky/dashboard/out/volumes.html +1 -1
  20. sky/dashboard/out/workspace/new.html +1 -1
  21. sky/dashboard/out/workspaces/[name].html +1 -1
  22. sky/dashboard/out/workspaces.html +1 -1
  23. sky/global_user_state.py +58 -10
  24. sky/jobs/server/server.py +2 -1
  25. sky/provision/aws/config.py +78 -3
  26. sky/provision/aws/instance.py +45 -6
  27. sky/provision/kubernetes/utils.py +9 -0
  28. sky/schemas/db/global_user_state/007_cluster_event_request_id.py +34 -0
  29. sky/serve/server/server.py +2 -1
  30. sky/server/common.py +1 -2
  31. sky/server/daemons.py +6 -0
  32. sky/server/requests/executor.py +3 -2
  33. sky/server/requests/payloads.py +3 -1
  34. sky/server/requests/preconditions.py +3 -2
  35. sky/server/requests/requests.py +110 -29
  36. sky/server/server.py +70 -61
  37. sky/server/stream_utils.py +7 -5
  38. sky/setup_files/dependencies.py +6 -1
  39. sky/sky_logging.py +28 -0
  40. sky/skylet/constants.py +6 -0
  41. sky/templates/aws-ray.yml.j2 +1 -0
  42. sky/utils/annotations.py +8 -2
  43. sky/utils/cluster_utils.py +3 -3
  44. sky/utils/db/db_utils.py +11 -0
  45. sky/utils/db/migration_utils.py +1 -1
  46. sky/utils/kubernetes_enums.py +1 -0
  47. sky/utils/lock_events.py +94 -0
  48. sky/utils/timeline.py +24 -93
  49. {skypilot_nightly-1.0.0.dev20250828.dist-info → skypilot_nightly-1.0.0.dev20250829.dist-info}/METADATA +8 -2
  50. {skypilot_nightly-1.0.0.dev20250828.dist-info → skypilot_nightly-1.0.0.dev20250829.dist-info}/RECORD +56 -54
  51. /sky/dashboard/out/_next/static/{9DW6d9jaP2kZt0NcgIfFa → hYJYFIxp_ZFONR4wTIJqZ}/_buildManifest.js +0 -0
  52. /sky/dashboard/out/_next/static/{9DW6d9jaP2kZt0NcgIfFa → hYJYFIxp_ZFONR4wTIJqZ}/_ssgManifest.js +0 -0
  53. {skypilot_nightly-1.0.0.dev20250828.dist-info → skypilot_nightly-1.0.0.dev20250829.dist-info}/WHEEL +0 -0
  54. {skypilot_nightly-1.0.0.dev20250828.dist-info → skypilot_nightly-1.0.0.dev20250829.dist-info}/entry_points.txt +0 -0
  55. {skypilot_nightly-1.0.0.dev20250828.dist-info → skypilot_nightly-1.0.0.dev20250829.dist-info}/licenses/LICENSE +0 -0
  56. {skypilot_nightly-1.0.0.dev20250828.dist-info → skypilot_nightly-1.0.0.dev20250829.dist-info}/top_level.txt +0 -0
sky/server/daemons.py CHANGED
@@ -7,8 +7,10 @@ from typing import Callable
7
7
  from sky import sky_logging
8
8
  from sky import skypilot_config
9
9
  from sky.server import constants as server_constants
10
+ from sky.utils import annotations
10
11
  from sky.utils import common
11
12
  from sky.utils import env_options
13
+ from sky.utils import timeline
12
14
  from sky.utils import ux_utils
13
15
 
14
16
  logger = sky_logging.init_logger(__name__)
@@ -67,6 +69,10 @@ class InternalRequestDaemon:
67
69
  sky_logging.reload_logger()
68
70
  level = self.refresh_log_level()
69
71
  self.event_fn()
72
+ # Clear request level cache after each run to avoid
73
+ # using too much memory.
74
+ annotations.clear_request_level_cache()
75
+ timeline.save_timeline()
70
76
  except Exception: # pylint: disable=broad-except
71
77
  # It is OK to fail to run the event, as the event is not
72
78
  # critical, but we should log the error.
@@ -383,7 +383,8 @@ def _request_execution_wrapper(request_id: str,
383
383
  # config, as there can be some logs during override that needs to be
384
384
  # captured in the log file.
385
385
  try:
386
- with override_request_env_and_config(request_body, request_id), \
386
+ with sky_logging.add_debug_log_handler(request_id), \
387
+ override_request_env_and_config(request_body, request_id), \
387
388
  tempstore.tempdir():
388
389
  if sky_logging.logging_enabled(logger, sky_logging.DEBUG):
389
390
  config = skypilot_config.to_dict()
@@ -452,7 +453,7 @@ async def execute_request_coroutine(request: api_requests.Request):
452
453
  **request_body.to_kwargs())
453
454
 
454
455
  async def poll_task(request_id: str) -> bool:
455
- request = api_requests.get_request(request_id)
456
+ request = await api_requests.get_request_async(request_id)
456
457
  if request is None:
457
458
  raise RuntimeError('Request not found')
458
459
 
@@ -71,7 +71,9 @@ EXTERNAL_LOCAL_ENV_VARS = [
71
71
  def request_body_env_vars() -> dict:
72
72
  env_vars = {}
73
73
  for env_var in os.environ:
74
- if env_var.startswith(constants.SKYPILOT_ENV_VAR_PREFIX):
74
+ if (env_var.startswith(constants.SKYPILOT_ENV_VAR_PREFIX) and
75
+ not env_var.startswith(
76
+ constants.SKYPILOT_SERVER_ENV_VAR_PREFIX)):
75
77
  env_vars[env_var] = os.environ[env_var]
76
78
  if common.is_api_server_local() and env_var in EXTERNAL_LOCAL_ENV_VARS:
77
79
  env_vars[env_var] = os.environ[env_var]
@@ -98,7 +98,7 @@ class Precondition(abc.ABC):
98
98
  return False
99
99
 
100
100
  # Check if the request has been cancelled
101
- request = api_requests.get_request(self.request_id)
101
+ request = await api_requests.get_request_async(self.request_id)
102
102
  if request is None:
103
103
  logger.error(f'Request {self.request_id} not found')
104
104
  return False
@@ -112,7 +112,8 @@ class Precondition(abc.ABC):
112
112
  return True
113
113
  if status_msg is not None and status_msg != last_status_msg:
114
114
  # Update the status message if it has changed.
115
- with api_requests.update_request(self.request_id) as req:
115
+ async with api_requests.update_request_async(
116
+ self.request_id) as req:
116
117
  assert req is not None, self.request_id
117
118
  req.status_msg = status_msg
118
119
  last_status_msg = status_msg
@@ -13,7 +13,8 @@ import sqlite3
13
13
  import threading
14
14
  import time
15
15
  import traceback
16
- from typing import Any, Callable, Dict, Generator, List, Optional, Tuple
16
+ from typing import (Any, AsyncContextManager, Callable, Dict, Generator, List,
17
+ Optional, Tuple)
17
18
 
18
19
  import colorama
19
20
  import filelock
@@ -402,26 +403,46 @@ _DB = None
402
403
  _init_db_lock = threading.Lock()
403
404
 
404
405
 
406
+ def _init_db_within_lock():
407
+ global _DB
408
+ if _DB is None:
409
+ db_path = os.path.expanduser(
410
+ server_constants.API_SERVER_REQUEST_DB_PATH)
411
+ pathlib.Path(db_path).parents[0].mkdir(parents=True, exist_ok=True)
412
+ _DB = db_utils.SQLiteConn(db_path, create_table)
413
+
414
+
405
415
  def init_db(func):
406
416
  """Initialize the database."""
407
417
 
408
418
  @functools.wraps(func)
409
419
  def wrapper(*args, **kwargs):
410
- global _DB
411
420
  if _DB is not None:
412
421
  return func(*args, **kwargs)
413
422
  with _init_db_lock:
414
- if _DB is None:
415
- db_path = os.path.expanduser(
416
- server_constants.API_SERVER_REQUEST_DB_PATH)
417
- pathlib.Path(db_path).parents[0].mkdir(parents=True,
418
- exist_ok=True)
419
- _DB = db_utils.SQLiteConn(db_path, create_table)
423
+ _init_db_within_lock()
420
424
  return func(*args, **kwargs)
421
425
 
422
426
  return wrapper
423
427
 
424
428
 
429
+ def init_db_async(func):
430
+ """Async version of init_db."""
431
+
432
+ @functools.wraps(func)
433
+ async def wrapper(*args, **kwargs):
434
+ if _DB is not None:
435
+ return await func(*args, **kwargs)
436
+ # If _DB is not initialized, init_db_async will be blocked if there
437
+ # is a thread initializing _DB, this is fine since it occurs on process
438
+ # startup.
439
+ with _init_db_lock:
440
+ _init_db_within_lock()
441
+ return await func(*args, **kwargs)
442
+
443
+ return wrapper
444
+
445
+
425
446
  def reset_db_and_logs():
426
447
  """Create the database."""
427
448
  server_common.clear_local_api_server_database()
@@ -440,28 +461,61 @@ def request_lock_path(request_id: str) -> str:
440
461
  @contextlib.contextmanager
441
462
  @init_db
442
463
  def update_request(request_id: str) -> Generator[Optional[Request], None, None]:
443
- """Get a SkyPilot API request."""
464
+ """Get and update a SkyPilot API request."""
444
465
  request = _get_request_no_lock(request_id)
445
466
  yield request
446
467
  if request is not None:
447
468
  _add_or_update_request_no_lock(request)
448
469
 
449
470
 
471
+ @init_db
472
+ def update_request_async(
473
+ request_id: str) -> AsyncContextManager[Optional[Request]]:
474
+ """Async version of update_request.
475
+
476
+ Returns an async context manager that yields the request record and
477
+ persists any in-place updates upon exit.
478
+ """
479
+
480
+ @contextlib.asynccontextmanager
481
+ async def _cm():
482
+ request = await _get_request_no_lock_async(request_id)
483
+ try:
484
+ yield request
485
+ finally:
486
+ if request is not None:
487
+ await _add_or_update_request_no_lock_async(request)
488
+
489
+ return _cm()
490
+
491
+
492
+ _get_request_sql = (f'SELECT {", ".join(REQUEST_COLUMNS)} FROM {REQUEST_TABLE} '
493
+ 'WHERE request_id LIKE ?')
494
+
495
+
450
496
  def _get_request_no_lock(request_id: str) -> Optional[Request]:
451
497
  """Get a SkyPilot API request."""
452
498
  assert _DB is not None
453
- columns_str = ', '.join(REQUEST_COLUMNS)
454
499
  with _DB.conn:
455
500
  cursor = _DB.conn.cursor()
456
- cursor.execute(
457
- f'SELECT {columns_str} FROM {REQUEST_TABLE} '
458
- 'WHERE request_id LIKE ?', (request_id + '%',))
501
+ cursor.execute(_get_request_sql, (request_id + '%',))
459
502
  row = cursor.fetchone()
460
503
  if row is None:
461
504
  return None
462
505
  return Request.from_row(row)
463
506
 
464
507
 
508
+ async def _get_request_no_lock_async(request_id: str) -> Optional[Request]:
509
+ """Async version of _get_request_no_lock."""
510
+ assert _DB is not None
511
+ conn = await _DB.async_conn()
512
+ async with conn.execute(_get_request_sql, (request_id + '%',)) as cursor:
513
+ row = await cursor.fetchone()
514
+ if row is None:
515
+ return None
516
+ return Request.from_row(row)
517
+
518
+
465
519
  @init_db
466
520
  def get_latest_request_id() -> Optional[str]:
467
521
  """Get the latest request ID."""
@@ -481,6 +535,13 @@ def get_request(request_id: str) -> Optional[Request]:
481
535
  return _get_request_no_lock(request_id)
482
536
 
483
537
 
538
+ @init_db_async
539
+ async def get_request_async(request_id: str) -> Optional[Request]:
540
+ """Async version of get_request."""
541
+ async with filelock.AsyncFileLock(request_lock_path(request_id)):
542
+ return await _get_request_no_lock_async(request_id)
543
+
544
+
484
545
  @init_db
485
546
  def create_if_not_exists(request: Request) -> bool:
486
547
  """Create a SkyPilot API request if it does not exist."""
@@ -491,6 +552,16 @@ def create_if_not_exists(request: Request) -> bool:
491
552
  return True
492
553
 
493
554
 
555
+ @init_db_async
556
+ async def create_if_not_exists_async(request: Request) -> bool:
557
+ """Async version of create_if_not_exists."""
558
+ async with filelock.AsyncFileLock(request_lock_path(request.request_id)):
559
+ if await _get_request_no_lock_async(request.request_id) is not None:
560
+ return False
561
+ await _add_or_update_request_no_lock_async(request)
562
+ return True
563
+
564
+
494
565
  @init_db
495
566
  def get_request_tasks(
496
567
  status: Optional[List[RequestStatus]] = None,
@@ -565,16 +636,15 @@ def get_request_tasks(
565
636
  return requests
566
637
 
567
638
 
568
- @init_db
569
- def get_api_request_ids_start_with(incomplete: str) -> List[str]:
639
+ @init_db_async
640
+ async def get_api_request_ids_start_with(incomplete: str) -> List[str]:
570
641
  """Get a list of API request ids for shell completion."""
571
642
  assert _DB is not None
572
- with _DB.conn:
573
- cursor = _DB.conn.cursor()
574
- # Prioritize alive requests (PENDING, RUNNING) over finished ones,
575
- # then order by creation time (newest first) within each category.
576
- cursor.execute(
577
- f"""SELECT request_id FROM {REQUEST_TABLE}
643
+ conn = await _DB.async_conn()
644
+ # Prioritize alive requests (PENDING, RUNNING) over finished ones,
645
+ # then order by creation time (newest first) within each category.
646
+ async with conn.execute(
647
+ f"""SELECT request_id FROM {REQUEST_TABLE}
578
648
  WHERE request_id LIKE ?
579
649
  ORDER BY
580
650
  CASE
@@ -582,21 +652,32 @@ def get_api_request_ids_start_with(incomplete: str) -> List[str]:
582
652
  ELSE 1
583
653
  END,
584
654
  created_at DESC
585
- LIMIT 1000""", (f'{incomplete}%',))
586
- return [row[0] for row in cursor.fetchall()]
655
+ LIMIT 1000""", (f'{incomplete}%',)) as cursor:
656
+ rows = await cursor.fetchall()
657
+ if rows is None:
658
+ return []
659
+ return [row[0] for row in rows]
660
+
661
+
662
+ _add_or_update_request_sql = (f'INSERT OR REPLACE INTO {REQUEST_TABLE} '
663
+ f'({", ".join(REQUEST_COLUMNS)}) VALUES '
664
+ f'({", ".join(["?"] * len(REQUEST_COLUMNS))})')
587
665
 
588
666
 
589
667
  def _add_or_update_request_no_lock(request: Request):
590
668
  """Add or update a REST request into the database."""
591
- row = request.to_row()
592
- key_str = ', '.join(REQUEST_COLUMNS)
593
- fill_str = ', '.join(['?'] * len(row))
594
669
  assert _DB is not None
595
670
  with _DB.conn:
596
671
  cursor = _DB.conn.cursor()
597
- cursor.execute(
598
- f'INSERT OR REPLACE INTO {REQUEST_TABLE} ({key_str}) '
599
- f'VALUES ({fill_str})', row)
672
+ cursor.execute(_add_or_update_request_sql, request.to_row())
673
+
674
+
675
+ async def _add_or_update_request_no_lock_async(request: Request):
676
+ """Async version of _add_or_update_request_no_lock."""
677
+ assert _DB is not None
678
+ conn = await _DB.async_conn()
679
+ await conn.execute(_add_or_update_request_sql, request.to_row())
680
+ await conn.commit()
600
681
 
601
682
 
602
683
  def set_request_failed(request_id: str, e: BaseException) -> None:
sky/server/server.py CHANGED
@@ -21,6 +21,7 @@ import uuid
21
21
  import zipfile
22
22
 
23
23
  import aiofiles
24
+ import anyio
24
25
  import fastapi
25
26
  from fastapi.middleware import cors
26
27
  import starlette.middleware.base
@@ -847,7 +848,7 @@ async def upload_zip_file(request: fastapi.Request, user_hash: str,
847
848
  client_file_mounts_dir = (
848
849
  common.API_SERVER_CLIENT_DIR.expanduser().resolve() / user_id /
849
850
  'file_mounts')
850
- client_file_mounts_dir.mkdir(parents=True, exist_ok=True)
851
+ await anyio.Path(client_file_mounts_dir).mkdir(parents=True, exist_ok=True)
851
852
 
852
853
  # Check upload_id to be a valid SkyPilot run_timestamp appended with 8 hex
853
854
  # characters, e.g. 'sky-2025-01-17-09-10-13-933602-35d31c22'.
@@ -870,7 +871,7 @@ async def upload_zip_file(request: fastapi.Request, user_hash: str,
870
871
  zip_file_path = client_file_mounts_dir / f'{upload_id}.zip'
871
872
  else:
872
873
  chunk_dir = client_file_mounts_dir / upload_id
873
- chunk_dir.mkdir(parents=True, exist_ok=True)
874
+ await anyio.Path(chunk_dir).mkdir(parents=True, exist_ok=True)
874
875
  zip_file_path = chunk_dir / f'part{chunk_index}.incomplete'
875
876
 
876
877
  try:
@@ -916,9 +917,9 @@ async def upload_zip_file(request: fastapi.Request, user_hash: str,
916
917
  await zip_file.write(data)
917
918
 
918
919
  logger.info(f'Uploaded zip file: {zip_file_path}')
919
- unzip_file(zip_file_path, client_file_mounts_dir)
920
+ await unzip_file(zip_file_path, client_file_mounts_dir)
920
921
  if total_chunks > 1:
921
- shutil.rmtree(chunk_dir)
922
+ await context_utils.to_thread(shutil.rmtree, chunk_dir)
922
923
  return payloads.UploadZipFileResponse(
923
924
  status=responses.UploadStatus.COMPLETED.value)
924
925
 
@@ -933,61 +934,69 @@ def _is_relative_to(path: pathlib.Path, parent: pathlib.Path) -> bool:
933
934
  return False
934
935
 
935
936
 
936
- def unzip_file(zip_file_path: pathlib.Path,
937
- client_file_mounts_dir: pathlib.Path) -> None:
938
- """Unzips a zip file."""
939
- try:
940
- with zipfile.ZipFile(zip_file_path, 'r') as zipf:
941
- for member in zipf.infolist():
942
- # Determine the new path
943
- original_path = os.path.normpath(member.filename)
944
- new_path = client_file_mounts_dir / original_path.lstrip('/')
945
-
946
- if (member.external_attr >> 28) == 0xA:
947
- # Symlink. Read the target path and create a symlink.
937
+ async def unzip_file(zip_file_path: pathlib.Path,
938
+ client_file_mounts_dir: pathlib.Path) -> None:
939
+ """Unzips a zip file without blocking the event loop."""
940
+
941
+ def _do_unzip() -> None:
942
+ try:
943
+ with zipfile.ZipFile(zip_file_path, 'r') as zipf:
944
+ for member in zipf.infolist():
945
+ # Determine the new path
946
+ original_path = os.path.normpath(member.filename)
947
+ new_path = client_file_mounts_dir / original_path.lstrip(
948
+ '/')
949
+
950
+ if (member.external_attr >> 28) == 0xA:
951
+ # Symlink. Read the target path and create a symlink.
952
+ new_path.parent.mkdir(parents=True, exist_ok=True)
953
+ target = zipf.read(member).decode()
954
+ assert not os.path.isabs(target), target
955
+ # Since target is a relative path, we need to check that
956
+ # it is under `client_file_mounts_dir` for security.
957
+ full_target_path = (new_path.parent / target).resolve()
958
+ if not _is_relative_to(full_target_path,
959
+ client_file_mounts_dir):
960
+ raise ValueError(
961
+ f'Symlink target {target} leads to a '
962
+ 'file not in userspace. Aborted.')
963
+
964
+ if new_path.exists() or new_path.is_symlink():
965
+ new_path.unlink(missing_ok=True)
966
+ new_path.symlink_to(
967
+ target,
968
+ target_is_directory=member.filename.endswith('/'))
969
+ continue
970
+
971
+ # Handle directories
972
+ if member.filename.endswith('/'):
973
+ new_path.mkdir(parents=True, exist_ok=True)
974
+ continue
975
+
976
+ # Handle files
948
977
  new_path.parent.mkdir(parents=True, exist_ok=True)
949
- target = zipf.read(member).decode()
950
- assert not os.path.isabs(target), target
951
- # Since target is a relative path, we need to check that it
952
- # is under `client_file_mounts_dir` for security.
953
- full_target_path = (new_path.parent / target).resolve()
954
- if not _is_relative_to(full_target_path,
955
- client_file_mounts_dir):
956
- raise ValueError(f'Symlink target {target} leads to a '
957
- 'file not in userspace. Aborted.')
958
-
959
- if new_path.exists() or new_path.is_symlink():
960
- new_path.unlink(missing_ok=True)
961
- new_path.symlink_to(
962
- target,
963
- target_is_directory=member.filename.endswith('/'))
964
- continue
965
-
966
- # Handle directories
967
- if member.filename.endswith('/'):
968
- new_path.mkdir(parents=True, exist_ok=True)
969
- continue
970
-
971
- # Handle files
972
- new_path.parent.mkdir(parents=True, exist_ok=True)
973
- with zipf.open(member) as member_file, new_path.open('wb') as f:
974
- # Use shutil.copyfileobj to copy files in chunks, so it does
975
- # not load the entire file into memory.
976
- shutil.copyfileobj(member_file, f)
977
- except zipfile.BadZipFile as e:
978
- logger.error(f'Bad zip file: {zip_file_path}')
979
- raise fastapi.HTTPException(
980
- status_code=400,
981
- detail=f'Invalid zip file: {common_utils.format_exception(e)}')
982
- except Exception as e:
983
- logger.error(f'Error unzipping file: {zip_file_path}')
984
- raise fastapi.HTTPException(
985
- status_code=500,
986
- detail=(f'Error unzipping file: '
987
- f'{common_utils.format_exception(e)}'))
978
+ with zipf.open(member) as member_file, new_path.open(
979
+ 'wb') as f:
980
+ # Use shutil.copyfileobj to copy files in chunks,
981
+ # so it does not load the entire file into memory.
982
+ shutil.copyfileobj(member_file, f)
983
+ except zipfile.BadZipFile as e:
984
+ logger.error(f'Bad zip file: {zip_file_path}')
985
+ raise fastapi.HTTPException(
986
+ status_code=400,
987
+ detail=f'Invalid zip file: {common_utils.format_exception(e)}')
988
+ except Exception as e:
989
+ logger.error(f'Error unzipping file: {zip_file_path}')
990
+ raise fastapi.HTTPException(
991
+ status_code=500,
992
+ detail=(f'Error unzipping file: '
993
+ f'{common_utils.format_exception(e)}'))
994
+ finally:
995
+ # Cleanup the temporary file regardless of
996
+ # success/failure handling above
997
+ zip_file_path.unlink(missing_ok=True)
988
998
 
989
- # Cleanup the temporary file
990
- zip_file_path.unlink()
999
+ await context_utils.to_thread(_do_unzip)
991
1000
 
992
1001
 
993
1002
  @app.post('/launch')
@@ -1388,7 +1397,7 @@ async def local_down(request: fastapi.Request) -> None:
1388
1397
  async def api_get(request_id: str) -> payloads.RequestPayload:
1389
1398
  """Gets a request with a given request ID prefix."""
1390
1399
  while True:
1391
- request_task = requests_lib.get_request(request_id)
1400
+ request_task = await requests_lib.get_request_async(request_id)
1392
1401
  if request_task is None:
1393
1402
  print(f'No task with request ID {request_id}', flush=True)
1394
1403
  raise fastapi.HTTPException(
@@ -1477,7 +1486,7 @@ async def stream(
1477
1486
 
1478
1487
  # Original plain text streaming logic
1479
1488
  if request_id is not None:
1480
- request_task = requests_lib.get_request(request_id)
1489
+ request_task = await requests_lib.get_request_async(request_id)
1481
1490
  if request_task is None:
1482
1491
  print(f'No task with request ID {request_id}')
1483
1492
  raise fastapi.HTTPException(
@@ -1572,7 +1581,7 @@ async def api_status(
1572
1581
  else:
1573
1582
  encoded_request_tasks = []
1574
1583
  for request_id in request_ids:
1575
- request_task = requests_lib.get_request(request_id)
1584
+ request_task = await requests_lib.get_request_async(request_id)
1576
1585
  if request_task is None:
1577
1586
  continue
1578
1587
  encoded_request_tasks.append(request_task.readable_encode())
@@ -1782,7 +1791,7 @@ async def complete_volume_name(incomplete: str,) -> List[str]:
1782
1791
 
1783
1792
  @app.get('/api/completion/api_request')
1784
1793
  async def complete_api_request(incomplete: str,) -> List[str]:
1785
- return requests_lib.get_api_request_ids_start_with(incomplete)
1794
+ return await requests_lib.get_api_request_ids_start_with(incomplete)
1786
1795
 
1787
1796
 
1788
1797
  @app.get('/dashboard/{full_path:path}')
@@ -56,7 +56,7 @@ async def log_streamer(request_id: Optional[str],
56
56
  if request_id is not None:
57
57
  status_msg = rich_utils.EncodedStatusMessage(
58
58
  f'[dim]Checking request: {request_id}[/dim]')
59
- request_task = requests_lib.get_request(request_id)
59
+ request_task = await requests_lib.get_request_async(request_id)
60
60
 
61
61
  if request_task is None:
62
62
  raise fastapi.HTTPException(
@@ -86,10 +86,12 @@ async def log_streamer(request_id: Optional[str],
86
86
  # Pad the message (4096 spaces) to force browser rendering
87
87
  yield f'{waiting_msg}' + ' ' * 4096 + '\n'
88
88
  # Sleep shortly to avoid storming the DB and CPU and allow other
89
- # coroutines to run. This busy waiting loop is performance critical
90
- # for short-running requests, so we do not want to yield too long.
89
+ # coroutines to run.
90
+ # TODO(aylei): we should use a better mechanism to avoid busy
91
+ # polling the DB, which can be a bottleneck for high-concurrency
92
+ # requests.
91
93
  await asyncio.sleep(0.1)
92
- request_task = requests_lib.get_request(request_id)
94
+ request_task = await requests_lib.get_request_async(request_id)
93
95
  if not follow:
94
96
  break
95
97
  if show_request_waiting_spinner:
@@ -151,7 +153,7 @@ async def _tail_log_file(f: aiofiles.threadpool.binary.AsyncBufferedReader,
151
153
  line: Optional[bytes] = await f.readline()
152
154
  if not line:
153
155
  if request_id is not None:
154
- request_task = requests_lib.get_request(request_id)
156
+ request_task = await requests_lib.get_request_async(request_id)
155
157
  if request_task.status > requests_lib.RequestStatus.RUNNING:
156
158
  if (request_task.status ==
157
159
  requests_lib.RequestStatus.CANCELLED):
@@ -35,7 +35,8 @@ install_requires = [
35
35
  # Light weight requirement, can be replaced with "typing" once
36
36
  # we deprecate Python 3.7 (this will take a while).
37
37
  'typing_extensions',
38
- 'filelock >= 3.6.0',
38
+ # filelock 3.15.0 or higher is required for async file locking.
39
+ 'filelock >= 3.15.0',
39
40
  'packaging',
40
41
  'psutil',
41
42
  'pulp',
@@ -71,6 +72,8 @@ install_requires = [
71
72
  'types-paramiko',
72
73
  'alembic',
73
74
  'aiohttp',
75
+ 'aiosqlite',
76
+ 'anyio',
74
77
  ]
75
78
 
76
79
  # See requirements-dev.txt for the version of grpc and protobuf
@@ -92,8 +95,10 @@ server_dependencies = [
92
95
  'passlib',
93
96
  'pyjwt',
94
97
  'aiohttp',
98
+ 'anyio',
95
99
  GRPC,
96
100
  PROTOBUF,
101
+ 'aiosqlite',
97
102
  ]
98
103
 
99
104
  local_ray = [
sky/sky_logging.py CHANGED
@@ -19,6 +19,9 @@ _FORMAT = '%(levelname).1s %(asctime)s %(filename)s:%(lineno)d] %(message)s'
19
19
  _DATE_FORMAT = '%m-%d %H:%M:%S'
20
20
  _SENSITIVE_LOGGER = ['sky.provisioner', 'sky.optimizer']
21
21
 
22
+ _DEBUG_LOG_DIR = os.path.expanduser(
23
+ os.path.join(constants.SKY_LOGS_DIRECTORY, 'request_debug'))
24
+
22
25
  DEBUG = logging.DEBUG
23
26
  INFO = logging.INFO
24
27
  WARNING = logging.WARNING
@@ -254,3 +257,28 @@ def generate_tmp_logging_file_path(file_name: str) -> str:
254
257
  log_path = os.path.expanduser(os.path.join(log_dir, file_name))
255
258
 
256
259
  return log_path
260
+
261
+
262
+ @contextlib.contextmanager
263
+ def add_debug_log_handler(request_id: str):
264
+ if os.getenv(constants.ENV_VAR_ENABLE_REQUEST_DEBUG_LOGGING) != 'true':
265
+ yield
266
+ return
267
+
268
+ os.makedirs(_DEBUG_LOG_DIR, exist_ok=True)
269
+ log_path = os.path.join(_DEBUG_LOG_DIR, f'{request_id}.log')
270
+ try:
271
+ debug_log_handler = logging.FileHandler(log_path)
272
+ debug_log_handler.setFormatter(FORMATTER)
273
+ debug_log_handler.setLevel(logging.DEBUG)
274
+ _root_logger.addHandler(debug_log_handler)
275
+ # sky.provision sets up its own logger/handler with propogate=False,
276
+ # so add it there too.
277
+ provision_logger = logging.getLogger('sky.provision')
278
+ provision_logger.addHandler(debug_log_handler)
279
+ provision_logger.setLevel(logging.DEBUG)
280
+ yield
281
+ finally:
282
+ _root_logger.removeHandler(debug_log_handler)
283
+ provision_logger.removeHandler(debug_log_handler)
284
+ debug_log_handler.close()
sky/skylet/constants.py CHANGED
@@ -70,6 +70,7 @@ DEACTIVATE_SKY_REMOTE_PYTHON_ENV = (
70
70
 
71
71
  # Prefix for SkyPilot environment variables
72
72
  SKYPILOT_ENV_VAR_PREFIX = 'SKYPILOT_'
73
+ SKYPILOT_SERVER_ENV_VAR_PREFIX = 'SKYPILOT_SERVER_'
73
74
 
74
75
  # The name for the environment variable that stores the unique ID of the
75
76
  # current task. This will stay the same across multiple recoveries of the
@@ -417,6 +418,7 @@ LOCAL_SKYPILOT_CONFIG_PATH_PLACEHOLDER = 'skypilot:local_skypilot_config_path'
417
418
  # Path to the generated cluster config yamls and ssh configs.
418
419
  SKY_USER_FILE_PATH = '~/.sky/generated'
419
420
 
421
+ # TODO(cooperc): Update all env vars to begin with SKYPILOT_ or SKYPILOT_SERVER_
420
422
  # Environment variable that is set to 'true' if this is a skypilot server.
421
423
  ENV_VAR_IS_SKYPILOT_SERVER = 'IS_SKYPILOT_SERVER'
422
424
 
@@ -436,6 +438,10 @@ ENV_VAR_ENABLE_BASIC_AUTH = 'ENABLE_BASIC_AUTH'
436
438
  SKYPILOT_INITIAL_BASIC_AUTH = 'SKYPILOT_INITIAL_BASIC_AUTH'
437
439
  ENV_VAR_ENABLE_SERVICE_ACCOUNTS = 'ENABLE_SERVICE_ACCOUNTS'
438
440
 
441
+ # Enable debug logging for requests.
442
+ ENV_VAR_ENABLE_REQUEST_DEBUG_LOGGING = (
443
+ f'{SKYPILOT_SERVER_ENV_VAR_PREFIX}ENABLE_REQUEST_DEBUG_LOGGING')
444
+
439
445
  SKYPILOT_DEFAULT_WORKSPACE = 'default'
440
446
 
441
447
  # BEGIN constants used for service catalog.
@@ -48,6 +48,7 @@ provider:
48
48
  # The upper-level SkyPilot code has make sure there will not be resource
49
49
  # leakage.
50
50
  disable_launch_config_check: true
51
+ max_efa_interfaces: {{max_efa_interfaces}}
51
52
 
52
53
  auth:
53
54
  ssh_user: {{ssh_user}}
sky/utils/annotations.py CHANGED
@@ -7,7 +7,7 @@ from typing_extensions import ParamSpec
7
7
 
8
8
  # Whether the current process is a SkyPilot API server process.
9
9
  is_on_api_server = True
10
- FUNCTIONS_NEED_RELOAD_CACHE = []
10
+ _FUNCTIONS_NEED_RELOAD_CACHE = []
11
11
 
12
12
  T = TypeVar('T')
13
13
  P = ParamSpec('P')
@@ -50,7 +50,13 @@ def lru_cache(scope: Literal['global', 'request'], *lru_cache_args,
50
50
  else:
51
51
  cached_func = functools.lru_cache(*lru_cache_args,
52
52
  **lru_cache_kwargs)(func)
53
- FUNCTIONS_NEED_RELOAD_CACHE.append(cached_func)
53
+ _FUNCTIONS_NEED_RELOAD_CACHE.append(cached_func)
54
54
  return cached_func
55
55
 
56
56
  return decorator
57
+
58
+
59
+ def clear_request_level_cache():
60
+ """Clear the request-level cache."""
61
+ for func in _FUNCTIONS_NEED_RELOAD_CACHE:
62
+ func.cache_clear()