skypilot-nightly 1.0.0.dev20250827__py3-none-any.whl → 1.0.0.dev20250829__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of skypilot-nightly might be problematic. Click here for more details.

Files changed (86)
  1. sky/__init__.py +2 -2
  2. sky/admin_policy.py +11 -10
  3. sky/authentication.py +1 -1
  4. sky/backends/backend.py +3 -5
  5. sky/backends/backend_utils.py +140 -52
  6. sky/backends/cloud_vm_ray_backend.py +30 -25
  7. sky/backends/local_docker_backend.py +3 -8
  8. sky/backends/wheel_utils.py +35 -8
  9. sky/client/cli/command.py +41 -9
  10. sky/client/sdk.py +23 -8
  11. sky/client/sdk_async.py +6 -2
  12. sky/clouds/aws.py +118 -1
  13. sky/core.py +1 -4
  14. sky/dashboard/out/404.html +1 -1
  15. sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
  16. sky/dashboard/out/clusters/[cluster].html +1 -1
  17. sky/dashboard/out/clusters.html +1 -1
  18. sky/dashboard/out/config.html +1 -1
  19. sky/dashboard/out/index.html +1 -1
  20. sky/dashboard/out/infra/[context].html +1 -1
  21. sky/dashboard/out/infra.html +1 -1
  22. sky/dashboard/out/jobs/[job].html +1 -1
  23. sky/dashboard/out/jobs/pools/[pool].html +1 -1
  24. sky/dashboard/out/jobs.html +1 -1
  25. sky/dashboard/out/users.html +1 -1
  26. sky/dashboard/out/volumes.html +1 -1
  27. sky/dashboard/out/workspace/new.html +1 -1
  28. sky/dashboard/out/workspaces/[name].html +1 -1
  29. sky/dashboard/out/workspaces.html +1 -1
  30. sky/global_user_state.py +82 -22
  31. sky/jobs/client/sdk.py +5 -2
  32. sky/jobs/recovery_strategy.py +9 -4
  33. sky/jobs/server/server.py +2 -1
  34. sky/logs/agent.py +2 -2
  35. sky/logs/aws.py +6 -3
  36. sky/provision/aws/config.py +78 -3
  37. sky/provision/aws/instance.py +45 -6
  38. sky/provision/do/utils.py +2 -1
  39. sky/provision/kubernetes/instance.py +55 -11
  40. sky/provision/kubernetes/utils.py +11 -2
  41. sky/provision/nebius/utils.py +36 -2
  42. sky/schemas/db/global_user_state/007_cluster_event_request_id.py +34 -0
  43. sky/serve/client/impl.py +5 -4
  44. sky/serve/replica_managers.py +4 -3
  45. sky/serve/serve_utils.py +2 -2
  46. sky/serve/server/impl.py +3 -2
  47. sky/serve/server/server.py +2 -1
  48. sky/server/auth/oauth2_proxy.py +10 -4
  49. sky/server/common.py +4 -4
  50. sky/server/daemons.py +16 -5
  51. sky/server/requests/executor.py +5 -3
  52. sky/server/requests/payloads.py +3 -1
  53. sky/server/requests/preconditions.py +3 -2
  54. sky/server/requests/requests.py +121 -19
  55. sky/server/server.py +85 -60
  56. sky/server/stream_utils.py +7 -5
  57. sky/setup_files/dependencies.py +6 -1
  58. sky/sky_logging.py +28 -0
  59. sky/skylet/constants.py +6 -0
  60. sky/skylet/events.py +2 -3
  61. sky/skypilot_config.py +10 -10
  62. sky/task.py +1 -1
  63. sky/templates/aws-ray.yml.j2 +1 -0
  64. sky/templates/nebius-ray.yml.j2 +4 -8
  65. sky/usage/usage_lib.py +3 -2
  66. sky/utils/annotations.py +8 -2
  67. sky/utils/cluster_utils.py +3 -3
  68. sky/utils/common_utils.py +0 -72
  69. sky/utils/controller_utils.py +4 -3
  70. sky/utils/dag_utils.py +4 -4
  71. sky/utils/db/db_utils.py +11 -0
  72. sky/utils/db/migration_utils.py +1 -1
  73. sky/utils/kubernetes/config_map_utils.py +3 -3
  74. sky/utils/kubernetes_enums.py +1 -0
  75. sky/utils/lock_events.py +94 -0
  76. sky/utils/schemas.py +3 -0
  77. sky/utils/timeline.py +24 -93
  78. sky/utils/yaml_utils.py +77 -10
  79. {skypilot_nightly-1.0.0.dev20250827.dist-info → skypilot_nightly-1.0.0.dev20250829.dist-info}/METADATA +8 -2
  80. {skypilot_nightly-1.0.0.dev20250827.dist-info → skypilot_nightly-1.0.0.dev20250829.dist-info}/RECORD +86 -84
  81. /sky/dashboard/out/_next/static/{-eL7Ky3bxVivzeLHNB9U6 → hYJYFIxp_ZFONR4wTIJqZ}/_buildManifest.js +0 -0
  82. /sky/dashboard/out/_next/static/{-eL7Ky3bxVivzeLHNB9U6 → hYJYFIxp_ZFONR4wTIJqZ}/_ssgManifest.js +0 -0
  83. {skypilot_nightly-1.0.0.dev20250827.dist-info → skypilot_nightly-1.0.0.dev20250829.dist-info}/WHEEL +0 -0
  84. {skypilot_nightly-1.0.0.dev20250827.dist-info → skypilot_nightly-1.0.0.dev20250829.dist-info}/entry_points.txt +0 -0
  85. {skypilot_nightly-1.0.0.dev20250827.dist-info → skypilot_nightly-1.0.0.dev20250829.dist-info}/licenses/LICENSE +0 -0
  86. {skypilot_nightly-1.0.0.dev20250827.dist-info → skypilot_nightly-1.0.0.dev20250829.dist-info}/top_level.txt +0 -0
sky/server/requests/requests.py CHANGED
@@ -13,7 +13,8 @@ import sqlite3
13
13
  import threading
14
14
  import time
15
15
  import traceback
16
- from typing import Any, Callable, Dict, Generator, List, Optional, Tuple
16
+ from typing import (Any, AsyncContextManager, Callable, Dict, Generator, List,
17
+ Optional, Tuple)
17
18
 
18
19
  import colorama
19
20
  import filelock
@@ -402,26 +403,46 @@ _DB = None
402
403
  _init_db_lock = threading.Lock()
403
404
 
404
405
 
406
+ def _init_db_within_lock():
407
+ global _DB
408
+ if _DB is None:
409
+ db_path = os.path.expanduser(
410
+ server_constants.API_SERVER_REQUEST_DB_PATH)
411
+ pathlib.Path(db_path).parents[0].mkdir(parents=True, exist_ok=True)
412
+ _DB = db_utils.SQLiteConn(db_path, create_table)
413
+
414
+
405
415
  def init_db(func):
406
416
  """Initialize the database."""
407
417
 
408
418
  @functools.wraps(func)
409
419
  def wrapper(*args, **kwargs):
410
- global _DB
411
420
  if _DB is not None:
412
421
  return func(*args, **kwargs)
413
422
  with _init_db_lock:
414
- if _DB is None:
415
- db_path = os.path.expanduser(
416
- server_constants.API_SERVER_REQUEST_DB_PATH)
417
- pathlib.Path(db_path).parents[0].mkdir(parents=True,
418
- exist_ok=True)
419
- _DB = db_utils.SQLiteConn(db_path, create_table)
423
+ _init_db_within_lock()
420
424
  return func(*args, **kwargs)
421
425
 
422
426
  return wrapper
423
427
 
424
428
 
429
+ def init_db_async(func):
430
+ """Async version of init_db."""
431
+
432
+ @functools.wraps(func)
433
+ async def wrapper(*args, **kwargs):
434
+ if _DB is not None:
435
+ return await func(*args, **kwargs)
436
+ # If _DB is not initialized, init_db_async will be blocked if there
437
+ # is a thread initializing _DB, this is fine since it occurs on process
438
+ # startup.
439
+ with _init_db_lock:
440
+ _init_db_within_lock()
441
+ return await func(*args, **kwargs)
442
+
443
+ return wrapper
444
+
445
+
425
446
  def reset_db_and_logs():
426
447
  """Create the database."""
427
448
  server_common.clear_local_api_server_database()
@@ -440,28 +461,61 @@ def request_lock_path(request_id: str) -> str:
440
461
  @contextlib.contextmanager
441
462
  @init_db
442
463
  def update_request(request_id: str) -> Generator[Optional[Request], None, None]:
443
- """Get a SkyPilot API request."""
464
+ """Get and update a SkyPilot API request."""
444
465
  request = _get_request_no_lock(request_id)
445
466
  yield request
446
467
  if request is not None:
447
468
  _add_or_update_request_no_lock(request)
448
469
 
449
470
 
471
+ @init_db
472
+ def update_request_async(
473
+ request_id: str) -> AsyncContextManager[Optional[Request]]:
474
+ """Async version of update_request.
475
+
476
+ Returns an async context manager that yields the request record and
477
+ persists any in-place updates upon exit.
478
+ """
479
+
480
+ @contextlib.asynccontextmanager
481
+ async def _cm():
482
+ request = await _get_request_no_lock_async(request_id)
483
+ try:
484
+ yield request
485
+ finally:
486
+ if request is not None:
487
+ await _add_or_update_request_no_lock_async(request)
488
+
489
+ return _cm()
490
+
491
+
492
+ _get_request_sql = (f'SELECT {", ".join(REQUEST_COLUMNS)} FROM {REQUEST_TABLE} '
493
+ 'WHERE request_id LIKE ?')
494
+
495
+
450
496
  def _get_request_no_lock(request_id: str) -> Optional[Request]:
451
497
  """Get a SkyPilot API request."""
452
498
  assert _DB is not None
453
- columns_str = ', '.join(REQUEST_COLUMNS)
454
499
  with _DB.conn:
455
500
  cursor = _DB.conn.cursor()
456
- cursor.execute(
457
- f'SELECT {columns_str} FROM {REQUEST_TABLE} '
458
- 'WHERE request_id LIKE ?', (request_id + '%',))
501
+ cursor.execute(_get_request_sql, (request_id + '%',))
459
502
  row = cursor.fetchone()
460
503
  if row is None:
461
504
  return None
462
505
  return Request.from_row(row)
463
506
 
464
507
 
508
+ async def _get_request_no_lock_async(request_id: str) -> Optional[Request]:
509
+ """Async version of _get_request_no_lock."""
510
+ assert _DB is not None
511
+ conn = await _DB.async_conn()
512
+ async with conn.execute(_get_request_sql, (request_id + '%',)) as cursor:
513
+ row = await cursor.fetchone()
514
+ if row is None:
515
+ return None
516
+ return Request.from_row(row)
517
+
518
+
465
519
  @init_db
466
520
  def get_latest_request_id() -> Optional[str]:
467
521
  """Get the latest request ID."""
@@ -481,6 +535,13 @@ def get_request(request_id: str) -> Optional[Request]:
481
535
  return _get_request_no_lock(request_id)
482
536
 
483
537
 
538
+ @init_db_async
539
+ async def get_request_async(request_id: str) -> Optional[Request]:
540
+ """Async version of get_request."""
541
+ async with filelock.AsyncFileLock(request_lock_path(request_id)):
542
+ return await _get_request_no_lock_async(request_id)
543
+
544
+
484
545
  @init_db
485
546
  def create_if_not_exists(request: Request) -> bool:
486
547
  """Create a SkyPilot API request if it does not exist."""
@@ -491,6 +552,16 @@ def create_if_not_exists(request: Request) -> bool:
491
552
  return True
492
553
 
493
554
 
555
+ @init_db_async
556
+ async def create_if_not_exists_async(request: Request) -> bool:
557
+ """Async version of create_if_not_exists."""
558
+ async with filelock.AsyncFileLock(request_lock_path(request.request_id)):
559
+ if await _get_request_no_lock_async(request.request_id) is not None:
560
+ return False
561
+ await _add_or_update_request_no_lock_async(request)
562
+ return True
563
+
564
+
494
565
  @init_db
495
566
  def get_request_tasks(
496
567
  status: Optional[List[RequestStatus]] = None,
@@ -565,17 +636,48 @@ def get_request_tasks(
565
636
  return requests
566
637
 
567
638
 
639
+ @init_db_async
640
+ async def get_api_request_ids_start_with(incomplete: str) -> List[str]:
641
+ """Get a list of API request ids for shell completion."""
642
+ assert _DB is not None
643
+ conn = await _DB.async_conn()
644
+ # Prioritize alive requests (PENDING, RUNNING) over finished ones,
645
+ # then order by creation time (newest first) within each category.
646
+ async with conn.execute(
647
+ f"""SELECT request_id FROM {REQUEST_TABLE}
648
+ WHERE request_id LIKE ?
649
+ ORDER BY
650
+ CASE
651
+ WHEN status IN ('PENDING', 'RUNNING') THEN 0
652
+ ELSE 1
653
+ END,
654
+ created_at DESC
655
+ LIMIT 1000""", (f'{incomplete}%',)) as cursor:
656
+ rows = await cursor.fetchall()
657
+ if rows is None:
658
+ return []
659
+ return [row[0] for row in rows]
660
+
661
+
662
+ _add_or_update_request_sql = (f'INSERT OR REPLACE INTO {REQUEST_TABLE} '
663
+ f'({", ".join(REQUEST_COLUMNS)}) VALUES '
664
+ f'({", ".join(["?"] * len(REQUEST_COLUMNS))})')
665
+
666
+
568
667
  def _add_or_update_request_no_lock(request: Request):
569
668
  """Add or update a REST request into the database."""
570
- row = request.to_row()
571
- key_str = ', '.join(REQUEST_COLUMNS)
572
- fill_str = ', '.join(['?'] * len(row))
573
669
  assert _DB is not None
574
670
  with _DB.conn:
575
671
  cursor = _DB.conn.cursor()
576
- cursor.execute(
577
- f'INSERT OR REPLACE INTO {REQUEST_TABLE} ({key_str}) '
578
- f'VALUES ({fill_str})', row)
672
+ cursor.execute(_add_or_update_request_sql, request.to_row())
673
+
674
+
675
+ async def _add_or_update_request_no_lock_async(request: Request):
676
+ """Async version of _add_or_update_request_no_lock."""
677
+ assert _DB is not None
678
+ conn = await _DB.async_conn()
679
+ await conn.execute(_add_or_update_request_sql, request.to_row())
680
+ await conn.commit()
579
681
 
580
682
 
581
683
  def set_request_failed(request_id: str, e: BaseException) -> None:
sky/server/server.py CHANGED
@@ -21,6 +21,7 @@ import uuid
21
21
  import zipfile
22
22
 
23
23
  import aiofiles
24
+ import anyio
24
25
  import fastapi
25
26
  from fastapi.middleware import cors
26
27
  import starlette.middleware.base
@@ -847,7 +848,7 @@ async def upload_zip_file(request: fastapi.Request, user_hash: str,
847
848
  client_file_mounts_dir = (
848
849
  common.API_SERVER_CLIENT_DIR.expanduser().resolve() / user_id /
849
850
  'file_mounts')
850
- client_file_mounts_dir.mkdir(parents=True, exist_ok=True)
851
+ await anyio.Path(client_file_mounts_dir).mkdir(parents=True, exist_ok=True)
851
852
 
852
853
  # Check upload_id to be a valid SkyPilot run_timestamp appended with 8 hex
853
854
  # characters, e.g. 'sky-2025-01-17-09-10-13-933602-35d31c22'.
@@ -870,7 +871,7 @@ async def upload_zip_file(request: fastapi.Request, user_hash: str,
870
871
  zip_file_path = client_file_mounts_dir / f'{upload_id}.zip'
871
872
  else:
872
873
  chunk_dir = client_file_mounts_dir / upload_id
873
- chunk_dir.mkdir(parents=True, exist_ok=True)
874
+ await anyio.Path(chunk_dir).mkdir(parents=True, exist_ok=True)
874
875
  zip_file_path = chunk_dir / f'part{chunk_index}.incomplete'
875
876
 
876
877
  try:
@@ -916,9 +917,9 @@ async def upload_zip_file(request: fastapi.Request, user_hash: str,
916
917
  await zip_file.write(data)
917
918
 
918
919
  logger.info(f'Uploaded zip file: {zip_file_path}')
919
- unzip_file(zip_file_path, client_file_mounts_dir)
920
+ await unzip_file(zip_file_path, client_file_mounts_dir)
920
921
  if total_chunks > 1:
921
- shutil.rmtree(chunk_dir)
922
+ await context_utils.to_thread(shutil.rmtree, chunk_dir)
922
923
  return payloads.UploadZipFileResponse(
923
924
  status=responses.UploadStatus.COMPLETED.value)
924
925
 
@@ -933,61 +934,69 @@ def _is_relative_to(path: pathlib.Path, parent: pathlib.Path) -> bool:
933
934
  return False
934
935
 
935
936
 
936
- def unzip_file(zip_file_path: pathlib.Path,
937
- client_file_mounts_dir: pathlib.Path) -> None:
938
- """Unzips a zip file."""
939
- try:
940
- with zipfile.ZipFile(zip_file_path, 'r') as zipf:
941
- for member in zipf.infolist():
942
- # Determine the new path
943
- original_path = os.path.normpath(member.filename)
944
- new_path = client_file_mounts_dir / original_path.lstrip('/')
945
-
946
- if (member.external_attr >> 28) == 0xA:
947
- # Symlink. Read the target path and create a symlink.
937
+ async def unzip_file(zip_file_path: pathlib.Path,
938
+ client_file_mounts_dir: pathlib.Path) -> None:
939
+ """Unzips a zip file without blocking the event loop."""
940
+
941
+ def _do_unzip() -> None:
942
+ try:
943
+ with zipfile.ZipFile(zip_file_path, 'r') as zipf:
944
+ for member in zipf.infolist():
945
+ # Determine the new path
946
+ original_path = os.path.normpath(member.filename)
947
+ new_path = client_file_mounts_dir / original_path.lstrip(
948
+ '/')
949
+
950
+ if (member.external_attr >> 28) == 0xA:
951
+ # Symlink. Read the target path and create a symlink.
952
+ new_path.parent.mkdir(parents=True, exist_ok=True)
953
+ target = zipf.read(member).decode()
954
+ assert not os.path.isabs(target), target
955
+ # Since target is a relative path, we need to check that
956
+ # it is under `client_file_mounts_dir` for security.
957
+ full_target_path = (new_path.parent / target).resolve()
958
+ if not _is_relative_to(full_target_path,
959
+ client_file_mounts_dir):
960
+ raise ValueError(
961
+ f'Symlink target {target} leads to a '
962
+ 'file not in userspace. Aborted.')
963
+
964
+ if new_path.exists() or new_path.is_symlink():
965
+ new_path.unlink(missing_ok=True)
966
+ new_path.symlink_to(
967
+ target,
968
+ target_is_directory=member.filename.endswith('/'))
969
+ continue
970
+
971
+ # Handle directories
972
+ if member.filename.endswith('/'):
973
+ new_path.mkdir(parents=True, exist_ok=True)
974
+ continue
975
+
976
+ # Handle files
948
977
  new_path.parent.mkdir(parents=True, exist_ok=True)
949
- target = zipf.read(member).decode()
950
- assert not os.path.isabs(target), target
951
- # Since target is a relative path, we need to check that it
952
- # is under `client_file_mounts_dir` for security.
953
- full_target_path = (new_path.parent / target).resolve()
954
- if not _is_relative_to(full_target_path,
955
- client_file_mounts_dir):
956
- raise ValueError(f'Symlink target {target} leads to a '
957
- 'file not in userspace. Aborted.')
958
-
959
- if new_path.exists() or new_path.is_symlink():
960
- new_path.unlink(missing_ok=True)
961
- new_path.symlink_to(
962
- target,
963
- target_is_directory=member.filename.endswith('/'))
964
- continue
965
-
966
- # Handle directories
967
- if member.filename.endswith('/'):
968
- new_path.mkdir(parents=True, exist_ok=True)
969
- continue
970
-
971
- # Handle files
972
- new_path.parent.mkdir(parents=True, exist_ok=True)
973
- with zipf.open(member) as member_file, new_path.open('wb') as f:
974
- # Use shutil.copyfileobj to copy files in chunks, so it does
975
- # not load the entire file into memory.
976
- shutil.copyfileobj(member_file, f)
977
- except zipfile.BadZipFile as e:
978
- logger.error(f'Bad zip file: {zip_file_path}')
979
- raise fastapi.HTTPException(
980
- status_code=400,
981
- detail=f'Invalid zip file: {common_utils.format_exception(e)}')
982
- except Exception as e:
983
- logger.error(f'Error unzipping file: {zip_file_path}')
984
- raise fastapi.HTTPException(
985
- status_code=500,
986
- detail=(f'Error unzipping file: '
987
- f'{common_utils.format_exception(e)}'))
978
+ with zipf.open(member) as member_file, new_path.open(
979
+ 'wb') as f:
980
+ # Use shutil.copyfileobj to copy files in chunks,
981
+ # so it does not load the entire file into memory.
982
+ shutil.copyfileobj(member_file, f)
983
+ except zipfile.BadZipFile as e:
984
+ logger.error(f'Bad zip file: {zip_file_path}')
985
+ raise fastapi.HTTPException(
986
+ status_code=400,
987
+ detail=f'Invalid zip file: {common_utils.format_exception(e)}')
988
+ except Exception as e:
989
+ logger.error(f'Error unzipping file: {zip_file_path}')
990
+ raise fastapi.HTTPException(
991
+ status_code=500,
992
+ detail=(f'Error unzipping file: '
993
+ f'{common_utils.format_exception(e)}'))
994
+ finally:
995
+ # Cleanup the temporary file regardless of
996
+ # success/failure handling above
997
+ zip_file_path.unlink(missing_ok=True)
988
998
 
989
- # Cleanup the temporary file
990
- zip_file_path.unlink()
999
+ await context_utils.to_thread(_do_unzip)
991
1000
 
992
1001
 
993
1002
  @app.post('/launch')
@@ -1388,7 +1397,7 @@ async def local_down(request: fastapi.Request) -> None:
1388
1397
  async def api_get(request_id: str) -> payloads.RequestPayload:
1389
1398
  """Gets a request with a given request ID prefix."""
1390
1399
  while True:
1391
- request_task = requests_lib.get_request(request_id)
1400
+ request_task = await requests_lib.get_request_async(request_id)
1392
1401
  if request_task is None:
1393
1402
  print(f'No task with request ID {request_id}', flush=True)
1394
1403
  raise fastapi.HTTPException(
@@ -1403,6 +1412,9 @@ async def api_get(request_id: str) -> payloads.RequestPayload:
1403
1412
  raise fastapi.HTTPException(
1404
1413
  status_code=500, detail=request_task.encode().model_dump())
1405
1414
  return request_task.encode()
1415
+ elif (request_task.status == requests_lib.RequestStatus.RUNNING and
1416
+ daemons.is_daemon_request_id(request_id)):
1417
+ return request_task.encode()
1406
1418
  # yield control to allow other coroutines to run, sleep shortly
1407
1419
  # to avoid storming the DB and CPU in the meantime
1408
1420
  await asyncio.sleep(0.1)
@@ -1474,7 +1486,7 @@ async def stream(
1474
1486
 
1475
1487
  # Original plain text streaming logic
1476
1488
  if request_id is not None:
1477
- request_task = requests_lib.get_request(request_id)
1489
+ request_task = await requests_lib.get_request_async(request_id)
1478
1490
  if request_task is None:
1479
1491
  print(f'No task with request ID {request_id}')
1480
1492
  raise fastapi.HTTPException(
@@ -1491,6 +1503,14 @@ async def stream(
1491
1503
  if log_path == constants.API_SERVER_LOGS:
1492
1504
  resolved_log_path = pathlib.Path(
1493
1505
  constants.API_SERVER_LOGS).expanduser()
1506
+ if not resolved_log_path.exists():
1507
+ raise fastapi.HTTPException(
1508
+ status_code=404,
1509
+ detail='Server log file does not exist. The API server may '
1510
+ 'have been started with `--foreground` - check the '
1511
+ 'stdout of API server process, such as: '
1512
+ '`kubectl logs -n api-server-namespace '
1513
+ 'api-server-pod-name`')
1494
1514
  else:
1495
1515
  # This should be a log path under ~/sky_logs.
1496
1516
  resolved_logs_directory = pathlib.Path(
@@ -1561,7 +1581,7 @@ async def api_status(
1561
1581
  else:
1562
1582
  encoded_request_tasks = []
1563
1583
  for request_id in request_ids:
1564
- request_task = requests_lib.get_request(request_id)
1584
+ request_task = await requests_lib.get_request_async(request_id)
1565
1585
  if request_task is None:
1566
1586
  continue
1567
1587
  encoded_request_tasks.append(request_task.readable_encode())
@@ -1769,6 +1789,11 @@ async def complete_volume_name(incomplete: str,) -> List[str]:
1769
1789
  return global_user_state.get_volume_names_start_with(incomplete)
1770
1790
 
1771
1791
 
1792
+ @app.get('/api/completion/api_request')
1793
+ async def complete_api_request(incomplete: str,) -> List[str]:
1794
+ return await requests_lib.get_api_request_ids_start_with(incomplete)
1795
+
1796
+
1772
1797
  @app.get('/dashboard/{full_path:path}')
1773
1798
  async def serve_dashboard(full_path: str):
1774
1799
  """Serves the Next.js dashboard application.
sky/server/stream_utils.py CHANGED
@@ -56,7 +56,7 @@ async def log_streamer(request_id: Optional[str],
56
56
  if request_id is not None:
57
57
  status_msg = rich_utils.EncodedStatusMessage(
58
58
  f'[dim]Checking request: {request_id}[/dim]')
59
- request_task = requests_lib.get_request(request_id)
59
+ request_task = await requests_lib.get_request_async(request_id)
60
60
 
61
61
  if request_task is None:
62
62
  raise fastapi.HTTPException(
@@ -86,10 +86,12 @@ async def log_streamer(request_id: Optional[str],
86
86
  # Use smaller padding (1024 bytes) to force browser rendering
87
87
  yield f'{waiting_msg}' + ' ' * 4096 + '\n'
88
88
  # Sleep shortly to avoid storming the DB and CPU and allow other
89
- # coroutines to run. This busy waiting loop is performance critical
90
- # for short-running requests, so we do not want to yield too long.
89
+ # coroutines to run.
90
+ # TODO(aylei): we should use a better mechanism to avoid busy
91
+ # polling the DB, which can be a bottleneck for high-concurrency
92
+ # requests.
91
93
  await asyncio.sleep(0.1)
92
- request_task = requests_lib.get_request(request_id)
94
+ request_task = await requests_lib.get_request_async(request_id)
93
95
  if not follow:
94
96
  break
95
97
  if show_request_waiting_spinner:
@@ -151,7 +153,7 @@ async def _tail_log_file(f: aiofiles.threadpool.binary.AsyncBufferedReader,
151
153
  line: Optional[bytes] = await f.readline()
152
154
  if not line:
153
155
  if request_id is not None:
154
- request_task = requests_lib.get_request(request_id)
156
+ request_task = await requests_lib.get_request_async(request_id)
155
157
  if request_task.status > requests_lib.RequestStatus.RUNNING:
156
158
  if (request_task.status ==
157
159
  requests_lib.RequestStatus.CANCELLED):
sky/setup_files/dependencies.py CHANGED
@@ -35,7 +35,8 @@ install_requires = [
35
35
  # Light weight requirement, can be replaced with "typing" once
36
36
  # we deprecate Python 3.7 (this will take a while).
37
37
  'typing_extensions',
38
- 'filelock >= 3.6.0',
38
+ # filelock 3.15.0 or higher is required for async file locking.
39
+ 'filelock >= 3.15.0',
39
40
  'packaging',
40
41
  'psutil',
41
42
  'pulp',
@@ -71,6 +72,8 @@ install_requires = [
71
72
  'types-paramiko',
72
73
  'alembic',
73
74
  'aiohttp',
75
+ 'aiosqlite',
76
+ 'anyio',
74
77
  ]
75
78
 
76
79
  # See requirements-dev.txt for the version of grpc and protobuf
@@ -92,8 +95,10 @@ server_dependencies = [
92
95
  'passlib',
93
96
  'pyjwt',
94
97
  'aiohttp',
98
+ 'anyio',
95
99
  GRPC,
96
100
  PROTOBUF,
101
+ 'aiosqlite',
97
102
  ]
98
103
 
99
104
  local_ray = [
sky/sky_logging.py CHANGED
@@ -19,6 +19,9 @@ _FORMAT = '%(levelname).1s %(asctime)s %(filename)s:%(lineno)d] %(message)s'
19
19
  _DATE_FORMAT = '%m-%d %H:%M:%S'
20
20
  _SENSITIVE_LOGGER = ['sky.provisioner', 'sky.optimizer']
21
21
 
22
+ _DEBUG_LOG_DIR = os.path.expanduser(
23
+ os.path.join(constants.SKY_LOGS_DIRECTORY, 'request_debug'))
24
+
22
25
  DEBUG = logging.DEBUG
23
26
  INFO = logging.INFO
24
27
  WARNING = logging.WARNING
@@ -254,3 +257,28 @@ def generate_tmp_logging_file_path(file_name: str) -> str:
254
257
  log_path = os.path.expanduser(os.path.join(log_dir, file_name))
255
258
 
256
259
  return log_path
260
+
261
+
262
+ @contextlib.contextmanager
263
+ def add_debug_log_handler(request_id: str):
264
+ if os.getenv(constants.ENV_VAR_ENABLE_REQUEST_DEBUG_LOGGING) != 'true':
265
+ yield
266
+ return
267
+
268
+ os.makedirs(_DEBUG_LOG_DIR, exist_ok=True)
269
+ log_path = os.path.join(_DEBUG_LOG_DIR, f'{request_id}.log')
270
+ try:
271
+ debug_log_handler = logging.FileHandler(log_path)
272
+ debug_log_handler.setFormatter(FORMATTER)
273
+ debug_log_handler.setLevel(logging.DEBUG)
274
+ _root_logger.addHandler(debug_log_handler)
275
+ # sky.provision sets up its own logger/handler with propagate=False,
276
+ # so add it there too.
277
+ provision_logger = logging.getLogger('sky.provision')
278
+ provision_logger.addHandler(debug_log_handler)
279
+ provision_logger.setLevel(logging.DEBUG)
280
+ yield
281
+ finally:
282
+ _root_logger.removeHandler(debug_log_handler)
283
+ provision_logger.removeHandler(debug_log_handler)
284
+ debug_log_handler.close()
sky/skylet/constants.py CHANGED
@@ -70,6 +70,7 @@ DEACTIVATE_SKY_REMOTE_PYTHON_ENV = (
70
70
 
71
71
  # Prefix for SkyPilot environment variables
72
72
  SKYPILOT_ENV_VAR_PREFIX = 'SKYPILOT_'
73
+ SKYPILOT_SERVER_ENV_VAR_PREFIX = 'SKYPILOT_SERVER_'
73
74
 
74
75
  # The name for the environment variable that stores the unique ID of the
75
76
  # current task. This will stay the same across multiple recoveries of the
@@ -417,6 +418,7 @@ LOCAL_SKYPILOT_CONFIG_PATH_PLACEHOLDER = 'skypilot:local_skypilot_config_path'
417
418
  # Path to the generated cluster config yamls and ssh configs.
418
419
  SKY_USER_FILE_PATH = '~/.sky/generated'
419
420
 
421
+ # TODO(cooperc): Update all env vars to begin with SKYPILOT_ or SKYPILOT_SERVER_
420
422
  # Environment variable that is set to 'true' if this is a skypilot server.
421
423
  ENV_VAR_IS_SKYPILOT_SERVER = 'IS_SKYPILOT_SERVER'
422
424
 
@@ -436,6 +438,10 @@ ENV_VAR_ENABLE_BASIC_AUTH = 'ENABLE_BASIC_AUTH'
436
438
  SKYPILOT_INITIAL_BASIC_AUTH = 'SKYPILOT_INITIAL_BASIC_AUTH'
437
439
  ENV_VAR_ENABLE_SERVICE_ACCOUNTS = 'ENABLE_SERVICE_ACCOUNTS'
438
440
 
441
+ # Enable debug logging for requests.
442
+ ENV_VAR_ENABLE_REQUEST_DEBUG_LOGGING = (
443
+ f'{SKYPILOT_SERVER_ENV_VAR_PREFIX}ENABLE_REQUEST_DEBUG_LOGGING')
444
+
439
445
  SKYPILOT_DEFAULT_WORKSPACE = 'default'
440
446
 
441
447
  # BEGIN constants used for service catalog.
sky/skylet/events.py CHANGED
@@ -20,7 +20,6 @@ from sky.skylet import constants
20
20
  from sky.skylet import job_lib
21
21
  from sky.usage import usage_lib
22
22
  from sky.utils import cluster_utils
23
- from sky.utils import common_utils
24
23
  from sky.utils import registry
25
24
  from sky.utils import ux_utils
26
25
  from sky.utils import yaml_utils
@@ -181,7 +180,7 @@ class AutostopEvent(SkyletEvent):
181
180
 
182
181
  config_path = os.path.abspath(
183
182
  os.path.expanduser(cluster_utils.SKY_CLUSTER_YAML_REMOTE_PATH))
184
- config = common_utils.read_yaml(config_path)
183
+ config = yaml_utils.read_yaml(config_path)
185
184
  provider_name = cluster_utils.get_provider_name(config)
186
185
  cloud = registry.CLOUD_REGISTRY.from_str(provider_name)
187
186
  assert cloud is not None, f'Unknown cloud: {provider_name}'
@@ -326,5 +325,5 @@ class AutostopEvent(SkyletEvent):
326
325
  config['auth'].pop('ssh_proxy_command', None)
327
326
  # Empty the file_mounts.
328
327
  config['file_mounts'] = {}
329
- common_utils.dump_yaml(yaml_path, config)
328
+ yaml_utils.dump_yaml(yaml_path, config)
330
329
  logger.debug('Replaced upscaling speed to 0.')