skypilot-nightly 1.0.0.dev20251019__py3-none-any.whl → 1.0.0.dev20251022__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of skypilot-nightly might be problematic. Click here for more details.

Files changed (95) hide show
  1. sky/__init__.py +2 -2
  2. sky/adaptors/kubernetes.py +64 -0
  3. sky/backends/backend_utils.py +11 -11
  4. sky/backends/cloud_vm_ray_backend.py +15 -4
  5. sky/client/cli/command.py +39 -10
  6. sky/client/cli/flags.py +4 -2
  7. sky/client/sdk.py +26 -3
  8. sky/dashboard/out/404.html +1 -1
  9. sky/dashboard/out/_next/static/IgACOQPupLbX9z-RYVEDx/_buildManifest.js +1 -0
  10. sky/dashboard/out/_next/static/chunks/1141-ec6f902ffb865853.js +11 -0
  11. sky/dashboard/out/_next/static/chunks/2755.9b1e69c921b5a870.js +26 -0
  12. sky/dashboard/out/_next/static/chunks/3015-d014dc5b9412fade.js +1 -0
  13. sky/dashboard/out/_next/static/chunks/{3294.1fafbf42b3bcebff.js → 3294.998db87cd52a1238.js} +1 -1
  14. sky/dashboard/out/_next/static/chunks/{3785.a19328ba41517b8b.js → 3785.483a3dda2d52f26e.js} +1 -1
  15. sky/dashboard/out/_next/static/chunks/{1121-d0782b9251f0fcd3.js → 4282-d2f3ef2fbf78e347.js} +1 -1
  16. sky/dashboard/out/_next/static/chunks/6856-5c94d394259cdb6e.js +1 -0
  17. sky/dashboard/out/_next/static/chunks/8969-0389e2cb52412db3.js +1 -0
  18. sky/dashboard/out/_next/static/chunks/9360.14326e329484b57e.js +31 -0
  19. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/{[job]-8f058b0346db2aff.js → [job]-602eeead010ec1d6.js} +1 -1
  20. sky/dashboard/out/_next/static/chunks/pages/clusters/{[cluster]-477555ab7c0b13d8.js → [cluster]-18b334dedbd9f6f2.js} +1 -1
  21. sky/dashboard/out/_next/static/chunks/pages/{clusters-2f61f65487f6d8ff.js → clusters-57221ec2e4e01076.js} +1 -1
  22. sky/dashboard/out/_next/static/chunks/pages/infra/{[context]-553b8b5cb65e100b.js → [context]-44ce535a0a0ad4ec.js} +1 -1
  23. sky/dashboard/out/_next/static/chunks/pages/{infra-910a22500c50596f.js → infra-872e6a00165534f4.js} +1 -1
  24. sky/dashboard/out/_next/static/chunks/pages/{jobs-a35a9dc3c5ccd657.js → jobs-0dc34cf9a8710a9f.js} +1 -1
  25. sky/dashboard/out/_next/static/chunks/pages/{users-98d2ed979084162a.js → users-3a543725492fb896.js} +1 -1
  26. sky/dashboard/out/_next/static/chunks/pages/{volumes-835d14ba94808f79.js → volumes-d2af9d22e87cc4ba.js} +1 -1
  27. sky/dashboard/out/_next/static/chunks/pages/workspaces/{[name]-e8688c35c06f0ac5.js → [name]-9ad108cd67d16d96.js} +1 -1
  28. sky/dashboard/out/_next/static/chunks/pages/{workspaces-69c80d677d3c2949.js → workspaces-6fc994fa1ee6c6bf.js} +1 -1
  29. sky/dashboard/out/_next/static/chunks/webpack-919e3c01ab6b2633.js +1 -0
  30. sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
  31. sky/dashboard/out/clusters/[cluster].html +1 -1
  32. sky/dashboard/out/clusters.html +1 -1
  33. sky/dashboard/out/config.html +1 -1
  34. sky/dashboard/out/index.html +1 -1
  35. sky/dashboard/out/infra/[context].html +1 -1
  36. sky/dashboard/out/infra.html +1 -1
  37. sky/dashboard/out/jobs/[job].html +1 -1
  38. sky/dashboard/out/jobs/pools/[pool].html +1 -1
  39. sky/dashboard/out/jobs.html +1 -1
  40. sky/dashboard/out/users.html +1 -1
  41. sky/dashboard/out/volumes.html +1 -1
  42. sky/dashboard/out/workspace/new.html +1 -1
  43. sky/dashboard/out/workspaces/[name].html +1 -1
  44. sky/dashboard/out/workspaces.html +1 -1
  45. sky/data/storage.py +2 -2
  46. sky/global_user_state.py +137 -37
  47. sky/jobs/constants.py +1 -1
  48. sky/jobs/server/core.py +4 -2
  49. sky/jobs/server/server.py +21 -12
  50. sky/jobs/state.py +307 -55
  51. sky/jobs/utils.py +248 -144
  52. sky/provision/kubernetes/network.py +9 -6
  53. sky/provision/provisioner.py +8 -0
  54. sky/schemas/api/responses.py +2 -0
  55. sky/schemas/db/skypilot_config/001_initial_schema.py +30 -0
  56. sky/serve/server/server.py +8 -7
  57. sky/server/common.py +10 -15
  58. sky/server/constants.py +1 -1
  59. sky/server/daemons.py +4 -2
  60. sky/server/requests/executor.py +30 -28
  61. sky/server/requests/payloads.py +5 -1
  62. sky/server/requests/preconditions.py +9 -4
  63. sky/server/requests/requests.py +130 -53
  64. sky/server/requests/serializers/encoders.py +3 -3
  65. sky/server/server.py +91 -58
  66. sky/server/stream_utils.py +127 -38
  67. sky/server/uvicorn.py +18 -17
  68. sky/setup_files/alembic.ini +4 -0
  69. sky/skylet/services.py +5 -5
  70. sky/skypilot_config.py +87 -75
  71. sky/ssh_node_pools/server.py +4 -4
  72. sky/users/permission.py +4 -0
  73. sky/utils/asyncio_utils.py +63 -3
  74. sky/utils/db/db_utils.py +11 -3
  75. sky/utils/db/migration_utils.py +7 -3
  76. sky/volumes/server/server.py +3 -3
  77. sky/workspaces/server.py +6 -6
  78. {skypilot_nightly-1.0.0.dev20251019.dist-info → skypilot_nightly-1.0.0.dev20251022.dist-info}/METADATA +37 -37
  79. {skypilot_nightly-1.0.0.dev20251019.dist-info → skypilot_nightly-1.0.0.dev20251022.dist-info}/RECORD +87 -86
  80. sky/dashboard/out/_next/static/8e35zdobdd0bK_Nkba03m/_buildManifest.js +0 -1
  81. sky/dashboard/out/_next/static/chunks/1141-3b40c39626f99c89.js +0 -11
  82. sky/dashboard/out/_next/static/chunks/2755.97300e1362fe7c98.js +0 -26
  83. sky/dashboard/out/_next/static/chunks/3015-7e0e8f06bb2f881c.js +0 -1
  84. sky/dashboard/out/_next/static/chunks/6856-5fdc9b851a18acdb.js +0 -1
  85. sky/dashboard/out/_next/static/chunks/8969-66237729cdf9749e.js +0 -1
  86. sky/dashboard/out/_next/static/chunks/9360.71e83b2ddc844ec2.js +0 -31
  87. sky/dashboard/out/_next/static/chunks/webpack-3c431f6c9086e487.js +0 -1
  88. /sky/dashboard/out/_next/static/{8e35zdobdd0bK_Nkba03m → IgACOQPupLbX9z-RYVEDx}/_ssgManifest.js +0 -0
  89. /sky/dashboard/out/_next/static/chunks/{1871-49141c317f3a9020.js → 1871-df9f87fcb7f24292.js} +0 -0
  90. /sky/dashboard/out/_next/static/chunks/pages/jobs/{[job]-e5c9ce6a24fc0de4.js → [job]-8677af16befde039.js} +0 -0
  91. /sky/dashboard/out/_next/static/chunks/pages/jobs/pools/{[pool]-bc979970c247d8f3.js → [pool]-e020fd69dbe76cea.js} +0 -0
  92. {skypilot_nightly-1.0.0.dev20251019.dist-info → skypilot_nightly-1.0.0.dev20251022.dist-info}/WHEEL +0 -0
  93. {skypilot_nightly-1.0.0.dev20251019.dist-info → skypilot_nightly-1.0.0.dev20251022.dist-info}/entry_points.txt +0 -0
  94. {skypilot_nightly-1.0.0.dev20251019.dist-info → skypilot_nightly-1.0.0.dev20251022.dist-info}/licenses/LICENSE +0 -0
  95. {skypilot_nightly-1.0.0.dev20251019.dist-info → skypilot_nightly-1.0.0.dev20251022.dist-info}/top_level.txt +0 -0
sky/server/common.py CHANGED
@@ -17,7 +17,6 @@ import time
17
17
  import typing
18
18
  from typing import (Any, Callable, cast, Dict, Generic, Literal, Optional,
19
19
  Tuple, TypeVar, Union)
20
- from urllib import parse
21
20
  import uuid
22
21
 
23
22
  import cachetools
@@ -342,18 +341,7 @@ def get_server_url(host: Optional[str] = None) -> str:
342
341
  @annotations.lru_cache(scope='global')
343
342
  def get_dashboard_url(server_url: str,
344
343
  starting_page: Optional[str] = None) -> str:
345
- # The server_url may include username or password with the
346
- # format of https://username:password@example.com:8080/path
347
- # We need to remove the username and password and only
348
- # return `https://example.com:8080/path`
349
- parsed = parse.urlparse(server_url)
350
- # Reconstruct the URL without credentials but keeping the scheme
351
- dashboard_url = f'{parsed.scheme}://{parsed.hostname}'
352
- if parsed.port:
353
- dashboard_url = f'{dashboard_url}:{parsed.port}'
354
- if parsed.path:
355
- dashboard_url = f'{dashboard_url}{parsed.path}'
356
- dashboard_url = dashboard_url.rstrip('/')
344
+ dashboard_url = server_url.rstrip('/')
357
345
  dashboard_url = f'{dashboard_url}/dashboard'
358
346
  if starting_page:
359
347
  dashboard_url = f'{dashboard_url}/{starting_page}'
@@ -490,6 +478,7 @@ def get_api_server_status(endpoint: Optional[str] = None) -> ApiServerInfo:
490
478
  def handle_request_error(response: 'requests.Response') -> None:
491
479
  # Keep the original HTTPError if the response code >= 400
492
480
  response.raise_for_status()
481
+
493
482
  # Other status codes are not expected either, e.g. we do not expect to
494
483
  # handle redirection here.
495
484
  if response.status_code != 200:
@@ -916,12 +905,18 @@ def reload_for_new_request(client_entrypoint: Optional[str],
916
905
  client_command: Optional[str],
917
906
  using_remote_api_server: bool, user: 'models.User',
918
907
  request_id: str) -> None:
919
- """Reload modules, global variables, and usage message for a new request."""
908
+ """Reload modules, global variables, and usage message for a new request.
909
+
910
+ Must be called within the request's context.
911
+ """
920
912
  # This should be called first to make sure the logger is up-to-date.
921
913
  sky_logging.reload_logger()
922
914
 
923
915
  # Reload the skypilot config to make sure the latest config is used.
924
- skypilot_config.safe_reload_config()
916
+ # We don't need to grab the lock here because this function is only
917
+ # run once we are inside the request's context, so there shouldn't
918
+ # be any race conditions when reloading the config.
919
+ skypilot_config.reload_config()
925
920
 
926
921
  # Reset the client entrypoint and command for the usage message.
927
922
  common_utils.set_request_context(
sky/server/constants.py CHANGED
@@ -10,7 +10,7 @@ from sky.skylet import constants
10
10
  # based on version info is needed.
11
11
  # For more details and code guidelines, refer to:
12
12
  # https://docs.skypilot.co/en/latest/developers/CONTRIBUTING.html#backward-compatibility-guidelines
13
- API_VERSION = 20
13
+ API_VERSION = 21
14
14
 
15
15
  # The minimum peer API version that the code should still work with.
16
16
  # Notes (dev):
sky/server/daemons.py CHANGED
@@ -38,9 +38,11 @@ class InternalRequestDaemon:
38
38
  try:
39
39
  # Refresh config within the while loop.
40
40
  # Since this is a long running daemon,
41
- # reload_config_for_new_request()
41
+ # reload_for_new_request()
42
42
  # is not called in between the event runs.
43
- skypilot_config.safe_reload_config()
43
+ # We don't need to grab the lock here because each of the daemons
44
+ # runs in its own process and thus has its own request context.
45
+ skypilot_config.reload_config()
44
46
  # Get the configured log level for the daemon inside the event loop
45
47
  # in case the log level changes after the API server is started.
46
48
  level_str = skypilot_config.get_nested(
@@ -214,10 +214,11 @@ class RequestWorker:
214
214
  time.sleep(0.1)
215
215
  return
216
216
  request_id, ignore_return_value, _ = request_element
217
- request = api_requests.get_request(request_id)
217
+ request = api_requests.get_request(request_id, fields=['status'])
218
218
  assert request is not None, f'Request with ID {request_id} is None'
219
219
  if request.status == api_requests.RequestStatus.CANCELLED:
220
220
  return
221
+ del request
221
222
  logger.info(f'[{self}] Submitting request: {request_id}')
222
223
  # Start additional process to run the request, so that it can be
223
224
  # cancelled when requested by a user.
@@ -328,10 +329,7 @@ def override_request_env_and_config(
328
329
  # through the execution.
329
330
  user = models.User(id=request_body.env_vars[constants.USER_ID_ENV_VAR],
330
331
  name=request_body.env_vars[constants.USER_ENV_VAR])
331
- global_user_state.add_or_update_user(user)
332
- # Refetch the user to get the latest user info, including the created_at
333
- # field.
334
- user = global_user_state.get_user(user.id)
332
+ _, user = global_user_state.add_or_update_user(user, return_user=True)
335
333
 
336
334
  # Force color to be enabled.
337
335
  os.environ['CLICOLOR_FORCE'] = '1'
@@ -621,8 +619,8 @@ async def _execute_request_coroutine(request: api_requests.Request):
621
619
  logger.info(f'Executing request {request.request_id} in coroutine')
622
620
  func = request.entrypoint
623
621
  request_body = request.request_body
624
- with api_requests.update_request(request.request_id) as request_task:
625
- request_task.status = api_requests.RequestStatus.RUNNING
622
+ await api_requests.update_status_async(request.request_id,
623
+ api_requests.RequestStatus.RUNNING)
626
624
  # Redirect stdout and stderr to the request log path.
627
625
  original_output = ctx.redirect_log(request.log_path)
628
626
  try:
@@ -632,7 +630,7 @@ async def _execute_request_coroutine(request: api_requests.Request):
632
630
  **request_body.to_kwargs())
633
631
  except Exception as e: # pylint: disable=broad-except
634
632
  ctx.redirect_log(original_output)
635
- api_requests.set_request_failed(request.request_id, e)
633
+ await api_requests.set_request_failed_async(request.request_id, e)
636
634
  logger.error(f'Failed to run request {request.request_id} due to '
637
635
  f'{common_utils.format_exception(e)}')
638
636
  return
@@ -649,14 +647,15 @@ async def _execute_request_coroutine(request: api_requests.Request):
649
647
  if fut.done():
650
648
  try:
651
649
  result = await fut
652
- api_requests.set_request_succeeded(request_id, result)
650
+ await api_requests.set_request_succeeded_async(
651
+ request_id, result)
653
652
  except asyncio.CancelledError:
654
653
  # The task is cancelled by ctx.cancel(), where the status
655
654
  # should already be set to CANCELLED.
656
655
  pass
657
656
  except Exception as e: # pylint: disable=broad-except
658
657
  ctx.redirect_log(original_output)
659
- api_requests.set_request_failed(request_id, e)
658
+ await api_requests.set_request_failed_async(request_id, e)
660
659
  logger.error(f'Request {request_id} failed due to '
661
660
  f'{common_utils.format_exception(e)}')
662
661
  return True
@@ -671,13 +670,13 @@ async def _execute_request_coroutine(request: api_requests.Request):
671
670
  except asyncio.CancelledError:
672
671
  # Current coroutine is cancelled due to client disconnect, set the
673
672
  # request status for consistency.
674
- api_requests.set_request_cancelled(request.request_id)
673
+ await api_requests.set_request_cancelled_async(request.request_id)
675
674
  pass
676
675
  # pylint: disable=broad-except
677
676
  except (Exception, KeyboardInterrupt, SystemExit) as e:
678
677
  # Handle any other error
679
678
  ctx.redirect_log(original_output)
680
- api_requests.set_request_failed(request.request_id, e)
679
+ await api_requests.set_request_failed_async(request.request_id, e)
681
680
  logger.error(f'Request {request.request_id} interrupted due to '
682
681
  f'unhandled exception: {common_utils.format_exception(e)}')
683
682
  raise
@@ -687,7 +686,7 @@ async def _execute_request_coroutine(request: api_requests.Request):
687
686
  ctx.cancel()
688
687
 
689
688
 
690
- def prepare_request(
689
+ async def prepare_request_async(
691
690
  request_id: str,
692
691
  request_name: str,
693
692
  request_body: payloads.RequestBody,
@@ -713,7 +712,7 @@ def prepare_request(
713
712
  user_id=user_id,
714
713
  cluster_name=request_cluster_name)
715
714
 
716
- if not api_requests.create_if_not_exists(request):
715
+ if not await api_requests.create_if_not_exists_async(request):
717
716
  raise exceptions.RequestAlreadyExistsError(
718
717
  f'Request {request_id} already exists.')
719
718
 
@@ -721,17 +720,18 @@ def prepare_request(
721
720
  return request
722
721
 
723
722
 
724
- def schedule_request(request_id: str,
725
- request_name: str,
726
- request_body: payloads.RequestBody,
727
- func: Callable[P, Any],
728
- request_cluster_name: Optional[str] = None,
729
- ignore_return_value: bool = False,
730
- schedule_type: api_requests.ScheduleType = (
731
- api_requests.ScheduleType.LONG),
732
- is_skypilot_system: bool = False,
733
- precondition: Optional[preconditions.Precondition] = None,
734
- retryable: bool = False) -> None:
723
+ async def schedule_request_async(request_id: str,
724
+ request_name: str,
725
+ request_body: payloads.RequestBody,
726
+ func: Callable[P, Any],
727
+ request_cluster_name: Optional[str] = None,
728
+ ignore_return_value: bool = False,
729
+ schedule_type: api_requests.ScheduleType = (
730
+ api_requests.ScheduleType.LONG),
731
+ is_skypilot_system: bool = False,
732
+ precondition: Optional[
733
+ preconditions.Precondition] = None,
734
+ retryable: bool = False) -> None:
735
735
  """Enqueue a request to the request queue.
736
736
 
737
737
  Args:
@@ -752,9 +752,11 @@ def schedule_request(request_id: str,
752
752
  The precondition is waited asynchronously and does not block the
753
753
  caller.
754
754
  """
755
- request_task = prepare_request(request_id, request_name, request_body, func,
756
- request_cluster_name, schedule_type,
757
- is_skypilot_system)
755
+ request_task = await prepare_request_async(request_id, request_name,
756
+ request_body, func,
757
+ request_cluster_name,
758
+ schedule_type,
759
+ is_skypilot_system)
758
760
  schedule_prepared_request(request_task, ignore_return_value, precondition,
759
761
  retryable)
760
762
 
@@ -363,9 +363,10 @@ class CancelBody(RequestBody):
363
363
  return kwargs
364
364
 
365
365
 
366
- class ClusterNameBody(RequestBody):
366
+ class ProvisionLogsBody(RequestBody):
367
367
  """Cluster node."""
368
368
  cluster_name: str
369
+ worker: Optional[int] = None
369
370
 
370
371
 
371
372
  class ClusterJobBody(RequestBody):
@@ -541,6 +542,9 @@ class JobsQueueV2Body(RequestBody):
541
542
  page: Optional[int] = None
542
543
  limit: Optional[int] = None
543
544
  statuses: Optional[List[str]] = None
545
+ # The fields to return in the response.
546
+ # Refer to the fields in the `class ManagedJobRecord` in `response.py`
547
+ fields: Optional[List[str]] = None
544
548
 
545
549
 
546
550
  class JobsCancelBody(RequestBody):
@@ -90,7 +90,7 @@ class Precondition(abc.ABC):
90
90
  while True:
91
91
  if self.timeout > 0 and time.time() - start_time > self.timeout:
92
92
  # Cancel the request on timeout.
93
- api_requests.set_request_failed(
93
+ await api_requests.set_request_failed_async(
94
94
  self.request_id,
95
95
  exceptions.RequestCancelled(
96
96
  f'Request {self.request_id} precondition wait timed '
@@ -98,13 +98,15 @@ class Precondition(abc.ABC):
98
98
  return False
99
99
 
100
100
  # Check if the request has been cancelled
101
- request = await api_requests.get_request_async(self.request_id)
101
+ request = await api_requests.get_request_async(self.request_id,
102
+ fields=['status'])
102
103
  if request is None:
103
104
  logger.error(f'Request {self.request_id} not found')
104
105
  return False
105
106
  if request.status == api_requests.RequestStatus.CANCELLED:
106
107
  logger.debug(f'Request {self.request_id} cancelled')
107
108
  return False
109
+ del request
108
110
 
109
111
  try:
110
112
  met, status_msg = await self.check()
@@ -116,7 +118,7 @@ class Precondition(abc.ABC):
116
118
  self.request_id, status_msg)
117
119
  last_status_msg = status_msg
118
120
  except (Exception, SystemExit, KeyboardInterrupt) as e: # pylint: disable=broad-except
119
- api_requests.set_request_failed(self.request_id, e)
121
+ await api_requests.set_request_failed_async(self.request_id, e)
120
122
  logger.info(f'Request {self.request_id} failed due to '
121
123
  f'{common_utils.format_exception(e)}')
122
124
  return False
@@ -166,7 +168,10 @@ class ClusterStartCompletePrecondition(Precondition):
166
168
  api_requests.RequestStatus.RUNNING
167
169
  ],
168
170
  include_request_names=['sky.launch', 'sky.start'],
169
- cluster_names=[self.cluster_name]))
171
+ cluster_names=[self.cluster_name],
172
+ # Only get the request ID to avoid fetching the whole request.
173
+ # We're only interested in the count, not the whole request.
174
+ fields=['request_id']))
170
175
  if len(requests) == 0:
171
176
  # No running or pending tasks, the start process is done.
172
177
  return True, None
@@ -16,6 +16,7 @@ import time
16
16
  import traceback
17
17
  from typing import (Any, Callable, Dict, Generator, List, NamedTuple, Optional,
18
18
  Tuple)
19
+ import uuid
19
20
 
20
21
  import anyio
21
22
  import colorama
@@ -293,6 +294,11 @@ class Request:
293
294
  raise
294
295
 
295
296
 
297
+ def get_new_request_id() -> str:
298
+ """Get a new request ID."""
299
+ return str(uuid.uuid4())
300
+
301
+
296
302
  def encode_requests(requests: List[Request]) -> List[payloads.RequestPayload]:
297
303
  """Serialize the SkyPilot API request for display purposes.
298
304
 
@@ -400,7 +406,8 @@ def kill_cluster_requests(cluster_name: str, exclude_request_name: str):
400
406
  for request_task in get_request_tasks(req_filter=RequestTaskFilter(
401
407
  status=[RequestStatus.PENDING, RequestStatus.RUNNING],
402
408
  exclude_request_names=[exclude_request_name],
403
- cluster_names=[cluster_name]))
409
+ cluster_names=[cluster_name],
410
+ fields=['request_id']))
404
411
  ]
405
412
  kill_requests(request_ids)
406
413
 
@@ -425,7 +432,8 @@ def kill_requests(request_ids: Optional[List[str]] = None,
425
432
  status=[RequestStatus.PENDING, RequestStatus.RUNNING],
426
433
  # Avoid cancelling the cancel request itself.
427
434
  exclude_request_names=['sky.api_cancel'],
428
- user_id=user_id))
435
+ user_id=user_id,
436
+ fields=['request_id']))
429
437
  ]
430
438
  cancelled_request_ids = []
431
439
  for request_id in request_ids:
@@ -592,6 +600,18 @@ def update_request(request_id: str) -> Generator[Optional[Request], None, None]:
592
600
  _add_or_update_request_no_lock(request)
593
601
 
594
602
 
603
+ @init_db
604
+ @metrics_lib.time_me
605
+ @asyncio_utils.shield
606
+ async def update_status_async(request_id: str, status: RequestStatus) -> None:
607
+ """Update the status of a request"""
608
+ async with filelock.AsyncFileLock(request_lock_path(request_id)):
609
+ request = await _get_request_no_lock_async(request_id)
610
+ if request is not None:
611
+ request.status = status
612
+ await _add_or_update_request_no_lock_async(request)
613
+
614
+
595
615
  @init_db
596
616
  @metrics_lib.time_me
597
617
  @asyncio_utils.shield
@@ -604,62 +624,75 @@ async def update_status_msg_async(request_id: str, status_msg: str) -> None:
604
624
  await _add_or_update_request_no_lock_async(request)
605
625
 
606
626
 
607
- _get_request_sql = (f'SELECT {", ".join(REQUEST_COLUMNS)} FROM {REQUEST_TABLE} '
608
- 'WHERE request_id LIKE ?')
609
-
610
-
611
- def _get_request_no_lock(request_id: str) -> Optional[Request]:
627
+ def _get_request_no_lock(
628
+ request_id: str,
629
+ fields: Optional[List[str]] = None) -> Optional[Request]:
612
630
  """Get a SkyPilot API request."""
613
631
  assert _DB is not None
632
+ columns_str = ', '.join(REQUEST_COLUMNS)
633
+ if fields:
634
+ columns_str = ', '.join(fields)
614
635
  with _DB.conn:
615
636
  cursor = _DB.conn.cursor()
616
- cursor.execute(_get_request_sql, (request_id + '%',))
637
+ cursor.execute((f'SELECT {columns_str} FROM {REQUEST_TABLE} '
638
+ 'WHERE request_id LIKE ?'), (request_id + '%',))
617
639
  row = cursor.fetchone()
618
640
  if row is None:
619
641
  return None
642
+ if fields:
643
+ row = _update_request_row_fields(row, fields)
620
644
  return Request.from_row(row)
621
645
 
622
646
 
623
- async def _get_request_no_lock_async(request_id: str) -> Optional[Request]:
647
+ async def _get_request_no_lock_async(
648
+ request_id: str,
649
+ fields: Optional[List[str]] = None) -> Optional[Request]:
624
650
  """Async version of _get_request_no_lock."""
625
651
  assert _DB is not None
626
- async with _DB.execute_fetchall_async(_get_request_sql,
627
- (request_id + '%',)) as rows:
652
+ columns_str = ', '.join(REQUEST_COLUMNS)
653
+ if fields:
654
+ columns_str = ', '.join(fields)
655
+ async with _DB.execute_fetchall_async(
656
+ (f'SELECT {columns_str} FROM {REQUEST_TABLE} '
657
+ 'WHERE request_id LIKE ?'), (request_id + '%',)) as rows:
628
658
  row = rows[0] if rows else None
629
659
  if row is None:
630
660
  return None
661
+ if fields:
662
+ row = _update_request_row_fields(row, fields)
631
663
  return Request.from_row(row)
632
664
 
633
665
 
634
- @init_db
666
+ @init_db_async
635
667
  @metrics_lib.time_me
636
- def get_latest_request_id() -> Optional[str]:
668
+ async def get_latest_request_id_async() -> Optional[str]:
637
669
  """Get the latest request ID."""
638
670
  assert _DB is not None
639
- with _DB.conn:
640
- cursor = _DB.conn.cursor()
641
- cursor.execute(f'SELECT request_id FROM {REQUEST_TABLE} '
642
- 'ORDER BY created_at DESC LIMIT 1')
643
- row = cursor.fetchone()
644
- return row[0] if row else None
671
+ async with _DB.execute_fetchall_async(
672
+ (f'SELECT request_id FROM {REQUEST_TABLE} '
673
+ 'ORDER BY created_at DESC LIMIT 1')) as rows:
674
+ return rows[0][0] if rows else None
645
675
 
646
676
 
647
677
  @init_db
648
678
  @metrics_lib.time_me
649
- def get_request(request_id: str) -> Optional[Request]:
679
+ def get_request(request_id: str,
680
+ fields: Optional[List[str]] = None) -> Optional[Request]:
650
681
  """Get a SkyPilot API request."""
651
682
  with filelock.FileLock(request_lock_path(request_id)):
652
- return _get_request_no_lock(request_id)
683
+ return _get_request_no_lock(request_id, fields)
653
684
 
654
685
 
655
686
  @init_db_async
656
687
  @metrics_lib.time_me_async
657
688
  @asyncio_utils.shield
658
- async def get_request_async(request_id: str) -> Optional[Request]:
689
+ async def get_request_async(
690
+ request_id: str,
691
+ fields: Optional[List[str]] = None) -> Optional[Request]:
659
692
  """Async version of get_request."""
660
693
  # TODO(aylei): figure out how to remove FileLock here to avoid the overhead
661
694
  async with filelock.AsyncFileLock(request_lock_path(request_id)):
662
- return await _get_request_no_lock_async(request_id)
695
+ return await _get_request_no_lock_async(request_id, fields)
663
696
 
664
697
 
665
698
  class StatusWithMsg(NamedTuple):
@@ -696,17 +729,6 @@ async def get_request_status_async(
696
729
  return StatusWithMsg(status, status_msg)
697
730
 
698
731
 
699
- @init_db
700
- @metrics_lib.time_me
701
- def create_if_not_exists(request: Request) -> bool:
702
- """Create a SkyPilot API request if it does not exist."""
703
- with filelock.FileLock(request_lock_path(request.request_id)):
704
- if _get_request_no_lock(request.request_id) is not None:
705
- return False
706
- _add_or_update_request_no_lock(request)
707
- return True
708
-
709
-
710
732
  @init_db_async
711
733
  @metrics_lib.time_me_async
712
734
  @asyncio_utils.shield
@@ -896,6 +918,23 @@ def set_request_failed(request_id: str, e: BaseException) -> None:
896
918
  request_task.set_error(e)
897
919
 
898
920
 
921
+ @init_db_async
922
+ @metrics_lib.time_me_async
923
+ @asyncio_utils.shield
924
+ async def set_request_failed_async(request_id: str, e: BaseException) -> None:
925
+ """Set a request to failed and populate the error message."""
926
+ with ux_utils.enable_traceback():
927
+ stacktrace = traceback.format_exc()
928
+ setattr(e, 'stacktrace', stacktrace)
929
+ async with filelock.AsyncFileLock(request_lock_path(request_id)):
930
+ request_task = await _get_request_no_lock_async(request_id)
931
+ assert request_task is not None, request_id
932
+ request_task.status = RequestStatus.FAILED
933
+ request_task.finished_at = time.time()
934
+ request_task.set_error(e)
935
+ await _add_or_update_request_no_lock_async(request_task)
936
+
937
+
899
938
  def set_request_succeeded(request_id: str, result: Optional[Any]) -> None:
900
939
  """Set a request to succeeded and populate the result."""
901
940
  with update_request(request_id) as request_task:
@@ -906,28 +945,50 @@ def set_request_succeeded(request_id: str, result: Optional[Any]) -> None:
906
945
  request_task.set_return_value(result)
907
946
 
908
947
 
909
- def set_request_cancelled(request_id: str) -> None:
948
+ @init_db_async
949
+ @metrics_lib.time_me_async
950
+ @asyncio_utils.shield
951
+ async def set_request_succeeded_async(request_id: str,
952
+ result: Optional[Any]) -> None:
953
+ """Set a request to succeeded and populate the result."""
954
+ async with filelock.AsyncFileLock(request_lock_path(request_id)):
955
+ request_task = await _get_request_no_lock_async(request_id)
956
+ assert request_task is not None, request_id
957
+ request_task.status = RequestStatus.SUCCEEDED
958
+ request_task.finished_at = time.time()
959
+ if result is not None:
960
+ request_task.set_return_value(result)
961
+ await _add_or_update_request_no_lock_async(request_task)
962
+
963
+
964
+ @init_db_async
965
+ @metrics_lib.time_me_async
966
+ @asyncio_utils.shield
967
+ async def set_request_cancelled_async(request_id: str) -> None:
910
968
  """Set a pending or running request to cancelled."""
911
- with update_request(request_id) as request_task:
969
+ async with filelock.AsyncFileLock(request_lock_path(request_id)):
970
+ request_task = await _get_request_no_lock_async(request_id)
912
971
  assert request_task is not None, request_id
913
972
  # Already finished or cancelled.
914
973
  if request_task.status > RequestStatus.RUNNING:
915
974
  return
916
975
  request_task.finished_at = time.time()
917
976
  request_task.status = RequestStatus.CANCELLED
977
+ await _add_or_update_request_no_lock_async(request_task)
918
978
 
919
979
 
920
980
  @init_db
921
981
  @metrics_lib.time_me
922
- async def _delete_requests(requests: List[Request]):
982
+ async def _delete_requests(request_ids: List[str]):
923
983
  """Clean up requests by their IDs."""
924
- id_list_str = ','.join(repr(req.request_id) for req in requests)
984
+ id_list_str = ','.join(repr(request_id) for request_id in request_ids)
925
985
  assert _DB is not None
926
986
  await _DB.execute_and_commit_async(
927
987
  f'DELETE FROM {REQUEST_TABLE} WHERE request_id IN ({id_list_str})')
928
988
 
929
989
 
930
- async def clean_finished_requests_with_retention(retention_seconds: int):
990
+ async def clean_finished_requests_with_retention(retention_seconds: int,
991
+ batch_size: int = 1000):
931
992
  """Clean up finished requests older than the retention period.
932
993
 
933
994
  This function removes old finished requests (SUCCEEDED, FAILED, CANCELLED)
@@ -936,24 +997,40 @@ async def clean_finished_requests_with_retention(retention_seconds: int):
936
997
  Args:
937
998
  retention_seconds: Requests older than this many seconds will be
938
999
  deleted.
1000
+ batch_size: batch delete 'batch_size' requests at a time to
1001
+ avoid using too much memory at once and to let each
1002
+ db query complete in a reasonable time. All stale
1003
+ requests older than the retention period will be deleted
1004
+ regardless of the batch size.
939
1005
  """
940
- reqs = await get_request_tasks_async(
941
- req_filter=RequestTaskFilter(status=RequestStatus.finished_status(),
942
- finished_before=time.time() -
943
- retention_seconds))
944
-
945
- futs = []
946
- for req in reqs:
947
- futs.append(
948
- asyncio.create_task(
949
- anyio.Path(req.log_path.absolute()).unlink(missing_ok=True)))
950
- await asyncio.gather(*futs)
951
-
952
- await _delete_requests(reqs)
1006
+ total_deleted = 0
1007
+ while True:
1008
+ reqs = await get_request_tasks_async(
1009
+ req_filter=RequestTaskFilter(status=RequestStatus.finished_status(),
1010
+ finished_before=time.time() -
1011
+ retention_seconds,
1012
+ limit=batch_size,
1013
+ fields=['request_id']))
1014
+ if len(reqs) == 0:
1015
+ break
1016
+ futs = []
1017
+ for req in reqs:
1018
+ # req.log_path is derived from request_id,
1019
+ # so it's ok to just grab the request_id in the above query.
1020
+ futs.append(
1021
+ asyncio.create_task(
1022
+ anyio.Path(
1023
+ req.log_path.absolute()).unlink(missing_ok=True)))
1024
+ await asyncio.gather(*futs)
1025
+
1026
+ await _delete_requests([req.request_id for req in reqs])
1027
+ total_deleted += len(reqs)
1028
+ if len(reqs) < batch_size:
1029
+ break
953
1030
 
954
1031
  # To avoid leakage of the log file, logs must be deleted before the
955
1032
  # request task in the database.
956
- logger.info(f'Cleaned up {len(reqs)} finished requests '
1033
+ logger.info(f'Cleaned up {total_deleted} finished requests '
957
1034
  f'older than {retention_seconds} seconds')
958
1035
 
959
1036
 
@@ -121,7 +121,7 @@ def encode_status_kubernetes(
121
121
  encoded_cluster = dataclasses.asdict(cluster)
122
122
  encoded_cluster['status'] = encoded_cluster['status'].value
123
123
  encoded_unmanaged_clusters.append(encoded_cluster)
124
- all_jobs = [job.model_dump() for job in all_jobs]
124
+ all_jobs = [job.model_dump(by_alias=True) for job in all_jobs]
125
125
  return encoded_all_clusters, encoded_unmanaged_clusters, all_jobs, context
126
126
 
127
127
 
@@ -151,9 +151,9 @@ def encode_jobs_queue_v2(
151
151
  for job in jobs:
152
152
  job['status'] = job['status'].value
153
153
  if total is None:
154
- return [job.model_dump() for job in jobs]
154
+ return [job.model_dump(by_alias=True) for job in jobs]
155
155
  return {
156
- 'jobs': [job.model_dump() for job in jobs],
156
+ 'jobs': [job.model_dump(by_alias=True) for job in jobs],
157
157
  'total': total,
158
158
  'total_no_filter': total_no_filter,
159
159
  'status_counts': status_counts