skypilot-nightly 1.0.0.dev20250623__py3-none-any.whl → 1.0.0.dev20250625__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (165)
  1. sky/__init__.py +2 -2
  2. sky/admin_policy.py +16 -5
  3. sky/backends/__init__.py +2 -1
  4. sky/backends/backend_utils.py +38 -11
  5. sky/backends/cloud_vm_ray_backend.py +52 -18
  6. sky/client/cli/command.py +264 -25
  7. sky/client/sdk.py +119 -85
  8. sky/clouds/aws.py +10 -7
  9. sky/clouds/azure.py +10 -7
  10. sky/clouds/cloud.py +2 -0
  11. sky/clouds/cudo.py +2 -0
  12. sky/clouds/do.py +10 -7
  13. sky/clouds/fluidstack.py +2 -0
  14. sky/clouds/gcp.py +10 -7
  15. sky/clouds/hyperbolic.py +10 -7
  16. sky/clouds/ibm.py +2 -0
  17. sky/clouds/kubernetes.py +27 -9
  18. sky/clouds/lambda_cloud.py +10 -7
  19. sky/clouds/nebius.py +10 -7
  20. sky/clouds/oci.py +10 -7
  21. sky/clouds/paperspace.py +10 -7
  22. sky/clouds/runpod.py +10 -7
  23. sky/clouds/scp.py +10 -7
  24. sky/clouds/vast.py +10 -7
  25. sky/clouds/vsphere.py +2 -0
  26. sky/core.py +89 -15
  27. sky/dag.py +14 -0
  28. sky/dashboard/out/404.html +1 -1
  29. sky/dashboard/out/_next/static/ZWdSYkqVe3WjnFR8ocqoG/_buildManifest.js +1 -0
  30. sky/dashboard/out/_next/static/chunks/230-d6e363362017ff3a.js +1 -0
  31. sky/dashboard/out/_next/static/chunks/310.2671028c20e892c7.js +16 -0
  32. sky/dashboard/out/_next/static/chunks/37-1f1e94f5a561202a.js +6 -0
  33. sky/dashboard/out/_next/static/chunks/42.bc85e5b1a4debf22.js +6 -0
  34. sky/dashboard/out/_next/static/chunks/470-92dd1614396389be.js +1 -0
  35. sky/dashboard/out/_next/static/chunks/{513.211357a2914a34b2.js → 513.309df9e18a9ff005.js} +1 -1
  36. sky/dashboard/out/_next/static/chunks/544.110e53813fb98e2e.js +1 -0
  37. sky/dashboard/out/_next/static/chunks/645.961f08e39b8ce447.js +1 -0
  38. sky/dashboard/out/_next/static/chunks/66-66ae330df2d3c1c7.js +1 -0
  39. sky/dashboard/out/_next/static/chunks/682.00e56a220dd26fe1.js +6 -0
  40. sky/dashboard/out/_next/static/chunks/697.6460bf72e760addd.js +20 -0
  41. sky/dashboard/out/_next/static/chunks/856-cdf66268ec878d0c.js +1 -0
  42. sky/dashboard/out/_next/static/chunks/938-068520cc11738deb.js +1 -0
  43. sky/dashboard/out/_next/static/chunks/969-d3a0b53f728d280a.js +1 -0
  44. sky/dashboard/out/_next/static/chunks/989-db34c16ad7ea6155.js +1 -0
  45. sky/dashboard/out/_next/static/chunks/pages/{_app-c416e87d5c2715cf.js → _app-0ef7418d1a3822f3.js} +1 -1
  46. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-aff040d7bc5d0086.js +6 -0
  47. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-32ce4f49f2261f55.js +6 -0
  48. sky/dashboard/out/_next/static/chunks/pages/clusters-4aa031d1f42723d8.js +1 -0
  49. sky/dashboard/out/_next/static/chunks/pages/config-3102d02a188f04b3.js +1 -0
  50. sky/dashboard/out/_next/static/chunks/pages/infra/[context]-6f1e02e31eecb5ce.js +1 -0
  51. sky/dashboard/out/_next/static/chunks/pages/infra-fd5dc8a91bd9169a.js +1 -0
  52. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-e4b23128db0774cd.js +16 -0
  53. sky/dashboard/out/_next/static/chunks/pages/jobs-26da173e20af16e4.js +1 -0
  54. sky/dashboard/out/_next/static/chunks/pages/users-ce29e7420385563d.js +1 -0
  55. sky/dashboard/out/_next/static/chunks/pages/volumes-476b670ef33d1ecd.js +1 -0
  56. sky/dashboard/out/_next/static/chunks/pages/workspace/new-09ae0f6f972aa871.js +1 -0
  57. sky/dashboard/out/_next/static/chunks/pages/workspaces/{[name]-c4ff1ec05e2f3daf.js → [name]-0b4c662a25e4747a.js} +1 -1
  58. sky/dashboard/out/_next/static/chunks/pages/workspaces-862b120406461b10.js +1 -0
  59. sky/dashboard/out/_next/static/chunks/webpack-6133dc1e928bd0b5.js +1 -0
  60. sky/dashboard/out/_next/static/css/b23cb0257bf96c51.css +3 -0
  61. sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
  62. sky/dashboard/out/clusters/[cluster].html +1 -1
  63. sky/dashboard/out/clusters.html +1 -1
  64. sky/dashboard/out/config.html +1 -1
  65. sky/dashboard/out/index.html +1 -1
  66. sky/dashboard/out/infra/[context].html +1 -1
  67. sky/dashboard/out/infra.html +1 -1
  68. sky/dashboard/out/jobs/[job].html +1 -1
  69. sky/dashboard/out/jobs.html +1 -1
  70. sky/dashboard/out/users.html +1 -1
  71. sky/dashboard/out/volumes.html +1 -0
  72. sky/dashboard/out/workspace/new.html +1 -1
  73. sky/dashboard/out/workspaces/[name].html +1 -1
  74. sky/dashboard/out/workspaces.html +1 -1
  75. sky/data/storage_utils.py +2 -4
  76. sky/exceptions.py +26 -0
  77. sky/execution.py +5 -0
  78. sky/global_user_state.py +263 -20
  79. sky/jobs/client/sdk.py +13 -12
  80. sky/jobs/controller.py +5 -1
  81. sky/jobs/scheduler.py +4 -3
  82. sky/jobs/server/core.py +121 -51
  83. sky/jobs/state.py +15 -0
  84. sky/jobs/utils.py +114 -8
  85. sky/models.py +16 -0
  86. sky/provision/__init__.py +26 -0
  87. sky/provision/kubernetes/__init__.py +3 -0
  88. sky/provision/kubernetes/instance.py +38 -77
  89. sky/provision/kubernetes/utils.py +52 -2
  90. sky/provision/kubernetes/volume.py +147 -0
  91. sky/resources.py +20 -76
  92. sky/serve/client/sdk.py +13 -13
  93. sky/serve/server/core.py +5 -1
  94. sky/server/common.py +40 -5
  95. sky/server/constants.py +5 -1
  96. sky/server/metrics.py +105 -0
  97. sky/server/requests/executor.py +30 -14
  98. sky/server/requests/payloads.py +22 -3
  99. sky/server/requests/requests.py +59 -2
  100. sky/server/rest.py +152 -0
  101. sky/server/server.py +70 -19
  102. sky/server/state.py +20 -0
  103. sky/server/stream_utils.py +8 -3
  104. sky/server/uvicorn.py +153 -13
  105. sky/setup_files/dependencies.py +2 -0
  106. sky/skylet/constants.py +19 -14
  107. sky/task.py +141 -43
  108. sky/templates/jobs-controller.yaml.j2 +12 -1
  109. sky/templates/kubernetes-ray.yml.j2 +31 -2
  110. sky/users/permission.py +2 -0
  111. sky/utils/admin_policy_utils.py +5 -1
  112. sky/utils/cli_utils/status_utils.py +25 -17
  113. sky/utils/command_runner.py +118 -12
  114. sky/utils/command_runner.pyi +57 -0
  115. sky/utils/common_utils.py +9 -1
  116. sky/utils/context.py +3 -1
  117. sky/utils/controller_utils.py +1 -2
  118. sky/utils/resources_utils.py +66 -0
  119. sky/utils/rich_utils.py +6 -0
  120. sky/utils/schemas.py +180 -38
  121. sky/utils/status_lib.py +10 -0
  122. sky/utils/validator.py +11 -1
  123. sky/volumes/__init__.py +0 -0
  124. sky/volumes/client/__init__.py +0 -0
  125. sky/volumes/client/sdk.py +64 -0
  126. sky/volumes/server/__init__.py +0 -0
  127. sky/volumes/server/core.py +199 -0
  128. sky/volumes/server/server.py +85 -0
  129. sky/volumes/utils.py +158 -0
  130. sky/volumes/volume.py +198 -0
  131. {skypilot_nightly-1.0.0.dev20250623.dist-info → skypilot_nightly-1.0.0.dev20250625.dist-info}/METADATA +2 -1
  132. {skypilot_nightly-1.0.0.dev20250623.dist-info → skypilot_nightly-1.0.0.dev20250625.dist-info}/RECORD +139 -123
  133. sky/dashboard/out/_next/static/F4kiZ6Zh72jA6HzZ3ncFo/_buildManifest.js +0 -1
  134. sky/dashboard/out/_next/static/chunks/350.9e123a4551f68b0d.js +0 -1
  135. sky/dashboard/out/_next/static/chunks/37-3a4d77ad62932eaf.js +0 -6
  136. sky/dashboard/out/_next/static/chunks/42.d39e24467181b06b.js +0 -6
  137. sky/dashboard/out/_next/static/chunks/470-4d1a5dbe58a8a2b9.js +0 -1
  138. sky/dashboard/out/_next/static/chunks/641.c8e452bc5070a630.js +0 -1
  139. sky/dashboard/out/_next/static/chunks/682.4dd5dc116f740b5f.js +0 -6
  140. sky/dashboard/out/_next/static/chunks/760-a89d354797ce7af5.js +0 -1
  141. sky/dashboard/out/_next/static/chunks/856-c2c39c0912285e54.js +0 -1
  142. sky/dashboard/out/_next/static/chunks/901-b424d293275e1fd7.js +0 -1
  143. sky/dashboard/out/_next/static/chunks/938-1493ac755eadeb35.js +0 -1
  144. sky/dashboard/out/_next/static/chunks/969-20d54a9d998dc102.js +0 -1
  145. sky/dashboard/out/_next/static/chunks/984.ae8c08791d274ca0.js +0 -50
  146. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-89216c616dbaa9c5.js +0 -6
  147. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-36bc0962129f72df.js +0 -6
  148. sky/dashboard/out/_next/static/chunks/pages/clusters-82a651dbad53ec6e.js +0 -1
  149. sky/dashboard/out/_next/static/chunks/pages/config-497a35a7ed49734a.js +0 -1
  150. sky/dashboard/out/_next/static/chunks/pages/infra/[context]-d2910be98e9227cb.js +0 -1
  151. sky/dashboard/out/_next/static/chunks/pages/infra-780860bcc1103945.js +0 -1
  152. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-cf490d1fa38f3740.js +0 -16
  153. sky/dashboard/out/_next/static/chunks/pages/jobs-336ab80e270ce2ce.js +0 -1
  154. sky/dashboard/out/_next/static/chunks/pages/users-928edf039219e47b.js +0 -1
  155. sky/dashboard/out/_next/static/chunks/pages/workspace/new-31aa8bdcb7592635.js +0 -1
  156. sky/dashboard/out/_next/static/chunks/pages/workspaces-82e6601baa5dd280.js +0 -1
  157. sky/dashboard/out/_next/static/chunks/webpack-0263b00d6a10e64a.js +0 -1
  158. sky/dashboard/out/_next/static/css/6c12ecc3bd2239b6.css +0 -3
  159. /sky/dashboard/out/_next/static/{F4kiZ6Zh72jA6HzZ3ncFo → ZWdSYkqVe3WjnFR8ocqoG}/_ssgManifest.js +0 -0
  160. /sky/dashboard/out/_next/static/chunks/{843-b3040e493f6e7947.js → 843-07d25a7e64462fd8.js} +0 -0
  161. /sky/dashboard/out/_next/static/chunks/{973-db3c97c2bfbceb65.js → 973-5b5019ba333e8d62.js} +0 -0
  162. {skypilot_nightly-1.0.0.dev20250623.dist-info → skypilot_nightly-1.0.0.dev20250625.dist-info}/WHEEL +0 -0
  163. {skypilot_nightly-1.0.0.dev20250623.dist-info → skypilot_nightly-1.0.0.dev20250625.dist-info}/entry_points.txt +0 -0
  164. {skypilot_nightly-1.0.0.dev20250623.dist-info → skypilot_nightly-1.0.0.dev20250625.dist-info}/licenses/LICENSE +0 -0
  165. {skypilot_nightly-1.0.0.dev20250623.dist-info → skypilot_nightly-1.0.0.dev20250625.dist-info}/top_level.txt +0 -0
sky/server/common.py CHANGED
@@ -9,11 +9,13 @@ import json
 import os
 import pathlib
 import re
+import shutil
 import subprocess
 import sys
+import tempfile
 import time
 import typing
-from typing import Any, Dict, Literal, Optional, Tuple
+from typing import Any, Dict, Literal, Optional, Tuple, Union
 from urllib import parse
 import uuid
 
@@ -27,6 +29,7 @@ from sky import skypilot_config
 from sky.adaptors import common as adaptors_common
 from sky.data import data_utils
 from sky.server import constants as server_constants
+from sky.server import rest
 from sky.skylet import constants
 from sky.usage import usage_lib
 from sky.utils import annotations
@@ -240,9 +243,9 @@ def get_api_server_status(endpoint: Optional[str] = None) -> ApiServerInfo:
     server_url = endpoint if endpoint is not None else get_server_url()
     while time_out_try_count <= RETRY_COUNT_ON_TIMEOUT:
         try:
-            response = requests.get(f'{server_url}/api/health',
-                                    timeout=2.5,
-                                    cookies=get_api_cookie_jar())
+            response = rest.get(f'{server_url}/api/health',
+                                timeout=2.5,
+                                cookies=get_api_cookie_jar())
         except requests.exceptions.Timeout:
             if time_out_try_count == RETRY_COUNT_ON_TIMEOUT:
                 return ApiServerInfo(status=ApiServerStatus.UNHEALTHY)
@@ -327,6 +330,8 @@ def get_request_id(response: 'requests.Response') -> RequestId:
 def _start_api_server(deploy: bool = False,
                       host: str = '127.0.0.1',
                       foreground: bool = False,
+                      metrics: bool = False,
+                      metrics_port: Optional[int] = None,
                       enable_basic_auth: bool = False):
     """Starts a SkyPilot API server locally."""
     server_url = get_server_url(host)
@@ -357,10 +362,13 @@ def _start_api_server(deploy: bool = False,
         args += ['--deploy']
     if host is not None:
         args += [f'--host={host}']
+    if metrics_port is not None:
+        args += [f'--metrics-port={metrics_port}']
 
     if foreground:
         # Replaces the current process with the API server
         os.environ[constants.ENV_VAR_IS_SKYPILOT_SERVER] = 'true'
+        _set_metrics_env_var(os.environ, metrics, deploy)
         if enable_basic_auth:
             os.environ[constants.ENV_VAR_ENABLE_BASIC_AUTH] = 'true'
         os.execvp(args[0], args)
@@ -368,6 +376,10 @@ def _start_api_server(deploy: bool = False,
     log_path = os.path.expanduser(constants.API_SERVER_LOGS)
     os.makedirs(os.path.dirname(log_path), exist_ok=True)
 
+    # For spawn mode, copy the environ to avoid polluting the SDK process.
+    server_env = os.environ.copy()
+    server_env[constants.ENV_VAR_IS_SKYPILOT_SERVER] = 'true'
+    _set_metrics_env_var(server_env, metrics, deploy)
     # Start the API server process in the background and don't wait for it.
     # If this is called from a CLI invocation, we need
     # start_new_session=True so that SIGINT on the CLI will not also kill
@@ -437,6 +449,26 @@ def _start_api_server(deploy: bool = False,
                 f'SkyPilot API server started. {dashboard_msg}'))
 
 
+def _set_metrics_env_var(env: Union[Dict[str, str], os._Environ], metrics: bool,
+                         deploy: bool):
+    """Sets the metrics environment variables.
+
+    Args:
+        env: The environment variables to set.
+        metrics: Whether to enable metrics.
+        deploy: Whether the server is running in deploy mode, which means
+            multiple processes might be running.
+    """
+    if metrics:
+        env[constants.ENV_VAR_SERVER_METRICS_ENABLED] = 'true'
+        if deploy:
+            metrics_dir = os.path.join(tempfile.gettempdir(), 'metrics')
+            shutil.rmtree(metrics_dir, ignore_errors=True)
+            os.makedirs(metrics_dir, exist_ok=True)
+            # Refer to https://prometheus.github.io/client_python/multiprocess/
+            env['PROMETHEUS_MULTIPROC_DIR'] = metrics_dir
+
+
 def check_server_healthy(
     endpoint: Optional[str] = None
 ) -> Tuple[Literal[
@@ -571,6 +603,8 @@ def get_skypilot_version_on_disk() -> str:
 def check_server_healthy_or_start_fn(deploy: bool = False,
                                      host: str = '127.0.0.1',
                                      foreground: bool = False,
+                                     metrics: bool = False,
+                                     metrics_port: Optional[int] = None,
                                      enable_basic_auth: bool = False):
     api_server_status = None
     try:
@@ -592,7 +626,8 @@ def check_server_healthy_or_start_fn(deploy: bool = False,
             # have started the server while we were waiting for the lock.
             api_server_info = get_api_server_status(endpoint)
             if api_server_info.status == ApiServerStatus.UNHEALTHY:
-                _start_api_server(deploy, host, foreground, enable_basic_auth)
+                _start_api_server(deploy, host, foreground, metrics,
+                                  metrics_port, enable_basic_auth)
 
 
 def check_server_healthy_or_start(func):
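The PROMETHEUS_MULTIPROC_DIR handling in _set_metrics_env_var follows the standard prometheus_client multiprocess protocol: each worker process writes its samples to files under that directory, and a collector merges them at scrape time. A minimal standalone sketch of that mechanism (illustrative only; it does not touch SkyPilot code):

    import os
    import tempfile

    # The env var must be set before prometheus_client is imported, so metric
    # values are backed by per-process files instead of in-process memory.
    os.environ['PROMETHEUS_MULTIPROC_DIR'] = tempfile.mkdtemp()

    from prometheus_client import CollectorRegistry, Counter, generate_latest
    from prometheus_client import multiprocess

    demo_requests_total = Counter('demo_requests_total', 'Demo counter')
    demo_requests_total.inc()  # each process updates its own on-disk shard

    # Scrape side: merge every process's files into one registry.
    registry = CollectorRegistry()
    multiprocess.MultiProcessCollector(registry)
    print(generate_latest(registry).decode())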
sky/server/constants.py CHANGED
@@ -7,7 +7,7 @@ from sky.skylet import constants
 # API server version, whenever there is a change in API server that requires a
 # restart of the local API server or error out when the client does not match
 # the server version.
-API_VERSION = '9'
+API_VERSION = '10'
 
 # Prefix for API request names.
 REQUEST_NAME_PREFIX = 'sky.'
@@ -22,6 +22,10 @@ API_SERVER_REQUEST_DB_PATH = '~/.sky/api_server/requests.db'
 # background.
 CLUSTER_REFRESH_DAEMON_INTERVAL_SECONDS = 60
 
+# The interval (seconds) for the volume status to be refreshed in the
+# background.
+VOLUME_REFRESH_DAEMON_INTERVAL_SECONDS = 60
+
 # Environment variable for a file path to the API cookie file.
 # Keep in sync with websocket_proxy.py
 API_COOKIE_FILE_ENV_VAR = f'{constants.SKYPILOT_ENV_VAR_PREFIX}API_COOKIE_FILE'
sky/server/metrics.py ADDED
@@ -0,0 +1,105 @@
+"""Instrumentation for the API server."""
+
+import asyncio
+import os
+import time
+
+import fastapi
+from prometheus_client import generate_latest
+from prometheus_client import multiprocess
+import prometheus_client as prom
+import starlette.middleware.base
+import uvicorn
+
+from sky import sky_logging
+
+logger = sky_logging.init_logger(__name__)
+
+# Total number of API server requests, grouped by path, method, and status.
+sky_apiserver_requests_total = prom.Counter(
+    'sky_apiserver_requests_total',
+    'Total number of API server requests',
+    ['path', 'method', 'status'],
+)
+
+# Time spent processing API server requests, grouped by path, method, and
+# status.
+sky_apiserver_request_duration_seconds = prom.Histogram(
+    'sky_apiserver_request_duration_seconds',
+    'Time spent processing API server requests',
+    ['path', 'method', 'status'],
+    buckets=(0.01, 0.025, 0.05, 0.1, 0.25, 0.5, 1.0, 2.5, 5.0, 10.0,
+             float('inf')),
+)
+
+metrics_app = fastapi.FastAPI()
+
+
+@metrics_app.get('/metrics')
+async def metrics() -> fastapi.Response:
+    """Expose aggregated Prometheus metrics from all worker processes."""
+    if os.environ.get('PROMETHEUS_MULTIPROC_DIR'):
+        # In multiprocess mode, we need to collect metrics from all processes.
+        registry = prom.CollectorRegistry()
+        multiprocess.MultiProcessCollector(registry)
+        data = generate_latest(registry)
+    else:
+        data = generate_latest()
+    return fastapi.Response(content=data,
+                            media_type=prom.CONTENT_TYPE_LATEST,
+                            headers={'Cache-Control': 'no-cache'})
+
+
+def run_metrics_server(host: str, port: int):
+    metrics_config = uvicorn.Config(
+        'sky.server.metrics:metrics_app',
+        host=host,
+        port=port,
+        workers=1,
+    )
+    metrics_server_instance = uvicorn.Server(metrics_config)
+    asyncio.run(metrics_server_instance.serve())
+
+
+def _get_status_code_group(status_code: int) -> str:
+    """Group status codes into classes (2xx, 5xx) to reduce cardinality."""
+    return f'{status_code // 100}xx'
+
+
+def _is_streaming_api(path: str) -> bool:
+    """Check if the path is a streaming API."""
+    path = path.rstrip('/')
+    return path.endswith('/logs') or path.endswith('/api/stream')
+
+
+class PrometheusMiddleware(starlette.middleware.base.BaseHTTPMiddleware):
+    """Middleware to collect Prometheus metrics for HTTP requests."""
+
+    async def dispatch(self, request: fastapi.Request, call_next):
+        path = request.url.path
+        logger.info(f'PROM Middleware Request: {request}, {request.url.path}')
+        streaming = _is_streaming_api(path)
+        if not streaming:
+            # Exclude streaming APIs, the duration is not meaningful.
+            # TODO(aylei): measure the duration of async execution instead.
+            start_time = time.time()
+        method = request.method
+        status_code_group = ''
+
+        try:
+            response = await call_next(request)
+            status_code_group = _get_status_code_group(response.status_code)
+        except Exception:  # pylint: disable=broad-except
+            status_code_group = '5xx'
+            raise
+        finally:
+            sky_apiserver_requests_total.labels(path=path,
+                                                method=method,
+                                                status=status_code_group).inc()
+            if not streaming:
+                duration = time.time() - start_time
+                sky_apiserver_request_duration_seconds.labels(
+                    path=path, method=method,
+                    status=status_code_group).observe(duration)
+
+        return response
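The new module only defines the middleware and a standalone metrics app; wiring them into the API server happens elsewhere in this release (presumably sky/server/server.py and sky/server/uvicorn.py, which also changed but are not reproduced here). A hedged sketch of how the pieces are meant to fit together; the host and port values are arbitrary:

    import threading

    import fastapi

    from sky.server import metrics

    app = fastapi.FastAPI()
    # Count and time every non-streaming API request.
    app.add_middleware(metrics.PrometheusMiddleware)

    # Serve the aggregated /metrics endpoint on a separate port so Prometheus
    # scrapes do not contend with API traffic.
    threading.Thread(target=metrics.run_metrics_server,
                     args=('0.0.0.0', 9090),
                     daemon=True).start()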
sky/server/requests/executor.py CHANGED
@@ -149,10 +149,25 @@ class RequestWorker:
         self.schedule_type = schedule_type
         self.garanteed_parallelism = config.garanteed_parallelism
         self.burstable_parallelism = config.burstable_parallelism
+        self._thread: Optional[threading.Thread] = None
+        self._cancel_event = threading.Event()
 
     def __str__(self) -> str:
         return f'Worker(schedule_type={self.schedule_type.value})'
 
+    def run_in_background(self) -> None:
+        # Thread dispatcher is sufficient for current scale, refer to
+        # tests/load_tests/test_queue_dispatcher.py for more details.
+        # Use daemon thread for automatic cleanup.
+        thread = threading.Thread(target=self.run, daemon=True)
+        thread.start()
+        self._thread = thread
+
+    def cancel(self) -> None:
+        if self._thread is not None:
+            self._cancel_event.set()
+            self._thread.join()
+
     def process_request(self, executor: process.BurstableExecutor,
                         queue: RequestQueue) -> None:
         try:
@@ -219,7 +234,7 @@
                 burst_workers=self.burstable_parallelism,
                 initializer=executor_initializer,
                 initargs=(proc_group,))
-            while True:
+            while not self._cancel_event.is_set():
                 self.process_request(executor, queue)
         # TODO(aylei): better to distinct between KeyboardInterrupt and SIGTERM.
         except KeyboardInterrupt:
@@ -539,15 +554,21 @@ def schedule_request(request_id: str,
     enqueue()
 
 
-def start(config: server_config.ServerConfig) -> List[multiprocessing.Process]:
+def start(
+    config: server_config.ServerConfig
+) -> Tuple[Optional[multiprocessing.Process], List[RequestWorker]]:
     """Start the request workers.
 
     Request workers run in background, schedule the requests and delegate the
     request execution to executor processes.
+
+    Returns:
+        A tuple of the queue server process and the list of request worker
+        threads.
     """
     global queue_backend
     queue_backend = config.queue_backend
-    sub_procs = []
+    queue_server = None
     # Setup the queues.
     if queue_backend == server_config.QueueBackend.MULTIPROCESSING:
         logger.info('Creating shared request queues')
@@ -564,7 +585,6 @@
         queue_server = multiprocessing.Process(
             target=mp_queue.start_queue_manager, args=(queue_names, port))
         queue_server.start()
-        sub_procs.append(queue_server)
         mp_queue.wait_for_queues_to_be_ready(queue_names,
                                              queue_server,
                                              port=port)
@@ -577,20 +597,16 @@
 
     logger.info('Request queues created')
 
-    def run_worker_in_background(worker: RequestWorker):
-        # Thread dispatcher is sufficient for current scale, refer to
-        # tests/load_tests/test_queue_dispatcher.py for more details.
-        # Use daemon thread for automatic cleanup.
-        thread = threading.Thread(target=worker.run, daemon=True)
-        thread.start()
-
+    workers = []
     # Start a worker for long requests.
     long_worker = RequestWorker(schedule_type=api_requests.ScheduleType.LONG,
                                 config=config.long_worker_config)
-    run_worker_in_background(long_worker)
+    long_worker.run_in_background()
+    workers.append(long_worker)
 
     # Start a worker for short requests.
    short_worker = RequestWorker(schedule_type=api_requests.ScheduleType.SHORT,
                                  config=config.short_worker_config)
-    run_worker_in_background(short_worker)
-    return sub_procs
+    short_worker.run_in_background()
+    workers.append(short_worker)
+    return queue_server, workers
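Because start() now hands back the queue-server process and the RequestWorker objects instead of a flat process list, the caller can tear the dispatchers down deterministically, which is what the new cancel() hook enables. A sketch of that teardown, assuming the caller keeps the returned handles (the actual shutdown path presumably lives in sky/server/uvicorn.py, changed in this release but not shown here; run_api_server is a placeholder):

    queue_server, workers = executor.start(config)  # config: ServerConfig
    try:
        run_api_server()  # placeholder for the API server's main loop
    finally:
        # Ask each dispatcher thread to leave its loop, then join it.
        for worker in workers:
            worker.cancel()
        # Stop the multiprocessing queue manager if one was started.
        if queue_server is not None:
            queue_server.terminate()
            queue_server.join()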
sky/server/requests/payloads.py CHANGED
@@ -5,7 +5,6 @@ kwargs for the payloads, otherwise, we have to keep the default values the sync
 with the backend functions. The benefit of having the default values in the
 payloads is that a user can find the default values in the Restful API docs.
 """
-import getpass
 import os
 import typing
 from typing import Any, Dict, List, Optional, Tuple, Union
@@ -58,8 +57,7 @@ def request_body_env_vars() -> dict:
         if common.is_api_server_local() and env_var in EXTERNAL_LOCAL_ENV_VARS:
             env_vars[env_var] = os.environ[env_var]
     env_vars[constants.USER_ID_ENV_VAR] = common_utils.get_user_hash()
-    env_vars[constants.USER_ENV_VAR] = os.getenv(constants.USER_ENV_VAR,
-                                                 getpass.getuser())
+    env_vars[constants.USER_ENV_VAR] = common_utils.get_current_user_name()
     env_vars[
         usage_constants.USAGE_RUN_ID_ENV_VAR] = usage_lib.messages.usage.run_id
     # Remove the path to config file, as the config content is included in the
@@ -370,6 +368,22 @@ class StorageBody(RequestBody):
     name: str
 
 
+class VolumeApplyBody(RequestBody):
+    """The request body for the volume apply endpoint."""
+    name: str
+    volume_type: str
+    cloud: str
+    region: Optional[str] = None
+    zone: Optional[str] = None
+    size: Optional[str] = None
+    config: Optional[Dict[str, Any]] = None
+
+
+class VolumeDeleteBody(RequestBody):
+    """The request body for the volume delete endpoint."""
+    names: List[str]
+
+
 class EndpointsBody(RequestBody):
     """The request body for the endpoint."""
     cluster: str
@@ -613,3 +627,8 @@ class UpdateConfigBody(RequestBody):
 class GetConfigBody(RequestBody):
     """The request body for getting the entire SkyPilot configuration."""
     pass
+
+
+class CostReportBody(RequestBody):
+    """The request body for the cost report endpoint."""
+    days: Optional[int] = 30
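The new bodies are RequestBody subclasses (pydantic-style models), so client code builds them with keyword arguments and leaves the optional fields unset. A rough illustration; the field values below, and the volume endpoints that eventually consume these bodies (defined in sky/volumes/server/server.py), are assumptions rather than values taken from this diff:

    from sky.server.requests import payloads

    apply_body = payloads.VolumeApplyBody(
        name='training-data',   # hypothetical volume name
        volume_type='pvc',      # hypothetical type string; see sky/volumes
        cloud='kubernetes',
        size='100Gi',           # region/zone/config stay None here
    )
    delete_body = payloads.VolumeDeleteBody(names=['training-data'])
    report_body = payloads.CostReportBody(days=7)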
sky/server/requests/requests.py CHANGED
@@ -38,6 +38,7 @@ REQUEST_TABLE = 'requests'
 COL_CLUSTER_NAME = 'cluster_name'
 COL_USER_ID = 'user_id'
 COL_STATUS_MSG = 'status_msg'
+COL_SHOULD_RETRY = 'should_retry'
 REQUEST_LOG_PATH_PREFIX = '~/sky_logs/api_server/requests'
 
 # TODO(zhwu): For scalability, there are several TODOs:
@@ -86,6 +87,7 @@ REQUEST_COLUMNS = [
     'schedule_type',
     COL_USER_ID,
     COL_STATUS_MSG,
+    COL_SHOULD_RETRY,
 ]
 
 
@@ -115,6 +117,7 @@ class RequestPayload:
     # Resources the request operates on.
     cluster_name: Optional[str] = None
     status_msg: Optional[str] = None
+    should_retry: bool = False
 
 
 @dataclasses.dataclass
@@ -137,6 +140,8 @@ class Request:
     cluster_name: Optional[str] = None
     # Status message of the request, indicates the reason of current status.
     status_msg: Optional[str] = None
+    # Whether the request should be retried.
+    should_retry: bool = False
 
     @property
     def log_path(self) -> pathlib.Path:
@@ -222,6 +227,7 @@ class Request:
             user_name=user_name,
             cluster_name=self.cluster_name,
             status_msg=self.status_msg,
+            should_retry=self.should_retry,
         )
 
     def encode(self) -> RequestPayload:
@@ -243,6 +249,7 @@ class Request:
                 user_id=self.user_id,
                 cluster_name=self.cluster_name,
                 status_msg=self.status_msg,
+                should_retry=self.should_retry,
             )
         except (TypeError, ValueError) as e:
             # The error is unexpected, so we don't suppress the stack trace.
@@ -274,6 +281,7 @@ class Request:
                 user_id=payload.user_id,
                 cluster_name=payload.cluster_name,
                 status_msg=payload.status_msg,
+                should_retry=payload.should_retry,
             )
         except (TypeError, ValueError) as e:
             logger.error(
@@ -327,6 +335,44 @@ def refresh_cluster_status_event():
         time.sleep(server_constants.CLUSTER_REFRESH_DAEMON_INTERVAL_SECONDS)
 
 
+def refresh_volume_status_event():
+    """Periodically refresh the volume status."""
+    # pylint: disable=import-outside-toplevel
+    from sky.volumes.server import core
+
+    # Disable logging for periodic refresh to avoid the usage message being
+    # sent multiple times.
+    os.environ[env_options.Options.DISABLE_LOGGING.env_key] = '1'
+
+    while True:
+        logger.info('=== Refreshing volume status ===')
+        core.volume_refresh()
+        logger.info('Volume status refreshed. Sleeping '
+                    f'{server_constants.VOLUME_REFRESH_DAEMON_INTERVAL_SECONDS}'
+                    ' seconds for the next refresh...\n')
+        time.sleep(server_constants.VOLUME_REFRESH_DAEMON_INTERVAL_SECONDS)
+
+
+def managed_job_status_refresh_event():
+    """Refresh the managed job status for controller consolidation mode."""
+    # pylint: disable=import-outside-toplevel
+    from sky.jobs import utils as managed_job_utils
+    if not managed_job_utils.is_consolidation_mode():
+        return
+    # We run the recovery logic before starting the event loop as those two are
+    # conflicting. Check PERSISTENT_RUN_RESTARTING_SIGNAL_FILE for details.
+    from sky.utils import controller_utils
+    if controller_utils.high_availability_specified(
+            controller_utils.Controllers.JOBS_CONTROLLER.value.cluster_name):
+        managed_job_utils.ha_recovery_for_consolidation_mode()
+    # After recovery, we start the event loop.
+    from sky.skylet import events
+    event = events.ManagedJobEvent()
+    while True:
+        time.sleep(events.EVENT_CHECKING_INTERVAL_SECONDS)
+        event.run()
+
+
 @dataclasses.dataclass
 class InternalRequestDaemon:
     id: str
@@ -341,7 +387,14 @@ INTERNAL_REQUEST_DAEMONS = [
     # cluster being stopped or down when `sky status -r` is called.
     InternalRequestDaemon(id='skypilot-status-refresh-daemon',
                           name='status',
-                          event_fn=refresh_cluster_status_event)
+                          event_fn=refresh_cluster_status_event),
+    # Volume status refresh daemon to update the volume status periodically.
+    InternalRequestDaemon(id='skypilot-volume-status-refresh-daemon',
+                          name='volume',
+                          event_fn=refresh_volume_status_event),
+    InternalRequestDaemon(id='managed-job-status-refresh-daemon',
+                          name='managed-job-status',
+                          event_fn=managed_job_status_refresh_event),
 ]
 
 
@@ -423,10 +476,14 @@ def create_table(cursor, conn):
         {COL_CLUSTER_NAME} TEXT,
         schedule_type TEXT,
         {COL_USER_ID} TEXT,
-        {COL_STATUS_MSG} TEXT)""")
+        {COL_STATUS_MSG} TEXT,
+        {COL_SHOULD_RETRY} INTEGER
+        )""")
 
     db_utils.add_column_to_table(cursor, conn, REQUEST_TABLE, COL_STATUS_MSG,
                                  'TEXT')
+    db_utils.add_column_to_table(cursor, conn, REQUEST_TABLE, COL_SHOULD_RETRY,
+                                 'INTEGER')
 
 
 _DB = None
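SQLite has no boolean type, so the new should_retry flag is persisted as an INTEGER column, and the add_column_to_table call upgrades request databases created by older servers in place. A quick, purely illustrative check against the default database path defined in sky/server/constants.py above:

    import os
    import sqlite3

    db_path = os.path.expanduser('~/.sky/api_server/requests.db')
    with sqlite3.connect(db_path) as conn:
        columns = [row[1] for row in conn.execute('PRAGMA table_info(requests)')]
    print('should_retry' in columns)  # True once the server has migrated the DB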
sky/server/rest.py ADDED
@@ -0,0 +1,152 @@
+"""REST API client of SkyPilot API server"""
+
+import contextlib
+import contextvars
+import functools
+import time
+import typing
+from typing import Any, Callable, cast, Optional, TypeVar
+
+import colorama
+
+from sky import exceptions
+from sky import sky_logging
+from sky.adaptors import common as adaptors_common
+from sky.utils import common_utils
+from sky.utils import rich_utils
+from sky.utils import ux_utils
+
+logger = sky_logging.init_logger(__name__)
+
+if typing.TYPE_CHECKING:
+    import requests
+
+else:
+    requests = adaptors_common.LazyImport('requests')
+
+F = TypeVar('F', bound=Callable[..., Any])
+
+_RETRY_CONTEXT = contextvars.ContextVar('retry_context', default=None)
+
+
+class RetryContext:
+
+    def __init__(self):
+        self.line_processed = 0
+
+
+def retry_on_server_unavailable(max_wait_seconds: int = 600,
+                                initial_backoff: float = 5.0,
+                                max_backoff_factor: int = 5):
+    """Decorator that retries a function when ServerTemporarilyUnavailableError
+    is caught.
+
+    Args:
+        max_wait_seconds: Maximum number of seconds to wait for the server to
+            be healthy
+        initial_backoff: Initial backoff time in seconds
+        max_backoff_factor: Maximum backoff factor for exponential backoff
+
+    Notes(dev):
+        This decorator is mainly used in two scenarios:
+        1. Decorate a Restful API call to make the API call wait for server
+           recovery when server is temporarily unavailable. APIs like /api/get
+           and /api/stream should not be retried since sending them to a new
+           replica of API server will not work.
+        2. Decorate a SDK function to make the entire SDK function call get
+           retried when /api/get or /logs raises a retryable error. This
+           is typically triggered by a graceful upgrade of the API server,
+           where the pending requests and logs requests will be interrupted.
+    """
+
+    def decorator(func: F) -> F:
+
+        @functools.wraps(func)
+        def wrapper(*args, **kwargs) -> Any:
+            msg = (
+                f'{colorama.Fore.YELLOW}API server is temporarily: upgrade in '
+                f'progress. Waiting to resume...{colorama.Style.RESET_ALL}')
+            backoff = common_utils.Backoff(
+                initial_backoff=initial_backoff,
+                max_backoff_factor=max_backoff_factor)
+            start_time = time.time()
+            attempt = 0
+
+            with _retry_in_context():
+                while True:
+                    attempt += 1
+                    try:
+                        return func(*args, **kwargs)
+                    except exceptions.ServerTemporarilyUnavailableError as e:
+                        # This will cause the status spinner being stopped and
+                        # restarted in every retry loop. But it is necessary to
+                        # stop the status spinner before retrying func() to
+                        # avoid the status spinner get stuck if the func() runs
+                        # for a long time without update status, e.g. sky logs.
+                        with rich_utils.client_status(msg):
+                            if time.time() - start_time > max_wait_seconds:
+                                # pylint: disable=line-too-long
+                                raise exceptions.ServerTemporarilyUnavailableError(
+                                    'Timeout waiting for the API server to be '
+                                    f'available after {max_wait_seconds}s.') \
+                                    from e
+
+                            sleep_time = backoff.current_backoff()
+                            time.sleep(sleep_time)
+                            logger.debug('The API server is unavailable. '
+                                         f'Retrying {func.__name__} '
+                                         f'(attempt {attempt}, '
+                                         f'backoff {sleep_time}s).')
+
+        return cast(F, wrapper)
+
+    return decorator
+
+
+@contextlib.contextmanager
+def _retry_in_context():
+    token = _RETRY_CONTEXT.set(RetryContext())
+    try:
+        yield
+    finally:
+        _RETRY_CONTEXT.reset(token)
+
+
+def get_retry_context() -> Optional[RetryContext]:
+    return _RETRY_CONTEXT.get()
+
+
+def handle_server_unavailable(response: 'requests.Response') -> None:
+    if response.status_code == 503:
+        # TODO(aylei): Hacky, depends on how nginx controller handles backends
+        # with no ready endpoints. Should use self-defined status code or header
+        # to distinguish retryable server error from general 503 errors.
+        with ux_utils.print_exception_no_traceback():
+            raise exceptions.ServerTemporarilyUnavailableError(
+                'SkyPilot API server is temporarily unavailable. '
+                'Please try again later.')
+
+
+@retry_on_server_unavailable()
+def post(url, data=None, json=None, **kwargs) -> 'requests.Response':
+    """Send a POST request to the API server, retry on server temporarily
+    unavailable."""
+    response = requests.post(url, data=data, json=json, **kwargs)
+    handle_server_unavailable(response)
+    return response
+
+
+@retry_on_server_unavailable()
+def get(url, params=None, **kwargs) -> 'requests.Response':
+    """Send a GET request to the API server, retry on server temporarily
+    unavailable."""
+    response = requests.get(url, params=params, **kwargs)
+    handle_server_unavailable(response)
+    return response
+
+
+def get_without_retry(url, params=None, **kwargs) -> 'requests.Response':
+    """Send a GET request to the API server without retry."""
+    response = requests.get(url, params=params, **kwargs)
+    handle_server_unavailable(response)
+    return response
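Per the Notes(dev) section of the decorator's docstring, retry_on_server_unavailable is meant to wrap whole SDK-level calls as well as individual REST requests. A hedged sketch of both usages; get_status and the endpoint URL are placeholders, not SkyPilot APIs:

    from sky.server import rest

    # Individual REST calls: rest.get()/rest.post() already retry while the
    # API server is temporarily returning 503s (e.g. during an upgrade).
    resp = rest.get('http://127.0.0.1:46580/api/health', timeout=2.5)

    # Whole functions: wrap the call so an interrupted long-running request
    # is re-driven end to end once the server comes back.
    @rest.retry_on_server_unavailable(max_wait_seconds=300)
    def get_status(server_url: str) -> dict:
        response = rest.get_without_retry(f'{server_url}/api/health')
        response.raise_for_status()
        return response.json()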