skypilot-nightly 1.0.0.dev20250624__py3-none-any.whl → 1.0.0.dev20250626__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (163)
  1. sky/__init__.py +2 -2
  2. sky/adaptors/kubernetes.py +1 -6
  3. sky/backends/backend_utils.py +26 -11
  4. sky/backends/cloud_vm_ray_backend.py +16 -5
  5. sky/client/cli/command.py +232 -9
  6. sky/client/sdk.py +195 -91
  7. sky/clouds/aws.py +10 -7
  8. sky/clouds/azure.py +10 -7
  9. sky/clouds/cloud.py +2 -0
  10. sky/clouds/cudo.py +2 -0
  11. sky/clouds/do.py +10 -7
  12. sky/clouds/fluidstack.py +2 -0
  13. sky/clouds/gcp.py +10 -7
  14. sky/clouds/hyperbolic.py +10 -7
  15. sky/clouds/ibm.py +2 -0
  16. sky/clouds/kubernetes.py +26 -9
  17. sky/clouds/lambda_cloud.py +10 -7
  18. sky/clouds/nebius.py +10 -7
  19. sky/clouds/oci.py +10 -7
  20. sky/clouds/paperspace.py +10 -7
  21. sky/clouds/runpod.py +10 -7
  22. sky/clouds/scp.py +10 -7
  23. sky/clouds/ssh.py +36 -0
  24. sky/clouds/vast.py +10 -7
  25. sky/clouds/vsphere.py +2 -0
  26. sky/core.py +21 -0
  27. sky/dag.py +14 -0
  28. sky/dashboard/out/404.html +1 -1
  29. sky/dashboard/out/_next/static/bs6UB9V4Jq10TIZ5x-kBK/_buildManifest.js +1 -0
  30. sky/dashboard/out/_next/static/chunks/141-fa5a20cbf401b351.js +11 -0
  31. sky/dashboard/out/_next/static/chunks/230-d6e363362017ff3a.js +1 -0
  32. sky/dashboard/out/_next/static/chunks/25.76c246239df93d50.js +6 -0
  33. sky/dashboard/out/_next/static/chunks/43-36177d00f6956ab2.js +1 -0
  34. sky/dashboard/out/_next/static/chunks/430.ed51037d1a4a438b.js +1 -0
  35. sky/dashboard/out/_next/static/chunks/470-92dd1614396389be.js +1 -0
  36. sky/dashboard/out/_next/static/chunks/544.110e53813fb98e2e.js +1 -0
  37. sky/dashboard/out/_next/static/chunks/645.961f08e39b8ce447.js +1 -0
  38. sky/dashboard/out/_next/static/chunks/690.55f9eed3be903f56.js +16 -0
  39. sky/dashboard/out/_next/static/chunks/697.6460bf72e760addd.js +20 -0
  40. sky/dashboard/out/_next/static/chunks/785.dc2686c3c1235554.js +1 -0
  41. sky/dashboard/out/_next/static/chunks/871-3db673be3ee3750b.js +6 -0
  42. sky/dashboard/out/_next/static/chunks/875.52c962183328b3f2.js +25 -0
  43. sky/dashboard/out/_next/static/chunks/973-81b2d057178adb76.js +1 -0
  44. sky/dashboard/out/_next/static/chunks/982.1b61658204416b0f.js +1 -0
  45. sky/dashboard/out/_next/static/chunks/984.e8bac186a24e5178.js +1 -0
  46. sky/dashboard/out/_next/static/chunks/990-0ad5ea1699e03ee8.js +1 -0
  47. sky/dashboard/out/_next/static/chunks/pages/{_app-ce31493da9747ef4.js → _app-9a3ce3170d2edcec.js} +1 -1
  48. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-aff040d7bc5d0086.js +6 -0
  49. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-8040f2483897ed0c.js +6 -0
  50. sky/dashboard/out/_next/static/chunks/pages/{clusters-7e9736af1c6345a6.js → clusters-f119a5630a1efd61.js} +1 -1
  51. sky/dashboard/out/_next/static/chunks/pages/config-6b255eae088da6a3.js +1 -0
  52. sky/dashboard/out/_next/static/chunks/pages/infra/[context]-b302aea4d65766bf.js +1 -0
  53. sky/dashboard/out/_next/static/chunks/pages/infra-ee8cc4d449945d19.js +1 -0
  54. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-e4b23128db0774cd.js +16 -0
  55. sky/dashboard/out/_next/static/chunks/pages/jobs-0a5695ff3075d94a.js +1 -0
  56. sky/dashboard/out/_next/static/chunks/pages/users-4978cbb093e141e7.js +1 -0
  57. sky/dashboard/out/_next/static/chunks/pages/volumes-476b670ef33d1ecd.js +1 -0
  58. sky/dashboard/out/_next/static/chunks/pages/workspace/{new-31aa8bdcb7592635.js → new-5b59bce9eb208d84.js} +1 -1
  59. sky/dashboard/out/_next/static/chunks/pages/workspaces/[name]-cb7e720b739de53a.js +1 -0
  60. sky/dashboard/out/_next/static/chunks/pages/workspaces-50e230828730cfb3.js +1 -0
  61. sky/dashboard/out/_next/static/chunks/webpack-08fdb9e6070127fc.js +1 -0
  62. sky/dashboard/out/_next/static/css/52082cf558ec9705.css +3 -0
  63. sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
  64. sky/dashboard/out/clusters/[cluster].html +1 -1
  65. sky/dashboard/out/clusters.html +1 -1
  66. sky/dashboard/out/config.html +1 -1
  67. sky/dashboard/out/index.html +1 -1
  68. sky/dashboard/out/infra/[context].html +1 -1
  69. sky/dashboard/out/infra.html +1 -1
  70. sky/dashboard/out/jobs/[job].html +1 -1
  71. sky/dashboard/out/jobs.html +1 -1
  72. sky/dashboard/out/users.html +1 -1
  73. sky/dashboard/out/volumes.html +1 -0
  74. sky/dashboard/out/workspace/new.html +1 -1
  75. sky/dashboard/out/workspaces/[name].html +1 -1
  76. sky/dashboard/out/workspaces.html +1 -1
  77. sky/data/storage_utils.py +2 -4
  78. sky/exceptions.py +15 -0
  79. sky/execution.py +5 -0
  80. sky/global_user_state.py +129 -0
  81. sky/jobs/client/sdk.py +13 -11
  82. sky/jobs/server/core.py +4 -0
  83. sky/models.py +16 -0
  84. sky/provision/__init__.py +26 -0
  85. sky/provision/kubernetes/__init__.py +3 -0
  86. sky/provision/kubernetes/instance.py +38 -77
  87. sky/provision/kubernetes/utils.py +70 -4
  88. sky/provision/kubernetes/volume.py +147 -0
  89. sky/resources.py +20 -76
  90. sky/serve/client/sdk.py +13 -13
  91. sky/serve/server/core.py +5 -1
  92. sky/server/common.py +40 -5
  93. sky/server/constants.py +5 -1
  94. sky/server/metrics.py +105 -0
  95. sky/server/requests/executor.py +30 -14
  96. sky/server/requests/payloads.py +16 -0
  97. sky/server/requests/requests.py +35 -1
  98. sky/server/rest.py +153 -0
  99. sky/server/server.py +70 -43
  100. sky/server/state.py +20 -0
  101. sky/server/stream_utils.py +8 -3
  102. sky/server/uvicorn.py +153 -13
  103. sky/setup_files/dependencies.py +2 -0
  104. sky/skylet/constants.py +19 -3
  105. sky/skypilot_config.py +3 -0
  106. sky/ssh_node_pools/__init__.py +1 -0
  107. sky/ssh_node_pools/core.py +133 -0
  108. sky/ssh_node_pools/server.py +232 -0
  109. sky/task.py +141 -18
  110. sky/templates/kubernetes-ray.yml.j2 +30 -1
  111. sky/users/permission.py +2 -0
  112. sky/utils/context.py +3 -1
  113. sky/utils/kubernetes/deploy_remote_cluster.py +12 -185
  114. sky/utils/kubernetes/ssh_utils.py +221 -0
  115. sky/utils/resources_utils.py +66 -0
  116. sky/utils/rich_utils.py +6 -0
  117. sky/utils/schemas.py +146 -3
  118. sky/utils/status_lib.py +10 -0
  119. sky/utils/validator.py +11 -1
  120. sky/volumes/__init__.py +0 -0
  121. sky/volumes/client/__init__.py +0 -0
  122. sky/volumes/client/sdk.py +64 -0
  123. sky/volumes/server/__init__.py +0 -0
  124. sky/volumes/server/core.py +199 -0
  125. sky/volumes/server/server.py +85 -0
  126. sky/volumes/utils.py +158 -0
  127. sky/volumes/volume.py +198 -0
  128. {skypilot_nightly-1.0.0.dev20250624.dist-info → skypilot_nightly-1.0.0.dev20250626.dist-info}/METADATA +2 -1
  129. {skypilot_nightly-1.0.0.dev20250624.dist-info → skypilot_nightly-1.0.0.dev20250626.dist-info}/RECORD +135 -115
  130. sky/dashboard/out/_next/static/chunks/211.692afc57e812ae1a.js +0 -1
  131. sky/dashboard/out/_next/static/chunks/350.9e123a4551f68b0d.js +0 -1
  132. sky/dashboard/out/_next/static/chunks/37-4650f214e2119168.js +0 -6
  133. sky/dashboard/out/_next/static/chunks/42.2273cc2415291ceb.js +0 -6
  134. sky/dashboard/out/_next/static/chunks/443.b2242d0efcdf5f47.js +0 -1
  135. sky/dashboard/out/_next/static/chunks/470-1494c899266cf5c9.js +0 -1
  136. sky/dashboard/out/_next/static/chunks/513.309df9e18a9ff005.js +0 -1
  137. sky/dashboard/out/_next/static/chunks/641.c8e452bc5070a630.js +0 -1
  138. sky/dashboard/out/_next/static/chunks/682.4dd5dc116f740b5f.js +0 -6
  139. sky/dashboard/out/_next/static/chunks/760-a89d354797ce7af5.js +0 -1
  140. sky/dashboard/out/_next/static/chunks/843-bde186946d353355.js +0 -11
  141. sky/dashboard/out/_next/static/chunks/856-bfddc18e16f3873c.js +0 -1
  142. sky/dashboard/out/_next/static/chunks/901-b424d293275e1fd7.js +0 -1
  143. sky/dashboard/out/_next/static/chunks/973-56412c7976b4655b.js +0 -1
  144. sky/dashboard/out/_next/static/chunks/984.ae8c08791d274ca0.js +0 -50
  145. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-4e065c812a52460b.js +0 -6
  146. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-520ec1ab65e2f2a4.js +0 -6
  147. sky/dashboard/out/_next/static/chunks/pages/config-e4f473661889e7cd.js +0 -1
  148. sky/dashboard/out/_next/static/chunks/pages/infra/[context]-00fd23b9577492ca.js +0 -1
  149. sky/dashboard/out/_next/static/chunks/pages/infra-8a4bf7370d4d9bb7.js +0 -1
  150. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-171c27f4ca94861c.js +0 -16
  151. sky/dashboard/out/_next/static/chunks/pages/jobs-55e5bcb16d563231.js +0 -1
  152. sky/dashboard/out/_next/static/chunks/pages/users-c9f4d785cdaa52d8.js +0 -1
  153. sky/dashboard/out/_next/static/chunks/pages/workspaces/[name]-ecc5a7003776cfa7.js +0 -1
  154. sky/dashboard/out/_next/static/chunks/pages/workspaces-f00cba35691483b1.js +0 -1
  155. sky/dashboard/out/_next/static/chunks/webpack-c85998e6a5722f21.js +0 -1
  156. sky/dashboard/out/_next/static/css/6ab927686b492a4a.css +0 -3
  157. sky/dashboard/out/_next/static/zsALxITkbP8J8NVwSDwMo/_buildManifest.js +0 -1
  158. /sky/dashboard/out/_next/static/{zsALxITkbP8J8NVwSDwMo → bs6UB9V4Jq10TIZ5x-kBK}/_ssgManifest.js +0 -0
  159. /sky/dashboard/out/_next/static/chunks/{938-ce7991c156584b06.js → 938-068520cc11738deb.js} +0 -0
  160. {skypilot_nightly-1.0.0.dev20250624.dist-info → skypilot_nightly-1.0.0.dev20250626.dist-info}/WHEEL +0 -0
  161. {skypilot_nightly-1.0.0.dev20250624.dist-info → skypilot_nightly-1.0.0.dev20250626.dist-info}/entry_points.txt +0 -0
  162. {skypilot_nightly-1.0.0.dev20250624.dist-info → skypilot_nightly-1.0.0.dev20250626.dist-info}/licenses/LICENSE +0 -0
  163. {skypilot_nightly-1.0.0.dev20250624.dist-info → skypilot_nightly-1.0.0.dev20250626.dist-info}/top_level.txt +0 -0
sky/resources.py CHANGED
@@ -30,6 +30,9 @@ from sky.utils import resources_utils
30
30
  from sky.utils import schemas
31
31
  from sky.utils import ux_utils
32
32
 
33
+ if typing.TYPE_CHECKING:
34
+ from sky.volumes import volume as volume_lib
35
+
33
36
  logger = sky_logging.init_logger(__name__)
34
37
 
35
38
  _DEFAULT_DISK_SIZE_GB = 256
@@ -289,7 +292,8 @@ class Resources:
289
292
  self._job_recovery = job_recovery
290
293
 
291
294
  if disk_size is not None:
292
- self._disk_size = int(parse_memory_resource(disk_size, 'disk_size'))
295
+ self._disk_size = int(
296
+ resources_utils.parse_memory_resource(disk_size, 'disk_size'))
293
297
  else:
294
298
  self._disk_size = _DEFAULT_DISK_SIZE_GB
295
299
 
@@ -707,11 +711,11 @@ class Resources:
707
711
  self._memory = None
708
712
  return
709
713
 
710
- memory = parse_memory_resource(str(memory),
711
- 'memory',
712
- ret_type=float,
713
- allow_plus=True,
714
- allow_x=True)
714
+ memory = resources_utils.parse_memory_resource(str(memory),
715
+ 'memory',
716
+ ret_type=float,
717
+ allow_plus=True,
718
+ allow_x=True)
715
719
  self._memory = memory
716
720
  if memory.endswith(('+', 'x')):
717
721
  # 'x' is used internally for make sure our resources used by
@@ -1465,11 +1469,15 @@ class Resources:
1465
1469
  def get_spot_str(self) -> str:
1466
1470
  return '[Spot]' if self.use_spot else ''
1467
1471
 
1468
- def make_deploy_variables(self, cluster_name: resources_utils.ClusterName,
1469
- region: clouds.Region,
1470
- zones: Optional[List[clouds.Zone]],
1471
- num_nodes: int,
1472
- dryrun: bool) -> Dict[str, Optional[str]]:
1472
+ def make_deploy_variables(
1473
+ self,
1474
+ cluster_name: resources_utils.ClusterName,
1475
+ region: clouds.Region,
1476
+ zones: Optional[List[clouds.Zone]],
1477
+ num_nodes: int,
1478
+ dryrun: bool,
1479
+ volume_mounts: Optional[List['volume_lib.VolumeMount']] = None,
1480
+ ) -> Dict[str, Optional[str]]:
1473
1481
  """Converts planned sky.Resources to resource variables.
1474
1482
 
1475
1483
  These variables are divided into two categories: cloud-specific and
@@ -1491,7 +1499,7 @@ class Resources:
1491
1499
  # Cloud specific variables
1492
1500
  assert self.cloud is not None, 'Cloud must be specified'
1493
1501
  cloud_specific_variables = self.cloud.make_deploy_resources_variables(
1494
- self, cluster_name, region, zones, num_nodes, dryrun)
1502
+ self, cluster_name, region, zones, num_nodes, dryrun, volume_mounts)
1495
1503
 
1496
1504
  # TODO(andyl): Should we print some warnings if users' envs share
1497
1505
  # same names with the cloud specific variables, but not enabled
@@ -2291,67 +2299,3 @@ def parse_time_minutes(time: str) -> int:
2291
2299
  continue
2292
2300
 
2293
2301
  raise ValueError(f'Invalid time format: {time}')
2294
-
2295
-
2296
- def parse_memory_resource(resource_qty_str: Union[str, int, float],
2297
- field_name: str,
2298
- ret_type: type = int,
2299
- unit: str = 'gb',
2300
- allow_plus: bool = False,
2301
- allow_x: bool = False,
2302
- allow_rounding: bool = False) -> str:
2303
- """Returns memory size in chosen units given a resource quantity string.
2304
-
2305
- Args:
2306
- resource_qty_str: Resource quantity string
2307
- unit: Unit to convert to
2308
- allow_plus: Whether to allow '+' prefix
2309
- allow_x: Whether to allow 'x' suffix
2310
- """
2311
- assert unit in constants.MEMORY_SIZE_UNITS, f'Invalid unit: {unit}'
2312
-
2313
- error_msg = f'"{field_name}" field should be a <int><b|k|m|g|t|p><+?>,'\
2314
- f' got {resource_qty_str}'
2315
-
2316
- resource_str = str(resource_qty_str)
2317
-
2318
- # Handle plus and x suffixes, x is only used internally for jobs controller
2319
- plus = ''
2320
- if resource_str.endswith('+'):
2321
- if allow_plus:
2322
- resource_str = resource_str[:-1]
2323
- plus = '+'
2324
- else:
2325
- raise ValueError(error_msg)
2326
-
2327
- x = ''
2328
- if resource_str.endswith('x'):
2329
- if allow_x:
2330
- resource_str = resource_str[:-1]
2331
- x = 'x'
2332
- else:
2333
- raise ValueError(error_msg)
2334
-
2335
- try:
2336
- # We assume it is already in the wanted units to maintain backwards
2337
- # compatibility
2338
- ret_type(resource_str)
2339
- return f'{resource_str}{plus}{x}'
2340
- except ValueError:
2341
- pass
2342
-
2343
- resource_str = resource_str.lower()
2344
- for mem_unit, multiplier in constants.MEMORY_SIZE_UNITS.items():
2345
- if resource_str.endswith(mem_unit):
2346
- try:
2347
- value = ret_type(resource_str[:-len(mem_unit)])
2348
- converted = (value * multiplier /
2349
- constants.MEMORY_SIZE_UNITS[unit])
2350
- if not allow_rounding and ret_type(converted) != converted:
2351
- raise ValueError(error_msg)
2352
- converted = ret_type(converted)
2353
- return f'{converted}{plus}{x}'
2354
- except ValueError:
2355
- continue
2356
-
2357
- raise ValueError(error_msg)
sky/serve/client/sdk.py CHANGED
@@ -5,9 +5,9 @@ from typing import List, Optional, Union
5
5
 
6
6
  import click
7
7
 
8
- from sky.adaptors import common as adaptors_common
9
8
  from sky.client import common as client_common
10
9
  from sky.server import common as server_common
10
+ from sky.server import rest
11
11
  from sky.server.requests import payloads
12
12
  from sky.usage import usage_lib
13
13
  from sky.utils import admin_policy_utils
@@ -17,12 +17,8 @@ from sky.utils import dag_utils
17
17
  if typing.TYPE_CHECKING:
18
18
  import io
19
19
 
20
- import requests
21
-
22
20
  import sky
23
21
  from sky.serve import serve_utils
24
- else:
25
- requests = adaptors_common.LazyImport('requests')
26
22
 
27
23
 
28
24
  @context.contextual
@@ -78,7 +74,7 @@ def up(
78
74
  task=dag_str,
79
75
  service_name=service_name,
80
76
  )
81
- response = requests.post(
77
+ response = rest.post(
82
78
  f'{server_common.get_server_url()}/serve/up',
83
79
  json=json.loads(body.model_dump_json()),
84
80
  timeout=(5, None),
@@ -140,7 +136,7 @@ def update(
140
136
  mode=mode,
141
137
  )
142
138
 
143
- response = requests.post(
139
+ response = rest.post(
144
140
  f'{server_common.get_server_url()}/serve/update',
145
141
  json=json.loads(body.model_dump_json()),
146
142
  timeout=(5, None),
@@ -182,7 +178,7 @@ def down(
182
178
  all=all,
183
179
  purge=purge,
184
180
  )
185
- response = requests.post(
181
+ response = rest.post(
186
182
  f'{server_common.get_server_url()}/serve/down',
187
183
  json=json.loads(body.model_dump_json()),
188
184
  timeout=(5, None),
@@ -217,7 +213,7 @@ def terminate_replica(service_name: str, replica_id: int,
217
213
  replica_id=replica_id,
218
214
  purge=purge,
219
215
  )
220
- response = requests.post(
216
+ response = rest.post(
221
217
  f'{server_common.get_server_url()}/serve/terminate-replica',
222
218
  json=json.loads(body.model_dump_json()),
223
219
  timeout=(5, None),
@@ -290,7 +286,7 @@ def status(
290
286
  exceptions.ClusterNotUpError: if the sky serve controller is not up.
291
287
  """
292
288
  body = payloads.ServeStatusBody(service_names=service_names,)
293
- response = requests.post(
289
+ response = rest.post(
294
290
  f'{server_common.get_server_url()}/serve/status',
295
291
  json=json.loads(body.model_dump_json()),
296
292
  timeout=(5, None),
@@ -301,6 +297,7 @@ def status(
301
297
 
302
298
  @usage_lib.entrypoint
303
299
  @server_common.check_server_healthy_or_start
300
+ @rest.retry_on_server_unavailable()
304
301
  def tail_logs(service_name: str,
305
302
  target: Union[str, 'serve_utils.ServiceComponent'],
306
303
  replica_id: Optional[int] = None,
@@ -376,7 +373,7 @@ def tail_logs(service_name: str,
376
373
  replica_id=replica_id,
377
374
  follow=follow,
378
375
  )
379
- response = requests.post(
376
+ response = rest.post(
380
377
  f'{server_common.get_server_url()}/serve/logs',
381
378
  json=json.loads(body.model_dump_json()),
382
379
  timeout=(5, None),
@@ -384,7 +381,10 @@ def tail_logs(service_name: str,
384
381
  cookies=server_common.get_api_cookie_jar(),
385
382
  )
386
383
  request_id = server_common.get_request_id(response)
387
- sdk.stream_response(request_id, response, output_stream)
384
+ return sdk.stream_response(request_id=request_id,
385
+ response=response,
386
+ output_stream=output_stream,
387
+ resumable=True)
388
388
 
389
389
 
390
390
  @usage_lib.entrypoint
@@ -436,7 +436,7 @@ def sync_down_logs(service_name: str,
436
436
  targets=targets,
437
437
  replica_ids=replica_ids,
438
438
  )
439
- response = requests.post(
439
+ response = rest.post(
440
440
  f'{server_common.get_server_url()}/serve/sync-down-logs',
441
441
  json=json.loads(body.model_dump_json()),
442
442
  timeout=(5, None),
sky/serve/server/core.py CHANGED
@@ -28,6 +28,7 @@ from sky.utils import command_runner
28
28
  from sky.utils import common
29
29
  from sky.utils import common_utils
30
30
  from sky.utils import controller_utils
31
+ from sky.utils import dag_utils
31
32
  from sky.utils import rich_utils
32
33
  from sky.utils import subprocess_utils
33
34
  from sky.utils import ux_utils
@@ -139,10 +140,13 @@ def up(
139
140
  f'{constants.CLUSTER_NAME_VALID_REGEX}')
140
141
 
141
142
  serve_utils.validate_service_task(task)
143
+ dag = dag_utils.convert_entrypoint_to_dag(task)
144
+ dag.resolve_and_validate_volumes()
142
145
  # Always apply the policy again here, even though it might have been applied
143
146
  # in the CLI. This is to ensure that we apply the policy to the final DAG
144
147
  # and get the mutated config.
145
- dag, mutated_user_config = admin_policy_utils.apply(task)
148
+ dag, mutated_user_config = admin_policy_utils.apply(dag)
149
+ dag.pre_mount_volumes()
146
150
  task = dag.tasks[0]
147
151
 
148
152
  with rich_utils.safe_status(
sky/server/common.py CHANGED
@@ -9,11 +9,13 @@ import json
9
9
  import os
10
10
  import pathlib
11
11
  import re
12
+ import shutil
12
13
  import subprocess
13
14
  import sys
15
+ import tempfile
14
16
  import time
15
17
  import typing
16
- from typing import Any, Dict, Literal, Optional, Tuple
18
+ from typing import Any, Dict, Literal, Optional, Tuple, Union
17
19
  from urllib import parse
18
20
  import uuid
19
21
 
@@ -27,6 +29,7 @@ from sky import skypilot_config
27
29
  from sky.adaptors import common as adaptors_common
28
30
  from sky.data import data_utils
29
31
  from sky.server import constants as server_constants
32
+ from sky.server import rest
30
33
  from sky.skylet import constants
31
34
  from sky.usage import usage_lib
32
35
  from sky.utils import annotations
@@ -240,9 +243,9 @@ def get_api_server_status(endpoint: Optional[str] = None) -> ApiServerInfo:
240
243
  server_url = endpoint if endpoint is not None else get_server_url()
241
244
  while time_out_try_count <= RETRY_COUNT_ON_TIMEOUT:
242
245
  try:
243
- response = requests.get(f'{server_url}/api/health',
244
- timeout=2.5,
245
- cookies=get_api_cookie_jar())
246
+ response = rest.get(f'{server_url}/api/health',
247
+ timeout=2.5,
248
+ cookies=get_api_cookie_jar())
246
249
  except requests.exceptions.Timeout:
247
250
  if time_out_try_count == RETRY_COUNT_ON_TIMEOUT:
248
251
  return ApiServerInfo(status=ApiServerStatus.UNHEALTHY)
@@ -327,6 +330,8 @@ def get_request_id(response: 'requests.Response') -> RequestId:
327
330
  def _start_api_server(deploy: bool = False,
328
331
  host: str = '127.0.0.1',
329
332
  foreground: bool = False,
333
+ metrics: bool = False,
334
+ metrics_port: Optional[int] = None,
330
335
  enable_basic_auth: bool = False):
331
336
  """Starts a SkyPilot API server locally."""
332
337
  server_url = get_server_url(host)
@@ -357,10 +362,13 @@ def _start_api_server(deploy: bool = False,
357
362
  args += ['--deploy']
358
363
  if host is not None:
359
364
  args += [f'--host={host}']
365
+ if metrics_port is not None:
366
+ args += [f'--metrics-port={metrics_port}']
360
367
 
361
368
  if foreground:
362
369
  # Replaces the current process with the API server
363
370
  os.environ[constants.ENV_VAR_IS_SKYPILOT_SERVER] = 'true'
371
+ _set_metrics_env_var(os.environ, metrics, deploy)
364
372
  if enable_basic_auth:
365
373
  os.environ[constants.ENV_VAR_ENABLE_BASIC_AUTH] = 'true'
366
374
  os.execvp(args[0], args)
@@ -368,6 +376,10 @@ def _start_api_server(deploy: bool = False,
368
376
  log_path = os.path.expanduser(constants.API_SERVER_LOGS)
369
377
  os.makedirs(os.path.dirname(log_path), exist_ok=True)
370
378
 
379
+ # For spawn mode, copy the environ to avoid polluting the SDK process.
380
+ server_env = os.environ.copy()
381
+ server_env[constants.ENV_VAR_IS_SKYPILOT_SERVER] = 'true'
382
+ _set_metrics_env_var(server_env, metrics, deploy)
371
383
  # Start the API server process in the background and don't wait for it.
372
384
  # If this is called from a CLI invocation, we need
373
385
  # start_new_session=True so that SIGINT on the CLI will not also kill
@@ -437,6 +449,26 @@ def _start_api_server(deploy: bool = False,
437
449
  f'SkyPilot API server started. {dashboard_msg}'))
438
450
 
439
451
 
452
+ def _set_metrics_env_var(env: Union[Dict[str, str], os._Environ], metrics: bool,
453
+ deploy: bool):
454
+ """Sets the metrics environment variables.
455
+
456
+ Args:
457
+ env: The environment variables to set.
458
+ metrics: Whether to enable metrics.
459
+ deploy: Whether the server is running in deploy mode, which means
460
+ multiple processes might be running.
461
+ """
462
+ if metrics:
463
+ env[constants.ENV_VAR_SERVER_METRICS_ENABLED] = 'true'
464
+ if deploy:
465
+ metrics_dir = os.path.join(tempfile.gettempdir(), 'metrics')
466
+ shutil.rmtree(metrics_dir, ignore_errors=True)
467
+ os.makedirs(metrics_dir, exist_ok=True)
468
+ # Refer to https://prometheus.github.io/client_python/multiprocess/
469
+ env['PROMETHEUS_MULTIPROC_DIR'] = metrics_dir
470
+
471
+
440
472
  def check_server_healthy(
441
473
  endpoint: Optional[str] = None
442
474
  ) -> Tuple[Literal[
@@ -571,6 +603,8 @@ def get_skypilot_version_on_disk() -> str:
571
603
  def check_server_healthy_or_start_fn(deploy: bool = False,
572
604
  host: str = '127.0.0.1',
573
605
  foreground: bool = False,
606
+ metrics: bool = False,
607
+ metrics_port: Optional[int] = None,
574
608
  enable_basic_auth: bool = False):
575
609
  api_server_status = None
576
610
  try:
@@ -592,7 +626,8 @@ def check_server_healthy_or_start_fn(deploy: bool = False,
592
626
  # have started the server while we were waiting for the lock.
593
627
  api_server_info = get_api_server_status(endpoint)
594
628
  if api_server_info.status == ApiServerStatus.UNHEALTHY:
595
- _start_api_server(deploy, host, foreground, enable_basic_auth)
629
+ _start_api_server(deploy, host, foreground, metrics,
630
+ metrics_port, enable_basic_auth)
596
631
 
597
632
 
598
633
  def check_server_healthy_or_start(func):
sky/server/constants.py CHANGED
@@ -7,7 +7,7 @@ from sky.skylet import constants
7
7
  # API server version, whenever there is a change in API server that requires a
8
8
  # restart of the local API server or error out when the client does not match
9
9
  # the server version.
10
- API_VERSION = '9'
10
+ API_VERSION = '10'
11
11
 
12
12
  # Prefix for API request names.
13
13
  REQUEST_NAME_PREFIX = 'sky.'
@@ -22,6 +22,10 @@ API_SERVER_REQUEST_DB_PATH = '~/.sky/api_server/requests.db'
22
22
  # background.
23
23
  CLUSTER_REFRESH_DAEMON_INTERVAL_SECONDS = 60
24
24
 
25
+ # The interval (seconds) for the volume status to be refreshed in the
26
+ # background.
27
+ VOLUME_REFRESH_DAEMON_INTERVAL_SECONDS = 60
28
+
25
29
  # Environment variable for a file path to the API cookie file.
26
30
  # Keep in sync with websocket_proxy.py
27
31
  API_COOKIE_FILE_ENV_VAR = f'{constants.SKYPILOT_ENV_VAR_PREFIX}API_COOKIE_FILE'
sky/server/metrics.py ADDED
@@ -0,0 +1,105 @@
1
+ """Instrumentation for the API server."""
2
+
3
+ import asyncio
4
+ import os
5
+ import time
6
+
7
+ import fastapi
8
+ from prometheus_client import generate_latest
9
+ from prometheus_client import multiprocess
10
+ import prometheus_client as prom
11
+ import starlette.middleware.base
12
+ import uvicorn
13
+
14
+ from sky import sky_logging
15
+
16
+ logger = sky_logging.init_logger(__name__)
17
+
18
+ # Total number of API server requests, grouped by path, method, and status.
19
+ sky_apiserver_requests_total = prom.Counter(
20
+ 'sky_apiserver_requests_total',
21
+ 'Total number of API server requests',
22
+ ['path', 'method', 'status'],
23
+ )
24
+
25
+ # Time spent processing API server requests, grouped by path, method, and
26
+ # status.
27
+ sky_apiserver_request_duration_seconds = prom.Histogram(
28
+ 'sky_apiserver_request_duration_seconds',
29
+ 'Time spent processing API server requests',
30
+ ['path', 'method', 'status'],
31
+ buckets=(0.01, 0.025, 0.05, 0.1, 0.25, 0.5, 1.0, 2.5, 5.0, 10.0,
32
+ float('inf')),
33
+ )
34
+
35
+ metrics_app = fastapi.FastAPI()
36
+
37
+
38
+ @metrics_app.get('/metrics')
39
+ async def metrics() -> fastapi.Response:
40
+ """Expose aggregated Prometheus metrics from all worker processes."""
41
+ if os.environ.get('PROMETHEUS_MULTIPROC_DIR'):
42
+ # In multiprocess mode, we need to collect metrics from all processes.
43
+ registry = prom.CollectorRegistry()
44
+ multiprocess.MultiProcessCollector(registry)
45
+ data = generate_latest(registry)
46
+ else:
47
+ data = generate_latest()
48
+ return fastapi.Response(content=data,
49
+ media_type=prom.CONTENT_TYPE_LATEST,
50
+ headers={'Cache-Control': 'no-cache'})
51
+
52
+
53
+ def run_metrics_server(host: str, port: int):
54
+ metrics_config = uvicorn.Config(
55
+ 'sky.server.metrics:metrics_app',
56
+ host=host,
57
+ port=port,
58
+ workers=1,
59
+ )
60
+ metrics_server_instance = uvicorn.Server(metrics_config)
61
+ asyncio.run(metrics_server_instance.serve())
62
+
63
+
64
+ def _get_status_code_group(status_code: int) -> str:
65
+ """Group status codes into classes (2xx, 5xx) to reduce cardinality."""
66
+ return f'{status_code // 100}xx'
67
+
68
+
69
+ def _is_streaming_api(path: str) -> bool:
70
+ """Check if the path is a streaming API."""
71
+ path = path.rstrip('/')
72
+ return path.endswith('/logs') or path.endswith('/api/stream')
73
+
74
+
75
+ class PrometheusMiddleware(starlette.middleware.base.BaseHTTPMiddleware):
76
+ """Middleware to collect Prometheus metrics for HTTP requests."""
77
+
78
+ async def dispatch(self, request: fastapi.Request, call_next):
79
+ path = request.url.path
80
+ logger.info(f'PROM Middleware Request: {request}, {request.url.path}')
81
+ streaming = _is_streaming_api(path)
82
+ if not streaming:
83
+ # Exclude streaming APIs, the duration is not meaningful.
84
+ # TODO(aylei): measure the duration of async execution instead.
85
+ start_time = time.time()
86
+ method = request.method
87
+ status_code_group = ''
88
+
89
+ try:
90
+ response = await call_next(request)
91
+ status_code_group = _get_status_code_group(response.status_code)
92
+ except Exception: # pylint: disable=broad-except
93
+ status_code_group = '5xx'
94
+ raise
95
+ finally:
96
+ sky_apiserver_requests_total.labels(path=path,
97
+ method=method,
98
+ status=status_code_group).inc()
99
+ if not streaming:
100
+ duration = time.time() - start_time
101
+ sky_apiserver_request_duration_seconds.labels(
102
+ path=path, method=method,
103
+ status=status_code_group).observe(duration)
104
+
105
+ return response
sky/server/requests/executor.py CHANGED
@@ -149,10 +149,25 @@ class RequestWorker:
149
149
  self.schedule_type = schedule_type
150
150
  self.garanteed_parallelism = config.garanteed_parallelism
151
151
  self.burstable_parallelism = config.burstable_parallelism
152
+ self._thread: Optional[threading.Thread] = None
153
+ self._cancel_event = threading.Event()
152
154
 
153
155
  def __str__(self) -> str:
154
156
  return f'Worker(schedule_type={self.schedule_type.value})'
155
157
 
158
+ def run_in_background(self) -> None:
159
+ # Thread dispatcher is sufficient for current scale, refer to
160
+ # tests/load_tests/test_queue_dispatcher.py for more details.
161
+ # Use daemon thread for automatic cleanup.
162
+ thread = threading.Thread(target=self.run, daemon=True)
163
+ thread.start()
164
+ self._thread = thread
165
+
166
+ def cancel(self) -> None:
167
+ if self._thread is not None:
168
+ self._cancel_event.set()
169
+ self._thread.join()
170
+
156
171
  def process_request(self, executor: process.BurstableExecutor,
157
172
  queue: RequestQueue) -> None:
158
173
  try:
@@ -219,7 +234,7 @@ class RequestWorker:
219
234
  burst_workers=self.burstable_parallelism,
220
235
  initializer=executor_initializer,
221
236
  initargs=(proc_group,))
222
- while True:
237
+ while not self._cancel_event.is_set():
223
238
  self.process_request(executor, queue)
224
239
  # TODO(aylei): better to distinct between KeyboardInterrupt and SIGTERM.
225
240
  except KeyboardInterrupt:
@@ -539,15 +554,21 @@ def schedule_request(request_id: str,
539
554
  enqueue()
540
555
 
541
556
 
542
- def start(config: server_config.ServerConfig) -> List[multiprocessing.Process]:
557
+ def start(
558
+ config: server_config.ServerConfig
559
+ ) -> Tuple[Optional[multiprocessing.Process], List[RequestWorker]]:
543
560
  """Start the request workers.
544
561
 
545
562
  Request workers run in background, schedule the requests and delegate the
546
563
  request execution to executor processes.
564
+
565
+ Returns:
566
+ A tuple of the queue server process and the list of request worker
567
+ threads.
547
568
  """
548
569
  global queue_backend
549
570
  queue_backend = config.queue_backend
550
- sub_procs = []
571
+ queue_server = None
551
572
  # Setup the queues.
552
573
  if queue_backend == server_config.QueueBackend.MULTIPROCESSING:
553
574
  logger.info('Creating shared request queues')
@@ -564,7 +585,6 @@ def start(config: server_config.ServerConfig) -> List[multiprocessing.Process]:
564
585
  queue_server = multiprocessing.Process(
565
586
  target=mp_queue.start_queue_manager, args=(queue_names, port))
566
587
  queue_server.start()
567
- sub_procs.append(queue_server)
568
588
  mp_queue.wait_for_queues_to_be_ready(queue_names,
569
589
  queue_server,
570
590
  port=port)
@@ -577,20 +597,16 @@ def start(config: server_config.ServerConfig) -> List[multiprocessing.Process]:
577
597
 
578
598
  logger.info('Request queues created')
579
599
 
580
- def run_worker_in_background(worker: RequestWorker):
581
- # Thread dispatcher is sufficient for current scale, refer to
582
- # tests/load_tests/test_queue_dispatcher.py for more details.
583
- # Use daemon thread for automatic cleanup.
584
- thread = threading.Thread(target=worker.run, daemon=True)
585
- thread.start()
586
-
600
+ workers = []
587
601
  # Start a worker for long requests.
588
602
  long_worker = RequestWorker(schedule_type=api_requests.ScheduleType.LONG,
589
603
  config=config.long_worker_config)
590
- run_worker_in_background(long_worker)
604
+ long_worker.run_in_background()
605
+ workers.append(long_worker)
591
606
 
592
607
  # Start a worker for short requests.
593
608
  short_worker = RequestWorker(schedule_type=api_requests.ScheduleType.SHORT,
594
609
  config=config.short_worker_config)
595
- run_worker_in_background(short_worker)
596
- return sub_procs
610
+ short_worker.run_in_background()
611
+ workers.append(short_worker)
612
+ return queue_server, workers
sky/server/requests/payloads.py CHANGED
@@ -368,6 +368,22 @@ class StorageBody(RequestBody):
368
368
  name: str
369
369
 
370
370
 
371
+ class VolumeApplyBody(RequestBody):
372
+ """The request body for the volume apply endpoint."""
373
+ name: str
374
+ volume_type: str
375
+ cloud: str
376
+ region: Optional[str] = None
377
+ zone: Optional[str] = None
378
+ size: Optional[str] = None
379
+ config: Optional[Dict[str, Any]] = None
380
+
381
+
382
+ class VolumeDeleteBody(RequestBody):
383
+ """The request body for the volume delete endpoint."""
384
+ names: List[str]
385
+
386
+
371
387
  class EndpointsBody(RequestBody):
372
388
  """The request body for the endpoint."""
373
389
  cluster: str