skypilot-nightly 1.0.0.dev20250624__py3-none-any.whl → 1.0.0.dev20250626__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sky/__init__.py +2 -2
- sky/adaptors/kubernetes.py +1 -6
- sky/backends/backend_utils.py +26 -11
- sky/backends/cloud_vm_ray_backend.py +16 -5
- sky/client/cli/command.py +232 -9
- sky/client/sdk.py +195 -91
- sky/clouds/aws.py +10 -7
- sky/clouds/azure.py +10 -7
- sky/clouds/cloud.py +2 -0
- sky/clouds/cudo.py +2 -0
- sky/clouds/do.py +10 -7
- sky/clouds/fluidstack.py +2 -0
- sky/clouds/gcp.py +10 -7
- sky/clouds/hyperbolic.py +10 -7
- sky/clouds/ibm.py +2 -0
- sky/clouds/kubernetes.py +26 -9
- sky/clouds/lambda_cloud.py +10 -7
- sky/clouds/nebius.py +10 -7
- sky/clouds/oci.py +10 -7
- sky/clouds/paperspace.py +10 -7
- sky/clouds/runpod.py +10 -7
- sky/clouds/scp.py +10 -7
- sky/clouds/ssh.py +36 -0
- sky/clouds/vast.py +10 -7
- sky/clouds/vsphere.py +2 -0
- sky/core.py +21 -0
- sky/dag.py +14 -0
- sky/dashboard/out/404.html +1 -1
- sky/dashboard/out/_next/static/bs6UB9V4Jq10TIZ5x-kBK/_buildManifest.js +1 -0
- sky/dashboard/out/_next/static/chunks/141-fa5a20cbf401b351.js +11 -0
- sky/dashboard/out/_next/static/chunks/230-d6e363362017ff3a.js +1 -0
- sky/dashboard/out/_next/static/chunks/25.76c246239df93d50.js +6 -0
- sky/dashboard/out/_next/static/chunks/43-36177d00f6956ab2.js +1 -0
- sky/dashboard/out/_next/static/chunks/430.ed51037d1a4a438b.js +1 -0
- sky/dashboard/out/_next/static/chunks/470-92dd1614396389be.js +1 -0
- sky/dashboard/out/_next/static/chunks/544.110e53813fb98e2e.js +1 -0
- sky/dashboard/out/_next/static/chunks/645.961f08e39b8ce447.js +1 -0
- sky/dashboard/out/_next/static/chunks/690.55f9eed3be903f56.js +16 -0
- sky/dashboard/out/_next/static/chunks/697.6460bf72e760addd.js +20 -0
- sky/dashboard/out/_next/static/chunks/785.dc2686c3c1235554.js +1 -0
- sky/dashboard/out/_next/static/chunks/871-3db673be3ee3750b.js +6 -0
- sky/dashboard/out/_next/static/chunks/875.52c962183328b3f2.js +25 -0
- sky/dashboard/out/_next/static/chunks/973-81b2d057178adb76.js +1 -0
- sky/dashboard/out/_next/static/chunks/982.1b61658204416b0f.js +1 -0
- sky/dashboard/out/_next/static/chunks/984.e8bac186a24e5178.js +1 -0
- sky/dashboard/out/_next/static/chunks/990-0ad5ea1699e03ee8.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/{_app-ce31493da9747ef4.js → _app-9a3ce3170d2edcec.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-aff040d7bc5d0086.js +6 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-8040f2483897ed0c.js +6 -0
- sky/dashboard/out/_next/static/chunks/pages/{clusters-7e9736af1c6345a6.js → clusters-f119a5630a1efd61.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/config-6b255eae088da6a3.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/infra/[context]-b302aea4d65766bf.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/infra-ee8cc4d449945d19.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-e4b23128db0774cd.js +16 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs-0a5695ff3075d94a.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/users-4978cbb093e141e7.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/volumes-476b670ef33d1ecd.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/workspace/{new-31aa8bdcb7592635.js → new-5b59bce9eb208d84.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/workspaces/[name]-cb7e720b739de53a.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/workspaces-50e230828730cfb3.js +1 -0
- sky/dashboard/out/_next/static/chunks/webpack-08fdb9e6070127fc.js +1 -0
- sky/dashboard/out/_next/static/css/52082cf558ec9705.css +3 -0
- sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
- sky/dashboard/out/clusters/[cluster].html +1 -1
- sky/dashboard/out/clusters.html +1 -1
- sky/dashboard/out/config.html +1 -1
- sky/dashboard/out/index.html +1 -1
- sky/dashboard/out/infra/[context].html +1 -1
- sky/dashboard/out/infra.html +1 -1
- sky/dashboard/out/jobs/[job].html +1 -1
- sky/dashboard/out/jobs.html +1 -1
- sky/dashboard/out/users.html +1 -1
- sky/dashboard/out/volumes.html +1 -0
- sky/dashboard/out/workspace/new.html +1 -1
- sky/dashboard/out/workspaces/[name].html +1 -1
- sky/dashboard/out/workspaces.html +1 -1
- sky/data/storage_utils.py +2 -4
- sky/exceptions.py +15 -0
- sky/execution.py +5 -0
- sky/global_user_state.py +129 -0
- sky/jobs/client/sdk.py +13 -11
- sky/jobs/server/core.py +4 -0
- sky/models.py +16 -0
- sky/provision/__init__.py +26 -0
- sky/provision/kubernetes/__init__.py +3 -0
- sky/provision/kubernetes/instance.py +38 -77
- sky/provision/kubernetes/utils.py +70 -4
- sky/provision/kubernetes/volume.py +147 -0
- sky/resources.py +20 -76
- sky/serve/client/sdk.py +13 -13
- sky/serve/server/core.py +5 -1
- sky/server/common.py +40 -5
- sky/server/constants.py +5 -1
- sky/server/metrics.py +105 -0
- sky/server/requests/executor.py +30 -14
- sky/server/requests/payloads.py +16 -0
- sky/server/requests/requests.py +35 -1
- sky/server/rest.py +153 -0
- sky/server/server.py +70 -43
- sky/server/state.py +20 -0
- sky/server/stream_utils.py +8 -3
- sky/server/uvicorn.py +153 -13
- sky/setup_files/dependencies.py +2 -0
- sky/skylet/constants.py +19 -3
- sky/skypilot_config.py +3 -0
- sky/ssh_node_pools/__init__.py +1 -0
- sky/ssh_node_pools/core.py +133 -0
- sky/ssh_node_pools/server.py +232 -0
- sky/task.py +141 -18
- sky/templates/kubernetes-ray.yml.j2 +30 -1
- sky/users/permission.py +2 -0
- sky/utils/context.py +3 -1
- sky/utils/kubernetes/deploy_remote_cluster.py +12 -185
- sky/utils/kubernetes/ssh_utils.py +221 -0
- sky/utils/resources_utils.py +66 -0
- sky/utils/rich_utils.py +6 -0
- sky/utils/schemas.py +146 -3
- sky/utils/status_lib.py +10 -0
- sky/utils/validator.py +11 -1
- sky/volumes/__init__.py +0 -0
- sky/volumes/client/__init__.py +0 -0
- sky/volumes/client/sdk.py +64 -0
- sky/volumes/server/__init__.py +0 -0
- sky/volumes/server/core.py +199 -0
- sky/volumes/server/server.py +85 -0
- sky/volumes/utils.py +158 -0
- sky/volumes/volume.py +198 -0
- {skypilot_nightly-1.0.0.dev20250624.dist-info → skypilot_nightly-1.0.0.dev20250626.dist-info}/METADATA +2 -1
- {skypilot_nightly-1.0.0.dev20250624.dist-info → skypilot_nightly-1.0.0.dev20250626.dist-info}/RECORD +135 -115
- sky/dashboard/out/_next/static/chunks/211.692afc57e812ae1a.js +0 -1
- sky/dashboard/out/_next/static/chunks/350.9e123a4551f68b0d.js +0 -1
- sky/dashboard/out/_next/static/chunks/37-4650f214e2119168.js +0 -6
- sky/dashboard/out/_next/static/chunks/42.2273cc2415291ceb.js +0 -6
- sky/dashboard/out/_next/static/chunks/443.b2242d0efcdf5f47.js +0 -1
- sky/dashboard/out/_next/static/chunks/470-1494c899266cf5c9.js +0 -1
- sky/dashboard/out/_next/static/chunks/513.309df9e18a9ff005.js +0 -1
- sky/dashboard/out/_next/static/chunks/641.c8e452bc5070a630.js +0 -1
- sky/dashboard/out/_next/static/chunks/682.4dd5dc116f740b5f.js +0 -6
- sky/dashboard/out/_next/static/chunks/760-a89d354797ce7af5.js +0 -1
- sky/dashboard/out/_next/static/chunks/843-bde186946d353355.js +0 -11
- sky/dashboard/out/_next/static/chunks/856-bfddc18e16f3873c.js +0 -1
- sky/dashboard/out/_next/static/chunks/901-b424d293275e1fd7.js +0 -1
- sky/dashboard/out/_next/static/chunks/973-56412c7976b4655b.js +0 -1
- sky/dashboard/out/_next/static/chunks/984.ae8c08791d274ca0.js +0 -50
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-4e065c812a52460b.js +0 -6
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-520ec1ab65e2f2a4.js +0 -6
- sky/dashboard/out/_next/static/chunks/pages/config-e4f473661889e7cd.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/infra/[context]-00fd23b9577492ca.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/infra-8a4bf7370d4d9bb7.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-171c27f4ca94861c.js +0 -16
- sky/dashboard/out/_next/static/chunks/pages/jobs-55e5bcb16d563231.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/users-c9f4d785cdaa52d8.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/workspaces/[name]-ecc5a7003776cfa7.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/workspaces-f00cba35691483b1.js +0 -1
- sky/dashboard/out/_next/static/chunks/webpack-c85998e6a5722f21.js +0 -1
- sky/dashboard/out/_next/static/css/6ab927686b492a4a.css +0 -3
- sky/dashboard/out/_next/static/zsALxITkbP8J8NVwSDwMo/_buildManifest.js +0 -1
- /sky/dashboard/out/_next/static/{zsALxITkbP8J8NVwSDwMo → bs6UB9V4Jq10TIZ5x-kBK}/_ssgManifest.js +0 -0
- /sky/dashboard/out/_next/static/chunks/{938-ce7991c156584b06.js → 938-068520cc11738deb.js} +0 -0
- {skypilot_nightly-1.0.0.dev20250624.dist-info → skypilot_nightly-1.0.0.dev20250626.dist-info}/WHEEL +0 -0
- {skypilot_nightly-1.0.0.dev20250624.dist-info → skypilot_nightly-1.0.0.dev20250626.dist-info}/entry_points.txt +0 -0
- {skypilot_nightly-1.0.0.dev20250624.dist-info → skypilot_nightly-1.0.0.dev20250626.dist-info}/licenses/LICENSE +0 -0
- {skypilot_nightly-1.0.0.dev20250624.dist-info → skypilot_nightly-1.0.0.dev20250626.dist-info}/top_level.txt +0 -0
sky/resources.py
CHANGED
@@ -30,6 +30,9 @@ from sky.utils import resources_utils
|
|
30
30
|
from sky.utils import schemas
|
31
31
|
from sky.utils import ux_utils
|
32
32
|
|
33
|
+
if typing.TYPE_CHECKING:
|
34
|
+
from sky.volumes import volume as volume_lib
|
35
|
+
|
33
36
|
logger = sky_logging.init_logger(__name__)
|
34
37
|
|
35
38
|
_DEFAULT_DISK_SIZE_GB = 256
|
@@ -289,7 +292,8 @@ class Resources:
|
|
289
292
|
self._job_recovery = job_recovery
|
290
293
|
|
291
294
|
if disk_size is not None:
|
292
|
-
self._disk_size = int(
|
295
|
+
self._disk_size = int(
|
296
|
+
resources_utils.parse_memory_resource(disk_size, 'disk_size'))
|
293
297
|
else:
|
294
298
|
self._disk_size = _DEFAULT_DISK_SIZE_GB
|
295
299
|
|
@@ -707,11 +711,11 @@ class Resources:
|
|
707
711
|
self._memory = None
|
708
712
|
return
|
709
713
|
|
710
|
-
memory = parse_memory_resource(str(memory),
|
711
|
-
|
712
|
-
|
713
|
-
|
714
|
-
|
714
|
+
memory = resources_utils.parse_memory_resource(str(memory),
|
715
|
+
'memory',
|
716
|
+
ret_type=float,
|
717
|
+
allow_plus=True,
|
718
|
+
allow_x=True)
|
715
719
|
self._memory = memory
|
716
720
|
if memory.endswith(('+', 'x')):
|
717
721
|
# 'x' is used internally for make sure our resources used by
|
@@ -1465,11 +1469,15 @@ class Resources:
|
|
1465
1469
|
def get_spot_str(self) -> str:
|
1466
1470
|
return '[Spot]' if self.use_spot else ''
|
1467
1471
|
|
1468
|
-
def make_deploy_variables(
|
1469
|
-
|
1470
|
-
|
1471
|
-
|
1472
|
-
|
1472
|
+
def make_deploy_variables(
|
1473
|
+
self,
|
1474
|
+
cluster_name: resources_utils.ClusterName,
|
1475
|
+
region: clouds.Region,
|
1476
|
+
zones: Optional[List[clouds.Zone]],
|
1477
|
+
num_nodes: int,
|
1478
|
+
dryrun: bool,
|
1479
|
+
volume_mounts: Optional[List['volume_lib.VolumeMount']] = None,
|
1480
|
+
) -> Dict[str, Optional[str]]:
|
1473
1481
|
"""Converts planned sky.Resources to resource variables.
|
1474
1482
|
|
1475
1483
|
These variables are divided into two categories: cloud-specific and
|
@@ -1491,7 +1499,7 @@ class Resources:
|
|
1491
1499
|
# Cloud specific variables
|
1492
1500
|
assert self.cloud is not None, 'Cloud must be specified'
|
1493
1501
|
cloud_specific_variables = self.cloud.make_deploy_resources_variables(
|
1494
|
-
self, cluster_name, region, zones, num_nodes, dryrun)
|
1502
|
+
self, cluster_name, region, zones, num_nodes, dryrun, volume_mounts)
|
1495
1503
|
|
1496
1504
|
# TODO(andyl): Should we print some warnings if users' envs share
|
1497
1505
|
# same names with the cloud specific variables, but not enabled
|
@@ -2291,67 +2299,3 @@ def parse_time_minutes(time: str) -> int:
|
|
2291
2299
|
continue
|
2292
2300
|
|
2293
2301
|
raise ValueError(f'Invalid time format: {time}')
|
2294
|
-
|
2295
|
-
|
2296
|
-
def parse_memory_resource(resource_qty_str: Union[str, int, float],
|
2297
|
-
field_name: str,
|
2298
|
-
ret_type: type = int,
|
2299
|
-
unit: str = 'gb',
|
2300
|
-
allow_plus: bool = False,
|
2301
|
-
allow_x: bool = False,
|
2302
|
-
allow_rounding: bool = False) -> str:
|
2303
|
-
"""Returns memory size in chosen units given a resource quantity string.
|
2304
|
-
|
2305
|
-
Args:
|
2306
|
-
resource_qty_str: Resource quantity string
|
2307
|
-
unit: Unit to convert to
|
2308
|
-
allow_plus: Whether to allow '+' prefix
|
2309
|
-
allow_x: Whether to allow 'x' suffix
|
2310
|
-
"""
|
2311
|
-
assert unit in constants.MEMORY_SIZE_UNITS, f'Invalid unit: {unit}'
|
2312
|
-
|
2313
|
-
error_msg = f'"{field_name}" field should be a <int><b|k|m|g|t|p><+?>,'\
|
2314
|
-
f' got {resource_qty_str}'
|
2315
|
-
|
2316
|
-
resource_str = str(resource_qty_str)
|
2317
|
-
|
2318
|
-
# Handle plus and x suffixes, x is only used internally for jobs controller
|
2319
|
-
plus = ''
|
2320
|
-
if resource_str.endswith('+'):
|
2321
|
-
if allow_plus:
|
2322
|
-
resource_str = resource_str[:-1]
|
2323
|
-
plus = '+'
|
2324
|
-
else:
|
2325
|
-
raise ValueError(error_msg)
|
2326
|
-
|
2327
|
-
x = ''
|
2328
|
-
if resource_str.endswith('x'):
|
2329
|
-
if allow_x:
|
2330
|
-
resource_str = resource_str[:-1]
|
2331
|
-
x = 'x'
|
2332
|
-
else:
|
2333
|
-
raise ValueError(error_msg)
|
2334
|
-
|
2335
|
-
try:
|
2336
|
-
# We assume it is already in the wanted units to maintain backwards
|
2337
|
-
# compatibility
|
2338
|
-
ret_type(resource_str)
|
2339
|
-
return f'{resource_str}{plus}{x}'
|
2340
|
-
except ValueError:
|
2341
|
-
pass
|
2342
|
-
|
2343
|
-
resource_str = resource_str.lower()
|
2344
|
-
for mem_unit, multiplier in constants.MEMORY_SIZE_UNITS.items():
|
2345
|
-
if resource_str.endswith(mem_unit):
|
2346
|
-
try:
|
2347
|
-
value = ret_type(resource_str[:-len(mem_unit)])
|
2348
|
-
converted = (value * multiplier /
|
2349
|
-
constants.MEMORY_SIZE_UNITS[unit])
|
2350
|
-
if not allow_rounding and ret_type(converted) != converted:
|
2351
|
-
raise ValueError(error_msg)
|
2352
|
-
converted = ret_type(converted)
|
2353
|
-
return f'{converted}{plus}{x}'
|
2354
|
-
except ValueError:
|
2355
|
-
continue
|
2356
|
-
|
2357
|
-
raise ValueError(error_msg)
|
sky/serve/client/sdk.py
CHANGED
@@ -5,9 +5,9 @@ from typing import List, Optional, Union
|
|
5
5
|
|
6
6
|
import click
|
7
7
|
|
8
|
-
from sky.adaptors import common as adaptors_common
|
9
8
|
from sky.client import common as client_common
|
10
9
|
from sky.server import common as server_common
|
10
|
+
from sky.server import rest
|
11
11
|
from sky.server.requests import payloads
|
12
12
|
from sky.usage import usage_lib
|
13
13
|
from sky.utils import admin_policy_utils
|
@@ -17,12 +17,8 @@ from sky.utils import dag_utils
|
|
17
17
|
if typing.TYPE_CHECKING:
|
18
18
|
import io
|
19
19
|
|
20
|
-
import requests
|
21
|
-
|
22
20
|
import sky
|
23
21
|
from sky.serve import serve_utils
|
24
|
-
else:
|
25
|
-
requests = adaptors_common.LazyImport('requests')
|
26
22
|
|
27
23
|
|
28
24
|
@context.contextual
|
@@ -78,7 +74,7 @@ def up(
|
|
78
74
|
task=dag_str,
|
79
75
|
service_name=service_name,
|
80
76
|
)
|
81
|
-
response =
|
77
|
+
response = rest.post(
|
82
78
|
f'{server_common.get_server_url()}/serve/up',
|
83
79
|
json=json.loads(body.model_dump_json()),
|
84
80
|
timeout=(5, None),
|
@@ -140,7 +136,7 @@ def update(
|
|
140
136
|
mode=mode,
|
141
137
|
)
|
142
138
|
|
143
|
-
response =
|
139
|
+
response = rest.post(
|
144
140
|
f'{server_common.get_server_url()}/serve/update',
|
145
141
|
json=json.loads(body.model_dump_json()),
|
146
142
|
timeout=(5, None),
|
@@ -182,7 +178,7 @@ def down(
|
|
182
178
|
all=all,
|
183
179
|
purge=purge,
|
184
180
|
)
|
185
|
-
response =
|
181
|
+
response = rest.post(
|
186
182
|
f'{server_common.get_server_url()}/serve/down',
|
187
183
|
json=json.loads(body.model_dump_json()),
|
188
184
|
timeout=(5, None),
|
@@ -217,7 +213,7 @@ def terminate_replica(service_name: str, replica_id: int,
|
|
217
213
|
replica_id=replica_id,
|
218
214
|
purge=purge,
|
219
215
|
)
|
220
|
-
response =
|
216
|
+
response = rest.post(
|
221
217
|
f'{server_common.get_server_url()}/serve/terminate-replica',
|
222
218
|
json=json.loads(body.model_dump_json()),
|
223
219
|
timeout=(5, None),
|
@@ -290,7 +286,7 @@ def status(
|
|
290
286
|
exceptions.ClusterNotUpError: if the sky serve controller is not up.
|
291
287
|
"""
|
292
288
|
body = payloads.ServeStatusBody(service_names=service_names,)
|
293
|
-
response =
|
289
|
+
response = rest.post(
|
294
290
|
f'{server_common.get_server_url()}/serve/status',
|
295
291
|
json=json.loads(body.model_dump_json()),
|
296
292
|
timeout=(5, None),
|
@@ -301,6 +297,7 @@ def status(
|
|
301
297
|
|
302
298
|
@usage_lib.entrypoint
|
303
299
|
@server_common.check_server_healthy_or_start
|
300
|
+
@rest.retry_on_server_unavailable()
|
304
301
|
def tail_logs(service_name: str,
|
305
302
|
target: Union[str, 'serve_utils.ServiceComponent'],
|
306
303
|
replica_id: Optional[int] = None,
|
@@ -376,7 +373,7 @@ def tail_logs(service_name: str,
|
|
376
373
|
replica_id=replica_id,
|
377
374
|
follow=follow,
|
378
375
|
)
|
379
|
-
response =
|
376
|
+
response = rest.post(
|
380
377
|
f'{server_common.get_server_url()}/serve/logs',
|
381
378
|
json=json.loads(body.model_dump_json()),
|
382
379
|
timeout=(5, None),
|
@@ -384,7 +381,10 @@ def tail_logs(service_name: str,
|
|
384
381
|
cookies=server_common.get_api_cookie_jar(),
|
385
382
|
)
|
386
383
|
request_id = server_common.get_request_id(response)
|
387
|
-
sdk.stream_response(request_id,
|
384
|
+
return sdk.stream_response(request_id=request_id,
|
385
|
+
response=response,
|
386
|
+
output_stream=output_stream,
|
387
|
+
resumable=True)
|
388
388
|
|
389
389
|
|
390
390
|
@usage_lib.entrypoint
|
@@ -436,7 +436,7 @@ def sync_down_logs(service_name: str,
|
|
436
436
|
targets=targets,
|
437
437
|
replica_ids=replica_ids,
|
438
438
|
)
|
439
|
-
response =
|
439
|
+
response = rest.post(
|
440
440
|
f'{server_common.get_server_url()}/serve/sync-down-logs',
|
441
441
|
json=json.loads(body.model_dump_json()),
|
442
442
|
timeout=(5, None),
|
sky/serve/server/core.py
CHANGED
@@ -28,6 +28,7 @@ from sky.utils import command_runner
|
|
28
28
|
from sky.utils import common
|
29
29
|
from sky.utils import common_utils
|
30
30
|
from sky.utils import controller_utils
|
31
|
+
from sky.utils import dag_utils
|
31
32
|
from sky.utils import rich_utils
|
32
33
|
from sky.utils import subprocess_utils
|
33
34
|
from sky.utils import ux_utils
|
@@ -139,10 +140,13 @@ def up(
|
|
139
140
|
f'{constants.CLUSTER_NAME_VALID_REGEX}')
|
140
141
|
|
141
142
|
serve_utils.validate_service_task(task)
|
143
|
+
dag = dag_utils.convert_entrypoint_to_dag(task)
|
144
|
+
dag.resolve_and_validate_volumes()
|
142
145
|
# Always apply the policy again here, even though it might have been applied
|
143
146
|
# in the CLI. This is to ensure that we apply the policy to the final DAG
|
144
147
|
# and get the mutated config.
|
145
|
-
dag, mutated_user_config = admin_policy_utils.apply(
|
148
|
+
dag, mutated_user_config = admin_policy_utils.apply(dag)
|
149
|
+
dag.pre_mount_volumes()
|
146
150
|
task = dag.tasks[0]
|
147
151
|
|
148
152
|
with rich_utils.safe_status(
|
sky/server/common.py
CHANGED
@@ -9,11 +9,13 @@ import json
|
|
9
9
|
import os
|
10
10
|
import pathlib
|
11
11
|
import re
|
12
|
+
import shutil
|
12
13
|
import subprocess
|
13
14
|
import sys
|
15
|
+
import tempfile
|
14
16
|
import time
|
15
17
|
import typing
|
16
|
-
from typing import Any, Dict, Literal, Optional, Tuple
|
18
|
+
from typing import Any, Dict, Literal, Optional, Tuple, Union
|
17
19
|
from urllib import parse
|
18
20
|
import uuid
|
19
21
|
|
@@ -27,6 +29,7 @@ from sky import skypilot_config
|
|
27
29
|
from sky.adaptors import common as adaptors_common
|
28
30
|
from sky.data import data_utils
|
29
31
|
from sky.server import constants as server_constants
|
32
|
+
from sky.server import rest
|
30
33
|
from sky.skylet import constants
|
31
34
|
from sky.usage import usage_lib
|
32
35
|
from sky.utils import annotations
|
@@ -240,9 +243,9 @@ def get_api_server_status(endpoint: Optional[str] = None) -> ApiServerInfo:
|
|
240
243
|
server_url = endpoint if endpoint is not None else get_server_url()
|
241
244
|
while time_out_try_count <= RETRY_COUNT_ON_TIMEOUT:
|
242
245
|
try:
|
243
|
-
response =
|
244
|
-
|
245
|
-
|
246
|
+
response = rest.get(f'{server_url}/api/health',
|
247
|
+
timeout=2.5,
|
248
|
+
cookies=get_api_cookie_jar())
|
246
249
|
except requests.exceptions.Timeout:
|
247
250
|
if time_out_try_count == RETRY_COUNT_ON_TIMEOUT:
|
248
251
|
return ApiServerInfo(status=ApiServerStatus.UNHEALTHY)
|
@@ -327,6 +330,8 @@ def get_request_id(response: 'requests.Response') -> RequestId:
|
|
327
330
|
def _start_api_server(deploy: bool = False,
|
328
331
|
host: str = '127.0.0.1',
|
329
332
|
foreground: bool = False,
|
333
|
+
metrics: bool = False,
|
334
|
+
metrics_port: Optional[int] = None,
|
330
335
|
enable_basic_auth: bool = False):
|
331
336
|
"""Starts a SkyPilot API server locally."""
|
332
337
|
server_url = get_server_url(host)
|
@@ -357,10 +362,13 @@ def _start_api_server(deploy: bool = False,
|
|
357
362
|
args += ['--deploy']
|
358
363
|
if host is not None:
|
359
364
|
args += [f'--host={host}']
|
365
|
+
if metrics_port is not None:
|
366
|
+
args += [f'--metrics-port={metrics_port}']
|
360
367
|
|
361
368
|
if foreground:
|
362
369
|
# Replaces the current process with the API server
|
363
370
|
os.environ[constants.ENV_VAR_IS_SKYPILOT_SERVER] = 'true'
|
371
|
+
_set_metrics_env_var(os.environ, metrics, deploy)
|
364
372
|
if enable_basic_auth:
|
365
373
|
os.environ[constants.ENV_VAR_ENABLE_BASIC_AUTH] = 'true'
|
366
374
|
os.execvp(args[0], args)
|
@@ -368,6 +376,10 @@ def _start_api_server(deploy: bool = False,
|
|
368
376
|
log_path = os.path.expanduser(constants.API_SERVER_LOGS)
|
369
377
|
os.makedirs(os.path.dirname(log_path), exist_ok=True)
|
370
378
|
|
379
|
+
# For spawn mode, copy the environ to avoid polluting the SDK process.
|
380
|
+
server_env = os.environ.copy()
|
381
|
+
server_env[constants.ENV_VAR_IS_SKYPILOT_SERVER] = 'true'
|
382
|
+
_set_metrics_env_var(server_env, metrics, deploy)
|
371
383
|
# Start the API server process in the background and don't wait for it.
|
372
384
|
# If this is called from a CLI invocation, we need
|
373
385
|
# start_new_session=True so that SIGINT on the CLI will not also kill
|
@@ -437,6 +449,26 @@ def _start_api_server(deploy: bool = False,
|
|
437
449
|
f'SkyPilot API server started. {dashboard_msg}'))
|
438
450
|
|
439
451
|
|
452
|
+
def _set_metrics_env_var(env: Union[Dict[str, str], os._Environ], metrics: bool,
|
453
|
+
deploy: bool):
|
454
|
+
"""Sets the metrics environment variables.
|
455
|
+
|
456
|
+
Args:
|
457
|
+
env: The environment variables to set.
|
458
|
+
metrics: Whether to enable metrics.
|
459
|
+
deploy: Whether the server is running in deploy mode, which means
|
460
|
+
multiple processes might be running.
|
461
|
+
"""
|
462
|
+
if metrics:
|
463
|
+
env[constants.ENV_VAR_SERVER_METRICS_ENABLED] = 'true'
|
464
|
+
if deploy:
|
465
|
+
metrics_dir = os.path.join(tempfile.gettempdir(), 'metrics')
|
466
|
+
shutil.rmtree(metrics_dir, ignore_errors=True)
|
467
|
+
os.makedirs(metrics_dir, exist_ok=True)
|
468
|
+
# Refer to https://prometheus.github.io/client_python/multiprocess/
|
469
|
+
env['PROMETHEUS_MULTIPROC_DIR'] = metrics_dir
|
470
|
+
|
471
|
+
|
440
472
|
def check_server_healthy(
|
441
473
|
endpoint: Optional[str] = None
|
442
474
|
) -> Tuple[Literal[
|
@@ -571,6 +603,8 @@ def get_skypilot_version_on_disk() -> str:
|
|
571
603
|
def check_server_healthy_or_start_fn(deploy: bool = False,
|
572
604
|
host: str = '127.0.0.1',
|
573
605
|
foreground: bool = False,
|
606
|
+
metrics: bool = False,
|
607
|
+
metrics_port: Optional[int] = None,
|
574
608
|
enable_basic_auth: bool = False):
|
575
609
|
api_server_status = None
|
576
610
|
try:
|
@@ -592,7 +626,8 @@ def check_server_healthy_or_start_fn(deploy: bool = False,
|
|
592
626
|
# have started the server while we were waiting for the lock.
|
593
627
|
api_server_info = get_api_server_status(endpoint)
|
594
628
|
if api_server_info.status == ApiServerStatus.UNHEALTHY:
|
595
|
-
_start_api_server(deploy, host, foreground,
|
629
|
+
_start_api_server(deploy, host, foreground, metrics,
|
630
|
+
metrics_port, enable_basic_auth)
|
596
631
|
|
597
632
|
|
598
633
|
def check_server_healthy_or_start(func):
|
sky/server/constants.py
CHANGED
@@ -7,7 +7,7 @@ from sky.skylet import constants
|
|
7
7
|
# API server version, whenever there is a change in API server that requires a
|
8
8
|
# restart of the local API server or error out when the client does not match
|
9
9
|
# the server version.
|
10
|
-
API_VERSION = '
|
10
|
+
API_VERSION = '10'
|
11
11
|
|
12
12
|
# Prefix for API request names.
|
13
13
|
REQUEST_NAME_PREFIX = 'sky.'
|
@@ -22,6 +22,10 @@ API_SERVER_REQUEST_DB_PATH = '~/.sky/api_server/requests.db'
|
|
22
22
|
# background.
|
23
23
|
CLUSTER_REFRESH_DAEMON_INTERVAL_SECONDS = 60
|
24
24
|
|
25
|
+
# The interval (seconds) for the volume status to be refreshed in the
|
26
|
+
# background.
|
27
|
+
VOLUME_REFRESH_DAEMON_INTERVAL_SECONDS = 60
|
28
|
+
|
25
29
|
# Environment variable for a file path to the API cookie file.
|
26
30
|
# Keep in sync with websocket_proxy.py
|
27
31
|
API_COOKIE_FILE_ENV_VAR = f'{constants.SKYPILOT_ENV_VAR_PREFIX}API_COOKIE_FILE'
|
sky/server/metrics.py
ADDED
@@ -0,0 +1,105 @@
|
|
1
|
+
"""Instrumentation for the API server."""
|
2
|
+
|
3
|
+
import asyncio
|
4
|
+
import os
|
5
|
+
import time
|
6
|
+
|
7
|
+
import fastapi
|
8
|
+
from prometheus_client import generate_latest
|
9
|
+
from prometheus_client import multiprocess
|
10
|
+
import prometheus_client as prom
|
11
|
+
import starlette.middleware.base
|
12
|
+
import uvicorn
|
13
|
+
|
14
|
+
from sky import sky_logging
|
15
|
+
|
16
|
+
logger = sky_logging.init_logger(__name__)
|
17
|
+
|
18
|
+
# Total number of API server requests, grouped by path, method, and status.
|
19
|
+
sky_apiserver_requests_total = prom.Counter(
|
20
|
+
'sky_apiserver_requests_total',
|
21
|
+
'Total number of API server requests',
|
22
|
+
['path', 'method', 'status'],
|
23
|
+
)
|
24
|
+
|
25
|
+
# Time spent processing API server requests, grouped by path, method, and
|
26
|
+
# status.
|
27
|
+
sky_apiserver_request_duration_seconds = prom.Histogram(
|
28
|
+
'sky_apiserver_request_duration_seconds',
|
29
|
+
'Time spent processing API server requests',
|
30
|
+
['path', 'method', 'status'],
|
31
|
+
buckets=(0.01, 0.025, 0.05, 0.1, 0.25, 0.5, 1.0, 2.5, 5.0, 10.0,
|
32
|
+
float('inf')),
|
33
|
+
)
|
34
|
+
|
35
|
+
metrics_app = fastapi.FastAPI()
|
36
|
+
|
37
|
+
|
38
|
+
@metrics_app.get('/metrics')
|
39
|
+
async def metrics() -> fastapi.Response:
|
40
|
+
"""Expose aggregated Prometheus metrics from all worker processes."""
|
41
|
+
if os.environ.get('PROMETHEUS_MULTIPROC_DIR'):
|
42
|
+
# In multiprocess mode, we need to collect metrics from all processes.
|
43
|
+
registry = prom.CollectorRegistry()
|
44
|
+
multiprocess.MultiProcessCollector(registry)
|
45
|
+
data = generate_latest(registry)
|
46
|
+
else:
|
47
|
+
data = generate_latest()
|
48
|
+
return fastapi.Response(content=data,
|
49
|
+
media_type=prom.CONTENT_TYPE_LATEST,
|
50
|
+
headers={'Cache-Control': 'no-cache'})
|
51
|
+
|
52
|
+
|
53
|
+
def run_metrics_server(host: str, port: int):
|
54
|
+
metrics_config = uvicorn.Config(
|
55
|
+
'sky.server.metrics:metrics_app',
|
56
|
+
host=host,
|
57
|
+
port=port,
|
58
|
+
workers=1,
|
59
|
+
)
|
60
|
+
metrics_server_instance = uvicorn.Server(metrics_config)
|
61
|
+
asyncio.run(metrics_server_instance.serve())
|
62
|
+
|
63
|
+
|
64
|
+
def _get_status_code_group(status_code: int) -> str:
|
65
|
+
"""Group status codes into classes (2xx, 5xx) to reduce cardinality."""
|
66
|
+
return f'{status_code // 100}xx'
|
67
|
+
|
68
|
+
|
69
|
+
def _is_streaming_api(path: str) -> bool:
|
70
|
+
"""Check if the path is a streaming API."""
|
71
|
+
path = path.rstrip('/')
|
72
|
+
return path.endswith('/logs') or path.endswith('/api/stream')
|
73
|
+
|
74
|
+
|
75
|
+
class PrometheusMiddleware(starlette.middleware.base.BaseHTTPMiddleware):
|
76
|
+
"""Middleware to collect Prometheus metrics for HTTP requests."""
|
77
|
+
|
78
|
+
async def dispatch(self, request: fastapi.Request, call_next):
|
79
|
+
path = request.url.path
|
80
|
+
logger.info(f'PROM Middleware Request: {request}, {request.url.path}')
|
81
|
+
streaming = _is_streaming_api(path)
|
82
|
+
if not streaming:
|
83
|
+
# Exclude streaming APIs, the duration is not meaningful.
|
84
|
+
# TODO(aylei): measure the duration of async execution instead.
|
85
|
+
start_time = time.time()
|
86
|
+
method = request.method
|
87
|
+
status_code_group = ''
|
88
|
+
|
89
|
+
try:
|
90
|
+
response = await call_next(request)
|
91
|
+
status_code_group = _get_status_code_group(response.status_code)
|
92
|
+
except Exception: # pylint: disable=broad-except
|
93
|
+
status_code_group = '5xx'
|
94
|
+
raise
|
95
|
+
finally:
|
96
|
+
sky_apiserver_requests_total.labels(path=path,
|
97
|
+
method=method,
|
98
|
+
status=status_code_group).inc()
|
99
|
+
if not streaming:
|
100
|
+
duration = time.time() - start_time
|
101
|
+
sky_apiserver_request_duration_seconds.labels(
|
102
|
+
path=path, method=method,
|
103
|
+
status=status_code_group).observe(duration)
|
104
|
+
|
105
|
+
return response
|
sky/server/requests/executor.py
CHANGED
@@ -149,10 +149,25 @@ class RequestWorker:
|
|
149
149
|
self.schedule_type = schedule_type
|
150
150
|
self.garanteed_parallelism = config.garanteed_parallelism
|
151
151
|
self.burstable_parallelism = config.burstable_parallelism
|
152
|
+
self._thread: Optional[threading.Thread] = None
|
153
|
+
self._cancel_event = threading.Event()
|
152
154
|
|
153
155
|
def __str__(self) -> str:
|
154
156
|
return f'Worker(schedule_type={self.schedule_type.value})'
|
155
157
|
|
158
|
+
def run_in_background(self) -> None:
|
159
|
+
# Thread dispatcher is sufficient for current scale, refer to
|
160
|
+
# tests/load_tests/test_queue_dispatcher.py for more details.
|
161
|
+
# Use daemon thread for automatic cleanup.
|
162
|
+
thread = threading.Thread(target=self.run, daemon=True)
|
163
|
+
thread.start()
|
164
|
+
self._thread = thread
|
165
|
+
|
166
|
+
def cancel(self) -> None:
|
167
|
+
if self._thread is not None:
|
168
|
+
self._cancel_event.set()
|
169
|
+
self._thread.join()
|
170
|
+
|
156
171
|
def process_request(self, executor: process.BurstableExecutor,
|
157
172
|
queue: RequestQueue) -> None:
|
158
173
|
try:
|
@@ -219,7 +234,7 @@ class RequestWorker:
|
|
219
234
|
burst_workers=self.burstable_parallelism,
|
220
235
|
initializer=executor_initializer,
|
221
236
|
initargs=(proc_group,))
|
222
|
-
while
|
237
|
+
while not self._cancel_event.is_set():
|
223
238
|
self.process_request(executor, queue)
|
224
239
|
# TODO(aylei): better to distinct between KeyboardInterrupt and SIGTERM.
|
225
240
|
except KeyboardInterrupt:
|
@@ -539,15 +554,21 @@ def schedule_request(request_id: str,
|
|
539
554
|
enqueue()
|
540
555
|
|
541
556
|
|
542
|
-
def start(
|
557
|
+
def start(
|
558
|
+
config: server_config.ServerConfig
|
559
|
+
) -> Tuple[Optional[multiprocessing.Process], List[RequestWorker]]:
|
543
560
|
"""Start the request workers.
|
544
561
|
|
545
562
|
Request workers run in background, schedule the requests and delegate the
|
546
563
|
request execution to executor processes.
|
564
|
+
|
565
|
+
Returns:
|
566
|
+
A tuple of the queue server process and the list of request worker
|
567
|
+
threads.
|
547
568
|
"""
|
548
569
|
global queue_backend
|
549
570
|
queue_backend = config.queue_backend
|
550
|
-
|
571
|
+
queue_server = None
|
551
572
|
# Setup the queues.
|
552
573
|
if queue_backend == server_config.QueueBackend.MULTIPROCESSING:
|
553
574
|
logger.info('Creating shared request queues')
|
@@ -564,7 +585,6 @@ def start(config: server_config.ServerConfig) -> List[multiprocessing.Process]:
|
|
564
585
|
queue_server = multiprocessing.Process(
|
565
586
|
target=mp_queue.start_queue_manager, args=(queue_names, port))
|
566
587
|
queue_server.start()
|
567
|
-
sub_procs.append(queue_server)
|
568
588
|
mp_queue.wait_for_queues_to_be_ready(queue_names,
|
569
589
|
queue_server,
|
570
590
|
port=port)
|
@@ -577,20 +597,16 @@ def start(config: server_config.ServerConfig) -> List[multiprocessing.Process]:
|
|
577
597
|
|
578
598
|
logger.info('Request queues created')
|
579
599
|
|
580
|
-
|
581
|
-
# Thread dispatcher is sufficient for current scale, refer to
|
582
|
-
# tests/load_tests/test_queue_dispatcher.py for more details.
|
583
|
-
# Use daemon thread for automatic cleanup.
|
584
|
-
thread = threading.Thread(target=worker.run, daemon=True)
|
585
|
-
thread.start()
|
586
|
-
|
600
|
+
workers = []
|
587
601
|
# Start a worker for long requests.
|
588
602
|
long_worker = RequestWorker(schedule_type=api_requests.ScheduleType.LONG,
|
589
603
|
config=config.long_worker_config)
|
590
|
-
|
604
|
+
long_worker.run_in_background()
|
605
|
+
workers.append(long_worker)
|
591
606
|
|
592
607
|
# Start a worker for short requests.
|
593
608
|
short_worker = RequestWorker(schedule_type=api_requests.ScheduleType.SHORT,
|
594
609
|
config=config.short_worker_config)
|
595
|
-
|
596
|
-
|
610
|
+
short_worker.run_in_background()
|
611
|
+
workers.append(short_worker)
|
612
|
+
return queue_server, workers
|
sky/server/requests/payloads.py
CHANGED
@@ -368,6 +368,22 @@ class StorageBody(RequestBody):
|
|
368
368
|
name: str
|
369
369
|
|
370
370
|
|
371
|
+
class VolumeApplyBody(RequestBody):
|
372
|
+
"""The request body for the volume apply endpoint."""
|
373
|
+
name: str
|
374
|
+
volume_type: str
|
375
|
+
cloud: str
|
376
|
+
region: Optional[str] = None
|
377
|
+
zone: Optional[str] = None
|
378
|
+
size: Optional[str] = None
|
379
|
+
config: Optional[Dict[str, Any]] = None
|
380
|
+
|
381
|
+
|
382
|
+
class VolumeDeleteBody(RequestBody):
|
383
|
+
"""The request body for the volume delete endpoint."""
|
384
|
+
names: List[str]
|
385
|
+
|
386
|
+
|
371
387
|
class EndpointsBody(RequestBody):
|
372
388
|
"""The request body for the endpoint."""
|
373
389
|
cluster: str
|