skypilot-nightly 1.0.0.dev20250902__py3-none-any.whl → 1.0.0.dev20250903__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of skypilot-nightly might be problematic. Click here for more details.
- sky/__init__.py +2 -2
- sky/adaptors/runpod.py +68 -0
- sky/backends/backend_utils.py +5 -3
- sky/client/cli/command.py +20 -5
- sky/clouds/kubernetes.py +1 -1
- sky/clouds/runpod.py +17 -0
- sky/dashboard/out/404.html +1 -1
- sky/dashboard/out/_next/static/chunks/1121-ec35954c8cbea535.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-b77360a343d48902.js +16 -0
- sky/dashboard/out/_next/static/chunks/{webpack-0eaa6f7e63f51311.js → webpack-60556df644cd5d71.js} +1 -1
- sky/dashboard/out/_next/static/{tio0QibqY2C0F2-rPy00p → yLz6EPhW_XXmnNs1I6dmS}/_buildManifest.js +1 -1
- sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
- sky/dashboard/out/clusters/[cluster].html +1 -1
- sky/dashboard/out/clusters.html +1 -1
- sky/dashboard/out/config.html +1 -1
- sky/dashboard/out/index.html +1 -1
- sky/dashboard/out/infra/[context].html +1 -1
- sky/dashboard/out/infra.html +1 -1
- sky/dashboard/out/jobs/[job].html +1 -1
- sky/dashboard/out/jobs/pools/[pool].html +1 -1
- sky/dashboard/out/jobs.html +1 -1
- sky/dashboard/out/users.html +1 -1
- sky/dashboard/out/volumes.html +1 -1
- sky/dashboard/out/workspace/new.html +1 -1
- sky/dashboard/out/workspaces/[name].html +1 -1
- sky/dashboard/out/workspaces.html +1 -1
- sky/global_user_state.py +5 -2
- sky/models.py +1 -0
- sky/provision/runpod/__init__.py +3 -0
- sky/provision/runpod/instance.py +17 -0
- sky/provision/runpod/utils.py +23 -5
- sky/provision/runpod/volume.py +158 -0
- sky/server/requests/payloads.py +7 -1
- sky/server/requests/preconditions.py +8 -7
- sky/server/requests/requests.py +123 -57
- sky/server/server.py +32 -25
- sky/server/stream_utils.py +14 -6
- sky/server/uvicorn.py +2 -1
- sky/templates/kubernetes-ray.yml.j2 +5 -5
- sky/templates/runpod-ray.yml.j2 +8 -0
- sky/utils/benchmark_utils.py +60 -0
- sky/utils/command_runner.py +4 -0
- sky/utils/db/migration_utils.py +20 -4
- sky/utils/resource_checker.py +6 -5
- sky/utils/schemas.py +1 -1
- sky/utils/volume.py +3 -0
- sky/volumes/client/sdk.py +28 -0
- sky/volumes/server/server.py +11 -1
- sky/volumes/utils.py +117 -68
- sky/volumes/volume.py +98 -39
- {skypilot_nightly-1.0.0.dev20250902.dist-info → skypilot_nightly-1.0.0.dev20250903.dist-info}/METADATA +34 -34
- {skypilot_nightly-1.0.0.dev20250902.dist-info → skypilot_nightly-1.0.0.dev20250903.dist-info}/RECORD +57 -55
- sky/dashboard/out/_next/static/chunks/1121-8afcf719ea87debc.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-06afb50d25f7c61f.js +0 -16
- sky/dashboard/out/_next/static/{tio0QibqY2C0F2-rPy00p → yLz6EPhW_XXmnNs1I6dmS}/_ssgManifest.js +0 -0
- {skypilot_nightly-1.0.0.dev20250902.dist-info → skypilot_nightly-1.0.0.dev20250903.dist-info}/WHEEL +0 -0
- {skypilot_nightly-1.0.0.dev20250902.dist-info → skypilot_nightly-1.0.0.dev20250903.dist-info}/entry_points.txt +0 -0
- {skypilot_nightly-1.0.0.dev20250902.dist-info → skypilot_nightly-1.0.0.dev20250903.dist-info}/licenses/LICENSE +0 -0
- {skypilot_nightly-1.0.0.dev20250902.dist-info → skypilot_nightly-1.0.0.dev20250903.dist-info}/top_level.txt +0 -0
|
@@ -1 +1 @@
|
|
|
1
|
-
<!DOCTYPE html><html><head><meta charSet="utf-8"/><meta name="viewport" content="width=device-width"/><meta name="next-head-count" content="2"/><link rel="preload" href="/dashboard/_next/static/css/4614e06482d7309e.css" as="style"/><link rel="stylesheet" href="/dashboard/_next/static/css/4614e06482d7309e.css" data-n-g=""/><noscript data-n-css=""></noscript><script defer="" nomodule="" src="/dashboard/_next/static/chunks/polyfills-78c92fac7aa8fdd8.js"></script><script src="/dashboard/_next/static/chunks/webpack-
|
|
1
|
+
<!DOCTYPE html><html><head><meta charSet="utf-8"/><meta name="viewport" content="width=device-width"/><meta name="next-head-count" content="2"/><link rel="preload" href="/dashboard/_next/static/css/4614e06482d7309e.css" as="style"/><link rel="stylesheet" href="/dashboard/_next/static/css/4614e06482d7309e.css" data-n-g=""/><noscript data-n-css=""></noscript><script defer="" nomodule="" src="/dashboard/_next/static/chunks/polyfills-78c92fac7aa8fdd8.js"></script><script src="/dashboard/_next/static/chunks/webpack-60556df644cd5d71.js" defer=""></script><script src="/dashboard/_next/static/chunks/framework-cf60a09ccd051a10.js" defer=""></script><script src="/dashboard/_next/static/chunks/main-f15ccb73239a3bf1.js" defer=""></script><script src="/dashboard/_next/static/chunks/pages/_app-ce361c6959bc2001.js" defer=""></script><script src="/dashboard/_next/static/chunks/616-3d59f75e2ccf9321.js" defer=""></script><script src="/dashboard/_next/static/chunks/6130-2be46d70a38f1e82.js" defer=""></script><script src="/dashboard/_next/static/chunks/5739-d67458fcb1386c92.js" defer=""></script><script src="/dashboard/_next/static/chunks/7411-b15471acd2cba716.js" defer=""></script><script src="/dashboard/_next/static/chunks/1272-1ef0bf0237faccdb.js" defer=""></script><script src="/dashboard/_next/static/chunks/7205-88191679e7988c57.js" defer=""></script><script src="/dashboard/_next/static/chunks/6989-01359c57e018caa4.js" defer=""></script><script src="/dashboard/_next/static/chunks/3850-ff4a9a69d978632b.js" defer=""></script><script src="/dashboard/_next/static/chunks/8969-4a6f1a928fb6d370.js" defer=""></script><script src="/dashboard/_next/static/chunks/6990-08b2a1cae076a943.js" defer=""></script><script src="/dashboard/_next/static/chunks/6135-4b4d5e824b7f9d3c.js" defer=""></script><script src="/dashboard/_next/static/chunks/1121-ec35954c8cbea535.js" defer=""></script><script src="/dashboard/_next/static/chunks/6601-06114c982db410b6.js" defer=""></script><script 
src="/dashboard/_next/static/chunks/3015-8089ed1e0b7e37fd.js" defer=""></script><script src="/dashboard/_next/static/chunks/1141-943efc7aff0f0c06.js" defer=""></script><script src="/dashboard/_next/static/chunks/pages/workspaces/%5Bname%5D-de06e613e20bc977.js" defer=""></script><script src="/dashboard/_next/static/yLz6EPhW_XXmnNs1I6dmS/_buildManifest.js" defer=""></script><script src="/dashboard/_next/static/yLz6EPhW_XXmnNs1I6dmS/_ssgManifest.js" defer=""></script></head><body><div id="__next"></div><script id="__NEXT_DATA__" type="application/json">{"props":{"pageProps":{}},"page":"/workspaces/[name]","query":{},"buildId":"yLz6EPhW_XXmnNs1I6dmS","assetPrefix":"/dashboard","nextExport":true,"autoExport":true,"isFallback":false,"scriptLoader":[]}</script></body></html>
|
|
@@ -1 +1 @@
|
|
|
1
|
-
<!DOCTYPE html><html><head><meta charSet="utf-8"/><meta name="viewport" content="width=device-width"/><meta name="next-head-count" content="2"/><link rel="preload" href="/dashboard/_next/static/css/4614e06482d7309e.css" as="style"/><link rel="stylesheet" href="/dashboard/_next/static/css/4614e06482d7309e.css" data-n-g=""/><noscript data-n-css=""></noscript><script defer="" nomodule="" src="/dashboard/_next/static/chunks/polyfills-78c92fac7aa8fdd8.js"></script><script src="/dashboard/_next/static/chunks/webpack-
|
|
1
|
+
<!DOCTYPE html><html><head><meta charSet="utf-8"/><meta name="viewport" content="width=device-width"/><meta name="next-head-count" content="2"/><link rel="preload" href="/dashboard/_next/static/css/4614e06482d7309e.css" as="style"/><link rel="stylesheet" href="/dashboard/_next/static/css/4614e06482d7309e.css" data-n-g=""/><noscript data-n-css=""></noscript><script defer="" nomodule="" src="/dashboard/_next/static/chunks/polyfills-78c92fac7aa8fdd8.js"></script><script src="/dashboard/_next/static/chunks/webpack-60556df644cd5d71.js" defer=""></script><script src="/dashboard/_next/static/chunks/framework-cf60a09ccd051a10.js" defer=""></script><script src="/dashboard/_next/static/chunks/main-f15ccb73239a3bf1.js" defer=""></script><script src="/dashboard/_next/static/chunks/pages/_app-ce361c6959bc2001.js" defer=""></script><script src="/dashboard/_next/static/chunks/pages/workspaces-be35b22e2046564c.js" defer=""></script><script src="/dashboard/_next/static/yLz6EPhW_XXmnNs1I6dmS/_buildManifest.js" defer=""></script><script src="/dashboard/_next/static/yLz6EPhW_XXmnNs1I6dmS/_ssgManifest.js" defer=""></script></head><body><div id="__next"></div><script id="__NEXT_DATA__" type="application/json">{"props":{"pageProps":{}},"page":"/workspaces","query":{},"buildId":"yLz6EPhW_XXmnNs1I6dmS","assetPrefix":"/dashboard","nextExport":true,"autoExport":true,"isFallback":false,"scriptLoader":[]}</script></body></html>
|
sky/global_user_state.py
CHANGED
|
@@ -299,7 +299,9 @@ def create_table(engine: sqlalchemy.engine.Engine):
|
|
|
299
299
|
# a session has already been created with _SQLALCHEMY_ENGINE = e1,
|
|
300
300
|
# and then another thread overwrites _SQLALCHEMY_ENGINE = e2
|
|
301
301
|
# which could result in e1 being garbage collected unexpectedly.
|
|
302
|
-
def initialize_and_get_db(
|
|
302
|
+
def initialize_and_get_db(
|
|
303
|
+
pg_pool_class: Optional[sqlalchemy.pool.Pool] = None
|
|
304
|
+
) -> sqlalchemy.engine.Engine:
|
|
303
305
|
global _SQLALCHEMY_ENGINE
|
|
304
306
|
|
|
305
307
|
if _SQLALCHEMY_ENGINE is not None:
|
|
@@ -308,7 +310,8 @@ def initialize_and_get_db() -> sqlalchemy.engine.Engine:
|
|
|
308
310
|
if _SQLALCHEMY_ENGINE is not None:
|
|
309
311
|
return _SQLALCHEMY_ENGINE
|
|
310
312
|
# get an engine to the db
|
|
311
|
-
engine = migration_utils.get_engine('state'
|
|
313
|
+
engine = migration_utils.get_engine('state',
|
|
314
|
+
pg_pool_class=pg_pool_class)
|
|
312
315
|
|
|
313
316
|
# run migrations if needed
|
|
314
317
|
create_table(engine)
|
sky/models.py
CHANGED
sky/provision/runpod/__init__.py
CHANGED
|
@@ -9,3 +9,6 @@ from sky.provision.runpod.instance import run_instances
|
|
|
9
9
|
from sky.provision.runpod.instance import stop_instances
|
|
10
10
|
from sky.provision.runpod.instance import terminate_instances
|
|
11
11
|
from sky.provision.runpod.instance import wait_instances
|
|
12
|
+
from sky.provision.runpod.volume import apply_volume
|
|
13
|
+
from sky.provision.runpod.volume import delete_volume
|
|
14
|
+
from sky.provision.runpod.volume import get_volume_usedby
|
sky/provision/runpod/instance.py
CHANGED
|
@@ -80,6 +80,21 @@ def run_instances(region: str, cluster_name_on_cloud: str,
|
|
|
80
80
|
created_instance_ids=[])
|
|
81
81
|
|
|
82
82
|
created_instance_ids = []
|
|
83
|
+
volume_mounts = config.node_config.get('VolumeMounts', [])
|
|
84
|
+
network_volume_id = None
|
|
85
|
+
volume_mount_path = None
|
|
86
|
+
if volume_mounts:
|
|
87
|
+
if len(volume_mounts) > 1:
|
|
88
|
+
logger.warning(
|
|
89
|
+
f'RunPod only supports one network volume mount, '
|
|
90
|
+
f'but {len(volume_mounts)} are specified. Only the first one '
|
|
91
|
+
f'will be used.')
|
|
92
|
+
volume_mount = volume_mounts[0]
|
|
93
|
+
network_volume_id = volume_mount.get('VolumeIdOnCloud')
|
|
94
|
+
volume_mount_path = volume_mount.get('MountPath')
|
|
95
|
+
if network_volume_id is None or volume_mount_path is None:
|
|
96
|
+
raise RuntimeError(
|
|
97
|
+
'Network volume ID and mount path must be specified.')
|
|
83
98
|
for _ in range(to_start_count):
|
|
84
99
|
node_type = 'head' if head_instance_id is None else 'worker'
|
|
85
100
|
try:
|
|
@@ -97,6 +112,8 @@ def run_instances(region: str, cluster_name_on_cloud: str,
|
|
|
97
112
|
bid_per_gpu=config.node_config['BidPerGPU'],
|
|
98
113
|
docker_login_config=config.provider_config.get(
|
|
99
114
|
'docker_login_config'),
|
|
115
|
+
network_volume_id=network_volume_id,
|
|
116
|
+
volume_mount_path=volume_mount_path,
|
|
100
117
|
)
|
|
101
118
|
except Exception as e: # pylint: disable=broad-except
|
|
102
119
|
logger.warning(f'run_instances error: {e}')
|
sky/provision/runpod/utils.py
CHANGED
|
@@ -263,11 +263,23 @@ def _create_template_for_docker_login(
|
|
|
263
263
|
return login_config.format_image(image_name), create_template_resp['id']
|
|
264
264
|
|
|
265
265
|
|
|
266
|
-
def launch(
|
|
267
|
-
|
|
268
|
-
|
|
269
|
-
|
|
270
|
-
|
|
266
|
+
def launch(
|
|
267
|
+
cluster_name: str,
|
|
268
|
+
node_type: str,
|
|
269
|
+
instance_type: str,
|
|
270
|
+
region: str,
|
|
271
|
+
zone: str,
|
|
272
|
+
disk_size: int,
|
|
273
|
+
image_name: str,
|
|
274
|
+
ports: Optional[List[int]],
|
|
275
|
+
public_key: str,
|
|
276
|
+
preemptible: Optional[bool],
|
|
277
|
+
bid_per_gpu: float,
|
|
278
|
+
docker_login_config: Optional[Dict[str, str]],
|
|
279
|
+
*,
|
|
280
|
+
network_volume_id: Optional[str] = None,
|
|
281
|
+
volume_mount_path: Optional[str] = None,
|
|
282
|
+
) -> str:
|
|
271
283
|
"""Launches an instance with the given parameters.
|
|
272
284
|
|
|
273
285
|
For CPU instances, we directly use the instance_type for launching the
|
|
@@ -337,6 +349,12 @@ def launch(cluster_name: str, node_type: str, instance_type: str, region: str,
|
|
|
337
349
|
'template_id': template_id,
|
|
338
350
|
}
|
|
339
351
|
|
|
352
|
+
# Optional network volume mount.
|
|
353
|
+
if volume_mount_path is not None:
|
|
354
|
+
params['volume_mount_path'] = volume_mount_path
|
|
355
|
+
if network_volume_id is not None:
|
|
356
|
+
params['network_volume_id'] = network_volume_id
|
|
357
|
+
|
|
340
358
|
# GPU instance types start with f'{gpu_count}x',
|
|
341
359
|
# CPU instance types start with 'cpu'.
|
|
342
360
|
is_cpu_instance = instance_type.startswith('cpu')
|
|
@@ -0,0 +1,158 @@
|
|
|
1
|
+
"""RunPod network volume provisioning."""
|
|
2
|
+
from typing import Any, Dict, List, Optional, Tuple
|
|
3
|
+
|
|
4
|
+
from sky import global_user_state
|
|
5
|
+
from sky import models
|
|
6
|
+
from sky import sky_logging
|
|
7
|
+
from sky.adaptors import runpod
|
|
8
|
+
from sky.utils import common_utils
|
|
9
|
+
from sky.utils import volume as volume_lib
|
|
10
|
+
|
|
11
|
+
logger = sky_logging.init_logger(__name__)
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
def _list_volumes() -> List[Dict[str, Any]]:
|
|
15
|
+
# GET /v1/networkvolumes returns a list
|
|
16
|
+
result = runpod.rest_request('GET', '/networkvolumes')
|
|
17
|
+
if isinstance(result, list):
|
|
18
|
+
return result
|
|
19
|
+
# Some deployments may wrap the list.
|
|
20
|
+
if isinstance(result, dict):
|
|
21
|
+
for key in ('items', 'data', 'networkVolumes'):
|
|
22
|
+
if key in result and isinstance(result[key], list):
|
|
23
|
+
return result[key]
|
|
24
|
+
return []
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
def apply_volume(config: models.VolumeConfig) -> models.VolumeConfig:
|
|
28
|
+
"""Create or resolve a RunPod network volume via REST API.
|
|
29
|
+
|
|
30
|
+
If a volume with the same `name_on_cloud` exists, reuse it. Otherwise,
|
|
31
|
+
create a new one using POST /v1/networkvolumes.
|
|
32
|
+
"""
|
|
33
|
+
name_on_cloud = config.name_on_cloud
|
|
34
|
+
assert name_on_cloud is not None
|
|
35
|
+
|
|
36
|
+
vol_id = _try_resolve_volume_id(name_on_cloud)
|
|
37
|
+
if vol_id is None:
|
|
38
|
+
# Create new volume via REST
|
|
39
|
+
size = config.size
|
|
40
|
+
if size is None:
|
|
41
|
+
raise RuntimeError(
|
|
42
|
+
'RunPod network volume size must be specified to create '
|
|
43
|
+
'a volume.')
|
|
44
|
+
try:
|
|
45
|
+
size_int = int(size)
|
|
46
|
+
if size_int < volume_lib.MIN_RUNPOD_NETWORK_VOLUME_SIZE_GB:
|
|
47
|
+
raise RuntimeError(
|
|
48
|
+
f'RunPod network volume size must be at least '
|
|
49
|
+
f'{volume_lib.MIN_RUNPOD_NETWORK_VOLUME_SIZE_GB}GB.')
|
|
50
|
+
except Exception as e: # pylint: disable=broad-except
|
|
51
|
+
raise RuntimeError(f'Invalid volume size {size!r}: {e}') from e
|
|
52
|
+
data_center_id = config.zone
|
|
53
|
+
if not data_center_id:
|
|
54
|
+
raise RuntimeError(
|
|
55
|
+
'RunPod DataCenterId is required to create a network '
|
|
56
|
+
'volume. Set the zone in the infra field.')
|
|
57
|
+
payload = {
|
|
58
|
+
'dataCenterId': data_center_id,
|
|
59
|
+
'name': name_on_cloud,
|
|
60
|
+
'size': size_int,
|
|
61
|
+
}
|
|
62
|
+
resp = runpod.rest_request('POST', '/networkvolumes', json=payload)
|
|
63
|
+
if isinstance(resp, dict):
|
|
64
|
+
config.id_on_cloud = resp.get('id')
|
|
65
|
+
else:
|
|
66
|
+
raise RuntimeError(
|
|
67
|
+
f'Failed to create RunPod network volume: {resp}')
|
|
68
|
+
logger.info(f'Created RunPod network volume {name_on_cloud} '
|
|
69
|
+
f'(id={config.id_on_cloud})')
|
|
70
|
+
return config
|
|
71
|
+
|
|
72
|
+
# Use existing matched volume
|
|
73
|
+
config.id_on_cloud = vol_id
|
|
74
|
+
logger.debug(f'Using existing RunPod network volume {name_on_cloud} '
|
|
75
|
+
f'(id={config.id_on_cloud})')
|
|
76
|
+
return config
|
|
77
|
+
|
|
78
|
+
|
|
79
|
+
def delete_volume(config: models.VolumeConfig) -> models.VolumeConfig:
|
|
80
|
+
"""Deletes a RunPod network volume via REST API if id is known or
|
|
81
|
+
resolvable. If the volume id is not known, try to resolve it by name.
|
|
82
|
+
"""
|
|
83
|
+
name_on_cloud = config.name_on_cloud
|
|
84
|
+
vol_id = config.id_on_cloud
|
|
85
|
+
if not vol_id:
|
|
86
|
+
vol_id = _try_resolve_volume_id(name_on_cloud)
|
|
87
|
+
if not vol_id:
|
|
88
|
+
logger.warning(
|
|
89
|
+
f'RunPod network volume id not found for {name_on_cloud}; '
|
|
90
|
+
f'skip delete')
|
|
91
|
+
return config
|
|
92
|
+
runpod.rest_request('DELETE', f'/networkvolumes/{vol_id}')
|
|
93
|
+
logger.info(f'Deleted RunPod network volume {name_on_cloud} '
|
|
94
|
+
f'(id={vol_id})')
|
|
95
|
+
return config
|
|
96
|
+
|
|
97
|
+
|
|
98
|
+
def _try_resolve_volume_id(name_on_cloud: str) -> Optional[str]:
|
|
99
|
+
vols = _list_volumes()
|
|
100
|
+
matched = next((v for v in vols if v.get('name') == name_on_cloud), None)
|
|
101
|
+
if matched is not None:
|
|
102
|
+
return matched.get('id')
|
|
103
|
+
return None
|
|
104
|
+
|
|
105
|
+
|
|
106
|
+
def get_volume_usedby(
|
|
107
|
+
config: models.VolumeConfig,) -> Tuple[List[str], List[str]]:
|
|
108
|
+
"""Gets the clusters currently using this RunPod network volume.
|
|
109
|
+
|
|
110
|
+
Returns:
|
|
111
|
+
(usedby_pods, usedby_clusters)
|
|
112
|
+
usedby_clusters contains SkyPilot cluster display names inferred from
|
|
113
|
+
pod names, which may be wrong.
|
|
114
|
+
"""
|
|
115
|
+
vol_id = config.id_on_cloud
|
|
116
|
+
name_on_cloud = config.name_on_cloud
|
|
117
|
+
if vol_id is None:
|
|
118
|
+
vol_id = _try_resolve_volume_id(name_on_cloud)
|
|
119
|
+
if vol_id is None:
|
|
120
|
+
return [], []
|
|
121
|
+
|
|
122
|
+
# Query all pods for current user and filter by networkVolumeId
|
|
123
|
+
query = """
|
|
124
|
+
query Pods {
|
|
125
|
+
myself {
|
|
126
|
+
pods {
|
|
127
|
+
id
|
|
128
|
+
name
|
|
129
|
+
networkVolumeId
|
|
130
|
+
}
|
|
131
|
+
}
|
|
132
|
+
}
|
|
133
|
+
"""
|
|
134
|
+
resp = runpod.runpod.api.graphql.run_graphql_query(query)
|
|
135
|
+
pods = resp.get('data', {}).get('myself', {}).get('pods', [])
|
|
136
|
+
used_pods = [p for p in pods if p.get('networkVolumeId') == vol_id]
|
|
137
|
+
usedby_pod_names = [p.get('name') for p in used_pods if p.get('name')]
|
|
138
|
+
|
|
139
|
+
# Map pod names back to SkyPilot cluster names using heuristics.
|
|
140
|
+
clusters = global_user_state.get_clusters()
|
|
141
|
+
cluster_names: List[str] = []
|
|
142
|
+
user_hash = common_utils.get_user_hash()
|
|
143
|
+
for pod_name in usedby_pod_names:
|
|
144
|
+
matched = None
|
|
145
|
+
for c in clusters:
|
|
146
|
+
display = c.get('name')
|
|
147
|
+
if not display:
|
|
148
|
+
continue
|
|
149
|
+
# Heuristic: RunPod pod name is f"{cluster}-{user_hash}-{xxx}"
|
|
150
|
+
# This can be wrong.
|
|
151
|
+
cluster_prefix = display + '-' + user_hash + '-'
|
|
152
|
+
if pod_name.startswith(cluster_prefix):
|
|
153
|
+
matched = display
|
|
154
|
+
break
|
|
155
|
+
if matched and matched not in cluster_names:
|
|
156
|
+
cluster_names.append(matched)
|
|
157
|
+
|
|
158
|
+
return usedby_pod_names, cluster_names
|
sky/server/requests/payloads.py
CHANGED
|
@@ -309,7 +309,8 @@ class StatusBody(RequestBody):
|
|
|
309
309
|
cluster_names: Optional[List[str]] = None
|
|
310
310
|
refresh: common_lib.StatusRefreshMode = common_lib.StatusRefreshMode.NONE
|
|
311
311
|
all_users: bool = True
|
|
312
|
-
|
|
312
|
+
# TODO (kyuds): default to False post 0.10.5
|
|
313
|
+
include_credentials: bool = True
|
|
313
314
|
|
|
314
315
|
|
|
315
316
|
class StartBody(RequestBody):
|
|
@@ -464,6 +465,11 @@ class VolumeDeleteBody(RequestBody):
|
|
|
464
465
|
names: List[str]
|
|
465
466
|
|
|
466
467
|
|
|
468
|
+
class VolumeListBody(RequestBody):
|
|
469
|
+
"""The request body for the volume list endpoint."""
|
|
470
|
+
pass
|
|
471
|
+
|
|
472
|
+
|
|
467
473
|
class EndpointsBody(RequestBody):
|
|
468
474
|
"""The request body for the endpoint."""
|
|
469
475
|
cluster: str
|
|
@@ -162,13 +162,14 @@ class ClusterStartCompletePrecondition(Precondition):
|
|
|
162
162
|
# We unify these situations into a single state: the process of starting
|
|
163
163
|
# the cluster is done (either normally or abnormally) but cluster is not
|
|
164
164
|
# in UP status.
|
|
165
|
-
requests = api_requests.
|
|
166
|
-
|
|
167
|
-
|
|
168
|
-
|
|
169
|
-
|
|
170
|
-
|
|
171
|
-
|
|
165
|
+
requests = await api_requests.get_request_tasks_async(
|
|
166
|
+
req_filter=api_requests.RequestTaskFilter(
|
|
167
|
+
status=[
|
|
168
|
+
api_requests.RequestStatus.RUNNING,
|
|
169
|
+
api_requests.RequestStatus.PENDING
|
|
170
|
+
],
|
|
171
|
+
include_request_names=['sky.launch', 'sky.start'],
|
|
172
|
+
cluster_names=[self.cluster_name]))
|
|
172
173
|
if len(requests) == 0:
|
|
173
174
|
# No running or pending tasks, the start process is done.
|
|
174
175
|
return True, None
|
sky/server/requests/requests.py
CHANGED
|
@@ -14,7 +14,7 @@ import threading
|
|
|
14
14
|
import time
|
|
15
15
|
import traceback
|
|
16
16
|
from typing import (Any, AsyncContextManager, Callable, Dict, Generator, List,
|
|
17
|
-
Optional, Tuple)
|
|
17
|
+
NamedTuple, Optional, Tuple)
|
|
18
18
|
|
|
19
19
|
import colorama
|
|
20
20
|
import filelock
|
|
@@ -300,10 +300,11 @@ def kill_cluster_requests(cluster_name: str, exclude_request_name: str):
|
|
|
300
300
|
prevent killing the caller request.
|
|
301
301
|
"""
|
|
302
302
|
request_ids = [
|
|
303
|
-
request_task.request_id
|
|
303
|
+
request_task.request_id
|
|
304
|
+
for request_task in get_request_tasks(req_filter=RequestTaskFilter(
|
|
304
305
|
cluster_names=[cluster_name],
|
|
305
306
|
status=[RequestStatus.PENDING, RequestStatus.RUNNING],
|
|
306
|
-
exclude_request_names=[exclude_request_name])
|
|
307
|
+
exclude_request_names=[exclude_request_name]))
|
|
307
308
|
]
|
|
308
309
|
kill_requests(request_ids)
|
|
309
310
|
|
|
@@ -323,11 +324,12 @@ def kill_requests(request_ids: Optional[List[str]] = None,
|
|
|
323
324
|
"""
|
|
324
325
|
if request_ids is None:
|
|
325
326
|
request_ids = [
|
|
326
|
-
request_task.request_id
|
|
327
|
+
request_task.request_id
|
|
328
|
+
for request_task in get_request_tasks(req_filter=RequestTaskFilter(
|
|
327
329
|
user_id=user_id,
|
|
328
330
|
status=[RequestStatus.RUNNING, RequestStatus.PENDING],
|
|
329
331
|
# Avoid cancelling the cancel request itself.
|
|
330
|
-
exclude_request_names=['sky.api_cancel'])
|
|
332
|
+
exclude_request_names=['sky.api_cancel']))
|
|
331
333
|
]
|
|
332
334
|
cancelled_request_ids = []
|
|
333
335
|
for request_id in request_ids:
|
|
@@ -548,6 +550,40 @@ async def get_request_async(request_id: str) -> Optional[Request]:
|
|
|
548
550
|
return await _get_request_no_lock_async(request_id)
|
|
549
551
|
|
|
550
552
|
|
|
553
|
+
class StatusWithMsg(NamedTuple):
|
|
554
|
+
status: RequestStatus
|
|
555
|
+
status_msg: Optional[str] = None
|
|
556
|
+
|
|
557
|
+
|
|
558
|
+
@init_db_async
|
|
559
|
+
@metrics_lib.time_me_async
|
|
560
|
+
async def get_request_status_async(
|
|
561
|
+
request_id: str,
|
|
562
|
+
include_msg: bool = False,
|
|
563
|
+
) -> Optional[StatusWithMsg]:
|
|
564
|
+
"""Get the status of a request.
|
|
565
|
+
|
|
566
|
+
Args:
|
|
567
|
+
request_id: The ID of the request.
|
|
568
|
+
include_msg: Whether to include the status message.
|
|
569
|
+
|
|
570
|
+
Returns:
|
|
571
|
+
The status of the request. If the request is not found, returns
|
|
572
|
+
None.
|
|
573
|
+
"""
|
|
574
|
+
assert _DB is not None
|
|
575
|
+
columns = 'status'
|
|
576
|
+
if include_msg:
|
|
577
|
+
columns += ', status_msg'
|
|
578
|
+
sql = f'SELECT {columns} FROM {REQUEST_TABLE} WHERE request_id LIKE ?'
|
|
579
|
+
async with _DB.execute_fetchall_async(sql, (request_id + '%',)) as rows:
|
|
580
|
+
if rows is None or len(rows) == 0:
|
|
581
|
+
return None
|
|
582
|
+
status = RequestStatus(rows[0][0])
|
|
583
|
+
status_msg = rows[0][1] if include_msg else None
|
|
584
|
+
return StatusWithMsg(status, status_msg)
|
|
585
|
+
|
|
586
|
+
|
|
551
587
|
@init_db
|
|
552
588
|
@metrics_lib.time_me
|
|
553
589
|
def create_if_not_exists(request: Request) -> bool:
|
|
@@ -570,17 +606,9 @@ async def create_if_not_exists_async(request: Request) -> bool:
|
|
|
570
606
|
return True
|
|
571
607
|
|
|
572
608
|
|
|
573
|
-
@
|
|
574
|
-
|
|
575
|
-
|
|
576
|
-
status: Optional[List[RequestStatus]] = None,
|
|
577
|
-
cluster_names: Optional[List[str]] = None,
|
|
578
|
-
user_id: Optional[str] = None,
|
|
579
|
-
exclude_request_names: Optional[List[str]] = None,
|
|
580
|
-
include_request_names: Optional[List[str]] = None,
|
|
581
|
-
finished_before: Optional[float] = None,
|
|
582
|
-
) -> List[Request]:
|
|
583
|
-
"""Get a list of requests that match the given filters.
|
|
609
|
+
@dataclasses.dataclass
|
|
610
|
+
class RequestTaskFilter:
|
|
611
|
+
"""Filter for requests.
|
|
584
612
|
|
|
585
613
|
Args:
|
|
586
614
|
status: a list of statuses of the requests to filter on.
|
|
@@ -598,51 +626,87 @@ def get_request_tasks(
|
|
|
598
626
|
ValueError: If both exclude_request_names and include_request_names are
|
|
599
627
|
provided.
|
|
600
628
|
"""
|
|
601
|
-
|
|
602
|
-
|
|
603
|
-
|
|
604
|
-
|
|
605
|
-
|
|
606
|
-
|
|
607
|
-
|
|
608
|
-
|
|
609
|
-
|
|
610
|
-
|
|
611
|
-
|
|
612
|
-
|
|
613
|
-
|
|
614
|
-
|
|
615
|
-
|
|
616
|
-
|
|
617
|
-
|
|
618
|
-
|
|
619
|
-
|
|
620
|
-
|
|
621
|
-
|
|
622
|
-
|
|
623
|
-
|
|
624
|
-
|
|
625
|
-
|
|
626
|
-
|
|
627
|
-
|
|
628
|
-
|
|
629
|
-
|
|
630
|
-
|
|
629
|
+
status: Optional[List[RequestStatus]] = None
|
|
630
|
+
cluster_names: Optional[List[str]] = None
|
|
631
|
+
user_id: Optional[str] = None
|
|
632
|
+
exclude_request_names: Optional[List[str]] = None
|
|
633
|
+
include_request_names: Optional[List[str]] = None
|
|
634
|
+
finished_before: Optional[float] = None
|
|
635
|
+
|
|
636
|
+
def __post_init__(self):
|
|
637
|
+
if (self.exclude_request_names is not None and
|
|
638
|
+
self.include_request_names is not None):
|
|
639
|
+
raise ValueError(
|
|
640
|
+
'Only one of exclude_request_names or include_request_names '
|
|
641
|
+
'can be provided, not both.')
|
|
642
|
+
|
|
643
|
+
def build_query(self) -> Tuple[str, List[Any]]:
|
|
644
|
+
"""Build the SQL query and filter parameters.
|
|
645
|
+
|
|
646
|
+
Returns:
|
|
647
|
+
A tuple of (SQL, SQL parameters).
|
|
648
|
+
"""
|
|
649
|
+
filters = []
|
|
650
|
+
filter_params: List[Any] = []
|
|
651
|
+
if self.status is not None:
|
|
652
|
+
status_list_str = ','.join(
|
|
653
|
+
repr(status.value) for status in self.status)
|
|
654
|
+
filters.append(f'status IN ({status_list_str})')
|
|
655
|
+
if self.exclude_request_names is not None:
|
|
656
|
+
exclude_request_names_str = ','.join(
|
|
657
|
+
repr(name) for name in self.exclude_request_names)
|
|
658
|
+
filters.append(f'name NOT IN ({exclude_request_names_str})')
|
|
659
|
+
if self.cluster_names is not None:
|
|
660
|
+
cluster_names_str = ','.join(
|
|
661
|
+
repr(name) for name in self.cluster_names)
|
|
662
|
+
filters.append(f'{COL_CLUSTER_NAME} IN ({cluster_names_str})')
|
|
663
|
+
if self.user_id is not None:
|
|
664
|
+
filters.append(f'{COL_USER_ID} = ?')
|
|
665
|
+
filter_params.append(self.user_id)
|
|
666
|
+
if self.include_request_names is not None:
|
|
667
|
+
request_names_str = ','.join(
|
|
668
|
+
repr(name) for name in self.include_request_names)
|
|
669
|
+
filters.append(f'name IN ({request_names_str})')
|
|
670
|
+
if self.finished_before is not None:
|
|
671
|
+
filters.append('finished_at < ?')
|
|
672
|
+
filter_params.append(self.finished_before)
|
|
631
673
|
filter_str = ' AND '.join(filters)
|
|
632
674
|
if filter_str:
|
|
633
675
|
filter_str = f' WHERE {filter_str}'
|
|
634
676
|
columns_str = ', '.join(REQUEST_COLUMNS)
|
|
635
|
-
|
|
636
|
-
|
|
637
|
-
|
|
677
|
+
return (f'SELECT {columns_str} FROM {REQUEST_TABLE}{filter_str} '
|
|
678
|
+
'ORDER BY created_at DESC'), filter_params
|
|
679
|
+
|
|
680
|
+
|
|
681
|
+
@init_db
|
|
682
|
+
@metrics_lib.time_me
|
|
683
|
+
def get_request_tasks(req_filter: RequestTaskFilter) -> List[Request]:
|
|
684
|
+
"""Get a list of requests that match the given filters.
|
|
685
|
+
|
|
686
|
+
Args:
|
|
687
|
+
req_filter: the filter to apply to the requests. Refer to
|
|
688
|
+
RequestTaskFilter for the details.
|
|
689
|
+
"""
|
|
690
|
+
assert _DB is not None
|
|
691
|
+
with _DB.conn:
|
|
692
|
+
cursor = _DB.conn.cursor()
|
|
693
|
+
cursor.execute(*req_filter.build_query())
|
|
638
694
|
rows = cursor.fetchall()
|
|
639
695
|
if rows is None:
|
|
640
696
|
return []
|
|
641
|
-
|
|
642
|
-
|
|
643
|
-
|
|
644
|
-
|
|
645
|
-
|
|
697
|
+
return [Request.from_row(row) for row in rows]
|
|
698
|
+
|
|
699
|
+
|
|
700
|
+
@init_db_async
|
|
701
|
+
@metrics_lib.time_me_async
|
|
702
|
+
async def get_request_tasks_async(
|
|
703
|
+
req_filter: RequestTaskFilter) -> List[Request]:
|
|
704
|
+
"""Async version of get_request_tasks."""
|
|
705
|
+
assert _DB is not None
|
|
706
|
+
async with _DB.execute_fetchall_async(*req_filter.build_query()) as rows:
|
|
707
|
+
if not rows:
|
|
708
|
+
return []
|
|
709
|
+
return [Request.from_row(row) for row in rows]
|
|
646
710
|
|
|
647
711
|
|
|
648
712
|
@init_db_async
|
|
@@ -739,8 +803,10 @@ def clean_finished_requests_with_retention(retention_seconds: int):
|
|
|
739
803
|
retention_seconds: Requests older than this many seconds will be
|
|
740
804
|
deleted.
|
|
741
805
|
"""
|
|
742
|
-
reqs = get_request_tasks(
|
|
743
|
-
|
|
806
|
+
reqs = get_request_tasks(
|
|
807
|
+
req_filter=RequestTaskFilter(status=RequestStatus.finished_status(),
|
|
808
|
+
finished_before=time.time() -
|
|
809
|
+
retention_seconds))
|
|
744
810
|
|
|
745
811
|
subprocess_utils.run_in_parallel(
|
|
746
812
|
func=lambda req: req.log_path.unlink(missing_ok=True),
|
|
@@ -767,7 +833,7 @@ async def requests_gc_daemon():
|
|
|
767
833
|
try:
|
|
768
834
|
# Negative value disables the requests GC
|
|
769
835
|
if retention_seconds >= 0:
|
|
770
|
-
clean_finished_requests_with_retention(retention_seconds)
|
|
836
|
+
await clean_finished_requests_with_retention(retention_seconds)
|
|
771
837
|
except asyncio.CancelledError:
|
|
772
838
|
logger.info('Requests GC daemon cancelled')
|
|
773
839
|
break
|