skypilot-nightly 1.0.0.dev20250902__py3-none-any.whl → 1.0.0.dev20250904__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sky/__init__.py +2 -2
- sky/adaptors/runpod.py +68 -0
- sky/backends/backend_utils.py +5 -3
- sky/backends/cloud_vm_ray_backend.py +7 -2
- sky/client/cli/command.py +38 -6
- sky/client/sdk.py +22 -1
- sky/clouds/kubernetes.py +1 -1
- sky/clouds/nebius.py +4 -2
- sky/clouds/runpod.py +17 -0
- sky/dashboard/out/404.html +1 -1
- sky/dashboard/out/_next/static/chunks/1121-408ed10b2f9fce17.js +1 -0
- sky/dashboard/out/_next/static/chunks/{7205-88191679e7988c57.js → 1836-37fede578e2da5f8.js} +4 -9
- sky/dashboard/out/_next/static/chunks/3015-86cabed5d4669ad0.js +1 -0
- sky/dashboard/out/_next/static/chunks/3294.c80326aec9bfed40.js +6 -0
- sky/dashboard/out/_next/static/chunks/{3785.d5b86f6ebc88e6e6.js → 3785.4872a2f3aa489880.js} +1 -1
- sky/dashboard/out/_next/static/chunks/{4783.c485f48348349f47.js → 5339.3fda4a4010ff4e06.js} +4 -9
- sky/dashboard/out/_next/static/chunks/{9946.3b7b43c217ff70ec.js → 649.b9d7f7d10c1b8c53.js} +4 -9
- sky/dashboard/out/_next/static/chunks/6856-66e696640347e77b.js +1 -0
- sky/dashboard/out/_next/static/chunks/9025.c12318fb6a1a9093.js +6 -0
- sky/dashboard/out/_next/static/chunks/9037-1c0101b86582136f.js +6 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-39c9bd4cdb7e5a57.js +16 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters/{[cluster]-a0527109c2fab467.js → [cluster]-0b4b35dc1dfe046c.js} +2 -7
- sky/dashboard/out/_next/static/chunks/pages/infra/{[context]-81351f95f3bec08e.js → [context]-6563820e094f68ca.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/{infra-c320641c2bcbbea6.js → infra-aabba60d57826e0f.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/jobs-1f70d9faa564804f.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/workspaces/{[name]-de06e613e20bc977.js → [name]-af76bb06dbb3954f.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/{workspaces-be35b22e2046564c.js → workspaces-7598c33a746cdc91.js} +1 -1
- sky/dashboard/out/_next/static/chunks/webpack-24c4fc6d30ce0193.js +1 -0
- sky/dashboard/out/_next/static/{tio0QibqY2C0F2-rPy00p → mriHUOVL_Ht-CeW-e7saa}/_buildManifest.js +1 -1
- sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
- sky/dashboard/out/clusters/[cluster].html +1 -1
- sky/dashboard/out/clusters.html +1 -1
- sky/dashboard/out/config.html +1 -1
- sky/dashboard/out/index.html +1 -1
- sky/dashboard/out/infra/[context].html +1 -1
- sky/dashboard/out/infra.html +1 -1
- sky/dashboard/out/jobs/[job].html +1 -1
- sky/dashboard/out/jobs/pools/[pool].html +1 -1
- sky/dashboard/out/jobs.html +1 -1
- sky/dashboard/out/users.html +1 -1
- sky/dashboard/out/volumes.html +1 -1
- sky/dashboard/out/workspace/new.html +1 -1
- sky/dashboard/out/workspaces/[name].html +1 -1
- sky/dashboard/out/workspaces.html +1 -1
- sky/data/mounting_utils.py +29 -38
- sky/global_user_state.py +16 -1
- sky/jobs/state.py +1 -1
- sky/models.py +1 -0
- sky/provision/kubernetes/instance.py +10 -3
- sky/provision/runpod/__init__.py +3 -0
- sky/provision/runpod/instance.py +17 -0
- sky/provision/runpod/utils.py +23 -5
- sky/provision/runpod/volume.py +158 -0
- sky/serve/serve_state.py +1 -1
- sky/server/config.py +31 -3
- sky/server/requests/executor.py +9 -3
- sky/server/requests/payloads.py +7 -1
- sky/server/requests/preconditions.py +8 -7
- sky/server/requests/requests.py +132 -57
- sky/server/server.py +48 -38
- sky/server/stream_utils.py +14 -6
- sky/server/uvicorn.py +11 -4
- sky/skylet/constants.py +1 -1
- sky/skypilot_config.py +21 -9
- sky/ssh_node_pools/server.py +5 -5
- sky/templates/kubernetes-ray.yml.j2 +5 -5
- sky/templates/runpod-ray.yml.j2 +8 -0
- sky/users/server.py +18 -15
- sky/utils/benchmark_utils.py +60 -0
- sky/utils/command_runner.py +4 -0
- sky/utils/db/db_utils.py +58 -1
- sky/utils/db/migration_utils.py +0 -16
- sky/utils/resource_checker.py +6 -5
- sky/utils/schemas.py +1 -1
- sky/utils/volume.py +3 -0
- sky/volumes/client/sdk.py +28 -0
- sky/volumes/server/server.py +11 -1
- sky/volumes/utils.py +117 -68
- sky/volumes/volume.py +98 -39
- {skypilot_nightly-1.0.0.dev20250902.dist-info → skypilot_nightly-1.0.0.dev20250904.dist-info}/METADATA +34 -34
- {skypilot_nightly-1.0.0.dev20250902.dist-info → skypilot_nightly-1.0.0.dev20250904.dist-info}/RECORD +86 -84
- sky/dashboard/out/_next/static/chunks/1121-8afcf719ea87debc.js +0 -1
- sky/dashboard/out/_next/static/chunks/3015-8089ed1e0b7e37fd.js +0 -1
- sky/dashboard/out/_next/static/chunks/6856-049014c6d43d127b.js +0 -1
- sky/dashboard/out/_next/static/chunks/9025.a1bef12d672bb66d.js +0 -6
- sky/dashboard/out/_next/static/chunks/9037-89a84fd7fa31362d.js +0 -6
- sky/dashboard/out/_next/static/chunks/9984.7eb6cc51fb460cae.js +0 -6
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-06afb50d25f7c61f.js +0 -16
- sky/dashboard/out/_next/static/chunks/pages/jobs-7421e63ac35f8fce.js +0 -1
- sky/dashboard/out/_next/static/chunks/webpack-0eaa6f7e63f51311.js +0 -1
- /sky/dashboard/out/_next/static/{tio0QibqY2C0F2-rPy00p → mriHUOVL_Ht-CeW-e7saa}/_ssgManifest.js +0 -0
- {skypilot_nightly-1.0.0.dev20250902.dist-info → skypilot_nightly-1.0.0.dev20250904.dist-info}/WHEEL +0 -0
- {skypilot_nightly-1.0.0.dev20250902.dist-info → skypilot_nightly-1.0.0.dev20250904.dist-info}/entry_points.txt +0 -0
- {skypilot_nightly-1.0.0.dev20250902.dist-info → skypilot_nightly-1.0.0.dev20250904.dist-info}/licenses/LICENSE +0 -0
- {skypilot_nightly-1.0.0.dev20250902.dist-info → skypilot_nightly-1.0.0.dev20250904.dist-info}/top_level.txt +0 -0
sky/global_user_state.py
CHANGED
@@ -308,7 +308,7 @@ def initialize_and_get_db() -> sqlalchemy.engine.Engine:
     if _SQLALCHEMY_ENGINE is not None:
         return _SQLALCHEMY_ENGINE
     # get an engine to the db
-    engine =
+    engine = db_utils.get_engine('state')
 
     # run migrations if needed
     create_table(engine)
@@ -2312,3 +2312,18 @@ def set_system_config(config_key: str, config_value: str) -> None:
         })
         session.execute(upsert_stmnt)
         session.commit()
+
+
+@_init_db
+def get_max_db_connections() -> Optional[int]:
+    """Get the maximum number of connections for the engine."""
+    assert _SQLALCHEMY_ENGINE is not None
+    if (_SQLALCHEMY_ENGINE.dialect.name ==
+            db_utils.SQLAlchemyDialect.SQLITE.value):
+        return None
+    with sqlalchemy.orm.Session(_SQLALCHEMY_ENGINE) as session:
+        max_connections = session.execute(
+            sqlalchemy.text('SHOW max_connections')).scalar()
+        if max_connections is None:
+            return None
+        return int(max_connections)
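Note: the new `get_max_db_connections` helper reads PostgreSQL's `max_connections` server setting and returns None on SQLite, which has no server-side connection limit. A minimal standalone sketch of the same query, assuming a plain SQLAlchemy engine (the real helper additionally relies on the module-level engine and `@_init_db` setup):

import sqlalchemy

def max_db_connections(engine: sqlalchemy.engine.Engine):
    # SQLite has no server-side connection limit.
    if engine.dialect.name == 'sqlite':
        return None
    with engine.connect() as conn:
        # 'SHOW max_connections' is a PostgreSQL command.
        value = conn.execute(sqlalchemy.text('SHOW max_connections')).scalar()
    return int(value) if value is not None else None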
sky/jobs/state.py
CHANGED
@@ -157,7 +157,7 @@ def initialize_and_get_db() -> sqlalchemy.engine.Engine:
     if _SQLALCHEMY_ENGINE is not None:
         return _SQLALCHEMY_ENGINE
     # get an engine to the db
-    engine =
+    engine = db_utils.get_engine('spot_jobs')
 
     # run migrations if needed
     create_table(engine)
sky/provision/kubernetes/instance.py
CHANGED
@@ -1047,8 +1047,10 @@ def stop_instances(
     raise NotImplementedError()
 
 
-def _delete_services(name_prefix: str, namespace: str,
-                     context: Optional[str]) -> None:
+def _delete_services(name_prefix: str,
+                     namespace: str,
+                     context: Optional[str],
+                     skip_ssh_service: bool = False) -> None:
     """Delete services with the given name prefix.
 
     Args:
@@ -1057,7 +1059,9 @@ def _delete_services(name_prefix: str, namespace: str,
         context: Kubernetes context
     """
     # TODO(andy): We should use tag for the service filter.
-    for service_name in [name_prefix, f'{name_prefix}-ssh']:
+    services = ([name_prefix, f'{name_prefix}-ssh']
+                if not skip_ssh_service else [name_prefix])
+    for service_name in services:
         # Since we are not saving this lambda, it's a false positive.
         # TODO(andyl): Wait for
         # https://github.com/pylint-dev/pylint/issues/5263.
@@ -1083,6 +1087,9 @@ def _terminate_node(namespace: str,
         # Delete services for the head pod
         # services are specified in sky/templates/kubernetes-ray.yml.j2
         _delete_services(pod_name, namespace, context)
+    else:
+        # No ssh service is created for worker pods
+        _delete_services(pod_name, namespace, context, skip_ssh_service=True)
 
     # Note - delete pod after all other resources are deleted.
     # This is to ensure there are no leftover resources if this down is run
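Note: the change above skips deleting the `-ssh` service for worker pods, which never get one. A hedged sketch of the head/worker cleanup split using the raw kubernetes client (the real code goes through SkyPilot's kubernetes adaptor and retry helpers, so the function and its arguments here are illustrative only):

from kubernetes import client, config

def delete_pod_services(pod_name: str, namespace: str, is_head: bool) -> None:
    config.load_kube_config()
    v1 = client.CoreV1Api()
    # Head pods have a main service plus an SSH service; workers only
    # have the main one, so the '-ssh' deletion is skipped for them.
    names = [pod_name, f'{pod_name}-ssh'] if is_head else [pod_name]
    for name in names:
        try:
            v1.delete_namespaced_service(name=name, namespace=namespace)
        except client.exceptions.ApiException as e:
            if e.status != 404:  # Already gone is fine.
                raise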
sky/provision/runpod/__init__.py
CHANGED
@@ -9,3 +9,6 @@ from sky.provision.runpod.instance import run_instances
 from sky.provision.runpod.instance import stop_instances
 from sky.provision.runpod.instance import terminate_instances
 from sky.provision.runpod.instance import wait_instances
+from sky.provision.runpod.volume import apply_volume
+from sky.provision.runpod.volume import delete_volume
+from sky.provision.runpod.volume import get_volume_usedby
sky/provision/runpod/instance.py
CHANGED
@@ -80,6 +80,21 @@ def run_instances(region: str, cluster_name_on_cloud: str,
                               created_instance_ids=[])
 
     created_instance_ids = []
+    volume_mounts = config.node_config.get('VolumeMounts', [])
+    network_volume_id = None
+    volume_mount_path = None
+    if volume_mounts:
+        if len(volume_mounts) > 1:
+            logger.warning(
+                f'RunPod only supports one network volume mount, '
+                f'but {len(volume_mounts)} are specified. Only the first one '
+                f'will be used.')
+        volume_mount = volume_mounts[0]
+        network_volume_id = volume_mount.get('VolumeIdOnCloud')
+        volume_mount_path = volume_mount.get('MountPath')
+        if network_volume_id is None or volume_mount_path is None:
+            raise RuntimeError(
+                'Network volume ID and mount path must be specified.')
     for _ in range(to_start_count):
         node_type = 'head' if head_instance_id is None else 'worker'
         try:
@@ -97,6 +112,8 @@ def run_instances(region: str, cluster_name_on_cloud: str,
                 bid_per_gpu=config.node_config['BidPerGPU'],
                 docker_login_config=config.provider_config.get(
                     'docker_login_config'),
+                network_volume_id=network_volume_id,
+                volume_mount_path=volume_mount_path,
             )
         except Exception as e:  # pylint: disable=broad-except
             logger.warning(f'run_instances error: {e}')
sky/provision/runpod/utils.py
CHANGED
@@ -263,11 +263,23 @@ def _create_template_for_docker_login(
     return login_config.format_image(image_name), create_template_resp['id']
 
 
-def launch(cluster_name: str, node_type: str, instance_type: str, region: str,
-           zone: str, disk_size: int, image_name: str,
-           ports: Optional[List[int]], public_key: str,
-           preemptible: Optional[bool], bid_per_gpu: float,
-           docker_login_config: Optional[Dict[str, str]]) -> str:
+def launch(
+    cluster_name: str,
+    node_type: str,
+    instance_type: str,
+    region: str,
+    zone: str,
+    disk_size: int,
+    image_name: str,
+    ports: Optional[List[int]],
+    public_key: str,
+    preemptible: Optional[bool],
+    bid_per_gpu: float,
+    docker_login_config: Optional[Dict[str, str]],
+    *,
+    network_volume_id: Optional[str] = None,
+    volume_mount_path: Optional[str] = None,
+) -> str:
     """Launches an instance with the given parameters.
 
     For CPU instances, we directly use the instance_type for launching the
@@ -337,6 +349,12 @@ def launch(cluster_name: str, node_type: str, instance_type: str, region: str,
         'template_id': template_id,
     }
 
+    # Optional network volume mount.
+    if volume_mount_path is not None:
+        params['volume_mount_path'] = volume_mount_path
+    if network_volume_id is not None:
+        params['network_volume_id'] = network_volume_id
+
     # GPU instance types start with f'{gpu_count}x',
     # CPU instance types start with 'cpu'.
     is_cpu_instance = instance_type.startswith('cpu')
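Note: the two new parameters are keyword-only, so existing call sites keep working and a volume is attached only when both values are passed through. A hedged sketch of the call shape (all argument values below are placeholders, not defaults):

instance_id = launch(
    cluster_name='my-cluster',
    node_type='head',
    instance_type='1x_NVIDIA_A100',  # placeholder RunPod instance type
    region='EU-RO-1',
    zone='EU-RO-1',
    disk_size=100,
    image_name='runpod/base:0.0.2',  # placeholder image
    ports=[22],
    public_key='ssh-ed25519 AAAA...',
    preemptible=False,
    bid_per_gpu=0.0,
    docker_login_config=None,
    network_volume_id='vol-abc123',  # e.g. from apply_volume()
    volume_mount_path='/workspace',
)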
sky/provision/runpod/volume.py
ADDED
@@ -0,0 +1,158 @@
+"""RunPod network volume provisioning."""
+from typing import Any, Dict, List, Optional, Tuple
+
+from sky import global_user_state
+from sky import models
+from sky import sky_logging
+from sky.adaptors import runpod
+from sky.utils import common_utils
+from sky.utils import volume as volume_lib
+
+logger = sky_logging.init_logger(__name__)
+
+
+def _list_volumes() -> List[Dict[str, Any]]:
+    # GET /v1/networkvolumes returns a list
+    result = runpod.rest_request('GET', '/networkvolumes')
+    if isinstance(result, list):
+        return result
+    # Some deployments may wrap the list.
+    if isinstance(result, dict):
+        for key in ('items', 'data', 'networkVolumes'):
+            if key in result and isinstance(result[key], list):
+                return result[key]
+    return []
+
+
+def apply_volume(config: models.VolumeConfig) -> models.VolumeConfig:
+    """Create or resolve a RunPod network volume via REST API.
+
+    If a volume with the same `name_on_cloud` exists, reuse it. Otherwise,
+    create a new one using POST /v1/networkvolumes.
+    """
+    name_on_cloud = config.name_on_cloud
+    assert name_on_cloud is not None
+
+    vol_id = _try_resolve_volume_id(name_on_cloud)
+    if vol_id is None:
+        # Create new volume via REST
+        size = config.size
+        if size is None:
+            raise RuntimeError(
+                'RunPod network volume size must be specified to create '
+                'a volume.')
+        try:
+            size_int = int(size)
+            if size_int < volume_lib.MIN_RUNPOD_NETWORK_VOLUME_SIZE_GB:
+                raise RuntimeError(
+                    f'RunPod network volume size must be at least '
+                    f'{volume_lib.MIN_RUNPOD_NETWORK_VOLUME_SIZE_GB}GB.')
+        except Exception as e:  # pylint: disable=broad-except
+            raise RuntimeError(f'Invalid volume size {size!r}: {e}') from e
+        data_center_id = config.zone
+        if not data_center_id:
+            raise RuntimeError(
+                'RunPod DataCenterId is required to create a network '
+                'volume. Set the zone in the infra field.')
+        payload = {
+            'dataCenterId': data_center_id,
+            'name': name_on_cloud,
+            'size': size_int,
+        }
+        resp = runpod.rest_request('POST', '/networkvolumes', json=payload)
+        if isinstance(resp, dict):
+            config.id_on_cloud = resp.get('id')
+        else:
+            raise RuntimeError(
+                f'Failed to create RunPod network volume: {resp}')
+        logger.info(f'Created RunPod network volume {name_on_cloud} '
+                    f'(id={config.id_on_cloud})')
+        return config
+
+    # Use existing matched volume
+    config.id_on_cloud = vol_id
+    logger.debug(f'Using existing RunPod network volume {name_on_cloud} '
+                 f'(id={config.id_on_cloud})')
+    return config
+
+
+def delete_volume(config: models.VolumeConfig) -> models.VolumeConfig:
+    """Deletes a RunPod network volume via REST API if id is known or
+    resolvable. If the volume id is not known, try to resolve it by name.
+    """
+    name_on_cloud = config.name_on_cloud
+    vol_id = config.id_on_cloud
+    if not vol_id:
+        vol_id = _try_resolve_volume_id(name_on_cloud)
+    if not vol_id:
+        logger.warning(
+            f'RunPod network volume id not found for {name_on_cloud}; '
+            f'skip delete')
+        return config
+    runpod.rest_request('DELETE', f'/networkvolumes/{vol_id}')
+    logger.info(f'Deleted RunPod network volume {name_on_cloud} '
+                f'(id={vol_id})')
+    return config
+
+
+def _try_resolve_volume_id(name_on_cloud: str) -> Optional[str]:
+    vols = _list_volumes()
+    matched = next((v for v in vols if v.get('name') == name_on_cloud), None)
+    if matched is not None:
+        return matched.get('id')
+    return None
+
+
+def get_volume_usedby(
+        config: models.VolumeConfig,) -> Tuple[List[str], List[str]]:
+    """Gets the clusters currently using this RunPod network volume.
+
+    Returns:
+        (usedby_pods, usedby_clusters)
+        usedby_clusters contains SkyPilot cluster display names inferred from
+        pod names, which may be wrong.
+    """
+    vol_id = config.id_on_cloud
+    name_on_cloud = config.name_on_cloud
+    if vol_id is None:
+        vol_id = _try_resolve_volume_id(name_on_cloud)
+    if vol_id is None:
+        return [], []
+
+    # Query all pods for current user and filter by networkVolumeId
+    query = """
+    query Pods {
+        myself {
+            pods {
+                id
+                name
+                networkVolumeId
+            }
+        }
+    }
+    """
+    resp = runpod.runpod.api.graphql.run_graphql_query(query)
+    pods = resp.get('data', {}).get('myself', {}).get('pods', [])
+    used_pods = [p for p in pods if p.get('networkVolumeId') == vol_id]
+    usedby_pod_names = [p.get('name') for p in used_pods if p.get('name')]
+
+    # Map pod names back to SkyPilot cluster names using heuristics.
+    clusters = global_user_state.get_clusters()
+    cluster_names: List[str] = []
+    user_hash = common_utils.get_user_hash()
+    for pod_name in usedby_pod_names:
+        matched = None
+        for c in clusters:
+            display = c.get('name')
+            if not display:
+                continue
+            # Heuristic: RunPod pod name is f"{cluster}-{user_hash}-{xxx}"
+            # This can be wrong.
+            cluster_prefix = display + '-' + user_hash + '-'
+            if pod_name.startswith(cluster_prefix):
+                matched = display
+                break
+        if matched and matched not in cluster_names:
+            cluster_names.append(matched)
+
+    return usedby_pod_names, cluster_names
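Note: together, `apply_volume`, `get_volume_usedby`, and `delete_volume` give volumes an idempotent lifecycle keyed on `name_on_cloud`. A hedged usage sketch (the `VolumeConfig` fields shown are the ones the module reads; the exact constructor signature is an assumption):

from sky import models
from sky.provision.runpod import volume as runpod_volume

# Hypothetical construction; only these fields are read by the module.
config = models.VolumeConfig(
    name_on_cloud='my-vol-abc123',
    size='500',      # GB; must meet the RunPod minimum
    zone='EU-RO-1',  # RunPod DataCenterId
)
config = runpod_volume.apply_volume(config)  # create, or reuse by name
print(config.id_on_cloud)
pods, clusters = runpod_volume.get_volume_usedby(config)
if not pods:
    runpod_volume.delete_volume(config)  # safe: no pod is using it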
sky/serve/serve_state.py
CHANGED
@@ -130,7 +130,7 @@ def initialize_and_get_db() -> sqlalchemy.engine.Engine:
     if _SQLALCHEMY_ENGINE is not None:
         return _SQLALCHEMY_ENGINE
     # get an engine to the db
-    engine =
+    engine = db_utils.get_engine('serve/services')
 
     # run migrations if needed
     create_table(engine)
sky/server/config.py
CHANGED
@@ -2,6 +2,7 @@
 
 import dataclasses
 import enum
+from typing import Optional
 
 from sky import sky_logging
 from sky.server import constants as server_constants
@@ -61,6 +62,7 @@ class QueueBackend(enum.Enum):
 class WorkerConfig:
     garanteed_parallelism: int
     burstable_parallelism: int
+    num_db_connections_per_worker: int
 
 
 @dataclasses.dataclass
@@ -68,10 +70,13 @@ class ServerConfig:
     num_server_workers: int
     long_worker_config: WorkerConfig
     short_worker_config: WorkerConfig
+    num_db_connections_per_worker: int
    queue_backend: QueueBackend
 
 
-def compute_server_config(deploy: bool) -> ServerConfig:
+def compute_server_config(deploy: bool,
+                          max_db_connections: Optional[int] = None
+                         ) -> ServerConfig:
     """Compute the server config based on environment.
 
     We have different assumptions for the resources in different deployment
@@ -114,7 +119,17 @@ def compute_server_config(deploy: bool) -> ServerConfig:
     queue_backend = QueueBackend.MULTIPROCESSING
     burstable_parallel_for_long = 0
     burstable_parallel_for_short = 0
+    # if num_db_connections_per_worker is 0, server will use NullPool
+    # to conserve the number of concurrent db connections.
+    # This could lead to performance degradation.
+    num_db_connections_per_worker = 0
     num_server_workers = cpu_count
+
+    # +1 for the event loop running the main process
+    # and gc daemons in the '__main__' body of sky/server/server.py
+    max_parallel_all_workers = (max_parallel_for_long + max_parallel_for_short +
+                                num_server_workers + 1)
+
     if not deploy:
         # For local mode, use local queue backend since we only run 1 uvicorn
         # worker in local mode and no multiprocessing is needed.
@@ -140,6 +155,16 @@ def compute_server_config(deploy: bool) -> ServerConfig:
             'SkyPilot API server will run in low resource mode because '
             'the available memory is less than '
             f'{server_constants.MIN_AVAIL_MEM_GB}GB.')
+    elif max_db_connections is not None:
+        if max_parallel_all_workers > max_db_connections:
+            logger.warning(
+                f'Max parallel all workers ({max_parallel_all_workers}) '
+                f'is greater than max db connections ({max_db_connections}). '
+                'Increase the number of max db connections to '
+                f'at least {max_parallel_all_workers} for optimal performance.')
+        else:
+            num_db_connections_per_worker = 1
+
     logger.info(
         f'SkyPilot API server will start {num_server_workers} server processes '
         f'with {max_parallel_for_long} background workers for long requests '
@@ -150,10 +175,13 @@ def compute_server_config(deploy: bool) -> ServerConfig:
         queue_backend=queue_backend,
         long_worker_config=WorkerConfig(
             garanteed_parallelism=max_parallel_for_long,
-            burstable_parallelism=burstable_parallel_for_long),
+            burstable_parallelism=burstable_parallel_for_long,
+            num_db_connections_per_worker=num_db_connections_per_worker),
         short_worker_config=WorkerConfig(
             garanteed_parallelism=max_parallel_for_short,
-            burstable_parallelism=burstable_parallel_for_short),
+            burstable_parallelism=burstable_parallel_for_short,
+            num_db_connections_per_worker=num_db_connections_per_worker),
+        num_db_connections_per_worker=num_db_connections_per_worker,
     )
 
 
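Note: the `num_db_connections_per_worker = 0` default refers to SQLAlchemy's NullPool, which opens and closes a connection per checkout instead of pinning one per worker; it conserves server-side slots at a latency cost. A minimal sketch of the trade-off, assuming a PostgreSQL URL:

import sqlalchemy
from sqlalchemy.pool import NullPool, QueuePool

url = 'postgresql://user:pass@localhost/skypilot'  # placeholder URL

# NullPool: no pinned connections; each checkout dials the server anew,
# conserving Postgres' max_connections budget.
engine_null = sqlalchemy.create_engine(url, poolclass=NullPool)

# pool_size=1: one pinned connection per worker process; faster per
# query, but every worker permanently consumes a server slot.
engine_pooled = sqlalchemy.create_engine(url, poolclass=QueuePool, pool_size=1)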
sky/server/requests/executor.py
CHANGED
@@ -57,6 +57,7 @@ from sky.utils import subprocess_utils
 from sky.utils import tempstore
 from sky.utils import timeline
 from sky.utils import yaml_utils
+from sky.utils.db import db_utils
 from sky.workspaces import core as workspaces_core
 
 if typing.TYPE_CHECKING:
@@ -152,6 +153,8 @@ class RequestWorker:
         self.schedule_type = schedule_type
         self.garanteed_parallelism = config.garanteed_parallelism
         self.burstable_parallelism = config.burstable_parallelism
+        self.num_db_connections_per_worker = (
+            config.num_db_connections_per_worker)
         self._thread: Optional[threading.Thread] = None
         self._cancel_event = threading.Event()
 
@@ -190,8 +193,9 @@ class RequestWorker:
             # multiple requests can share the same process pid, which may cause
             # issues with SkyPilot core functions if they rely on the exit of
             # the process, such as subprocess_daemon.py.
-            fut = executor.submit_until_success(
-                _request_execution_wrapper, request_id, ignore_return_value)
+            fut = executor.submit_until_success(
+                _request_execution_wrapper, request_id, ignore_return_value,
+                self.num_db_connections_per_worker)
             # Monitor the result of the request execution.
             threading.Thread(target=self.handle_task_result,
                              args=(fut, request_element),
@@ -351,7 +355,8 @@ def _sigterm_handler(signum: int, frame: Optional['types.FrameType']) -> None:
 
 
 def _request_execution_wrapper(request_id: str,
-                               ignore_return_value: bool) -> None:
+                               ignore_return_value: bool,
+                               num_db_connections_per_worker: int = 0) -> None:
     """Wrapper for a request execution.
 
     It wraps the execution of a request to:
@@ -362,6 +367,7 @@ def _request_execution_wrapper(request_id: str,
     4. Handle the SIGTERM signal to abort the request gracefully.
     5. Maintain the lifecycle of the temp dir used by the request.
     """
+    db_utils.set_max_connections(num_db_connections_per_worker)
     # Handle the SIGTERM signal to abort the request processing gracefully.
     signal.signal(signal.SIGTERM, _sigterm_handler)
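Note: `db_utils.set_max_connections` runs before any request work so the per-process connection budget is fixed before engines are created. The body of `sky.utils.db.db_utils` is not shown in this diff, so the sketch below is an assumption about its shape, not the actual implementation:

import sqlalchemy
from sqlalchemy.pool import NullPool

_max_connections = 0  # 0 => NullPool, i.e. no pinned connections

def set_max_connections(num_connections: int) -> None:
    # Record the budget before any engine for this process is built.
    global _max_connections
    _max_connections = num_connections

def get_engine(db_name: str) -> sqlalchemy.engine.Engine:
    url = f'postgresql://localhost/{db_name}'  # hypothetical URL resolution
    if _max_connections == 0:
        return sqlalchemy.create_engine(url, poolclass=NullPool)
    return sqlalchemy.create_engine(url, pool_size=_max_connections)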
sky/server/requests/payloads.py
CHANGED
@@ -309,7 +309,8 @@ class StatusBody(RequestBody):
     cluster_names: Optional[List[str]] = None
     refresh: common_lib.StatusRefreshMode = common_lib.StatusRefreshMode.NONE
     all_users: bool = True
-
+    # TODO (kyuds): default to False post 0.10.5
+    include_credentials: bool = True
 
 
 class StartBody(RequestBody):
@@ -464,6 +465,11 @@ class VolumeDeleteBody(RequestBody):
     names: List[str]
 
 
+class VolumeListBody(RequestBody):
+    """The request body for the volume list endpoint."""
+    pass
+
+
 class EndpointsBody(RequestBody):
     """The request body for the endpoint."""
     cluster: str
sky/server/requests/preconditions.py
CHANGED
@@ -162,13 +162,14 @@ class ClusterStartCompletePrecondition(Precondition):
         # We unify these situations into a single state: the process of starting
         # the cluster is done (either normally or abnormally) but cluster is not
         # in UP status.
-        requests = api_requests.get_request_tasks(
-            status=[
-                api_requests.RequestStatus.RUNNING,
-                api_requests.RequestStatus.PENDING
-            ],
-            include_request_names=['sky.launch', 'sky.start'],
-            cluster_names=[self.cluster_name])
+        requests = await api_requests.get_request_tasks_async(
+            req_filter=api_requests.RequestTaskFilter(
+                status=[
+                    api_requests.RequestStatus.RUNNING,
+                    api_requests.RequestStatus.PENDING
+                ],
+                include_request_names=['sky.launch', 'sky.start'],
+                cluster_names=[self.cluster_name]))
         if len(requests) == 0:
             # No running or pending tasks, the start process is done.
             return True, None