skypilot-nightly 1.0.0.dev20250902__py3-none-any.whl → 1.0.0.dev20250904__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of skypilot-nightly might be problematic.
Files changed (95)
  1. sky/__init__.py +2 -2
  2. sky/adaptors/runpod.py +68 -0
  3. sky/backends/backend_utils.py +5 -3
  4. sky/backends/cloud_vm_ray_backend.py +7 -2
  5. sky/client/cli/command.py +38 -6
  6. sky/client/sdk.py +22 -1
  7. sky/clouds/kubernetes.py +1 -1
  8. sky/clouds/nebius.py +4 -2
  9. sky/clouds/runpod.py +17 -0
  10. sky/dashboard/out/404.html +1 -1
  11. sky/dashboard/out/_next/static/chunks/1121-408ed10b2f9fce17.js +1 -0
  12. sky/dashboard/out/_next/static/chunks/{7205-88191679e7988c57.js → 1836-37fede578e2da5f8.js} +4 -9
  13. sky/dashboard/out/_next/static/chunks/3015-86cabed5d4669ad0.js +1 -0
  14. sky/dashboard/out/_next/static/chunks/3294.c80326aec9bfed40.js +6 -0
  15. sky/dashboard/out/_next/static/chunks/{3785.d5b86f6ebc88e6e6.js → 3785.4872a2f3aa489880.js} +1 -1
  16. sky/dashboard/out/_next/static/chunks/{4783.c485f48348349f47.js → 5339.3fda4a4010ff4e06.js} +4 -9
  17. sky/dashboard/out/_next/static/chunks/{9946.3b7b43c217ff70ec.js → 649.b9d7f7d10c1b8c53.js} +4 -9
  18. sky/dashboard/out/_next/static/chunks/6856-66e696640347e77b.js +1 -0
  19. sky/dashboard/out/_next/static/chunks/9025.c12318fb6a1a9093.js +6 -0
  20. sky/dashboard/out/_next/static/chunks/9037-1c0101b86582136f.js +6 -0
  21. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-39c9bd4cdb7e5a57.js +16 -0
  22. sky/dashboard/out/_next/static/chunks/pages/clusters/{[cluster]-a0527109c2fab467.js → [cluster]-0b4b35dc1dfe046c.js} +2 -7
  23. sky/dashboard/out/_next/static/chunks/pages/infra/{[context]-81351f95f3bec08e.js → [context]-6563820e094f68ca.js} +1 -1
  24. sky/dashboard/out/_next/static/chunks/pages/{infra-c320641c2bcbbea6.js → infra-aabba60d57826e0f.js} +1 -1
  25. sky/dashboard/out/_next/static/chunks/pages/jobs-1f70d9faa564804f.js +1 -0
  26. sky/dashboard/out/_next/static/chunks/pages/workspaces/{[name]-de06e613e20bc977.js → [name]-af76bb06dbb3954f.js} +1 -1
  27. sky/dashboard/out/_next/static/chunks/pages/{workspaces-be35b22e2046564c.js → workspaces-7598c33a746cdc91.js} +1 -1
  28. sky/dashboard/out/_next/static/chunks/webpack-24c4fc6d30ce0193.js +1 -0
  29. sky/dashboard/out/_next/static/{tio0QibqY2C0F2-rPy00p → mriHUOVL_Ht-CeW-e7saa}/_buildManifest.js +1 -1
  30. sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
  31. sky/dashboard/out/clusters/[cluster].html +1 -1
  32. sky/dashboard/out/clusters.html +1 -1
  33. sky/dashboard/out/config.html +1 -1
  34. sky/dashboard/out/index.html +1 -1
  35. sky/dashboard/out/infra/[context].html +1 -1
  36. sky/dashboard/out/infra.html +1 -1
  37. sky/dashboard/out/jobs/[job].html +1 -1
  38. sky/dashboard/out/jobs/pools/[pool].html +1 -1
  39. sky/dashboard/out/jobs.html +1 -1
  40. sky/dashboard/out/users.html +1 -1
  41. sky/dashboard/out/volumes.html +1 -1
  42. sky/dashboard/out/workspace/new.html +1 -1
  43. sky/dashboard/out/workspaces/[name].html +1 -1
  44. sky/dashboard/out/workspaces.html +1 -1
  45. sky/data/mounting_utils.py +29 -38
  46. sky/global_user_state.py +16 -1
  47. sky/jobs/state.py +1 -1
  48. sky/models.py +1 -0
  49. sky/provision/kubernetes/instance.py +10 -3
  50. sky/provision/runpod/__init__.py +3 -0
  51. sky/provision/runpod/instance.py +17 -0
  52. sky/provision/runpod/utils.py +23 -5
  53. sky/provision/runpod/volume.py +158 -0
  54. sky/serve/serve_state.py +1 -1
  55. sky/server/config.py +31 -3
  56. sky/server/requests/executor.py +9 -3
  57. sky/server/requests/payloads.py +7 -1
  58. sky/server/requests/preconditions.py +8 -7
  59. sky/server/requests/requests.py +132 -57
  60. sky/server/server.py +48 -38
  61. sky/server/stream_utils.py +14 -6
  62. sky/server/uvicorn.py +11 -4
  63. sky/skylet/constants.py +1 -1
  64. sky/skypilot_config.py +21 -9
  65. sky/ssh_node_pools/server.py +5 -5
  66. sky/templates/kubernetes-ray.yml.j2 +5 -5
  67. sky/templates/runpod-ray.yml.j2 +8 -0
  68. sky/users/server.py +18 -15
  69. sky/utils/benchmark_utils.py +60 -0
  70. sky/utils/command_runner.py +4 -0
  71. sky/utils/db/db_utils.py +58 -1
  72. sky/utils/db/migration_utils.py +0 -16
  73. sky/utils/resource_checker.py +6 -5
  74. sky/utils/schemas.py +1 -1
  75. sky/utils/volume.py +3 -0
  76. sky/volumes/client/sdk.py +28 -0
  77. sky/volumes/server/server.py +11 -1
  78. sky/volumes/utils.py +117 -68
  79. sky/volumes/volume.py +98 -39
  80. {skypilot_nightly-1.0.0.dev20250902.dist-info → skypilot_nightly-1.0.0.dev20250904.dist-info}/METADATA +34 -34
  81. {skypilot_nightly-1.0.0.dev20250902.dist-info → skypilot_nightly-1.0.0.dev20250904.dist-info}/RECORD +86 -84
  82. sky/dashboard/out/_next/static/chunks/1121-8afcf719ea87debc.js +0 -1
  83. sky/dashboard/out/_next/static/chunks/3015-8089ed1e0b7e37fd.js +0 -1
  84. sky/dashboard/out/_next/static/chunks/6856-049014c6d43d127b.js +0 -1
  85. sky/dashboard/out/_next/static/chunks/9025.a1bef12d672bb66d.js +0 -6
  86. sky/dashboard/out/_next/static/chunks/9037-89a84fd7fa31362d.js +0 -6
  87. sky/dashboard/out/_next/static/chunks/9984.7eb6cc51fb460cae.js +0 -6
  88. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-06afb50d25f7c61f.js +0 -16
  89. sky/dashboard/out/_next/static/chunks/pages/jobs-7421e63ac35f8fce.js +0 -1
  90. sky/dashboard/out/_next/static/chunks/webpack-0eaa6f7e63f51311.js +0 -1
  91. /sky/dashboard/out/_next/static/{tio0QibqY2C0F2-rPy00p → mriHUOVL_Ht-CeW-e7saa}/_ssgManifest.js +0 -0
  92. {skypilot_nightly-1.0.0.dev20250902.dist-info → skypilot_nightly-1.0.0.dev20250904.dist-info}/WHEEL +0 -0
  93. {skypilot_nightly-1.0.0.dev20250902.dist-info → skypilot_nightly-1.0.0.dev20250904.dist-info}/entry_points.txt +0 -0
  94. {skypilot_nightly-1.0.0.dev20250902.dist-info → skypilot_nightly-1.0.0.dev20250904.dist-info}/licenses/LICENSE +0 -0
  95. {skypilot_nightly-1.0.0.dev20250902.dist-info → skypilot_nightly-1.0.0.dev20250904.dist-info}/top_level.txt +0 -0
sky/global_user_state.py CHANGED
@@ -308,7 +308,7 @@ def initialize_and_get_db() -> sqlalchemy.engine.Engine:
     if _SQLALCHEMY_ENGINE is not None:
         return _SQLALCHEMY_ENGINE
     # get an engine to the db
-    engine = migration_utils.get_engine('state')
+    engine = db_utils.get_engine('state')

     # run migrations if needed
     create_table(engine)
@@ -2312,3 +2312,18 @@ def set_system_config(config_key: str, config_value: str) -> None:
     })
     session.execute(upsert_stmnt)
     session.commit()
+
+
+@_init_db
+def get_max_db_connections() -> Optional[int]:
+    """Get the maximum number of connections for the engine."""
+    assert _SQLALCHEMY_ENGINE is not None
+    if (_SQLALCHEMY_ENGINE.dialect.name ==
+            db_utils.SQLAlchemyDialect.SQLITE.value):
+        return None
+    with sqlalchemy.orm.Session(_SQLALCHEMY_ENGINE) as session:
+        max_connections = session.execute(
+            sqlalchemy.text('SHOW max_connections')).scalar()
+        if max_connections is None:
+            return None
+        return int(max_connections)
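Note: the new get_max_db_connections() helper returns None on SQLite and otherwise reads PostgreSQL's max_connections setting. Below is a minimal sketch of how it could feed the max_db_connections parameter that compute_server_config() gains later in this diff; the actual call site in sky/server/server.py is not shown here, so the wiring is an assumption.

# Sketch only: the wiring below is assumed, not taken from sky/server/server.py.
from sky import global_user_state
from sky.server import config as server_config

max_conns = global_user_state.get_max_db_connections()  # None on SQLite
cfg = server_config.compute_server_config(deploy=True,
                                           max_db_connections=max_conns)
# Typically 1 when the connection budget fits, else 0 (workers fall back to NullPool).
print(cfg.num_db_connections_per_worker)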
sky/jobs/state.py CHANGED
@@ -157,7 +157,7 @@ def initialize_and_get_db() -> sqlalchemy.engine.Engine:
     if _SQLALCHEMY_ENGINE is not None:
         return _SQLALCHEMY_ENGINE
    # get an engine to the db
-    engine = migration_utils.get_engine('spot_jobs')
+    engine = db_utils.get_engine('spot_jobs')

     # run migrations if needed
     create_table(engine)
sky/models.py CHANGED
@@ -109,3 +109,4 @@ class VolumeConfig(pydantic.BaseModel):
     size: Optional[str]
     config: Dict[str, Any] = {}
     labels: Optional[Dict[str, str]] = None
+    id_on_cloud: Optional[str] = None
sky/provision/kubernetes/instance.py CHANGED
@@ -1047,8 +1047,10 @@ def stop_instances(
     raise NotImplementedError()


-def _delete_services(name_prefix: str, namespace: str,
-                     context: Optional[str]) -> None:
+def _delete_services(name_prefix: str,
+                     namespace: str,
+                     context: Optional[str],
+                     skip_ssh_service: bool = False) -> None:
     """Delete services with the given name prefix.

     Args:
@@ -1057,7 +1059,9 @@ def _delete_services(name_prefix: str, namespace: str,
         context: Kubernetes context
     """
     # TODO(andy): We should use tag for the service filter.
-    for service_name in [name_prefix, f'{name_prefix}-ssh']:
+    services = ([name_prefix, f'{name_prefix}-ssh']
+                if not skip_ssh_service else [name_prefix])
+    for service_name in services:
         # Since we are not saving this lambda, it's a false positive.
         # TODO(andyl): Wait for
         # https://github.com/pylint-dev/pylint/issues/5263.
@@ -1083,6 +1087,9 @@ def _terminate_node(namespace: str,
         # Delete services for the head pod
         # services are specified in sky/templates/kubernetes-ray.yml.j2
         _delete_services(pod_name, namespace, context)
+    else:
+        # No ssh service is created for worker pods
+        _delete_services(pod_name, namespace, context, skip_ssh_service=True)

     # Note - delete pod after all other resources are deleted.
     # This is to ensure there are no leftover resources if this down is run
sky/provision/runpod/__init__.py CHANGED
@@ -9,3 +9,6 @@ from sky.provision.runpod.instance import run_instances
 from sky.provision.runpod.instance import stop_instances
 from sky.provision.runpod.instance import terminate_instances
 from sky.provision.runpod.instance import wait_instances
+from sky.provision.runpod.volume import apply_volume
+from sky.provision.runpod.volume import delete_volume
+from sky.provision.runpod.volume import get_volume_usedby
sky/provision/runpod/instance.py CHANGED
@@ -80,6 +80,21 @@ def run_instances(region: str, cluster_name_on_cloud: str,
                               created_instance_ids=[])

     created_instance_ids = []
+    volume_mounts = config.node_config.get('VolumeMounts', [])
+    network_volume_id = None
+    volume_mount_path = None
+    if volume_mounts:
+        if len(volume_mounts) > 1:
+            logger.warning(
+                f'RunPod only supports one network volume mount, '
+                f'but {len(volume_mounts)} are specified. Only the first one '
+                f'will be used.')
+        volume_mount = volume_mounts[0]
+        network_volume_id = volume_mount.get('VolumeIdOnCloud')
+        volume_mount_path = volume_mount.get('MountPath')
+        if network_volume_id is None or volume_mount_path is None:
+            raise RuntimeError(
+                'Network volume ID and mount path must be specified.')
     for _ in range(to_start_count):
         node_type = 'head' if head_instance_id is None else 'worker'
         try:
@@ -97,6 +112,8 @@ def run_instances(region: str, cluster_name_on_cloud: str,
                 bid_per_gpu=config.node_config['BidPerGPU'],
                 docker_login_config=config.provider_config.get(
                     'docker_login_config'),
+                network_volume_id=network_volume_id,
+                volume_mount_path=volume_mount_path,
             )
         except Exception as e:  # pylint: disable=broad-except
             logger.warning(f'run_instances error: {e}')
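For context, run_instances() now reads an optional 'VolumeMounts' entry from the node config and only uses the first entry. A hypothetical fragment showing the shape it expects (keys taken from the diff above; all values are placeholders):

# Hypothetical node_config fragment; only the first VolumeMounts entry is used.
node_config = {
    'BidPerGPU': 0.0,
    'VolumeMounts': [{
        'VolumeIdOnCloud': 'vol-abc123',  # RunPod network volume id (placeholder)
        'MountPath': '/workspace',        # mount point inside the pod (placeholder)
    }],
}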
sky/provision/runpod/utils.py CHANGED
@@ -263,11 +263,23 @@ def _create_template_for_docker_login(
     return login_config.format_image(image_name), create_template_resp['id']


-def launch(cluster_name: str, node_type: str, instance_type: str, region: str,
-           zone: str, disk_size: int, image_name: str,
-           ports: Optional[List[int]], public_key: str,
-           preemptible: Optional[bool], bid_per_gpu: float,
-           docker_login_config: Optional[Dict[str, str]]) -> str:
+def launch(
+    cluster_name: str,
+    node_type: str,
+    instance_type: str,
+    region: str,
+    zone: str,
+    disk_size: int,
+    image_name: str,
+    ports: Optional[List[int]],
+    public_key: str,
+    preemptible: Optional[bool],
+    bid_per_gpu: float,
+    docker_login_config: Optional[Dict[str, str]],
+    *,
+    network_volume_id: Optional[str] = None,
+    volume_mount_path: Optional[str] = None,
+) -> str:
     """Launches an instance with the given parameters.

     For CPU instances, we directly use the instance_type for launching the
@@ -337,6 +349,12 @@ def launch(cluster_name: str, node_type: str, instance_type: str, region: str,
         'template_id': template_id,
     }

+    # Optional network volume mount.
+    if volume_mount_path is not None:
+        params['volume_mount_path'] = volume_mount_path
+    if network_volume_id is not None:
+        params['network_volume_id'] = network_volume_id
+
     # GPU instance types start with f'{gpu_count}x',
     # CPU instance types start with 'cpu'.
     is_cpu_instance = instance_type.startswith('cpu')
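The two new volume arguments are keyword-only with None defaults, so existing callers are unaffected. An illustrative call with placeholder values (these are not defaults used by SkyPilot):

# Illustrative call only; every argument value is a placeholder.
instance_id = launch(
    cluster_name='my-cluster',
    node_type='head',
    instance_type='1x_NVIDIA_A100',   # GPU types start with f'{gpu_count}x'
    region='US',
    zone='US-KS-2',
    disk_size=100,
    image_name='runpod/base:0.0.2',
    ports=None,
    public_key='ssh-ed25519 AAAA...',
    preemptible=False,
    bid_per_gpu=0.0,
    docker_login_config=None,
    # New keyword-only parameters for attaching a network volume.
    network_volume_id='vol-abc123',
    volume_mount_path='/workspace',
)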
sky/provision/runpod/volume.py ADDED
@@ -0,0 +1,158 @@
+"""RunPod network volume provisioning."""
+from typing import Any, Dict, List, Optional, Tuple
+
+from sky import global_user_state
+from sky import models
+from sky import sky_logging
+from sky.adaptors import runpod
+from sky.utils import common_utils
+from sky.utils import volume as volume_lib
+
+logger = sky_logging.init_logger(__name__)
+
+
+def _list_volumes() -> List[Dict[str, Any]]:
+    # GET /v1/networkvolumes returns a list
+    result = runpod.rest_request('GET', '/networkvolumes')
+    if isinstance(result, list):
+        return result
+    # Some deployments may wrap the list.
+    if isinstance(result, dict):
+        for key in ('items', 'data', 'networkVolumes'):
+            if key in result and isinstance(result[key], list):
+                return result[key]
+    return []
+
+
+def apply_volume(config: models.VolumeConfig) -> models.VolumeConfig:
+    """Create or resolve a RunPod network volume via REST API.
+
+    If a volume with the same `name_on_cloud` exists, reuse it. Otherwise,
+    create a new one using POST /v1/networkvolumes.
+    """
+    name_on_cloud = config.name_on_cloud
+    assert name_on_cloud is not None
+
+    vol_id = _try_resolve_volume_id(name_on_cloud)
+    if vol_id is None:
+        # Create new volume via REST
+        size = config.size
+        if size is None:
+            raise RuntimeError(
+                'RunPod network volume size must be specified to create '
+                'a volume.')
+        try:
+            size_int = int(size)
+            if size_int < volume_lib.MIN_RUNPOD_NETWORK_VOLUME_SIZE_GB:
+                raise RuntimeError(
+                    f'RunPod network volume size must be at least '
+                    f'{volume_lib.MIN_RUNPOD_NETWORK_VOLUME_SIZE_GB}GB.')
+        except Exception as e:  # pylint: disable=broad-except
+            raise RuntimeError(f'Invalid volume size {size!r}: {e}') from e
+        data_center_id = config.zone
+        if not data_center_id:
+            raise RuntimeError(
+                'RunPod DataCenterId is required to create a network '
+                'volume. Set the zone in the infra field.')
+        payload = {
+            'dataCenterId': data_center_id,
+            'name': name_on_cloud,
+            'size': size_int,
+        }
+        resp = runpod.rest_request('POST', '/networkvolumes', json=payload)
+        if isinstance(resp, dict):
+            config.id_on_cloud = resp.get('id')
+        else:
+            raise RuntimeError(
+                f'Failed to create RunPod network volume: {resp}')
+        logger.info(f'Created RunPod network volume {name_on_cloud} '
+                    f'(id={config.id_on_cloud})')
+        return config
+
+    # Use existing matched volume
+    config.id_on_cloud = vol_id
+    logger.debug(f'Using existing RunPod network volume {name_on_cloud} '
+                 f'(id={config.id_on_cloud})')
+    return config
+
+
+def delete_volume(config: models.VolumeConfig) -> models.VolumeConfig:
+    """Deletes a RunPod network volume via REST API if id is known or
+    resolvable. If the volume id is not known, try to resolve it by name.
+    """
+    name_on_cloud = config.name_on_cloud
+    vol_id = config.id_on_cloud
+    if not vol_id:
+        vol_id = _try_resolve_volume_id(name_on_cloud)
+    if not vol_id:
+        logger.warning(
+            f'RunPod network volume id not found for {name_on_cloud}; '
+            f'skip delete')
+        return config
+    runpod.rest_request('DELETE', f'/networkvolumes/{vol_id}')
+    logger.info(f'Deleted RunPod network volume {name_on_cloud} '
+                f'(id={vol_id})')
+    return config
+
+
+def _try_resolve_volume_id(name_on_cloud: str) -> Optional[str]:
+    vols = _list_volumes()
+    matched = next((v for v in vols if v.get('name') == name_on_cloud), None)
+    if matched is not None:
+        return matched.get('id')
+    return None
+
+
+def get_volume_usedby(
+        config: models.VolumeConfig,) -> Tuple[List[str], List[str]]:
+    """Gets the clusters currently using this RunPod network volume.
+
+    Returns:
+        (usedby_pods, usedby_clusters)
+        usedby_clusters contains SkyPilot cluster display names inferred from
+        pod names, which may be wrong.
+    """
+    vol_id = config.id_on_cloud
+    name_on_cloud = config.name_on_cloud
+    if vol_id is None:
+        vol_id = _try_resolve_volume_id(name_on_cloud)
+    if vol_id is None:
+        return [], []
+
+    # Query all pods for current user and filter by networkVolumeId
+    query = """
+    query Pods {
+        myself {
+            pods {
+                id
+                name
+                networkVolumeId
+            }
+        }
+    }
+    """
+    resp = runpod.runpod.api.graphql.run_graphql_query(query)
+    pods = resp.get('data', {}).get('myself', {}).get('pods', [])
+    used_pods = [p for p in pods if p.get('networkVolumeId') == vol_id]
+    usedby_pod_names = [p.get('name') for p in used_pods if p.get('name')]
+
+    # Map pod names back to SkyPilot cluster names using heuristics.
+    clusters = global_user_state.get_clusters()
+    cluster_names: List[str] = []
+    user_hash = common_utils.get_user_hash()
+    for pod_name in usedby_pod_names:
+        matched = None
+        for c in clusters:
+            display = c.get('name')
+            if not display:
+                continue
+            # Heuristic: RunPod pod name is f"{cluster}-{user_hash}-{xxx}"
+            # This can be wrong.
+            cluster_prefix = display + '-' + user_hash + '-'
+            if pod_name.startswith(cluster_prefix):
+                matched = display
+                break
+        if matched and matched not in cluster_names:
+            cluster_names.append(matched)
+
+    return usedby_pod_names, cluster_names
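For reference, a sketch of the data this module exchanges with the /v1/networkvolumes REST endpoints, as implied by the code above; names, IDs, and the data center are placeholders:

# Response shapes tolerated by _list_volumes(): a bare list, or a wrapped dict.
bare_list = [{'id': 'vol-abc123', 'name': 'sky-volume-xyz'}]
wrapped = {'networkVolumes': [{'id': 'vol-abc123', 'name': 'sky-volume-xyz'}]}

# Creation payload sent by apply_volume() when no volume with the same
# name_on_cloud already exists.
payload = {
    'dataCenterId': 'US-KS-2',   # from config.zone (placeholder data center)
    'name': 'sky-volume-xyz',    # config.name_on_cloud (placeholder)
    'size': 100,                 # GB, checked against the minimum volume size
}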
sky/serve/serve_state.py CHANGED
@@ -130,7 +130,7 @@ def initialize_and_get_db() -> sqlalchemy.engine.Engine:
     if _SQLALCHEMY_ENGINE is not None:
         return _SQLALCHEMY_ENGINE
     # get an engine to the db
-    engine = migration_utils.get_engine('serve/services')
+    engine = db_utils.get_engine('serve/services')

     # run migrations if needed
     create_table(engine)
sky/server/config.py CHANGED
@@ -2,6 +2,7 @@

 import dataclasses
 import enum
+from typing import Optional

 from sky import sky_logging
 from sky.server import constants as server_constants
@@ -61,6 +62,7 @@ class QueueBackend(enum.Enum):
 class WorkerConfig:
     garanteed_parallelism: int
     burstable_parallelism: int
+    num_db_connections_per_worker: int


 @dataclasses.dataclass
@@ -68,10 +70,13 @@ class ServerConfig:
     num_server_workers: int
     long_worker_config: WorkerConfig
     short_worker_config: WorkerConfig
+    num_db_connections_per_worker: int
     queue_backend: QueueBackend


-def compute_server_config(deploy: bool) -> ServerConfig:
+def compute_server_config(deploy: bool,
+                          max_db_connections: Optional[int] = None
+                         ) -> ServerConfig:
     """Compute the server config based on environment.

     We have different assumptions for the resources in different deployment
@@ -114,7 +119,17 @@ def compute_server_config(deploy: bool) -> ServerConfig:
     queue_backend = QueueBackend.MULTIPROCESSING
     burstable_parallel_for_long = 0
     burstable_parallel_for_short = 0
+    # if num_db_connections_per_worker is 0, server will use NullPool
+    # to conserve the number of concurrent db connections.
+    # This could lead to performance degradation.
+    num_db_connections_per_worker = 0
     num_server_workers = cpu_count
+
+    # +1 for the event loop running the main process
+    # and gc daemons in the '__main__' body of sky/server/server.py
+    max_parallel_all_workers = (max_parallel_for_long + max_parallel_for_short +
+                                num_server_workers + 1)
+
     if not deploy:
         # For local mode, use local queue backend since we only run 1 uvicorn
         # worker in local mode and no multiprocessing is needed.
@@ -140,6 +155,16 @@ def compute_server_config(deploy: bool) -> ServerConfig:
            'SkyPilot API server will run in low resource mode because '
            'the available memory is less than '
            f'{server_constants.MIN_AVAIL_MEM_GB}GB.')
+    elif max_db_connections is not None:
+        if max_parallel_all_workers > max_db_connections:
+            logger.warning(
+                f'Max parallel all workers ({max_parallel_all_workers}) '
+                f'is greater than max db connections ({max_db_connections}). '
+                'Increase the number of max db connections to '
+                f'at least {max_parallel_all_workers} for optimal performance.')
+        else:
+            num_db_connections_per_worker = 1
+
     logger.info(
         f'SkyPilot API server will start {num_server_workers} server processes '
         f'with {max_parallel_for_long} background workers for long requests '
@@ -150,10 +175,13 @@ def compute_server_config(deploy: bool) -> ServerConfig:
         queue_backend=queue_backend,
         long_worker_config=WorkerConfig(
             garanteed_parallelism=max_parallel_for_long,
-            burstable_parallelism=burstable_parallel_for_long),
+            burstable_parallelism=burstable_parallel_for_long,
+            num_db_connections_per_worker=num_db_connections_per_worker),
         short_worker_config=WorkerConfig(
             garanteed_parallelism=max_parallel_for_short,
-            burstable_parallelism=burstable_parallel_for_short),
+            burstable_parallelism=burstable_parallel_for_short,
+            num_db_connections_per_worker=num_db_connections_per_worker),
+        num_db_connections_per_worker=num_db_connections_per_worker,
     )

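A worked example of the new connection-budget check above (numbers are illustrative, not SkyPilot defaults):

# Illustrative numbers only.
max_parallel_for_long = 4
max_parallel_for_short = 8
num_server_workers = 2
# +1 accounts for the event loop / gc daemons in the main server process.
max_parallel_all_workers = (max_parallel_for_long + max_parallel_for_short +
                            num_server_workers + 1)  # 15
max_db_connections = 100
# 15 <= 100, so each worker gets a pooled connection
# (num_db_connections_per_worker = 1); otherwise it stays 0 and workers fall
# back to NullPool, which the code comments warn can degrade performance.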
sky/server/requests/executor.py CHANGED
@@ -57,6 +57,7 @@ from sky.utils import subprocess_utils
 from sky.utils import tempstore
 from sky.utils import timeline
 from sky.utils import yaml_utils
+from sky.utils.db import db_utils
 from sky.workspaces import core as workspaces_core

 if typing.TYPE_CHECKING:
@@ -152,6 +153,8 @@ class RequestWorker:
         self.schedule_type = schedule_type
         self.garanteed_parallelism = config.garanteed_parallelism
         self.burstable_parallelism = config.burstable_parallelism
+        self.num_db_connections_per_worker = (
+            config.num_db_connections_per_worker)
         self._thread: Optional[threading.Thread] = None
         self._cancel_event = threading.Event()

@@ -190,8 +193,9 @@ class RequestWorker:
         # multiple requests can share the same process pid, which may cause
         # issues with SkyPilot core functions if they rely on the exit of
         # the process, such as subprocess_daemon.py.
-        fut = executor.submit_until_success(_request_execution_wrapper,
-                                            request_id, ignore_return_value)
+        fut = executor.submit_until_success(
+            _request_execution_wrapper, request_id, ignore_return_value,
+            self.num_db_connections_per_worker)
         # Monitor the result of the request execution.
         threading.Thread(target=self.handle_task_result,
                          args=(fut, request_element),
@@ -351,7 +355,8 @@ def _sigterm_handler(signum: int, frame: Optional['types.FrameType']) -> None:


 def _request_execution_wrapper(request_id: str,
-                               ignore_return_value: bool) -> None:
+                               ignore_return_value: bool,
+                               num_db_connections_per_worker: int = 0) -> None:
     """Wrapper for a request execution.

     It wraps the execution of a request to:
@@ -362,6 +367,7 @@ def _request_execution_wrapper(request_id: str,
     4. Handle the SIGTERM signal to abort the request gracefully.
     5. Maintain the lifecycle of the temp dir used by the request.
     """
+    db_utils.set_max_connections(num_db_connections_per_worker)
     # Handle the SIGTERM signal to abort the request processing gracefully.
     signal.signal(signal.SIGTERM, _sigterm_handler)

sky/server/requests/payloads.py CHANGED
@@ -309,7 +309,8 @@ class StatusBody(RequestBody):
     cluster_names: Optional[List[str]] = None
     refresh: common_lib.StatusRefreshMode = common_lib.StatusRefreshMode.NONE
     all_users: bool = True
-    include_credentials: bool = False
+    # TODO (kyuds): default to False post 0.10.5
+    include_credentials: bool = True


 class StartBody(RequestBody):
@@ -464,6 +465,11 @@ class VolumeDeleteBody(RequestBody):
     names: List[str]


+class VolumeListBody(RequestBody):
+    """The request body for the volume list endpoint."""
+    pass
+
+
 class EndpointsBody(RequestBody):
     """The request body for the endpoint."""
     cluster: str
sky/server/requests/preconditions.py CHANGED
@@ -162,13 +162,14 @@ class ClusterStartCompletePrecondition(Precondition):
         # We unify these situations into a single state: the process of starting
         # the cluster is done (either normally or abnormally) but cluster is not
         # in UP status.
-        requests = api_requests.get_request_tasks(
-            status=[
-                api_requests.RequestStatus.RUNNING,
-                api_requests.RequestStatus.PENDING
-            ],
-            include_request_names=['sky.launch', 'sky.start'],
-            cluster_names=[self.cluster_name])
+        requests = await api_requests.get_request_tasks_async(
+            req_filter=api_requests.RequestTaskFilter(
+                status=[
+                    api_requests.RequestStatus.RUNNING,
+                    api_requests.RequestStatus.PENDING
+                ],
+                include_request_names=['sky.launch', 'sky.start'],
+                cluster_names=[self.cluster_name]))
         if len(requests) == 0:
             # No running or pending tasks, the start process is done.
             return True, None