skypilot-nightly 1.0.0.dev20250902__py3-none-any.whl → 1.0.0.dev20250903__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of skypilot-nightly might be problematic. Click here for more details.

Files changed (59)
  1. sky/__init__.py +2 -2
  2. sky/adaptors/runpod.py +68 -0
  3. sky/backends/backend_utils.py +5 -3
  4. sky/client/cli/command.py +20 -5
  5. sky/clouds/kubernetes.py +1 -1
  6. sky/clouds/runpod.py +17 -0
  7. sky/dashboard/out/404.html +1 -1
  8. sky/dashboard/out/_next/static/chunks/1121-ec35954c8cbea535.js +1 -0
  9. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-b77360a343d48902.js +16 -0
  10. sky/dashboard/out/_next/static/chunks/{webpack-0eaa6f7e63f51311.js → webpack-60556df644cd5d71.js} +1 -1
  11. sky/dashboard/out/_next/static/{tio0QibqY2C0F2-rPy00p → yLz6EPhW_XXmnNs1I6dmS}/_buildManifest.js +1 -1
  12. sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
  13. sky/dashboard/out/clusters/[cluster].html +1 -1
  14. sky/dashboard/out/clusters.html +1 -1
  15. sky/dashboard/out/config.html +1 -1
  16. sky/dashboard/out/index.html +1 -1
  17. sky/dashboard/out/infra/[context].html +1 -1
  18. sky/dashboard/out/infra.html +1 -1
  19. sky/dashboard/out/jobs/[job].html +1 -1
  20. sky/dashboard/out/jobs/pools/[pool].html +1 -1
  21. sky/dashboard/out/jobs.html +1 -1
  22. sky/dashboard/out/users.html +1 -1
  23. sky/dashboard/out/volumes.html +1 -1
  24. sky/dashboard/out/workspace/new.html +1 -1
  25. sky/dashboard/out/workspaces/[name].html +1 -1
  26. sky/dashboard/out/workspaces.html +1 -1
  27. sky/global_user_state.py +5 -2
  28. sky/models.py +1 -0
  29. sky/provision/runpod/__init__.py +3 -0
  30. sky/provision/runpod/instance.py +17 -0
  31. sky/provision/runpod/utils.py +23 -5
  32. sky/provision/runpod/volume.py +158 -0
  33. sky/server/requests/payloads.py +7 -1
  34. sky/server/requests/preconditions.py +8 -7
  35. sky/server/requests/requests.py +123 -57
  36. sky/server/server.py +32 -25
  37. sky/server/stream_utils.py +14 -6
  38. sky/server/uvicorn.py +2 -1
  39. sky/templates/kubernetes-ray.yml.j2 +5 -5
  40. sky/templates/runpod-ray.yml.j2 +8 -0
  41. sky/utils/benchmark_utils.py +60 -0
  42. sky/utils/command_runner.py +4 -0
  43. sky/utils/db/migration_utils.py +20 -4
  44. sky/utils/resource_checker.py +6 -5
  45. sky/utils/schemas.py +1 -1
  46. sky/utils/volume.py +3 -0
  47. sky/volumes/client/sdk.py +28 -0
  48. sky/volumes/server/server.py +11 -1
  49. sky/volumes/utils.py +117 -68
  50. sky/volumes/volume.py +98 -39
  51. {skypilot_nightly-1.0.0.dev20250902.dist-info → skypilot_nightly-1.0.0.dev20250903.dist-info}/METADATA +34 -34
  52. {skypilot_nightly-1.0.0.dev20250902.dist-info → skypilot_nightly-1.0.0.dev20250903.dist-info}/RECORD +57 -55
  53. sky/dashboard/out/_next/static/chunks/1121-8afcf719ea87debc.js +0 -1
  54. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-06afb50d25f7c61f.js +0 -16
  55. /sky/dashboard/out/_next/static/{tio0QibqY2C0F2-rPy00p → yLz6EPhW_XXmnNs1I6dmS}/_ssgManifest.js +0 -0
  56. {skypilot_nightly-1.0.0.dev20250902.dist-info → skypilot_nightly-1.0.0.dev20250903.dist-info}/WHEEL +0 -0
  57. {skypilot_nightly-1.0.0.dev20250902.dist-info → skypilot_nightly-1.0.0.dev20250903.dist-info}/entry_points.txt +0 -0
  58. {skypilot_nightly-1.0.0.dev20250902.dist-info → skypilot_nightly-1.0.0.dev20250903.dist-info}/licenses/LICENSE +0 -0
  59. {skypilot_nightly-1.0.0.dev20250902.dist-info → skypilot_nightly-1.0.0.dev20250903.dist-info}/top_level.txt +0 -0
sky/dashboard/out/workspaces/[name].html CHANGED
@@ -1 +1 @@
1
- <!DOCTYPE html><html><head><meta charSet="utf-8"/><meta name="viewport" content="width=device-width"/><meta name="next-head-count" content="2"/><link rel="preload" href="/dashboard/_next/static/css/4614e06482d7309e.css" as="style"/><link rel="stylesheet" href="/dashboard/_next/static/css/4614e06482d7309e.css" data-n-g=""/><noscript data-n-css=""></noscript><script defer="" nomodule="" src="/dashboard/_next/static/chunks/polyfills-78c92fac7aa8fdd8.js"></script><script src="/dashboard/_next/static/chunks/webpack-0eaa6f7e63f51311.js" defer=""></script><script src="/dashboard/_next/static/chunks/framework-cf60a09ccd051a10.js" defer=""></script><script src="/dashboard/_next/static/chunks/main-f15ccb73239a3bf1.js" defer=""></script><script src="/dashboard/_next/static/chunks/pages/_app-ce361c6959bc2001.js" defer=""></script><script src="/dashboard/_next/static/chunks/616-3d59f75e2ccf9321.js" defer=""></script><script src="/dashboard/_next/static/chunks/6130-2be46d70a38f1e82.js" defer=""></script><script src="/dashboard/_next/static/chunks/5739-d67458fcb1386c92.js" defer=""></script><script src="/dashboard/_next/static/chunks/7411-b15471acd2cba716.js" defer=""></script><script src="/dashboard/_next/static/chunks/1272-1ef0bf0237faccdb.js" defer=""></script><script src="/dashboard/_next/static/chunks/7205-88191679e7988c57.js" defer=""></script><script src="/dashboard/_next/static/chunks/6989-01359c57e018caa4.js" defer=""></script><script src="/dashboard/_next/static/chunks/3850-ff4a9a69d978632b.js" defer=""></script><script src="/dashboard/_next/static/chunks/8969-4a6f1a928fb6d370.js" defer=""></script><script src="/dashboard/_next/static/chunks/6990-08b2a1cae076a943.js" defer=""></script><script src="/dashboard/_next/static/chunks/6135-4b4d5e824b7f9d3c.js" defer=""></script><script src="/dashboard/_next/static/chunks/1121-8afcf719ea87debc.js" defer=""></script><script src="/dashboard/_next/static/chunks/6601-06114c982db410b6.js" defer=""></script><script 
src="/dashboard/_next/static/chunks/3015-8089ed1e0b7e37fd.js" defer=""></script><script src="/dashboard/_next/static/chunks/1141-943efc7aff0f0c06.js" defer=""></script><script src="/dashboard/_next/static/chunks/pages/workspaces/%5Bname%5D-de06e613e20bc977.js" defer=""></script><script src="/dashboard/_next/static/tio0QibqY2C0F2-rPy00p/_buildManifest.js" defer=""></script><script src="/dashboard/_next/static/tio0QibqY2C0F2-rPy00p/_ssgManifest.js" defer=""></script></head><body><div id="__next"></div><script id="__NEXT_DATA__" type="application/json">{"props":{"pageProps":{}},"page":"/workspaces/[name]","query":{},"buildId":"tio0QibqY2C0F2-rPy00p","assetPrefix":"/dashboard","nextExport":true,"autoExport":true,"isFallback":false,"scriptLoader":[]}</script></body></html>
1
+ <!DOCTYPE html><html><head><meta charSet="utf-8"/><meta name="viewport" content="width=device-width"/><meta name="next-head-count" content="2"/><link rel="preload" href="/dashboard/_next/static/css/4614e06482d7309e.css" as="style"/><link rel="stylesheet" href="/dashboard/_next/static/css/4614e06482d7309e.css" data-n-g=""/><noscript data-n-css=""></noscript><script defer="" nomodule="" src="/dashboard/_next/static/chunks/polyfills-78c92fac7aa8fdd8.js"></script><script src="/dashboard/_next/static/chunks/webpack-60556df644cd5d71.js" defer=""></script><script src="/dashboard/_next/static/chunks/framework-cf60a09ccd051a10.js" defer=""></script><script src="/dashboard/_next/static/chunks/main-f15ccb73239a3bf1.js" defer=""></script><script src="/dashboard/_next/static/chunks/pages/_app-ce361c6959bc2001.js" defer=""></script><script src="/dashboard/_next/static/chunks/616-3d59f75e2ccf9321.js" defer=""></script><script src="/dashboard/_next/static/chunks/6130-2be46d70a38f1e82.js" defer=""></script><script src="/dashboard/_next/static/chunks/5739-d67458fcb1386c92.js" defer=""></script><script src="/dashboard/_next/static/chunks/7411-b15471acd2cba716.js" defer=""></script><script src="/dashboard/_next/static/chunks/1272-1ef0bf0237faccdb.js" defer=""></script><script src="/dashboard/_next/static/chunks/7205-88191679e7988c57.js" defer=""></script><script src="/dashboard/_next/static/chunks/6989-01359c57e018caa4.js" defer=""></script><script src="/dashboard/_next/static/chunks/3850-ff4a9a69d978632b.js" defer=""></script><script src="/dashboard/_next/static/chunks/8969-4a6f1a928fb6d370.js" defer=""></script><script src="/dashboard/_next/static/chunks/6990-08b2a1cae076a943.js" defer=""></script><script src="/dashboard/_next/static/chunks/6135-4b4d5e824b7f9d3c.js" defer=""></script><script src="/dashboard/_next/static/chunks/1121-ec35954c8cbea535.js" defer=""></script><script src="/dashboard/_next/static/chunks/6601-06114c982db410b6.js" defer=""></script><script 
src="/dashboard/_next/static/chunks/3015-8089ed1e0b7e37fd.js" defer=""></script><script src="/dashboard/_next/static/chunks/1141-943efc7aff0f0c06.js" defer=""></script><script src="/dashboard/_next/static/chunks/pages/workspaces/%5Bname%5D-de06e613e20bc977.js" defer=""></script><script src="/dashboard/_next/static/yLz6EPhW_XXmnNs1I6dmS/_buildManifest.js" defer=""></script><script src="/dashboard/_next/static/yLz6EPhW_XXmnNs1I6dmS/_ssgManifest.js" defer=""></script></head><body><div id="__next"></div><script id="__NEXT_DATA__" type="application/json">{"props":{"pageProps":{}},"page":"/workspaces/[name]","query":{},"buildId":"yLz6EPhW_XXmnNs1I6dmS","assetPrefix":"/dashboard","nextExport":true,"autoExport":true,"isFallback":false,"scriptLoader":[]}</script></body></html>
sky/dashboard/out/workspaces.html CHANGED
@@ -1 +1 @@
1
- <!DOCTYPE html><html><head><meta charSet="utf-8"/><meta name="viewport" content="width=device-width"/><meta name="next-head-count" content="2"/><link rel="preload" href="/dashboard/_next/static/css/4614e06482d7309e.css" as="style"/><link rel="stylesheet" href="/dashboard/_next/static/css/4614e06482d7309e.css" data-n-g=""/><noscript data-n-css=""></noscript><script defer="" nomodule="" src="/dashboard/_next/static/chunks/polyfills-78c92fac7aa8fdd8.js"></script><script src="/dashboard/_next/static/chunks/webpack-0eaa6f7e63f51311.js" defer=""></script><script src="/dashboard/_next/static/chunks/framework-cf60a09ccd051a10.js" defer=""></script><script src="/dashboard/_next/static/chunks/main-f15ccb73239a3bf1.js" defer=""></script><script src="/dashboard/_next/static/chunks/pages/_app-ce361c6959bc2001.js" defer=""></script><script src="/dashboard/_next/static/chunks/pages/workspaces-be35b22e2046564c.js" defer=""></script><script src="/dashboard/_next/static/tio0QibqY2C0F2-rPy00p/_buildManifest.js" defer=""></script><script src="/dashboard/_next/static/tio0QibqY2C0F2-rPy00p/_ssgManifest.js" defer=""></script></head><body><div id="__next"></div><script id="__NEXT_DATA__" type="application/json">{"props":{"pageProps":{}},"page":"/workspaces","query":{},"buildId":"tio0QibqY2C0F2-rPy00p","assetPrefix":"/dashboard","nextExport":true,"autoExport":true,"isFallback":false,"scriptLoader":[]}</script></body></html>
1
+ <!DOCTYPE html><html><head><meta charSet="utf-8"/><meta name="viewport" content="width=device-width"/><meta name="next-head-count" content="2"/><link rel="preload" href="/dashboard/_next/static/css/4614e06482d7309e.css" as="style"/><link rel="stylesheet" href="/dashboard/_next/static/css/4614e06482d7309e.css" data-n-g=""/><noscript data-n-css=""></noscript><script defer="" nomodule="" src="/dashboard/_next/static/chunks/polyfills-78c92fac7aa8fdd8.js"></script><script src="/dashboard/_next/static/chunks/webpack-60556df644cd5d71.js" defer=""></script><script src="/dashboard/_next/static/chunks/framework-cf60a09ccd051a10.js" defer=""></script><script src="/dashboard/_next/static/chunks/main-f15ccb73239a3bf1.js" defer=""></script><script src="/dashboard/_next/static/chunks/pages/_app-ce361c6959bc2001.js" defer=""></script><script src="/dashboard/_next/static/chunks/pages/workspaces-be35b22e2046564c.js" defer=""></script><script src="/dashboard/_next/static/yLz6EPhW_XXmnNs1I6dmS/_buildManifest.js" defer=""></script><script src="/dashboard/_next/static/yLz6EPhW_XXmnNs1I6dmS/_ssgManifest.js" defer=""></script></head><body><div id="__next"></div><script id="__NEXT_DATA__" type="application/json">{"props":{"pageProps":{}},"page":"/workspaces","query":{},"buildId":"yLz6EPhW_XXmnNs1I6dmS","assetPrefix":"/dashboard","nextExport":true,"autoExport":true,"isFallback":false,"scriptLoader":[]}</script></body></html>
sky/global_user_state.py CHANGED
@@ -299,7 +299,9 @@ def create_table(engine: sqlalchemy.engine.Engine):
299
299
  # a session has already been created with _SQLALCHEMY_ENGINE = e1,
300
300
  # and then another thread overwrites _SQLALCHEMY_ENGINE = e2
301
301
  # which could result in e1 being garbage collected unexpectedly.
302
- def initialize_and_get_db() -> sqlalchemy.engine.Engine:
302
+ def initialize_and_get_db(
303
+ pg_pool_class: Optional[sqlalchemy.pool.Pool] = None
304
+ ) -> sqlalchemy.engine.Engine:
303
305
  global _SQLALCHEMY_ENGINE
304
306
 
305
307
  if _SQLALCHEMY_ENGINE is not None:
@@ -308,7 +310,8 @@ def initialize_and_get_db() -> sqlalchemy.engine.Engine:
308
310
  if _SQLALCHEMY_ENGINE is not None:
309
311
  return _SQLALCHEMY_ENGINE
310
312
  # get an engine to the db
311
- engine = migration_utils.get_engine('state')
313
+ engine = migration_utils.get_engine('state',
314
+ pg_pool_class=pg_pool_class)
312
315
 
313
316
  # run migrations if needed
314
317
  create_table(engine)
sky/models.py CHANGED
@@ -109,3 +109,4 @@ class VolumeConfig(pydantic.BaseModel):
109
109
  size: Optional[str]
110
110
  config: Dict[str, Any] = {}
111
111
  labels: Optional[Dict[str, str]] = None
112
+ id_on_cloud: Optional[str] = None
sky/provision/runpod/__init__.py CHANGED
@@ -9,3 +9,6 @@ from sky.provision.runpod.instance import run_instances
9
9
  from sky.provision.runpod.instance import stop_instances
10
10
  from sky.provision.runpod.instance import terminate_instances
11
11
  from sky.provision.runpod.instance import wait_instances
12
+ from sky.provision.runpod.volume import apply_volume
13
+ from sky.provision.runpod.volume import delete_volume
14
+ from sky.provision.runpod.volume import get_volume_usedby
sky/provision/runpod/instance.py CHANGED
@@ -80,6 +80,21 @@ def run_instances(region: str, cluster_name_on_cloud: str,
80
80
  created_instance_ids=[])
81
81
 
82
82
  created_instance_ids = []
83
+ volume_mounts = config.node_config.get('VolumeMounts', [])
84
+ network_volume_id = None
85
+ volume_mount_path = None
86
+ if volume_mounts:
87
+ if len(volume_mounts) > 1:
88
+ logger.warning(
89
+ f'RunPod only supports one network volume mount, '
90
+ f'but {len(volume_mounts)} are specified. Only the first one '
91
+ f'will be used.')
92
+ volume_mount = volume_mounts[0]
93
+ network_volume_id = volume_mount.get('VolumeIdOnCloud')
94
+ volume_mount_path = volume_mount.get('MountPath')
95
+ if network_volume_id is None or volume_mount_path is None:
96
+ raise RuntimeError(
97
+ 'Network volume ID and mount path must be specified.')
83
98
  for _ in range(to_start_count):
84
99
  node_type = 'head' if head_instance_id is None else 'worker'
85
100
  try:
@@ -97,6 +112,8 @@ def run_instances(region: str, cluster_name_on_cloud: str,
97
112
  bid_per_gpu=config.node_config['BidPerGPU'],
98
113
  docker_login_config=config.provider_config.get(
99
114
  'docker_login_config'),
115
+ network_volume_id=network_volume_id,
116
+ volume_mount_path=volume_mount_path,
100
117
  )
101
118
  except Exception as e: # pylint: disable=broad-except
102
119
  logger.warning(f'run_instances error: {e}')
sky/provision/runpod/utils.py CHANGED
@@ -263,11 +263,23 @@ def _create_template_for_docker_login(
263
263
  return login_config.format_image(image_name), create_template_resp['id']
264
264
 
265
265
 
266
- def launch(cluster_name: str, node_type: str, instance_type: str, region: str,
267
- zone: str, disk_size: int, image_name: str,
268
- ports: Optional[List[int]], public_key: str,
269
- preemptible: Optional[bool], bid_per_gpu: float,
270
- docker_login_config: Optional[Dict[str, str]]) -> str:
266
+ def launch(
267
+ cluster_name: str,
268
+ node_type: str,
269
+ instance_type: str,
270
+ region: str,
271
+ zone: str,
272
+ disk_size: int,
273
+ image_name: str,
274
+ ports: Optional[List[int]],
275
+ public_key: str,
276
+ preemptible: Optional[bool],
277
+ bid_per_gpu: float,
278
+ docker_login_config: Optional[Dict[str, str]],
279
+ *,
280
+ network_volume_id: Optional[str] = None,
281
+ volume_mount_path: Optional[str] = None,
282
+ ) -> str:
271
283
  """Launches an instance with the given parameters.
272
284
 
273
285
  For CPU instances, we directly use the instance_type for launching the
@@ -337,6 +349,12 @@ def launch(cluster_name: str, node_type: str, instance_type: str, region: str,
337
349
  'template_id': template_id,
338
350
  }
339
351
 
352
+ # Optional network volume mount.
353
+ if volume_mount_path is not None:
354
+ params['volume_mount_path'] = volume_mount_path
355
+ if network_volume_id is not None:
356
+ params['network_volume_id'] = network_volume_id
357
+
340
358
  # GPU instance types start with f'{gpu_count}x',
341
359
  # CPU instance types start with 'cpu'.
342
360
  is_cpu_instance = instance_type.startswith('cpu')
@@ -0,0 +1,158 @@
1
+ """RunPod network volume provisioning."""
2
+ from typing import Any, Dict, List, Optional, Tuple
3
+
4
+ from sky import global_user_state
5
+ from sky import models
6
+ from sky import sky_logging
7
+ from sky.adaptors import runpod
8
+ from sky.utils import common_utils
9
+ from sky.utils import volume as volume_lib
10
+
11
+ logger = sky_logging.init_logger(__name__)
12
+
13
+
14
+ def _list_volumes() -> List[Dict[str, Any]]:
15
+ # GET /v1/networkvolumes returns a list
16
+ result = runpod.rest_request('GET', '/networkvolumes')
17
+ if isinstance(result, list):
18
+ return result
19
+ # Some deployments may wrap the list.
20
+ if isinstance(result, dict):
21
+ for key in ('items', 'data', 'networkVolumes'):
22
+ if key in result and isinstance(result[key], list):
23
+ return result[key]
24
+ return []
25
+
26
+
27
+ def apply_volume(config: models.VolumeConfig) -> models.VolumeConfig:
28
+ """Create or resolve a RunPod network volume via REST API.
29
+
30
+ If a volume with the same `name_on_cloud` exists, reuse it. Otherwise,
31
+ create a new one using POST /v1/networkvolumes.
32
+ """
33
+ name_on_cloud = config.name_on_cloud
34
+ assert name_on_cloud is not None
35
+
36
+ vol_id = _try_resolve_volume_id(name_on_cloud)
37
+ if vol_id is None:
38
+ # Create new volume via REST
39
+ size = config.size
40
+ if size is None:
41
+ raise RuntimeError(
42
+ 'RunPod network volume size must be specified to create '
43
+ 'a volume.')
44
+ try:
45
+ size_int = int(size)
46
+ if size_int < volume_lib.MIN_RUNPOD_NETWORK_VOLUME_SIZE_GB:
47
+ raise RuntimeError(
48
+ f'RunPod network volume size must be at least '
49
+ f'{volume_lib.MIN_RUNPOD_NETWORK_VOLUME_SIZE_GB}GB.')
50
+ except Exception as e: # pylint: disable=broad-except
51
+ raise RuntimeError(f'Invalid volume size {size!r}: {e}') from e
52
+ data_center_id = config.zone
53
+ if not data_center_id:
54
+ raise RuntimeError(
55
+ 'RunPod DataCenterId is required to create a network '
56
+ 'volume. Set the zone in the infra field.')
57
+ payload = {
58
+ 'dataCenterId': data_center_id,
59
+ 'name': name_on_cloud,
60
+ 'size': size_int,
61
+ }
62
+ resp = runpod.rest_request('POST', '/networkvolumes', json=payload)
63
+ if isinstance(resp, dict):
64
+ config.id_on_cloud = resp.get('id')
65
+ else:
66
+ raise RuntimeError(
67
+ f'Failed to create RunPod network volume: {resp}')
68
+ logger.info(f'Created RunPod network volume {name_on_cloud} '
69
+ f'(id={config.id_on_cloud})')
70
+ return config
71
+
72
+ # Use existing matched volume
73
+ config.id_on_cloud = vol_id
74
+ logger.debug(f'Using existing RunPod network volume {name_on_cloud} '
75
+ f'(id={config.id_on_cloud})')
76
+ return config
77
+
78
+
79
+ def delete_volume(config: models.VolumeConfig) -> models.VolumeConfig:
80
+ """Deletes a RunPod network volume via REST API if id is known or
81
+ resolvable. If the volume id is not known, try to resolve it by name.
82
+ """
83
+ name_on_cloud = config.name_on_cloud
84
+ vol_id = config.id_on_cloud
85
+ if not vol_id:
86
+ vol_id = _try_resolve_volume_id(name_on_cloud)
87
+ if not vol_id:
88
+ logger.warning(
89
+ f'RunPod network volume id not found for {name_on_cloud}; '
90
+ f'skip delete')
91
+ return config
92
+ runpod.rest_request('DELETE', f'/networkvolumes/{vol_id}')
93
+ logger.info(f'Deleted RunPod network volume {name_on_cloud} '
94
+ f'(id={vol_id})')
95
+ return config
96
+
97
+
98
+ def _try_resolve_volume_id(name_on_cloud: str) -> Optional[str]:
99
+ vols = _list_volumes()
100
+ matched = next((v for v in vols if v.get('name') == name_on_cloud), None)
101
+ if matched is not None:
102
+ return matched.get('id')
103
+ return None
104
+
105
+
106
+ def get_volume_usedby(
107
+ config: models.VolumeConfig,) -> Tuple[List[str], List[str]]:
108
+ """Gets the clusters currently using this RunPod network volume.
109
+
110
+ Returns:
111
+ (usedby_pods, usedby_clusters)
112
+ usedby_clusters contains SkyPilot cluster display names inferred from
113
+ pod names, which may be wrong.
114
+ """
115
+ vol_id = config.id_on_cloud
116
+ name_on_cloud = config.name_on_cloud
117
+ if vol_id is None:
118
+ vol_id = _try_resolve_volume_id(name_on_cloud)
119
+ if vol_id is None:
120
+ return [], []
121
+
122
+ # Query all pods for current user and filter by networkVolumeId
123
+ query = """
124
+ query Pods {
125
+ myself {
126
+ pods {
127
+ id
128
+ name
129
+ networkVolumeId
130
+ }
131
+ }
132
+ }
133
+ """
134
+ resp = runpod.runpod.api.graphql.run_graphql_query(query)
135
+ pods = resp.get('data', {}).get('myself', {}).get('pods', [])
136
+ used_pods = [p for p in pods if p.get('networkVolumeId') == vol_id]
137
+ usedby_pod_names = [p.get('name') for p in used_pods if p.get('name')]
138
+
139
+ # Map pod names back to SkyPilot cluster names using heuristics.
140
+ clusters = global_user_state.get_clusters()
141
+ cluster_names: List[str] = []
142
+ user_hash = common_utils.get_user_hash()
143
+ for pod_name in usedby_pod_names:
144
+ matched = None
145
+ for c in clusters:
146
+ display = c.get('name')
147
+ if not display:
148
+ continue
149
+ # Heuristic: RunPod pod name is f"{cluster}-{user_hash}-{xxx}"
150
+ # This can be wrong.
151
+ cluster_prefix = display + '-' + user_hash + '-'
152
+ if pod_name.startswith(cluster_prefix):
153
+ matched = display
154
+ break
155
+ if matched and matched not in cluster_names:
156
+ cluster_names.append(matched)
157
+
158
+ return usedby_pod_names, cluster_names
sky/server/requests/payloads.py CHANGED
@@ -309,7 +309,8 @@ class StatusBody(RequestBody):
309
309
  cluster_names: Optional[List[str]] = None
310
310
  refresh: common_lib.StatusRefreshMode = common_lib.StatusRefreshMode.NONE
311
311
  all_users: bool = True
312
- include_credentials: bool = False
312
+ # TODO (kyuds): default to False post 0.10.5
313
+ include_credentials: bool = True
313
314
 
314
315
 
315
316
  class StartBody(RequestBody):
@@ -464,6 +465,11 @@ class VolumeDeleteBody(RequestBody):
464
465
  names: List[str]
465
466
 
466
467
 
468
+ class VolumeListBody(RequestBody):
469
+ """The request body for the volume list endpoint."""
470
+ pass
471
+
472
+
467
473
  class EndpointsBody(RequestBody):
468
474
  """The request body for the endpoint."""
469
475
  cluster: str
sky/server/requests/preconditions.py CHANGED
@@ -162,13 +162,14 @@ class ClusterStartCompletePrecondition(Precondition):
162
162
  # We unify these situations into a single state: the process of starting
163
163
  # the cluster is done (either normally or abnormally) but cluster is not
164
164
  # in UP status.
165
- requests = api_requests.get_request_tasks(
166
- status=[
167
- api_requests.RequestStatus.RUNNING,
168
- api_requests.RequestStatus.PENDING
169
- ],
170
- include_request_names=['sky.launch', 'sky.start'],
171
- cluster_names=[self.cluster_name])
165
+ requests = await api_requests.get_request_tasks_async(
166
+ req_filter=api_requests.RequestTaskFilter(
167
+ status=[
168
+ api_requests.RequestStatus.RUNNING,
169
+ api_requests.RequestStatus.PENDING
170
+ ],
171
+ include_request_names=['sky.launch', 'sky.start'],
172
+ cluster_names=[self.cluster_name]))
172
173
  if len(requests) == 0:
173
174
  # No running or pending tasks, the start process is done.
174
175
  return True, None
sky/server/requests/requests.py CHANGED
@@ -14,7 +14,7 @@ import threading
14
14
  import time
15
15
  import traceback
16
16
  from typing import (Any, AsyncContextManager, Callable, Dict, Generator, List,
17
- Optional, Tuple)
17
+ NamedTuple, Optional, Tuple)
18
18
 
19
19
  import colorama
20
20
  import filelock
@@ -300,10 +300,11 @@ def kill_cluster_requests(cluster_name: str, exclude_request_name: str):
300
300
  prevent killing the caller request.
301
301
  """
302
302
  request_ids = [
303
- request_task.request_id for request_task in get_request_tasks(
303
+ request_task.request_id
304
+ for request_task in get_request_tasks(req_filter=RequestTaskFilter(
304
305
  cluster_names=[cluster_name],
305
306
  status=[RequestStatus.PENDING, RequestStatus.RUNNING],
306
- exclude_request_names=[exclude_request_name])
307
+ exclude_request_names=[exclude_request_name]))
307
308
  ]
308
309
  kill_requests(request_ids)
309
310
 
@@ -323,11 +324,12 @@ def kill_requests(request_ids: Optional[List[str]] = None,
323
324
  """
324
325
  if request_ids is None:
325
326
  request_ids = [
326
- request_task.request_id for request_task in get_request_tasks(
327
+ request_task.request_id
328
+ for request_task in get_request_tasks(req_filter=RequestTaskFilter(
327
329
  user_id=user_id,
328
330
  status=[RequestStatus.RUNNING, RequestStatus.PENDING],
329
331
  # Avoid cancelling the cancel request itself.
330
- exclude_request_names=['sky.api_cancel'])
332
+ exclude_request_names=['sky.api_cancel']))
331
333
  ]
332
334
  cancelled_request_ids = []
333
335
  for request_id in request_ids:
@@ -548,6 +550,40 @@ async def get_request_async(request_id: str) -> Optional[Request]:
548
550
  return await _get_request_no_lock_async(request_id)
549
551
 
550
552
 
553
+ class StatusWithMsg(NamedTuple):
554
+ status: RequestStatus
555
+ status_msg: Optional[str] = None
556
+
557
+
558
+ @init_db_async
559
+ @metrics_lib.time_me_async
560
+ async def get_request_status_async(
561
+ request_id: str,
562
+ include_msg: bool = False,
563
+ ) -> Optional[StatusWithMsg]:
564
+ """Get the status of a request.
565
+
566
+ Args:
567
+ request_id: The ID of the request.
568
+ include_msg: Whether to include the status message.
569
+
570
+ Returns:
571
+ The status of the request. If the request is not found, returns
572
+ None.
573
+ """
574
+ assert _DB is not None
575
+ columns = 'status'
576
+ if include_msg:
577
+ columns += ', status_msg'
578
+ sql = f'SELECT {columns} FROM {REQUEST_TABLE} WHERE request_id LIKE ?'
579
+ async with _DB.execute_fetchall_async(sql, (request_id + '%',)) as rows:
580
+ if rows is None or len(rows) == 0:
581
+ return None
582
+ status = RequestStatus(rows[0][0])
583
+ status_msg = rows[0][1] if include_msg else None
584
+ return StatusWithMsg(status, status_msg)
585
+
586
+
551
587
  @init_db
552
588
  @metrics_lib.time_me
553
589
  def create_if_not_exists(request: Request) -> bool:
@@ -570,17 +606,9 @@ async def create_if_not_exists_async(request: Request) -> bool:
570
606
  return True
571
607
 
572
608
 
573
- @init_db
574
- @metrics_lib.time_me
575
- def get_request_tasks(
576
- status: Optional[List[RequestStatus]] = None,
577
- cluster_names: Optional[List[str]] = None,
578
- user_id: Optional[str] = None,
579
- exclude_request_names: Optional[List[str]] = None,
580
- include_request_names: Optional[List[str]] = None,
581
- finished_before: Optional[float] = None,
582
- ) -> List[Request]:
583
- """Get a list of requests that match the given filters.
609
+ @dataclasses.dataclass
610
+ class RequestTaskFilter:
611
+ """Filter for requests.
584
612
 
585
613
  Args:
586
614
  status: a list of statuses of the requests to filter on.
@@ -598,51 +626,87 @@ def get_request_tasks(
598
626
  ValueError: If both exclude_request_names and include_request_names are
599
627
  provided.
600
628
  """
601
- if exclude_request_names is not None and include_request_names is not None:
602
- raise ValueError(
603
- 'Only one of exclude_request_names or include_request_names can be '
604
- 'provided, not both.')
605
-
606
- filters = []
607
- filter_params: List[Any] = []
608
- if status is not None:
609
- status_list_str = ','.join(repr(status.value) for status in status)
610
- filters.append(f'status IN ({status_list_str})')
611
- if exclude_request_names is not None:
612
- exclude_request_names_str = ','.join(
613
- repr(name) for name in exclude_request_names)
614
- filters.append(f'name NOT IN ({exclude_request_names_str})')
615
- if cluster_names is not None:
616
- cluster_names_str = ','.join(repr(name) for name in cluster_names)
617
- filters.append(f'{COL_CLUSTER_NAME} IN ({cluster_names_str})')
618
- if user_id is not None:
619
- filters.append(f'{COL_USER_ID} = ?')
620
- filter_params.append(user_id)
621
- if include_request_names is not None:
622
- request_names_str = ','.join(
623
- repr(name) for name in include_request_names)
624
- filters.append(f'name IN ({request_names_str})')
625
- if finished_before is not None:
626
- filters.append('finished_at < ?')
627
- filter_params.append(finished_before)
628
- assert _DB is not None
629
- with _DB.conn:
630
- cursor = _DB.conn.cursor()
629
+ status: Optional[List[RequestStatus]] = None
630
+ cluster_names: Optional[List[str]] = None
631
+ user_id: Optional[str] = None
632
+ exclude_request_names: Optional[List[str]] = None
633
+ include_request_names: Optional[List[str]] = None
634
+ finished_before: Optional[float] = None
635
+
636
+ def __post_init__(self):
637
+ if (self.exclude_request_names is not None and
638
+ self.include_request_names is not None):
639
+ raise ValueError(
640
+ 'Only one of exclude_request_names or include_request_names '
641
+ 'can be provided, not both.')
642
+
643
+ def build_query(self) -> Tuple[str, List[Any]]:
644
+ """Build the SQL query and filter parameters.
645
+
646
+ Returns:
647
+ A tuple of (SQL, SQL parameters).
648
+ """
649
+ filters = []
650
+ filter_params: List[Any] = []
651
+ if self.status is not None:
652
+ status_list_str = ','.join(
653
+ repr(status.value) for status in self.status)
654
+ filters.append(f'status IN ({status_list_str})')
655
+ if self.exclude_request_names is not None:
656
+ exclude_request_names_str = ','.join(
657
+ repr(name) for name in self.exclude_request_names)
658
+ filters.append(f'name NOT IN ({exclude_request_names_str})')
659
+ if self.cluster_names is not None:
660
+ cluster_names_str = ','.join(
661
+ repr(name) for name in self.cluster_names)
662
+ filters.append(f'{COL_CLUSTER_NAME} IN ({cluster_names_str})')
663
+ if self.user_id is not None:
664
+ filters.append(f'{COL_USER_ID} = ?')
665
+ filter_params.append(self.user_id)
666
+ if self.include_request_names is not None:
667
+ request_names_str = ','.join(
668
+ repr(name) for name in self.include_request_names)
669
+ filters.append(f'name IN ({request_names_str})')
670
+ if self.finished_before is not None:
671
+ filters.append('finished_at < ?')
672
+ filter_params.append(self.finished_before)
631
673
  filter_str = ' AND '.join(filters)
632
674
  if filter_str:
633
675
  filter_str = f' WHERE {filter_str}'
634
676
  columns_str = ', '.join(REQUEST_COLUMNS)
635
- cursor.execute(
636
- f'SELECT {columns_str} FROM {REQUEST_TABLE}{filter_str} '
637
- 'ORDER BY created_at DESC', filter_params)
677
+ return (f'SELECT {columns_str} FROM {REQUEST_TABLE}{filter_str} '
678
+ 'ORDER BY created_at DESC'), filter_params
679
+
680
+
681
+ @init_db
682
+ @metrics_lib.time_me
683
+ def get_request_tasks(req_filter: RequestTaskFilter) -> List[Request]:
684
+ """Get a list of requests that match the given filters.
685
+
686
+ Args:
687
+ req_filter: the filter to apply to the requests. Refer to
688
+ RequestTaskFilter for the details.
689
+ """
690
+ assert _DB is not None
691
+ with _DB.conn:
692
+ cursor = _DB.conn.cursor()
693
+ cursor.execute(*req_filter.build_query())
638
694
  rows = cursor.fetchall()
639
695
  if rows is None:
640
696
  return []
641
- requests = []
642
- for row in rows:
643
- request = Request.from_row(row)
644
- requests.append(request)
645
- return requests
697
+ return [Request.from_row(row) for row in rows]
698
+
699
+
700
+ @init_db_async
701
+ @metrics_lib.time_me_async
702
+ async def get_request_tasks_async(
703
+ req_filter: RequestTaskFilter) -> List[Request]:
704
+ """Async version of get_request_tasks."""
705
+ assert _DB is not None
706
+ async with _DB.execute_fetchall_async(*req_filter.build_query()) as rows:
707
+ if not rows:
708
+ return []
709
+ return [Request.from_row(row) for row in rows]
646
710
 
647
711
 
648
712
  @init_db_async
@@ -739,8 +803,10 @@ def clean_finished_requests_with_retention(retention_seconds: int):
739
803
  retention_seconds: Requests older than this many seconds will be
740
804
  deleted.
741
805
  """
742
- reqs = get_request_tasks(status=RequestStatus.finished_status(),
743
- finished_before=time.time() - retention_seconds)
806
+ reqs = get_request_tasks(
807
+ req_filter=RequestTaskFilter(status=RequestStatus.finished_status(),
808
+ finished_before=time.time() -
809
+ retention_seconds))
744
810
 
745
811
  subprocess_utils.run_in_parallel(
746
812
  func=lambda req: req.log_path.unlink(missing_ok=True),
@@ -767,7 +833,7 @@ async def requests_gc_daemon():
767
833
  try:
768
834
  # Negative value disables the requests GC
769
835
  if retention_seconds >= 0:
770
- clean_finished_requests_with_retention(retention_seconds)
836
+ await clean_finished_requests_with_retention(retention_seconds)
771
837
  except asyncio.CancelledError:
772
838
  logger.info('Requests GC daemon cancelled')
773
839
  break