skypilot-nightly 1.0.0.dev20251203__py3-none-any.whl → 1.0.0.dev20260112__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sky/__init__.py +6 -2
- sky/adaptors/aws.py +1 -61
- sky/adaptors/slurm.py +565 -0
- sky/backends/backend_utils.py +95 -12
- sky/backends/cloud_vm_ray_backend.py +224 -65
- sky/backends/task_codegen.py +380 -4
- sky/catalog/__init__.py +0 -3
- sky/catalog/data_fetchers/fetch_gcp.py +9 -1
- sky/catalog/data_fetchers/fetch_nebius.py +1 -1
- sky/catalog/data_fetchers/fetch_vast.py +4 -2
- sky/catalog/kubernetes_catalog.py +12 -4
- sky/catalog/seeweb_catalog.py +30 -15
- sky/catalog/shadeform_catalog.py +5 -2
- sky/catalog/slurm_catalog.py +236 -0
- sky/catalog/vast_catalog.py +30 -6
- sky/check.py +25 -11
- sky/client/cli/command.py +391 -32
- sky/client/interactive_utils.py +190 -0
- sky/client/sdk.py +64 -2
- sky/client/sdk_async.py +9 -0
- sky/clouds/__init__.py +2 -0
- sky/clouds/aws.py +60 -2
- sky/clouds/azure.py +2 -0
- sky/clouds/cloud.py +7 -0
- sky/clouds/kubernetes.py +2 -0
- sky/clouds/runpod.py +38 -7
- sky/clouds/slurm.py +610 -0
- sky/clouds/ssh.py +3 -2
- sky/clouds/vast.py +39 -16
- sky/core.py +197 -37
- sky/dashboard/out/404.html +1 -1
- sky/dashboard/out/_next/static/3nu-b8raeKRNABZ2d4GAG/_buildManifest.js +1 -0
- sky/dashboard/out/_next/static/chunks/1871-0565f8975a7dcd10.js +6 -0
- sky/dashboard/out/_next/static/chunks/2109-55a1546d793574a7.js +11 -0
- sky/dashboard/out/_next/static/chunks/2521-099b07cd9e4745bf.js +26 -0
- sky/dashboard/out/_next/static/chunks/2755.a636e04a928a700e.js +31 -0
- sky/dashboard/out/_next/static/chunks/3495.05eab4862217c1a5.js +6 -0
- sky/dashboard/out/_next/static/chunks/3785.cfc5dcc9434fd98c.js +1 -0
- sky/dashboard/out/_next/static/chunks/3850-fd5696f3bbbaddae.js +1 -0
- sky/dashboard/out/_next/static/chunks/3981.645d01bf9c8cad0c.js +21 -0
- sky/dashboard/out/_next/static/chunks/4083-0115d67c1fb57d6c.js +21 -0
- sky/dashboard/out/_next/static/chunks/{8640.5b9475a2d18c5416.js → 429.a58e9ba9742309ed.js} +2 -2
- sky/dashboard/out/_next/static/chunks/4555.8e221537181b5dc1.js +6 -0
- sky/dashboard/out/_next/static/chunks/4725.937865b81fdaaebb.js +6 -0
- sky/dashboard/out/_next/static/chunks/6082-edabd8f6092300ce.js +25 -0
- sky/dashboard/out/_next/static/chunks/6989-49cb7dca83a7a62d.js +1 -0
- sky/dashboard/out/_next/static/chunks/6990-630bd2a2257275f8.js +1 -0
- sky/dashboard/out/_next/static/chunks/7248-a99800d4db8edabd.js +1 -0
- sky/dashboard/out/_next/static/chunks/754-cfc5d4ad1b843d29.js +18 -0
- sky/dashboard/out/_next/static/chunks/8050-dd8aa107b17dce00.js +16 -0
- sky/dashboard/out/_next/static/chunks/8056-d4ae1e0cb81e7368.js +1 -0
- sky/dashboard/out/_next/static/chunks/8555.011023e296c127b3.js +6 -0
- sky/dashboard/out/_next/static/chunks/8821-93c25df904a8362b.js +1 -0
- sky/dashboard/out/_next/static/chunks/8969-0662594b69432ade.js +1 -0
- sky/dashboard/out/_next/static/chunks/9025.f15c91c97d124a5f.js +6 -0
- sky/dashboard/out/_next/static/chunks/9353-7ad6bd01858556f1.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/_app-5a86569acad99764.js +34 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-8297476714acb4ac.js +6 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-337c3ba1085f1210.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/{clusters-ee39056f9851a3ff.js → clusters-57632ff3684a8b5c.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/{config-dfb9bf07b13045f4.js → config-718cdc365de82689.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/infra/[context]-5fd3a453c079c2ea.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/infra-9f85c02c9c6cae9e.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-90f16972cbecf354.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs/pools/[pool]-2dd42fc37aad427a.js +16 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs-ed806aeace26b972.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/plugins/[...slug]-449a9f5a3bb20fb3.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/users-bec34706b36f3524.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/{volumes-b84b948ff357c43e.js → volumes-a83ba9b38dff7ea9.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/workspaces/{[name]-84a40f8c7c627fe4.js → [name]-c781e9c3e52ef9fc.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/workspaces-91e0942f47310aae.js +1 -0
- sky/dashboard/out/_next/static/chunks/webpack-cfe59cf684ee13b9.js +1 -0
- sky/dashboard/out/_next/static/css/b0dbca28f027cc19.css +3 -0
- sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
- sky/dashboard/out/clusters/[cluster].html +1 -1
- sky/dashboard/out/clusters.html +1 -1
- sky/dashboard/out/config.html +1 -1
- sky/dashboard/out/index.html +1 -1
- sky/dashboard/out/infra/[context].html +1 -1
- sky/dashboard/out/infra.html +1 -1
- sky/dashboard/out/jobs/[job].html +1 -1
- sky/dashboard/out/jobs/pools/[pool].html +1 -1
- sky/dashboard/out/jobs.html +1 -1
- sky/dashboard/out/plugins/[...slug].html +1 -0
- sky/dashboard/out/users.html +1 -1
- sky/dashboard/out/volumes.html +1 -1
- sky/dashboard/out/workspace/new.html +1 -1
- sky/dashboard/out/workspaces/[name].html +1 -1
- sky/dashboard/out/workspaces.html +1 -1
- sky/data/data_utils.py +26 -12
- sky/data/mounting_utils.py +44 -5
- sky/global_user_state.py +111 -19
- sky/jobs/client/sdk.py +8 -3
- sky/jobs/controller.py +191 -31
- sky/jobs/recovery_strategy.py +109 -11
- sky/jobs/server/core.py +81 -4
- sky/jobs/server/server.py +14 -0
- sky/jobs/state.py +417 -19
- sky/jobs/utils.py +73 -80
- sky/models.py +11 -0
- sky/optimizer.py +8 -6
- sky/provision/__init__.py +12 -9
- sky/provision/common.py +20 -0
- sky/provision/docker_utils.py +15 -2
- sky/provision/kubernetes/utils.py +163 -20
- sky/provision/kubernetes/volume.py +52 -17
- sky/provision/provisioner.py +17 -7
- sky/provision/runpod/instance.py +3 -1
- sky/provision/runpod/utils.py +13 -1
- sky/provision/runpod/volume.py +25 -9
- sky/provision/slurm/__init__.py +12 -0
- sky/provision/slurm/config.py +13 -0
- sky/provision/slurm/instance.py +618 -0
- sky/provision/slurm/utils.py +689 -0
- sky/provision/vast/instance.py +4 -1
- sky/provision/vast/utils.py +11 -6
- sky/resources.py +135 -13
- sky/schemas/api/responses.py +4 -0
- sky/schemas/db/global_user_state/010_save_ssh_key.py +1 -1
- sky/schemas/db/spot_jobs/008_add_full_resources.py +34 -0
- sky/schemas/db/spot_jobs/009_job_events.py +32 -0
- sky/schemas/db/spot_jobs/010_job_events_timestamp_with_timezone.py +43 -0
- sky/schemas/db/spot_jobs/011_add_links.py +34 -0
- sky/schemas/generated/jobsv1_pb2.py +9 -5
- sky/schemas/generated/jobsv1_pb2.pyi +12 -0
- sky/schemas/generated/jobsv1_pb2_grpc.py +44 -0
- sky/schemas/generated/managed_jobsv1_pb2.py +32 -28
- sky/schemas/generated/managed_jobsv1_pb2.pyi +11 -2
- sky/serve/serve_utils.py +232 -40
- sky/serve/server/impl.py +1 -1
- sky/server/common.py +17 -0
- sky/server/constants.py +1 -1
- sky/server/metrics.py +6 -3
- sky/server/plugins.py +238 -0
- sky/server/requests/executor.py +5 -2
- sky/server/requests/payloads.py +30 -1
- sky/server/requests/request_names.py +4 -0
- sky/server/requests/requests.py +33 -11
- sky/server/requests/serializers/encoders.py +22 -0
- sky/server/requests/serializers/return_value_serializers.py +70 -0
- sky/server/server.py +506 -109
- sky/server/server_utils.py +30 -0
- sky/server/uvicorn.py +5 -0
- sky/setup_files/MANIFEST.in +1 -0
- sky/setup_files/dependencies.py +22 -9
- sky/sky_logging.py +2 -1
- sky/skylet/attempt_skylet.py +13 -3
- sky/skylet/constants.py +55 -13
- sky/skylet/events.py +10 -4
- sky/skylet/executor/__init__.py +1 -0
- sky/skylet/executor/slurm.py +187 -0
- sky/skylet/job_lib.py +91 -5
- sky/skylet/log_lib.py +22 -6
- sky/skylet/log_lib.pyi +8 -6
- sky/skylet/services.py +18 -3
- sky/skylet/skylet.py +5 -1
- sky/skylet/subprocess_daemon.py +2 -1
- sky/ssh_node_pools/constants.py +12 -0
- sky/ssh_node_pools/core.py +40 -3
- sky/ssh_node_pools/deploy/__init__.py +4 -0
- sky/{utils/kubernetes/deploy_ssh_node_pools.py → ssh_node_pools/deploy/deploy.py} +279 -504
- sky/ssh_node_pools/deploy/tunnel/ssh-tunnel.sh +379 -0
- sky/ssh_node_pools/deploy/tunnel_utils.py +199 -0
- sky/ssh_node_pools/deploy/utils.py +173 -0
- sky/ssh_node_pools/server.py +11 -13
- sky/{utils/kubernetes/ssh_utils.py → ssh_node_pools/utils.py} +9 -6
- sky/templates/kubernetes-ray.yml.j2 +12 -6
- sky/templates/slurm-ray.yml.j2 +115 -0
- sky/templates/vast-ray.yml.j2 +1 -0
- sky/templates/websocket_proxy.py +18 -41
- sky/users/model.conf +1 -1
- sky/users/permission.py +85 -52
- sky/users/rbac.py +31 -3
- sky/utils/annotations.py +108 -8
- sky/utils/auth_utils.py +42 -0
- sky/utils/cli_utils/status_utils.py +19 -5
- sky/utils/cluster_utils.py +10 -3
- sky/utils/command_runner.py +389 -35
- sky/utils/command_runner.pyi +43 -4
- sky/utils/common_utils.py +47 -31
- sky/utils/context.py +32 -0
- sky/utils/db/db_utils.py +36 -6
- sky/utils/db/migration_utils.py +41 -21
- sky/utils/infra_utils.py +5 -1
- sky/utils/instance_links.py +139 -0
- sky/utils/interactive_utils.py +49 -0
- sky/utils/kubernetes/generate_kubeconfig.sh +42 -33
- sky/utils/kubernetes/kubernetes_deploy_utils.py +2 -94
- sky/utils/kubernetes/rsync_helper.sh +5 -1
- sky/utils/kubernetes/ssh-tunnel.sh +7 -376
- sky/utils/plugin_extensions/__init__.py +14 -0
- sky/utils/plugin_extensions/external_failure_source.py +176 -0
- sky/utils/resources_utils.py +10 -8
- sky/utils/rich_utils.py +9 -11
- sky/utils/schemas.py +93 -19
- sky/utils/status_lib.py +7 -0
- sky/utils/subprocess_utils.py +17 -0
- sky/volumes/client/sdk.py +6 -3
- sky/volumes/server/core.py +65 -27
- sky_templates/ray/start_cluster +8 -4
- {skypilot_nightly-1.0.0.dev20251203.dist-info → skypilot_nightly-1.0.0.dev20260112.dist-info}/METADATA +67 -59
- {skypilot_nightly-1.0.0.dev20251203.dist-info → skypilot_nightly-1.0.0.dev20260112.dist-info}/RECORD +208 -180
- sky/dashboard/out/_next/static/96_E2yl3QAiIJGOYCkSpB/_buildManifest.js +0 -1
- sky/dashboard/out/_next/static/chunks/1141-e6aa9ab418717c59.js +0 -11
- sky/dashboard/out/_next/static/chunks/1871-7e202677c42f43fe.js +0 -6
- sky/dashboard/out/_next/static/chunks/2260-7703229c33c5ebd5.js +0 -1
- sky/dashboard/out/_next/static/chunks/2350.fab69e61bac57b23.js +0 -1
- sky/dashboard/out/_next/static/chunks/2369.fc20f0c2c8ed9fe7.js +0 -15
- sky/dashboard/out/_next/static/chunks/2755.edd818326d489a1d.js +0 -26
- sky/dashboard/out/_next/static/chunks/3294.20a8540fe697d5ee.js +0 -1
- sky/dashboard/out/_next/static/chunks/3785.7e245f318f9d1121.js +0 -1
- sky/dashboard/out/_next/static/chunks/3800-7b45f9fbb6308557.js +0 -1
- sky/dashboard/out/_next/static/chunks/3850-ff4a9a69d978632b.js +0 -1
- sky/dashboard/out/_next/static/chunks/4725.172ede95d1b21022.js +0 -1
- sky/dashboard/out/_next/static/chunks/4937.a2baa2df5572a276.js +0 -15
- sky/dashboard/out/_next/static/chunks/6212-7bd06f60ba693125.js +0 -13
- sky/dashboard/out/_next/static/chunks/6856-8f27d1c10c98def8.js +0 -1
- sky/dashboard/out/_next/static/chunks/6989-01359c57e018caa4.js +0 -1
- sky/dashboard/out/_next/static/chunks/6990-9146207c4567fdfd.js +0 -1
- sky/dashboard/out/_next/static/chunks/7359-c8d04e06886000b3.js +0 -30
- sky/dashboard/out/_next/static/chunks/7411-b15471acd2cba716.js +0 -41
- sky/dashboard/out/_next/static/chunks/7615-019513abc55b3b47.js +0 -1
- sky/dashboard/out/_next/static/chunks/8969-452f9d5cbdd2dc73.js +0 -1
- sky/dashboard/out/_next/static/chunks/9025.fa408f3242e9028d.js +0 -6
- sky/dashboard/out/_next/static/chunks/9353-cff34f7e773b2e2b.js +0 -1
- sky/dashboard/out/_next/static/chunks/9360.a536cf6b1fa42355.js +0 -31
- sky/dashboard/out/_next/static/chunks/9847.3aaca6bb33455140.js +0 -30
- sky/dashboard/out/_next/static/chunks/pages/_app-bde01e4a2beec258.js +0 -34
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-792db96d918c98c9.js +0 -16
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-abfcac9c137aa543.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/infra/[context]-c0b5935149902e6f.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/infra-aed0ea19df7cf961.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-d66997e2bfc837cf.js +0 -16
- sky/dashboard/out/_next/static/chunks/pages/jobs/pools/[pool]-9faf940b253e3e06.js +0 -21
- sky/dashboard/out/_next/static/chunks/pages/jobs-2072b48b617989c9.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/users-f42674164aa73423.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/workspaces-531b2f8c4bf89f82.js +0 -1
- sky/dashboard/out/_next/static/chunks/webpack-64e05f17bf2cf8ce.js +0 -1
- sky/dashboard/out/_next/static/css/0748ce22df867032.css +0 -3
- /sky/dashboard/out/_next/static/{96_E2yl3QAiIJGOYCkSpB → 3nu-b8raeKRNABZ2d4GAG}/_ssgManifest.js +0 -0
- /sky/{utils/kubernetes → ssh_node_pools/deploy/tunnel}/cleanup-tunnel.sh +0 -0
- {skypilot_nightly-1.0.0.dev20251203.dist-info → skypilot_nightly-1.0.0.dev20260112.dist-info}/WHEEL +0 -0
- {skypilot_nightly-1.0.0.dev20251203.dist-info → skypilot_nightly-1.0.0.dev20260112.dist-info}/entry_points.txt +0 -0
- {skypilot_nightly-1.0.0.dev20251203.dist-info → skypilot_nightly-1.0.0.dev20260112.dist-info}/licenses/LICENSE +0 -0
- {skypilot_nightly-1.0.0.dev20251203.dist-info → skypilot_nightly-1.0.0.dev20260112.dist-info}/top_level.txt +0 -0

@@ -144,6 +144,7 @@ DEFAULT_NAMESPACE = 'default'
 DEFAULT_SERVICE_ACCOUNT_NAME = 'skypilot-service-account'

 MEMORY_SIZE_UNITS = {
+    'm': 0.001,
     'B': 1,
     'K': 2**10,
     'M': 2**20,
@@ -1205,15 +1206,24 @@ class V1NodeAddress:
     address: str


+@dataclasses.dataclass
+class V1NodeCondition:
+    """Represents a Kubernetes node condition."""
+    type: str
+    status: str
+
+
 @dataclasses.dataclass
 class V1NodeStatus:
     allocatable: Dict[str, str]
     capacity: Dict[str, str]
     addresses: List[V1NodeAddress]
+    conditions: List[V1NodeCondition]


 @dataclasses.dataclass
 class V1Node:
+    """Represents a Kubernetes node."""
     metadata: V1ObjectMeta
     status: V1NodeStatus

@@ -1231,8 +1241,24 @@ class V1Node:
                 V1NodeAddress(type=addr['type'],
                               address=addr['address'])
                 for addr in data['status'].get('addresses', [])
+            ],
+            conditions=[
+                V1NodeCondition(type=cond['type'],
+                                status=cond['status'])
+                for cond in data['status'].get('conditions', [])
             ]))

+    def is_ready(self) -> bool:
+        """Check if the node is ready based on its conditions.
+
+        A node is considered ready if it has a 'Ready' condition with
+        status 'True'.
+        """
+        for condition in self.status.conditions:
+            if condition.type == 'Ready':
+                return condition.status == 'True'
+        return False
+

 @annotations.lru_cache(scope='request', maxsize=10)
 @_retry_on_error(resource_type='node')
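
The readiness helper added above follows the usual Kubernetes convention: a node counts as ready only when its 'Ready' condition reports status 'True'. As a standalone sketch, the same check applied to a raw node dict from the Kubernetes API (rather than SkyPilot's V1Node wrapper) looks like this; the example node is invented:

def node_is_ready(node_dict: dict) -> bool:
    # Scan status.conditions for the 'Ready' entry; a missing entry means not ready.
    for cond in node_dict.get('status', {}).get('conditions', []):
        if cond.get('type') == 'Ready':
            return cond.get('status') == 'True'
    return False

example_node = {
    'status': {
        'conditions': [
            {'type': 'MemoryPressure', 'status': 'False'},
            {'type': 'Ready', 'status': 'True'},
        ]
    }
}
assert node_is_ready(example_node)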
@@ -1306,12 +1332,20 @@ class V1Pod:


 @_retry_on_error(resource_type='pod')
-def get_allocated_gpu_qty_by_node(
+def get_allocated_resources_by_node(
     *,
     context: Optional[str] = None,
-) -> Dict[str, int]:
-    """Gets allocated GPU quantity by each node by fetching pods in
+) -> Tuple[Dict[str, int], Dict[str, Tuple[float, float]]]:
+    """Gets allocated GPU, CPU, and memory by each node by fetching pods in
     all namespaces in kubernetes cluster indicated by context.
+
+    This function combines GPU and CPU/memory allocation tracking into a single
+    API call for better performance.
+
+    Returns:
+        Tuple of (allocated_gpu_qty_by_node, allocated_cpu_memory_by_node):
+        - allocated_gpu_qty_by_node: Dict mapping node name to allocated GPU count
+        - allocated_cpu_memory_by_node: Dict mapping node name to (allocated_cpu, allocated_memory_gb) tuple
     """
     if context is None:
         context = get_current_kube_config_context_name()
@@ -1330,29 +1364,67 @@ def get_allocated_gpu_qty_by_node(
         field_selector=field_selector)
     try:
         allocated_qty_by_node: Dict[str, int] = collections.defaultdict(int)
+        allocated_cpu_memory_by_node: Dict[str, Tuple[
+            float, float]] = collections.defaultdict(lambda: (0.0, 0.0))
         for item_dict in ijson.items(response,
                                      'items.item',
                                      buf_size=IJSON_BUFFER_SIZE):
             pod = V1Pod.from_dict(item_dict)
             if should_exclude_pod_from_gpu_allocation(pod):
                 logger.debug(
-                    f'Excluding pod {pod.metadata.name} from
+                    f'Excluding pod {pod.metadata.name} from resource count '
                     f'calculations on node {pod.spec.node_name}')
                 continue
-
-
+            if not pod.spec.node_name:
+                continue
+
+            # Iterate over all the containers in the pod and sum the resources
             pod_allocated_qty = 0
+            pod_allocated_cpu = 0.0
+            pod_allocated_memory_gb = 0.0
             for container in pod.spec.containers:
                 if container.resources.requests:
+                    requests = container.resources.requests
+                    # Parse GPU
                     pod_allocated_qty += get_node_accelerator_count(
-                        context,
-                        container.resources.requests)
+                        context, requests)
+                    # Parse CPU
+                    if 'cpu' in requests:
+                        pod_allocated_cpu += parse_cpu_or_gpu_resource_to_float(
+                            requests['cpu'])
+                    # Parse memory
+                    if 'memory' in requests:
+                        pod_allocated_memory_gb += parse_memory_resource(
+                            requests['memory'], unit='G')
+
+            if pod_allocated_qty > 0:
                 allocated_qty_by_node[pod.spec.node_name] += pod_allocated_qty
-        return allocated_qty_by_node
+            if pod_allocated_cpu > 0 or pod_allocated_memory_gb > 0:
+                current_cpu, current_memory = allocated_cpu_memory_by_node[
+                    pod.spec.node_name]
+                allocated_cpu_memory_by_node[pod.spec.node_name] = (
+                    current_cpu + pod_allocated_cpu,
+                    current_memory + pod_allocated_memory_gb)
+        return allocated_qty_by_node, allocated_cpu_memory_by_node
     finally:
         response.release_conn()


+@_retry_on_error(resource_type='pod')
+def get_allocated_gpu_qty_by_node(
+    *,
+    context: Optional[str] = None,
+) -> Dict[str, int]:
+    """Gets allocated GPU quantity by each node by fetching pods in
+    all namespaces in kubernetes cluster indicated by context.
+
+    Note: For better performance when you also need CPU/memory allocation,
+    use get_allocated_resources_by_node() instead.
+    """
+    allocated_qty_by_node, _ = get_allocated_resources_by_node(context=context)
+    return allocated_qty_by_node
+
+
 def check_instance_fits(context: Optional[str],
                         instance: str) -> Tuple[bool, Optional[str]]:
     """Checks if the instance fits on the Kubernetes cluster.
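
A hypothetical caller of the combined helper can derive per-node free capacity the way the node-info code later in this file does; the node names and capacities below are invented for illustration, and only the max(0, capacity - allocated) pattern is taken from the diff:

# Shapes mirror the new return value: GPU counts per node, and
# (allocated CPUs, allocated memory in GB) per node.
gpu_by_node = {'node-a': 2}
cpu_mem_by_node = {'node-a': (3.5, 12.0)}

capacity = {'node-a': {'cpu': 8.0, 'memory_gb': 32.0, 'gpu': 4}}  # made up
for node, caps in capacity.items():
    alloc_cpu, alloc_mem = cpu_mem_by_node.get(node, (0.0, 0.0))
    free_cpu = max(0.0, caps['cpu'] - alloc_cpu)
    free_mem = max(0.0, caps['memory_gb'] - alloc_mem)
    free_gpu = caps['gpu'] - gpu_by_node.get(node, 0)
    print(f'{node}: {free_cpu} CPUs, {free_mem} GB, {free_gpu} GPUs free')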
@@ -1451,11 +1523,12 @@ def check_instance_fits(context: Optional[str],
             return False, str(e)
     # Get the set of nodes that have the GPU type
     gpu_nodes = [
-        node for node in nodes if gpu_label_key in node.metadata.labels and
+        node for node in nodes
+        if node.is_ready() and gpu_label_key in node.metadata.labels and
         node.metadata.labels[gpu_label_key] in gpu_label_values
     ]
     if not gpu_nodes:
-        return False, f'No GPU nodes found with {acc_type} on the cluster'
+        return False, f'No ready GPU nodes found with {acc_type} on the cluster'
     if is_tpu_on_gke(acc_type):
         # If requested accelerator is a TPU type, check if the cluster
         # has sufficient TPU resource to meet the requirement.
@@ -1479,7 +1552,9 @@ def check_instance_fits(context: Optional[str],
                             f'enough CPU (> {k8s_instance_type.cpus} CPUs) and/or '
                             f'memory (> {k8s_instance_type.memory} G). ')
     else:
-        candidate_nodes = nodes
+        candidate_nodes = [node for node in nodes if node.is_ready()]
+        if not candidate_nodes:
+            return False, 'No ready nodes found in the cluster.'
         not_fit_reason_prefix = (f'No nodes found with enough '
                                  f'CPU (> {k8s_instance_type.cpus} CPUs) '
                                  'and/or memory '
@@ -2161,6 +2236,13 @@ def get_current_kube_config_context_name() -> Optional[str]:
         _, current_context = kubernetes.list_kube_config_contexts()
         return current_context['name']
     except k8s.config.config_exception.ConfigException:
+        # If kubeconfig is not available, check if running in-cluster and
+        # return the in-cluster context name. This is needed when kubeconfig
+        # is not uploaded to the pod (e.g., remote_identity: SERVICE_ACCOUNT)
+        # but we still need to know the context name for operations like
+        # port mode detection.
+        if is_incluster_config_available():
+            return kubernetes.in_cluster_context_name()
         return None

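
For reference, the same kubeconfig-first, in-cluster-fallback resolution order can be sketched directly against the official kubernetes Python client; the in-cluster label below is a placeholder, since the real context name comes from SkyPilot's kubernetes adaptor:

from kubernetes import config

def current_context_name(incluster_label: str = 'in-cluster'):
    # Prefer the active kubeconfig context; fall back to the mounted
    # service-account credentials if no kubeconfig is available.
    try:
        _, active = config.list_kube_config_contexts()
        return active['name']
    except config.config_exception.ConfigException:
        try:
            config.load_incluster_config()
            return incluster_label  # placeholder for the real in-cluster name
        except config.config_exception.ConfigException:
            return None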
@@ -2285,7 +2367,7 @@ def parse_memory_resource(resource_qty_str: str,
     try:
         bytes_value = int(resource_str)
     except ValueError:
-        memory_size = re.sub(r'([KMGTPB]+)', r' \1', resource_str)
+        memory_size = re.sub(r'([KMGTPBm]+)', r' \1', resource_str)
         number, unit_index = [item.strip() for item in memory_size.split()]
         unit_index = unit_index[0]
         bytes_value = float(number) * MEMORY_SIZE_UNITS[unit_index]
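
As a worked example of the parsing path patched above, assuming the excerpted unit table ('m' denotes milli-quantities as sometimes emitted by the Kubernetes API):

import re

MEMORY_SIZE_UNITS = {'m': 0.001, 'B': 1, 'K': 2**10, 'M': 2**20}  # excerpt

def to_bytes(qty: str) -> float:
    # Same split-then-lookup steps as the patched function.
    spaced = re.sub(r'([KMGTPBm]+)', r' \1', qty)
    number, unit = spaced.split()
    return float(number) * MEMORY_SIZE_UNITS[unit[0]]

print(to_bytes('4000m'))  # 4.0 bytes (milli-byte quantity)
print(to_bytes('512M'))   # 536870912.0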
@@ -3033,16 +3115,32 @@ def get_kubernetes_node_info(
             has_accelerator_nodes = True
             break

-    # Get the allocated GPU
+    # Get the allocated resources (GPU, CPU, memory) by each node in a single call
     allocated_qty_by_node: Dict[str, int] = collections.defaultdict(int)
-
+    allocated_cpu_memory_by_node: Dict[str, Tuple[float, float]] = {}
+    error_on_get_allocated_resources = False
+    # Get resource allocation. For GPU allocation, only call if there are GPU nodes
+    # (same as master branch). For CPU/memory, we always need it for all nodes.
     if has_accelerator_nodes:
+        # When there are GPU nodes, get both GPU and CPU/memory in one call
         try:
-            allocated_qty_by_node = get_allocated_gpu_qty_by_node(
+            allocated_qty_by_node, allocated_cpu_memory_by_node = get_allocated_resources_by_node(
                 context=context)
         except kubernetes.api_exception() as e:
             if e.status == 403:
-
+                error_on_get_allocated_resources = True
+                pass
+            else:
+                raise
+    else:
+        # When there are no GPU nodes, we still need CPU/memory allocation
+        # This is an extra API call compared to master branch
+        try:
+            _, allocated_cpu_memory_by_node = get_allocated_resources_by_node(
+                context=context)
+        except kubernetes.api_exception() as e:
+            if e.status == 403:
+                error_on_get_allocated_resources = True
                 pass
             else:
                 raise
@@ -3078,16 +3176,56 @@ def get_kubernetes_node_info(

         accelerator_count = get_node_accelerator_count(context,
                                                         node.status.allocatable)
+
+        # Parse CPU and memory from node capacity
+        cpu_count = None
+        memory_gb = None
+        try:
+            if 'cpu' in node.status.capacity:
+                cpu_count = float(
+                    parse_cpu_or_gpu_resource(node.status.capacity['cpu']))
+            if 'memory' in node.status.capacity:
+                memory_gb = parse_memory_resource(
+                    node.status.capacity['memory'], unit='G')
+        except (KeyError, ValueError) as e:
+            # If parsing fails, log but continue
+            logger.debug(f'Failed to parse CPU/memory for node '
+                         f'{node.metadata.name}: {e}')
+
+        # Calculate free CPU and memory
+        cpu_free = None
+        memory_free_gb = None
+        if cpu_count is not None or memory_gb is not None:
+            if not error_on_get_allocated_resources:
+                allocated_cpu, allocated_memory = allocated_cpu_memory_by_node.get(
+                    node.metadata.name, (0.0, 0.0))
+                if cpu_count is not None:
+                    cpu_free = max(0.0, cpu_count - allocated_cpu)
+                if memory_gb is not None:
+                    memory_free_gb = max(0.0, memory_gb - allocated_memory)
+            # If we can't get allocation info, set free to None (unknown)
+
+        # Check if node is ready
+        node_is_ready = node.is_ready()
+
         if accelerator_count == 0:
             node_info_dict[node.metadata.name] = models.KubernetesNodeInfo(
                 name=node.metadata.name,
                 accelerator_type=accelerator_name,
                 total={'accelerator_count': 0},
                 free={'accelerators_available': 0},
-                ip_address=node_ip
+                ip_address=node_ip,
+                cpu_count=cpu_count,
+                memory_gb=memory_gb,
+                cpu_free=cpu_free,
+                memory_free_gb=memory_free_gb,
+                is_ready=node_is_ready)
             continue

-        if not
+        if not node_is_ready:
+            # If node is not ready, report 0 available GPUs
+            accelerators_available = 0
+        elif not has_accelerator_nodes or error_on_get_allocated_resources:
             accelerators_available = -1
         else:
             allocated_qty = allocated_qty_by_node[node.metadata.name]
@@ -3105,7 +3243,12 @@ def get_kubernetes_node_info(
             accelerator_type=accelerator_name,
             total={'accelerator_count': int(accelerator_count)},
             free={'accelerators_available': int(accelerators_available)},
-            ip_address=node_ip
+            ip_address=node_ip,
+            cpu_count=cpu_count,
+            memory_gb=memory_gb,
+            cpu_free=cpu_free,
+            memory_free_gb=memory_free_gb,
+            is_ready=node_is_ready)
         hint = ''
         if has_multi_host_tpu:
             hint = ('(Note: Multi-host TPUs are detected and excluded from the '
sky/provision/kubernetes/volume.py
CHANGED
|
@@ -45,7 +45,9 @@ def check_pvc_usage_for_pod(context: Optional[str], namespace: str,
             continue
         pvc = kubernetes.core_api(
             context).read_namespaced_persistent_volume_claim(
-                name=pvc_name, namespace=namespace)
+                name=pvc_name,
+                namespace=namespace,
+                _request_timeout=kubernetes.API_TIMEOUT)
         access_mode = pvc.spec.access_modes[0]
         if access_mode not in once_modes:
             continue
@@ -65,7 +67,8 @@ def apply_volume(config: models.VolumeConfig) -> models.VolumeConfig:
     if storage_class_name is not None:
         try:
             kubernetes.storage_api(context).read_storage_class(
-                name=storage_class_name)
+                name=storage_class_name,
+                _request_timeout=kubernetes.API_TIMEOUT)
         except kubernetes.api_exception() as e:
             raise config_lib.KubernetesError(
                 f'Check storage class {storage_class_name} error: {e}')
@@ -82,7 +85,7 @@ def delete_volume(config: models.VolumeConfig) -> models.VolumeConfig:
             context).delete_namespaced_persistent_volume_claim(
                 name=pvc_name,
                 namespace=namespace,
-                _request_timeout=
+                _request_timeout=kubernetes.API_TIMEOUT),
         resource_type='pvc',
         resource_name=pvc_name)
     logger.info(f'Deleted PVC {pvc_name} in namespace {namespace}')
@@ -119,7 +122,9 @@ def _get_volume_usedby(
     cloud_to_name_map = _get_cluster_name_on_cloud_to_cluster_name_map()
     # Get all pods in the namespace
     pods = kubernetes.core_api(context).list_namespaced_pod(
-        namespace=namespace, field_selector=field_selector)
+        namespace=namespace,
+        field_selector=field_selector,
+        _request_timeout=kubernetes.API_TIMEOUT)
     for pod in pods.items:
         if pod.spec.volumes is None:
             continue
@@ -164,8 +169,21 @@ def get_volume_usedby(

 def get_all_volumes_usedby(
     configs: List[models.VolumeConfig],
-) -> Tuple[Dict[str, Any], Dict[str, Any]]:
-    """Gets the usedby resources of all volumes."""
+) -> Tuple[Dict[str, Any], Dict[str, Any], Set[str]]:
+    """Gets the usedby resources of all volumes.
+
+    Args:
+        configs: List of VolumeConfig objects.
+
+    Returns:
+        usedby_pods: Dictionary of context to namespace to volume name to pods
+            using the volume. These may include pods not created by
+            SkyPilot.
+        usedby_clusters: Dictionary of context to namespace to volume name to
+            clusters using the volume.
+        failed_volume_names: Set of volume names whose usedby info failed to
+            fetch.
+    """
     field_selector = ','.join([
         f'status.phase!={phase}'
         for phase in k8s_constants.PVC_NOT_HOLD_POD_PHASES
@@ -173,26 +191,39 @@ def get_all_volumes_usedby(
     label_selector = 'parent=skypilot'
     context_to_namespaces: Dict[str, Set[str]] = {}
     pvc_names = set()
+    original_volume_names: Dict[str, Dict[str, List[str]]] = {}
     for config in configs:
         context, namespace = _get_context_namespace(config)
-
-
-
+        context_to_namespaces.setdefault(context, set()).add(namespace)
+        original_volume_names.setdefault(context,
+                                         {}).setdefault(namespace,
+                                                        []).append(config.name)
         pvc_names.add(config.name_on_cloud)
     cloud_to_name_map = _get_cluster_name_on_cloud_to_cluster_name_map()
     # Get all pods in the namespace
     used_by_pods: Dict[str, Dict[str, Dict[str, List[str]]]] = {}
     used_by_clusters: Dict[str, Dict[str, Dict[str, List[str]]]] = {}
+    failed_volume_names: Set[str] = set()
     for context, namespaces in context_to_namespaces.items():
         used_by_pods[context] = {}
         used_by_clusters[context] = {}
         for namespace in namespaces:
             used_by_pods[context][namespace] = {}
             used_by_clusters[context][namespace] = {}
-
-
-
-
+            try:
+                pods = kubernetes.core_api(context).list_namespaced_pod(
+                    namespace=namespace,
+                    field_selector=field_selector,
+                    label_selector=label_selector,
+                    _request_timeout=kubernetes.API_TIMEOUT)
+            except Exception as e:  # pylint: disable=broad-except
+                logger.debug(f'Failed to get pods in namespace {namespace} '
+                             f'in context {context}: {e}')
+                # Mark all volumes in this namespace as failed
+                for original_volume_name in original_volume_names[context][
+                        namespace]:
+                    failed_volume_names.add(original_volume_name)
+                continue
             for pod in pods.items:
                 if pod.spec.volumes is None:
                     continue
@@ -217,7 +248,7 @@ def get_all_volumes_usedby(
                     used_by_clusters[context][namespace][cluster_name] = []
                 used_by_clusters[context][namespace][cluster_name].append(
                     cluster_name)
-    return used_by_pods, used_by_clusters
+    return used_by_pods, used_by_clusters, failed_volume_names


 def map_all_volumes_usedby(
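
Callers now receive a third value naming volumes whose lookup failed; a hypothetical consumer (the context, namespace, and volume names below are invented) might handle it like this:

used_by_pods, used_by_clusters, failed = {}, {}, {'data-vol-2'}

for vol in ['data-vol-1', 'data-vol-2']:
    if vol in failed:
        print(f'{vol}: in-use status unknown (usedby lookup failed)')
    else:
        pods = used_by_pods.get('my-context', {}).get('default', {}).get(vol, [])
        print(f'{vol}: used by {len(pods)} pod(s)')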
@@ -292,7 +323,9 @@ def create_persistent_volume_claim(
     try:
         pvc = kubernetes.core_api(
             context).read_namespaced_persistent_volume_claim(
-                name=pvc_name, namespace=namespace)
+                name=pvc_name,
+                namespace=namespace,
+                _request_timeout=kubernetes.API_TIMEOUT)
         if config is not None:
             _populate_config_from_pvc(config, pvc)
         logger.debug(f'PVC {pvc_name} already exists')
@@ -305,8 +338,10 @@ def create_persistent_volume_claim(
             raise ValueError(
                 f'PVC {pvc_name} does not exist while use_existing is True.')
         pvc = kubernetes.core_api(
-            context).create_namespaced_persistent_volume_claim(
-
+            context).create_namespaced_persistent_volume_claim(
+                namespace=namespace,
+                body=pvc_spec,
+                _request_timeout=kubernetes.API_TIMEOUT)
         logger.info(f'Created PVC {pvc_name} in namespace {namespace}')
         if config is not None:
             _populate_config_from_pvc(config, pvc)
sky/provision/provisioner.py
CHANGED
|
@@ -157,9 +157,9 @@ def bulk_provision(
         logger.debug(f'SkyPilot version: {sky.__version__}; '
                      f'commit: {sky.__commit__}')
         logger.debug(_TITLE.format('Provisioning'))
-
-
-
+        redacted_config = bootstrap_config.get_redacted_config()
+        logger.debug('Provision config:\n'
+                     f'{json.dumps(redacted_config, indent=2)}')
         return _bulk_provision(cloud, region, cluster_name,
                                bootstrap_config)
     except exceptions.NoClusterLaunchedError:
@@ -493,7 +493,8 @@ def _post_provision_setup(
     # commands and rsync on the pods. SSH will still be ready after a while
     # for the users to SSH into the pod.
     is_k8s_cloud = cloud_name.lower() in ['kubernetes', 'ssh']
-    if not is_k8s_cloud:
+    is_slurm_cloud = cloud_name.lower() == 'slurm'
+    if not is_k8s_cloud and not is_slurm_cloud:
         logger.debug(
             f'\nWaiting for SSH to be available for {cluster_name!r} ...')
         wait_for_ssh(cluster_info, ssh_credentials)
@@ -635,10 +636,15 @@ def _post_provision_setup(
             status.update(
                 runtime_preparation_str.format(step=3, step_name='runtime'))

+    skip_ray_setup = False
     ray_port = constants.SKY_REMOTE_RAY_PORT
     head_ray_needs_restart = True
     ray_cluster_healthy = False
-    if (not provision_record.is_instance_just_booted(
+    if (launched_resources.cloud is not None and
+            not launched_resources.cloud.uses_ray()):
+        skip_ray_setup = True
+        logger.debug('Skip Ray cluster setup as cloud does not use Ray.')
+    elif (not provision_record.is_instance_just_booted(
             head_instance.instance_id)):
         # Check if head node Ray is alive
         (ray_port, ray_cluster_healthy,
@@ -663,7 +669,9 @@ def _post_provision_setup(
                         'async setup to complete...')
             time.sleep(1)

-    if head_ray_needs_restart:
+    if skip_ray_setup:
+        logger.debug('Skip Ray cluster setup on the head node.')
+    elif head_ray_needs_restart:
         logger.debug('Starting Ray on the entire cluster.')
         instance_setup.start_ray_on_head_node(
             cluster_name.name_on_cloud,
@@ -686,7 +694,9 @@ def _post_provision_setup(
     # We don't need to restart ray on worker nodes if the ray cluster is
     # already healthy, i.e. the head node has expected number of nodes
     # connected to the ray cluster.
-    if cluster_info.num_instances > 1 and not ray_cluster_healthy:
+    if skip_ray_setup:
+        logger.debug('Skip Ray cluster setup on the worker nodes.')
+    elif cluster_info.num_instances > 1 and not ray_cluster_healthy:
         instance_setup.start_ray_on_worker_nodes(
             cluster_name.name_on_cloud,
             no_restart=not head_ray_needs_restart,
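
The Ray bootstrap steps are now gated on a per-cloud uses_ray() capability (seen above as launched_resources.cloud.uses_ray()). A minimal sketch of that gating with stub cloud classes; the class names are invented stand-ins, not SkyPilot's:

class RayBackedCloud:
    def uses_ray(self) -> bool:
        return True

class SlurmLikeCloud:
    def uses_ray(self) -> bool:
        return False

def needs_ray_setup(cloud) -> bool:
    # Mirrors the new condition: skip only when a cloud is known and opts out.
    return cloud is None or cloud.uses_ray()

assert needs_ray_setup(RayBackedCloud())
assert not needs_ray_setup(SlurmLikeCloud())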
sky/provision/runpod/instance.py
CHANGED
|
@@ -1,5 +1,6 @@
 """RunPod instance provisioning."""
 import time
+import traceback
 from typing import Any, Dict, List, Optional, Tuple

 from sky import sky_logging
@@ -116,7 +117,8 @@ def run_instances(region: str, cluster_name: str, cluster_name_on_cloud: str,
                 volume_mount_path=volume_mount_path,
             )
         except Exception as e:  # pylint: disable=broad-except
-            logger.warning(f'run_instances error: {e}')
+            logger.warning(f'run_instances error: {e}\n'
+                           f'Full traceback:\n{traceback.format_exc()}')
             raise
         logger.info(f'Launched instance {instance_id}.')
         created_instance_ids.append(instance_id)
sky/provision/runpod/utils.py
CHANGED
|
@@ -80,7 +80,11 @@ def _construct_docker_login_template_name(cluster_name: str) -> str:


 def retry(func):
-    """Decorator to retry a function."""
+    """Decorator to retry a function.
+
+    Only retries on transient errors. Does not retry on authorization errors
+    (Unauthorized, Forbidden) as these are not recoverable.
+    """

     def wrapper(*args, **kwargs):
         """Wrapper for retrying a function."""
@@ -89,6 +93,14 @@ def retry(func):
             try:
                 return func(*args, **kwargs)
             except runpod.runpod.error.QueryError as e:
+                error_msg = str(e).lower()
+                # Don't retry on authorization errors - these won't recover
+                auth_keywords = ['unauthorized', 'forbidden', '401', '403']
+                if any(keyword in error_msg for keyword in auth_keywords):
+                    logger.error(f'RunPod authorization error (not retrying): '
+                                 f'{common_utils.format_exception(e)}')
+                    raise
+                cnt += 1
                 if cnt >= 3:
                     raise
                 logger.warning('Retrying for exception: '
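
The same short-circuit can be sketched as a generic helper, using a plain Exception in place of runpod's QueryError; the keyword matching and three-attempt limit mirror the patched decorator:

import time

def retry_transient(func, attempts: int = 3):
    """Retry helper mirroring the keyword check added above (illustration only)."""
    auth_keywords = ('unauthorized', 'forbidden', '401', '403')
    cnt = 0
    while True:
        try:
            return func()
        except Exception as e:  # broad catch for the sketch only
            msg = str(e).lower()
            if any(k in msg for k in auth_keywords):
                raise  # credentials problem: retrying cannot help
            cnt += 1
            if cnt >= attempts:
                raise
            time.sleep(1)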
sky/provision/runpod/volume.py
CHANGED
|
@@ -1,5 +1,5 @@
 """RunPod network volume provisioning."""
-from typing import Any, Dict, List, Optional, Tuple
+from typing import Any, Dict, List, Optional, Set, Tuple

 from sky import global_user_state
 from sky import models
@@ -194,15 +194,31 @@ def get_volume_usedby(

 def get_all_volumes_usedby(
     configs: List[models.VolumeConfig],
-) -> Tuple[Dict[str, Any], Dict[str, Any]]:
-    """Gets the usedby resources of all volumes.
-
+) -> Tuple[Dict[str, Any], Dict[str, Any], Set[str]]:
+    """Gets the usedby resources of all volumes.
+
+    Args:
+        configs: List of VolumeConfig objects.
+
+    Returns:
+        usedby_pods: Dictionary of volume name to pods using the volume.
+        usedby_clusters: Dictionary of volume name to clusters using the volume.
+        failed_volume_names: Set of volume names whose usedby info failed to
+            fetch.
+    """
     used_by_pods, used_by_clusters = {}, {}
-
-
-
-
-
+    failed_volume_names = set()
+    for config in configs:
+        try:
+            usedby_pods, usedby_clusters = get_volume_usedby(config)
+            used_by_pods[config.name_on_cloud] = usedby_pods
+            used_by_clusters[config.name_on_cloud] = usedby_clusters
+        except Exception as e:  # pylint: disable=broad-except
+            logger.debug(f'Failed to get usedby info for RunPod volume '
+                         f'{config.name}: {e}')
+            failed_volume_names.add(config.name)
+            continue
+    return used_by_pods, used_by_clusters, failed_volume_names


 def map_all_volumes_usedby(
sky/provision/slurm/__init__.py
ADDED
|
@@ -0,0 +1,12 @@
+"""Slurm provisioner for SkyPilot."""
+
+from sky.provision.slurm.config import bootstrap_instances
+from sky.provision.slurm.instance import cleanup_ports
+from sky.provision.slurm.instance import get_cluster_info
+from sky.provision.slurm.instance import get_command_runners
+from sky.provision.slurm.instance import open_ports
+from sky.provision.slurm.instance import query_instances
+from sky.provision.slurm.instance import run_instances
+from sky.provision.slurm.instance import stop_instances
+from sky.provision.slurm.instance import terminate_instances
+from sky.provision.slurm.instance import wait_instances
sky/provision/slurm/config.py
ADDED
|
@@ -0,0 +1,13 @@
+"""Slrum-specific configuration for the provisioner."""
+import logging
+
+from sky.provision import common
+
+logger = logging.getLogger(__name__)
+
+
+def bootstrap_instances(
+        region: str, cluster_name: str,
+        config: common.ProvisionConfig) -> common.ProvisionConfig:
+    del region, cluster_name  # unused
+    return config
|