skypilot-nightly 1.0.0.dev20251210__py3-none-any.whl → 1.0.0.dev20260112__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sky/__init__.py +4 -2
- sky/adaptors/slurm.py +159 -72
- sky/backends/backend_utils.py +52 -10
- sky/backends/cloud_vm_ray_backend.py +192 -32
- sky/backends/task_codegen.py +40 -2
- sky/catalog/data_fetchers/fetch_gcp.py +9 -1
- sky/catalog/data_fetchers/fetch_nebius.py +1 -1
- sky/catalog/data_fetchers/fetch_vast.py +4 -2
- sky/catalog/seeweb_catalog.py +30 -15
- sky/catalog/shadeform_catalog.py +5 -2
- sky/catalog/slurm_catalog.py +0 -7
- sky/catalog/vast_catalog.py +30 -6
- sky/check.py +11 -8
- sky/client/cli/command.py +106 -54
- sky/client/interactive_utils.py +190 -0
- sky/client/sdk.py +8 -0
- sky/client/sdk_async.py +9 -0
- sky/clouds/aws.py +60 -2
- sky/clouds/azure.py +2 -0
- sky/clouds/kubernetes.py +2 -0
- sky/clouds/runpod.py +38 -7
- sky/clouds/slurm.py +44 -12
- sky/clouds/ssh.py +1 -1
- sky/clouds/vast.py +30 -17
- sky/core.py +69 -1
- sky/dashboard/out/404.html +1 -1
- sky/dashboard/out/_next/static/3nu-b8raeKRNABZ2d4GAG/_buildManifest.js +1 -0
- sky/dashboard/out/_next/static/chunks/1871-0565f8975a7dcd10.js +6 -0
- sky/dashboard/out/_next/static/chunks/2109-55a1546d793574a7.js +11 -0
- sky/dashboard/out/_next/static/chunks/2521-099b07cd9e4745bf.js +26 -0
- sky/dashboard/out/_next/static/chunks/2755.a636e04a928a700e.js +31 -0
- sky/dashboard/out/_next/static/chunks/3495.05eab4862217c1a5.js +6 -0
- sky/dashboard/out/_next/static/chunks/3785.cfc5dcc9434fd98c.js +1 -0
- sky/dashboard/out/_next/static/chunks/3981.645d01bf9c8cad0c.js +21 -0
- sky/dashboard/out/_next/static/chunks/4083-0115d67c1fb57d6c.js +21 -0
- sky/dashboard/out/_next/static/chunks/{8640.5b9475a2d18c5416.js → 429.a58e9ba9742309ed.js} +2 -2
- sky/dashboard/out/_next/static/chunks/4555.8e221537181b5dc1.js +6 -0
- sky/dashboard/out/_next/static/chunks/4725.937865b81fdaaebb.js +6 -0
- sky/dashboard/out/_next/static/chunks/6082-edabd8f6092300ce.js +25 -0
- sky/dashboard/out/_next/static/chunks/6989-49cb7dca83a7a62d.js +1 -0
- sky/dashboard/out/_next/static/chunks/6990-630bd2a2257275f8.js +1 -0
- sky/dashboard/out/_next/static/chunks/7248-a99800d4db8edabd.js +1 -0
- sky/dashboard/out/_next/static/chunks/754-cfc5d4ad1b843d29.js +18 -0
- sky/dashboard/out/_next/static/chunks/8050-dd8aa107b17dce00.js +16 -0
- sky/dashboard/out/_next/static/chunks/8056-d4ae1e0cb81e7368.js +1 -0
- sky/dashboard/out/_next/static/chunks/8555.011023e296c127b3.js +6 -0
- sky/dashboard/out/_next/static/chunks/8821-93c25df904a8362b.js +1 -0
- sky/dashboard/out/_next/static/chunks/8969-0662594b69432ade.js +1 -0
- sky/dashboard/out/_next/static/chunks/9025.f15c91c97d124a5f.js +6 -0
- sky/dashboard/out/_next/static/chunks/{9353-8369df1cf105221c.js → 9353-7ad6bd01858556f1.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/_app-5a86569acad99764.js +34 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-8297476714acb4ac.js +6 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-337c3ba1085f1210.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/{clusters-9e5d47818b9bdadd.js → clusters-57632ff3684a8b5c.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/infra/[context]-5fd3a453c079c2ea.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/infra-9f85c02c9c6cae9e.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-90f16972cbecf354.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs/pools/[pool]-2dd42fc37aad427a.js +16 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs-ed806aeace26b972.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/users-bec34706b36f3524.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/{volumes-ef19d49c6d0e8500.js → volumes-a83ba9b38dff7ea9.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/workspaces/{[name]-96e0f298308da7e2.js → [name]-c781e9c3e52ef9fc.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/workspaces-91e0942f47310aae.js +1 -0
- sky/dashboard/out/_next/static/chunks/webpack-cfe59cf684ee13b9.js +1 -0
- sky/dashboard/out/_next/static/css/b0dbca28f027cc19.css +3 -0
- sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
- sky/dashboard/out/clusters/[cluster].html +1 -1
- sky/dashboard/out/clusters.html +1 -1
- sky/dashboard/out/config.html +1 -1
- sky/dashboard/out/index.html +1 -1
- sky/dashboard/out/infra/[context].html +1 -1
- sky/dashboard/out/infra.html +1 -1
- sky/dashboard/out/jobs/[job].html +1 -1
- sky/dashboard/out/jobs/pools/[pool].html +1 -1
- sky/dashboard/out/jobs.html +1 -1
- sky/dashboard/out/plugins/[...slug].html +1 -1
- sky/dashboard/out/users.html +1 -1
- sky/dashboard/out/volumes.html +1 -1
- sky/dashboard/out/workspace/new.html +1 -1
- sky/dashboard/out/workspaces/[name].html +1 -1
- sky/dashboard/out/workspaces.html +1 -1
- sky/data/data_utils.py +26 -12
- sky/data/mounting_utils.py +29 -4
- sky/global_user_state.py +108 -16
- sky/jobs/client/sdk.py +8 -3
- sky/jobs/controller.py +191 -31
- sky/jobs/recovery_strategy.py +109 -11
- sky/jobs/server/core.py +81 -4
- sky/jobs/server/server.py +14 -0
- sky/jobs/state.py +417 -19
- sky/jobs/utils.py +73 -80
- sky/models.py +9 -0
- sky/optimizer.py +2 -1
- sky/provision/__init__.py +11 -9
- sky/provision/kubernetes/utils.py +122 -15
- sky/provision/kubernetes/volume.py +52 -17
- sky/provision/provisioner.py +2 -1
- sky/provision/runpod/instance.py +3 -1
- sky/provision/runpod/utils.py +13 -1
- sky/provision/runpod/volume.py +25 -9
- sky/provision/slurm/instance.py +75 -29
- sky/provision/slurm/utils.py +213 -107
- sky/provision/vast/utils.py +1 -0
- sky/resources.py +135 -13
- sky/schemas/api/responses.py +4 -0
- sky/schemas/db/global_user_state/010_save_ssh_key.py +1 -1
- sky/schemas/db/spot_jobs/008_add_full_resources.py +34 -0
- sky/schemas/db/spot_jobs/009_job_events.py +32 -0
- sky/schemas/db/spot_jobs/010_job_events_timestamp_with_timezone.py +43 -0
- sky/schemas/db/spot_jobs/011_add_links.py +34 -0
- sky/schemas/generated/jobsv1_pb2.py +9 -5
- sky/schemas/generated/jobsv1_pb2.pyi +12 -0
- sky/schemas/generated/jobsv1_pb2_grpc.py +44 -0
- sky/schemas/generated/managed_jobsv1_pb2.py +32 -28
- sky/schemas/generated/managed_jobsv1_pb2.pyi +11 -2
- sky/serve/serve_utils.py +232 -40
- sky/server/common.py +17 -0
- sky/server/constants.py +1 -1
- sky/server/metrics.py +6 -3
- sky/server/plugins.py +16 -0
- sky/server/requests/payloads.py +18 -0
- sky/server/requests/request_names.py +2 -0
- sky/server/requests/requests.py +28 -10
- sky/server/requests/serializers/encoders.py +5 -0
- sky/server/requests/serializers/return_value_serializers.py +14 -4
- sky/server/server.py +434 -107
- sky/server/uvicorn.py +5 -0
- sky/setup_files/MANIFEST.in +1 -0
- sky/setup_files/dependencies.py +21 -10
- sky/sky_logging.py +2 -1
- sky/skylet/constants.py +22 -5
- sky/skylet/executor/slurm.py +4 -6
- sky/skylet/job_lib.py +89 -4
- sky/skylet/services.py +18 -3
- sky/ssh_node_pools/deploy/tunnel/cleanup-tunnel.sh +62 -0
- sky/ssh_node_pools/deploy/tunnel/ssh-tunnel.sh +379 -0
- sky/templates/kubernetes-ray.yml.j2 +4 -6
- sky/templates/slurm-ray.yml.j2 +32 -2
- sky/templates/websocket_proxy.py +18 -41
- sky/users/permission.py +61 -51
- sky/utils/auth_utils.py +42 -0
- sky/utils/cli_utils/status_utils.py +19 -5
- sky/utils/cluster_utils.py +10 -3
- sky/utils/command_runner.py +256 -94
- sky/utils/command_runner.pyi +16 -0
- sky/utils/common_utils.py +30 -29
- sky/utils/context.py +32 -0
- sky/utils/db/db_utils.py +36 -6
- sky/utils/db/migration_utils.py +41 -21
- sky/utils/infra_utils.py +5 -1
- sky/utils/instance_links.py +139 -0
- sky/utils/interactive_utils.py +49 -0
- sky/utils/kubernetes/generate_kubeconfig.sh +42 -33
- sky/utils/kubernetes/rsync_helper.sh +5 -1
- sky/utils/plugin_extensions/__init__.py +14 -0
- sky/utils/plugin_extensions/external_failure_source.py +176 -0
- sky/utils/resources_utils.py +10 -8
- sky/utils/rich_utils.py +9 -11
- sky/utils/schemas.py +63 -20
- sky/utils/status_lib.py +7 -0
- sky/utils/subprocess_utils.py +17 -0
- sky/volumes/client/sdk.py +6 -3
- sky/volumes/server/core.py +65 -27
- sky_templates/ray/start_cluster +8 -4
- {skypilot_nightly-1.0.0.dev20251210.dist-info → skypilot_nightly-1.0.0.dev20260112.dist-info}/METADATA +53 -57
- {skypilot_nightly-1.0.0.dev20251210.dist-info → skypilot_nightly-1.0.0.dev20260112.dist-info}/RECORD +172 -162
- sky/dashboard/out/_next/static/KYAhEFa3FTfq4JyKVgo-s/_buildManifest.js +0 -1
- sky/dashboard/out/_next/static/chunks/1141-9c810f01ff4f398a.js +0 -11
- sky/dashboard/out/_next/static/chunks/1871-7e202677c42f43fe.js +0 -6
- sky/dashboard/out/_next/static/chunks/2260-7703229c33c5ebd5.js +0 -1
- sky/dashboard/out/_next/static/chunks/2350.fab69e61bac57b23.js +0 -1
- sky/dashboard/out/_next/static/chunks/2369.fc20f0c2c8ed9fe7.js +0 -15
- sky/dashboard/out/_next/static/chunks/2755.edd818326d489a1d.js +0 -26
- sky/dashboard/out/_next/static/chunks/3294.ddda8c6c6f9f24dc.js +0 -1
- sky/dashboard/out/_next/static/chunks/3785.7e245f318f9d1121.js +0 -1
- sky/dashboard/out/_next/static/chunks/3800-b589397dc09c5b4e.js +0 -1
- sky/dashboard/out/_next/static/chunks/4725.172ede95d1b21022.js +0 -1
- sky/dashboard/out/_next/static/chunks/4937.a2baa2df5572a276.js +0 -15
- sky/dashboard/out/_next/static/chunks/6212-7bd06f60ba693125.js +0 -13
- sky/dashboard/out/_next/static/chunks/6856-da20c5fd999f319c.js +0 -1
- sky/dashboard/out/_next/static/chunks/6989-01359c57e018caa4.js +0 -1
- sky/dashboard/out/_next/static/chunks/6990-09cbf02d3cd518c3.js +0 -1
- sky/dashboard/out/_next/static/chunks/7359-c8d04e06886000b3.js +0 -30
- sky/dashboard/out/_next/static/chunks/7411-b15471acd2cba716.js +0 -41
- sky/dashboard/out/_next/static/chunks/7615-019513abc55b3b47.js +0 -1
- sky/dashboard/out/_next/static/chunks/8969-452f9d5cbdd2dc73.js +0 -1
- sky/dashboard/out/_next/static/chunks/9025.fa408f3242e9028d.js +0 -6
- sky/dashboard/out/_next/static/chunks/9360.a536cf6b1fa42355.js +0 -31
- sky/dashboard/out/_next/static/chunks/9847.3aaca6bb33455140.js +0 -30
- sky/dashboard/out/_next/static/chunks/pages/_app-68b647e26f9d2793.js +0 -34
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-33f525539665fdfd.js +0 -16
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-a7565f586ef86467.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/infra/[context]-12c559ec4d81fdbd.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/infra-d187cd0413d72475.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-895847b6cf200b04.js +0 -16
- sky/dashboard/out/_next/static/chunks/pages/jobs/pools/[pool]-8d0f4655400b4eb9.js +0 -21
- sky/dashboard/out/_next/static/chunks/pages/jobs-e5a98f17f8513a96.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/users-2f7646eb77785a2c.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/workspaces-cb4da3abe08ebf19.js +0 -1
- sky/dashboard/out/_next/static/chunks/webpack-fba3de387ff6bb08.js +0 -1
- sky/dashboard/out/_next/static/css/c5a4cfd2600fc715.css +0 -3
- /sky/dashboard/out/_next/static/{KYAhEFa3FTfq4JyKVgo-s → 3nu-b8raeKRNABZ2d4GAG}/_ssgManifest.js +0 -0
- /sky/dashboard/out/_next/static/chunks/pages/plugins/{[...slug]-4f46050ca065d8f8.js → [...slug]-449a9f5a3bb20fb3.js} +0 -0
- {skypilot_nightly-1.0.0.dev20251210.dist-info → skypilot_nightly-1.0.0.dev20260112.dist-info}/WHEEL +0 -0
- {skypilot_nightly-1.0.0.dev20251210.dist-info → skypilot_nightly-1.0.0.dev20260112.dist-info}/entry_points.txt +0 -0
- {skypilot_nightly-1.0.0.dev20251210.dist-info → skypilot_nightly-1.0.0.dev20260112.dist-info}/licenses/LICENSE +0 -0
- {skypilot_nightly-1.0.0.dev20251210.dist-info → skypilot_nightly-1.0.0.dev20260112.dist-info}/top_level.txt +0 -0
sky/jobs/utils.py
CHANGED
@@ -80,9 +80,8 @@ JOB_STARTED_STATUS_CHECK_GAP_SECONDS = 5
 
 _LOG_STREAM_CHECK_CONTROLLER_GAP_SECONDS = 5
 
-_JOB_STATUS_FETCH_MAX_RETRIES = 3
-_JOB_K8S_TRANSIENT_NW_MSG = 'Unable to connect to the server: dial tcp'
 _JOB_STATUS_FETCH_TIMEOUT_SECONDS = 30
+JOB_STATUS_FETCH_TOTAL_TIMEOUT_SECONDS = 60
 
 _JOB_WAITING_STATUS_MESSAGE = ux_utils.spinner_message(
     'Waiting for task to start[/]'

@@ -329,13 +328,21 @@ def ha_recovery_for_consolidation_mode() -> None:
 
 
 async def get_job_status(
-
-
+        backend: 'backends.CloudVmRayBackend', cluster_name: str,
+        job_id: Optional[int]
+) -> Tuple[Optional['job_lib.JobStatus'], Optional[str]]:
     """Check the status of the job running on a managed job cluster.
 
     It can be None, INIT, RUNNING, SUCCEEDED, FAILED, FAILED_DRIVER,
     FAILED_SETUP or CANCELLED.
+
+    Returns:
+        job_status: The status of the job.
+        transient_error_reason: None if successful or fatal error; otherwise,
+            the detailed reason for the transient error.
     """
+    # TODO(zhwu, cooperc): Make this get job status aware of cluster status, so
+    # that it can exit retry early if the cluster is down.
     # TODO(luca) make this async
     handle = await context_utils.to_thread(
         global_user_state.get_handle_from_cluster_name, cluster_name)

@@ -343,85 +350,68 @@ async def get_job_status(
         # This can happen if the cluster was preempted and background status
         # refresh already noticed and cleaned it up.
         logger.info(f'Cluster {cluster_name} not found.')
-        return None
+        return None, None
     assert isinstance(handle, backends.CloudVmRayResourceHandle), handle
     job_ids = None if job_id is None else [job_id]
-
-
-
-
-
-
-
-
-
-
-
-
+    try:
+        logger.info('=== Checking the job status... ===')
+        statuses = await asyncio.wait_for(
+            context_utils.to_thread(backend.get_job_status,
+                                    handle,
+                                    job_ids=job_ids,
+                                    stream_logs=False),
+            timeout=_JOB_STATUS_FETCH_TIMEOUT_SECONDS)
+        status = list(statuses.values())[0]
+        if status is None:
+            logger.info('No job found.')
+        else:
+            logger.info(f'Job status: {status}')
+        logger.info('=' * 34)
+        return status, None
+    except (exceptions.CommandError, grpc.RpcError, grpc.FutureTimeoutError,
+            ValueError, TypeError, asyncio.TimeoutError) as e:
+        # Note: Each of these exceptions has some additional conditions to
+        # limit how we handle it and whether or not we catch it.
+        potential_transient_error_reason = None
+        if isinstance(e, exceptions.CommandError):
+            returncode = e.returncode
+            potential_transient_error_reason = (f'Returncode: {returncode}. '
+                                                f'{e.detailed_reason}')
+        elif isinstance(e, grpc.RpcError):
+            potential_transient_error_reason = e.details()
+        elif isinstance(e, grpc.FutureTimeoutError):
+            potential_transient_error_reason = 'grpc timeout'
+        elif isinstance(e, asyncio.TimeoutError):
+            potential_transient_error_reason = (
+                'Job status check timed out after '
+                f'{_JOB_STATUS_FETCH_TIMEOUT_SECONDS}s')
+        # TODO(cooperc): Gracefully handle these exceptions in the backend.
+        elif isinstance(e, ValueError):
+            # If the cluster yaml is deleted in the middle of getting the
+            # SSH credentials, we could see this. See
+            # sky/global_user_state.py get_cluster_yaml_dict.
+            if re.search(r'Cluster yaml .* not found', str(e)):
+                potential_transient_error_reason = 'Cluster yaml was deleted'
             else:
-
-
-
-
-
-            #
-            #
-            #
-            #
-
-
-
-
-
-
-
-        elif isinstance(e, grpc.RpcError):
-            detailed_reason = e.details()
-            if e.code() in [
-                    grpc.StatusCode.UNAVAILABLE,
-                    grpc.StatusCode.DEADLINE_EXCEEDED
-            ]:
-                is_transient_error = True
-        elif isinstance(e, grpc.FutureTimeoutError):
-            detailed_reason = 'Timeout'
-        elif isinstance(e, asyncio.TimeoutError):
-            detailed_reason = ('Job status check timed out after '
-                               f'{_JOB_STATUS_FETCH_TIMEOUT_SECONDS}s')
-        # TODO(cooperc): Gracefully handle these exceptions in the backend.
-        elif isinstance(e, ValueError):
-            # If the cluster yaml is deleted in the middle of getting the
-            # SSH credentials, we could see this. See
-            # sky/global_user_state.py get_cluster_yaml_dict.
-            if re.search(r'Cluster yaml .* not found', str(e)):
-                detailed_reason = 'Cluster yaml was deleted'
-            else:
-                raise
-        elif isinstance(e, TypeError):
-            # We will grab the SSH credentials from the cluster yaml, but if
-            # handle.cluster_yaml is None, we will just return an empty dict
-            # for the credentials. See
-            # backend_utils.ssh_credential_from_yaml. Then, the credentials
-            # are passed as kwargs to SSHCommandRunner.__init__ - see
-            # cloud_vm_ray_backend.get_command_runners. So we can hit this
-            # TypeError if the cluster yaml is removed from the handle right
-            # when we pull it before the cluster is fully deleted.
-            error_msg_to_check = (
-                'SSHCommandRunner.__init__() missing 2 required positional '
-                'arguments: \'ssh_user\' and \'ssh_private_key\'')
-            if str(e) == error_msg_to_check:
-                detailed_reason = 'SSH credentials were already cleaned up'
-            else:
-                raise
-        if is_transient_error:
-            logger.info('Failed to connect to the cluster. Retrying '
-                        f'({i + 1}/{_JOB_STATUS_FETCH_MAX_RETRIES})...')
-            logger.info('=' * 34)
-            await asyncio.sleep(1)
+                raise
+        elif isinstance(e, TypeError):
+            # We will grab the SSH credentials from the cluster yaml, but if
+            # handle.cluster_yaml is None, we will just return an empty dict
+            # for the credentials. See
+            # backend_utils.ssh_credential_from_yaml. Then, the credentials
+            # are passed as kwargs to SSHCommandRunner.__init__ - see
+            # cloud_vm_ray_backend.get_command_runners. So we can hit this
+            # TypeError if the cluster yaml is removed from the handle right
+            # when we pull it before the cluster is fully deleted.
+            error_msg_to_check = (
+                'SSHCommandRunner.__init__() missing 2 required positional '
+                'arguments: \'ssh_user\' and \'ssh_private_key\'')
+            if str(e) == error_msg_to_check:
+                potential_transient_error_reason = ('SSH credentials were '
+                                                    'already cleaned up')
            else:
-
-
-                return None
-    return None
+                raise
+        return None, potential_transient_error_reason
 
 
 def controller_process_alive(record: managed_job_state.ControllerPidRecord,

@@ -1570,6 +1560,7 @@ def get_managed_job_queue(
                 handle.launched_resources.region,
                 handle.launched_resources.zone).formatted_str()
             job['accelerators'] = handle.launched_resources.accelerators
+            job['labels'] = handle.launched_resources.labels
         else:
             # FIXME(zongheng): display the last cached values for these.
             job['cluster_resources'] = '-'

@@ -1578,6 +1569,7 @@ def get_managed_job_queue(
             job['region'] = '-'
             job['zone'] = '-'
             job['infra'] = '-'
+            job['labels'] = None
 
         if not fields or 'details' in fields:
             # Add details about schedule state / backoff.

@@ -1821,7 +1813,8 @@ def format_job_table(
         for replica in replica_info:
             used_by = replica.get('used_by')
             if used_by is not None:
-
+                for job_id in used_by:
+                    job_to_worker[job_id] = replica.get('replica_id')
         return job_to_worker
 
     # Create mapping from job_id to worker replica_id
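The change above turns get_job_status into a single-attempt probe that reports a transient error reason and leaves retry policy to the caller; the new JOB_STATUS_FETCH_TOTAL_TIMEOUT_SECONDS constant suggests a total time budget for such retries. Below is a minimal sketch of how a caller could retry on transient errors under that budget; the probe argument and the wait_for_job_status name are illustrative assumptions, not code from the package.

# Illustrative sketch only: a hypothetical caller that retries a status probe
# returning (status, transient_error_reason), giving up once a total budget
# elapses. The constant mirrors the diff; the loop itself is an assumption.
import asyncio
import time
from typing import Optional, Tuple

JOB_STATUS_FETCH_TOTAL_TIMEOUT_SECONDS = 60


async def wait_for_job_status(probe) -> Tuple[Optional[str], Optional[str]]:
    """Retry `probe` while it reports transient errors, up to a total budget."""
    start = time.monotonic()
    while True:
        status, transient_error_reason = await probe()
        if transient_error_reason is None:
            # Either a definitive status or a definitive "no job found".
            return status, None
        if time.monotonic() - start > JOB_STATUS_FETCH_TOTAL_TIMEOUT_SECONDS:
            # Out of budget: surface the last transient reason to the caller.
            return None, transient_error_reason
        await asyncio.sleep(1)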
sky/models.py
CHANGED
@@ -68,6 +68,15 @@ class KubernetesNodeInfo:
     free: Dict[str, int]
     # IP address of the node (external IP preferred, fallback to internal IP)
     ip_address: Optional[str] = None
+    # CPU count (total CPUs available on the node)
+    cpu_count: Optional[float] = None
+    # Memory in GB (total memory available on the node)
+    memory_gb: Optional[float] = None
+    # Free CPU count (free CPUs available on the node after pod allocations)
+    cpu_free: Optional[float] = None
+    # Free memory in GB (free memory available on the node after pod
+    # allocations)
+    memory_free_gb: Optional[float] = None
     # Whether the node is ready (all conditions are satisfied)
     is_ready: bool = True
 
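The new KubernetesNodeInfo fields are all optional, so consumers have to treat None as "unknown" rather than zero. The sketch below uses a trimmed stand-in dataclass (NodeCapacity is a hypothetical name, not the package class) to show one way a consumer might derive CPU utilization from the capacity/free pair.

# Sketch: a trimmed stand-in for the new capacity/free fields, showing how a
# consumer might derive utilization while treating None as "unknown".
from dataclasses import dataclass
from typing import Optional


@dataclass
class NodeCapacity:
    cpu_count: Optional[float] = None
    memory_gb: Optional[float] = None
    cpu_free: Optional[float] = None
    memory_free_gb: Optional[float] = None


def cpu_utilization(node: NodeCapacity) -> Optional[float]:
    """Fraction of node CPUs requested by pods, or None if unknown."""
    if node.cpu_count is None or node.cpu_free is None or node.cpu_count == 0:
        return None
    return (node.cpu_count - node.cpu_free) / node.cpu_count


print(cpu_utilization(NodeCapacity(cpu_count=8.0, cpu_free=5.5)))  # 0.3125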
sky/optimizer.py
CHANGED
@@ -20,6 +20,7 @@ from sky.adaptors import common as adaptors_common
 from sky.clouds import cloud as sky_cloud
 from sky.usage import usage_lib
 from sky.utils import common
+from sky.utils import common_utils
 from sky.utils import env_options
 from sky.utils import log_utils
 from sky.utils import registry

@@ -1290,7 +1291,7 @@ def _check_specified_regions(task: task_lib.Task) -> None:
                 msg = f'Task{task_name} requires '
             if region not in existing_contexts:
                 if is_ssh:
-                    infra_str = f'SSH/{
+                    infra_str = f'SSH/{common_utils.removeprefix(region, "ssh-")}'
                 else:
                     infra_str = f'Kubernetes/{region}'
                 logger.warning(f'{infra_str} is not enabled.')
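The optimizer fix builds the display string by stripping the ssh- prefix from the context name via common_utils.removeprefix. As a rough illustration, assuming that helper behaves like str.removeprefix (Python 3.9+), the transformation looks like this; the remove_prefix function here is a stand-in, not the package's implementation.

# Sketch: prefix stripping used to turn a context like 'ssh-my-pool' into the
# display string 'SSH/my-pool'. The helper mirrors str.removeprefix; the exact
# implementation inside common_utils is an assumption here.
def remove_prefix(text: str, prefix: str) -> str:
    return text[len(prefix):] if text.startswith(prefix) else text


region = 'ssh-my-pool'
infra_str = f'SSH/{remove_prefix(region, "ssh-")}'
print(infra_str)  # SSH/my-pool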
sky/provision/__init__.py
CHANGED
@@ -6,7 +6,7 @@ providers supported by SkyPilot need to follow.
 import functools
 import inspect
 import typing
-from typing import Any, Dict, List, Optional, Tuple, Type
+from typing import Any, Dict, List, Optional, Set, Tuple, Type
 
 from sky import models
 from sky import sky_logging

@@ -152,16 +152,18 @@ def get_volume_usedby(
 @_route_to_cloud_impl
 def get_all_volumes_usedby(
     provider_name: str, configs: List[models.VolumeConfig]
-) -> Tuple[Dict[str, Any], Dict[str, Any]]:
-    """Get the usedby of
+) -> Tuple[Dict[str, Any], Dict[str, Any], Set[str]]:
+    """Get the usedby of all volumes.
+
+    Args:
+        provider_name: Name of the provider.
+        configs: List of VolumeConfig objects.
 
     Returns:
-        usedby_pods:
-
-
-
-            for a volume and a key containing clusters using
-            the volume.
+        usedby_pods: Dict of usedby pods.
+        usedby_clusters: Dict of usedby clusters.
+        failed_volume_names: Set of volume names whose usedby info
+            failed to fetch.
     """
     raise NotImplementedError
 
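With the widened interface, get_all_volumes_usedby implementations return a third element listing the volumes whose usedby information could not be fetched. A small sketch of consuming that shape, using a fake fetcher rather than a real provider call:

# Illustrative sketch only: a stand-in fetcher with the same
# (usedby_pods, usedby_clusters, failed_volume_names) shape that
# get_all_volumes_usedby now returns, and a caller that treats failed
# volumes as "unknown" rather than "unused".
from typing import Any, Dict, Set, Tuple


def fake_fetch() -> Tuple[Dict[str, Any], Dict[str, Any], Set[str]]:
    return ({'vol-a': ['pod-1']}, {'vol-a': ['cluster-1']}, {'vol-b'})


usedby_pods, usedby_clusters, failed_volume_names = fake_fetch()
for name in sorted(failed_volume_names):
    # Usage for these volumes could not be determined (e.g. the pod listing
    # for their namespace failed), so do not report them as unused.
    print(f'{name}: usage unknown')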
sky/provision/kubernetes/utils.py
CHANGED

@@ -144,6 +144,7 @@ DEFAULT_NAMESPACE = 'default'
 DEFAULT_SERVICE_ACCOUNT_NAME = 'skypilot-service-account'
 
 MEMORY_SIZE_UNITS = {
+    'm': 0.001,
     'B': 1,
     'K': 2**10,
     'M': 2**20,

@@ -1331,12 +1332,20 @@ class V1Pod:
 
 
 @_retry_on_error(resource_type='pod')
-def
+def get_allocated_resources_by_node(
     *,
     context: Optional[str] = None,
-) -> Dict[str, int]:
-    """Gets allocated GPU
+) -> Tuple[Dict[str, int], Dict[str, Tuple[float, float]]]:
+    """Gets allocated GPU, CPU, and memory by each node by fetching pods in
     all namespaces in kubernetes cluster indicated by context.
+
+    This function combines GPU and CPU/memory allocation tracking into a single
+    API call for better performance.
+
+    Returns:
+        Tuple of (allocated_gpu_qty_by_node, allocated_cpu_memory_by_node):
+        - allocated_gpu_qty_by_node: Dict mapping node name to allocated GPU count
+        - allocated_cpu_memory_by_node: Dict mapping node name to (allocated_cpu, allocated_memory_gb) tuple
     """
     if context is None:
         context = get_current_kube_config_context_name()

@@ -1355,29 +1364,67 @@ def get_allocated_gpu_qty_by_node(
         field_selector=field_selector)
     try:
         allocated_qty_by_node: Dict[str, int] = collections.defaultdict(int)
+        allocated_cpu_memory_by_node: Dict[str, Tuple[
+            float, float]] = collections.defaultdict(lambda: (0.0, 0.0))
         for item_dict in ijson.items(response,
                                      'items.item',
                                      buf_size=IJSON_BUFFER_SIZE):
             pod = V1Pod.from_dict(item_dict)
             if should_exclude_pod_from_gpu_allocation(pod):
                 logger.debug(
-                    f'Excluding pod {pod.metadata.name} from
+                    f'Excluding pod {pod.metadata.name} from resource count '
                     f'calculations on node {pod.spec.node_name}')
                 continue
-
-
+            if not pod.spec.node_name:
+                continue
+
+            # Iterate over all the containers in the pod and sum the resources
             pod_allocated_qty = 0
+            pod_allocated_cpu = 0.0
+            pod_allocated_memory_gb = 0.0
             for container in pod.spec.containers:
                 if container.resources.requests:
+                    requests = container.resources.requests
+                    # Parse GPU
                     pod_allocated_qty += get_node_accelerator_count(
-                        context,
-
+                        context, requests)
+                    # Parse CPU
+                    if 'cpu' in requests:
+                        pod_allocated_cpu += parse_cpu_or_gpu_resource_to_float(
+                            requests['cpu'])
+                    # Parse memory
+                    if 'memory' in requests:
+                        pod_allocated_memory_gb += parse_memory_resource(
+                            requests['memory'], unit='G')
+
+            if pod_allocated_qty > 0:
                 allocated_qty_by_node[pod.spec.node_name] += pod_allocated_qty
-
+            if pod_allocated_cpu > 0 or pod_allocated_memory_gb > 0:
+                current_cpu, current_memory = allocated_cpu_memory_by_node[
+                    pod.spec.node_name]
+                allocated_cpu_memory_by_node[pod.spec.node_name] = (
+                    current_cpu + pod_allocated_cpu,
+                    current_memory + pod_allocated_memory_gb)
+        return allocated_qty_by_node, allocated_cpu_memory_by_node
     finally:
         response.release_conn()
 
 
+@_retry_on_error(resource_type='pod')
+def get_allocated_gpu_qty_by_node(
+    *,
+    context: Optional[str] = None,
+) -> Dict[str, int]:
+    """Gets allocated GPU quantity by each node by fetching pods in
+    all namespaces in kubernetes cluster indicated by context.
+
+    Note: For better performance when you also need CPU/memory allocation,
+    use get_allocated_resources_by_node() instead.
+    """
+    allocated_qty_by_node, _ = get_allocated_resources_by_node(context=context)
+    return allocated_qty_by_node
+
+
 def check_instance_fits(context: Optional[str],
                         instance: str) -> Tuple[bool, Optional[str]]:
     """Checks if the instance fits on the Kubernetes cluster.

@@ -2189,6 +2236,13 @@ def get_current_kube_config_context_name() -> Optional[str]:
         _, current_context = kubernetes.list_kube_config_contexts()
         return current_context['name']
     except k8s.config.config_exception.ConfigException:
+        # If kubeconfig is not available, check if running in-cluster and
+        # return the in-cluster context name. This is needed when kubeconfig
+        # is not uploaded to the pod (e.g., remote_identity: SERVICE_ACCOUNT)
+        # but we still need to know the context name for operations like
+        # port mode detection.
+        if is_incluster_config_available():
+            return kubernetes.in_cluster_context_name()
         return None
 
 

@@ -2313,7 +2367,7 @@ def parse_memory_resource(resource_qty_str: str,
     try:
         bytes_value = int(resource_str)
     except ValueError:
-        memory_size = re.sub(r'([
+        memory_size = re.sub(r'([KMGTPBm]+)', r' \1', resource_str)
         number, unit_index = [item.strip() for item in memory_size.split()]
         unit_index = unit_index[0]
         bytes_value = float(number) * MEMORY_SIZE_UNITS[unit_index]

@@ -3061,16 +3115,32 @@ def get_kubernetes_node_info(
             has_accelerator_nodes = True
             break
 
-    # Get the allocated GPU
+    # Get the allocated resources (GPU, CPU, memory) by each node in a single call
     allocated_qty_by_node: Dict[str, int] = collections.defaultdict(int)
-
+    allocated_cpu_memory_by_node: Dict[str, Tuple[float, float]] = {}
+    error_on_get_allocated_resources = False
+    # Get resource allocation. For GPU allocation, only call if there are GPU nodes
+    # (same as master branch). For CPU/memory, we always need it for all nodes.
     if has_accelerator_nodes:
+        # When there are GPU nodes, get both GPU and CPU/memory in one call
        try:
-            allocated_qty_by_node =
+            allocated_qty_by_node, allocated_cpu_memory_by_node = get_allocated_resources_by_node(
                 context=context)
         except kubernetes.api_exception() as e:
             if e.status == 403:
-
+                error_on_get_allocated_resources = True
+                pass
+            else:
+                raise
+    else:
+        # When there are no GPU nodes, we still need CPU/memory allocation
+        # This is an extra API call compared to master branch
+        try:
+            _, allocated_cpu_memory_by_node = get_allocated_resources_by_node(
+                context=context)
+        except kubernetes.api_exception() as e:
+            if e.status == 403:
+                error_on_get_allocated_resources = True
                 pass
             else:
                 raise

@@ -3106,6 +3176,35 @@ def get_kubernetes_node_info(
 
         accelerator_count = get_node_accelerator_count(context,
                                                        node.status.allocatable)
+
+        # Parse CPU and memory from node capacity
+        cpu_count = None
+        memory_gb = None
+        try:
+            if 'cpu' in node.status.capacity:
+                cpu_count = float(
+                    parse_cpu_or_gpu_resource(node.status.capacity['cpu']))
+            if 'memory' in node.status.capacity:
+                memory_gb = parse_memory_resource(
+                    node.status.capacity['memory'], unit='G')
+        except (KeyError, ValueError) as e:
+            # If parsing fails, log but continue
+            logger.debug(f'Failed to parse CPU/memory for node '
+                         f'{node.metadata.name}: {e}')
+
+        # Calculate free CPU and memory
+        cpu_free = None
+        memory_free_gb = None
+        if cpu_count is not None or memory_gb is not None:
+            if not error_on_get_allocated_resources:
+                allocated_cpu, allocated_memory = allocated_cpu_memory_by_node.get(
+                    node.metadata.name, (0.0, 0.0))
+                if cpu_count is not None:
+                    cpu_free = max(0.0, cpu_count - allocated_cpu)
+                if memory_gb is not None:
+                    memory_free_gb = max(0.0, memory_gb - allocated_memory)
+            # If we can't get allocation info, set free to None (unknown)
+
         # Check if node is ready
         node_is_ready = node.is_ready()
 

@@ -3116,13 +3215,17 @@ def get_kubernetes_node_info(
                 total={'accelerator_count': 0},
                 free={'accelerators_available': 0},
                 ip_address=node_ip,
+                cpu_count=cpu_count,
+                memory_gb=memory_gb,
+                cpu_free=cpu_free,
+                memory_free_gb=memory_free_gb,
                 is_ready=node_is_ready)
             continue
 
         if not node_is_ready:
             # If node is not ready, report 0 available GPUs
             accelerators_available = 0
-        elif not has_accelerator_nodes or
+        elif not has_accelerator_nodes or error_on_get_allocated_resources:
             accelerators_available = -1
         else:
             allocated_qty = allocated_qty_by_node[node.metadata.name]

@@ -3141,6 +3244,10 @@ def get_kubernetes_node_info(
             total={'accelerator_count': int(accelerator_count)},
             free={'accelerators_available': int(accelerators_available)},
             ip_address=node_ip,
+            cpu_count=cpu_count,
+            memory_gb=memory_gb,
+            cpu_free=cpu_free,
+            memory_free_gb=memory_free_gb,
             is_ready=node_is_ready)
         hint = ''
         if has_multi_host_tpu:
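The 'm' entry added to MEMORY_SIZE_UNITS lets parse_memory_resource handle Kubernetes quantities expressed with the milli suffix (e.g. memory reported as 128974848m), and the widened regex splits the number from the suffix before the unit lookup. A simplified, self-contained re-implementation of the same approach is sketched below; it is an illustration, not the package function.

# Simplified sketch of suffix-based memory parsing with a milli ('m') unit,
# in the spirit of the change above; not the package's parse_memory_resource.
import re

UNITS = {'m': 0.001, 'B': 1, 'K': 2**10, 'M': 2**20,
         'G': 2**30, 'T': 2**40, 'P': 2**50}


def memory_to_gb(qty: str) -> float:
    """Convert a Kubernetes memory quantity string to GB (2**30 bytes)."""
    try:
        bytes_value = float(int(qty))
    except ValueError:
        # Split '512Mi' / '128974848m' into number and unit, then key the
        # lookup by the first unit character ('Mi' -> 'M', 'm' -> 'm').
        spaced = re.sub(r'([KMGTPBm]+)', r' \1', qty)
        number, unit = [item.strip() for item in spaced.split()]
        bytes_value = float(number) * UNITS[unit[0]]
    return bytes_value / 2**30


print(memory_to_gb('16Gi'))   # 16.0
print(memory_to_gb('512Mi'))  # 0.5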
sky/provision/kubernetes/volume.py
CHANGED

@@ -45,7 +45,9 @@ def check_pvc_usage_for_pod(context: Optional[str], namespace: str,
             continue
         pvc = kubernetes.core_api(
             context).read_namespaced_persistent_volume_claim(
-                name=pvc_name,
+                name=pvc_name,
+                namespace=namespace,
+                _request_timeout=kubernetes.API_TIMEOUT)
         access_mode = pvc.spec.access_modes[0]
         if access_mode not in once_modes:
             continue

@@ -65,7 +67,8 @@ def apply_volume(config: models.VolumeConfig) -> models.VolumeConfig:
     if storage_class_name is not None:
         try:
             kubernetes.storage_api(context).read_storage_class(
-                name=storage_class_name
+                name=storage_class_name,
+                _request_timeout=kubernetes.API_TIMEOUT)
         except kubernetes.api_exception() as e:
             raise config_lib.KubernetesError(
                 f'Check storage class {storage_class_name} error: {e}')

@@ -82,7 +85,7 @@ def delete_volume(config: models.VolumeConfig) -> models.VolumeConfig:
             context).delete_namespaced_persistent_volume_claim(
                 name=pvc_name,
                 namespace=namespace,
-                _request_timeout=
+                _request_timeout=kubernetes.API_TIMEOUT),
         resource_type='pvc',
         resource_name=pvc_name)
     logger.info(f'Deleted PVC {pvc_name} in namespace {namespace}')

@@ -119,7 +122,9 @@ def _get_volume_usedby(
     cloud_to_name_map = _get_cluster_name_on_cloud_to_cluster_name_map()
     # Get all pods in the namespace
     pods = kubernetes.core_api(context).list_namespaced_pod(
-        namespace=namespace,
+        namespace=namespace,
+        field_selector=field_selector,
+        _request_timeout=kubernetes.API_TIMEOUT)
     for pod in pods.items:
         if pod.spec.volumes is None:
             continue

@@ -164,8 +169,21 @@ def get_volume_usedby(
 
 def get_all_volumes_usedby(
     configs: List[models.VolumeConfig],
-) -> Tuple[Dict[str, Any], Dict[str, Any]]:
-    """Gets the usedby resources of all volumes.
+) -> Tuple[Dict[str, Any], Dict[str, Any], Set[str]]:
+    """Gets the usedby resources of all volumes.
+
+    Args:
+        configs: List of VolumeConfig objects.
+
+    Returns:
+        usedby_pods: Dictionary of context to namespace to volume name to pods
+            using the volume. These may include pods not created by
+            SkyPilot.
+        usedby_clusters: Dictionary of context to namespace to volume name to
+            clusters using the volume.
+        failed_volume_names: Set of volume names whose usedby info failed to
+            fetch.
+    """
     field_selector = ','.join([
         f'status.phase!={phase}'
         for phase in k8s_constants.PVC_NOT_HOLD_POD_PHASES

@@ -173,26 +191,39 @@ def get_all_volumes_usedby(
     label_selector = 'parent=skypilot'
     context_to_namespaces: Dict[str, Set[str]] = {}
     pvc_names = set()
+    original_volume_names: Dict[str, Dict[str, List[str]]] = {}
     for config in configs:
         context, namespace = _get_context_namespace(config)
-
-
-
+        context_to_namespaces.setdefault(context, set()).add(namespace)
+        original_volume_names.setdefault(context,
+                                         {}).setdefault(namespace,
+                                                        []).append(config.name)
         pvc_names.add(config.name_on_cloud)
     cloud_to_name_map = _get_cluster_name_on_cloud_to_cluster_name_map()
     # Get all pods in the namespace
     used_by_pods: Dict[str, Dict[str, Dict[str, List[str]]]] = {}
     used_by_clusters: Dict[str, Dict[str, Dict[str, List[str]]]] = {}
+    failed_volume_names: Set[str] = set()
     for context, namespaces in context_to_namespaces.items():
         used_by_pods[context] = {}
         used_by_clusters[context] = {}
         for namespace in namespaces:
             used_by_pods[context][namespace] = {}
             used_by_clusters[context][namespace] = {}
-
-
-
-
+            try:
+                pods = kubernetes.core_api(context).list_namespaced_pod(
+                    namespace=namespace,
+                    field_selector=field_selector,
+                    label_selector=label_selector,
+                    _request_timeout=kubernetes.API_TIMEOUT)
+            except Exception as e:  # pylint: disable=broad-except
+                logger.debug(f'Failed to get pods in namespace {namespace} '
+                             f'in context {context}: {e}')
+                # Mark all volumes in this namespace as failed
+                for original_volume_name in original_volume_names[context][
+                        namespace]:
+                    failed_volume_names.add(original_volume_name)
+                continue
             for pod in pods.items:
                 if pod.spec.volumes is None:
                     continue

@@ -217,7 +248,7 @@ def get_all_volumes_usedby(
                 used_by_clusters[context][namespace][cluster_name] = []
                 used_by_clusters[context][namespace][cluster_name].append(
                     cluster_name)
-    return used_by_pods, used_by_clusters
+    return used_by_pods, used_by_clusters, failed_volume_names
 
 
 def map_all_volumes_usedby(

@@ -292,7 +323,9 @@ def create_persistent_volume_claim(
     try:
         pvc = kubernetes.core_api(
             context).read_namespaced_persistent_volume_claim(
-                name=pvc_name,
+                name=pvc_name,
+                namespace=namespace,
+                _request_timeout=kubernetes.API_TIMEOUT)
         if config is not None:
             _populate_config_from_pvc(config, pvc)
         logger.debug(f'PVC {pvc_name} already exists')

@@ -305,8 +338,10 @@ def create_persistent_volume_claim(
         raise ValueError(
             f'PVC {pvc_name} does not exist while use_existing is True.')
     pvc = kubernetes.core_api(
-        context).create_namespaced_persistent_volume_claim(
-
+        context).create_namespaced_persistent_volume_claim(
+            namespace=namespace,
+            body=pvc_spec,
+            _request_timeout=kubernetes.API_TIMEOUT)
     logger.info(f'Created PVC {pvc_name} in namespace {namespace}')
     if config is not None:
         _populate_config_from_pvc(config, pvc)
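The volume scanning change wraps each per-namespace pod listing in a try/except so that one failing context or namespace only marks its own volumes as failed instead of aborting the whole scan. The sketch below shows the same aggregate-with-failures pattern on generic stand-in data; none of the names are SkyPilot APIs, and the per-volume pod matching is deliberately simplified.

# Sketch of the pattern introduced above: scan many namespaces, tolerate
# per-namespace fetch failures, and report the volumes whose usage could not
# be determined. The fetch callable and data shapes are stand-ins.
from typing import Callable, Dict, List, Set, Tuple


def scan_usage(
    namespaces_to_volumes: Dict[str, List[str]],
    list_pods: Callable[[str], List[str]],
) -> Tuple[Dict[str, List[str]], Set[str]]:
    used_by_pods: Dict[str, List[str]] = {}
    failed_volumes: Set[str] = set()
    for namespace, volumes in namespaces_to_volumes.items():
        try:
            pods = list_pods(namespace)
        except Exception as e:  # broad catch, mirroring the diff above
            # One bad namespace should not abort the scan; mark its volumes
            # as "unknown" instead.
            print(f'Failed to list pods in {namespace}: {e}')
            failed_volumes.update(volumes)
            continue
        for volume in volumes:
            # Real code would match pods to the volume; elided here.
            used_by_pods[volume] = pods
    return used_by_pods, failed_volumes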
sky/provision/provisioner.py
CHANGED
|
@@ -493,7 +493,8 @@ def _post_provision_setup(
     # commands and rsync on the pods. SSH will still be ready after a while
     # for the users to SSH into the pod.
     is_k8s_cloud = cloud_name.lower() in ['kubernetes', 'ssh']
-
+    is_slurm_cloud = cloud_name.lower() == 'slurm'
+    if not is_k8s_cloud and not is_slurm_cloud:
         logger.debug(
             f'\nWaiting for SSH to be available for {cluster_name!r} ...')
         wait_for_ssh(cluster_info, ssh_credentials)