skypilot-nightly 1.0.0.dev20251203__py3-none-any.whl → 1.0.0.dev20260112__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sky/__init__.py +6 -2
- sky/adaptors/aws.py +1 -61
- sky/adaptors/slurm.py +565 -0
- sky/backends/backend_utils.py +95 -12
- sky/backends/cloud_vm_ray_backend.py +224 -65
- sky/backends/task_codegen.py +380 -4
- sky/catalog/__init__.py +0 -3
- sky/catalog/data_fetchers/fetch_gcp.py +9 -1
- sky/catalog/data_fetchers/fetch_nebius.py +1 -1
- sky/catalog/data_fetchers/fetch_vast.py +4 -2
- sky/catalog/kubernetes_catalog.py +12 -4
- sky/catalog/seeweb_catalog.py +30 -15
- sky/catalog/shadeform_catalog.py +5 -2
- sky/catalog/slurm_catalog.py +236 -0
- sky/catalog/vast_catalog.py +30 -6
- sky/check.py +25 -11
- sky/client/cli/command.py +391 -32
- sky/client/interactive_utils.py +190 -0
- sky/client/sdk.py +64 -2
- sky/client/sdk_async.py +9 -0
- sky/clouds/__init__.py +2 -0
- sky/clouds/aws.py +60 -2
- sky/clouds/azure.py +2 -0
- sky/clouds/cloud.py +7 -0
- sky/clouds/kubernetes.py +2 -0
- sky/clouds/runpod.py +38 -7
- sky/clouds/slurm.py +610 -0
- sky/clouds/ssh.py +3 -2
- sky/clouds/vast.py +39 -16
- sky/core.py +197 -37
- sky/dashboard/out/404.html +1 -1
- sky/dashboard/out/_next/static/3nu-b8raeKRNABZ2d4GAG/_buildManifest.js +1 -0
- sky/dashboard/out/_next/static/chunks/1871-0565f8975a7dcd10.js +6 -0
- sky/dashboard/out/_next/static/chunks/2109-55a1546d793574a7.js +11 -0
- sky/dashboard/out/_next/static/chunks/2521-099b07cd9e4745bf.js +26 -0
- sky/dashboard/out/_next/static/chunks/2755.a636e04a928a700e.js +31 -0
- sky/dashboard/out/_next/static/chunks/3495.05eab4862217c1a5.js +6 -0
- sky/dashboard/out/_next/static/chunks/3785.cfc5dcc9434fd98c.js +1 -0
- sky/dashboard/out/_next/static/chunks/3850-fd5696f3bbbaddae.js +1 -0
- sky/dashboard/out/_next/static/chunks/3981.645d01bf9c8cad0c.js +21 -0
- sky/dashboard/out/_next/static/chunks/4083-0115d67c1fb57d6c.js +21 -0
- sky/dashboard/out/_next/static/chunks/{8640.5b9475a2d18c5416.js → 429.a58e9ba9742309ed.js} +2 -2
- sky/dashboard/out/_next/static/chunks/4555.8e221537181b5dc1.js +6 -0
- sky/dashboard/out/_next/static/chunks/4725.937865b81fdaaebb.js +6 -0
- sky/dashboard/out/_next/static/chunks/6082-edabd8f6092300ce.js +25 -0
- sky/dashboard/out/_next/static/chunks/6989-49cb7dca83a7a62d.js +1 -0
- sky/dashboard/out/_next/static/chunks/6990-630bd2a2257275f8.js +1 -0
- sky/dashboard/out/_next/static/chunks/7248-a99800d4db8edabd.js +1 -0
- sky/dashboard/out/_next/static/chunks/754-cfc5d4ad1b843d29.js +18 -0
- sky/dashboard/out/_next/static/chunks/8050-dd8aa107b17dce00.js +16 -0
- sky/dashboard/out/_next/static/chunks/8056-d4ae1e0cb81e7368.js +1 -0
- sky/dashboard/out/_next/static/chunks/8555.011023e296c127b3.js +6 -0
- sky/dashboard/out/_next/static/chunks/8821-93c25df904a8362b.js +1 -0
- sky/dashboard/out/_next/static/chunks/8969-0662594b69432ade.js +1 -0
- sky/dashboard/out/_next/static/chunks/9025.f15c91c97d124a5f.js +6 -0
- sky/dashboard/out/_next/static/chunks/9353-7ad6bd01858556f1.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/_app-5a86569acad99764.js +34 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-8297476714acb4ac.js +6 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-337c3ba1085f1210.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/{clusters-ee39056f9851a3ff.js → clusters-57632ff3684a8b5c.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/{config-dfb9bf07b13045f4.js → config-718cdc365de82689.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/infra/[context]-5fd3a453c079c2ea.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/infra-9f85c02c9c6cae9e.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-90f16972cbecf354.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs/pools/[pool]-2dd42fc37aad427a.js +16 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs-ed806aeace26b972.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/plugins/[...slug]-449a9f5a3bb20fb3.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/users-bec34706b36f3524.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/{volumes-b84b948ff357c43e.js → volumes-a83ba9b38dff7ea9.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/workspaces/{[name]-84a40f8c7c627fe4.js → [name]-c781e9c3e52ef9fc.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/workspaces-91e0942f47310aae.js +1 -0
- sky/dashboard/out/_next/static/chunks/webpack-cfe59cf684ee13b9.js +1 -0
- sky/dashboard/out/_next/static/css/b0dbca28f027cc19.css +3 -0
- sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
- sky/dashboard/out/clusters/[cluster].html +1 -1
- sky/dashboard/out/clusters.html +1 -1
- sky/dashboard/out/config.html +1 -1
- sky/dashboard/out/index.html +1 -1
- sky/dashboard/out/infra/[context].html +1 -1
- sky/dashboard/out/infra.html +1 -1
- sky/dashboard/out/jobs/[job].html +1 -1
- sky/dashboard/out/jobs/pools/[pool].html +1 -1
- sky/dashboard/out/jobs.html +1 -1
- sky/dashboard/out/plugins/[...slug].html +1 -0
- sky/dashboard/out/users.html +1 -1
- sky/dashboard/out/volumes.html +1 -1
- sky/dashboard/out/workspace/new.html +1 -1
- sky/dashboard/out/workspaces/[name].html +1 -1
- sky/dashboard/out/workspaces.html +1 -1
- sky/data/data_utils.py +26 -12
- sky/data/mounting_utils.py +44 -5
- sky/global_user_state.py +111 -19
- sky/jobs/client/sdk.py +8 -3
- sky/jobs/controller.py +191 -31
- sky/jobs/recovery_strategy.py +109 -11
- sky/jobs/server/core.py +81 -4
- sky/jobs/server/server.py +14 -0
- sky/jobs/state.py +417 -19
- sky/jobs/utils.py +73 -80
- sky/models.py +11 -0
- sky/optimizer.py +8 -6
- sky/provision/__init__.py +12 -9
- sky/provision/common.py +20 -0
- sky/provision/docker_utils.py +15 -2
- sky/provision/kubernetes/utils.py +163 -20
- sky/provision/kubernetes/volume.py +52 -17
- sky/provision/provisioner.py +17 -7
- sky/provision/runpod/instance.py +3 -1
- sky/provision/runpod/utils.py +13 -1
- sky/provision/runpod/volume.py +25 -9
- sky/provision/slurm/__init__.py +12 -0
- sky/provision/slurm/config.py +13 -0
- sky/provision/slurm/instance.py +618 -0
- sky/provision/slurm/utils.py +689 -0
- sky/provision/vast/instance.py +4 -1
- sky/provision/vast/utils.py +11 -6
- sky/resources.py +135 -13
- sky/schemas/api/responses.py +4 -0
- sky/schemas/db/global_user_state/010_save_ssh_key.py +1 -1
- sky/schemas/db/spot_jobs/008_add_full_resources.py +34 -0
- sky/schemas/db/spot_jobs/009_job_events.py +32 -0
- sky/schemas/db/spot_jobs/010_job_events_timestamp_with_timezone.py +43 -0
- sky/schemas/db/spot_jobs/011_add_links.py +34 -0
- sky/schemas/generated/jobsv1_pb2.py +9 -5
- sky/schemas/generated/jobsv1_pb2.pyi +12 -0
- sky/schemas/generated/jobsv1_pb2_grpc.py +44 -0
- sky/schemas/generated/managed_jobsv1_pb2.py +32 -28
- sky/schemas/generated/managed_jobsv1_pb2.pyi +11 -2
- sky/serve/serve_utils.py +232 -40
- sky/serve/server/impl.py +1 -1
- sky/server/common.py +17 -0
- sky/server/constants.py +1 -1
- sky/server/metrics.py +6 -3
- sky/server/plugins.py +238 -0
- sky/server/requests/executor.py +5 -2
- sky/server/requests/payloads.py +30 -1
- sky/server/requests/request_names.py +4 -0
- sky/server/requests/requests.py +33 -11
- sky/server/requests/serializers/encoders.py +22 -0
- sky/server/requests/serializers/return_value_serializers.py +70 -0
- sky/server/server.py +506 -109
- sky/server/server_utils.py +30 -0
- sky/server/uvicorn.py +5 -0
- sky/setup_files/MANIFEST.in +1 -0
- sky/setup_files/dependencies.py +22 -9
- sky/sky_logging.py +2 -1
- sky/skylet/attempt_skylet.py +13 -3
- sky/skylet/constants.py +55 -13
- sky/skylet/events.py +10 -4
- sky/skylet/executor/__init__.py +1 -0
- sky/skylet/executor/slurm.py +187 -0
- sky/skylet/job_lib.py +91 -5
- sky/skylet/log_lib.py +22 -6
- sky/skylet/log_lib.pyi +8 -6
- sky/skylet/services.py +18 -3
- sky/skylet/skylet.py +5 -1
- sky/skylet/subprocess_daemon.py +2 -1
- sky/ssh_node_pools/constants.py +12 -0
- sky/ssh_node_pools/core.py +40 -3
- sky/ssh_node_pools/deploy/__init__.py +4 -0
- sky/{utils/kubernetes/deploy_ssh_node_pools.py → ssh_node_pools/deploy/deploy.py} +279 -504
- sky/ssh_node_pools/deploy/tunnel/ssh-tunnel.sh +379 -0
- sky/ssh_node_pools/deploy/tunnel_utils.py +199 -0
- sky/ssh_node_pools/deploy/utils.py +173 -0
- sky/ssh_node_pools/server.py +11 -13
- sky/{utils/kubernetes/ssh_utils.py → ssh_node_pools/utils.py} +9 -6
- sky/templates/kubernetes-ray.yml.j2 +12 -6
- sky/templates/slurm-ray.yml.j2 +115 -0
- sky/templates/vast-ray.yml.j2 +1 -0
- sky/templates/websocket_proxy.py +18 -41
- sky/users/model.conf +1 -1
- sky/users/permission.py +85 -52
- sky/users/rbac.py +31 -3
- sky/utils/annotations.py +108 -8
- sky/utils/auth_utils.py +42 -0
- sky/utils/cli_utils/status_utils.py +19 -5
- sky/utils/cluster_utils.py +10 -3
- sky/utils/command_runner.py +389 -35
- sky/utils/command_runner.pyi +43 -4
- sky/utils/common_utils.py +47 -31
- sky/utils/context.py +32 -0
- sky/utils/db/db_utils.py +36 -6
- sky/utils/db/migration_utils.py +41 -21
- sky/utils/infra_utils.py +5 -1
- sky/utils/instance_links.py +139 -0
- sky/utils/interactive_utils.py +49 -0
- sky/utils/kubernetes/generate_kubeconfig.sh +42 -33
- sky/utils/kubernetes/kubernetes_deploy_utils.py +2 -94
- sky/utils/kubernetes/rsync_helper.sh +5 -1
- sky/utils/kubernetes/ssh-tunnel.sh +7 -376
- sky/utils/plugin_extensions/__init__.py +14 -0
- sky/utils/plugin_extensions/external_failure_source.py +176 -0
- sky/utils/resources_utils.py +10 -8
- sky/utils/rich_utils.py +9 -11
- sky/utils/schemas.py +93 -19
- sky/utils/status_lib.py +7 -0
- sky/utils/subprocess_utils.py +17 -0
- sky/volumes/client/sdk.py +6 -3
- sky/volumes/server/core.py +65 -27
- sky_templates/ray/start_cluster +8 -4
- {skypilot_nightly-1.0.0.dev20251203.dist-info → skypilot_nightly-1.0.0.dev20260112.dist-info}/METADATA +67 -59
- {skypilot_nightly-1.0.0.dev20251203.dist-info → skypilot_nightly-1.0.0.dev20260112.dist-info}/RECORD +208 -180
- sky/dashboard/out/_next/static/96_E2yl3QAiIJGOYCkSpB/_buildManifest.js +0 -1
- sky/dashboard/out/_next/static/chunks/1141-e6aa9ab418717c59.js +0 -11
- sky/dashboard/out/_next/static/chunks/1871-7e202677c42f43fe.js +0 -6
- sky/dashboard/out/_next/static/chunks/2260-7703229c33c5ebd5.js +0 -1
- sky/dashboard/out/_next/static/chunks/2350.fab69e61bac57b23.js +0 -1
- sky/dashboard/out/_next/static/chunks/2369.fc20f0c2c8ed9fe7.js +0 -15
- sky/dashboard/out/_next/static/chunks/2755.edd818326d489a1d.js +0 -26
- sky/dashboard/out/_next/static/chunks/3294.20a8540fe697d5ee.js +0 -1
- sky/dashboard/out/_next/static/chunks/3785.7e245f318f9d1121.js +0 -1
- sky/dashboard/out/_next/static/chunks/3800-7b45f9fbb6308557.js +0 -1
- sky/dashboard/out/_next/static/chunks/3850-ff4a9a69d978632b.js +0 -1
- sky/dashboard/out/_next/static/chunks/4725.172ede95d1b21022.js +0 -1
- sky/dashboard/out/_next/static/chunks/4937.a2baa2df5572a276.js +0 -15
- sky/dashboard/out/_next/static/chunks/6212-7bd06f60ba693125.js +0 -13
- sky/dashboard/out/_next/static/chunks/6856-8f27d1c10c98def8.js +0 -1
- sky/dashboard/out/_next/static/chunks/6989-01359c57e018caa4.js +0 -1
- sky/dashboard/out/_next/static/chunks/6990-9146207c4567fdfd.js +0 -1
- sky/dashboard/out/_next/static/chunks/7359-c8d04e06886000b3.js +0 -30
- sky/dashboard/out/_next/static/chunks/7411-b15471acd2cba716.js +0 -41
- sky/dashboard/out/_next/static/chunks/7615-019513abc55b3b47.js +0 -1
- sky/dashboard/out/_next/static/chunks/8969-452f9d5cbdd2dc73.js +0 -1
- sky/dashboard/out/_next/static/chunks/9025.fa408f3242e9028d.js +0 -6
- sky/dashboard/out/_next/static/chunks/9353-cff34f7e773b2e2b.js +0 -1
- sky/dashboard/out/_next/static/chunks/9360.a536cf6b1fa42355.js +0 -31
- sky/dashboard/out/_next/static/chunks/9847.3aaca6bb33455140.js +0 -30
- sky/dashboard/out/_next/static/chunks/pages/_app-bde01e4a2beec258.js +0 -34
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-792db96d918c98c9.js +0 -16
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-abfcac9c137aa543.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/infra/[context]-c0b5935149902e6f.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/infra-aed0ea19df7cf961.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-d66997e2bfc837cf.js +0 -16
- sky/dashboard/out/_next/static/chunks/pages/jobs/pools/[pool]-9faf940b253e3e06.js +0 -21
- sky/dashboard/out/_next/static/chunks/pages/jobs-2072b48b617989c9.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/users-f42674164aa73423.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/workspaces-531b2f8c4bf89f82.js +0 -1
- sky/dashboard/out/_next/static/chunks/webpack-64e05f17bf2cf8ce.js +0 -1
- sky/dashboard/out/_next/static/css/0748ce22df867032.css +0 -3
- /sky/dashboard/out/_next/static/{96_E2yl3QAiIJGOYCkSpB → 3nu-b8raeKRNABZ2d4GAG}/_ssgManifest.js +0 -0
- /sky/{utils/kubernetes → ssh_node_pools/deploy/tunnel}/cleanup-tunnel.sh +0 -0
- {skypilot_nightly-1.0.0.dev20251203.dist-info → skypilot_nightly-1.0.0.dev20260112.dist-info}/WHEEL +0 -0
- {skypilot_nightly-1.0.0.dev20251203.dist-info → skypilot_nightly-1.0.0.dev20260112.dist-info}/entry_points.txt +0 -0
- {skypilot_nightly-1.0.0.dev20251203.dist-info → skypilot_nightly-1.0.0.dev20260112.dist-info}/licenses/LICENSE +0 -0
- {skypilot_nightly-1.0.0.dev20251203.dist-info → skypilot_nightly-1.0.0.dev20260112.dist-info}/top_level.txt +0 -0
sky/jobs/utils.py
CHANGED

@@ -80,9 +80,8 @@ JOB_STARTED_STATUS_CHECK_GAP_SECONDS = 5

 _LOG_STREAM_CHECK_CONTROLLER_GAP_SECONDS = 5

-_JOB_STATUS_FETCH_MAX_RETRIES = 3
-_JOB_K8S_TRANSIENT_NW_MSG = 'Unable to connect to the server: dial tcp'
 _JOB_STATUS_FETCH_TIMEOUT_SECONDS = 30
+JOB_STATUS_FETCH_TOTAL_TIMEOUT_SECONDS = 60

 _JOB_WAITING_STATUS_MESSAGE = ux_utils.spinner_message(
     'Waiting for task to start[/]'
@@ -329,13 +328,21 @@ def ha_recovery_for_consolidation_mode() -> None:


 async def get_job_status(
-
-
+    backend: 'backends.CloudVmRayBackend', cluster_name: str,
+    job_id: Optional[int]
+) -> Tuple[Optional['job_lib.JobStatus'], Optional[str]]:
     """Check the status of the job running on a managed job cluster.

     It can be None, INIT, RUNNING, SUCCEEDED, FAILED, FAILED_DRIVER,
     FAILED_SETUP or CANCELLED.
+
+    Returns:
+        job_status: The status of the job.
+        transient_error_reason: None if successful or fatal error; otherwise,
+            the detailed reason for the transient error.
     """
+    # TODO(zhwu, cooperc): Make this get job status aware of cluster status, so
+    # that it can exit retry early if the cluster is down.
     # TODO(luca) make this async
     handle = await context_utils.to_thread(
         global_user_state.get_handle_from_cluster_name, cluster_name)
@@ -343,85 +350,68 @@ async def get_job_status(
         # This can happen if the cluster was preempted and background status
         # refresh already noticed and cleaned it up.
         logger.info(f'Cluster {cluster_name} not found.')
-        return None
+        return None, None
     assert isinstance(handle, backends.CloudVmRayResourceHandle), handle
     job_ids = None if job_id is None else [job_id]
-
-
-
-
-
-
-
-
-
-
-
-
+    try:
+        logger.info('=== Checking the job status... ===')
+        statuses = await asyncio.wait_for(
+            context_utils.to_thread(backend.get_job_status,
+                                    handle,
+                                    job_ids=job_ids,
+                                    stream_logs=False),
+            timeout=_JOB_STATUS_FETCH_TIMEOUT_SECONDS)
+        status = list(statuses.values())[0]
+        if status is None:
+            logger.info('No job found.')
+        else:
+            logger.info(f'Job status: {status}')
+        logger.info('=' * 34)
+        return status, None
+    except (exceptions.CommandError, grpc.RpcError, grpc.FutureTimeoutError,
+            ValueError, TypeError, asyncio.TimeoutError) as e:
+        # Note: Each of these exceptions has some additional conditions to
+        # limit how we handle it and whether or not we catch it.
+        potential_transient_error_reason = None
+        if isinstance(e, exceptions.CommandError):
+            returncode = e.returncode
+            potential_transient_error_reason = (f'Returncode: {returncode}. '
+                                                f'{e.detailed_reason}')
+        elif isinstance(e, grpc.RpcError):
+            potential_transient_error_reason = e.details()
+        elif isinstance(e, grpc.FutureTimeoutError):
+            potential_transient_error_reason = 'grpc timeout'
+        elif isinstance(e, asyncio.TimeoutError):
+            potential_transient_error_reason = (
+                'Job status check timed out after '
+                f'{_JOB_STATUS_FETCH_TIMEOUT_SECONDS}s')
+        # TODO(cooperc): Gracefully handle these exceptions in the backend.
+        elif isinstance(e, ValueError):
+            # If the cluster yaml is deleted in the middle of getting the
+            # SSH credentials, we could see this. See
+            # sky/global_user_state.py get_cluster_yaml_dict.
+            if re.search(r'Cluster yaml .* not found', str(e)):
+                potential_transient_error_reason = 'Cluster yaml was deleted'
             else:
-
-
-
-
-
-        #
-        #
-        #
-        #
-
-
-
-
-
-
-
-        elif isinstance(e, grpc.RpcError):
-            detailed_reason = e.details()
-            if e.code() in [
-                    grpc.StatusCode.UNAVAILABLE,
-                    grpc.StatusCode.DEADLINE_EXCEEDED
-            ]:
-                is_transient_error = True
-        elif isinstance(e, grpc.FutureTimeoutError):
-            detailed_reason = 'Timeout'
-        elif isinstance(e, asyncio.TimeoutError):
-            detailed_reason = ('Job status check timed out after '
-                               f'{_JOB_STATUS_FETCH_TIMEOUT_SECONDS}s')
-        # TODO(cooperc): Gracefully handle these exceptions in the backend.
-        elif isinstance(e, ValueError):
-            # If the cluster yaml is deleted in the middle of getting the
-            # SSH credentials, we could see this. See
-            # sky/global_user_state.py get_cluster_yaml_dict.
-            if re.search(r'Cluster yaml .* not found', str(e)):
-                detailed_reason = 'Cluster yaml was deleted'
-            else:
-                raise
-        elif isinstance(e, TypeError):
-            # We will grab the SSH credentials from the cluster yaml, but if
-            # handle.cluster_yaml is None, we will just return an empty dict
-            # for the credentials. See
-            # backend_utils.ssh_credential_from_yaml. Then, the credentials
-            # are passed as kwargs to SSHCommandRunner.__init__ - see
-            # cloud_vm_ray_backend.get_command_runners. So we can hit this
-            # TypeError if the cluster yaml is removed from the handle right
-            # when we pull it before the cluster is fully deleted.
-            error_msg_to_check = (
-                'SSHCommandRunner.__init__() missing 2 required positional '
-                'arguments: \'ssh_user\' and \'ssh_private_key\'')
-            if str(e) == error_msg_to_check:
-                detailed_reason = 'SSH credentials were already cleaned up'
-            else:
-                raise
-        if is_transient_error:
-            logger.info('Failed to connect to the cluster. Retrying '
-                        f'({i + 1}/{_JOB_STATUS_FETCH_MAX_RETRIES})...')
-            logger.info('=' * 34)
-            await asyncio.sleep(1)
+                raise
+        elif isinstance(e, TypeError):
+            # We will grab the SSH credentials from the cluster yaml, but if
+            # handle.cluster_yaml is None, we will just return an empty dict
+            # for the credentials. See
+            # backend_utils.ssh_credential_from_yaml. Then, the credentials
+            # are passed as kwargs to SSHCommandRunner.__init__ - see
+            # cloud_vm_ray_backend.get_command_runners. So we can hit this
+            # TypeError if the cluster yaml is removed from the handle right
+            # when we pull it before the cluster is fully deleted.
            error_msg_to_check = (
+                'SSHCommandRunner.__init__() missing 2 required positional '
+                'arguments: \'ssh_user\' and \'ssh_private_key\'')
+            if str(e) == error_msg_to_check:
+                potential_transient_error_reason = ('SSH credentials were '
+                                                    'already cleaned up')
             else:
-
-
-            return None
-    return None
+                raise
+    return None, potential_transient_error_reason


 def controller_process_alive(record: managed_job_state.ControllerPidRecord,
@@ -1570,6 +1560,7 @@ get_managed_job_queue(
                 handle.launched_resources.region,
                 handle.launched_resources.zone).formatted_str()
             job['accelerators'] = handle.launched_resources.accelerators
+            job['labels'] = handle.launched_resources.labels
         else:
             # FIXME(zongheng): display the last cached values for these.
             job['cluster_resources'] = '-'
@@ -1578,6 +1569,7 @@ get_managed_job_queue(
             job['region'] = '-'
             job['zone'] = '-'
             job['infra'] = '-'
+            job['labels'] = None

         if not fields or 'details' in fields:
             # Add details about schedule state / backoff.
@@ -1821,7 +1813,8 @@ format_job_table(
         for replica in replica_info:
             used_by = replica.get('used_by')
             if used_by is not None:
-
+                for job_id in used_by:
+                    job_to_worker[job_id] = replica.get('replica_id')
         return job_to_worker

     # Create mapping from job_id to worker replica_id
sky/models.py
CHANGED

@@ -68,6 +68,17 @@ class KubernetesNodeInfo:
     free: Dict[str, int]
     # IP address of the node (external IP preferred, fallback to internal IP)
     ip_address: Optional[str] = None
+    # CPU count (total CPUs available on the node)
+    cpu_count: Optional[float] = None
+    # Memory in GB (total memory available on the node)
+    memory_gb: Optional[float] = None
+    # Free CPU count (free CPUs available on the node after pod allocations)
+    cpu_free: Optional[float] = None
+    # Free memory in GB (free memory available on the node after pod
+    # allocations)
+    memory_free_gb: Optional[float] = None
+    # Whether the node is ready (all conditions are satisfied)
+    is_ready: bool = True


 @dataclasses.dataclass
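The new `KubernetesNodeInfo` fields are plain optional floats plus an `is_ready` flag, which lets consumers derive per-node utilization. The sketch below mirrors only the fields visible in this hunk in a standalone dataclass; `NodeCapacity` is a made-up name, and the real class carries additional fields not shown here.

```python
import dataclasses
from typing import Dict, Optional


@dataclasses.dataclass
class NodeCapacity:
    """Standalone stand-in mirroring the fields added to KubernetesNodeInfo."""
    free: Dict[str, int]
    ip_address: Optional[str] = None
    cpu_count: Optional[float] = None
    memory_gb: Optional[float] = None
    cpu_free: Optional[float] = None
    memory_free_gb: Optional[float] = None
    is_ready: bool = True

    def cpu_utilization(self) -> Optional[float]:
        # Fraction of CPUs already allocated to pods, if both totals are known.
        if not self.cpu_count or self.cpu_free is None:
            return None
        return 1.0 - self.cpu_free / self.cpu_count


node = NodeCapacity(free={'nvidia.com/gpu': 2},
                    ip_address='10.0.0.5',
                    cpu_count=16.0,
                    memory_gb=64.0,
                    cpu_free=4.0,
                    memory_free_gb=20.0)
print(f'{node.cpu_utilization():.0%} of CPUs allocated')  # -> 75%
```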
sky/optimizer.py
CHANGED

@@ -20,6 +20,7 @@ from sky.adaptors import common as adaptors_common
 from sky.clouds import cloud as sky_cloud
 from sky.usage import usage_lib
 from sky.utils import common
+from sky.utils import common_utils
 from sky.utils import env_options
 from sky.utils import log_utils
 from sky.utils import registry
@@ -781,7 +782,7 @@ class Optimizer:
     def _instance_type_str(resources: 'resources_lib.Resources') -> str:
         instance_type = resources.instance_type
         assert instance_type is not None, 'Instance type must be specified'
-        if isinstance(resources.cloud, clouds.Kubernetes):
+        if isinstance(resources.cloud, (clouds.Kubernetes, clouds.Slurm)):
             instance_type = '-'
         if resources.use_spot:
             instance_type = ''
@@ -865,11 +866,12 @@ class Optimizer:
                 'use_spot': resources.use_spot
             }

-            # Handle special case for Kubernetes and
-            if isinstance(resources.cloud, clouds.Kubernetes):
+            # Handle special case for Kubernetes, SSH, and SLURM clouds
+            if isinstance(resources.cloud, (clouds.Kubernetes, clouds.Slurm)):
                 # Region for Kubernetes-like clouds (SSH, Kubernetes) is the
-                # context name, i.e. different Kubernetes clusters.
-                #
+                # context name, i.e. different Kubernetes clusters.
+                # Region for SLURM is the cluster name.
+                # We add region to the key to show all the clusters in the
                 # optimizer table for better UX.

                 if resources.cloud.__class__.__name__ == 'SSH':
@@ -1289,7 +1291,7 @@ def _check_specified_regions(task: task_lib.Task) -> None:
             msg = f'Task{task_name} requires '
         if region not in existing_contexts:
             if is_ssh:
-                infra_str = f'SSH/{
+                infra_str = f'SSH/{common_utils.removeprefix(region, "ssh-")}'
             else:
                 infra_str = f'Kubernetes/{region}'
             logger.warning(f'{infra_str} is not enabled.')
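The `_check_specified_regions` change formats SSH node pool contexts by stripping their `ssh-` prefix via `common_utils.removeprefix`. Assuming that helper behaves like `str.removeprefix` (the local function and the sample context name below are stand-ins for illustration), the effect is:

```python
def removeprefix(value: str, prefix: str) -> str:
    """Drop a prefix if present; otherwise return the value unchanged."""
    return value[len(prefix):] if value.startswith(prefix) else value


# SSH node pool contexts carry an 'ssh-' prefix internally; the optimizer
# strips it when building the user-facing infra string.
region = 'ssh-my-node-pool'
infra_str = f'SSH/{removeprefix(region, "ssh-")}'
print(infra_str)  # -> SSH/my-node-pool
```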
sky/provision/__init__.py
CHANGED

@@ -6,7 +6,7 @@ providers supported by SkyPilot need to follow.
 import functools
 import inspect
 import typing
-from typing import Any, Dict, List, Optional, Tuple, Type
+from typing import Any, Dict, List, Optional, Set, Tuple, Type

 from sky import models
 from sky import sky_logging
@@ -29,6 +29,7 @@ from sky.provision import runpod
 from sky.provision import scp
 from sky.provision import seeweb
 from sky.provision import shadeform
+from sky.provision import slurm
 from sky.provision import ssh
 from sky.provision import vast
 from sky.provision import vsphere
@@ -151,16 +152,18 @@ def get_volume_usedby(
 @_route_to_cloud_impl
 def get_all_volumes_usedby(
     provider_name: str, configs: List[models.VolumeConfig]
-) -> Tuple[Dict[str, Any], Dict[str, Any]]:
-    """Get the usedby of
+) -> Tuple[Dict[str, Any], Dict[str, Any], Set[str]]:
+    """Get the usedby of all volumes.
+
+    Args:
+        provider_name: Name of the provider.
+        configs: List of VolumeConfig objects.

     Returns:
-        usedby_pods:
-
-
-
-            for a volume and a key containing clusters using
-            the volume.
+        usedby_pods: Dict of usedby pods.
+        usedby_clusters: Dict of usedby clusters.
+        failed_volume_names: Set of volume names whose usedby info
+            failed to fetch.
     """
     raise NotImplementedError

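`get_all_volumes_usedby` now returns a third value, the set of volume names whose usage lookup failed, so callers can distinguish "unused" from "unknown". A hypothetical consumer is sketched below; the dict shapes (volume name mapped to lists of pods/clusters) and the sample values are assumptions based on the docstring, not the concrete per-cloud return format.

```python
from typing import Any, Dict, Set


def report_volume_usage(usedby_pods: Dict[str, Any],
                        usedby_clusters: Dict[str, Any],
                        failed_volume_names: Set[str]) -> None:
    """Summarize the three values now returned by get_all_volumes_usedby."""
    for name, pods in usedby_pods.items():
        clusters = usedby_clusters.get(name, [])
        print(f'{name}: pods={pods}, clusters={clusters}')
    if failed_volume_names:
        # Volumes whose usedby info could not be fetched are surfaced
        # explicitly instead of silently showing up as unused.
        print('Could not fetch usage for: ' +
              ', '.join(sorted(failed_volume_names)))


report_volume_usage(
    usedby_pods={'vol-a': ['pod-1']},
    usedby_clusters={'vol-a': ['my-cluster']},
    failed_volume_names={'vol-b'},
)
```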
sky/provision/common.py
CHANGED

@@ -6,6 +6,7 @@ import os
 from typing import Any, Dict, List, Optional, Tuple

 from sky import sky_logging
+from sky.utils import config_utils
 from sky.utils import env_options
 from sky.utils import resources_utils

@@ -36,6 +37,13 @@ class StopFailoverError(Exception):
     """


+# These fields are sensitive and should be redacted from the config for logging
+# purposes.
+SENSITIVE_FIELDS = [
+    ('docker_config', 'docker_login_config', 'password'),
+]
+
+
 @dataclasses.dataclass
 class ProvisionConfig:
     """Configuration for provisioning."""
@@ -56,6 +64,18 @@ class ProvisionConfig:
     # Optional ports to open on launch of the cluster.
     ports_to_open_on_launch: Optional[List[int]]

+    def get_redacted_config(self) -> Dict[str, Any]:
+        """Get the redacted config."""
+        config = dataclasses.asdict(self)
+
+        config_copy = config_utils.Config(config)
+
+        for field_list in SENSITIVE_FIELDS:
+            val = config_copy.get_nested(field_list, default_value=None)
+            if val is not None:
+                config_copy.set_nested(field_list, '<redacted>')
+        return dict(**config_copy)
+

 # -------------------- output data model -------------------- #

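`ProvisionConfig.get_redacted_config` walks the nested key paths in `SENSITIVE_FIELDS` and masks any value that is present before the config is logged. A plain-dict sketch of the same idea, written without the real `config_utils.Config.get_nested`/`set_nested` helpers, looks roughly like this:

```python
import copy
from typing import Any, Dict, Sequence

SENSITIVE_FIELDS = [
    ('docker_config', 'docker_login_config', 'password'),
]


def redact_nested(config: Dict[str, Any],
                  paths: Sequence[Sequence[str]] = SENSITIVE_FIELDS
                 ) -> Dict[str, Any]:
    """Plain-dict sketch of the nested lookup/overwrite in get_redacted_config."""
    redacted = copy.deepcopy(config)
    for path in paths:
        node = redacted
        # Walk to the parent of the sensitive leaf; give up if a segment is
        # missing or not a mapping.
        for key in path[:-1]:
            node = node.get(key)
            if not isinstance(node, dict):
                node = None
                break
        if isinstance(node, dict) and path[-1] in node:
            node[path[-1]] = '<redacted>'
    return redacted


cfg = {
    'docker_config': {
        'docker_login_config': {'username': 'me', 'password': 'hunter2'}
    }
}
print(redact_nested(cfg)['docker_config']['docker_login_config'])
# -> {'username': 'me', 'password': '<redacted>'}
```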
sky/provision/docker_utils.py
CHANGED

@@ -176,6 +176,17 @@ def _with_interactive(cmd):
     return ['bash', '--login', '-c', '-i', shlex.quote(force_interactive)]


+def _redact_docker_password(cmd: str) -> str:
+    parts = shlex.split(cmd)
+    for i, part in enumerate(parts):
+        if part.startswith('--password'):
+            if part.startswith('--password='):
+                parts[i] = '--password=<redacted>'
+            elif i + 1 < len(parts):
+                parts[i + 1] = '<redacted>'
+    return ' '.join(parts)
+
+
 # SkyPilot: New class to initialize docker containers on a remote node.
 # Adopted from ray.autoscaler._private.command_runner.DockerCommandRunner.
 class DockerInitializer:
@@ -219,7 +230,9 @@ class DockerInitializer:
         cmd = (f'flock {flock_args} /tmp/{flock_name} '
                f'-c {shlex.quote(cmd)}')

-
+        # Redact the password in the login command.
+        redacted_cmd = _redact_docker_password(cmd)
+        logger.debug(f'+ {redacted_cmd}')
         start = time.time()
         while True:
             rc, stdout, stderr = self.runner.run(
@@ -251,7 +264,7 @@ class DockerInitializer:
                 break
             subprocess_utils.handle_returncode(
                 rc,
-
+                redacted_cmd,
                 error_msg='Failed to run docker setup commands.',
                 stderr=stdout + stderr,
                 # Print out the error message if the command failed.