skypilot-nightly 1.0.0.dev20250502__py3-none-any.whl → 1.0.0.dev20251203__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sky/__init__.py +22 -6
- sky/adaptors/aws.py +81 -16
- sky/adaptors/common.py +25 -2
- sky/adaptors/coreweave.py +278 -0
- sky/adaptors/do.py +8 -2
- sky/adaptors/gcp.py +11 -0
- sky/adaptors/hyperbolic.py +8 -0
- sky/adaptors/ibm.py +5 -2
- sky/adaptors/kubernetes.py +149 -18
- sky/adaptors/nebius.py +173 -30
- sky/adaptors/primeintellect.py +1 -0
- sky/adaptors/runpod.py +68 -0
- sky/adaptors/seeweb.py +183 -0
- sky/adaptors/shadeform.py +89 -0
- sky/admin_policy.py +187 -4
- sky/authentication.py +179 -225
- sky/backends/__init__.py +4 -2
- sky/backends/backend.py +22 -9
- sky/backends/backend_utils.py +1323 -397
- sky/backends/cloud_vm_ray_backend.py +1749 -1029
- sky/backends/docker_utils.py +1 -1
- sky/backends/local_docker_backend.py +11 -6
- sky/backends/task_codegen.py +633 -0
- sky/backends/wheel_utils.py +55 -9
- sky/{clouds/service_catalog → catalog}/__init__.py +21 -19
- sky/{clouds/service_catalog → catalog}/aws_catalog.py +27 -8
- sky/{clouds/service_catalog → catalog}/azure_catalog.py +10 -7
- sky/{clouds/service_catalog → catalog}/common.py +90 -49
- sky/{clouds/service_catalog → catalog}/cudo_catalog.py +8 -5
- sky/{clouds/service_catalog → catalog}/data_fetchers/analyze.py +1 -1
- sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_aws.py +116 -80
- sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_cudo.py +38 -38
- sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_gcp.py +70 -16
- sky/catalog/data_fetchers/fetch_hyperbolic.py +136 -0
- sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_lambda_cloud.py +1 -0
- sky/catalog/data_fetchers/fetch_nebius.py +338 -0
- sky/catalog/data_fetchers/fetch_runpod.py +698 -0
- sky/catalog/data_fetchers/fetch_seeweb.py +329 -0
- sky/catalog/data_fetchers/fetch_shadeform.py +142 -0
- sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_vast.py +1 -1
- sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_vsphere.py +1 -1
- sky/{clouds/service_catalog → catalog}/do_catalog.py +5 -2
- sky/{clouds/service_catalog → catalog}/fluidstack_catalog.py +6 -3
- sky/{clouds/service_catalog → catalog}/gcp_catalog.py +41 -15
- sky/catalog/hyperbolic_catalog.py +136 -0
- sky/{clouds/service_catalog → catalog}/ibm_catalog.py +9 -6
- sky/{clouds/service_catalog → catalog}/kubernetes_catalog.py +36 -24
- sky/{clouds/service_catalog → catalog}/lambda_catalog.py +9 -6
- sky/{clouds/service_catalog → catalog}/nebius_catalog.py +9 -7
- sky/{clouds/service_catalog → catalog}/oci_catalog.py +9 -6
- sky/{clouds/service_catalog → catalog}/paperspace_catalog.py +5 -2
- sky/catalog/primeintellect_catalog.py +95 -0
- sky/{clouds/service_catalog → catalog}/runpod_catalog.py +11 -4
- sky/{clouds/service_catalog → catalog}/scp_catalog.py +9 -6
- sky/catalog/seeweb_catalog.py +184 -0
- sky/catalog/shadeform_catalog.py +165 -0
- sky/catalog/ssh_catalog.py +167 -0
- sky/{clouds/service_catalog → catalog}/vast_catalog.py +6 -3
- sky/{clouds/service_catalog → catalog}/vsphere_catalog.py +5 -2
- sky/check.py +533 -185
- sky/cli.py +5 -5975
- sky/client/{cli.py → cli/command.py} +2591 -1956
- sky/client/cli/deprecation_utils.py +99 -0
- sky/client/cli/flags.py +359 -0
- sky/client/cli/table_utils.py +322 -0
- sky/client/cli/utils.py +79 -0
- sky/client/common.py +78 -32
- sky/client/oauth.py +82 -0
- sky/client/sdk.py +1219 -319
- sky/client/sdk_async.py +827 -0
- sky/client/service_account_auth.py +47 -0
- sky/cloud_stores.py +82 -3
- sky/clouds/__init__.py +13 -0
- sky/clouds/aws.py +564 -164
- sky/clouds/azure.py +105 -83
- sky/clouds/cloud.py +140 -40
- sky/clouds/cudo.py +68 -50
- sky/clouds/do.py +66 -48
- sky/clouds/fluidstack.py +63 -44
- sky/clouds/gcp.py +339 -110
- sky/clouds/hyperbolic.py +293 -0
- sky/clouds/ibm.py +70 -49
- sky/clouds/kubernetes.py +570 -162
- sky/clouds/lambda_cloud.py +74 -54
- sky/clouds/nebius.py +210 -81
- sky/clouds/oci.py +88 -66
- sky/clouds/paperspace.py +61 -44
- sky/clouds/primeintellect.py +317 -0
- sky/clouds/runpod.py +164 -74
- sky/clouds/scp.py +89 -86
- sky/clouds/seeweb.py +477 -0
- sky/clouds/shadeform.py +400 -0
- sky/clouds/ssh.py +263 -0
- sky/clouds/utils/aws_utils.py +10 -4
- sky/clouds/utils/gcp_utils.py +87 -11
- sky/clouds/utils/oci_utils.py +38 -14
- sky/clouds/utils/scp_utils.py +231 -167
- sky/clouds/vast.py +99 -77
- sky/clouds/vsphere.py +51 -40
- sky/core.py +375 -173
- sky/dag.py +15 -0
- sky/dashboard/out/404.html +1 -1
- sky/dashboard/out/_next/static/96_E2yl3QAiIJGOYCkSpB/_buildManifest.js +1 -0
- sky/dashboard/out/_next/static/chunks/1141-e6aa9ab418717c59.js +11 -0
- sky/dashboard/out/_next/static/chunks/1272-1ef0bf0237faccdb.js +1 -0
- sky/dashboard/out/_next/static/chunks/1871-7e202677c42f43fe.js +6 -0
- sky/dashboard/out/_next/static/chunks/2260-7703229c33c5ebd5.js +1 -0
- sky/dashboard/out/_next/static/chunks/2350.fab69e61bac57b23.js +1 -0
- sky/dashboard/out/_next/static/chunks/2369.fc20f0c2c8ed9fe7.js +15 -0
- sky/dashboard/out/_next/static/chunks/2755.edd818326d489a1d.js +26 -0
- sky/dashboard/out/_next/static/chunks/3294.20a8540fe697d5ee.js +1 -0
- sky/dashboard/out/_next/static/chunks/3785.7e245f318f9d1121.js +1 -0
- sky/dashboard/out/_next/static/chunks/3800-7b45f9fbb6308557.js +1 -0
- sky/dashboard/out/_next/static/chunks/3850-ff4a9a69d978632b.js +1 -0
- sky/dashboard/out/_next/static/chunks/3937.210053269f121201.js +1 -0
- sky/dashboard/out/_next/static/chunks/4725.172ede95d1b21022.js +1 -0
- sky/dashboard/out/_next/static/chunks/4937.a2baa2df5572a276.js +15 -0
- sky/dashboard/out/_next/static/chunks/5739-d67458fcb1386c92.js +8 -0
- sky/dashboard/out/_next/static/chunks/6130-2be46d70a38f1e82.js +1 -0
- sky/dashboard/out/_next/static/chunks/616-3d59f75e2ccf9321.js +39 -0
- sky/dashboard/out/_next/static/chunks/6212-7bd06f60ba693125.js +13 -0
- sky/dashboard/out/_next/static/chunks/6856-8f27d1c10c98def8.js +1 -0
- sky/dashboard/out/_next/static/chunks/6989-01359c57e018caa4.js +1 -0
- sky/dashboard/out/_next/static/chunks/6990-9146207c4567fdfd.js +1 -0
- sky/dashboard/out/_next/static/chunks/7359-c8d04e06886000b3.js +30 -0
- sky/dashboard/out/_next/static/chunks/7411-b15471acd2cba716.js +41 -0
- sky/dashboard/out/_next/static/chunks/7615-019513abc55b3b47.js +1 -0
- sky/dashboard/out/_next/static/chunks/8640.5b9475a2d18c5416.js +16 -0
- sky/dashboard/out/_next/static/chunks/8969-452f9d5cbdd2dc73.js +1 -0
- sky/dashboard/out/_next/static/chunks/9025.fa408f3242e9028d.js +6 -0
- sky/dashboard/out/_next/static/chunks/9353-cff34f7e773b2e2b.js +1 -0
- sky/dashboard/out/_next/static/chunks/9360.a536cf6b1fa42355.js +31 -0
- sky/dashboard/out/_next/static/chunks/9847.3aaca6bb33455140.js +30 -0
- sky/dashboard/out/_next/static/chunks/fd9d1056-86323a29a8f7e46a.js +1 -0
- sky/dashboard/out/_next/static/chunks/framework-cf60a09ccd051a10.js +33 -0
- sky/dashboard/out/_next/static/chunks/main-app-587214043926b3cc.js +1 -0
- sky/dashboard/out/_next/static/chunks/main-f15ccb73239a3bf1.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/_app-bde01e4a2beec258.js +34 -0
- sky/dashboard/out/_next/static/chunks/pages/_error-c66a4e8afc46f17b.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-792db96d918c98c9.js +16 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-abfcac9c137aa543.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters-ee39056f9851a3ff.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/config-dfb9bf07b13045f4.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/index-444f1804401f04ea.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/infra/[context]-c0b5935149902e6f.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/infra-aed0ea19df7cf961.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-d66997e2bfc837cf.js +16 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs/pools/[pool]-9faf940b253e3e06.js +21 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs-2072b48b617989c9.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/users-f42674164aa73423.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/volumes-b84b948ff357c43e.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/workspace/new-3f88a1c7e86a3f86.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/workspaces/[name]-84a40f8c7c627fe4.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/workspaces-531b2f8c4bf89f82.js +1 -0
- sky/dashboard/out/_next/static/chunks/webpack-64e05f17bf2cf8ce.js +1 -0
- sky/dashboard/out/_next/static/css/0748ce22df867032.css +3 -0
- sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
- sky/dashboard/out/clusters/[cluster].html +1 -1
- sky/dashboard/out/clusters.html +1 -1
- sky/dashboard/out/config.html +1 -0
- sky/dashboard/out/index.html +1 -1
- sky/dashboard/out/infra/[context].html +1 -0
- sky/dashboard/out/infra.html +1 -0
- sky/dashboard/out/jobs/[job].html +1 -1
- sky/dashboard/out/jobs/pools/[pool].html +1 -0
- sky/dashboard/out/jobs.html +1 -1
- sky/dashboard/out/users.html +1 -0
- sky/dashboard/out/volumes.html +1 -0
- sky/dashboard/out/workspace/new.html +1 -0
- sky/dashboard/out/workspaces/[name].html +1 -0
- sky/dashboard/out/workspaces.html +1 -0
- sky/data/data_utils.py +137 -1
- sky/data/mounting_utils.py +269 -84
- sky/data/storage.py +1460 -1807
- sky/data/storage_utils.py +43 -57
- sky/exceptions.py +126 -2
- sky/execution.py +216 -63
- sky/global_user_state.py +2390 -586
- sky/jobs/__init__.py +7 -0
- sky/jobs/client/sdk.py +300 -58
- sky/jobs/client/sdk_async.py +161 -0
- sky/jobs/constants.py +15 -8
- sky/jobs/controller.py +848 -275
- sky/jobs/file_content_utils.py +128 -0
- sky/jobs/log_gc.py +193 -0
- sky/jobs/recovery_strategy.py +402 -152
- sky/jobs/scheduler.py +314 -189
- sky/jobs/server/core.py +836 -255
- sky/jobs/server/server.py +156 -115
- sky/jobs/server/utils.py +136 -0
- sky/jobs/state.py +2109 -706
- sky/jobs/utils.py +1306 -215
- sky/logs/__init__.py +21 -0
- sky/logs/agent.py +108 -0
- sky/logs/aws.py +243 -0
- sky/logs/gcp.py +91 -0
- sky/metrics/__init__.py +0 -0
- sky/metrics/utils.py +453 -0
- sky/models.py +78 -1
- sky/optimizer.py +164 -70
- sky/provision/__init__.py +90 -4
- sky/provision/aws/config.py +147 -26
- sky/provision/aws/instance.py +136 -50
- sky/provision/azure/instance.py +11 -6
- sky/provision/common.py +13 -1
- sky/provision/cudo/cudo_machine_type.py +1 -1
- sky/provision/cudo/cudo_utils.py +14 -8
- sky/provision/cudo/cudo_wrapper.py +72 -71
- sky/provision/cudo/instance.py +10 -6
- sky/provision/do/instance.py +10 -6
- sky/provision/do/utils.py +4 -3
- sky/provision/docker_utils.py +140 -33
- sky/provision/fluidstack/instance.py +13 -8
- sky/provision/gcp/__init__.py +1 -0
- sky/provision/gcp/config.py +301 -19
- sky/provision/gcp/constants.py +218 -0
- sky/provision/gcp/instance.py +36 -8
- sky/provision/gcp/instance_utils.py +18 -4
- sky/provision/gcp/volume_utils.py +247 -0
- sky/provision/hyperbolic/__init__.py +12 -0
- sky/provision/hyperbolic/config.py +10 -0
- sky/provision/hyperbolic/instance.py +437 -0
- sky/provision/hyperbolic/utils.py +373 -0
- sky/provision/instance_setup.py +101 -20
- sky/provision/kubernetes/__init__.py +5 -0
- sky/provision/kubernetes/config.py +9 -52
- sky/provision/kubernetes/constants.py +17 -0
- sky/provision/kubernetes/instance.py +919 -280
- sky/provision/kubernetes/manifests/fusermount-server-daemonset.yaml +1 -2
- sky/provision/kubernetes/network.py +27 -17
- sky/provision/kubernetes/network_utils.py +44 -43
- sky/provision/kubernetes/utils.py +1221 -534
- sky/provision/kubernetes/volume.py +343 -0
- sky/provision/lambda_cloud/instance.py +22 -16
- sky/provision/nebius/constants.py +50 -0
- sky/provision/nebius/instance.py +19 -6
- sky/provision/nebius/utils.py +237 -137
- sky/provision/oci/instance.py +10 -5
- sky/provision/paperspace/instance.py +10 -7
- sky/provision/paperspace/utils.py +1 -1
- sky/provision/primeintellect/__init__.py +10 -0
- sky/provision/primeintellect/config.py +11 -0
- sky/provision/primeintellect/instance.py +454 -0
- sky/provision/primeintellect/utils.py +398 -0
- sky/provision/provisioner.py +117 -36
- sky/provision/runpod/__init__.py +5 -0
- sky/provision/runpod/instance.py +27 -6
- sky/provision/runpod/utils.py +51 -18
- sky/provision/runpod/volume.py +214 -0
- sky/provision/scp/__init__.py +15 -0
- sky/provision/scp/config.py +93 -0
- sky/provision/scp/instance.py +707 -0
- sky/provision/seeweb/__init__.py +11 -0
- sky/provision/seeweb/config.py +13 -0
- sky/provision/seeweb/instance.py +812 -0
- sky/provision/shadeform/__init__.py +11 -0
- sky/provision/shadeform/config.py +12 -0
- sky/provision/shadeform/instance.py +351 -0
- sky/provision/shadeform/shadeform_utils.py +83 -0
- sky/provision/ssh/__init__.py +18 -0
- sky/provision/vast/instance.py +13 -8
- sky/provision/vast/utils.py +10 -7
- sky/provision/volume.py +164 -0
- sky/provision/vsphere/common/ssl_helper.py +1 -1
- sky/provision/vsphere/common/vapiconnect.py +2 -1
- sky/provision/vsphere/common/vim_utils.py +4 -4
- sky/provision/vsphere/instance.py +15 -10
- sky/provision/vsphere/vsphere_utils.py +17 -20
- sky/py.typed +0 -0
- sky/resources.py +845 -119
- sky/schemas/__init__.py +0 -0
- sky/schemas/api/__init__.py +0 -0
- sky/schemas/api/responses.py +227 -0
- sky/schemas/db/README +4 -0
- sky/schemas/db/env.py +90 -0
- sky/schemas/db/global_user_state/001_initial_schema.py +124 -0
- sky/schemas/db/global_user_state/002_add_workspace_to_cluster_history.py +35 -0
- sky/schemas/db/global_user_state/003_fix_initial_revision.py +61 -0
- sky/schemas/db/global_user_state/004_is_managed.py +34 -0
- sky/schemas/db/global_user_state/005_cluster_event.py +32 -0
- sky/schemas/db/global_user_state/006_provision_log.py +41 -0
- sky/schemas/db/global_user_state/007_cluster_event_request_id.py +34 -0
- sky/schemas/db/global_user_state/008_skylet_ssh_tunnel_metadata.py +34 -0
- sky/schemas/db/global_user_state/009_last_activity_and_launched_at.py +89 -0
- sky/schemas/db/global_user_state/010_save_ssh_key.py +66 -0
- sky/schemas/db/global_user_state/011_is_ephemeral.py +34 -0
- sky/schemas/db/kv_cache/001_initial_schema.py +29 -0
- sky/schemas/db/script.py.mako +28 -0
- sky/schemas/db/serve_state/001_initial_schema.py +67 -0
- sky/schemas/db/serve_state/002_yaml_content.py +34 -0
- sky/schemas/db/skypilot_config/001_initial_schema.py +30 -0
- sky/schemas/db/spot_jobs/001_initial_schema.py +97 -0
- sky/schemas/db/spot_jobs/002_cluster_pool.py +42 -0
- sky/schemas/db/spot_jobs/003_pool_hash.py +34 -0
- sky/schemas/db/spot_jobs/004_job_file_contents.py +42 -0
- sky/schemas/db/spot_jobs/005_logs_gc.py +38 -0
- sky/schemas/db/spot_jobs/006_controller_pid_started_at.py +34 -0
- sky/schemas/db/spot_jobs/007_config_file_content.py +34 -0
- sky/schemas/generated/__init__.py +0 -0
- sky/schemas/generated/autostopv1_pb2.py +36 -0
- sky/schemas/generated/autostopv1_pb2.pyi +43 -0
- sky/schemas/generated/autostopv1_pb2_grpc.py +146 -0
- sky/schemas/generated/jobsv1_pb2.py +86 -0
- sky/schemas/generated/jobsv1_pb2.pyi +254 -0
- sky/schemas/generated/jobsv1_pb2_grpc.py +542 -0
- sky/schemas/generated/managed_jobsv1_pb2.py +76 -0
- sky/schemas/generated/managed_jobsv1_pb2.pyi +278 -0
- sky/schemas/generated/managed_jobsv1_pb2_grpc.py +278 -0
- sky/schemas/generated/servev1_pb2.py +58 -0
- sky/schemas/generated/servev1_pb2.pyi +115 -0
- sky/schemas/generated/servev1_pb2_grpc.py +322 -0
- sky/serve/autoscalers.py +357 -5
- sky/serve/client/impl.py +310 -0
- sky/serve/client/sdk.py +47 -139
- sky/serve/client/sdk_async.py +130 -0
- sky/serve/constants.py +12 -9
- sky/serve/controller.py +68 -17
- sky/serve/load_balancer.py +106 -60
- sky/serve/load_balancing_policies.py +116 -2
- sky/serve/replica_managers.py +434 -249
- sky/serve/serve_rpc_utils.py +179 -0
- sky/serve/serve_state.py +569 -257
- sky/serve/serve_utils.py +775 -265
- sky/serve/server/core.py +66 -711
- sky/serve/server/impl.py +1093 -0
- sky/serve/server/server.py +21 -18
- sky/serve/service.py +192 -89
- sky/serve/service_spec.py +144 -20
- sky/serve/spot_placer.py +3 -0
- sky/server/auth/__init__.py +0 -0
- sky/server/auth/authn.py +50 -0
- sky/server/auth/loopback.py +38 -0
- sky/server/auth/oauth2_proxy.py +202 -0
- sky/server/common.py +478 -182
- sky/server/config.py +85 -23
- sky/server/constants.py +44 -6
- sky/server/daemons.py +295 -0
- sky/server/html/token_page.html +185 -0
- sky/server/metrics.py +160 -0
- sky/server/middleware_utils.py +166 -0
- sky/server/requests/executor.py +558 -138
- sky/server/requests/payloads.py +364 -24
- sky/server/requests/preconditions.py +21 -17
- sky/server/requests/process.py +112 -29
- sky/server/requests/request_names.py +121 -0
- sky/server/requests/requests.py +822 -226
- sky/server/requests/serializers/decoders.py +82 -31
- sky/server/requests/serializers/encoders.py +140 -22
- sky/server/requests/threads.py +117 -0
- sky/server/rest.py +455 -0
- sky/server/server.py +1309 -285
- sky/server/state.py +20 -0
- sky/server/stream_utils.py +327 -61
- sky/server/uvicorn.py +217 -3
- sky/server/versions.py +270 -0
- sky/setup_files/MANIFEST.in +11 -1
- sky/setup_files/alembic.ini +160 -0
- sky/setup_files/dependencies.py +139 -31
- sky/setup_files/setup.py +44 -42
- sky/sky_logging.py +114 -7
- sky/skylet/attempt_skylet.py +106 -24
- sky/skylet/autostop_lib.py +129 -8
- sky/skylet/configs.py +29 -20
- sky/skylet/constants.py +216 -25
- sky/skylet/events.py +101 -21
- sky/skylet/job_lib.py +345 -164
- sky/skylet/log_lib.py +297 -18
- sky/skylet/log_lib.pyi +44 -1
- sky/skylet/providers/ibm/node_provider.py +12 -8
- sky/skylet/providers/ibm/vpc_provider.py +13 -12
- sky/skylet/ray_patches/__init__.py +17 -3
- sky/skylet/ray_patches/autoscaler.py.diff +18 -0
- sky/skylet/ray_patches/cli.py.diff +19 -0
- sky/skylet/ray_patches/command_runner.py.diff +17 -0
- sky/skylet/ray_patches/log_monitor.py.diff +20 -0
- sky/skylet/ray_patches/resource_demand_scheduler.py.diff +32 -0
- sky/skylet/ray_patches/updater.py.diff +18 -0
- sky/skylet/ray_patches/worker.py.diff +41 -0
- sky/skylet/runtime_utils.py +21 -0
- sky/skylet/services.py +568 -0
- sky/skylet/skylet.py +72 -4
- sky/skylet/subprocess_daemon.py +104 -29
- sky/skypilot_config.py +506 -99
- sky/ssh_node_pools/__init__.py +1 -0
- sky/ssh_node_pools/core.py +135 -0
- sky/ssh_node_pools/server.py +233 -0
- sky/task.py +685 -163
- sky/templates/aws-ray.yml.j2 +11 -3
- sky/templates/azure-ray.yml.j2 +2 -1
- sky/templates/cudo-ray.yml.j2 +1 -0
- sky/templates/do-ray.yml.j2 +2 -1
- sky/templates/fluidstack-ray.yml.j2 +1 -0
- sky/templates/gcp-ray.yml.j2 +62 -1
- sky/templates/hyperbolic-ray.yml.j2 +68 -0
- sky/templates/ibm-ray.yml.j2 +2 -1
- sky/templates/jobs-controller.yaml.j2 +27 -24
- sky/templates/kubernetes-loadbalancer.yml.j2 +2 -0
- sky/templates/kubernetes-ray.yml.j2 +611 -50
- sky/templates/lambda-ray.yml.j2 +2 -1
- sky/templates/nebius-ray.yml.j2 +34 -12
- sky/templates/oci-ray.yml.j2 +1 -0
- sky/templates/paperspace-ray.yml.j2 +2 -1
- sky/templates/primeintellect-ray.yml.j2 +72 -0
- sky/templates/runpod-ray.yml.j2 +10 -1
- sky/templates/scp-ray.yml.j2 +4 -50
- sky/templates/seeweb-ray.yml.j2 +171 -0
- sky/templates/shadeform-ray.yml.j2 +73 -0
- sky/templates/sky-serve-controller.yaml.j2 +22 -2
- sky/templates/vast-ray.yml.j2 +1 -0
- sky/templates/vsphere-ray.yml.j2 +1 -0
- sky/templates/websocket_proxy.py +212 -37
- sky/usage/usage_lib.py +31 -15
- sky/users/__init__.py +0 -0
- sky/users/model.conf +15 -0
- sky/users/permission.py +397 -0
- sky/users/rbac.py +121 -0
- sky/users/server.py +720 -0
- sky/users/token_service.py +218 -0
- sky/utils/accelerator_registry.py +35 -5
- sky/utils/admin_policy_utils.py +84 -38
- sky/utils/annotations.py +38 -5
- sky/utils/asyncio_utils.py +78 -0
- sky/utils/atomic.py +1 -1
- sky/utils/auth_utils.py +153 -0
- sky/utils/benchmark_utils.py +60 -0
- sky/utils/cli_utils/status_utils.py +159 -86
- sky/utils/cluster_utils.py +31 -9
- sky/utils/command_runner.py +354 -68
- sky/utils/command_runner.pyi +93 -3
- sky/utils/common.py +35 -8
- sky/utils/common_utils.py +314 -91
- sky/utils/config_utils.py +74 -5
- sky/utils/context.py +403 -0
- sky/utils/context_utils.py +242 -0
- sky/utils/controller_utils.py +383 -89
- sky/utils/dag_utils.py +31 -12
- sky/utils/db/__init__.py +0 -0
- sky/utils/db/db_utils.py +485 -0
- sky/utils/db/kv_cache.py +149 -0
- sky/utils/db/migration_utils.py +137 -0
- sky/utils/directory_utils.py +12 -0
- sky/utils/env_options.py +13 -0
- sky/utils/git.py +567 -0
- sky/utils/git_clone.sh +460 -0
- sky/utils/infra_utils.py +195 -0
- sky/utils/kubernetes/cleanup-tunnel.sh +62 -0
- sky/utils/kubernetes/config_map_utils.py +133 -0
- sky/utils/kubernetes/create_cluster.sh +15 -29
- sky/utils/kubernetes/delete_cluster.sh +10 -7
- sky/utils/kubernetes/deploy_ssh_node_pools.py +1177 -0
- sky/utils/kubernetes/exec_kubeconfig_converter.py +22 -31
- sky/utils/kubernetes/generate_kind_config.py +6 -66
- sky/utils/kubernetes/generate_kubeconfig.sh +4 -1
- sky/utils/kubernetes/gpu_labeler.py +18 -8
- sky/utils/kubernetes/k8s_gpu_labeler_job.yaml +2 -1
- sky/utils/kubernetes/k8s_gpu_labeler_setup.yaml +16 -16
- sky/utils/kubernetes/kubernetes_deploy_utils.py +284 -114
- sky/utils/kubernetes/rsync_helper.sh +11 -3
- sky/utils/kubernetes/ssh-tunnel.sh +379 -0
- sky/utils/kubernetes/ssh_utils.py +221 -0
- sky/utils/kubernetes_enums.py +8 -15
- sky/utils/lock_events.py +94 -0
- sky/utils/locks.py +416 -0
- sky/utils/log_utils.py +82 -107
- sky/utils/perf_utils.py +22 -0
- sky/utils/resource_checker.py +298 -0
- sky/utils/resources_utils.py +249 -32
- sky/utils/rich_utils.py +217 -39
- sky/utils/schemas.py +955 -160
- sky/utils/serialize_utils.py +16 -0
- sky/utils/status_lib.py +10 -0
- sky/utils/subprocess_utils.py +29 -15
- sky/utils/tempstore.py +70 -0
- sky/utils/thread_utils.py +91 -0
- sky/utils/timeline.py +26 -53
- sky/utils/ux_utils.py +84 -15
- sky/utils/validator.py +11 -1
- sky/utils/volume.py +165 -0
- sky/utils/yaml_utils.py +111 -0
- sky/volumes/__init__.py +13 -0
- sky/volumes/client/__init__.py +0 -0
- sky/volumes/client/sdk.py +150 -0
- sky/volumes/server/__init__.py +0 -0
- sky/volumes/server/core.py +270 -0
- sky/volumes/server/server.py +124 -0
- sky/volumes/volume.py +215 -0
- sky/workspaces/__init__.py +0 -0
- sky/workspaces/core.py +655 -0
- sky/workspaces/server.py +101 -0
- sky/workspaces/utils.py +56 -0
- sky_templates/README.md +3 -0
- sky_templates/__init__.py +3 -0
- sky_templates/ray/__init__.py +0 -0
- sky_templates/ray/start_cluster +183 -0
- sky_templates/ray/stop_cluster +75 -0
- skypilot_nightly-1.0.0.dev20251203.dist-info/METADATA +676 -0
- skypilot_nightly-1.0.0.dev20251203.dist-info/RECORD +611 -0
- {skypilot_nightly-1.0.0.dev20250502.dist-info → skypilot_nightly-1.0.0.dev20251203.dist-info}/WHEEL +1 -1
- skypilot_nightly-1.0.0.dev20251203.dist-info/top_level.txt +2 -0
- sky/benchmark/benchmark_state.py +0 -256
- sky/benchmark/benchmark_utils.py +0 -641
- sky/clouds/service_catalog/constants.py +0 -7
- sky/dashboard/out/_next/static/GWvVBSCS7FmUiVmjaL1a7/_buildManifest.js +0 -1
- sky/dashboard/out/_next/static/chunks/236-2db3ee3fba33dd9e.js +0 -6
- sky/dashboard/out/_next/static/chunks/312-c3c8845990db8ffc.js +0 -15
- sky/dashboard/out/_next/static/chunks/37-0a572fe0dbb89c4d.js +0 -6
- sky/dashboard/out/_next/static/chunks/678-206dddca808e6d16.js +0 -59
- sky/dashboard/out/_next/static/chunks/845-9e60713e0c441abc.js +0 -1
- sky/dashboard/out/_next/static/chunks/979-7bf73a4c7cea0f5c.js +0 -1
- sky/dashboard/out/_next/static/chunks/fd9d1056-2821b0f0cabcd8bd.js +0 -1
- sky/dashboard/out/_next/static/chunks/framework-87d061ee6ed71b28.js +0 -33
- sky/dashboard/out/_next/static/chunks/main-app-241eb28595532291.js +0 -1
- sky/dashboard/out/_next/static/chunks/main-e0e2335212e72357.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/_app-e6b013bc3f77ad60.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/_error-1be831200e60c5c0.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-6ac338bc2239cb45.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-f383db7389368ea7.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/clusters-a93b93e10b8b074e.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/index-f9f039532ca8cbc4.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-1c519e1afc523dc9.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/jobs-a75029b67aab6a2e.js +0 -1
- sky/dashboard/out/_next/static/chunks/webpack-830f59b8404e96b8.js +0 -1
- sky/dashboard/out/_next/static/css/c6933bbb2ce7f4dd.css +0 -3
- sky/jobs/dashboard/dashboard.py +0 -223
- sky/jobs/dashboard/static/favicon.ico +0 -0
- sky/jobs/dashboard/templates/index.html +0 -831
- sky/jobs/server/dashboard_utils.py +0 -69
- sky/skylet/providers/scp/__init__.py +0 -2
- sky/skylet/providers/scp/config.py +0 -149
- sky/skylet/providers/scp/node_provider.py +0 -578
- sky/templates/kubernetes-ssh-jump.yml.j2 +0 -94
- sky/utils/db_utils.py +0 -100
- sky/utils/kubernetes/deploy_remote_cluster.sh +0 -308
- sky/utils/kubernetes/ssh_jump_lifecycle_manager.py +0 -191
- skypilot_nightly-1.0.0.dev20250502.dist-info/METADATA +0 -361
- skypilot_nightly-1.0.0.dev20250502.dist-info/RECORD +0 -396
- skypilot_nightly-1.0.0.dev20250502.dist-info/top_level.txt +0 -1
- /sky/{clouds/service_catalog → catalog}/config.py +0 -0
- /sky/{benchmark → catalog/data_fetchers}/__init__.py +0 -0
- /sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_azure.py +0 -0
- /sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_fluidstack.py +0 -0
- /sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_ibm.py +0 -0
- /sky/{clouds/service_catalog/data_fetchers → client/cli}/__init__.py +0 -0
- /sky/dashboard/out/_next/static/{GWvVBSCS7FmUiVmjaL1a7 → 96_E2yl3QAiIJGOYCkSpB}/_ssgManifest.js +0 -0
- {skypilot_nightly-1.0.0.dev20250502.dist-info → skypilot_nightly-1.0.0.dev20251203.dist-info}/entry_points.txt +0 -0
- {skypilot_nightly-1.0.0.dev20250502.dist-info → skypilot_nightly-1.0.0.dev20251203.dist-info}/licenses/LICENSE +0 -0
sky/core.py
CHANGED
@@ -1,6 +1,4 @@
 """SDK functions for cluster/job management."""
-import os
-import shlex
 import typing
 from typing import Any, Dict, List, Optional, Tuple, Union

@@ -8,7 +6,7 @@ import colorama

 from sky import admin_policy
 from sky import backends
-from sky import …
+from sky import catalog
 from sky import clouds
 from sky import dag as dag_lib
 from sky import data
@@ -17,21 +15,26 @@ from sky import global_user_state
 from sky import models
 from sky import optimizer
 from sky import sky_logging
+from sky import skypilot_config
 from sky import task as task_lib
+from sky.adaptors import common as adaptors_common
 from sky.backends import backend_utils
+from sky.backends import cloud_vm_ray_backend
 from sky.clouds import cloud as sky_cloud
-from sky.clouds import service_catalog
 from sky.jobs.server import core as managed_jobs_core
 from sky.provision.kubernetes import constants as kubernetes_constants
 from sky.provision.kubernetes import utils as kubernetes_utils
+from sky.schemas.api import responses
+from sky.server.requests import request_names
+from sky.skylet import autostop_lib
 from sky.skylet import constants
 from sky.skylet import job_lib
-from sky.skylet import log_lib
 from sky.usage import usage_lib
 from sky.utils import admin_policy_utils
 from sky.utils import common
 from sky.utils import common_utils
 from sky.utils import controller_utils
+from sky.utils import resources_utils
 from sky.utils import rich_utils
 from sky.utils import status_lib
 from sky.utils import subprocess_utils
@@ -40,6 +43,9 @@ from sky.utils.kubernetes import kubernetes_deploy_utils

 if typing.TYPE_CHECKING:
     from sky import resources as resources_lib
+    from sky.schemas.generated import jobsv1_pb2
+else:
+    jobsv1_pb2 = adaptors_common.LazyImport('sky.schemas.generated.jobsv1_pb2')

 logger = sky_logging.init_logger(__name__)

@@ -78,14 +84,15 @@ def optimize(
     # is shown on `sky launch`. The optimizer is also invoked during failover,
     # but we do not apply the admin policy there. We should apply the admin
     # policy in the optimizer, but that will require some refactoring.
-    …
+    with admin_policy_utils.apply_and_use_config_in_current_request(
+            dag,
+            request_name=request_names.AdminPolicyRequestName.OPTIMIZE,
+            request_options=request_options) as dag:
+        dag.resolve_and_validate_volumes()
+        return optimizer.Optimizer.optimize(dag=dag,
+                                            minimize=minimize,
+                                            blocked_resources=blocked_resources,
+                                            quiet=quiet)


 @usage_lib.entrypoint
@@ -93,7 +100,10 @@ def status(
     cluster_names: Optional[Union[str, List[str]]] = None,
     refresh: common.StatusRefreshMode = common.StatusRefreshMode.NONE,
     all_users: bool = False,
-    …
+    include_credentials: bool = False,
+    summary_response: bool = False,
+    include_handle: bool = True,
+) -> List[responses.StatusResponse]:
     # NOTE(dev): Keep the docstring consistent between the Python API and CLI.
     """Gets cluster statuses.

@@ -160,22 +170,37 @@ def status(
             provided, all clusters will be queried.
         refresh: whether to query the latest cluster statuses from the cloud
             provider(s).
+        include_credentials: whether to fetch ssh credentials for cluster
+            (credentials field in responses.StatusResponse)

     Returns:
         A list of dicts, with each dict containing the information of a
         cluster. If a cluster is found to be terminated or not found, it will
         be omitted from the returned list.
     """
-    clusters = backend_utils.get_clusters(
-        …
+    clusters = backend_utils.get_clusters(
+        refresh=refresh,
+        cluster_names=cluster_names,
+        all_users=all_users,
+        include_credentials=include_credentials,
+        summary_response=summary_response,
+        include_handle=include_handle)
+
+    status_responses = []
+    for cluster in clusters:
+        try:
+            status_responses.append(
+                responses.StatusResponse.model_validate(cluster))
+        except Exception as e:  # pylint: disable=broad-except
+            logger.warning('Failed to validate status responses for cluster '
+                           f'{cluster.get("name")}: {e}')
+    return status_responses


 def status_kubernetes(
 ) -> Tuple[List['kubernetes_utils.KubernetesSkyPilotClusterInfoPayload'],
            List['kubernetes_utils.KubernetesSkyPilotClusterInfoPayload'],
-           List[…
+           List[responses.ManagedJobRecord], Optional[str]]:
     """Gets all SkyPilot clusters and jobs in the Kubernetes cluster.

     Managed jobs and services are also included in the clusters returned.
@@ -250,6 +275,7 @@ all_clusters, unmanaged_clusters, all_jobs, context
         kubernetes_utils.KubernetesSkyPilotClusterInfoPayload.from_cluster(c)
         for c in unmanaged_clusters
     ]
+    all_jobs = [responses.ManagedJobRecord(**job) for job in all_jobs]
     return all_clusters, unmanaged_clusters, all_jobs, context


@@ -262,22 +288,26 @@ def endpoints(cluster: str,
         port: The port number to get the endpoint for. If None, endpoints
             for all ports are returned..

-    Returns: A dictionary of port numbers to endpoints. If …
+    Returns: A dictionary of port numbers to endpoints. If port is None,
         the dictionary will contain all ports:endpoints exposed on the cluster.

     Raises:
-        …
+        ValueError: if the cluster is not UP or the endpoint is not exposed.
         RuntimeError: if the cluster has no ports to be exposed or no endpoints
             are exposed yet.
     """
     with rich_utils.safe_status(
             ux_utils.spinner_message(
                 f'Fetching endpoints for cluster {cluster}')):
-        …
+        result = backend_utils.get_endpoints(cluster=cluster, port=port)
+        return result


 @usage_lib.entrypoint
-def cost_report() -> List[Dict[str, Any]]:
+def cost_report(
+        days: Optional[int] = None,
+        dashboard_summary_response: bool = False,
+        cluster_hashes: Optional[List[str]] = None) -> List[Dict[str, Any]]:
     # NOTE(dev): Keep the docstring consistent between the Python API and CLI.
     """Get all cluster cost reports, including those that have been downed.

@@ -295,6 +325,13 @@ def cost_report() -> List[Dict[str, Any]]:
             'cluster_hash': (str) unique hash identifying cluster,
             'usage_intervals': (List[Tuple[int, int]]) cluster usage times,
             'total_cost': (float) cost given resources and usage intervals,
+            'cloud': (str) cloud of the cluster,
+            'region': (str) region of the cluster,
+            'cpus': (str) number of vCPUs of the cluster,
+            'memory': (str) memory of the cluster,
+            'accelerators': (str) accelerators of the cluster,
+            'resources_str': (str) resources string of the cluster,
+            'resources_str_full': (str) full resources string of the cluster,
         }

     The estimated cost column indicates price for the cluster based on the type
@@ -304,25 +341,103 @@ def cost_report() -> List[Dict[str, Any]]:
     cache of the cluster status, and may not be accurate for the cluster with
     autostop/use_spot set or terminated/stopped on the cloud console.

+    Args:
+        days: Number of days to look back from now. Active clusters are always
+            included. Historical clusters are only included if they were last
+            used within the past 'days' days. Defaults to 30 days.
+
     Returns:
         A list of dicts, with each dict containing the cost information of a
         cluster.
     """
-    …
+    if days is None:
+        days = constants.COST_REPORT_DEFAULT_DAYS
+
+    abbreviate_response = dashboard_summary_response and cluster_hashes is None
+
+    cluster_reports = global_user_state.get_clusters_from_history(
+        days=days,
+        abbreviate_response=abbreviate_response,
+        cluster_hashes=cluster_hashes)
+    logger.debug(
+        f'{len(cluster_reports)} clusters found from history with {days} days.')
+
+    def _process_cluster_report(
+            cluster_report: Dict[str, Any]) -> Dict[str, Any]:
+        """Process cluster report by calculating cost and adding fields."""
+        # Make a copy to avoid modifying the original
+        report = cluster_report.copy()

-    …
+        def get_total_cost(cluster_report: dict) -> float:
+            duration = cluster_report['duration']
+            launched_nodes = cluster_report['num_nodes']
+            launched_resources = cluster_report['resources']

-    …
+            cost = (launched_resources.get_cost(duration) * launched_nodes)
+            return cost

-    …
+        try:
+            report['total_cost'] = get_total_cost(report)
+        except Exception as e:  # pylint: disable=broad-except
+            # Ok to skip the total cost as this is just for display purposes.
+            logger.warning(f'Failed to get total cost for cluster '
+                           f'{report["name"]}: {str(e)}')
+            report['total_cost'] = 0.0
+
+        return report
+
+    # Process clusters in parallel
+    if not cluster_reports:
+        return []
+
+    if not abbreviate_response:
+        cluster_reports = subprocess_utils.run_in_parallel(
+            _process_cluster_report, cluster_reports)
+
+    def _update_record_with_resources(record: Dict[str, Any]) -> None:
+        """Add resource fields for dashboard compatibility."""
+        if record is None:
+            return
+        resources = record.get('resources')
+        if resources is None:
+            return
+        if not dashboard_summary_response:
+            fields = ['cloud', 'region', 'cpus', 'memory', 'accelerators']
+        else:
+            fields = ['cloud']
+        for field in fields:
+            try:
+                record[field] = str(getattr(resources, field))
+            except Exception as e:  # pylint: disable=broad-except
+                # Ok to skip the fields as this is just for display
+                # purposes.
+                logger.debug(f'Failed to get resources.{field} for cluster '
+                             f'{record["name"]}: {str(e)}')
+                record[field] = None
+
+        # Add resources_str and resources_str_full for dashboard
+        # compatibility
+        num_nodes = record.get('num_nodes', 1)
+        try:
+            resource_str_simple, resource_str_full = (
+                resources_utils.format_resource(resources,
+                                                simplified_only=False))
+            record['resources_str'] = f'{num_nodes}x{resource_str_simple}'
+            record['resources_str_full'] = f'{num_nodes}x{resource_str_full}'
+        except Exception as e:  # pylint: disable=broad-except
+            logger.debug(f'Failed to get resources_str for cluster '
+                         f'{record["name"]}: {str(e)}')
+            for field in fields:
+                record[field] = None
+            record['resources_str'] = '-'
+            record['resources_str_full'] = '-'
+
+    for report in cluster_reports:
+        _update_record_with_resources(report)
+        if dashboard_summary_response:
+            report.pop('usage_intervals')
+            report.pop('user_hash')
+            report.pop('resources')

     return cluster_reports

@@ -330,6 +445,8 @@ def cost_report() -> List[Dict[str, Any]]:
 def _start(
     cluster_name: str,
     idle_minutes_to_autostop: Optional[int] = None,
+    wait_for: Optional[autostop_lib.AutostopWaitFor] = (
+        autostop_lib.DEFAULT_AUTOSTOP_WAIT_FOR),
     retry_until_up: bool = False,
     down: bool = False,  # pylint: disable=redefined-outer-name
     force: bool = False,
@@ -369,9 +486,44 @@ def _start(
                 'supported when starting SkyPilot controllers. To '
                 f'fix: omit the {arguments_str} to use the '
                 f'default autostop settings from config.')
-    …
+
+        # Get the autostop resources, from which we extract the correct autostop
+        # config.
+        controller_resources = controller_utils.get_controller_resources(
+            controller, [])
+        # All resources should have the same autostop config.
+        controller_autostop_config = list(
+            controller_resources)[0].autostop_config
+        if (controller_autostop_config is not None and
+                controller_autostop_config.enabled):
+            idle_minutes_to_autostop = controller_autostop_config.idle_minutes
+            down = controller_autostop_config.down
+    else:
+        # For non-controller clusters, restore autostop configuration from
+        # database if not explicitly provided.
+        if idle_minutes_to_autostop is None:
+            cluster_record = global_user_state.get_cluster_from_name(
+                cluster_name, include_user_info=False, summary_response=True)
+            if cluster_record is not None:
+                stored_autostop = cluster_record.get('autostop', -1)
+                stored_to_down = cluster_record.get('to_down', False)
+                # Restore autostop if it was previously set (autostop > 0)
+                if stored_autostop > 0:
+                    logger.warning(f'Restoring cluster {cluster_name!r} with '
+                                   f'autostop set to {stored_autostop} minutes'
+                                   f'. To turn off autostop, run: '
+                                   f'`sky autostop {cluster_name} --cancel`')
+                    idle_minutes_to_autostop = stored_autostop
+                    # Only restore 'down' if it was explicitly set and we're
+                    # restoring autostop
+                    if stored_to_down:
+                        down = stored_to_down
+                elif stored_autostop == 0:
+                    logger.warning(
+                        f'Autostop was previously set to 0 minutes '
+                        f'for cluster {cluster_name!r} so it will '
+                        'not be restored. To turn on autostop, run: '
+                        f'`sky autostop {cluster_name} -i <minutes>`')

     usage_lib.record_cluster_name_for_current_operation(cluster_name)

@@ -391,7 +543,7 @@ def _start(
             all_file_mounts=None,
             storage_mounts=storage_mounts)
     if idle_minutes_to_autostop is not None:
-        backend.set_autostop(handle, idle_minutes_to_autostop, down)
+        backend.set_autostop(handle, idle_minutes_to_autostop, wait_for, down)
     return handle


@@ -399,6 +551,8 @@ def _start(
 def start(
     cluster_name: str,
     idle_minutes_to_autostop: Optional[int] = None,
+    wait_for: Optional[autostop_lib.AutostopWaitFor] = (
+        autostop_lib.DEFAULT_AUTOSTOP_WAIT_FOR),
     retry_until_up: bool = False,
     down: bool = False,  # pylint: disable=redefined-outer-name
     force: bool = False,
@@ -453,6 +607,7 @@ def start(
             '`idle_minutes_to_autostop` must be set if `down` is True.')
     return _start(cluster_name,
                   idle_minutes_to_autostop,
+                  wait_for,
                   retry_until_up,
                   down,
                   force=force)
@@ -463,7 +618,10 @@ def _stop_not_supported_message(resources: 'resources_lib.Resources') -> str:
         message = ('Stopping spot instances is currently not supported on '
                    f'{resources.cloud}')
     else:
-        …
+        cloud_name = resources.cloud.display_name(
+        ) if resources.cloud else resources.cloud
+        message = ('Stopping is currently not supported for '
+                   f'{cloud_name}')
     return message


@@ -539,6 +697,11 @@ def stop(cluster_name: str, purge: bool = False) -> None:
         raise exceptions.ClusterDoesNotExist(
             f'Cluster {cluster_name!r} does not exist.')

+    global_user_state.add_cluster_event(
+        cluster_name, status_lib.ClusterStatus.STOPPED,
+        'Cluster was stopped by user.',
+        global_user_state.ClusterEventType.STATUS_CHANGE)
+
     backend = backend_utils.get_backend_from_handle(handle)

     if isinstance(backend, backends.CloudVmRayBackend):
@@ -566,6 +729,8 @@ def stop(cluster_name: str, purge: bool = False) -> None:
 def autostop(
         cluster_name: str,
         idle_minutes: int,
+        wait_for: Optional[autostop_lib.AutostopWaitFor] = autostop_lib.
+        DEFAULT_AUTOSTOP_WAIT_FOR,
         down: bool = False,  # pylint: disable=redefined-outer-name
 ) -> None:
     # NOTE(dev): Keep the docstring consistent between the Python API and CLI.
@@ -627,29 +792,26 @@ def autostop(
     )
     backend = backend_utils.get_backend_from_handle(handle)

+    resources = handle.launched_resources.assert_launchable()
     # Check cloud supports stopping spot instances
-    cloud = …
-    assert cloud is not None, handle
+    cloud = resources.cloud

     if not isinstance(backend, backends.CloudVmRayBackend):
         raise exceptions.NotSupportedError(
             f'{operation} cluster {cluster_name!r} with backend '
             f'{backend.__class__.__name__!r} is not supported.')
-    …
+
     # Check if autostop/autodown is required and supported
     if not is_cancel:
         try:
             if down:
                 cloud.check_features_are_supported(
-                    …
-                    {clouds.CloudImplementationFeatures.AUTODOWN})
+                    resources, {clouds.CloudImplementationFeatures.AUTODOWN})
             else:
                 cloud.check_features_are_supported(
-                    …
-                    {clouds.CloudImplementationFeatures.STOP})
+                    resources, {clouds.CloudImplementationFeatures.STOP})
                 cloud.check_features_are_supported(
-                    …
-                    {clouds.CloudImplementationFeatures.AUTOSTOP})
+                    resources, {clouds.CloudImplementationFeatures.AUTOSTOP})
         except exceptions.NotSupportedError as e:
             raise exceptions.NotSupportedError(
                 f'{colorama.Fore.YELLOW}{operation} on cluster '
@@ -658,7 +820,7 @@ def autostop(
             f'see reason above.') from e

     usage_lib.record_cluster_name_for_current_operation(cluster_name)
-    backend.set_autostop(handle, idle_minutes, down)
+    backend.set_autostop(handle, idle_minutes, wait_for, down)


 # ==================
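The hunks above thread a new `wait_for` argument (an `autostop_lib.AutostopWaitFor`, defaulting to `autostop_lib.DEFAULT_AUTOSTOP_WAIT_FOR`) through `_start()`, `start()`, and `autostop()` down into `backend.set_autostop()`. A minimal caller-side sketch of the new call shape, assuming this nightly build is installed and an existing cluster named `my-cluster` (the cluster name is hypothetical, not from this diff):

    # Sketch only: mirrors the signatures shown in the hunks above.
    from sky import core
    from sky.skylet import autostop_lib

    # `wait_for` controls what the cluster waits on before autostopping;
    # passing the library default preserves the previous behavior.
    core.autostop('my-cluster',
                  idle_minutes=10,
                  wait_for=autostop_lib.DEFAULT_AUTOSTOP_WAIT_FOR,
                  down=False)

The remainder of the sky/core.py diff continues below.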
@@ -669,7 +831,7 @@ def autostop(
 @usage_lib.entrypoint
 def queue(cluster_name: str,
           skip_finished: bool = False,
-          all_users: bool = False) -> List[…
+          all_users: bool = False) -> List[responses.ClusterJobRecord]:
     # NOTE(dev): Keep the docstring consistent between the Python API and CLI.
     """Gets the job queue of a cluster.

@@ -703,10 +865,10 @@ def queue(cluster_name: str,
         exceptions.CommandError: if failed to get the job queue with ssh.
     """
     all_jobs = not skip_finished
-    user_hash: Optional[str] = common_utils.get_user_hash()
     if all_users:
         user_hash = None
-    …
+    else:
+        user_hash = common_utils.get_current_user().id

     handle = backend_utils.check_cluster_available(
         cluster_name,
@@ -714,18 +876,49 @@ def queue(cluster_name: str,
     )
     backend = backend_utils.get_backend_from_handle(handle)

-    …
+    use_legacy = not handle.is_grpc_enabled_with_flag
+
+    if not use_legacy:
+        try:
+            request = jobsv1_pb2.GetJobQueueRequest(user_hash=user_hash,
+                                                    all_jobs=all_jobs)
+            response = backend_utils.invoke_skylet_with_retries(
+                lambda: cloud_vm_ray_backend.SkyletClient(
+                    handle.get_grpc_channel()).get_job_queue(request))
+            jobs = []
+            for job_info in response.jobs:
+                job_dict = {
+                    'job_id': job_info.job_id,
+                    'job_name': job_info.job_name,
+                    'submitted_at': job_info.submitted_at,
+                    'status': job_lib.JobStatus.from_protobuf(job_info.status),
+                    'run_timestamp': job_info.run_timestamp,
+                    'start_at': job_info.start_at
+                                if job_info.HasField('start_at') else None,
+                    'end_at': job_info.end_at
+                              if job_info.HasField('end_at') else None,
+                    'resources': job_info.resources,
+                    'log_path': job_info.log_path,
+                    'user_hash': job_info.username,
+                }
+                # Copied from job_lib.load_job_queue.
+                user = global_user_state.get_user(job_dict['user_hash'])
+                job_dict['username'] = user.name if user is not None else None
+                jobs.append(job_dict)
+        except exceptions.SkyletMethodNotImplementedError:
+            use_legacy = True
+    if use_legacy:
+        code = job_lib.JobLibCodeGen.get_job_queue(user_hash, all_jobs)
+        returncode, jobs_payload, stderr = backend.run_on_head(
+            handle, code, require_outputs=True, separate_stderr=True)
+        subprocess_utils.handle_returncode(
+            returncode,
+            command=code,
+            error_msg=f'Failed to get job queue on cluster {cluster_name}.',
+            stderr=f'{jobs_payload + stderr}',
+            stream_logs=True)
+        jobs = job_lib.load_job_queue(jobs_payload)
+    return [responses.ClusterJobRecord.model_validate(job) for job in jobs]


 @usage_lib.entrypoint
@@ -795,8 +988,10 @@ def cancel(
             f'handle for cluster {cluster_name!r} should not be None')

     backend = backend_utils.get_backend_from_handle(handle)
+    user_hash: Optional[str] = common_utils.get_current_user().id

     if all_users:
+        user_hash = None
         sky_logging.print(
             f'{colorama.Fore.YELLOW}'
             f'Cancelling all users\' jobs on cluster {cluster_name!r}...'
@@ -821,7 +1016,7 @@ def cancel(
     backend.cancel_jobs(handle,
                         job_ids,
                         cancel_all=all or all_users,
-                        user_hash=…
+                        user_hash=user_hash)


 @usage_lib.entrypoint
@@ -859,7 +1054,12 @@ def tail_logs(cluster_name: str,
     backend = backend_utils.get_backend_from_handle(handle)

     usage_lib.record_cluster_name_for_current_operation(cluster_name)
-    …
+    # Although tail_logs returns an int when require_outputs=False (default),
+    # we need to check returnval as an int to avoid type checking errors.
+    returnval = backend.tail_logs(handle, job_id, follow=follow, tail=tail)
+    assert isinstance(returnval,
+                      int), (f'returnval must be an int, but got {returnval}')
+    return returnval


 @usage_lib.entrypoint
@@ -958,25 +1158,25 @@ def job_status(cluster_name: str,
 # = Storage Management =
 # ======================
 @usage_lib.entrypoint
-def storage_ls() -> List[…
+def storage_ls() -> List[responses.StorageRecord]:
     # NOTE(dev): Keep the docstring consistent between the Python API and CLI.
     """Gets the storages.

     Returns:
-        [
-            {
-                'name': str,
-                'launched_at': int timestamp of creation,
-                'store': List[sky.StoreType],
-                'last_use': int timestamp of last use,
-                'status': sky.StorageStatus,
-            }
-        ]
+        List[responses.StorageRecord]: A list of storage records.
     """
     storages = global_user_state.get_storage()
+    storage_records = []
     for storage in storages:
-        …
+        storage_records.append(
+            responses.StorageRecord(
+                name=storage['name'],
+                launched_at=storage['launched_at'],
+                store=list(storage.pop('handle').sky_stores.keys()),
+                last_use=storage['last_use'],
+                status=storage['status'],
+            ))
+    return storage_records


 @usage_lib.entrypoint
@@ -992,9 +1192,7 @@ def storage_delete(name: str) -> None:
     if handle is None:
         raise ValueError(f'Storage name {name!r} not found.')
     else:
-        storage_object = data.Storage(
-            source=handle.source,
-            sync_on_reconstruction=False)
+        storage_object = data.Storage.from_handle(handle)
         storage_object.delete()


@@ -1002,20 +1200,49 @@ def storage_delete(name: str) -> None:
 # = Catalog Observe =
 # ===================
 @usage_lib.entrypoint
-def enabled_clouds(…
-    …
+def enabled_clouds(workspace: Optional[str] = None,
+                   expand: bool = False) -> List[str]:
+    if workspace is None:
+        workspace = skypilot_config.get_active_workspace()
+    cached_clouds = global_user_state.get_cached_enabled_clouds(
+        sky_cloud.CloudCapability.COMPUTE, workspace=workspace)
+    with skypilot_config.local_active_workspace_ctx(workspace):
+        if not expand:
+            return [cloud.canonical_name() for cloud in cached_clouds]
+        enabled_ssh_infras = []
+        enabled_k8s_infras = []
+        enabled_cloud_infras = []
+        for cloud in cached_clouds:
+            cloud_infra = cloud.expand_infras()
+            if isinstance(cloud, clouds.SSH):
+                enabled_ssh_infras.extend(cloud_infra)
+            elif isinstance(cloud, clouds.Kubernetes):
+                enabled_k8s_infras.extend(cloud_infra)
+            else:
+                enabled_cloud_infras.extend(cloud_infra)
+        all_infras = sorted(enabled_ssh_infras) + sorted(
+            enabled_k8s_infras) + sorted(enabled_cloud_infras)
+        return all_infras


 @usage_lib.entrypoint
 def realtime_kubernetes_gpu_availability(
         context: Optional[str] = None,
         name_filter: Optional[str] = None,
-        quantity_filter: Optional[int] = None
+        quantity_filter: Optional[int] = None,
+        is_ssh: Optional[bool] = None
 ) -> List[Tuple[str, List[models.RealtimeGpuAvailability]]]:

     if context is None:
-        …
+        # Include contexts from both Kubernetes and SSH clouds
+        kubernetes_contexts = clouds.Kubernetes.existing_allowed_contexts()
+        ssh_contexts = clouds.SSH.existing_allowed_contexts()
+        if is_ssh is None:
+            context_list = kubernetes_contexts + ssh_contexts
+        elif is_ssh:
+            context_list = ssh_contexts
+        else:
+            context_list = kubernetes_contexts
     else:
         context_list = [context]

@@ -1024,9 +1251,9 @@ def realtime_kubernetes_gpu_availability(
             name_filter: Optional[str] = None,
             quantity_filter: Optional[int] = None
     ) -> List[models.RealtimeGpuAvailability]:
-        counts, capacity, available = …
+        counts, capacity, available = catalog.list_accelerator_realtime(
             gpus_only=True,
-            clouds='kubernetes',
+            clouds='ssh' if is_ssh else 'kubernetes',
             name_filter=name_filter,
             region_filter=context,
             quantity_filter=quantity_filter,
@@ -1058,16 +1285,19 @@ def realtime_kubernetes_gpu_availability(
             name_filter=name_filter,
             quantity_filter=quantity_filter), context_list)

+    cloud_identity = 'ssh' if is_ssh else 'kubernetes'
+    cloud_identity_capital = 'SSH' if is_ssh else 'Kubernetes'
+
     for ctx, queried in zip(context_list, parallel_queried):
         cumulative_count += len(queried)
         if len(queried) == 0:
             # don't add gpu results for clusters that don't have any
-            logger.debug(f'No gpus found in …
+            logger.debug(f'No gpus found in {cloud_identity} cluster {ctx}')
             continue
         availability_lists.append((ctx, queried))

     if cumulative_count == 0:
-        err_msg = 'No GPUs found in any …
+        err_msg = f'No GPUs found in any {cloud_identity_capital} clusters. '
         debug_msg = 'To further debug, run: sky check '
         if name_filter is not None:
             gpu_info_msg = f' {name_filter!r}'
@@ -1075,9 +1305,9 @@ def realtime_kubernetes_gpu_availability(
             gpu_info_msg += (' with requested quantity'
                              f' {quantity_filter}')
         err_msg = (f'Resources{gpu_info_msg} not found '
-                   'in …
-        debug_msg = ('To show available accelerators on …
-                     ' run: sky show-gpus --cloud …
+                   f'in {cloud_identity_capital} clusters. ')
+        debug_msg = (f'To show available accelerators on {cloud_identity}, '
+                     f' run: sky show-gpus --cloud {cloud_identity} ')
         full_err_msg = (err_msg + kubernetes_constants.NO_GPU_HELP_MESSAGE +
                         debug_msg)
         raise ValueError(full_err_msg)
@@ -1089,89 +1319,61 @@ def realtime_kubernetes_gpu_availability(
 # =================
 @usage_lib.entrypoint
 def local_up(gpus: bool,
-             …
-             context_name: Optional[str] = None,
-             password: Optional[str] = None) -> None:
-    """Creates a local or remote cluster."""
-
-    def _validate_args(ips, ssh_user, ssh_key, cleanup):
-        # If any of --ips, --ssh-user, or --ssh-key-path is specified,
-        # all must be specified
-        if bool(ips) or bool(ssh_user) or bool(ssh_key):
-            if not (ips and ssh_user and ssh_key):
-                with ux_utils.print_exception_no_traceback():
-                    raise ValueError(
-                        'All ips, ssh_user, and ssh_key must be specified '
-                        'together.')
-
-        # --cleanup can only be used if --ips, --ssh-user and --ssh-key-path
-        # are all provided
-        if cleanup and not (ips and ssh_user and ssh_key):
-            with ux_utils.print_exception_no_traceback():
-                raise ValueError(
-                    'cleanup can only be used with ips, ssh_user and ssh_key.')
-
-    _validate_args(ips, ssh_user, ssh_key, cleanup)
-
-    # If remote deployment arguments are specified, run remote up script
-    if ips:
-        assert ssh_user is not None and ssh_key is not None
-        kubernetes_deploy_utils.deploy_remote_cluster(ips, ssh_user, ssh_key,
-                                                      cleanup, context_name,
-                                                      password)
-    else:
-        # Run local deployment (kind) if no remote args are specified
-        kubernetes_deploy_utils.deploy_local_cluster(gpus)
+             name: Optional[str] = None,
+             port_start: Optional[int] = None) -> None:
+    """Creates a local cluster."""
+    kubernetes_deploy_utils.deploy_local_cluster(name, port_start, gpus)


-def local_down() -> None:
+def local_down(name: Optional[str] = None) -> None:
     """Tears down the Kubernetes cluster started by local_up."""
-    …
+    kubernetes_deploy_utils.teardown_local_cluster(name)

-    path_to_package = os.path.dirname(__file__)
-    down_script_path = os.path.join(path_to_package, 'utils/kubernetes',
-                                    'delete_cluster.sh')
-    …

+@usage_lib.entrypoint
+def ssh_up(infra: Optional[str] = None, cleanup: bool = False) -> None:
+    """Deploys or tears down a Kubernetes cluster on SSH targets.
+
+    Args:
+        infra: Name of the cluster configuration in ssh_node_pools.yaml.
+            If None, the first cluster in the file is used.
+        cleanup: If True, clean up the cluster instead of deploying.
+    """
+    kubernetes_deploy_utils.deploy_ssh_cluster(
+        cleanup=cleanup,
+        infra=infra,
+    )
+
+
+@usage_lib.entrypoint
+def ssh_status(context_name: str) -> Tuple[bool, str]:
+    """Check the status of an SSH Node Pool context.
+
+    Args:
+        context_name: The SSH context name (e.g., 'ssh-my-cluster')
+
+    Returns:
+        Tuple[bool, str]: (is_ready, reason)
+        - is_ready: True if the SSH Node Pool is ready, False otherwise
+        - reason: Explanation of the status
+    """
+    try:
+        is_ready, reason = clouds.SSH.check_single_context(context_name)
+        return is_ready, reason
+    except Exception as e:  # pylint: disable=broad-except
+        return False, ('Failed to check SSH context: '
+                       f'{common_utils.format_exception(e)}')
+
+
+def get_all_contexts() -> List[str]:
+    """Get all available contexts from Kubernetes and SSH clouds.
+
+    Returns:
+        List[str]: A list of all available context names.
+    """
+    kube_contexts = clouds.Kubernetes.existing_allowed_contexts()
+    ssh_contexts = clouds.SSH.get_ssh_node_pool_contexts()
+    # Ensure ssh_contexts are prefixed appropriately if not already
+    # For now, assuming get_ssh_node_pool_contexts already returns them
+    # in the desired format (e.g., 'ssh-my-cluster')
+    return sorted(list(set(kube_contexts + ssh_contexts)))