skypilot-nightly 1.0.0.dev20250905__py3-none-any.whl → 1.0.0.dev20251203__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sky/__init__.py +10 -2
- sky/adaptors/aws.py +81 -16
- sky/adaptors/common.py +25 -2
- sky/adaptors/coreweave.py +278 -0
- sky/adaptors/do.py +8 -2
- sky/adaptors/gcp.py +11 -0
- sky/adaptors/ibm.py +5 -2
- sky/adaptors/kubernetes.py +64 -0
- sky/adaptors/nebius.py +3 -1
- sky/adaptors/primeintellect.py +1 -0
- sky/adaptors/seeweb.py +183 -0
- sky/adaptors/shadeform.py +89 -0
- sky/admin_policy.py +20 -0
- sky/authentication.py +157 -263
- sky/backends/__init__.py +3 -2
- sky/backends/backend.py +11 -3
- sky/backends/backend_utils.py +588 -184
- sky/backends/cloud_vm_ray_backend.py +1088 -904
- sky/backends/local_docker_backend.py +9 -5
- sky/backends/task_codegen.py +633 -0
- sky/backends/wheel_utils.py +18 -0
- sky/catalog/__init__.py +8 -0
- sky/catalog/aws_catalog.py +4 -0
- sky/catalog/common.py +19 -1
- sky/catalog/data_fetchers/fetch_aws.py +102 -80
- sky/catalog/data_fetchers/fetch_gcp.py +30 -3
- sky/catalog/data_fetchers/fetch_nebius.py +9 -6
- sky/catalog/data_fetchers/fetch_runpod.py +698 -0
- sky/catalog/data_fetchers/fetch_seeweb.py +329 -0
- sky/catalog/data_fetchers/fetch_shadeform.py +142 -0
- sky/catalog/kubernetes_catalog.py +24 -28
- sky/catalog/primeintellect_catalog.py +95 -0
- sky/catalog/runpod_catalog.py +5 -1
- sky/catalog/seeweb_catalog.py +184 -0
- sky/catalog/shadeform_catalog.py +165 -0
- sky/check.py +73 -43
- sky/client/cli/command.py +675 -412
- sky/client/cli/flags.py +4 -2
- sky/{volumes/utils.py → client/cli/table_utils.py} +111 -13
- sky/client/cli/utils.py +79 -0
- sky/client/common.py +12 -2
- sky/client/sdk.py +132 -63
- sky/client/sdk_async.py +34 -33
- sky/cloud_stores.py +82 -3
- sky/clouds/__init__.py +6 -0
- sky/clouds/aws.py +337 -129
- sky/clouds/azure.py +24 -18
- sky/clouds/cloud.py +40 -13
- sky/clouds/cudo.py +16 -13
- sky/clouds/do.py +9 -7
- sky/clouds/fluidstack.py +12 -5
- sky/clouds/gcp.py +14 -7
- sky/clouds/hyperbolic.py +12 -5
- sky/clouds/ibm.py +12 -5
- sky/clouds/kubernetes.py +80 -45
- sky/clouds/lambda_cloud.py +12 -5
- sky/clouds/nebius.py +23 -9
- sky/clouds/oci.py +19 -12
- sky/clouds/paperspace.py +4 -1
- sky/clouds/primeintellect.py +317 -0
- sky/clouds/runpod.py +85 -24
- sky/clouds/scp.py +12 -8
- sky/clouds/seeweb.py +477 -0
- sky/clouds/shadeform.py +400 -0
- sky/clouds/ssh.py +4 -2
- sky/clouds/utils/scp_utils.py +61 -50
- sky/clouds/vast.py +33 -27
- sky/clouds/vsphere.py +14 -16
- sky/core.py +174 -165
- sky/dashboard/out/404.html +1 -1
- sky/dashboard/out/_next/static/96_E2yl3QAiIJGOYCkSpB/_buildManifest.js +1 -0
- sky/dashboard/out/_next/static/chunks/1141-e6aa9ab418717c59.js +11 -0
- sky/dashboard/out/_next/static/chunks/1871-7e202677c42f43fe.js +6 -0
- sky/dashboard/out/_next/static/chunks/2260-7703229c33c5ebd5.js +1 -0
- sky/dashboard/out/_next/static/chunks/2369.fc20f0c2c8ed9fe7.js +15 -0
- sky/dashboard/out/_next/static/chunks/2755.edd818326d489a1d.js +26 -0
- sky/dashboard/out/_next/static/chunks/3294.20a8540fe697d5ee.js +1 -0
- sky/dashboard/out/_next/static/chunks/3785.7e245f318f9d1121.js +1 -0
- sky/dashboard/out/_next/static/chunks/{6601-06114c982db410b6.js → 3800-7b45f9fbb6308557.js} +1 -1
- sky/dashboard/out/_next/static/chunks/4725.172ede95d1b21022.js +1 -0
- sky/dashboard/out/_next/static/chunks/4937.a2baa2df5572a276.js +15 -0
- sky/dashboard/out/_next/static/chunks/6212-7bd06f60ba693125.js +13 -0
- sky/dashboard/out/_next/static/chunks/6856-8f27d1c10c98def8.js +1 -0
- sky/dashboard/out/_next/static/chunks/6990-9146207c4567fdfd.js +1 -0
- sky/dashboard/out/_next/static/chunks/7359-c8d04e06886000b3.js +30 -0
- sky/dashboard/out/_next/static/chunks/7615-019513abc55b3b47.js +1 -0
- sky/dashboard/out/_next/static/chunks/8640.5b9475a2d18c5416.js +16 -0
- sky/dashboard/out/_next/static/chunks/8969-452f9d5cbdd2dc73.js +1 -0
- sky/dashboard/out/_next/static/chunks/9025.fa408f3242e9028d.js +6 -0
- sky/dashboard/out/_next/static/chunks/9353-cff34f7e773b2e2b.js +1 -0
- sky/dashboard/out/_next/static/chunks/9360.a536cf6b1fa42355.js +31 -0
- sky/dashboard/out/_next/static/chunks/9847.3aaca6bb33455140.js +30 -0
- sky/dashboard/out/_next/static/chunks/pages/{_app-ce361c6959bc2001.js → _app-bde01e4a2beec258.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-792db96d918c98c9.js +16 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-abfcac9c137aa543.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters-ee39056f9851a3ff.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/infra/{[context]-6563820e094f68ca.js → [context]-c0b5935149902e6f.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/{infra-aabba60d57826e0f.js → infra-aed0ea19df7cf961.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-d66997e2bfc837cf.js +16 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs/pools/[pool]-9faf940b253e3e06.js +21 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs-2072b48b617989c9.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/users-f42674164aa73423.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/volumes-b84b948ff357c43e.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/workspaces/{[name]-af76bb06dbb3954f.js → [name]-84a40f8c7c627fe4.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/{workspaces-7598c33a746cdc91.js → workspaces-531b2f8c4bf89f82.js} +1 -1
- sky/dashboard/out/_next/static/chunks/webpack-64e05f17bf2cf8ce.js +1 -0
- sky/dashboard/out/_next/static/css/0748ce22df867032.css +3 -0
- sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
- sky/dashboard/out/clusters/[cluster].html +1 -1
- sky/dashboard/out/clusters.html +1 -1
- sky/dashboard/out/config.html +1 -1
- sky/dashboard/out/index.html +1 -1
- sky/dashboard/out/infra/[context].html +1 -1
- sky/dashboard/out/infra.html +1 -1
- sky/dashboard/out/jobs/[job].html +1 -1
- sky/dashboard/out/jobs/pools/[pool].html +1 -1
- sky/dashboard/out/jobs.html +1 -1
- sky/dashboard/out/users.html +1 -1
- sky/dashboard/out/volumes.html +1 -1
- sky/dashboard/out/workspace/new.html +1 -1
- sky/dashboard/out/workspaces/[name].html +1 -1
- sky/dashboard/out/workspaces.html +1 -1
- sky/data/data_utils.py +92 -1
- sky/data/mounting_utils.py +162 -29
- sky/data/storage.py +200 -19
- sky/data/storage_utils.py +10 -45
- sky/exceptions.py +18 -7
- sky/execution.py +74 -31
- sky/global_user_state.py +605 -191
- sky/jobs/__init__.py +2 -0
- sky/jobs/client/sdk.py +101 -4
- sky/jobs/client/sdk_async.py +31 -5
- sky/jobs/constants.py +15 -8
- sky/jobs/controller.py +726 -284
- sky/jobs/file_content_utils.py +128 -0
- sky/jobs/log_gc.py +193 -0
- sky/jobs/recovery_strategy.py +250 -100
- sky/jobs/scheduler.py +271 -173
- sky/jobs/server/core.py +367 -114
- sky/jobs/server/server.py +81 -35
- sky/jobs/server/utils.py +89 -35
- sky/jobs/state.py +1498 -620
- sky/jobs/utils.py +771 -306
- sky/logs/agent.py +40 -5
- sky/logs/aws.py +9 -19
- sky/metrics/utils.py +282 -39
- sky/optimizer.py +1 -1
- sky/provision/__init__.py +37 -1
- sky/provision/aws/config.py +34 -13
- sky/provision/aws/instance.py +5 -2
- sky/provision/azure/instance.py +5 -3
- sky/provision/common.py +2 -0
- sky/provision/cudo/instance.py +4 -3
- sky/provision/do/instance.py +4 -3
- sky/provision/docker_utils.py +97 -26
- sky/provision/fluidstack/instance.py +6 -5
- sky/provision/gcp/config.py +6 -1
- sky/provision/gcp/instance.py +4 -2
- sky/provision/hyperbolic/instance.py +4 -2
- sky/provision/instance_setup.py +66 -20
- sky/provision/kubernetes/__init__.py +2 -0
- sky/provision/kubernetes/config.py +7 -44
- sky/provision/kubernetes/constants.py +0 -1
- sky/provision/kubernetes/instance.py +609 -213
- sky/provision/kubernetes/manifests/fusermount-server-daemonset.yaml +1 -2
- sky/provision/kubernetes/network.py +12 -8
- sky/provision/kubernetes/network_utils.py +8 -25
- sky/provision/kubernetes/utils.py +382 -418
- sky/provision/kubernetes/volume.py +150 -18
- sky/provision/lambda_cloud/instance.py +16 -13
- sky/provision/nebius/instance.py +6 -2
- sky/provision/nebius/utils.py +103 -86
- sky/provision/oci/instance.py +4 -2
- sky/provision/paperspace/instance.py +4 -3
- sky/provision/primeintellect/__init__.py +10 -0
- sky/provision/primeintellect/config.py +11 -0
- sky/provision/primeintellect/instance.py +454 -0
- sky/provision/primeintellect/utils.py +398 -0
- sky/provision/provisioner.py +30 -9
- sky/provision/runpod/__init__.py +2 -0
- sky/provision/runpod/instance.py +4 -3
- sky/provision/runpod/volume.py +69 -13
- sky/provision/scp/instance.py +307 -130
- sky/provision/seeweb/__init__.py +11 -0
- sky/provision/seeweb/config.py +13 -0
- sky/provision/seeweb/instance.py +812 -0
- sky/provision/shadeform/__init__.py +11 -0
- sky/provision/shadeform/config.py +12 -0
- sky/provision/shadeform/instance.py +351 -0
- sky/provision/shadeform/shadeform_utils.py +83 -0
- sky/provision/vast/instance.py +5 -3
- sky/provision/volume.py +164 -0
- sky/provision/vsphere/common/ssl_helper.py +1 -1
- sky/provision/vsphere/common/vapiconnect.py +2 -1
- sky/provision/vsphere/common/vim_utils.py +3 -2
- sky/provision/vsphere/instance.py +8 -6
- sky/provision/vsphere/vsphere_utils.py +8 -1
- sky/resources.py +11 -3
- sky/schemas/api/responses.py +107 -6
- sky/schemas/db/global_user_state/008_skylet_ssh_tunnel_metadata.py +34 -0
- sky/schemas/db/global_user_state/009_last_activity_and_launched_at.py +89 -0
- sky/schemas/db/global_user_state/010_save_ssh_key.py +66 -0
- sky/schemas/db/global_user_state/011_is_ephemeral.py +34 -0
- sky/schemas/db/kv_cache/001_initial_schema.py +29 -0
- sky/schemas/db/serve_state/002_yaml_content.py +34 -0
- sky/schemas/db/skypilot_config/001_initial_schema.py +30 -0
- sky/schemas/db/spot_jobs/002_cluster_pool.py +3 -3
- sky/schemas/db/spot_jobs/004_job_file_contents.py +42 -0
- sky/schemas/db/spot_jobs/005_logs_gc.py +38 -0
- sky/schemas/db/spot_jobs/006_controller_pid_started_at.py +34 -0
- sky/schemas/db/spot_jobs/007_config_file_content.py +34 -0
- sky/schemas/generated/jobsv1_pb2.py +86 -0
- sky/schemas/generated/jobsv1_pb2.pyi +254 -0
- sky/schemas/generated/jobsv1_pb2_grpc.py +542 -0
- sky/schemas/generated/managed_jobsv1_pb2.py +76 -0
- sky/schemas/generated/managed_jobsv1_pb2.pyi +278 -0
- sky/schemas/generated/managed_jobsv1_pb2_grpc.py +278 -0
- sky/schemas/generated/servev1_pb2.py +58 -0
- sky/schemas/generated/servev1_pb2.pyi +115 -0
- sky/schemas/generated/servev1_pb2_grpc.py +322 -0
- sky/serve/autoscalers.py +2 -0
- sky/serve/client/impl.py +55 -21
- sky/serve/constants.py +4 -3
- sky/serve/controller.py +17 -11
- sky/serve/load_balancing_policies.py +1 -1
- sky/serve/replica_managers.py +219 -142
- sky/serve/serve_rpc_utils.py +179 -0
- sky/serve/serve_state.py +63 -54
- sky/serve/serve_utils.py +145 -109
- sky/serve/server/core.py +46 -25
- sky/serve/server/impl.py +311 -162
- sky/serve/server/server.py +21 -19
- sky/serve/service.py +84 -68
- sky/serve/service_spec.py +45 -7
- sky/server/auth/loopback.py +38 -0
- sky/server/auth/oauth2_proxy.py +12 -7
- sky/server/common.py +47 -24
- sky/server/config.py +62 -28
- sky/server/constants.py +9 -1
- sky/server/daemons.py +109 -38
- sky/server/metrics.py +76 -96
- sky/server/middleware_utils.py +166 -0
- sky/server/requests/executor.py +381 -145
- sky/server/requests/payloads.py +71 -18
- sky/server/requests/preconditions.py +15 -13
- sky/server/requests/request_names.py +121 -0
- sky/server/requests/requests.py +507 -157
- sky/server/requests/serializers/decoders.py +48 -17
- sky/server/requests/serializers/encoders.py +85 -20
- sky/server/requests/threads.py +117 -0
- sky/server/rest.py +116 -24
- sky/server/server.py +420 -172
- sky/server/stream_utils.py +219 -45
- sky/server/uvicorn.py +30 -19
- sky/setup_files/MANIFEST.in +6 -1
- sky/setup_files/alembic.ini +8 -0
- sky/setup_files/dependencies.py +62 -19
- sky/setup_files/setup.py +44 -44
- sky/sky_logging.py +13 -5
- sky/skylet/attempt_skylet.py +106 -24
- sky/skylet/configs.py +3 -1
- sky/skylet/constants.py +111 -26
- sky/skylet/events.py +64 -10
- sky/skylet/job_lib.py +141 -104
- sky/skylet/log_lib.py +233 -5
- sky/skylet/log_lib.pyi +40 -2
- sky/skylet/providers/ibm/node_provider.py +12 -8
- sky/skylet/providers/ibm/vpc_provider.py +13 -12
- sky/skylet/runtime_utils.py +21 -0
- sky/skylet/services.py +524 -0
- sky/skylet/skylet.py +22 -1
- sky/skylet/subprocess_daemon.py +104 -29
- sky/skypilot_config.py +99 -79
- sky/ssh_node_pools/server.py +9 -8
- sky/task.py +221 -104
- sky/templates/aws-ray.yml.j2 +1 -0
- sky/templates/azure-ray.yml.j2 +1 -0
- sky/templates/cudo-ray.yml.j2 +1 -0
- sky/templates/do-ray.yml.j2 +1 -0
- sky/templates/fluidstack-ray.yml.j2 +1 -0
- sky/templates/gcp-ray.yml.j2 +1 -0
- sky/templates/hyperbolic-ray.yml.j2 +1 -0
- sky/templates/ibm-ray.yml.j2 +2 -1
- sky/templates/jobs-controller.yaml.j2 +3 -0
- sky/templates/kubernetes-ray.yml.j2 +196 -55
- sky/templates/lambda-ray.yml.j2 +1 -0
- sky/templates/nebius-ray.yml.j2 +3 -0
- sky/templates/oci-ray.yml.j2 +1 -0
- sky/templates/paperspace-ray.yml.j2 +1 -0
- sky/templates/primeintellect-ray.yml.j2 +72 -0
- sky/templates/runpod-ray.yml.j2 +1 -0
- sky/templates/scp-ray.yml.j2 +1 -0
- sky/templates/seeweb-ray.yml.j2 +171 -0
- sky/templates/shadeform-ray.yml.j2 +73 -0
- sky/templates/vast-ray.yml.j2 +1 -0
- sky/templates/vsphere-ray.yml.j2 +1 -0
- sky/templates/websocket_proxy.py +188 -43
- sky/usage/usage_lib.py +16 -4
- sky/users/permission.py +60 -43
- sky/utils/accelerator_registry.py +6 -3
- sky/utils/admin_policy_utils.py +18 -5
- sky/utils/annotations.py +22 -0
- sky/utils/asyncio_utils.py +78 -0
- sky/utils/atomic.py +1 -1
- sky/utils/auth_utils.py +153 -0
- sky/utils/cli_utils/status_utils.py +12 -7
- sky/utils/cluster_utils.py +28 -6
- sky/utils/command_runner.py +88 -27
- sky/utils/command_runner.pyi +36 -3
- sky/utils/common.py +3 -1
- sky/utils/common_utils.py +37 -4
- sky/utils/config_utils.py +1 -14
- sky/utils/context.py +127 -40
- sky/utils/context_utils.py +73 -18
- sky/utils/controller_utils.py +229 -70
- sky/utils/db/db_utils.py +95 -18
- sky/utils/db/kv_cache.py +149 -0
- sky/utils/db/migration_utils.py +24 -7
- sky/utils/env_options.py +4 -0
- sky/utils/git.py +559 -1
- sky/utils/kubernetes/create_cluster.sh +15 -30
- sky/utils/kubernetes/delete_cluster.sh +10 -7
- sky/utils/kubernetes/{deploy_remote_cluster.py → deploy_ssh_node_pools.py} +258 -380
- sky/utils/kubernetes/generate_kind_config.py +6 -66
- sky/utils/kubernetes/gpu_labeler.py +13 -3
- sky/utils/kubernetes/k8s_gpu_labeler_job.yaml +2 -1
- sky/utils/kubernetes/k8s_gpu_labeler_setup.yaml +16 -16
- sky/utils/kubernetes/kubernetes_deploy_utils.py +213 -194
- sky/utils/kubernetes/rsync_helper.sh +11 -3
- sky/utils/kubernetes_enums.py +7 -15
- sky/utils/lock_events.py +4 -4
- sky/utils/locks.py +128 -31
- sky/utils/log_utils.py +0 -319
- sky/utils/resource_checker.py +13 -10
- sky/utils/resources_utils.py +53 -29
- sky/utils/rich_utils.py +8 -4
- sky/utils/schemas.py +107 -52
- sky/utils/subprocess_utils.py +17 -4
- sky/utils/thread_utils.py +91 -0
- sky/utils/timeline.py +2 -1
- sky/utils/ux_utils.py +35 -1
- sky/utils/volume.py +88 -4
- sky/utils/yaml_utils.py +9 -0
- sky/volumes/client/sdk.py +48 -10
- sky/volumes/server/core.py +59 -22
- sky/volumes/server/server.py +46 -17
- sky/volumes/volume.py +54 -42
- sky/workspaces/core.py +57 -21
- sky/workspaces/server.py +13 -12
- sky_templates/README.md +3 -0
- sky_templates/__init__.py +3 -0
- sky_templates/ray/__init__.py +0 -0
- sky_templates/ray/start_cluster +183 -0
- sky_templates/ray/stop_cluster +75 -0
- {skypilot_nightly-1.0.0.dev20250905.dist-info → skypilot_nightly-1.0.0.dev20251203.dist-info}/METADATA +331 -65
- skypilot_nightly-1.0.0.dev20251203.dist-info/RECORD +611 -0
- skypilot_nightly-1.0.0.dev20251203.dist-info/top_level.txt +2 -0
- sky/client/cli/git.py +0 -549
- sky/dashboard/out/_next/static/chunks/1121-408ed10b2f9fce17.js +0 -1
- sky/dashboard/out/_next/static/chunks/1141-943efc7aff0f0c06.js +0 -1
- sky/dashboard/out/_next/static/chunks/1836-37fede578e2da5f8.js +0 -40
- sky/dashboard/out/_next/static/chunks/3015-86cabed5d4669ad0.js +0 -1
- sky/dashboard/out/_next/static/chunks/3294.c80326aec9bfed40.js +0 -6
- sky/dashboard/out/_next/static/chunks/3785.4872a2f3aa489880.js +0 -1
- sky/dashboard/out/_next/static/chunks/4045.b30465273dc5e468.js +0 -21
- sky/dashboard/out/_next/static/chunks/4676-9da7fdbde90b5549.js +0 -10
- sky/dashboard/out/_next/static/chunks/4725.10f7a9a5d3ea8208.js +0 -1
- sky/dashboard/out/_next/static/chunks/5339.3fda4a4010ff4e06.js +0 -51
- sky/dashboard/out/_next/static/chunks/6135-4b4d5e824b7f9d3c.js +0 -1
- sky/dashboard/out/_next/static/chunks/649.b9d7f7d10c1b8c53.js +0 -45
- sky/dashboard/out/_next/static/chunks/6856-dca7962af4814e1b.js +0 -1
- sky/dashboard/out/_next/static/chunks/6990-08b2a1cae076a943.js +0 -1
- sky/dashboard/out/_next/static/chunks/7325.b4bc99ce0892dcd5.js +0 -6
- sky/dashboard/out/_next/static/chunks/754-d0da8ab45f9509e9.js +0 -18
- sky/dashboard/out/_next/static/chunks/7669.1f5d9a402bf5cc42.js +0 -36
- sky/dashboard/out/_next/static/chunks/8969-0be3036bf86f8256.js +0 -1
- sky/dashboard/out/_next/static/chunks/9025.c12318fb6a1a9093.js +0 -6
- sky/dashboard/out/_next/static/chunks/9037-fa1737818d0a0969.js +0 -6
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-1cbba24bd1bd35f8.js +0 -16
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-0b4b35dc1dfe046c.js +0 -16
- sky/dashboard/out/_next/static/chunks/pages/clusters-469814d711d63b1b.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-dd64309c3fe67ed2.js +0 -11
- sky/dashboard/out/_next/static/chunks/pages/jobs/pools/[pool]-07349868f7905d37.js +0 -16
- sky/dashboard/out/_next/static/chunks/pages/jobs-1f70d9faa564804f.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/users-018bf31cda52e11b.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/volumes-739726d6b823f532.js +0 -1
- sky/dashboard/out/_next/static/chunks/webpack-4fe903277b57b523.js +0 -1
- sky/dashboard/out/_next/static/css/4614e06482d7309e.css +0 -3
- sky/dashboard/out/_next/static/mS-4qZPSkRuA1u-g2wQhg/_buildManifest.js +0 -1
- sky/templates/kubernetes-ssh-jump.yml.j2 +0 -94
- sky/utils/kubernetes/ssh_jump_lifecycle_manager.py +0 -191
- skypilot_nightly-1.0.0.dev20250905.dist-info/RECORD +0 -547
- skypilot_nightly-1.0.0.dev20250905.dist-info/top_level.txt +0 -1
- /sky/dashboard/out/_next/static/{mS-4qZPSkRuA1u-g2wQhg → 96_E2yl3QAiIJGOYCkSpB}/_ssgManifest.js +0 -0
- {skypilot_nightly-1.0.0.dev20250905.dist-info → skypilot_nightly-1.0.0.dev20251203.dist-info}/WHEEL +0 -0
- {skypilot_nightly-1.0.0.dev20250905.dist-info → skypilot_nightly-1.0.0.dev20251203.dist-info}/entry_points.txt +0 -0
- {skypilot_nightly-1.0.0.dev20250905.dist-info → skypilot_nightly-1.0.0.dev20251203.dist-info}/licenses/LICENSE +0 -0
sky/clouds/vast.py
CHANGED
|
@@ -1,10 +1,12 @@
|
|
|
1
1
|
""" Vast Cloud. """
|
|
2
2
|
|
|
3
|
+
import os
|
|
3
4
|
import typing
|
|
4
5
|
from typing import Dict, Iterator, List, Optional, Tuple, Union
|
|
5
6
|
|
|
6
7
|
from sky import catalog
|
|
7
8
|
from sky import clouds
|
|
9
|
+
from sky.adaptors import common
|
|
8
10
|
from sky.utils import registry
|
|
9
11
|
from sky.utils import resources_utils
|
|
10
12
|
|
|
@@ -12,6 +14,8 @@ if typing.TYPE_CHECKING:
|
|
|
12
14
|
from sky import resources as resources_lib
|
|
13
15
|
from sky.utils import volume as volume_lib
|
|
14
16
|
|
|
17
|
+
_CREDENTIAL_PATH = '~/.config/vastai/vast_api_key'
|
|
18
|
+
|
|
15
19
|
|
|
16
20
|
@registry.CLOUD_REGISTRY.register
|
|
17
21
|
class Vast(clouds.Cloud):
|
|
@@ -51,7 +55,9 @@ class Vast(clouds.Cloud):
|
|
|
51
55
|
|
|
52
56
|
@classmethod
|
|
53
57
|
def _unsupported_features_for_resources(
|
|
54
|
-
cls,
|
|
58
|
+
cls,
|
|
59
|
+
resources: 'resources_lib.Resources',
|
|
60
|
+
region: Optional[str] = None,
|
|
55
61
|
) -> Dict[clouds.CloudImplementationFeatures, str]:
|
|
56
62
|
"""The features not supported based on the resources provided.
|
|
57
63
|
|
|
@@ -70,10 +76,15 @@ class Vast(clouds.Cloud):
|
|
|
70
76
|
return cls._MAX_CLUSTER_NAME_LEN_LIMIT
|
|
71
77
|
|
|
72
78
|
@classmethod
|
|
73
|
-
def regions_with_offering(
|
|
74
|
-
|
|
75
|
-
|
|
76
|
-
|
|
79
|
+
def regions_with_offering(
|
|
80
|
+
cls,
|
|
81
|
+
instance_type: str,
|
|
82
|
+
accelerators: Optional[Dict[str, int]],
|
|
83
|
+
use_spot: bool,
|
|
84
|
+
region: Optional[str],
|
|
85
|
+
zone: Optional[str],
|
|
86
|
+
resources: Optional['resources_lib.Resources'] = None,
|
|
87
|
+
) -> List[clouds.Region]:
|
|
77
88
|
assert zone is None, 'Vast does not support zones.'
|
|
78
89
|
del accelerators, zone # unused
|
|
79
90
|
regions = catalog.get_region_zones_for_instance_type(
|
|
@@ -253,32 +264,27 @@ class Vast(clouds.Cloud):
|
|
|
253
264
|
def _check_compute_credentials(
|
|
254
265
|
cls) -> Tuple[bool, Optional[Union[str, Dict[str, str]]]]:
|
|
255
266
|
"""Checks if the user has valid credentials for
|
|
256
|
-
Vast's compute service.
|
|
257
|
-
|
|
258
|
-
|
|
259
|
-
|
|
260
|
-
|
|
261
|
-
|
|
262
|
-
if vast.creds_source != 'FILE':
|
|
263
|
-
return False, (
|
|
264
|
-
'error \n' # First line is indented by 4 spaces
|
|
265
|
-
' Credentials can be set up by running: \n'
|
|
266
|
-
' $ pip install vastai\n'
|
|
267
|
-
' $ mkdir -p ~/.config/vastai\n'
|
|
268
|
-
' $ echo [key] > ~/.config/vastai/vast_api_key\n'
|
|
269
|
-
' For more information, see https://skypilot.readthedocs.io/en/latest/getting-started/installation.html#vast' # pylint: disable=line-too-long
|
|
270
|
-
)
|
|
267
|
+
Vast's compute service."""
|
|
268
|
+
|
|
269
|
+
dependency_error_msg = ('Failed to import vast. '
|
|
270
|
+
'To install, run: pip install skypilot[vast]')
|
|
271
|
+
if not common.can_import_modules(['vastai_sdk']):
|
|
272
|
+
return False, dependency_error_msg
|
|
271
273
|
|
|
272
|
-
|
|
274
|
+
if not os.path.exists(os.path.expanduser(_CREDENTIAL_PATH)):
|
|
275
|
+
return False, (
|
|
276
|
+
'error \n' # First line is indented by 4 spaces
|
|
277
|
+
' Credentials can be set up by running: \n'
|
|
278
|
+
' $ pip install vastai\n'
|
|
279
|
+
' $ mkdir -p ~/.config/vastai\n'
|
|
280
|
+
f' $ echo [key] > {_CREDENTIAL_PATH}\n'
|
|
281
|
+
' For more information, see https://skypilot.readthedocs.io/en/latest/getting-started/installation.html#vast' # pylint: disable=line-too-long
|
|
282
|
+
)
|
|
273
283
|
|
|
274
|
-
|
|
275
|
-
return False, ('Failed to import vast. '
|
|
276
|
-
'To install, run: pip install skypilot[vast]')
|
|
284
|
+
return True, None
|
|
277
285
|
|
|
278
286
|
def get_credential_file_mounts(self) -> Dict[str, str]:
|
|
279
|
-
return {
|
|
280
|
-
'~/.config/vastai/vast_api_key': '~/.config/vastai/vast_api_key'
|
|
281
|
-
}
|
|
287
|
+
return {f'{_CREDENTIAL_PATH}': f'{_CREDENTIAL_PATH}'}
|
|
282
288
|
|
|
283
289
|
@classmethod
|
|
284
290
|
def get_user_identities(cls) -> Optional[List[List[str]]]:
|
sky/clouds/vsphere.py
CHANGED
|
@@ -1,5 +1,4 @@
|
|
|
1
1
|
"""Vsphere cloud implementation."""
|
|
2
|
-
import subprocess
|
|
3
2
|
import typing
|
|
4
3
|
from typing import Dict, Iterator, List, Optional, Tuple, Union
|
|
5
4
|
|
|
@@ -9,7 +8,6 @@ from sky.adaptors import common as adaptors_common
|
|
|
9
8
|
from sky.provision.vsphere import vsphere_utils
|
|
10
9
|
from sky.provision.vsphere.vsphere_utils import get_vsphere_credentials
|
|
11
10
|
from sky.provision.vsphere.vsphere_utils import initialize_vsphere_data
|
|
12
|
-
from sky.utils import common_utils
|
|
13
11
|
from sky.utils import registry
|
|
14
12
|
from sky.utils import resources_utils
|
|
15
13
|
|
|
@@ -75,7 +73,9 @@ class Vsphere(clouds.Cloud):
|
|
|
75
73
|
|
|
76
74
|
@classmethod
|
|
77
75
|
def _unsupported_features_for_resources(
|
|
78
|
-
cls,
|
|
76
|
+
cls,
|
|
77
|
+
resources: 'resources_lib.Resources',
|
|
78
|
+
region: Optional[str] = None,
|
|
79
79
|
) -> Dict[clouds.CloudImplementationFeatures, str]:
|
|
80
80
|
features = cls._CLOUD_UNSUPPORTED_FEATURES
|
|
81
81
|
return features
|
|
@@ -92,6 +92,7 @@ class Vsphere(clouds.Cloud):
|
|
|
92
92
|
use_spot: bool,
|
|
93
93
|
region: Optional[str],
|
|
94
94
|
zone: Optional[str],
|
|
95
|
+
resources: Optional['resources_lib.Resources'] = None,
|
|
95
96
|
) -> List[clouds.Region]:
|
|
96
97
|
del accelerators, zone # unused
|
|
97
98
|
regions = catalog.get_region_zones_for_instance_type(
|
|
@@ -278,19 +279,16 @@ class Vsphere(clouds.Cloud):
|
|
|
278
279
|
cls) -> Tuple[bool, Optional[Union[str, Dict[str, str]]]]:
|
|
279
280
|
"""Checks if the user has access credentials to
|
|
280
281
|
vSphere's compute service."""
|
|
281
|
-
|
|
282
|
-
|
|
283
|
-
|
|
284
|
-
|
|
285
|
-
|
|
286
|
-
|
|
287
|
-
|
|
288
|
-
|
|
289
|
-
|
|
290
|
-
|
|
291
|
-
f'\n{cls._INDENT_PREFIX}Credentials may also need to be set. '
|
|
292
|
-
'For more details. See https://docs.skypilot.co/en/latest/getting-started/installation.html#vmware-vsphere' # pylint: disable=line-too-long
|
|
293
|
-
f'{common_utils.format_exception(e, use_bracket=True)}')
|
|
282
|
+
dependency_error_msg = (
|
|
283
|
+
'vSphere dependencies are not installed. '
|
|
284
|
+
'Run the following commands:'
|
|
285
|
+
f'\n{cls._INDENT_PREFIX} $ pip install skypilot[vSphere]'
|
|
286
|
+
f'\n{cls._INDENT_PREFIX}Credentials may also need to be set. '
|
|
287
|
+
'For more details. See https://docs.skypilot.co/en/latest/getting-started/installation.html#vmware-vsphere' # pylint: disable=line-too-long
|
|
288
|
+
)
|
|
289
|
+
# Check pyVmomi installation.
|
|
290
|
+
if not adaptors_common.can_import_modules(['pyVmomi']):
|
|
291
|
+
return False, dependency_error_msg
|
|
294
292
|
|
|
295
293
|
required_keys = ['name', 'username', 'password', 'clusters']
|
|
296
294
|
skip_key = 'skip_verification'
|
sky/core.py
CHANGED
|
@@ -1,6 +1,4 @@
|
|
|
1
1
|
"""SDK functions for cluster/job management."""
|
|
2
|
-
import os
|
|
3
|
-
import shlex
|
|
4
2
|
import typing
|
|
5
3
|
from typing import Any, Dict, List, Optional, Tuple, Union
|
|
6
4
|
|
|
@@ -9,7 +7,6 @@ import colorama
|
|
|
9
7
|
from sky import admin_policy
|
|
10
8
|
from sky import backends
|
|
11
9
|
from sky import catalog
|
|
12
|
-
from sky import check as sky_check
|
|
13
10
|
from sky import clouds
|
|
14
11
|
from sky import dag as dag_lib
|
|
15
12
|
from sky import data
|
|
@@ -20,16 +17,18 @@ from sky import optimizer
|
|
|
20
17
|
from sky import sky_logging
|
|
21
18
|
from sky import skypilot_config
|
|
22
19
|
from sky import task as task_lib
|
|
20
|
+
from sky.adaptors import common as adaptors_common
|
|
23
21
|
from sky.backends import backend_utils
|
|
22
|
+
from sky.backends import cloud_vm_ray_backend
|
|
24
23
|
from sky.clouds import cloud as sky_cloud
|
|
25
24
|
from sky.jobs.server import core as managed_jobs_core
|
|
26
25
|
from sky.provision.kubernetes import constants as kubernetes_constants
|
|
27
26
|
from sky.provision.kubernetes import utils as kubernetes_utils
|
|
28
27
|
from sky.schemas.api import responses
|
|
28
|
+
from sky.server.requests import request_names
|
|
29
29
|
from sky.skylet import autostop_lib
|
|
30
30
|
from sky.skylet import constants
|
|
31
31
|
from sky.skylet import job_lib
|
|
32
|
-
from sky.skylet import log_lib
|
|
33
32
|
from sky.usage import usage_lib
|
|
34
33
|
from sky.utils import admin_policy_utils
|
|
35
34
|
from sky.utils import common
|
|
@@ -44,6 +43,9 @@ from sky.utils.kubernetes import kubernetes_deploy_utils
|
|
|
44
43
|
|
|
45
44
|
if typing.TYPE_CHECKING:
|
|
46
45
|
from sky import resources as resources_lib
|
|
46
|
+
from sky.schemas.generated import jobsv1_pb2
|
|
47
|
+
else:
|
|
48
|
+
jobsv1_pb2 = adaptors_common.LazyImport('sky.schemas.generated.jobsv1_pb2')
|
|
47
49
|
|
|
48
50
|
logger = sky_logging.init_logger(__name__)
|
|
49
51
|
|
|
@@ -83,7 +85,9 @@ def optimize(
|
|
|
83
85
|
# but we do not apply the admin policy there. We should apply the admin
|
|
84
86
|
# policy in the optimizer, but that will require some refactoring.
|
|
85
87
|
with admin_policy_utils.apply_and_use_config_in_current_request(
|
|
86
|
-
dag,
|
|
88
|
+
dag,
|
|
89
|
+
request_name=request_names.AdminPolicyRequestName.OPTIMIZE,
|
|
90
|
+
request_options=request_options) as dag:
|
|
87
91
|
dag.resolve_and_validate_volumes()
|
|
88
92
|
return optimizer.Optimizer.optimize(dag=dag,
|
|
89
93
|
minimize=minimize,
|
|
@@ -97,6 +101,8 @@ def status(
|
|
|
97
101
|
refresh: common.StatusRefreshMode = common.StatusRefreshMode.NONE,
|
|
98
102
|
all_users: bool = False,
|
|
99
103
|
include_credentials: bool = False,
|
|
104
|
+
summary_response: bool = False,
|
|
105
|
+
include_handle: bool = True,
|
|
100
106
|
) -> List[responses.StatusResponse]:
|
|
101
107
|
# NOTE(dev): Keep the docstring consistent between the Python API and CLI.
|
|
102
108
|
"""Gets cluster statuses.
|
|
@@ -176,16 +182,25 @@ def status(
|
|
|
176
182
|
refresh=refresh,
|
|
177
183
|
cluster_names=cluster_names,
|
|
178
184
|
all_users=all_users,
|
|
179
|
-
include_credentials=include_credentials
|
|
180
|
-
|
|
181
|
-
|
|
182
|
-
|
|
185
|
+
include_credentials=include_credentials,
|
|
186
|
+
summary_response=summary_response,
|
|
187
|
+
include_handle=include_handle)
|
|
188
|
+
|
|
189
|
+
status_responses = []
|
|
190
|
+
for cluster in clusters:
|
|
191
|
+
try:
|
|
192
|
+
status_responses.append(
|
|
193
|
+
responses.StatusResponse.model_validate(cluster))
|
|
194
|
+
except Exception as e: # pylint: disable=broad-except
|
|
195
|
+
logger.warning('Failed to validate status responses for cluster '
|
|
196
|
+
f'{cluster.get("name")}: {e}')
|
|
197
|
+
return status_responses
|
|
183
198
|
|
|
184
199
|
|
|
185
200
|
def status_kubernetes(
|
|
186
201
|
) -> Tuple[List['kubernetes_utils.KubernetesSkyPilotClusterInfoPayload'],
|
|
187
202
|
List['kubernetes_utils.KubernetesSkyPilotClusterInfoPayload'],
|
|
188
|
-
List[
|
|
203
|
+
List[responses.ManagedJobRecord], Optional[str]]:
|
|
189
204
|
"""Gets all SkyPilot clusters and jobs in the Kubernetes cluster.
|
|
190
205
|
|
|
191
206
|
Managed jobs and services are also included in the clusters returned.
|
|
@@ -260,6 +275,7 @@ all_clusters, unmanaged_clusters, all_jobs, context
|
|
|
260
275
|
kubernetes_utils.KubernetesSkyPilotClusterInfoPayload.from_cluster(c)
|
|
261
276
|
for c in unmanaged_clusters
|
|
262
277
|
]
|
|
278
|
+
all_jobs = [responses.ManagedJobRecord(**job) for job in all_jobs]
|
|
263
279
|
return all_clusters, unmanaged_clusters, all_jobs, context
|
|
264
280
|
|
|
265
281
|
|
|
@@ -288,7 +304,10 @@ def endpoints(cluster: str,
|
|
|
288
304
|
|
|
289
305
|
|
|
290
306
|
@usage_lib.entrypoint
|
|
291
|
-
def cost_report(
|
|
307
|
+
def cost_report(
|
|
308
|
+
days: Optional[int] = None,
|
|
309
|
+
dashboard_summary_response: bool = False,
|
|
310
|
+
cluster_hashes: Optional[List[str]] = None) -> List[Dict[str, Any]]:
|
|
292
311
|
# NOTE(dev): Keep the docstring consistent between the Python API and CLI.
|
|
293
312
|
"""Get all cluster cost reports, including those that have been downed.
|
|
294
313
|
|
|
@@ -334,7 +353,12 @@ def cost_report(days: Optional[int] = None) -> List[Dict[str, Any]]:
|
|
|
334
353
|
if days is None:
|
|
335
354
|
days = constants.COST_REPORT_DEFAULT_DAYS
|
|
336
355
|
|
|
337
|
-
|
|
356
|
+
abbreviate_response = dashboard_summary_response and cluster_hashes is None
|
|
357
|
+
|
|
358
|
+
cluster_reports = global_user_state.get_clusters_from_history(
|
|
359
|
+
days=days,
|
|
360
|
+
abbreviate_response=abbreviate_response,
|
|
361
|
+
cluster_hashes=cluster_hashes)
|
|
338
362
|
logger.debug(
|
|
339
363
|
f'{len(cluster_reports)} clusters found from history with {days} days.')
|
|
340
364
|
|
|
@@ -352,43 +376,6 @@ def cost_report(days: Optional[int] = None) -> List[Dict[str, Any]]:
|
|
|
352
376
|
cost = (launched_resources.get_cost(duration) * launched_nodes)
|
|
353
377
|
return cost
|
|
354
378
|
|
|
355
|
-
def _update_record_with_resources(record: Dict[str, Any]) -> None:
|
|
356
|
-
"""Add resource fields for dashboard compatibility."""
|
|
357
|
-
if record is None:
|
|
358
|
-
return
|
|
359
|
-
resources = record.get('resources')
|
|
360
|
-
if resources is None:
|
|
361
|
-
return
|
|
362
|
-
fields = ['cloud', 'region', 'cpus', 'memory', 'accelerators']
|
|
363
|
-
for field in fields:
|
|
364
|
-
try:
|
|
365
|
-
record[field] = str(getattr(resources, field))
|
|
366
|
-
except Exception as e: # pylint: disable=broad-except
|
|
367
|
-
# Ok to skip the fields as this is just for display
|
|
368
|
-
# purposes.
|
|
369
|
-
logger.debug(f'Failed to get resources.{field} for cluster '
|
|
370
|
-
f'{record["name"]}: {str(e)}')
|
|
371
|
-
record[field] = None
|
|
372
|
-
|
|
373
|
-
# Add resources_str and resources_str_full for dashboard
|
|
374
|
-
# compatibility
|
|
375
|
-
num_nodes = record.get('num_nodes', 1)
|
|
376
|
-
try:
|
|
377
|
-
resource_str_simple = resources_utils.format_resource(
|
|
378
|
-
resources, simplify=True)
|
|
379
|
-
resource_str_full = resources_utils.format_resource(
|
|
380
|
-
resources, simplify=False)
|
|
381
|
-
record['resources_str'] = f'{num_nodes}x{resource_str_simple}'
|
|
382
|
-
record[
|
|
383
|
-
'resources_str_full'] = f'{num_nodes}x{resource_str_full}'
|
|
384
|
-
except Exception as e: # pylint: disable=broad-except
|
|
385
|
-
logger.debug(f'Failed to get resources_str for cluster '
|
|
386
|
-
f'{record["name"]}: {str(e)}')
|
|
387
|
-
for field in fields:
|
|
388
|
-
record[field] = None
|
|
389
|
-
record['resources_str'] = '-'
|
|
390
|
-
record['resources_str_full'] = '-'
|
|
391
|
-
|
|
392
379
|
try:
|
|
393
380
|
report['total_cost'] = get_total_cost(report)
|
|
394
381
|
except Exception as e: # pylint: disable=broad-except
|
|
@@ -397,17 +384,62 @@ def cost_report(days: Optional[int] = None) -> List[Dict[str, Any]]:
|
|
|
397
384
|
f'{report["name"]}: {str(e)}')
|
|
398
385
|
report['total_cost'] = 0.0
|
|
399
386
|
|
|
400
|
-
_update_record_with_resources(report)
|
|
401
387
|
return report
|
|
402
388
|
|
|
403
389
|
# Process clusters in parallel
|
|
404
390
|
if not cluster_reports:
|
|
405
391
|
return []
|
|
406
392
|
|
|
407
|
-
|
|
408
|
-
|
|
393
|
+
if not abbreviate_response:
|
|
394
|
+
cluster_reports = subprocess_utils.run_in_parallel(
|
|
395
|
+
_process_cluster_report, cluster_reports)
|
|
396
|
+
|
|
397
|
+
def _update_record_with_resources(record: Dict[str, Any]) -> None:
|
|
398
|
+
"""Add resource fields for dashboard compatibility."""
|
|
399
|
+
if record is None:
|
|
400
|
+
return
|
|
401
|
+
resources = record.get('resources')
|
|
402
|
+
if resources is None:
|
|
403
|
+
return
|
|
404
|
+
if not dashboard_summary_response:
|
|
405
|
+
fields = ['cloud', 'region', 'cpus', 'memory', 'accelerators']
|
|
406
|
+
else:
|
|
407
|
+
fields = ['cloud']
|
|
408
|
+
for field in fields:
|
|
409
|
+
try:
|
|
410
|
+
record[field] = str(getattr(resources, field))
|
|
411
|
+
except Exception as e: # pylint: disable=broad-except
|
|
412
|
+
# Ok to skip the fields as this is just for display
|
|
413
|
+
# purposes.
|
|
414
|
+
logger.debug(f'Failed to get resources.{field} for cluster '
|
|
415
|
+
f'{record["name"]}: {str(e)}')
|
|
416
|
+
record[field] = None
|
|
409
417
|
|
|
410
|
-
|
|
418
|
+
# Add resources_str and resources_str_full for dashboard
|
|
419
|
+
# compatibility
|
|
420
|
+
num_nodes = record.get('num_nodes', 1)
|
|
421
|
+
try:
|
|
422
|
+
resource_str_simple, resource_str_full = (
|
|
423
|
+
resources_utils.format_resource(resources,
|
|
424
|
+
simplified_only=False))
|
|
425
|
+
record['resources_str'] = f'{num_nodes}x{resource_str_simple}'
|
|
426
|
+
record['resources_str_full'] = f'{num_nodes}x{resource_str_full}'
|
|
427
|
+
except Exception as e: # pylint: disable=broad-except
|
|
428
|
+
logger.debug(f'Failed to get resources_str for cluster '
|
|
429
|
+
f'{record["name"]}: {str(e)}')
|
|
430
|
+
for field in fields:
|
|
431
|
+
record[field] = None
|
|
432
|
+
record['resources_str'] = '-'
|
|
433
|
+
record['resources_str_full'] = '-'
|
|
434
|
+
|
|
435
|
+
for report in cluster_reports:
|
|
436
|
+
_update_record_with_resources(report)
|
|
437
|
+
if dashboard_summary_response:
|
|
438
|
+
report.pop('usage_intervals')
|
|
439
|
+
report.pop('user_hash')
|
|
440
|
+
report.pop('resources')
|
|
441
|
+
|
|
442
|
+
return cluster_reports
|
|
411
443
|
|
|
412
444
|
|
|
413
445
|
def _start(
|
|
@@ -466,6 +498,32 @@ def _start(
|
|
|
466
498
|
controller_autostop_config.enabled):
|
|
467
499
|
idle_minutes_to_autostop = controller_autostop_config.idle_minutes
|
|
468
500
|
down = controller_autostop_config.down
|
|
501
|
+
else:
|
|
502
|
+
# For non-controller clusters, restore autostop configuration from
|
|
503
|
+
# database if not explicitly provided.
|
|
504
|
+
if idle_minutes_to_autostop is None:
|
|
505
|
+
cluster_record = global_user_state.get_cluster_from_name(
|
|
506
|
+
cluster_name, include_user_info=False, summary_response=True)
|
|
507
|
+
if cluster_record is not None:
|
|
508
|
+
stored_autostop = cluster_record.get('autostop', -1)
|
|
509
|
+
stored_to_down = cluster_record.get('to_down', False)
|
|
510
|
+
# Restore autostop if it was previously set (autostop > 0)
|
|
511
|
+
if stored_autostop > 0:
|
|
512
|
+
logger.warning(f'Restoring cluster {cluster_name!r} with '
|
|
513
|
+
f'autostop set to {stored_autostop} minutes'
|
|
514
|
+
f'. To turn off autostop, run: '
|
|
515
|
+
f'`sky autostop {cluster_name} --cancel`')
|
|
516
|
+
idle_minutes_to_autostop = stored_autostop
|
|
517
|
+
# Only restore 'down' if it was explicitly set and we're
|
|
518
|
+
# restoring autostop
|
|
519
|
+
if stored_to_down:
|
|
520
|
+
down = stored_to_down
|
|
521
|
+
elif stored_autostop == 0:
|
|
522
|
+
logger.warning(
|
|
523
|
+
f'Autostop was previously set to 0 minutes '
|
|
524
|
+
f'for cluster {cluster_name!r} so it will '
|
|
525
|
+
'not be restored. To turn on autostop, run: '
|
|
526
|
+
f'`sky autostop {cluster_name} -i <minutes>`')
|
|
469
527
|
|
|
470
528
|
usage_lib.record_cluster_name_for_current_operation(cluster_name)
|
|
471
529
|
|
|
@@ -773,7 +831,7 @@ def autostop(
|
|
|
773
831
|
@usage_lib.entrypoint
|
|
774
832
|
def queue(cluster_name: str,
|
|
775
833
|
skip_finished: bool = False,
|
|
776
|
-
all_users: bool = False) -> List[
|
|
834
|
+
all_users: bool = False) -> List[responses.ClusterJobRecord]:
|
|
777
835
|
# NOTE(dev): Keep the docstring consistent between the Python API and CLI.
|
|
778
836
|
"""Gets the job queue of a cluster.
|
|
779
837
|
|
|
@@ -811,7 +869,6 @@ def queue(cluster_name: str,
|
|
|
811
869
|
user_hash = None
|
|
812
870
|
else:
|
|
813
871
|
user_hash = common_utils.get_current_user().id
|
|
814
|
-
code = job_lib.JobLibCodeGen.get_job_queue(user_hash, all_jobs)
|
|
815
872
|
|
|
816
873
|
handle = backend_utils.check_cluster_available(
|
|
817
874
|
cluster_name,
|
|
@@ -819,18 +876,49 @@ def queue(cluster_name: str,
|
|
|
819
876
|
)
|
|
820
877
|
backend = backend_utils.get_backend_from_handle(handle)
|
|
821
878
|
|
|
822
|
-
|
|
823
|
-
|
|
824
|
-
|
|
825
|
-
|
|
826
|
-
|
|
827
|
-
|
|
828
|
-
|
|
829
|
-
|
|
830
|
-
|
|
831
|
-
|
|
832
|
-
|
|
833
|
-
|
|
879
|
+
use_legacy = not handle.is_grpc_enabled_with_flag
|
|
880
|
+
|
|
881
|
+
if not use_legacy:
|
|
882
|
+
try:
|
|
883
|
+
request = jobsv1_pb2.GetJobQueueRequest(user_hash=user_hash,
|
|
884
|
+
all_jobs=all_jobs)
|
|
885
|
+
response = backend_utils.invoke_skylet_with_retries(
|
|
886
|
+
lambda: cloud_vm_ray_backend.SkyletClient(
|
|
887
|
+
handle.get_grpc_channel()).get_job_queue(request))
|
|
888
|
+
jobs = []
|
|
889
|
+
for job_info in response.jobs:
|
|
890
|
+
job_dict = {
|
|
891
|
+
'job_id': job_info.job_id,
|
|
892
|
+
'job_name': job_info.job_name,
|
|
893
|
+
'submitted_at': job_info.submitted_at,
|
|
894
|
+
'status': job_lib.JobStatus.from_protobuf(job_info.status),
|
|
895
|
+
'run_timestamp': job_info.run_timestamp,
|
|
896
|
+
'start_at': job_info.start_at
|
|
897
|
+
if job_info.HasField('start_at') else None,
|
|
898
|
+
'end_at': job_info.end_at
|
|
899
|
+
if job_info.HasField('end_at') else None,
|
|
900
|
+
'resources': job_info.resources,
|
|
901
|
+
'log_path': job_info.log_path,
|
|
902
|
+
'user_hash': job_info.username,
|
|
903
|
+
}
|
|
904
|
+
# Copied from job_lib.load_job_queue.
|
|
905
|
+
user = global_user_state.get_user(job_dict['user_hash'])
|
|
906
|
+
job_dict['username'] = user.name if user is not None else None
|
|
907
|
+
jobs.append(job_dict)
|
|
908
|
+
except exceptions.SkyletMethodNotImplementedError:
|
|
909
|
+
use_legacy = True
|
|
910
|
+
if use_legacy:
|
|
911
|
+
code = job_lib.JobLibCodeGen.get_job_queue(user_hash, all_jobs)
|
|
912
|
+
returncode, jobs_payload, stderr = backend.run_on_head(
|
|
913
|
+
handle, code, require_outputs=True, separate_stderr=True)
|
|
914
|
+
subprocess_utils.handle_returncode(
|
|
915
|
+
returncode,
|
|
916
|
+
command=code,
|
|
917
|
+
error_msg=f'Failed to get job queue on cluster {cluster_name}.',
|
|
918
|
+
stderr=f'{jobs_payload + stderr}',
|
|
919
|
+
stream_logs=True)
|
|
920
|
+
jobs = job_lib.load_job_queue(jobs_payload)
|
|
921
|
+
return [responses.ClusterJobRecord.model_validate(job) for job in jobs]
|
|
834
922
|
|
|
835
923
|
|
|
836
924
|
@usage_lib.entrypoint
|
|
@@ -1070,25 +1158,25 @@ def job_status(cluster_name: str,
|
|
|
1070
1158
|
# = Storage Management =
|
|
1071
1159
|
# ======================
|
|
1072
1160
|
@usage_lib.entrypoint
|
|
1073
|
-
def storage_ls() -> List[
|
|
1161
|
+
def storage_ls() -> List[responses.StorageRecord]:
|
|
1074
1162
|
# NOTE(dev): Keep the docstring consistent between the Python API and CLI.
|
|
1075
1163
|
"""Gets the storages.
|
|
1076
1164
|
|
|
1077
1165
|
Returns:
|
|
1078
|
-
[
|
|
1079
|
-
{
|
|
1080
|
-
'name': str,
|
|
1081
|
-
'launched_at': int timestamp of creation,
|
|
1082
|
-
'store': List[sky.StoreType],
|
|
1083
|
-
'last_use': int timestamp of last use,
|
|
1084
|
-
'status': sky.StorageStatus,
|
|
1085
|
-
}
|
|
1086
|
-
]
|
|
1166
|
+
List[responses.StorageRecord]: A list of storage records.
|
|
1087
1167
|
"""
|
|
1088
1168
|
storages = global_user_state.get_storage()
|
|
1169
|
+
storage_records = []
|
|
1089
1170
|
for storage in storages:
|
|
1090
|
-
|
|
1091
|
-
|
|
1171
|
+
storage_records.append(
|
|
1172
|
+
responses.StorageRecord(
|
|
1173
|
+
name=storage['name'],
|
|
1174
|
+
launched_at=storage['launched_at'],
|
|
1175
|
+
store=list(storage.pop('handle').sky_stores.keys()),
|
|
1176
|
+
last_use=storage['last_use'],
|
|
1177
|
+
status=storage['status'],
|
|
1178
|
+
))
|
|
1179
|
+
return storage_records
|
|
1092
1180
|
|
|
1093
1181
|
|
|
1094
1182
|
@usage_lib.entrypoint
|
|
@@ -1104,9 +1192,7 @@ def storage_delete(name: str) -> None:
|
|
|
1104
1192
|
if handle is None:
|
|
1105
1193
|
raise ValueError(f'Storage name {name!r} not found.')
|
|
1106
1194
|
else:
|
|
1107
|
-
storage_object = data.Storage(
|
|
1108
|
-
source=handle.source,
|
|
1109
|
-
sync_on_reconstruction=False)
|
|
1195
|
+
storage_object = data.Storage.from_handle(handle)
|
|
1110
1196
|
storage_object.delete()
|
|
1111
1197
|
|
|
1112
1198
|
|
|
@@ -1233,92 +1319,15 @@ def realtime_kubernetes_gpu_availability(
|
|
|
1233
1319
|
# =================
|
|
1234
1320
|
@usage_lib.entrypoint
|
|
1235
1321
|
def local_up(gpus: bool,
|
|
1236
|
-
|
|
1237
|
-
|
|
1238
|
-
|
|
1239
|
-
|
|
1240
|
-
context_name: Optional[str] = None,
|
|
1241
|
-
password: Optional[str] = None) -> None:
|
|
1242
|
-
"""Creates a local or remote cluster."""
|
|
1243
|
-
|
|
1244
|
-
def _validate_args(ips, ssh_user, ssh_key, cleanup):
|
|
1245
|
-
# If any of --ips, --ssh-user, or --ssh-key-path is specified,
|
|
1246
|
-
# all must be specified
|
|
1247
|
-
if bool(ips) or bool(ssh_user) or bool(ssh_key):
|
|
1248
|
-
if not (ips and ssh_user and ssh_key):
|
|
1249
|
-
with ux_utils.print_exception_no_traceback():
|
|
1250
|
-
raise ValueError(
|
|
1251
|
-
'All ips, ssh_user, and ssh_key must be specified '
|
|
1252
|
-
'together.')
|
|
1253
|
-
|
|
1254
|
-
# --cleanup can only be used if --ips, --ssh-user and --ssh-key-path
|
|
1255
|
-
# are all provided
|
|
1256
|
-
if cleanup and not (ips and ssh_user and ssh_key):
|
|
1257
|
-
with ux_utils.print_exception_no_traceback():
|
|
1258
|
-
raise ValueError(
|
|
1259
|
-
'cleanup can only be used with ips, ssh_user and ssh_key.')
|
|
1260
|
-
|
|
1261
|
-
_validate_args(ips, ssh_user, ssh_key, cleanup)
|
|
1262
|
-
|
|
1263
|
-
# If remote deployment arguments are specified, run remote up script
|
|
1264
|
-
if ips:
|
|
1265
|
-
assert ssh_user is not None and ssh_key is not None
|
|
1266
|
-
kubernetes_deploy_utils.deploy_remote_cluster(ips, ssh_user, ssh_key,
|
|
1267
|
-
cleanup, context_name,
|
|
1268
|
-
password)
|
|
1269
|
-
else:
|
|
1270
|
-
# Run local deployment (kind) if no remote args are specified
|
|
1271
|
-
kubernetes_deploy_utils.deploy_local_cluster(gpus)
|
|
1322
|
+
name: Optional[str] = None,
|
|
1323
|
+
port_start: Optional[int] = None) -> None:
|
|
1324
|
+
"""Creates a local cluster."""
|
|
1325
|
+
kubernetes_deploy_utils.deploy_local_cluster(name, port_start, gpus)
|
|
1272
1326
|
|
|
1273
1327
|
|
|
1274
|
-
def local_down() -> None:
|
|
1328
|
+
def local_down(name: Optional[str] = None) -> None:
|
|
1275
1329
|
"""Tears down the Kubernetes cluster started by local_up."""
|
|
1276
|
-
|
|
1277
|
-
|
|
1278
|
-
path_to_package = os.path.dirname(__file__)
|
|
1279
|
-
down_script_path = os.path.join(path_to_package, 'utils/kubernetes',
|
|
1280
|
-
'delete_cluster.sh')
|
|
1281
|
-
|
|
1282
|
-
cwd = os.path.dirname(os.path.abspath(down_script_path))
|
|
1283
|
-
run_command = shlex.split(down_script_path)
|
|
1284
|
-
|
|
1285
|
-
# Setup logging paths
|
|
1286
|
-
run_timestamp = sky_logging.get_run_timestamp()
|
|
1287
|
-
log_path = os.path.join(constants.SKY_LOGS_DIRECTORY, run_timestamp,
|
|
1288
|
-
'local_down.log')
|
|
1289
|
-
|
|
1290
|
-
with rich_utils.safe_status(
|
|
1291
|
-
ux_utils.spinner_message('Removing local cluster',
|
|
1292
|
-
log_path=log_path,
|
|
1293
|
-
is_local=True)):
|
|
1294
|
-
|
|
1295
|
-
returncode, stdout, stderr = log_lib.run_with_log(cmd=run_command,
|
|
1296
|
-
log_path=log_path,
|
|
1297
|
-
require_outputs=True,
|
|
1298
|
-
stream_logs=False,
|
|
1299
|
-
cwd=cwd)
|
|
1300
|
-
stderr = stderr.replace('No kind clusters found.\n', '')
|
|
1301
|
-
|
|
1302
|
-
if returncode == 0:
|
|
1303
|
-
cluster_removed = True
|
|
1304
|
-
elif returncode == 100:
|
|
1305
|
-
logger.info(ux_utils.error_message('Local cluster does not exist.'))
|
|
1306
|
-
else:
|
|
1307
|
-
with ux_utils.print_exception_no_traceback():
|
|
1308
|
-
raise RuntimeError('Failed to create local cluster. '
|
|
1309
|
-
f'Stdout: {stdout}'
|
|
1310
|
-
f'\nError: {stderr}')
|
|
1311
|
-
if cluster_removed:
|
|
1312
|
-
# Run sky check
|
|
1313
|
-
with rich_utils.safe_status(
|
|
1314
|
-
ux_utils.spinner_message('Running sky check...')):
|
|
1315
|
-
sky_check.check_capability(sky_cloud.CloudCapability.COMPUTE,
|
|
1316
|
-
clouds=['kubernetes'],
|
|
1317
|
-
quiet=True)
|
|
1318
|
-
logger.info(
|
|
1319
|
-
ux_utils.finishing_message('Local cluster removed.',
|
|
1320
|
-
log_path=log_path,
|
|
1321
|
-
is_local=True))
|
|
1330
|
+
kubernetes_deploy_utils.teardown_local_cluster(name)
|
|
1322
1331
|
|
|
1323
1332
|
|
|
1324
1333
|
@usage_lib.entrypoint
|
sky/dashboard/out/404.html
CHANGED
|
@@ -1 +1 @@
|
|
|
1
|
-
<!DOCTYPE html><html><head><meta charSet="utf-8"/><meta name="viewport" content="width=device-width"/><meta name="next-head-count" content="2"/><link rel="preload" href="/dashboard/_next/static/css/
|
|
1
|
+
<!DOCTYPE html><html><head><meta charSet="utf-8"/><meta name="viewport" content="width=device-width"/><meta name="next-head-count" content="2"/><link rel="preload" href="/dashboard/_next/static/css/0748ce22df867032.css" as="style"/><link rel="stylesheet" href="/dashboard/_next/static/css/0748ce22df867032.css" data-n-g=""/><noscript data-n-css=""></noscript><script defer="" nomodule="" src="/dashboard/_next/static/chunks/polyfills-78c92fac7aa8fdd8.js"></script><script src="/dashboard/_next/static/chunks/webpack-64e05f17bf2cf8ce.js" defer=""></script><script src="/dashboard/_next/static/chunks/framework-cf60a09ccd051a10.js" defer=""></script><script src="/dashboard/_next/static/chunks/main-f15ccb73239a3bf1.js" defer=""></script><script src="/dashboard/_next/static/chunks/pages/_app-bde01e4a2beec258.js" defer=""></script><script src="/dashboard/_next/static/chunks/pages/_error-c66a4e8afc46f17b.js" defer=""></script><script src="/dashboard/_next/static/96_E2yl3QAiIJGOYCkSpB/_buildManifest.js" defer=""></script><script src="/dashboard/_next/static/96_E2yl3QAiIJGOYCkSpB/_ssgManifest.js" defer=""></script></head><body><div id="__next"></div><script id="__NEXT_DATA__" type="application/json">{"props":{"pageProps":{"statusCode":404}},"page":"/_error","query":{},"buildId":"96_E2yl3QAiIJGOYCkSpB","assetPrefix":"/dashboard","nextExport":true,"isFallback":false,"gip":true,"scriptLoader":[]}</script></body></html>
|