skypilot-nightly 1.0.0.dev20250905__py3-none-any.whl → 1.0.0.dev20251210__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sky/__init__.py +12 -2
- sky/adaptors/aws.py +27 -22
- sky/adaptors/common.py +25 -2
- sky/adaptors/coreweave.py +278 -0
- sky/adaptors/do.py +8 -2
- sky/adaptors/gcp.py +11 -0
- sky/adaptors/ibm.py +5 -2
- sky/adaptors/kubernetes.py +64 -0
- sky/adaptors/nebius.py +3 -1
- sky/adaptors/primeintellect.py +1 -0
- sky/adaptors/seeweb.py +183 -0
- sky/adaptors/shadeform.py +89 -0
- sky/adaptors/slurm.py +478 -0
- sky/admin_policy.py +20 -0
- sky/authentication.py +157 -263
- sky/backends/__init__.py +3 -2
- sky/backends/backend.py +11 -3
- sky/backends/backend_utils.py +630 -185
- sky/backends/cloud_vm_ray_backend.py +1111 -928
- sky/backends/local_docker_backend.py +9 -5
- sky/backends/task_codegen.py +971 -0
- sky/backends/wheel_utils.py +18 -0
- sky/catalog/__init__.py +8 -3
- sky/catalog/aws_catalog.py +4 -0
- sky/catalog/common.py +19 -1
- sky/catalog/data_fetchers/fetch_aws.py +102 -80
- sky/catalog/data_fetchers/fetch_gcp.py +30 -3
- sky/catalog/data_fetchers/fetch_nebius.py +9 -6
- sky/catalog/data_fetchers/fetch_runpod.py +698 -0
- sky/catalog/data_fetchers/fetch_seeweb.py +329 -0
- sky/catalog/data_fetchers/fetch_shadeform.py +142 -0
- sky/catalog/kubernetes_catalog.py +36 -32
- sky/catalog/primeintellect_catalog.py +95 -0
- sky/catalog/runpod_catalog.py +5 -1
- sky/catalog/seeweb_catalog.py +184 -0
- sky/catalog/shadeform_catalog.py +165 -0
- sky/catalog/slurm_catalog.py +243 -0
- sky/check.py +87 -46
- sky/client/cli/command.py +1004 -434
- sky/client/cli/flags.py +4 -2
- sky/{volumes/utils.py → client/cli/table_utils.py} +111 -13
- sky/client/cli/utils.py +79 -0
- sky/client/common.py +12 -2
- sky/client/sdk.py +188 -65
- sky/client/sdk_async.py +34 -33
- sky/cloud_stores.py +82 -3
- sky/clouds/__init__.py +8 -0
- sky/clouds/aws.py +337 -129
- sky/clouds/azure.py +24 -18
- sky/clouds/cloud.py +47 -13
- sky/clouds/cudo.py +16 -13
- sky/clouds/do.py +9 -7
- sky/clouds/fluidstack.py +12 -5
- sky/clouds/gcp.py +14 -7
- sky/clouds/hyperbolic.py +12 -5
- sky/clouds/ibm.py +12 -5
- sky/clouds/kubernetes.py +80 -45
- sky/clouds/lambda_cloud.py +12 -5
- sky/clouds/nebius.py +23 -9
- sky/clouds/oci.py +19 -12
- sky/clouds/paperspace.py +4 -1
- sky/clouds/primeintellect.py +317 -0
- sky/clouds/runpod.py +85 -24
- sky/clouds/scp.py +12 -8
- sky/clouds/seeweb.py +477 -0
- sky/clouds/shadeform.py +400 -0
- sky/clouds/slurm.py +578 -0
- sky/clouds/ssh.py +6 -3
- sky/clouds/utils/scp_utils.py +61 -50
- sky/clouds/vast.py +43 -27
- sky/clouds/vsphere.py +14 -16
- sky/core.py +296 -195
- sky/dashboard/out/404.html +1 -1
- sky/dashboard/out/_next/static/KYAhEFa3FTfq4JyKVgo-s/_buildManifest.js +1 -0
- sky/dashboard/out/_next/static/chunks/1141-9c810f01ff4f398a.js +11 -0
- sky/dashboard/out/_next/static/chunks/1871-7e202677c42f43fe.js +6 -0
- sky/dashboard/out/_next/static/chunks/2260-7703229c33c5ebd5.js +1 -0
- sky/dashboard/out/_next/static/chunks/2369.fc20f0c2c8ed9fe7.js +15 -0
- sky/dashboard/out/_next/static/chunks/2755.edd818326d489a1d.js +26 -0
- sky/dashboard/out/_next/static/chunks/3294.ddda8c6c6f9f24dc.js +1 -0
- sky/dashboard/out/_next/static/chunks/3785.7e245f318f9d1121.js +1 -0
- sky/dashboard/out/_next/static/chunks/{6601-06114c982db410b6.js → 3800-b589397dc09c5b4e.js} +1 -1
- sky/dashboard/out/_next/static/chunks/3850-fd5696f3bbbaddae.js +1 -0
- sky/dashboard/out/_next/static/chunks/4725.172ede95d1b21022.js +1 -0
- sky/dashboard/out/_next/static/chunks/4937.a2baa2df5572a276.js +15 -0
- sky/dashboard/out/_next/static/chunks/6212-7bd06f60ba693125.js +13 -0
- sky/dashboard/out/_next/static/chunks/6856-da20c5fd999f319c.js +1 -0
- sky/dashboard/out/_next/static/chunks/6990-09cbf02d3cd518c3.js +1 -0
- sky/dashboard/out/_next/static/chunks/7359-c8d04e06886000b3.js +30 -0
- sky/dashboard/out/_next/static/chunks/7615-019513abc55b3b47.js +1 -0
- sky/dashboard/out/_next/static/chunks/8640.5b9475a2d18c5416.js +16 -0
- sky/dashboard/out/_next/static/chunks/8969-452f9d5cbdd2dc73.js +1 -0
- sky/dashboard/out/_next/static/chunks/9025.fa408f3242e9028d.js +6 -0
- sky/dashboard/out/_next/static/chunks/9353-8369df1cf105221c.js +1 -0
- sky/dashboard/out/_next/static/chunks/9360.a536cf6b1fa42355.js +31 -0
- sky/dashboard/out/_next/static/chunks/9847.3aaca6bb33455140.js +30 -0
- sky/dashboard/out/_next/static/chunks/pages/_app-68b647e26f9d2793.js +34 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-33f525539665fdfd.js +16 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-a7565f586ef86467.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters-9e5d47818b9bdadd.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/{config-dfb9bf07b13045f4.js → config-718cdc365de82689.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/infra/{[context]-6563820e094f68ca.js → [context]-12c559ec4d81fdbd.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/{infra-aabba60d57826e0f.js → infra-d187cd0413d72475.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-895847b6cf200b04.js +16 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs/pools/[pool]-8d0f4655400b4eb9.js +21 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs-e5a98f17f8513a96.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/plugins/[...slug]-4f46050ca065d8f8.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/users-2f7646eb77785a2c.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/volumes-ef19d49c6d0e8500.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/workspaces/{[name]-af76bb06dbb3954f.js → [name]-96e0f298308da7e2.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/{workspaces-7598c33a746cdc91.js → workspaces-cb4da3abe08ebf19.js} +1 -1
- sky/dashboard/out/_next/static/chunks/webpack-fba3de387ff6bb08.js +1 -0
- sky/dashboard/out/_next/static/css/c5a4cfd2600fc715.css +3 -0
- sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
- sky/dashboard/out/clusters/[cluster].html +1 -1
- sky/dashboard/out/clusters.html +1 -1
- sky/dashboard/out/config.html +1 -1
- sky/dashboard/out/index.html +1 -1
- sky/dashboard/out/infra/[context].html +1 -1
- sky/dashboard/out/infra.html +1 -1
- sky/dashboard/out/jobs/[job].html +1 -1
- sky/dashboard/out/jobs/pools/[pool].html +1 -1
- sky/dashboard/out/jobs.html +1 -1
- sky/dashboard/out/plugins/[...slug].html +1 -0
- sky/dashboard/out/users.html +1 -1
- sky/dashboard/out/volumes.html +1 -1
- sky/dashboard/out/workspace/new.html +1 -1
- sky/dashboard/out/workspaces/[name].html +1 -1
- sky/dashboard/out/workspaces.html +1 -1
- sky/data/data_utils.py +92 -1
- sky/data/mounting_utils.py +177 -30
- sky/data/storage.py +200 -19
- sky/data/storage_utils.py +10 -45
- sky/exceptions.py +18 -7
- sky/execution.py +74 -31
- sky/global_user_state.py +605 -191
- sky/jobs/__init__.py +2 -0
- sky/jobs/client/sdk.py +101 -4
- sky/jobs/client/sdk_async.py +31 -5
- sky/jobs/constants.py +15 -8
- sky/jobs/controller.py +726 -284
- sky/jobs/file_content_utils.py +128 -0
- sky/jobs/log_gc.py +193 -0
- sky/jobs/recovery_strategy.py +250 -100
- sky/jobs/scheduler.py +271 -173
- sky/jobs/server/core.py +367 -114
- sky/jobs/server/server.py +81 -35
- sky/jobs/server/utils.py +89 -35
- sky/jobs/state.py +1498 -620
- sky/jobs/utils.py +771 -306
- sky/logs/agent.py +40 -5
- sky/logs/aws.py +9 -19
- sky/metrics/utils.py +282 -39
- sky/models.py +2 -0
- sky/optimizer.py +7 -6
- sky/provision/__init__.py +38 -1
- sky/provision/aws/config.py +34 -13
- sky/provision/aws/instance.py +5 -2
- sky/provision/azure/instance.py +5 -3
- sky/provision/common.py +22 -0
- sky/provision/cudo/instance.py +4 -3
- sky/provision/do/instance.py +4 -3
- sky/provision/docker_utils.py +112 -28
- sky/provision/fluidstack/instance.py +6 -5
- sky/provision/gcp/config.py +6 -1
- sky/provision/gcp/instance.py +4 -2
- sky/provision/hyperbolic/instance.py +4 -2
- sky/provision/instance_setup.py +66 -20
- sky/provision/kubernetes/__init__.py +2 -0
- sky/provision/kubernetes/config.py +7 -44
- sky/provision/kubernetes/constants.py +0 -1
- sky/provision/kubernetes/instance.py +609 -213
- sky/provision/kubernetes/manifests/fusermount-server-daemonset.yaml +1 -2
- sky/provision/kubernetes/network.py +12 -8
- sky/provision/kubernetes/network_utils.py +8 -25
- sky/provision/kubernetes/utils.py +422 -422
- sky/provision/kubernetes/volume.py +150 -18
- sky/provision/lambda_cloud/instance.py +16 -13
- sky/provision/nebius/instance.py +6 -2
- sky/provision/nebius/utils.py +103 -86
- sky/provision/oci/instance.py +4 -2
- sky/provision/paperspace/instance.py +4 -3
- sky/provision/primeintellect/__init__.py +10 -0
- sky/provision/primeintellect/config.py +11 -0
- sky/provision/primeintellect/instance.py +454 -0
- sky/provision/primeintellect/utils.py +398 -0
- sky/provision/provisioner.py +45 -15
- sky/provision/runpod/__init__.py +2 -0
- sky/provision/runpod/instance.py +4 -3
- sky/provision/runpod/volume.py +69 -13
- sky/provision/scp/instance.py +307 -130
- sky/provision/seeweb/__init__.py +11 -0
- sky/provision/seeweb/config.py +13 -0
- sky/provision/seeweb/instance.py +812 -0
- sky/provision/shadeform/__init__.py +11 -0
- sky/provision/shadeform/config.py +12 -0
- sky/provision/shadeform/instance.py +351 -0
- sky/provision/shadeform/shadeform_utils.py +83 -0
- sky/provision/slurm/__init__.py +12 -0
- sky/provision/slurm/config.py +13 -0
- sky/provision/slurm/instance.py +572 -0
- sky/provision/slurm/utils.py +583 -0
- sky/provision/vast/instance.py +9 -4
- sky/provision/vast/utils.py +10 -6
- sky/provision/volume.py +164 -0
- sky/provision/vsphere/common/ssl_helper.py +1 -1
- sky/provision/vsphere/common/vapiconnect.py +2 -1
- sky/provision/vsphere/common/vim_utils.py +3 -2
- sky/provision/vsphere/instance.py +8 -6
- sky/provision/vsphere/vsphere_utils.py +8 -1
- sky/resources.py +11 -3
- sky/schemas/api/responses.py +107 -6
- sky/schemas/db/global_user_state/008_skylet_ssh_tunnel_metadata.py +34 -0
- sky/schemas/db/global_user_state/009_last_activity_and_launched_at.py +89 -0
- sky/schemas/db/global_user_state/010_save_ssh_key.py +66 -0
- sky/schemas/db/global_user_state/011_is_ephemeral.py +34 -0
- sky/schemas/db/kv_cache/001_initial_schema.py +29 -0
- sky/schemas/db/serve_state/002_yaml_content.py +34 -0
- sky/schemas/db/skypilot_config/001_initial_schema.py +30 -0
- sky/schemas/db/spot_jobs/002_cluster_pool.py +3 -3
- sky/schemas/db/spot_jobs/004_job_file_contents.py +42 -0
- sky/schemas/db/spot_jobs/005_logs_gc.py +38 -0
- sky/schemas/db/spot_jobs/006_controller_pid_started_at.py +34 -0
- sky/schemas/db/spot_jobs/007_config_file_content.py +34 -0
- sky/schemas/generated/jobsv1_pb2.py +86 -0
- sky/schemas/generated/jobsv1_pb2.pyi +254 -0
- sky/schemas/generated/jobsv1_pb2_grpc.py +542 -0
- sky/schemas/generated/managed_jobsv1_pb2.py +76 -0
- sky/schemas/generated/managed_jobsv1_pb2.pyi +278 -0
- sky/schemas/generated/managed_jobsv1_pb2_grpc.py +278 -0
- sky/schemas/generated/servev1_pb2.py +58 -0
- sky/schemas/generated/servev1_pb2.pyi +115 -0
- sky/schemas/generated/servev1_pb2_grpc.py +322 -0
- sky/serve/autoscalers.py +2 -0
- sky/serve/client/impl.py +55 -21
- sky/serve/constants.py +4 -3
- sky/serve/controller.py +17 -11
- sky/serve/load_balancing_policies.py +1 -1
- sky/serve/replica_managers.py +219 -142
- sky/serve/serve_rpc_utils.py +179 -0
- sky/serve/serve_state.py +63 -54
- sky/serve/serve_utils.py +145 -109
- sky/serve/server/core.py +46 -25
- sky/serve/server/impl.py +311 -162
- sky/serve/server/server.py +21 -19
- sky/serve/service.py +84 -68
- sky/serve/service_spec.py +45 -7
- sky/server/auth/loopback.py +38 -0
- sky/server/auth/oauth2_proxy.py +12 -7
- sky/server/common.py +47 -24
- sky/server/config.py +62 -28
- sky/server/constants.py +9 -1
- sky/server/daemons.py +109 -38
- sky/server/metrics.py +76 -96
- sky/server/middleware_utils.py +166 -0
- sky/server/plugins.py +222 -0
- sky/server/requests/executor.py +384 -145
- sky/server/requests/payloads.py +83 -19
- sky/server/requests/preconditions.py +15 -13
- sky/server/requests/request_names.py +123 -0
- sky/server/requests/requests.py +511 -157
- sky/server/requests/serializers/decoders.py +48 -17
- sky/server/requests/serializers/encoders.py +102 -20
- sky/server/requests/serializers/return_value_serializers.py +60 -0
- sky/server/requests/threads.py +117 -0
- sky/server/rest.py +116 -24
- sky/server/server.py +497 -179
- sky/server/server_utils.py +30 -0
- sky/server/stream_utils.py +219 -45
- sky/server/uvicorn.py +30 -19
- sky/setup_files/MANIFEST.in +6 -1
- sky/setup_files/alembic.ini +8 -0
- sky/setup_files/dependencies.py +64 -19
- sky/setup_files/setup.py +44 -44
- sky/sky_logging.py +13 -5
- sky/skylet/attempt_skylet.py +116 -24
- sky/skylet/configs.py +3 -1
- sky/skylet/constants.py +139 -29
- sky/skylet/events.py +74 -14
- sky/skylet/executor/__init__.py +1 -0
- sky/skylet/executor/slurm.py +189 -0
- sky/skylet/job_lib.py +143 -105
- sky/skylet/log_lib.py +252 -8
- sky/skylet/log_lib.pyi +47 -7
- sky/skylet/providers/ibm/node_provider.py +12 -8
- sky/skylet/providers/ibm/vpc_provider.py +13 -12
- sky/skylet/runtime_utils.py +21 -0
- sky/skylet/services.py +524 -0
- sky/skylet/skylet.py +27 -2
- sky/skylet/subprocess_daemon.py +104 -28
- sky/skypilot_config.py +99 -79
- sky/ssh_node_pools/constants.py +12 -0
- sky/ssh_node_pools/core.py +40 -3
- sky/ssh_node_pools/deploy/__init__.py +4 -0
- sky/ssh_node_pools/deploy/deploy.py +952 -0
- sky/ssh_node_pools/deploy/tunnel_utils.py +199 -0
- sky/ssh_node_pools/deploy/utils.py +173 -0
- sky/ssh_node_pools/server.py +20 -21
- sky/{utils/kubernetes/ssh_utils.py → ssh_node_pools/utils.py} +9 -6
- sky/task.py +221 -104
- sky/templates/aws-ray.yml.j2 +1 -0
- sky/templates/azure-ray.yml.j2 +1 -0
- sky/templates/cudo-ray.yml.j2 +1 -0
- sky/templates/do-ray.yml.j2 +1 -0
- sky/templates/fluidstack-ray.yml.j2 +1 -0
- sky/templates/gcp-ray.yml.j2 +1 -0
- sky/templates/hyperbolic-ray.yml.j2 +1 -0
- sky/templates/ibm-ray.yml.j2 +2 -1
- sky/templates/jobs-controller.yaml.j2 +3 -0
- sky/templates/kubernetes-ray.yml.j2 +204 -55
- sky/templates/lambda-ray.yml.j2 +1 -0
- sky/templates/nebius-ray.yml.j2 +3 -0
- sky/templates/oci-ray.yml.j2 +1 -0
- sky/templates/paperspace-ray.yml.j2 +1 -0
- sky/templates/primeintellect-ray.yml.j2 +72 -0
- sky/templates/runpod-ray.yml.j2 +1 -0
- sky/templates/scp-ray.yml.j2 +1 -0
- sky/templates/seeweb-ray.yml.j2 +171 -0
- sky/templates/shadeform-ray.yml.j2 +73 -0
- sky/templates/slurm-ray.yml.j2 +85 -0
- sky/templates/vast-ray.yml.j2 +2 -0
- sky/templates/vsphere-ray.yml.j2 +1 -0
- sky/templates/websocket_proxy.py +188 -43
- sky/usage/usage_lib.py +16 -4
- sky/users/model.conf +1 -1
- sky/users/permission.py +84 -44
- sky/users/rbac.py +31 -3
- sky/utils/accelerator_registry.py +6 -3
- sky/utils/admin_policy_utils.py +18 -5
- sky/utils/annotations.py +128 -6
- sky/utils/asyncio_utils.py +78 -0
- sky/utils/atomic.py +1 -1
- sky/utils/auth_utils.py +153 -0
- sky/utils/cli_utils/status_utils.py +12 -7
- sky/utils/cluster_utils.py +28 -6
- sky/utils/command_runner.py +283 -30
- sky/utils/command_runner.pyi +63 -7
- sky/utils/common.py +3 -1
- sky/utils/common_utils.py +55 -7
- sky/utils/config_utils.py +1 -14
- sky/utils/context.py +127 -40
- sky/utils/context_utils.py +73 -18
- sky/utils/controller_utils.py +229 -70
- sky/utils/db/db_utils.py +95 -18
- sky/utils/db/kv_cache.py +149 -0
- sky/utils/db/migration_utils.py +24 -7
- sky/utils/env_options.py +4 -0
- sky/utils/git.py +559 -1
- sky/utils/kubernetes/create_cluster.sh +15 -30
- sky/utils/kubernetes/delete_cluster.sh +10 -7
- sky/utils/kubernetes/generate_kind_config.py +6 -66
- sky/utils/kubernetes/gpu_labeler.py +13 -3
- sky/utils/kubernetes/k8s_gpu_labeler_job.yaml +2 -1
- sky/utils/kubernetes/k8s_gpu_labeler_setup.yaml +16 -16
- sky/utils/kubernetes/kubernetes_deploy_utils.py +187 -260
- sky/utils/kubernetes/rsync_helper.sh +11 -3
- sky/utils/kubernetes/ssh-tunnel.sh +7 -376
- sky/utils/kubernetes_enums.py +7 -15
- sky/utils/lock_events.py +4 -4
- sky/utils/locks.py +128 -31
- sky/utils/log_utils.py +0 -319
- sky/utils/resource_checker.py +13 -10
- sky/utils/resources_utils.py +53 -29
- sky/utils/rich_utils.py +8 -4
- sky/utils/schemas.py +138 -52
- sky/utils/subprocess_utils.py +17 -4
- sky/utils/thread_utils.py +91 -0
- sky/utils/timeline.py +2 -1
- sky/utils/ux_utils.py +35 -1
- sky/utils/volume.py +88 -4
- sky/utils/yaml_utils.py +9 -0
- sky/volumes/client/sdk.py +48 -10
- sky/volumes/server/core.py +59 -22
- sky/volumes/server/server.py +46 -17
- sky/volumes/volume.py +54 -42
- sky/workspaces/core.py +57 -21
- sky/workspaces/server.py +13 -12
- sky_templates/README.md +3 -0
- sky_templates/__init__.py +3 -0
- sky_templates/ray/__init__.py +0 -0
- sky_templates/ray/start_cluster +183 -0
- sky_templates/ray/stop_cluster +75 -0
- {skypilot_nightly-1.0.0.dev20250905.dist-info → skypilot_nightly-1.0.0.dev20251210.dist-info}/METADATA +343 -65
- skypilot_nightly-1.0.0.dev20251210.dist-info/RECORD +629 -0
- skypilot_nightly-1.0.0.dev20251210.dist-info/top_level.txt +2 -0
- sky/client/cli/git.py +0 -549
- sky/dashboard/out/_next/static/chunks/1121-408ed10b2f9fce17.js +0 -1
- sky/dashboard/out/_next/static/chunks/1141-943efc7aff0f0c06.js +0 -1
- sky/dashboard/out/_next/static/chunks/1836-37fede578e2da5f8.js +0 -40
- sky/dashboard/out/_next/static/chunks/3015-86cabed5d4669ad0.js +0 -1
- sky/dashboard/out/_next/static/chunks/3294.c80326aec9bfed40.js +0 -6
- sky/dashboard/out/_next/static/chunks/3785.4872a2f3aa489880.js +0 -1
- sky/dashboard/out/_next/static/chunks/3850-ff4a9a69d978632b.js +0 -1
- sky/dashboard/out/_next/static/chunks/4045.b30465273dc5e468.js +0 -21
- sky/dashboard/out/_next/static/chunks/4676-9da7fdbde90b5549.js +0 -10
- sky/dashboard/out/_next/static/chunks/4725.10f7a9a5d3ea8208.js +0 -1
- sky/dashboard/out/_next/static/chunks/5339.3fda4a4010ff4e06.js +0 -51
- sky/dashboard/out/_next/static/chunks/6135-4b4d5e824b7f9d3c.js +0 -1
- sky/dashboard/out/_next/static/chunks/649.b9d7f7d10c1b8c53.js +0 -45
- sky/dashboard/out/_next/static/chunks/6856-dca7962af4814e1b.js +0 -1
- sky/dashboard/out/_next/static/chunks/6990-08b2a1cae076a943.js +0 -1
- sky/dashboard/out/_next/static/chunks/7325.b4bc99ce0892dcd5.js +0 -6
- sky/dashboard/out/_next/static/chunks/754-d0da8ab45f9509e9.js +0 -18
- sky/dashboard/out/_next/static/chunks/7669.1f5d9a402bf5cc42.js +0 -36
- sky/dashboard/out/_next/static/chunks/8969-0be3036bf86f8256.js +0 -1
- sky/dashboard/out/_next/static/chunks/9025.c12318fb6a1a9093.js +0 -6
- sky/dashboard/out/_next/static/chunks/9037-fa1737818d0a0969.js +0 -6
- sky/dashboard/out/_next/static/chunks/pages/_app-ce361c6959bc2001.js +0 -34
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-1cbba24bd1bd35f8.js +0 -16
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-0b4b35dc1dfe046c.js +0 -16
- sky/dashboard/out/_next/static/chunks/pages/clusters-469814d711d63b1b.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-dd64309c3fe67ed2.js +0 -11
- sky/dashboard/out/_next/static/chunks/pages/jobs/pools/[pool]-07349868f7905d37.js +0 -16
- sky/dashboard/out/_next/static/chunks/pages/jobs-1f70d9faa564804f.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/users-018bf31cda52e11b.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/volumes-739726d6b823f532.js +0 -1
- sky/dashboard/out/_next/static/chunks/webpack-4fe903277b57b523.js +0 -1
- sky/dashboard/out/_next/static/css/4614e06482d7309e.css +0 -3
- sky/dashboard/out/_next/static/mS-4qZPSkRuA1u-g2wQhg/_buildManifest.js +0 -1
- sky/templates/kubernetes-ssh-jump.yml.j2 +0 -94
- sky/utils/kubernetes/cleanup-tunnel.sh +0 -62
- sky/utils/kubernetes/deploy_remote_cluster.py +0 -1299
- sky/utils/kubernetes/ssh_jump_lifecycle_manager.py +0 -191
- skypilot_nightly-1.0.0.dev20250905.dist-info/RECORD +0 -547
- skypilot_nightly-1.0.0.dev20250905.dist-info/top_level.txt +0 -1
- /sky/dashboard/out/_next/static/{mS-4qZPSkRuA1u-g2wQhg → KYAhEFa3FTfq4JyKVgo-s}/_ssgManifest.js +0 -0
- {skypilot_nightly-1.0.0.dev20250905.dist-info → skypilot_nightly-1.0.0.dev20251210.dist-info}/WHEEL +0 -0
- {skypilot_nightly-1.0.0.dev20250905.dist-info → skypilot_nightly-1.0.0.dev20251210.dist-info}/entry_points.txt +0 -0
- {skypilot_nightly-1.0.0.dev20250905.dist-info → skypilot_nightly-1.0.0.dev20251210.dist-info}/licenses/LICENSE +0 -0
sky/provision/volume.py
ADDED
@@ -0,0 +1,164 @@
+"""Volume functions for provisioning and deleting ephemeral volumes."""
+
+import copy
+from typing import Any, Dict, Optional
+
+from sky import clouds
+from sky import global_user_state
+from sky import models
+from sky import sky_logging
+from sky.provision import common as provision_common
+from sky.provision import constants as provision_constants
+from sky.provision.kubernetes import utils as kubernetes_utils
+from sky.utils import volume as volume_utils
+from sky.volumes import volume as volume_lib
+from sky.volumes.server import core as volume_server_core
+
+logger = sky_logging.init_logger(__name__)
+
+
+def _resolve_volume_type(cloud: clouds.Cloud,
+                         volume_type: Optional[str]) -> str:
+    if not volume_type:
+        volume_types = None
+        for cloud_key, vol_types in volume_lib.CLOUD_TO_VOLUME_TYPE.items():
+            if cloud.is_same_cloud(cloud_key):
+                volume_types = vol_types
+                break
+        if volume_types is None:
+            raise ValueError(f'No default volume type found for cloud {cloud}')
+        if len(volume_types) != 1:
+            raise ValueError(
+                f'Found multiple volume types for cloud {cloud}: {volume_types}'
+            )
+        return volume_types[0].value
+    supported_volume_types = [
+        volume_type.value for volume_type in volume_utils.VolumeType
+    ]
+    volume_type = volume_type.lower()
+    if volume_type not in supported_volume_types:
+        raise ValueError(
+            f'Invalid volume type: {volume_type} for cloud {cloud}')
+    return volume_type
+
+
+def _resolve_pvc_volume_config(cloud: clouds.Cloud,
+                               config: provision_common.ProvisionConfig,
+                               volume_config: Dict[str, Any]) -> Dict[str, Any]:
+    provider_config = config.provider_config
+    if not cloud.is_same_cloud(clouds.Kubernetes()):
+        raise ValueError(
+            f'PVC volume type is only supported on Kubernetes not on {cloud}')
+    supported_access_modes = [
+        access_mode.value for access_mode in volume_utils.VolumeAccessMode
+    ]
+    access_mode = volume_config.get('access_mode')
+    if access_mode is None:
+        access_mode = volume_utils.VolumeAccessMode.READ_WRITE_ONCE.value
+        volume_config['access_mode'] = access_mode
+    elif access_mode not in supported_access_modes:
+        raise ValueError(f'Invalid access mode: {access_mode} for PVC')
+    if (access_mode == volume_utils.VolumeAccessMode.READ_WRITE_ONCE.value and
+            config.count > 1):
+        raise ValueError(
+            'Access mode ReadWriteOnce is not supported for multi-node'
+            ' clusters.')
+    namespace = kubernetes_utils.get_namespace_from_config(provider_config)
+    volume_config['namespace'] = namespace
+    return volume_config
+
+
+def _create_ephemeral_volume(
+        cloud: clouds.Cloud, region: str, cluster_name_on_cloud: str,
+        config: provision_common.ProvisionConfig,
+        volume_mount: volume_utils.VolumeMount
+) -> Optional[volume_utils.VolumeInfo]:
+    provider_name = repr(cloud)
+    path = volume_mount.path
+    volume_config = volume_mount.volume_config
+    volume_type = _resolve_volume_type(cloud, volume_config.type)
+    labels = volume_config.labels
+    if volume_type == volume_utils.VolumeType.PVC.value:
+        internal_volume_config = _resolve_pvc_volume_config(
+            cloud, config, volume_config.config)
+        if labels:
+            for key, value in labels.items():
+                valid, err_msg = cloud.is_label_valid(key, value)
+                if not valid:
+                    raise ValueError(f'{err_msg}')
+        else:
+            labels = {}
+        labels.update({
+            provision_constants.TAG_SKYPILOT_CLUSTER_NAME: cluster_name_on_cloud
+        })
+    else:
+        logger.warning(f'Skipping unsupported ephemeral volume type: '
+                       f'{volume_type} for cloud {cloud}.')
+        return None
+    volume_name = volume_config.name
+    volume_server_core.volume_apply(
+        name=volume_name,
+        volume_type=volume_type,
+        cloud=provider_name,
+        region=region,
+        zone=None,
+        size=volume_config.size,
+        config=internal_volume_config,
+        labels=labels,
+        is_ephemeral=True,
+    )
+    volume = global_user_state.get_volume_by_name(volume_name)
+    if volume is None:
+        raise ValueError(f'Failed to get record for volume: {volume_name}')
+    assert 'handle' in volume, 'Volume handle is None.'
+    volume_config: models.VolumeConfig = volume['handle']
+    volume_info = volume_utils.VolumeInfo(
+        name=volume_name,
+        path=path,
+        volume_name_on_cloud=volume_config.name_on_cloud,
+        volume_id_on_cloud=volume_config.id_on_cloud,
+    )
+    return volume_info
+
+
+def provision_ephemeral_volumes(
+    cloud: clouds.Cloud,
+    region: str,
+    cluster_name_on_cloud: str,
+    config: provision_common.ProvisionConfig,
+) -> None:
+    """Provision ephemeral volumes for a cluster."""
+    provider_config = config.provider_config
+    ephemeral_volume_mounts = provider_config.get('ephemeral_volume_specs')
+    if not ephemeral_volume_mounts:
+        return
+    volume_infos = []
+    try:
+        for ephemeral_volume_mount in ephemeral_volume_mounts:
+            mount_copy = copy.deepcopy(ephemeral_volume_mount)
+            volume_mount = volume_utils.VolumeMount.from_yaml_config(mount_copy)
+            volume_info = _create_ephemeral_volume(cloud, region,
+                                                   cluster_name_on_cloud,
+                                                   config, volume_mount)
+            if volume_info is None:
+                continue
+            volume_infos.append(volume_info)
+        provider_config['ephemeral_volume_infos'] = volume_infos
+    except Exception as e:  # pylint: disable=broad-exception-caught
+        logger.error(f'Failed to provision ephemeral volumes: {e}')
+        raise e
+
+
+def delete_ephemeral_volumes(provider_config: Dict[str, Any],) -> None:
+    """Delete ephemeral volumes for a cluster."""
+    ephemeral_volume_mounts = provider_config.get('ephemeral_volume_specs')
+    if not ephemeral_volume_mounts:
+        return
+    ephemeral_volume_names = []
+    for ephemeral_volume_mount in ephemeral_volume_mounts:
+        mount_copy = copy.deepcopy(ephemeral_volume_mount)
+        volume_mount = volume_utils.VolumeMount.from_yaml_config(mount_copy)
+        volume_name = volume_mount.volume_config.name
+        ephemeral_volume_names.append(volume_name)
+    volume_server_core.volume_delete(names=ephemeral_volume_names,
+                                     ignore_not_found=True)
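Note on the new module: _resolve_volume_type above enforces that a volume spec without an explicit type resolves to the cloud's single default volume type, while an explicit type must name a known VolumeType. A self-contained sketch of that resolution rule, with toy stand-ins for volume_lib.CLOUD_TO_VOLUME_TYPE and volume_utils.VolumeType (the 'k8s-pvc' value and the 'kubernetes' key are illustrative assumptions, not taken from the diff):

import enum
from typing import List, Optional


class VolumeType(enum.Enum):
    # Toy stand-in for volume_utils.VolumeType; the value is illustrative.
    PVC = 'k8s-pvc'


# Toy stand-in for volume_lib.CLOUD_TO_VOLUME_TYPE.
CLOUD_TO_VOLUME_TYPE = {'kubernetes': [VolumeType.PVC]}


def resolve_volume_type(cloud: str, volume_type: Optional[str]) -> str:
    # Mirrors _resolve_volume_type: with no explicit type, the cloud must
    # map to exactly one default volume type.
    if not volume_type:
        volume_types: Optional[List[VolumeType]] = CLOUD_TO_VOLUME_TYPE.get(cloud)
        if volume_types is None:
            raise ValueError(f'No default volume type found for cloud {cloud}')
        if len(volume_types) != 1:
            raise ValueError(
                f'Found multiple volume types for cloud {cloud}: {volume_types}')
        return volume_types[0].value
    # An explicit type must be one of the known volume types.
    volume_type = volume_type.lower()
    if volume_type not in [t.value for t in VolumeType]:
        raise ValueError(f'Invalid volume type: {volume_type} for cloud {cloud}')
    return volume_type


assert resolve_volume_type('kubernetes', None) == 'k8s-pvc'
assert resolve_volume_type('kubernetes', 'K8S-PVC') == 'k8s-pvc'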
sky/provision/vsphere/common/vapiconnect.py
CHANGED

@@ -89,5 +89,6 @@ def create_unverified_session(session, suppress_warning=True):
     session.verify = False
     if suppress_warning:
         # Suppress unverified https request warnings
-        requests.packages.urllib3.disable_warnings(InsecureRequestWarning)
+        requests.packages.urllib3.disable_warnings(  # type: ignore
+            InsecureRequestWarning)
     return session
sky/provision/vsphere/common/vim_utils.py
CHANGED

@@ -4,7 +4,7 @@
 import re
 import subprocess
 import time
-from typing import List
+from typing import List, Union

 from sky import sky_logging
 from sky.adaptors import vsphere as vsphere_adaptor
@@ -15,7 +15,8 @@ DISPLAY_CONTROLLER_CLASS_ID_PREFIXES = ['03']
 VMWARE_VIRTUAL_DISPLAY_CONTROLLER_IDS = ['0000:00:0f.0']


-def get_objs_by_names(content, vimtype: type, names: List[str]):
+def get_objs_by_names(content, vimtype: Union[type, List[type]],
+                      names: List[str]):
     """ Get the vsphere managed object associated with a given text name
     """
     # Create a set for the names for faster lookups
sky/provision/vsphere/instance.py
CHANGED

@@ -30,9 +30,10 @@ HEAD_NODE_VALUE = '1'
 WORKER_NODE_VALUE = '0'


-def run_instances(region: str, cluster_name: str,
+def run_instances(region: str, cluster_name: str, cluster_name_on_cloud: str,
                   config: common.ProvisionConfig) -> common.ProvisionRecord:
     """See sky/provision/__init__.py"""
+    del cluster_name  # unused
     logger.info('New provision of Vsphere: run_instances().')

     resumed_instance_ids: List[str] = []
@@ -40,7 +41,7 @@ def run_instances(region: str, cluster_name: str,
     vc_object = _get_vc_object(region)
     vc_object.connect()

-    exist_instances = _get_filtered_instance(vc_object,
+    exist_instances = _get_filtered_instance(vc_object, cluster_name_on_cloud,
                                              config.provider_config)
     head_instance_id = _get_head_instance_id(exist_instances)

@@ -89,8 +90,8 @@ def run_instances(region: str, cluster_name: str,
                                                  config, region, vc_object)
     # TODO: update logic for multi-node creation
     for _ in range(to_start_num):
-        created_instance_uuid = _create_instances(
-            region, vc_object,
+        created_instance_uuid = _create_instances(cluster_name_on_cloud,
+                                                  config, region, vc_object,
                                                   vsphere_cluster_name)
         created_instance_ids.append(created_instance_uuid)
         if head_instance_id is None:
@@ -104,7 +105,7 @@ def run_instances(region: str, cluster_name: str,
         provider_name='vsphere',
         region=region,
         zone=vsphere_cluster_name,
-        cluster_name=cluster_name,
+        cluster_name=cluster_name_on_cloud,
         head_instance_id=head_instance_id,
         resumed_instance_ids=resumed_instance_ids,
         created_instance_ids=created_instance_ids,
@@ -397,9 +398,10 @@ def query_instances(
     cluster_name_on_cloud: str,
     provider_config: Optional[Dict[str, Any]] = None,
     non_terminated_only: bool = True,
+    retry_if_missing: bool = False,
 ) -> Dict[str, Tuple[Optional['status_lib.ClusterStatus'], Optional[str]]]:
     """See sky/provision/__init__.py"""
-    del cluster_name  # unused
+    del cluster_name, retry_if_missing  # unused
     logger.info('New provision of Vsphere: query_instances().')
     assert provider_config is not None, cluster_name_on_cloud
     region = provider_config['region']
sky/provision/vsphere/vsphere_utils.py
CHANGED

@@ -262,6 +262,10 @@ class VsphereClient:

     def get_pbm_manager(self):
         self.connect()
+        if self.servicemanager is None:
+            raise VsphereError('Failed to connect to vSphere.')
+        if self.servicemanager.si is None:
+            raise VsphereError('Failed to connect to vSphere.')
         pbm_si, pm_content = self._create_pbm_connection(  # pylint: disable=unused-variable
             self.servicemanager.si._stub)  # pylint: disable=protected-access
         pm = pm_content.profileManager
@@ -360,6 +364,8 @@ def initialize_vsphere_data():
         vcenter_name = vcenter['name']
         vc_object.connect()
        vc_servicemanager = vc_object.servicemanager
+        if vc_servicemanager is None or vc_servicemanager.content is None:
+            raise VsphereError('Failed to connect to vSphere.')
         vc_content = vc_servicemanager.content

         cluster_name_dicts = vc_object.clusters
@@ -370,4 +376,5 @@
         initialize_images_csv(images_csv_path, vc_object, vcenter_name)
         initialize_instance_image_mapping_csv(vms_csv_path, images_csv_path,
                                               instance_image_mapping_csv_path)
-        vc_object.servicemanager.disconnect()
+        if vc_object.servicemanager is not None:
+            vc_object.servicemanager.disconnect()
sky/resources.py
CHANGED
@@ -1104,7 +1104,7 @@ class Resources:
         regions = self.cloud.regions_with_offering(self._instance_type,
                                                    self.accelerators,
                                                    self._use_spot, self._region,
-                                                   self._zone)
+                                                   self._zone, self)
         if self._image_id is not None and None not in self._image_id:
             regions = [r for r in regions if r.name in self._image_id]

@@ -1331,10 +1331,18 @@ class Resources:
                 clouds.CloudImplementationFeatures.IMAGE_ID
             })
         except exceptions.NotSupportedError as e:
+            # Provide a more helpful error message for Lambda cloud
+            if self.cloud.is_same_cloud(clouds.Lambda()):
+                with ux_utils.print_exception_no_traceback():
+                    raise ValueError(
+                        'Lambda cloud only supports Docker images. '
+                        'Please prefix your image with "docker:" '
+                        '(e.g., image_id: docker:your-image-name).') from e
             with ux_utils.print_exception_no_traceback():
                 raise ValueError(
                     'image_id is only supported for AWS/GCP/Azure/IBM/OCI/'
-                    'Kubernetes,
+                    'Kubernetes. For Lambda cloud, use "docker:" prefix for '
+                    'Docker images.') from e

         if self._region is not None:
             # If the image_id has None as key (region-agnostic),
@@ -1516,7 +1524,7 @@ class Resources:
         if self.accelerators is not None:
             hourly_cost += self.cloud.accelerators_to_hourly_cost(
                 self.accelerators, self.use_spot, self._region, self._zone)
-        return hourly_cost * hours
+        return float(hourly_cost * hours)

     def get_accelerators_str(self) -> str:
         accelerators = self.accelerators
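The Lambda-specific branch above turns a generic NotSupportedError into actionable guidance. Following that guidance, a task would declare its image roughly as below; this is a hypothetical snippet that assumes the current sky.Resources constructor accepts infra= and image_id=, and the image name is illustrative:

import sky

# Per the new error message, images on Lambda cloud must carry the
# 'docker:' prefix.
task = sky.Task(run='nvidia-smi')
task.set_resources(
    sky.Resources(infra='lambda', image_id='docker:ubuntu:22.04'))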
sky/schemas/api/responses.py
CHANGED
@@ -5,8 +5,11 @@ from typing import Any, Dict, List, Optional

 import pydantic

+from sky import data
 from sky import models
+from sky.jobs import state as job_state
 from sky.server import common
+from sky.skylet import job_lib
 from sky.utils import status_lib


@@ -74,8 +77,13 @@ class APIHealthResponse(ResponseBaseModel):
     version: str = ''
     version_on_disk: str = ''
     commit: str = ''
+    # Whether basic auth on api server is enabled
     basic_auth_enabled: bool = False
     user: Optional[models.User] = None
+    # Whether service account token is enabled
+    service_account_token_enabled: bool = False
+    # Whether basic auth on ingress is enabled
+    ingress_basic_auth_enabled: bool = False


 class StatusResponse(ResponseBaseModel):
@@ -86,8 +94,8 @@ class StatusResponse(ResponseBaseModel):
     # backends.ResourceHandle, so we use Any here.
     # This is an internally facing field anyway, so it's less
     # of a problem that it's not typed.
-    handle: Any
-    last_use: str
+    handle: Optional[Any] = None
+    last_use: Optional[str] = None
     status: status_lib.ClusterStatus
     autostop: int
     to_down: bool
@@ -95,11 +103,8 @@ class StatusResponse(ResponseBaseModel):
     # metadata is a JSON, so we use Any here.
     metadata: Optional[Dict[str, Any]] = None
     cluster_hash: str
-    # pydantic cannot generate the pydantic-core schema for
-    # storage_mounts_metadata, so we use Any here.
-    storage_mounts_metadata: Optional[Dict[str, Any]] = None
     cluster_ever_up: bool
-    status_updated_at: int
+    status_updated_at: Optional[int] = None
     user_hash: str
     user_name: str
     config_hash: Optional[str] = None
@@ -118,9 +123,105 @@ class StatusResponse(ResponseBaseModel):
     cpus: Optional[str] = None
     memory: Optional[str] = None
     accelerators: Optional[str] = None
+    cluster_name_on_cloud: Optional[str] = None
+
+
+class ClusterJobRecord(ResponseBaseModel):
+    """Response for the cluster job queue endpoint."""
+    job_id: int
+    job_name: str
+    username: str
+    user_hash: str
+    submitted_at: float
+    # None if the job has not started yet.
+    start_at: Optional[float] = None
+    # None if the job has not ended yet.
+    end_at: Optional[float] = None
+    resources: str
+    status: job_lib.JobStatus
+    log_path: str
+    metadata: Dict[str, Any] = {}


 class UploadStatus(enum.Enum):
     """Status of the upload."""
     UPLOADING = 'uploading'
     COMPLETED = 'completed'
+
+
+class StorageRecord(ResponseBaseModel):
+    """Response for the storage list endpoint."""
+    name: str
+    launched_at: int
+    store: List[data.StoreType]
+    last_use: str
+    status: status_lib.StorageStatus
+
+
+# TODO (syang) figure out which fields are always present
+# and therefore can be non-optional.
+class ManagedJobRecord(ResponseBaseModel):
+    """A single managed job record."""
+    # The job_id in the spot table
+    task_job_id: Optional[int] = pydantic.Field(None, alias='_job_id')
+    job_id: Optional[int] = None
+    task_id: Optional[int] = None
+    job_name: Optional[str] = None
+    task_name: Optional[str] = None
+    job_duration: Optional[float] = None
+    workspace: Optional[str] = None
+    status: Optional[job_state.ManagedJobStatus] = None
+    schedule_state: Optional[str] = None
+    resources: Optional[str] = None
+    cluster_resources: Optional[str] = None
+    cluster_resources_full: Optional[str] = None
+    cloud: Optional[str] = None
+    region: Optional[str] = None
+    zone: Optional[str] = None
+    infra: Optional[str] = None
+    recovery_count: Optional[int] = None
+    details: Optional[str] = None
+    failure_reason: Optional[str] = None
+    user_name: Optional[str] = None
+    user_hash: Optional[str] = None
+    submitted_at: Optional[float] = None
+    start_at: Optional[float] = None
+    end_at: Optional[float] = None
+    user_yaml: Optional[str] = None
+    entrypoint: Optional[str] = None
+    metadata: Optional[Dict[str, Any]] = None
+    controller_pid: Optional[int] = None
+    controller_pid_started_at: Optional[float] = None
+    dag_yaml_path: Optional[str] = None
+    env_file_path: Optional[str] = None
+    last_recovered_at: Optional[float] = None
+    run_timestamp: Optional[str] = None
+    priority: Optional[int] = None
+    original_user_yaml_path: Optional[str] = None
+    pool: Optional[str] = None
+    pool_hash: Optional[str] = None
+    current_cluster_name: Optional[str] = None
+    job_id_on_pool_cluster: Optional[int] = None
+    accelerators: Optional[Dict[str, int]] = None
+
+
+class VolumeRecord(ResponseBaseModel):
+    """A single volume record."""
+    name: str
+    type: str
+    launched_at: int
+    cloud: str
+    region: Optional[str] = None
+    zone: Optional[str] = None
+    size: Optional[str] = None
+    config: Dict[str, Any]
+    name_on_cloud: str
+    user_hash: str
+    user_name: str
+    workspace: str
+    last_attached_at: Optional[int] = None
+    last_use: Optional[str] = None
+    status: Optional[str] = None
+    usedby_pods: List[str]
+    usedby_clusters: List[str]
+    is_ephemeral: bool = False
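One detail worth noting in ManagedJobRecord above: the internal spot-table column _job_id is surfaced as the public field task_job_id through a pydantic alias. A minimal, self-contained sketch of that pattern, using plain pydantic.BaseModel in place of ResponseBaseModel (pydantic v2 assumed):

from typing import Optional

import pydantic


class Record(pydantic.BaseModel):
    # Same alias pattern as ManagedJobRecord.task_job_id above.
    task_job_id: Optional[int] = pydantic.Field(None, alias='_job_id')
    job_id: Optional[int] = None


# Raw rows keyed by the internal column name populate the public field.
rec = Record.model_validate({'_job_id': 42, 'job_id': 7})
assert rec.task_job_id == 42
assert rec.model_dump()['task_job_id'] == 42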
sky/schemas/db/global_user_state/008_skylet_ssh_tunnel_metadata.py
ADDED

@@ -0,0 +1,34 @@
+"""Add skylet_ssh_tunnel_metadata to clusters.
+
+Revision ID: 008
+Revises: 007
+Create Date: 2025-09-09
+
+"""
+# pylint: disable=invalid-name
+from typing import Sequence, Union
+
+from alembic import op
+import sqlalchemy as sa
+
+from sky.utils.db import db_utils
+
+# revision identifiers, used by Alembic.
+revision: str = '008'
+down_revision: Union[str, Sequence[str], None] = '007'
+branch_labels: Union[str, Sequence[str], None] = None
+depends_on: Union[str, Sequence[str], None] = None
+
+
+def upgrade():
+    """Add skylet_ssh_tunnel_metadata column to clusters."""
+    with op.get_context().autocommit_block():
+        db_utils.add_column_to_table_alembic('clusters',
+                                             'skylet_ssh_tunnel_metadata',
+                                             sa.LargeBinary(),
+                                             server_default=None)
+
+
+def downgrade():
+    """No-op for backward compatibility."""
+    pass
sky/schemas/db/global_user_state/009_last_activity_and_launched_at.py
ADDED

@@ -0,0 +1,89 @@
+"""Add last_activity_time and launched_at to cluster history.
+
+Revision ID: 009
+Revises: 008
+Create Date: 2025-09-24
+
+"""
+# pylint: disable=invalid-name
+import pickle
+from typing import Sequence, Union
+
+from alembic import op
+import sqlalchemy as sa
+
+from sky.utils.db import db_utils
+
+# revision identifiers, used by Alembic.
+revision: str = '009'
+down_revision: Union[str, Sequence[str], None] = '008'
+branch_labels: Union[str, Sequence[str], None] = None
+depends_on: Union[str, Sequence[str], None] = None
+
+
+def upgrade():
+    """Add last_activity_time and launched_at columns to cluster history."""
+    with op.get_context().autocommit_block():
+        # Add the columns with indices
+        db_utils.add_column_to_table_alembic('cluster_history',
+                                             'last_activity_time',
+                                             sa.Integer(),
+                                             server_default=None,
+                                             index=True)
+
+        db_utils.add_column_to_table_alembic('cluster_history',
+                                             'launched_at',
+                                             sa.Integer(),
+                                             server_default=None,
+                                             index=True)
+
+        # Populate the columns for existing rows
+        _populate_cluster_history_columns()
+
+
+def _populate_cluster_history_columns():
+    """Populate last_activity_time and launched_at for existing rows using
+    usage_intervals logic."""
+    connection = op.get_bind()
+
+    # Get all existing rows with usage_intervals
+    result = connection.execute(
+        sa.text('SELECT cluster_hash, usage_intervals FROM cluster_history '
+                'WHERE usage_intervals IS NOT NULL'))
+
+    for row in result:
+        cluster_hash = row[0]
+        usage_intervals_blob = row[1]
+
+        try:
+            # Deserialize the usage_intervals
+            usage_intervals = pickle.loads(usage_intervals_blob)
+
+            if usage_intervals:
+                # Calculate last_activity_time: end time of last interval
+                # or start time if still running
+                last_interval = usage_intervals[-1]
+                last_activity_time = (last_interval[1] if last_interval[1]
+                                      is not None else last_interval[0])
+
+                # Calculate launched_at: start time of first interval
+                launched_at = usage_intervals[0][0]
+
+                # Update the row with both calculated values
+                connection.execute(
+                    sa.text('UPDATE cluster_history '
+                            'SET last_activity_time = :last_activity_time, '
+                            'launched_at = :launched_at '
+                            'WHERE cluster_hash = :cluster_hash'), {
+                                'last_activity_time': last_activity_time,
+                                'launched_at': launched_at,
+                                'cluster_hash': cluster_hash
+                            })
+        except (pickle.PickleError, AttributeError, IndexError):
+            # Skip rows with corrupted or invalid usage_intervals
+            continue
+
+
+def downgrade():
+    """No-op for backward compatibility."""
+    pass
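The backfill in _populate_cluster_history_columns derives both new columns from the pickled usage_intervals list of (start, end) pairs, where end is None while the cluster is still up: launched_at is the start of the first interval, and last_activity_time is the end of the last interval, falling back to its start. A small worked example of that rule (timestamps are illustrative):

# Two usage intervals; the second is still open (end is None).
usage_intervals = [(1700000000, 1700003600), (1700010000, None)]

launched_at = usage_intervals[0][0]  # start of the first interval
last_interval = usage_intervals[-1]
last_activity_time = (last_interval[1]
                      if last_interval[1] is not None else last_interval[0])

assert launched_at == 1700000000
assert last_activity_time == 1700010000  # open interval: falls back to start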
sky/schemas/db/global_user_state/010_save_ssh_key.py
ADDED

@@ -0,0 +1,66 @@
+"""Add ssh keys in filesystem to global user state.
+
+Revision ID: 010
+Revises: 009
+Create Date: 2025-10-07
+
+"""
+import glob
+# pylint: disable=invalid-name
+import os
+from typing import Sequence, Union
+
+from alembic import op
+import sqlalchemy as sa
+
+# revision identifiers, used by Alembic.
+revision: str = '010'
+down_revision: Union[str, Sequence[str], None] = '009'
+branch_labels: Union[str, Sequence[str], None] = None
+depends_on: Union[str, Sequence[str], None] = None
+
+
+def upgrade():
+    """Import ssh keys from the filesystem into the ssh_key table."""
+    connection = op.get_bind()
+
+    match_dirs = glob.glob(os.path.expanduser('~/.sky/clients/*/ssh'))
+    file_user_hashes = set()
+    for match_dir in match_dirs:
+        user_hash = match_dir.split('/')[-2]
+        file_user_hashes.add(user_hash)
+
+    # Get all existing ssh keys
+    existing_user_hashes = set()
+    result = connection.execute(sa.text('SELECT user_hash FROM ssh_key'))
+    for row in result:
+        existing_user_hashes.add(row[0])
+
+    user_hashes_to_add = file_user_hashes - existing_user_hashes
+    for user_hash in user_hashes_to_add:
+        match_dir = os.path.join(os.path.expanduser('~/.sky/clients'),
+                                 user_hash, 'ssh')
+        public_key_path = os.path.join(match_dir, 'sky-key.pub')
+        private_key_path = os.path.join(match_dir, 'sky-key')
+        try:
+            with open(public_key_path, 'r', encoding='utf-8') as f:
+                public_key = f.read().strip()
+            with open(private_key_path, 'r', encoding='utf-8') as f:
+                private_key = f.read().strip()
+        except FileNotFoundError:
+            # Skip if the key files are not found
+            continue
+        connection.execute(
+            sa.text('INSERT INTO ssh_key '
+                    '(user_hash, ssh_public_key, ssh_private_key) '
+                    'VALUES (:user_hash, :ssh_public_key, :ssh_private_key) '
+                    'ON CONFLICT DO NOTHING'), {
+                        'user_hash': user_hash,
+                        'ssh_public_key': public_key,
+                        'ssh_private_key': private_key
+                    })
+
+
+def downgrade():
+    """No-op for backward compatibility."""
+    pass
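For reference, migration 010 infers each user hash from the client key layout ~/.sky/clients/<user_hash>/ssh, taking the second-to-last path component of every matched directory (the path below is illustrative):

match_dir = '/home/alice/.sky/clients/ab12cd34/ssh'
assert match_dir.split('/')[-2] == 'ab12cd34'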