skypilot-nightly 1.0.0.dev20250905__py3-none-any.whl → 1.0.0.dev20251210__py3-none-any.whl
This diff shows the changes between two publicly released versions of this package, as they appear in the supported public registries. It is provided for informational purposes only.
- sky/__init__.py +12 -2
- sky/adaptors/aws.py +27 -22
- sky/adaptors/common.py +25 -2
- sky/adaptors/coreweave.py +278 -0
- sky/adaptors/do.py +8 -2
- sky/adaptors/gcp.py +11 -0
- sky/adaptors/ibm.py +5 -2
- sky/adaptors/kubernetes.py +64 -0
- sky/adaptors/nebius.py +3 -1
- sky/adaptors/primeintellect.py +1 -0
- sky/adaptors/seeweb.py +183 -0
- sky/adaptors/shadeform.py +89 -0
- sky/adaptors/slurm.py +478 -0
- sky/admin_policy.py +20 -0
- sky/authentication.py +157 -263
- sky/backends/__init__.py +3 -2
- sky/backends/backend.py +11 -3
- sky/backends/backend_utils.py +630 -185
- sky/backends/cloud_vm_ray_backend.py +1111 -928
- sky/backends/local_docker_backend.py +9 -5
- sky/backends/task_codegen.py +971 -0
- sky/backends/wheel_utils.py +18 -0
- sky/catalog/__init__.py +8 -3
- sky/catalog/aws_catalog.py +4 -0
- sky/catalog/common.py +19 -1
- sky/catalog/data_fetchers/fetch_aws.py +102 -80
- sky/catalog/data_fetchers/fetch_gcp.py +30 -3
- sky/catalog/data_fetchers/fetch_nebius.py +9 -6
- sky/catalog/data_fetchers/fetch_runpod.py +698 -0
- sky/catalog/data_fetchers/fetch_seeweb.py +329 -0
- sky/catalog/data_fetchers/fetch_shadeform.py +142 -0
- sky/catalog/kubernetes_catalog.py +36 -32
- sky/catalog/primeintellect_catalog.py +95 -0
- sky/catalog/runpod_catalog.py +5 -1
- sky/catalog/seeweb_catalog.py +184 -0
- sky/catalog/shadeform_catalog.py +165 -0
- sky/catalog/slurm_catalog.py +243 -0
- sky/check.py +87 -46
- sky/client/cli/command.py +1004 -434
- sky/client/cli/flags.py +4 -2
- sky/{volumes/utils.py → client/cli/table_utils.py} +111 -13
- sky/client/cli/utils.py +79 -0
- sky/client/common.py +12 -2
- sky/client/sdk.py +188 -65
- sky/client/sdk_async.py +34 -33
- sky/cloud_stores.py +82 -3
- sky/clouds/__init__.py +8 -0
- sky/clouds/aws.py +337 -129
- sky/clouds/azure.py +24 -18
- sky/clouds/cloud.py +47 -13
- sky/clouds/cudo.py +16 -13
- sky/clouds/do.py +9 -7
- sky/clouds/fluidstack.py +12 -5
- sky/clouds/gcp.py +14 -7
- sky/clouds/hyperbolic.py +12 -5
- sky/clouds/ibm.py +12 -5
- sky/clouds/kubernetes.py +80 -45
- sky/clouds/lambda_cloud.py +12 -5
- sky/clouds/nebius.py +23 -9
- sky/clouds/oci.py +19 -12
- sky/clouds/paperspace.py +4 -1
- sky/clouds/primeintellect.py +317 -0
- sky/clouds/runpod.py +85 -24
- sky/clouds/scp.py +12 -8
- sky/clouds/seeweb.py +477 -0
- sky/clouds/shadeform.py +400 -0
- sky/clouds/slurm.py +578 -0
- sky/clouds/ssh.py +6 -3
- sky/clouds/utils/scp_utils.py +61 -50
- sky/clouds/vast.py +43 -27
- sky/clouds/vsphere.py +14 -16
- sky/core.py +296 -195
- sky/dashboard/out/404.html +1 -1
- sky/dashboard/out/_next/static/KYAhEFa3FTfq4JyKVgo-s/_buildManifest.js +1 -0
- sky/dashboard/out/_next/static/chunks/1141-9c810f01ff4f398a.js +11 -0
- sky/dashboard/out/_next/static/chunks/1871-7e202677c42f43fe.js +6 -0
- sky/dashboard/out/_next/static/chunks/2260-7703229c33c5ebd5.js +1 -0
- sky/dashboard/out/_next/static/chunks/2369.fc20f0c2c8ed9fe7.js +15 -0
- sky/dashboard/out/_next/static/chunks/2755.edd818326d489a1d.js +26 -0
- sky/dashboard/out/_next/static/chunks/3294.ddda8c6c6f9f24dc.js +1 -0
- sky/dashboard/out/_next/static/chunks/3785.7e245f318f9d1121.js +1 -0
- sky/dashboard/out/_next/static/chunks/{6601-06114c982db410b6.js → 3800-b589397dc09c5b4e.js} +1 -1
- sky/dashboard/out/_next/static/chunks/3850-fd5696f3bbbaddae.js +1 -0
- sky/dashboard/out/_next/static/chunks/4725.172ede95d1b21022.js +1 -0
- sky/dashboard/out/_next/static/chunks/4937.a2baa2df5572a276.js +15 -0
- sky/dashboard/out/_next/static/chunks/6212-7bd06f60ba693125.js +13 -0
- sky/dashboard/out/_next/static/chunks/6856-da20c5fd999f319c.js +1 -0
- sky/dashboard/out/_next/static/chunks/6990-09cbf02d3cd518c3.js +1 -0
- sky/dashboard/out/_next/static/chunks/7359-c8d04e06886000b3.js +30 -0
- sky/dashboard/out/_next/static/chunks/7615-019513abc55b3b47.js +1 -0
- sky/dashboard/out/_next/static/chunks/8640.5b9475a2d18c5416.js +16 -0
- sky/dashboard/out/_next/static/chunks/8969-452f9d5cbdd2dc73.js +1 -0
- sky/dashboard/out/_next/static/chunks/9025.fa408f3242e9028d.js +6 -0
- sky/dashboard/out/_next/static/chunks/9353-8369df1cf105221c.js +1 -0
- sky/dashboard/out/_next/static/chunks/9360.a536cf6b1fa42355.js +31 -0
- sky/dashboard/out/_next/static/chunks/9847.3aaca6bb33455140.js +30 -0
- sky/dashboard/out/_next/static/chunks/pages/_app-68b647e26f9d2793.js +34 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-33f525539665fdfd.js +16 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-a7565f586ef86467.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters-9e5d47818b9bdadd.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/{config-dfb9bf07b13045f4.js → config-718cdc365de82689.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/infra/{[context]-6563820e094f68ca.js → [context]-12c559ec4d81fdbd.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/{infra-aabba60d57826e0f.js → infra-d187cd0413d72475.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-895847b6cf200b04.js +16 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs/pools/[pool]-8d0f4655400b4eb9.js +21 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs-e5a98f17f8513a96.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/plugins/[...slug]-4f46050ca065d8f8.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/users-2f7646eb77785a2c.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/volumes-ef19d49c6d0e8500.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/workspaces/{[name]-af76bb06dbb3954f.js → [name]-96e0f298308da7e2.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/{workspaces-7598c33a746cdc91.js → workspaces-cb4da3abe08ebf19.js} +1 -1
- sky/dashboard/out/_next/static/chunks/webpack-fba3de387ff6bb08.js +1 -0
- sky/dashboard/out/_next/static/css/c5a4cfd2600fc715.css +3 -0
- sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
- sky/dashboard/out/clusters/[cluster].html +1 -1
- sky/dashboard/out/clusters.html +1 -1
- sky/dashboard/out/config.html +1 -1
- sky/dashboard/out/index.html +1 -1
- sky/dashboard/out/infra/[context].html +1 -1
- sky/dashboard/out/infra.html +1 -1
- sky/dashboard/out/jobs/[job].html +1 -1
- sky/dashboard/out/jobs/pools/[pool].html +1 -1
- sky/dashboard/out/jobs.html +1 -1
- sky/dashboard/out/plugins/[...slug].html +1 -0
- sky/dashboard/out/users.html +1 -1
- sky/dashboard/out/volumes.html +1 -1
- sky/dashboard/out/workspace/new.html +1 -1
- sky/dashboard/out/workspaces/[name].html +1 -1
- sky/dashboard/out/workspaces.html +1 -1
- sky/data/data_utils.py +92 -1
- sky/data/mounting_utils.py +177 -30
- sky/data/storage.py +200 -19
- sky/data/storage_utils.py +10 -45
- sky/exceptions.py +18 -7
- sky/execution.py +74 -31
- sky/global_user_state.py +605 -191
- sky/jobs/__init__.py +2 -0
- sky/jobs/client/sdk.py +101 -4
- sky/jobs/client/sdk_async.py +31 -5
- sky/jobs/constants.py +15 -8
- sky/jobs/controller.py +726 -284
- sky/jobs/file_content_utils.py +128 -0
- sky/jobs/log_gc.py +193 -0
- sky/jobs/recovery_strategy.py +250 -100
- sky/jobs/scheduler.py +271 -173
- sky/jobs/server/core.py +367 -114
- sky/jobs/server/server.py +81 -35
- sky/jobs/server/utils.py +89 -35
- sky/jobs/state.py +1498 -620
- sky/jobs/utils.py +771 -306
- sky/logs/agent.py +40 -5
- sky/logs/aws.py +9 -19
- sky/metrics/utils.py +282 -39
- sky/models.py +2 -0
- sky/optimizer.py +7 -6
- sky/provision/__init__.py +38 -1
- sky/provision/aws/config.py +34 -13
- sky/provision/aws/instance.py +5 -2
- sky/provision/azure/instance.py +5 -3
- sky/provision/common.py +22 -0
- sky/provision/cudo/instance.py +4 -3
- sky/provision/do/instance.py +4 -3
- sky/provision/docker_utils.py +112 -28
- sky/provision/fluidstack/instance.py +6 -5
- sky/provision/gcp/config.py +6 -1
- sky/provision/gcp/instance.py +4 -2
- sky/provision/hyperbolic/instance.py +4 -2
- sky/provision/instance_setup.py +66 -20
- sky/provision/kubernetes/__init__.py +2 -0
- sky/provision/kubernetes/config.py +7 -44
- sky/provision/kubernetes/constants.py +0 -1
- sky/provision/kubernetes/instance.py +609 -213
- sky/provision/kubernetes/manifests/fusermount-server-daemonset.yaml +1 -2
- sky/provision/kubernetes/network.py +12 -8
- sky/provision/kubernetes/network_utils.py +8 -25
- sky/provision/kubernetes/utils.py +422 -422
- sky/provision/kubernetes/volume.py +150 -18
- sky/provision/lambda_cloud/instance.py +16 -13
- sky/provision/nebius/instance.py +6 -2
- sky/provision/nebius/utils.py +103 -86
- sky/provision/oci/instance.py +4 -2
- sky/provision/paperspace/instance.py +4 -3
- sky/provision/primeintellect/__init__.py +10 -0
- sky/provision/primeintellect/config.py +11 -0
- sky/provision/primeintellect/instance.py +454 -0
- sky/provision/primeintellect/utils.py +398 -0
- sky/provision/provisioner.py +45 -15
- sky/provision/runpod/__init__.py +2 -0
- sky/provision/runpod/instance.py +4 -3
- sky/provision/runpod/volume.py +69 -13
- sky/provision/scp/instance.py +307 -130
- sky/provision/seeweb/__init__.py +11 -0
- sky/provision/seeweb/config.py +13 -0
- sky/provision/seeweb/instance.py +812 -0
- sky/provision/shadeform/__init__.py +11 -0
- sky/provision/shadeform/config.py +12 -0
- sky/provision/shadeform/instance.py +351 -0
- sky/provision/shadeform/shadeform_utils.py +83 -0
- sky/provision/slurm/__init__.py +12 -0
- sky/provision/slurm/config.py +13 -0
- sky/provision/slurm/instance.py +572 -0
- sky/provision/slurm/utils.py +583 -0
- sky/provision/vast/instance.py +9 -4
- sky/provision/vast/utils.py +10 -6
- sky/provision/volume.py +164 -0
- sky/provision/vsphere/common/ssl_helper.py +1 -1
- sky/provision/vsphere/common/vapiconnect.py +2 -1
- sky/provision/vsphere/common/vim_utils.py +3 -2
- sky/provision/vsphere/instance.py +8 -6
- sky/provision/vsphere/vsphere_utils.py +8 -1
- sky/resources.py +11 -3
- sky/schemas/api/responses.py +107 -6
- sky/schemas/db/global_user_state/008_skylet_ssh_tunnel_metadata.py +34 -0
- sky/schemas/db/global_user_state/009_last_activity_and_launched_at.py +89 -0
- sky/schemas/db/global_user_state/010_save_ssh_key.py +66 -0
- sky/schemas/db/global_user_state/011_is_ephemeral.py +34 -0
- sky/schemas/db/kv_cache/001_initial_schema.py +29 -0
- sky/schemas/db/serve_state/002_yaml_content.py +34 -0
- sky/schemas/db/skypilot_config/001_initial_schema.py +30 -0
- sky/schemas/db/spot_jobs/002_cluster_pool.py +3 -3
- sky/schemas/db/spot_jobs/004_job_file_contents.py +42 -0
- sky/schemas/db/spot_jobs/005_logs_gc.py +38 -0
- sky/schemas/db/spot_jobs/006_controller_pid_started_at.py +34 -0
- sky/schemas/db/spot_jobs/007_config_file_content.py +34 -0
- sky/schemas/generated/jobsv1_pb2.py +86 -0
- sky/schemas/generated/jobsv1_pb2.pyi +254 -0
- sky/schemas/generated/jobsv1_pb2_grpc.py +542 -0
- sky/schemas/generated/managed_jobsv1_pb2.py +76 -0
- sky/schemas/generated/managed_jobsv1_pb2.pyi +278 -0
- sky/schemas/generated/managed_jobsv1_pb2_grpc.py +278 -0
- sky/schemas/generated/servev1_pb2.py +58 -0
- sky/schemas/generated/servev1_pb2.pyi +115 -0
- sky/schemas/generated/servev1_pb2_grpc.py +322 -0
- sky/serve/autoscalers.py +2 -0
- sky/serve/client/impl.py +55 -21
- sky/serve/constants.py +4 -3
- sky/serve/controller.py +17 -11
- sky/serve/load_balancing_policies.py +1 -1
- sky/serve/replica_managers.py +219 -142
- sky/serve/serve_rpc_utils.py +179 -0
- sky/serve/serve_state.py +63 -54
- sky/serve/serve_utils.py +145 -109
- sky/serve/server/core.py +46 -25
- sky/serve/server/impl.py +311 -162
- sky/serve/server/server.py +21 -19
- sky/serve/service.py +84 -68
- sky/serve/service_spec.py +45 -7
- sky/server/auth/loopback.py +38 -0
- sky/server/auth/oauth2_proxy.py +12 -7
- sky/server/common.py +47 -24
- sky/server/config.py +62 -28
- sky/server/constants.py +9 -1
- sky/server/daemons.py +109 -38
- sky/server/metrics.py +76 -96
- sky/server/middleware_utils.py +166 -0
- sky/server/plugins.py +222 -0
- sky/server/requests/executor.py +384 -145
- sky/server/requests/payloads.py +83 -19
- sky/server/requests/preconditions.py +15 -13
- sky/server/requests/request_names.py +123 -0
- sky/server/requests/requests.py +511 -157
- sky/server/requests/serializers/decoders.py +48 -17
- sky/server/requests/serializers/encoders.py +102 -20
- sky/server/requests/serializers/return_value_serializers.py +60 -0
- sky/server/requests/threads.py +117 -0
- sky/server/rest.py +116 -24
- sky/server/server.py +497 -179
- sky/server/server_utils.py +30 -0
- sky/server/stream_utils.py +219 -45
- sky/server/uvicorn.py +30 -19
- sky/setup_files/MANIFEST.in +6 -1
- sky/setup_files/alembic.ini +8 -0
- sky/setup_files/dependencies.py +64 -19
- sky/setup_files/setup.py +44 -44
- sky/sky_logging.py +13 -5
- sky/skylet/attempt_skylet.py +116 -24
- sky/skylet/configs.py +3 -1
- sky/skylet/constants.py +139 -29
- sky/skylet/events.py +74 -14
- sky/skylet/executor/__init__.py +1 -0
- sky/skylet/executor/slurm.py +189 -0
- sky/skylet/job_lib.py +143 -105
- sky/skylet/log_lib.py +252 -8
- sky/skylet/log_lib.pyi +47 -7
- sky/skylet/providers/ibm/node_provider.py +12 -8
- sky/skylet/providers/ibm/vpc_provider.py +13 -12
- sky/skylet/runtime_utils.py +21 -0
- sky/skylet/services.py +524 -0
- sky/skylet/skylet.py +27 -2
- sky/skylet/subprocess_daemon.py +104 -28
- sky/skypilot_config.py +99 -79
- sky/ssh_node_pools/constants.py +12 -0
- sky/ssh_node_pools/core.py +40 -3
- sky/ssh_node_pools/deploy/__init__.py +4 -0
- sky/ssh_node_pools/deploy/deploy.py +952 -0
- sky/ssh_node_pools/deploy/tunnel_utils.py +199 -0
- sky/ssh_node_pools/deploy/utils.py +173 -0
- sky/ssh_node_pools/server.py +20 -21
- sky/{utils/kubernetes/ssh_utils.py → ssh_node_pools/utils.py} +9 -6
- sky/task.py +221 -104
- sky/templates/aws-ray.yml.j2 +1 -0
- sky/templates/azure-ray.yml.j2 +1 -0
- sky/templates/cudo-ray.yml.j2 +1 -0
- sky/templates/do-ray.yml.j2 +1 -0
- sky/templates/fluidstack-ray.yml.j2 +1 -0
- sky/templates/gcp-ray.yml.j2 +1 -0
- sky/templates/hyperbolic-ray.yml.j2 +1 -0
- sky/templates/ibm-ray.yml.j2 +2 -1
- sky/templates/jobs-controller.yaml.j2 +3 -0
- sky/templates/kubernetes-ray.yml.j2 +204 -55
- sky/templates/lambda-ray.yml.j2 +1 -0
- sky/templates/nebius-ray.yml.j2 +3 -0
- sky/templates/oci-ray.yml.j2 +1 -0
- sky/templates/paperspace-ray.yml.j2 +1 -0
- sky/templates/primeintellect-ray.yml.j2 +72 -0
- sky/templates/runpod-ray.yml.j2 +1 -0
- sky/templates/scp-ray.yml.j2 +1 -0
- sky/templates/seeweb-ray.yml.j2 +171 -0
- sky/templates/shadeform-ray.yml.j2 +73 -0
- sky/templates/slurm-ray.yml.j2 +85 -0
- sky/templates/vast-ray.yml.j2 +2 -0
- sky/templates/vsphere-ray.yml.j2 +1 -0
- sky/templates/websocket_proxy.py +188 -43
- sky/usage/usage_lib.py +16 -4
- sky/users/model.conf +1 -1
- sky/users/permission.py +84 -44
- sky/users/rbac.py +31 -3
- sky/utils/accelerator_registry.py +6 -3
- sky/utils/admin_policy_utils.py +18 -5
- sky/utils/annotations.py +128 -6
- sky/utils/asyncio_utils.py +78 -0
- sky/utils/atomic.py +1 -1
- sky/utils/auth_utils.py +153 -0
- sky/utils/cli_utils/status_utils.py +12 -7
- sky/utils/cluster_utils.py +28 -6
- sky/utils/command_runner.py +283 -30
- sky/utils/command_runner.pyi +63 -7
- sky/utils/common.py +3 -1
- sky/utils/common_utils.py +55 -7
- sky/utils/config_utils.py +1 -14
- sky/utils/context.py +127 -40
- sky/utils/context_utils.py +73 -18
- sky/utils/controller_utils.py +229 -70
- sky/utils/db/db_utils.py +95 -18
- sky/utils/db/kv_cache.py +149 -0
- sky/utils/db/migration_utils.py +24 -7
- sky/utils/env_options.py +4 -0
- sky/utils/git.py +559 -1
- sky/utils/kubernetes/create_cluster.sh +15 -30
- sky/utils/kubernetes/delete_cluster.sh +10 -7
- sky/utils/kubernetes/generate_kind_config.py +6 -66
- sky/utils/kubernetes/gpu_labeler.py +13 -3
- sky/utils/kubernetes/k8s_gpu_labeler_job.yaml +2 -1
- sky/utils/kubernetes/k8s_gpu_labeler_setup.yaml +16 -16
- sky/utils/kubernetes/kubernetes_deploy_utils.py +187 -260
- sky/utils/kubernetes/rsync_helper.sh +11 -3
- sky/utils/kubernetes/ssh-tunnel.sh +7 -376
- sky/utils/kubernetes_enums.py +7 -15
- sky/utils/lock_events.py +4 -4
- sky/utils/locks.py +128 -31
- sky/utils/log_utils.py +0 -319
- sky/utils/resource_checker.py +13 -10
- sky/utils/resources_utils.py +53 -29
- sky/utils/rich_utils.py +8 -4
- sky/utils/schemas.py +138 -52
- sky/utils/subprocess_utils.py +17 -4
- sky/utils/thread_utils.py +91 -0
- sky/utils/timeline.py +2 -1
- sky/utils/ux_utils.py +35 -1
- sky/utils/volume.py +88 -4
- sky/utils/yaml_utils.py +9 -0
- sky/volumes/client/sdk.py +48 -10
- sky/volumes/server/core.py +59 -22
- sky/volumes/server/server.py +46 -17
- sky/volumes/volume.py +54 -42
- sky/workspaces/core.py +57 -21
- sky/workspaces/server.py +13 -12
- sky_templates/README.md +3 -0
- sky_templates/__init__.py +3 -0
- sky_templates/ray/__init__.py +0 -0
- sky_templates/ray/start_cluster +183 -0
- sky_templates/ray/stop_cluster +75 -0
- {skypilot_nightly-1.0.0.dev20250905.dist-info → skypilot_nightly-1.0.0.dev20251210.dist-info}/METADATA +343 -65
- skypilot_nightly-1.0.0.dev20251210.dist-info/RECORD +629 -0
- skypilot_nightly-1.0.0.dev20251210.dist-info/top_level.txt +2 -0
- sky/client/cli/git.py +0 -549
- sky/dashboard/out/_next/static/chunks/1121-408ed10b2f9fce17.js +0 -1
- sky/dashboard/out/_next/static/chunks/1141-943efc7aff0f0c06.js +0 -1
- sky/dashboard/out/_next/static/chunks/1836-37fede578e2da5f8.js +0 -40
- sky/dashboard/out/_next/static/chunks/3015-86cabed5d4669ad0.js +0 -1
- sky/dashboard/out/_next/static/chunks/3294.c80326aec9bfed40.js +0 -6
- sky/dashboard/out/_next/static/chunks/3785.4872a2f3aa489880.js +0 -1
- sky/dashboard/out/_next/static/chunks/3850-ff4a9a69d978632b.js +0 -1
- sky/dashboard/out/_next/static/chunks/4045.b30465273dc5e468.js +0 -21
- sky/dashboard/out/_next/static/chunks/4676-9da7fdbde90b5549.js +0 -10
- sky/dashboard/out/_next/static/chunks/4725.10f7a9a5d3ea8208.js +0 -1
- sky/dashboard/out/_next/static/chunks/5339.3fda4a4010ff4e06.js +0 -51
- sky/dashboard/out/_next/static/chunks/6135-4b4d5e824b7f9d3c.js +0 -1
- sky/dashboard/out/_next/static/chunks/649.b9d7f7d10c1b8c53.js +0 -45
- sky/dashboard/out/_next/static/chunks/6856-dca7962af4814e1b.js +0 -1
- sky/dashboard/out/_next/static/chunks/6990-08b2a1cae076a943.js +0 -1
- sky/dashboard/out/_next/static/chunks/7325.b4bc99ce0892dcd5.js +0 -6
- sky/dashboard/out/_next/static/chunks/754-d0da8ab45f9509e9.js +0 -18
- sky/dashboard/out/_next/static/chunks/7669.1f5d9a402bf5cc42.js +0 -36
- sky/dashboard/out/_next/static/chunks/8969-0be3036bf86f8256.js +0 -1
- sky/dashboard/out/_next/static/chunks/9025.c12318fb6a1a9093.js +0 -6
- sky/dashboard/out/_next/static/chunks/9037-fa1737818d0a0969.js +0 -6
- sky/dashboard/out/_next/static/chunks/pages/_app-ce361c6959bc2001.js +0 -34
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-1cbba24bd1bd35f8.js +0 -16
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-0b4b35dc1dfe046c.js +0 -16
- sky/dashboard/out/_next/static/chunks/pages/clusters-469814d711d63b1b.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-dd64309c3fe67ed2.js +0 -11
- sky/dashboard/out/_next/static/chunks/pages/jobs/pools/[pool]-07349868f7905d37.js +0 -16
- sky/dashboard/out/_next/static/chunks/pages/jobs-1f70d9faa564804f.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/users-018bf31cda52e11b.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/volumes-739726d6b823f532.js +0 -1
- sky/dashboard/out/_next/static/chunks/webpack-4fe903277b57b523.js +0 -1
- sky/dashboard/out/_next/static/css/4614e06482d7309e.css +0 -3
- sky/dashboard/out/_next/static/mS-4qZPSkRuA1u-g2wQhg/_buildManifest.js +0 -1
- sky/templates/kubernetes-ssh-jump.yml.j2 +0 -94
- sky/utils/kubernetes/cleanup-tunnel.sh +0 -62
- sky/utils/kubernetes/deploy_remote_cluster.py +0 -1299
- sky/utils/kubernetes/ssh_jump_lifecycle_manager.py +0 -191
- skypilot_nightly-1.0.0.dev20250905.dist-info/RECORD +0 -547
- skypilot_nightly-1.0.0.dev20250905.dist-info/top_level.txt +0 -1
- /sky/dashboard/out/_next/static/{mS-4qZPSkRuA1u-g2wQhg → KYAhEFa3FTfq4JyKVgo-s}/_ssgManifest.js +0 -0
- {skypilot_nightly-1.0.0.dev20250905.dist-info → skypilot_nightly-1.0.0.dev20251210.dist-info}/WHEEL +0 -0
- {skypilot_nightly-1.0.0.dev20250905.dist-info → skypilot_nightly-1.0.0.dev20251210.dist-info}/entry_points.txt +0 -0
- {skypilot_nightly-1.0.0.dev20250905.dist-info → skypilot_nightly-1.0.0.dev20251210.dist-info}/licenses/LICENSE +0 -0
sky/utils/locks.py
CHANGED
@@ -11,15 +11,19 @@ import time
 from typing import Any, Optional
 
 import filelock
+import psycopg2
 import sqlalchemy
 
 from sky import global_user_state
-from sky.skylet import
+from sky.skylet import runtime_utils
 from sky.utils import common_utils
 from sky.utils.db import db_utils
 
 logger = logging.getLogger(__name__)
 
+# The directory for file locks.
+SKY_LOCKS_DIR = runtime_utils.get_runtime_dir_path('.sky/locks')
+
 
 class LockTimeout(RuntimeError):
     """Raised when a lock acquisition times out."""
@@ -126,9 +130,8 @@ class FileLock(DistributedLock):
             poll_interval: Interval in seconds to poll for lock acquisition.
         """
         super().__init__(lock_id, timeout, poll_interval)
-        os.makedirs(
-        self.lock_path = os.path.join(
-            f'.{lock_id}.lock')
+        os.makedirs(SKY_LOCKS_DIR, exist_ok=True)
+        self.lock_path = os.path.join(SKY_LOCKS_DIR, f'.{lock_id}.lock')
         if timeout is None:
             timeout = -1
         self._filelock: filelock.FileLock = filelock.FileLock(self.lock_path,
@@ -154,7 +157,7 @@ class FileLock(DistributedLock):
         common_utils.remove_file_if_exists(self.lock_path)
 
     def is_locked(self) -> bool:
-        return self._filelock.is_locked
+        return self._filelock.is_locked
 
 
 class PostgresLock(DistributedLock):
@@ -162,15 +165,20 @@ class PostgresLock(DistributedLock):
 
     Uses PostgreSQL advisory locks to implement distributed locking
     that works across multiple machines sharing the same database.
-
-
-
+    Supports both exclusive and shared lock modes.
+
+    References:
+    # pylint: disable=line-too-long
+    - https://www.postgresql.org/docs/current/explicit-locking.html#ADVISORY-LOCKS
+    - https://www.postgresql.org/docs/current/functions-admin.html#FUNCTIONS-ADVISORY-LOCKS
+    # TODO(cooperc): re-enable pylint line-too-long
     """
 
     def __init__(self,
                  lock_id: str,
                  timeout: Optional[float] = None,
-                 poll_interval: float = 1
+                 poll_interval: float = 1,
+                 shared_lock: bool = False):
        """Initialize the postgres lock.
 
         Args:
@@ -178,10 +186,13 @@ class PostgresLock(DistributedLock):
             timeout: Maximum time to wait for lock acquisition.
             poll_interval: Interval in seconds to poll for lock acquisition,
                 default to 1 second to avoid storming the database.
+            shared_lock: Whether to use shared advisory lock or exclusive
+                advisory lock (default).
         """
         super().__init__(lock_id, timeout, poll_interval)
         # Convert string lock_id to integer for postgres advisory locks
         self._lock_key = self._string_to_lock_key(lock_id)
+        self._shared_lock = shared_lock
         self._acquired = False
         self._connection: Optional[sqlalchemy.pool.PoolProxiedConnection] = None
 
@@ -197,6 +208,7 @@ class PostgresLock(DistributedLock):
         if engine.dialect.name != db_utils.SQLAlchemyDialect.POSTGRESQL.value:
             raise ValueError('PostgresLock requires PostgreSQL database. '
                              f'Current dialect: {engine.dialect.name}')
+        # Borrow a dedicated connection from the pool.
         return engine.raw_connection()
 
     def acquire(self, blocking: bool = True) -> AcquireReturnProxy:
@@ -209,33 +221,37 @@ class PostgresLock(DistributedLock):
 
         start_time = time.time()
 
+        if self._shared_lock:
+            lock_func = 'pg_try_advisory_lock_shared'
+        else:
+            lock_func = 'pg_try_advisory_lock'
+
         try:
             while True:
-                cursor.execute('SELECT
-                               (self._lock_key,))
+                cursor.execute(f'SELECT {lock_func}(%s)', (self._lock_key,))
                 result = cursor.fetchone()[0]
 
                 if result:
                     self._acquired = True
                     return AcquireReturnProxy(self)
 
+                mode_str = ('shared' if self._shared_lock else 'exclusive')
                 if not blocking:
                     raise LockTimeout(
-                        f'Failed to immediately acquire
-                        f'{self.lock_id}')
+                        f'Failed to immediately acquire {mode_str} '
+                        f'postgres lock {self.lock_id}')
 
                 if (self.timeout is not None and
                         time.time() - start_time > self.timeout):
                     raise LockTimeout(
-                        f'Failed to acquire postgres lock
-                        f'within {self.timeout}
+                        f'Failed to acquire {mode_str} postgres lock '
+                        f'{self.lock_id} within {self.timeout} '
+                        f'seconds')
 
                 time.sleep(self.poll_interval)
 
         except Exception:
-
-            self._connection.close()
-            self._connection = None
+            self._close_connection()
             raise
 
     def release(self) -> None:
@@ -243,32 +259,94 @@ class PostgresLock(DistributedLock):
         if not self._acquired or not self._connection:
             return
 
+        connection_lost = False
         try:
             cursor = self._connection.cursor()
-
+            if self._shared_lock:
+                unlock_func = 'pg_advisory_unlock_shared'
+            else:
+                unlock_func = 'pg_advisory_unlock'
+            cursor.execute(f'SELECT {unlock_func}(%s)', (self._lock_key,))
             self._connection.commit()
             self._acquired = False
+        except psycopg2.OperationalError as e:
+            # Lost connection to the database, likely the lock is force unlocked
+            # by other routines.
+            logger.debug(f'Failed to release postgres lock {self.lock_id}: {e}')
+            connection_lost = True
         finally:
-            if
-
-
+            # Invalidate if connection was lost to prevent SQLAlchemy from
+            # trying to reset a dead connection
+            self._close_connection(invalidate=connection_lost)
 
     def force_unlock(self) -> None:
         """Force unlock the postgres advisory lock."""
         try:
-
+            # The lock is held by current routine, gracefully unlock it
+            if self._acquired:
+                self.release()
+                return
+
+            # The lock is held by another routine, force unlock it.
+            if self._connection is None:
                 self._connection = self._get_connection()
             cursor = self._connection.cursor()
-
-
+            if self._shared_lock:
+                unlock_func = 'pg_advisory_unlock_shared'
+            else:
+                unlock_func = 'pg_advisory_unlock'
+
+            cursor.execute(f'SELECT {unlock_func}(%s)', (self._lock_key,))
+            result = cursor.fetchone()[0]
+            if result:
+                # The lock is held by current routine and unlock succeed
+                self._connection.commit()
+                self._acquired = False
+                return
+            cursor.execute(
+                ('SELECT pid FROM pg_locks WHERE locktype = \'advisory\' '
+                 'AND ((classid::bigint << 32) | objid::bigint) = %s'),
+                (self._lock_key,))
+            rows = cursor.fetchall()
+            if rows:
+                # There can be multiple PIDs holding the lock, it is not enough
+                # to only kill some of them. For example, if pid 1 is holding a
+                # shared lock, and pid 2 is waiting to grab an exclusive lock,
+                # killing pid 1 will transfer the lock to pid 2, so the lock
+                # will still not be released.
+                for row in rows:
+                    cursor.execute('SELECT pg_terminate_backend(%s)', (row[0],))
+                self._connection.commit()
+                return
         except Exception as e:
             raise RuntimeError(
                 f'Failed to force unlock postgres lock {self.lock_id}: {e}'
             ) from e
         finally:
-
-
-
+            self._close_connection()
+
+    def _close_connection(self, invalidate: bool = False) -> None:
+        """Close the postgres connection.
+
+        Args:
+            invalidate: If True, invalidate connection instead of closing it.
+                Use this when the connection might be broken (e.g., after
+                pg_terminate_backend) to prevent SQLAlchemy from trying to
+                reset it (which would result in an error being logged).
+        """
+        if self._connection:
+            try:
+                if invalidate:
+                    self._connection.invalidate()
+                else:
+                    self._connection.close()
+            except Exception as e:  # pylint: disable=broad-except
+                if invalidate:
+                    logger.debug(
+                        f'Failed to invalidate postgres connection: {e}')
+                else:
+                    logger.debug(f'Failed to close postgres connection: {e}')
+            self._connection = None
 
     def is_locked(self) -> bool:
         """Check if the postgres advisory lock is acquired."""
@@ -278,7 +356,8 @@ class PostgresLock(DistributedLock):
 def get_lock(lock_id: str,
              timeout: Optional[float] = None,
             lock_type: Optional[str] = None,
-             poll_interval: Optional[float] = None
+             poll_interval: Optional[float] = None,
+             shared_lock: bool = False) -> DistributedLock:
     """Create a distributed lock instance.
 
     Args:
@@ -287,6 +366,9 @@ def get_lock(lock_id: str,
             None means wait indefinitely.
         lock_type: Type of lock to create ('filelock' or 'postgres').
             If None, auto-detect based on database configuration.
+        poll_interval: Interval in seconds to poll for lock acquisition.
+        shared_lock: Whether to use shared lock or exclusive lock (default).
+            NOTE: Only applicable for PostgresLock.
 
     Returns:
         DistributedLock instance.
@@ -296,9 +378,24 @@ def get_lock(lock_id: str,
 
     if lock_type == 'postgres':
         if poll_interval is None:
-            return PostgresLock(lock_id, timeout)
-        return PostgresLock(lock_id,
+            return PostgresLock(lock_id, timeout, shared_lock=shared_lock)
+        return PostgresLock(lock_id,
+                            timeout,
+                            poll_interval,
+                            shared_lock=shared_lock)
    elif lock_type == 'filelock':
+        # The filelock library we use does not support shared locks.
+        # It explicitly uses fcntl.LOCK_EX on Unix systems,
+        # whereas fcntl.LOCK_SH is needed for shared locks.
+
+        # This should be fine as it should not introduce correctness issues,
+        # just that concurrency is reduced and so is performance, because
+        # read-only operations can't run at the same time, each of them need
+        # to wait to exclusively hold the lock.
+
+        # But given that we recommend users to use Postgres in production,
+        # the impact of this should be limited to local API server mostly.
+        del shared_lock
         if poll_interval is None:
             return FileLock(lock_id, timeout)
         return FileLock(lock_id, timeout, poll_interval)
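The new `shared_lock` flag exposes PostgreSQL's shared advisory locks through `sky.utils.locks.get_lock`. A minimal sketch of how a caller might take a shared (read-side) lock, based only on the signatures visible in this diff; the lock id and timeout are hypothetical placeholders, and on the filelock backend the flag is ignored, so the lock remains exclusive:

```python
from sky.utils import locks

# Hypothetical lock id; get_lock() auto-detects postgres vs. filelock.
lock = locks.get_lock('catalog-refresh', timeout=10, shared_lock=True)
lock.acquire()  # returns an AcquireReturnProxy per the diff above
try:
    # Read-only work: on Postgres, other shared holders may run concurrently,
    # while an exclusive holder waits until all shared locks are released.
    pass
finally:
    lock.release()
```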
sky/utils/log_utils.py
CHANGED
@@ -198,325 +198,6 @@ class SkyLocalUpLineProcessor(LineProcessor):
         self.status_display.stop()
 
 
-class SkyRemoteUpLineProcessor(LineProcessor):
-    """A processor for deploy_remote_cluster.py log lines."""
-
-    def __init__(self, log_path: str, is_local: bool):
-        self.log_path = log_path
-        self.is_local = is_local
-
-    def __enter__(self) -> None:
-        # TODO(romilb): Use ux_utils.INDENT_SYMBOL to be consistent with other
-        # messages.
-        status = rich_utils.safe_status(
-            ux_utils.spinner_message('Creating remote cluster',
-                                     log_path=self.log_path,
-                                     is_local=self.is_local))
-        self.status_display = status
-        self.status_display.start()
-
-    def process_line(self, log_line: str) -> None:
-        # Pre-flight checks
-        if 'SSH connection successful' in log_line:
-            logger.info(f'{colorama.Fore.GREEN}SSH connection established.'
-                        f'{colorama.Style.RESET_ALL}')
-
-        # Kubernetes installation steps
-        if 'Deploying Kubernetes on head node' in log_line:
-            self.status_display.update(
-                ux_utils.spinner_message(
-                    'Creating remote cluster - '
-                    'deploying Kubernetes on head node',
-                    log_path=self.log_path,
-                    is_local=self.is_local))
-        if 'K3s deployed on head node.' in log_line:
-            logger.info(f'{colorama.Fore.GREEN}'
-                        '✔ K3s successfully deployed on head node.'
-                        f'{colorama.Style.RESET_ALL}')
-
-        # Worker nodes
-        if 'Deploying Kubernetes on worker node' in log_line:
-            self.status_display.update(
-                ux_utils.spinner_message(
-                    'Creating remote cluster - '
-                    'deploying Kubernetes on worker nodes',
-                    log_path=self.log_path,
-                    is_local=self.is_local))
-        if 'Kubernetes deployed on worker node' in log_line:
-            logger.info(f'{colorama.Fore.GREEN}'
-                        '✔ K3s successfully deployed on worker node.'
-                        f'{colorama.Style.RESET_ALL}')
-
-        # Cluster configuration
-        if 'Configuring local kubectl to connect to the cluster...' in log_line:
-            self.status_display.update(
-                ux_utils.spinner_message(
-                    'Creating remote cluster - '
-                    'configuring local kubectl',
-                    log_path=self.log_path,
-                    is_local=self.is_local))
-        if 'kubectl configured to connect to the cluster.' in log_line:
-            logger.info(f'{colorama.Fore.GREEN}'
-                        '✔ kubectl configured for the remote cluster.'
-                        f'{colorama.Style.RESET_ALL}')
-
-        # GPU operator installation
-        if 'Installing Nvidia GPU Operator...' in log_line:
-            self.status_display.update(
-                ux_utils.spinner_message(
-                    'Creating remote cluster - '
-                    'installing Nvidia GPU Operator',
-                    log_path=self.log_path,
-                    is_local=self.is_local))
-        if 'GPU Operator installed.' in log_line:
-            logger.info(f'{colorama.Fore.GREEN}'
-                        '✔ Nvidia GPU Operator installed successfully.'
-                        f'{colorama.Style.RESET_ALL}')
-
-        # Cleanup steps
-        if 'Cleaning up head node' in log_line:
-            self.status_display.update(
-                ux_utils.spinner_message('Cleaning up head node',
-                                         log_path=self.log_path,
-                                         is_local=self.is_local))
-        if 'Cleaning up node' in log_line:
-            self.status_display.update(
-                ux_utils.spinner_message('Cleaning up worker node',
-                                         log_path=self.log_path,
-                                         is_local=self.is_local))
-        if 'cleaned up successfully' in log_line:
-            logger.info(f'{colorama.Fore.GREEN}'
-                        f'{log_line.strip()}{colorama.Style.RESET_ALL}')
-
-        # Final status
-        if 'Cluster deployment completed.' in log_line:
-            logger.info(f'{colorama.Fore.GREEN}✔ Remote k3s is running.'
-                        f'{colorama.Style.RESET_ALL}')
-
-    def __exit__(self, except_type: Optional[Type[BaseException]],
-                 except_value: Optional[BaseException],
-                 traceback: Optional[types.TracebackType]) -> None:
-        del except_type, except_value, traceback  # unused
-        self.status_display.stop()
-
-
-class SkySSHUpLineProcessor(LineProcessor):
-    """A processor for deploy_remote_cluster.py log lines for SSH clusters"""
-
-    def __init__(self, log_path: str, is_local: bool):
-        self.log_path = log_path
-        self.is_local = is_local
-        self.current_cluster: Optional[str] = None
-        self.is_cleanup_mode = False
-
-    def __enter__(self) -> None:
-        status = rich_utils.safe_status(
-            ux_utils.spinner_message('Preparing to set up SSH Node Pools',
-                                     log_path=self.log_path,
-                                     is_local=self.is_local))
-        self.status_display = status
-        self.status_display.start()
-
-    def process_line(self, log_line: str) -> None:
-        # Detect cleanup mode
-        if 'SKYPILOT_CLEANUP_MODE:' in log_line:
-            self.is_cleanup_mode = True
-            if self.current_cluster:
-                self.status_display.update(
-                    ux_utils.spinner_message(
-                        f'Cleaning up Node Pool: \\[{self.current_cluster}]',
-                        log_path=self.log_path,
-                        is_local=self.is_local))
-
-        # Cluster detection message
-        if 'SKYPILOT_CLUSTER_INFO:' in log_line:
-            clusters_part = log_line.split('SKYPILOT_CLUSTER_INFO:',
-                                           1)[1].strip()
-            if clusters_part.startswith('Found'):
-                logger.info(f'{colorama.Style.RESET_ALL}'
-                            f'{colorama.Fore.CYAN}{clusters_part}'
-                            f'{colorama.Style.RESET_ALL}')
-
-        # Current cluster being operated on
-        if 'SKYPILOT_CURRENT_CLUSTER:' in log_line:
-            self.current_cluster = log_line.split('SKYPILOT_CURRENT_CLUSTER:',
-                                                  1)[1].strip()
-
-            if self.is_cleanup_mode:
-                self.status_display.update(
-                    ux_utils.spinner_message(
-                        f'Cleaning up Node Pool: {self.current_cluster}',
-                        log_path=self.log_path,
-                        is_local=self.is_local))
-                logger.info(f'{colorama.Fore.CYAN}\nCleaning up Node Pool: '
-                            f'{self.current_cluster}{colorama.Style.RESET_ALL}')
-            else:
-                self.status_display.update(
-                    ux_utils.spinner_message(
-                        f'Deploying SkyPilot \\[{self.current_cluster}]',
-                        log_path=self.log_path,
-                        is_local=self.is_local))
-                logger.info(f'{colorama.Style.RESET_ALL}'
-                            f'{colorama.Fore.CYAN}\nSetting up Node Pool: '
-                            f'{self.current_cluster}{colorama.Style.RESET_ALL}')
-
-        # Handle cluster completion marker
-        if 'SKYPILOT_CLUSTER_COMPLETED:' in log_line:
-            if self.is_cleanup_mode:
-                logger.info(
-                    f'{ux_utils.INDENT_LAST_SYMBOL}{colorama.Fore.GREEN}'
-                    f'✔ Node Pool {self.current_cluster} cleaned up '
-                    f'successfully.{colorama.Style.RESET_ALL}')
-            else:
-                logger.info(
-                    f'{ux_utils.INDENT_LAST_SYMBOL}{colorama.Fore.GREEN}'
-                    f'✔ Node Pool {self.current_cluster} deployed successfully.'
-                    f'{colorama.Style.RESET_ALL}')
-
-        # Pre-flight checks
-        if 'Checking SSH connection to head node' in log_line:
-            logger.info(f'{ux_utils.INDENT_SYMBOL}{colorama.Style.DIM}'
-                        'Checking SSH connection to head node...'
-                        f'{colorama.Style.RESET_ALL}')
-
-        if log_line.startswith('SSH connection successful'):
-            node_name = log_line.split('(')[-1].split(')')[0]
-            logger.info(f'{ux_utils.INDENT_SYMBOL}{colorama.Fore.GREEN}'
-                        '✔ SSH connection established to head node '
-                        f'{node_name}.{colorama.Style.RESET_ALL}')
-
-        # Kubernetes installation steps
-        if 'Deploying Kubernetes on head node' in log_line:
-            current_cluster_str = f' \\[{self.current_cluster}]' if (
-                self.current_cluster) else ''
-            self.status_display.update(
-                ux_utils.spinner_message(
-                    'Deploying SkyPilot runtime on head node'
-                    f'{current_cluster_str}',
-                    log_path=self.log_path,
-                    is_local=self.is_local))
-
-        if 'K3s deployed on head node' in log_line:
-            node_name = log_line.split('(')[-1].split(')')[0]
-            logger.info(
-                f'{ux_utils.INDENT_SYMBOL}{colorama.Fore.GREEN}'
-                f'✔ SkyPilot runtime successfully deployed on head node '
-                f'{node_name}.{colorama.Style.RESET_ALL}')
-
-        # Worker nodes
-        if 'Deploying Kubernetes on worker node' in log_line:
-            self.status_display.update(
-                ux_utils.spinner_message(
-                    'Deploying SkyPilot runtime on worker nodes' +
-                    (f' \\[{self.current_cluster}]'
-                     if self.current_cluster else ''),
-                    log_path=self.log_path,
-                    is_local=self.is_local))
-
-        if 'Kubernetes deployed on worker node' in log_line:
-            node_name = log_line.split('(')[-1].split(')')[0]
-            logger.info(
-                f'{ux_utils.INDENT_SYMBOL}{colorama.Fore.GREEN}'
-                '✔ SkyPilot runtime successfully deployed on worker node '
-                f'{node_name}.{colorama.Style.RESET_ALL}')
-
-        if 'Failed to deploy K3s on worker node' in log_line:
-            node_name = log_line.split('(')[-1].split(')')[0]
-            logger.info(f'{ux_utils.INDENT_SYMBOL}{colorama.Fore.RED}'
-                        '✗ Failed to deploy K3s on worker node '
-                        f'{node_name}.{colorama.Style.RESET_ALL}')
-
-        # Cluster configuration
-        if 'Configuring local kubectl to connect to the cluster...' in log_line:
-            self.status_display.update(
-                ux_utils.spinner_message('Setting up SkyPilot configuration' +
-                                         (f' \\[{self.current_cluster}]'
-                                          if self.current_cluster else ''),
-                                         log_path=self.log_path,
-                                         is_local=self.is_local))
-
-        if 'kubectl configured to connect to the cluster.' in log_line:
-            logger.info(f'{ux_utils.INDENT_SYMBOL}{colorama.Fore.GREEN}'
-                        '✔ SkyPilot configuration complete.'
-                        f'{colorama.Style.RESET_ALL}')
-
-        # GPU operator installation
-        if 'Installing Nvidia GPU Operator...' in log_line:
-            self.status_display.update(
-                ux_utils.spinner_message('Configuring Nvidia GPUs' +
-                                         (f' \\[{self.current_cluster}]'
-                                          if self.current_cluster else ''),
-                                         log_path=self.log_path,
-                                         is_local=self.is_local))
-
-        if 'GPU Operator installed.' in log_line:
-            logger.info(f'{ux_utils.INDENT_SYMBOL}{colorama.Fore.GREEN}'
-                        '✔ Nvidia GPUs configured successfully.'
-                        f'{colorama.Style.RESET_ALL}')
-
-        # Cleanup steps
-        if 'Cleaning up head node' in log_line:
-            self.status_display.update(
-                ux_utils.spinner_message('Cleaning up head node' +
-                                         (f' \\[{self.current_cluster}]'
-                                          if self.current_cluster else ''),
-                                         log_path=self.log_path,
-                                         is_local=self.is_local))
-
-        if 'Cleaning up worker node' in log_line:
-            self.status_display.update(
-                ux_utils.spinner_message('Cleaning up worker nodes' +
-                                         (f' \\[{self.current_cluster}]'
-                                          if self.current_cluster else ''),
-                                         log_path=self.log_path,
-                                         is_local=self.is_local))
-
-        # Handle node cleanup success messages
-        if 'Node' in log_line and 'cleaned up successfully' in log_line:
-            logger.info(f'{ux_utils.INDENT_SYMBOL}{colorama.Fore.GREEN}'
-                        f'{log_line.strip()}{colorama.Style.RESET_ALL}')
-
-        if 'Node' in log_line and 'Failed to clean up' in log_line:
-            logger.info(f'{ux_utils.INDENT_SYMBOL}{colorama.Fore.RED}'
-                        f'{log_line.strip()}{colorama.Style.RESET_ALL}')
-
-        if 'Failed to clean up worker node' in log_line:
-            logger.info(f'{ux_utils.INDENT_SYMBOL}{colorama.Fore.RED}'
-                        f'{log_line.strip()}{colorama.Style.RESET_ALL}')
-
-        # Final status for the cluster deployment
-        if 'Cluster deployment completed.' in log_line:
-            logger.info(f'{ux_utils.INDENT_SYMBOL}{colorama.Fore.GREEN}'
-                        '✔ SkyPilot runtime is up.'
-                        f'{colorama.Style.RESET_ALL}')
-
-        if 'Failed to deploy Kubernetes on the following nodes:' in log_line:
-            logger.info(log_line.strip())
-
-        if 'already exists in history. ' in log_line:
-            node_name = log_line.split('(')[-1].split(')')[0]
-            logger.info(f'{ux_utils.INDENT_SYMBOL}{colorama.Fore.YELLOW}'
-                        '✔ SkyPilot runtime already deployed on worker node '
-                        f'{node_name}. Skipping.{colorama.Style.RESET_ALL}')
-
-        if 'Failed to setup TCP forwarding on head node' in log_line:
-            node_name = log_line.split('(')[-1].split(')')[0]
-            logger.info(
-                f'{ux_utils.INDENT_SYMBOL}{colorama.Fore.RED}'
-                f'✗ Failed to setup TCP forwarding on head node {node_name}.'
-                f'{colorama.Style.RESET_ALL}')
-
-        if 'Error in deploying SSH Target' in log_line:
-            logger.info(f'{ux_utils.INDENT_LAST_SYMBOL}{colorama.Fore.RED}'
-                        f'{log_line.strip()}{colorama.Style.RESET_ALL}')
-
-    def __exit__(self, except_type: Optional[Type[BaseException]],
-                 except_value: Optional[BaseException],
-                 traceback: Optional[types.TracebackType]) -> None:
-        del except_type, except_value, traceback  # unused
-        self.status_display.stop()
-
-
 def create_table(field_names: List[str], **kwargs) -> prettytable.PrettyTable:
     """Creates table with default style."""
     border = kwargs.pop('border', False)
sky/utils/resource_checker.py
CHANGED
@@ -140,7 +140,7 @@ def _check_active_resources(resource_operations: List[Tuple[str, str]],
 
 def check_users_workspaces_active_resources(
         user_ids: List[str],
-        workspace_names: List[str]) -> Tuple[str, List[str]]:
+        workspace_names: List[str]) -> Tuple[str, List[str], Dict[str, str]]:
     """Check if all the active clusters or managed jobs in workspaces
     belong to the user_ids. If not, return the error message.
 
@@ -151,6 +151,7 @@ def check_users_workspaces_active_resources(
     Returns:
         resource_error_summary: str
         missed_users_names: List[str]
+        missed_user_dict: Dict[str, str]
     """
     all_clusters, all_managed_jobs = _get_active_resources_for_workspaces(
         workspace_names)
@@ -187,14 +188,14 @@ def check_users_workspaces_active_resources(
     if resource_errors:
         resource_error_summary = ' and '.join(resource_errors)
     missed_users_names = []
+    missed_user_dict = {}
     if missed_users:
         all_users = global_user_state.get_all_users()
-
-
-
-
-
-    return resource_error_summary, missed_users_names
+        for user in all_users:
+            if user.id in missed_users:
+                missed_users_names.append(user.name if user.name else user.id)
+                missed_user_dict[user.id] = user.name if user.name else user.id
+    return resource_error_summary, missed_users_names, missed_user_dict
 
 
 def _get_active_resources_for_workspaces(
@@ -276,9 +277,11 @@ def _get_active_resources(
     # pylint: disable=import-outside-toplevel
     from sky.jobs.server import core as managed_jobs_core
     try:
-        filtered_jobs, _, _, _ = managed_jobs_core.
-
-
+        filtered_jobs, _, _, _ = managed_jobs_core.queue_v2(
+            refresh=False,
+            skip_finished=True,
+            all_users=True,
+            fields=['job_id', 'user_hash', 'workspace'])
         return filtered_jobs
     except exceptions.ClusterNotUpError:
         logger.warning('All jobs should be finished.')