skypilot-nightly 1.0.0.dev20250905__py3-none-any.whl → 1.0.0.dev20251210__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sky/__init__.py +12 -2
- sky/adaptors/aws.py +27 -22
- sky/adaptors/common.py +25 -2
- sky/adaptors/coreweave.py +278 -0
- sky/adaptors/do.py +8 -2
- sky/adaptors/gcp.py +11 -0
- sky/adaptors/ibm.py +5 -2
- sky/adaptors/kubernetes.py +64 -0
- sky/adaptors/nebius.py +3 -1
- sky/adaptors/primeintellect.py +1 -0
- sky/adaptors/seeweb.py +183 -0
- sky/adaptors/shadeform.py +89 -0
- sky/adaptors/slurm.py +478 -0
- sky/admin_policy.py +20 -0
- sky/authentication.py +157 -263
- sky/backends/__init__.py +3 -2
- sky/backends/backend.py +11 -3
- sky/backends/backend_utils.py +630 -185
- sky/backends/cloud_vm_ray_backend.py +1111 -928
- sky/backends/local_docker_backend.py +9 -5
- sky/backends/task_codegen.py +971 -0
- sky/backends/wheel_utils.py +18 -0
- sky/catalog/__init__.py +8 -3
- sky/catalog/aws_catalog.py +4 -0
- sky/catalog/common.py +19 -1
- sky/catalog/data_fetchers/fetch_aws.py +102 -80
- sky/catalog/data_fetchers/fetch_gcp.py +30 -3
- sky/catalog/data_fetchers/fetch_nebius.py +9 -6
- sky/catalog/data_fetchers/fetch_runpod.py +698 -0
- sky/catalog/data_fetchers/fetch_seeweb.py +329 -0
- sky/catalog/data_fetchers/fetch_shadeform.py +142 -0
- sky/catalog/kubernetes_catalog.py +36 -32
- sky/catalog/primeintellect_catalog.py +95 -0
- sky/catalog/runpod_catalog.py +5 -1
- sky/catalog/seeweb_catalog.py +184 -0
- sky/catalog/shadeform_catalog.py +165 -0
- sky/catalog/slurm_catalog.py +243 -0
- sky/check.py +87 -46
- sky/client/cli/command.py +1004 -434
- sky/client/cli/flags.py +4 -2
- sky/{volumes/utils.py → client/cli/table_utils.py} +111 -13
- sky/client/cli/utils.py +79 -0
- sky/client/common.py +12 -2
- sky/client/sdk.py +188 -65
- sky/client/sdk_async.py +34 -33
- sky/cloud_stores.py +82 -3
- sky/clouds/__init__.py +8 -0
- sky/clouds/aws.py +337 -129
- sky/clouds/azure.py +24 -18
- sky/clouds/cloud.py +47 -13
- sky/clouds/cudo.py +16 -13
- sky/clouds/do.py +9 -7
- sky/clouds/fluidstack.py +12 -5
- sky/clouds/gcp.py +14 -7
- sky/clouds/hyperbolic.py +12 -5
- sky/clouds/ibm.py +12 -5
- sky/clouds/kubernetes.py +80 -45
- sky/clouds/lambda_cloud.py +12 -5
- sky/clouds/nebius.py +23 -9
- sky/clouds/oci.py +19 -12
- sky/clouds/paperspace.py +4 -1
- sky/clouds/primeintellect.py +317 -0
- sky/clouds/runpod.py +85 -24
- sky/clouds/scp.py +12 -8
- sky/clouds/seeweb.py +477 -0
- sky/clouds/shadeform.py +400 -0
- sky/clouds/slurm.py +578 -0
- sky/clouds/ssh.py +6 -3
- sky/clouds/utils/scp_utils.py +61 -50
- sky/clouds/vast.py +43 -27
- sky/clouds/vsphere.py +14 -16
- sky/core.py +296 -195
- sky/dashboard/out/404.html +1 -1
- sky/dashboard/out/_next/static/KYAhEFa3FTfq4JyKVgo-s/_buildManifest.js +1 -0
- sky/dashboard/out/_next/static/chunks/1141-9c810f01ff4f398a.js +11 -0
- sky/dashboard/out/_next/static/chunks/1871-7e202677c42f43fe.js +6 -0
- sky/dashboard/out/_next/static/chunks/2260-7703229c33c5ebd5.js +1 -0
- sky/dashboard/out/_next/static/chunks/2369.fc20f0c2c8ed9fe7.js +15 -0
- sky/dashboard/out/_next/static/chunks/2755.edd818326d489a1d.js +26 -0
- sky/dashboard/out/_next/static/chunks/3294.ddda8c6c6f9f24dc.js +1 -0
- sky/dashboard/out/_next/static/chunks/3785.7e245f318f9d1121.js +1 -0
- sky/dashboard/out/_next/static/chunks/{6601-06114c982db410b6.js → 3800-b589397dc09c5b4e.js} +1 -1
- sky/dashboard/out/_next/static/chunks/3850-fd5696f3bbbaddae.js +1 -0
- sky/dashboard/out/_next/static/chunks/4725.172ede95d1b21022.js +1 -0
- sky/dashboard/out/_next/static/chunks/4937.a2baa2df5572a276.js +15 -0
- sky/dashboard/out/_next/static/chunks/6212-7bd06f60ba693125.js +13 -0
- sky/dashboard/out/_next/static/chunks/6856-da20c5fd999f319c.js +1 -0
- sky/dashboard/out/_next/static/chunks/6990-09cbf02d3cd518c3.js +1 -0
- sky/dashboard/out/_next/static/chunks/7359-c8d04e06886000b3.js +30 -0
- sky/dashboard/out/_next/static/chunks/7615-019513abc55b3b47.js +1 -0
- sky/dashboard/out/_next/static/chunks/8640.5b9475a2d18c5416.js +16 -0
- sky/dashboard/out/_next/static/chunks/8969-452f9d5cbdd2dc73.js +1 -0
- sky/dashboard/out/_next/static/chunks/9025.fa408f3242e9028d.js +6 -0
- sky/dashboard/out/_next/static/chunks/9353-8369df1cf105221c.js +1 -0
- sky/dashboard/out/_next/static/chunks/9360.a536cf6b1fa42355.js +31 -0
- sky/dashboard/out/_next/static/chunks/9847.3aaca6bb33455140.js +30 -0
- sky/dashboard/out/_next/static/chunks/pages/_app-68b647e26f9d2793.js +34 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-33f525539665fdfd.js +16 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-a7565f586ef86467.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters-9e5d47818b9bdadd.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/{config-dfb9bf07b13045f4.js → config-718cdc365de82689.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/infra/{[context]-6563820e094f68ca.js → [context]-12c559ec4d81fdbd.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/{infra-aabba60d57826e0f.js → infra-d187cd0413d72475.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-895847b6cf200b04.js +16 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs/pools/[pool]-8d0f4655400b4eb9.js +21 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs-e5a98f17f8513a96.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/plugins/[...slug]-4f46050ca065d8f8.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/users-2f7646eb77785a2c.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/volumes-ef19d49c6d0e8500.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/workspaces/{[name]-af76bb06dbb3954f.js → [name]-96e0f298308da7e2.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/{workspaces-7598c33a746cdc91.js → workspaces-cb4da3abe08ebf19.js} +1 -1
- sky/dashboard/out/_next/static/chunks/webpack-fba3de387ff6bb08.js +1 -0
- sky/dashboard/out/_next/static/css/c5a4cfd2600fc715.css +3 -0
- sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
- sky/dashboard/out/clusters/[cluster].html +1 -1
- sky/dashboard/out/clusters.html +1 -1
- sky/dashboard/out/config.html +1 -1
- sky/dashboard/out/index.html +1 -1
- sky/dashboard/out/infra/[context].html +1 -1
- sky/dashboard/out/infra.html +1 -1
- sky/dashboard/out/jobs/[job].html +1 -1
- sky/dashboard/out/jobs/pools/[pool].html +1 -1
- sky/dashboard/out/jobs.html +1 -1
- sky/dashboard/out/plugins/[...slug].html +1 -0
- sky/dashboard/out/users.html +1 -1
- sky/dashboard/out/volumes.html +1 -1
- sky/dashboard/out/workspace/new.html +1 -1
- sky/dashboard/out/workspaces/[name].html +1 -1
- sky/dashboard/out/workspaces.html +1 -1
- sky/data/data_utils.py +92 -1
- sky/data/mounting_utils.py +177 -30
- sky/data/storage.py +200 -19
- sky/data/storage_utils.py +10 -45
- sky/exceptions.py +18 -7
- sky/execution.py +74 -31
- sky/global_user_state.py +605 -191
- sky/jobs/__init__.py +2 -0
- sky/jobs/client/sdk.py +101 -4
- sky/jobs/client/sdk_async.py +31 -5
- sky/jobs/constants.py +15 -8
- sky/jobs/controller.py +726 -284
- sky/jobs/file_content_utils.py +128 -0
- sky/jobs/log_gc.py +193 -0
- sky/jobs/recovery_strategy.py +250 -100
- sky/jobs/scheduler.py +271 -173
- sky/jobs/server/core.py +367 -114
- sky/jobs/server/server.py +81 -35
- sky/jobs/server/utils.py +89 -35
- sky/jobs/state.py +1498 -620
- sky/jobs/utils.py +771 -306
- sky/logs/agent.py +40 -5
- sky/logs/aws.py +9 -19
- sky/metrics/utils.py +282 -39
- sky/models.py +2 -0
- sky/optimizer.py +7 -6
- sky/provision/__init__.py +38 -1
- sky/provision/aws/config.py +34 -13
- sky/provision/aws/instance.py +5 -2
- sky/provision/azure/instance.py +5 -3
- sky/provision/common.py +22 -0
- sky/provision/cudo/instance.py +4 -3
- sky/provision/do/instance.py +4 -3
- sky/provision/docker_utils.py +112 -28
- sky/provision/fluidstack/instance.py +6 -5
- sky/provision/gcp/config.py +6 -1
- sky/provision/gcp/instance.py +4 -2
- sky/provision/hyperbolic/instance.py +4 -2
- sky/provision/instance_setup.py +66 -20
- sky/provision/kubernetes/__init__.py +2 -0
- sky/provision/kubernetes/config.py +7 -44
- sky/provision/kubernetes/constants.py +0 -1
- sky/provision/kubernetes/instance.py +609 -213
- sky/provision/kubernetes/manifests/fusermount-server-daemonset.yaml +1 -2
- sky/provision/kubernetes/network.py +12 -8
- sky/provision/kubernetes/network_utils.py +8 -25
- sky/provision/kubernetes/utils.py +422 -422
- sky/provision/kubernetes/volume.py +150 -18
- sky/provision/lambda_cloud/instance.py +16 -13
- sky/provision/nebius/instance.py +6 -2
- sky/provision/nebius/utils.py +103 -86
- sky/provision/oci/instance.py +4 -2
- sky/provision/paperspace/instance.py +4 -3
- sky/provision/primeintellect/__init__.py +10 -0
- sky/provision/primeintellect/config.py +11 -0
- sky/provision/primeintellect/instance.py +454 -0
- sky/provision/primeintellect/utils.py +398 -0
- sky/provision/provisioner.py +45 -15
- sky/provision/runpod/__init__.py +2 -0
- sky/provision/runpod/instance.py +4 -3
- sky/provision/runpod/volume.py +69 -13
- sky/provision/scp/instance.py +307 -130
- sky/provision/seeweb/__init__.py +11 -0
- sky/provision/seeweb/config.py +13 -0
- sky/provision/seeweb/instance.py +812 -0
- sky/provision/shadeform/__init__.py +11 -0
- sky/provision/shadeform/config.py +12 -0
- sky/provision/shadeform/instance.py +351 -0
- sky/provision/shadeform/shadeform_utils.py +83 -0
- sky/provision/slurm/__init__.py +12 -0
- sky/provision/slurm/config.py +13 -0
- sky/provision/slurm/instance.py +572 -0
- sky/provision/slurm/utils.py +583 -0
- sky/provision/vast/instance.py +9 -4
- sky/provision/vast/utils.py +10 -6
- sky/provision/volume.py +164 -0
- sky/provision/vsphere/common/ssl_helper.py +1 -1
- sky/provision/vsphere/common/vapiconnect.py +2 -1
- sky/provision/vsphere/common/vim_utils.py +3 -2
- sky/provision/vsphere/instance.py +8 -6
- sky/provision/vsphere/vsphere_utils.py +8 -1
- sky/resources.py +11 -3
- sky/schemas/api/responses.py +107 -6
- sky/schemas/db/global_user_state/008_skylet_ssh_tunnel_metadata.py +34 -0
- sky/schemas/db/global_user_state/009_last_activity_and_launched_at.py +89 -0
- sky/schemas/db/global_user_state/010_save_ssh_key.py +66 -0
- sky/schemas/db/global_user_state/011_is_ephemeral.py +34 -0
- sky/schemas/db/kv_cache/001_initial_schema.py +29 -0
- sky/schemas/db/serve_state/002_yaml_content.py +34 -0
- sky/schemas/db/skypilot_config/001_initial_schema.py +30 -0
- sky/schemas/db/spot_jobs/002_cluster_pool.py +3 -3
- sky/schemas/db/spot_jobs/004_job_file_contents.py +42 -0
- sky/schemas/db/spot_jobs/005_logs_gc.py +38 -0
- sky/schemas/db/spot_jobs/006_controller_pid_started_at.py +34 -0
- sky/schemas/db/spot_jobs/007_config_file_content.py +34 -0
- sky/schemas/generated/jobsv1_pb2.py +86 -0
- sky/schemas/generated/jobsv1_pb2.pyi +254 -0
- sky/schemas/generated/jobsv1_pb2_grpc.py +542 -0
- sky/schemas/generated/managed_jobsv1_pb2.py +76 -0
- sky/schemas/generated/managed_jobsv1_pb2.pyi +278 -0
- sky/schemas/generated/managed_jobsv1_pb2_grpc.py +278 -0
- sky/schemas/generated/servev1_pb2.py +58 -0
- sky/schemas/generated/servev1_pb2.pyi +115 -0
- sky/schemas/generated/servev1_pb2_grpc.py +322 -0
- sky/serve/autoscalers.py +2 -0
- sky/serve/client/impl.py +55 -21
- sky/serve/constants.py +4 -3
- sky/serve/controller.py +17 -11
- sky/serve/load_balancing_policies.py +1 -1
- sky/serve/replica_managers.py +219 -142
- sky/serve/serve_rpc_utils.py +179 -0
- sky/serve/serve_state.py +63 -54
- sky/serve/serve_utils.py +145 -109
- sky/serve/server/core.py +46 -25
- sky/serve/server/impl.py +311 -162
- sky/serve/server/server.py +21 -19
- sky/serve/service.py +84 -68
- sky/serve/service_spec.py +45 -7
- sky/server/auth/loopback.py +38 -0
- sky/server/auth/oauth2_proxy.py +12 -7
- sky/server/common.py +47 -24
- sky/server/config.py +62 -28
- sky/server/constants.py +9 -1
- sky/server/daemons.py +109 -38
- sky/server/metrics.py +76 -96
- sky/server/middleware_utils.py +166 -0
- sky/server/plugins.py +222 -0
- sky/server/requests/executor.py +384 -145
- sky/server/requests/payloads.py +83 -19
- sky/server/requests/preconditions.py +15 -13
- sky/server/requests/request_names.py +123 -0
- sky/server/requests/requests.py +511 -157
- sky/server/requests/serializers/decoders.py +48 -17
- sky/server/requests/serializers/encoders.py +102 -20
- sky/server/requests/serializers/return_value_serializers.py +60 -0
- sky/server/requests/threads.py +117 -0
- sky/server/rest.py +116 -24
- sky/server/server.py +497 -179
- sky/server/server_utils.py +30 -0
- sky/server/stream_utils.py +219 -45
- sky/server/uvicorn.py +30 -19
- sky/setup_files/MANIFEST.in +6 -1
- sky/setup_files/alembic.ini +8 -0
- sky/setup_files/dependencies.py +64 -19
- sky/setup_files/setup.py +44 -44
- sky/sky_logging.py +13 -5
- sky/skylet/attempt_skylet.py +116 -24
- sky/skylet/configs.py +3 -1
- sky/skylet/constants.py +139 -29
- sky/skylet/events.py +74 -14
- sky/skylet/executor/__init__.py +1 -0
- sky/skylet/executor/slurm.py +189 -0
- sky/skylet/job_lib.py +143 -105
- sky/skylet/log_lib.py +252 -8
- sky/skylet/log_lib.pyi +47 -7
- sky/skylet/providers/ibm/node_provider.py +12 -8
- sky/skylet/providers/ibm/vpc_provider.py +13 -12
- sky/skylet/runtime_utils.py +21 -0
- sky/skylet/services.py +524 -0
- sky/skylet/skylet.py +27 -2
- sky/skylet/subprocess_daemon.py +104 -28
- sky/skypilot_config.py +99 -79
- sky/ssh_node_pools/constants.py +12 -0
- sky/ssh_node_pools/core.py +40 -3
- sky/ssh_node_pools/deploy/__init__.py +4 -0
- sky/ssh_node_pools/deploy/deploy.py +952 -0
- sky/ssh_node_pools/deploy/tunnel_utils.py +199 -0
- sky/ssh_node_pools/deploy/utils.py +173 -0
- sky/ssh_node_pools/server.py +20 -21
- sky/{utils/kubernetes/ssh_utils.py → ssh_node_pools/utils.py} +9 -6
- sky/task.py +221 -104
- sky/templates/aws-ray.yml.j2 +1 -0
- sky/templates/azure-ray.yml.j2 +1 -0
- sky/templates/cudo-ray.yml.j2 +1 -0
- sky/templates/do-ray.yml.j2 +1 -0
- sky/templates/fluidstack-ray.yml.j2 +1 -0
- sky/templates/gcp-ray.yml.j2 +1 -0
- sky/templates/hyperbolic-ray.yml.j2 +1 -0
- sky/templates/ibm-ray.yml.j2 +2 -1
- sky/templates/jobs-controller.yaml.j2 +3 -0
- sky/templates/kubernetes-ray.yml.j2 +204 -55
- sky/templates/lambda-ray.yml.j2 +1 -0
- sky/templates/nebius-ray.yml.j2 +3 -0
- sky/templates/oci-ray.yml.j2 +1 -0
- sky/templates/paperspace-ray.yml.j2 +1 -0
- sky/templates/primeintellect-ray.yml.j2 +72 -0
- sky/templates/runpod-ray.yml.j2 +1 -0
- sky/templates/scp-ray.yml.j2 +1 -0
- sky/templates/seeweb-ray.yml.j2 +171 -0
- sky/templates/shadeform-ray.yml.j2 +73 -0
- sky/templates/slurm-ray.yml.j2 +85 -0
- sky/templates/vast-ray.yml.j2 +2 -0
- sky/templates/vsphere-ray.yml.j2 +1 -0
- sky/templates/websocket_proxy.py +188 -43
- sky/usage/usage_lib.py +16 -4
- sky/users/model.conf +1 -1
- sky/users/permission.py +84 -44
- sky/users/rbac.py +31 -3
- sky/utils/accelerator_registry.py +6 -3
- sky/utils/admin_policy_utils.py +18 -5
- sky/utils/annotations.py +128 -6
- sky/utils/asyncio_utils.py +78 -0
- sky/utils/atomic.py +1 -1
- sky/utils/auth_utils.py +153 -0
- sky/utils/cli_utils/status_utils.py +12 -7
- sky/utils/cluster_utils.py +28 -6
- sky/utils/command_runner.py +283 -30
- sky/utils/command_runner.pyi +63 -7
- sky/utils/common.py +3 -1
- sky/utils/common_utils.py +55 -7
- sky/utils/config_utils.py +1 -14
- sky/utils/context.py +127 -40
- sky/utils/context_utils.py +73 -18
- sky/utils/controller_utils.py +229 -70
- sky/utils/db/db_utils.py +95 -18
- sky/utils/db/kv_cache.py +149 -0
- sky/utils/db/migration_utils.py +24 -7
- sky/utils/env_options.py +4 -0
- sky/utils/git.py +559 -1
- sky/utils/kubernetes/create_cluster.sh +15 -30
- sky/utils/kubernetes/delete_cluster.sh +10 -7
- sky/utils/kubernetes/generate_kind_config.py +6 -66
- sky/utils/kubernetes/gpu_labeler.py +13 -3
- sky/utils/kubernetes/k8s_gpu_labeler_job.yaml +2 -1
- sky/utils/kubernetes/k8s_gpu_labeler_setup.yaml +16 -16
- sky/utils/kubernetes/kubernetes_deploy_utils.py +187 -260
- sky/utils/kubernetes/rsync_helper.sh +11 -3
- sky/utils/kubernetes/ssh-tunnel.sh +7 -376
- sky/utils/kubernetes_enums.py +7 -15
- sky/utils/lock_events.py +4 -4
- sky/utils/locks.py +128 -31
- sky/utils/log_utils.py +0 -319
- sky/utils/resource_checker.py +13 -10
- sky/utils/resources_utils.py +53 -29
- sky/utils/rich_utils.py +8 -4
- sky/utils/schemas.py +138 -52
- sky/utils/subprocess_utils.py +17 -4
- sky/utils/thread_utils.py +91 -0
- sky/utils/timeline.py +2 -1
- sky/utils/ux_utils.py +35 -1
- sky/utils/volume.py +88 -4
- sky/utils/yaml_utils.py +9 -0
- sky/volumes/client/sdk.py +48 -10
- sky/volumes/server/core.py +59 -22
- sky/volumes/server/server.py +46 -17
- sky/volumes/volume.py +54 -42
- sky/workspaces/core.py +57 -21
- sky/workspaces/server.py +13 -12
- sky_templates/README.md +3 -0
- sky_templates/__init__.py +3 -0
- sky_templates/ray/__init__.py +0 -0
- sky_templates/ray/start_cluster +183 -0
- sky_templates/ray/stop_cluster +75 -0
- {skypilot_nightly-1.0.0.dev20250905.dist-info → skypilot_nightly-1.0.0.dev20251210.dist-info}/METADATA +343 -65
- skypilot_nightly-1.0.0.dev20251210.dist-info/RECORD +629 -0
- skypilot_nightly-1.0.0.dev20251210.dist-info/top_level.txt +2 -0
- sky/client/cli/git.py +0 -549
- sky/dashboard/out/_next/static/chunks/1121-408ed10b2f9fce17.js +0 -1
- sky/dashboard/out/_next/static/chunks/1141-943efc7aff0f0c06.js +0 -1
- sky/dashboard/out/_next/static/chunks/1836-37fede578e2da5f8.js +0 -40
- sky/dashboard/out/_next/static/chunks/3015-86cabed5d4669ad0.js +0 -1
- sky/dashboard/out/_next/static/chunks/3294.c80326aec9bfed40.js +0 -6
- sky/dashboard/out/_next/static/chunks/3785.4872a2f3aa489880.js +0 -1
- sky/dashboard/out/_next/static/chunks/3850-ff4a9a69d978632b.js +0 -1
- sky/dashboard/out/_next/static/chunks/4045.b30465273dc5e468.js +0 -21
- sky/dashboard/out/_next/static/chunks/4676-9da7fdbde90b5549.js +0 -10
- sky/dashboard/out/_next/static/chunks/4725.10f7a9a5d3ea8208.js +0 -1
- sky/dashboard/out/_next/static/chunks/5339.3fda4a4010ff4e06.js +0 -51
- sky/dashboard/out/_next/static/chunks/6135-4b4d5e824b7f9d3c.js +0 -1
- sky/dashboard/out/_next/static/chunks/649.b9d7f7d10c1b8c53.js +0 -45
- sky/dashboard/out/_next/static/chunks/6856-dca7962af4814e1b.js +0 -1
- sky/dashboard/out/_next/static/chunks/6990-08b2a1cae076a943.js +0 -1
- sky/dashboard/out/_next/static/chunks/7325.b4bc99ce0892dcd5.js +0 -6
- sky/dashboard/out/_next/static/chunks/754-d0da8ab45f9509e9.js +0 -18
- sky/dashboard/out/_next/static/chunks/7669.1f5d9a402bf5cc42.js +0 -36
- sky/dashboard/out/_next/static/chunks/8969-0be3036bf86f8256.js +0 -1
- sky/dashboard/out/_next/static/chunks/9025.c12318fb6a1a9093.js +0 -6
- sky/dashboard/out/_next/static/chunks/9037-fa1737818d0a0969.js +0 -6
- sky/dashboard/out/_next/static/chunks/pages/_app-ce361c6959bc2001.js +0 -34
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-1cbba24bd1bd35f8.js +0 -16
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-0b4b35dc1dfe046c.js +0 -16
- sky/dashboard/out/_next/static/chunks/pages/clusters-469814d711d63b1b.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-dd64309c3fe67ed2.js +0 -11
- sky/dashboard/out/_next/static/chunks/pages/jobs/pools/[pool]-07349868f7905d37.js +0 -16
- sky/dashboard/out/_next/static/chunks/pages/jobs-1f70d9faa564804f.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/users-018bf31cda52e11b.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/volumes-739726d6b823f532.js +0 -1
- sky/dashboard/out/_next/static/chunks/webpack-4fe903277b57b523.js +0 -1
- sky/dashboard/out/_next/static/css/4614e06482d7309e.css +0 -3
- sky/dashboard/out/_next/static/mS-4qZPSkRuA1u-g2wQhg/_buildManifest.js +0 -1
- sky/templates/kubernetes-ssh-jump.yml.j2 +0 -94
- sky/utils/kubernetes/cleanup-tunnel.sh +0 -62
- sky/utils/kubernetes/deploy_remote_cluster.py +0 -1299
- sky/utils/kubernetes/ssh_jump_lifecycle_manager.py +0 -191
- skypilot_nightly-1.0.0.dev20250905.dist-info/RECORD +0 -547
- skypilot_nightly-1.0.0.dev20250905.dist-info/top_level.txt +0 -1
- /sky/dashboard/out/_next/static/{mS-4qZPSkRuA1u-g2wQhg → KYAhEFa3FTfq4JyKVgo-s}/_ssgManifest.js +0 -0
- {skypilot_nightly-1.0.0.dev20250905.dist-info → skypilot_nightly-1.0.0.dev20251210.dist-info}/WHEEL +0 -0
- {skypilot_nightly-1.0.0.dev20250905.dist-info → skypilot_nightly-1.0.0.dev20251210.dist-info}/entry_points.txt +0 -0
- {skypilot_nightly-1.0.0.dev20250905.dist-info → skypilot_nightly-1.0.0.dev20251210.dist-info}/licenses/LICENSE +0 -0
sky/provision/primeintellect/instance.py
@@ -0,0 +1,454 @@
+"""Prime Intellect instance provisioning."""
+import time
+from typing import Any, Dict, List, Optional, Tuple
+
+from sky import exceptions
+from sky import sky_logging
+from sky.provision import common
+from sky.provision.primeintellect import utils
+from sky.utils import common_utils
+from sky.utils import status_lib
+from sky.utils import ux_utils
+
+# The maximum number of times to poll for the status of an operation.
+POLL_INTERVAL = 5
+MAX_POLLS = 60 // POLL_INTERVAL
+# Terminating instances can take several minutes, so we increase the timeout
+MAX_POLLS_FOR_UP_OR_TERMINATE = MAX_POLLS * 16
+
+# status filters
+# PROVISIONING, PENDING, ACTIVE, STOPPED, ERROR, DELETING, TERMINATED
+
+logger = sky_logging.init_logger(__name__)
+
+# SSH connection readiness polling constants
+SSH_CONN_MAX_RETRIES = 6
+SSH_CONN_RETRY_INTERVAL_SECONDS = 10
+
+
+def _filter_instances(cluster_name_on_cloud: str,
+                      status_filters: Optional[List[str]]) -> Dict[str, Any]:
+    client = utils.PrimeIntellectAPIClient()
+    instances = client.list_instances()
+    # TODO: verify names are we using it?
+    possible_names = [
+        f'{cluster_name_on_cloud}-head',
+        f'{cluster_name_on_cloud}-worker',
+    ]
+
+    filtered_instances = {}
+    for instance in instances:
+        instance_id = instance['id']
+        if (status_filters is not None and
+                instance['status'] not in status_filters):
+            continue
+        instance_name = instance.get('name')
+        if instance_name and instance_name in possible_names:
+            filtered_instances[instance_id] = instance
+    return filtered_instances
+
+
+def _get_instance_info(instance_id: str) -> Dict[str, Any]:
+    client = utils.PrimeIntellectAPIClient()
+    return client.get_instance_details(instance_id)
+
+
+def _get_head_instance_id(instances: Dict[str, Any]) -> Optional[str]:
+    head_instance_id = None
+    for inst_id, inst in instances.items():
+        if inst['name'].endswith('-head'):
+            head_instance_id = inst_id
+            break
+    return head_instance_id
+
+
+# Helper is available as utils.parse_ssh_connection.
+
+
+def run_instances(region: str, cluster_name: str, cluster_name_on_cloud: str,
+                  config: common.ProvisionConfig) -> common.ProvisionRecord:
+    """Runs instances for the given cluster."""
+    del cluster_name  # unused
+    pending_status = [
+        'PROVISIONING',
+        'PENDING',
+    ]
+    newly_started_instances = _filter_instances(cluster_name_on_cloud,
+                                                pending_status)
+    client = utils.PrimeIntellectAPIClient()
+
+    while True:
+        instances = _filter_instances(cluster_name_on_cloud, pending_status)
+        if not instances:
+            break
+        instance_statuses = [
+            instance['status'] for instance in instances.values()
+        ]
+        logger.info(f'Waiting for {len(instances)} instances to be ready: '
+                    f'{instance_statuses}')
+        time.sleep(POLL_INTERVAL)
+
+    exist_instances = _filter_instances(cluster_name_on_cloud,
+                                        status_filters=pending_status)
+    if len(exist_instances) > config.count:
+        raise RuntimeError(
+            f'Cluster {cluster_name_on_cloud} already has '
+            f'{len(exist_instances)} nodes, but {config.count} are required.')
+
+    exist_instances = _filter_instances(cluster_name_on_cloud,
+                                        status_filters=['ACTIVE'])
+    head_instance_id = _get_head_instance_id(exist_instances)
+    to_start_count = config.count - len(exist_instances)
+    if to_start_count < 0:
+        raise RuntimeError(
+            f'Cluster {cluster_name_on_cloud} already has '
+            f'{len(exist_instances)} nodes, but {config.count} are required.')
+    if to_start_count == 0:
+        if head_instance_id is None:
+            head_instance_id = list(exist_instances.keys())[0]
+            # TODO: implement rename pod
+            # client.rename(
+            #     instance_id=head_instance_id,
+            #     name=f'{cluster_name_on_cloud}-head',
+            # )
+        assert head_instance_id is not None, (
+            'head_instance_id should not be None')
+        logger.info(f'Cluster {cluster_name_on_cloud} already has '
+                    f'{len(exist_instances)} nodes, no need to start more.')
+        return common.ProvisionRecord(
+            provider_name='primeintellect',
+            cluster_name=cluster_name_on_cloud,
+            region=region,
+            zone=config.provider_config['zones'],
+            head_instance_id=head_instance_id,
+            resumed_instance_ids=list(newly_started_instances.keys()),
+            created_instance_ids=[],
+        )
+
+    created_instance_ids = []
+    for _ in range(to_start_count):
+        node_type = 'head' if head_instance_id is None else 'worker'
+        try:
+            # Extract vCPUs and memory from instance type
+            # Format: provider__gpu_prefix_base_type__vcpus__memory[_SPOT]
+            instance_type = config.node_config['InstanceType']
+            disk_size = config.node_config.get('DiskSize')
+            vcpus = -1
+            memory = -1
+            try:
+                # Split by '__'
+                parts = instance_type.split('__')
+
+                # Format: provider__gpu_info__vcpus__memory[_SPOT]
+                # For: primecompute__8xH100_80GB__104__752_SPOT
+                # parts[0] = primecompute, parts[1] = 8xH100_80GB,
+                # parts[2] = 104, parts[3] = 752, parts[4] = SPOT
+                if len(parts) >= 4:
+                    vcpu_str = parts[2]
+                    memory_str = parts[3]
+                    vcpus = int(vcpu_str)
+                    memory = int(memory_str)
+            except (ValueError, IndexError) as e:
+                # If parsing fails, try to get from catalog
+                logger.warning(
+                    f'Failed to parse vCPUs/memory from instance type '
+                    f'{instance_type}: {e}')
+
+            params = {
+                'name': f'{cluster_name_on_cloud}-{node_type}',
+                'instance_type': config.node_config['InstanceType'],
+                'region': region,
+                'availability_zone': config.provider_config['zones'],
+                'disk_size': disk_size,
+                'vcpus': vcpus,
+                'memory': memory,
+            }
+
+            response = client.launch(**params)
+            instance_id = response['id']
+        except utils.PrimeintellectResourcesUnavailableError as e:
+            # Resource unavailability error - provide specific message
+            instance_type = config.node_config['InstanceType']
+            region_str = (f' in region {region}'
+                          if region != 'PLACEHOLDER' else '')
+            error_msg = (
+                f'Resources are currently unavailable on Prime Intellect. '
+                f'No {instance_type} instances are available{region_str}. '
+                f'Please try again later or consider using a different '
+                f'instance type or region. Details: {str(e)}')
+            logger.warning(f'Resource unavailability error: {e}')
+            with ux_utils.print_exception_no_traceback():
+                raise exceptions.ResourcesUnavailableError(error_msg) from e
+        except utils.PrimeintellectAPIError as e:
+            # Other API errors - provide specific message
+            instance_type = config.node_config['InstanceType']
+            region_str = (f' in region {region}'
+                          if region != 'PLACEHOLDER' else '')
+            error_msg = (f'Failed to launch {instance_type} instance on Prime '
+                         f'Intellect{region_str}. Details: {str(e)}')
+            logger.warning(f'API error during instance launch: {e}')
+            with ux_utils.print_exception_no_traceback():
+                raise exceptions.ResourcesUnavailableError(error_msg) from e
+        except Exception as e:  # pylint: disable=broad-except
+            # Generic error handling for unexpected errors
+            instance_type = config.node_config['InstanceType']
+            region_str = (f' in region {region}'
+                          if region != 'PLACEHOLDER' else '')
+            error_msg = (
+                f'Unexpected error while launching {instance_type} instance '
+                f'on Prime Intellect{region_str}. Details: '
+                f'{common_utils.format_exception(e, use_bracket=False)}')
+            logger.warning(f'Unexpected error during instance launch: {e}')
+            with ux_utils.print_exception_no_traceback():
+                raise exceptions.ResourcesUnavailableError(error_msg) from e
+        logger.info(f'Launched instance {instance_id}.')
+        created_instance_ids.append(instance_id)
+        if head_instance_id is None:
+            head_instance_id = instance_id
+
+    # Wait for instances to be ready.
+    for _ in range(MAX_POLLS_FOR_UP_OR_TERMINATE):
+        instances = _filter_instances(cluster_name_on_cloud, ['ACTIVE'])
+        logger.info('Waiting for instances to be ready: '
+                    f'({len(instances)}/{config.count}).')
+        if len(instances) == config.count:
+            break
+
+        time.sleep(POLL_INTERVAL)
+    else:
+        # Failed to launch config.count of instances after max retries
+        # Provide more specific error message
+        instance_type = config.node_config['InstanceType']
+        region_str = (f' in region {region}' if region != 'PLACEHOLDER' else '')
+        active_instances = len(
+            _filter_instances(cluster_name_on_cloud, ['ACTIVE']))
+        error_msg = (
+            f'Timed out waiting for {instance_type} instances to become '
+            f'ready on Prime Intellect{region_str}. Only {active_instances} '
+            f'out of {config.count} instances became active. This may '
+            f'indicate capacity issues or slow provisioning. Please try '
+            f'again later or consider using a different instance type or '
+            f'region.')
+        logger.warning(error_msg)
+        with ux_utils.print_exception_no_traceback():
+            raise exceptions.ResourcesUnavailableError(error_msg)
+    assert head_instance_id is not None, 'head_instance_id should not be None'
+    return common.ProvisionRecord(
+        provider_name='primeintellect',
+        cluster_name=cluster_name_on_cloud,
+        region=region,
+        zone=config.provider_config['zones'],
+        head_instance_id=head_instance_id,
+        resumed_instance_ids=[],
+        created_instance_ids=created_instance_ids,
+    )
+
+
+def wait_instances(region: str, cluster_name_on_cloud: str,
+                   state: Optional[status_lib.ClusterStatus]) -> None:
+    del region, cluster_name_on_cloud, state
+
+
+def stop_instances(
+    cluster_name_on_cloud: str,
+    provider_config: Optional[Dict[str, Any]] = None,
+    worker_only: bool = False,
+) -> None:
+    raise NotImplementedError()
+
+
+def terminate_instances(
+    cluster_name_on_cloud: str,
+    provider_config: Optional[Dict[str, Any]] = None,
+    worker_only: bool = False,
+) -> None:
+    """See sky/provision/__init__.py"""
+    del provider_config  # unused
+    client = utils.PrimeIntellectAPIClient()
+    instances = _filter_instances(cluster_name_on_cloud, None)
+
+    # Log if no instances found
+    if not instances:
+        logger.info(f'No instances found for cluster {cluster_name_on_cloud}')
+        return
+
+    # Filter out already terminated instances
+    non_terminated_instances = {
+        inst_id: inst
+        for inst_id, inst in instances.items()
+        if inst['status'] not in ['TERMINATED', 'DELETING']
+    }
+
+    if not non_terminated_instances:
+        logger.info(
+            f'All instances for cluster {cluster_name_on_cloud} are already '
+            f'terminated or being deleted')
+        return
+
+    # Log what we're about to terminate
+    instance_names = [
+        inst['name'] for inst in non_terminated_instances.values()
+    ]
+    logger.info(
+        f'Terminating {len(non_terminated_instances)} instances for cluster '
+        f'{cluster_name_on_cloud}: {instance_names}')
+
+    # Terminate each instance
+    terminated_instances = []
+    for inst_id, inst in non_terminated_instances.items():
+        status = inst['status']
+        logger.debug(f'Terminating instance {inst_id} (status: {status})')
+        if worker_only and inst['name'].endswith('-head'):
+            continue
+        try:
+            client.remove(inst_id)
+            terminated_instances.append(inst_id)
+            name = inst['name']
+            logger.info(
+                f'Successfully initiated termination of instance {inst_id} '
+                f'({name})')
+        except Exception as e:  # pylint: disable=broad-except
+            with ux_utils.print_exception_no_traceback():
+                raise RuntimeError(
+                    f'Failed to terminate instance {inst_id}: '
+                    f'{common_utils.format_exception(e, use_bracket=False)}'
+                ) from e
+
+    # Wait for instances to be terminated
+    if not terminated_instances:
+        logger.info(
+            'No instances were terminated (worker_only=True and only head '
+            'node found)')
+        return
+
+    logger.info(f'Waiting for {len(terminated_instances)} instances to be '
+                f'terminated...')
+    for _ in range(MAX_POLLS_FOR_UP_OR_TERMINATE):
+        remaining_instances = _filter_instances(cluster_name_on_cloud, None)
+
+        # Check if all terminated instances are gone
+        still_exist = [
+            inst_id for inst_id in terminated_instances
+            if inst_id in remaining_instances
+        ]
+        if not still_exist:
+            logger.info('All instances have been successfully terminated')
+            break
+
+        # Log status of remaining instances
+        remaining_statuses = [(inst_id, remaining_instances[inst_id]['status'])
+                              for inst_id in still_exist]
+        logger.info(
+            f'Waiting for termination... {len(still_exist)} instances still '
+            f'exist: {remaining_statuses}')
+        time.sleep(POLL_INTERVAL)
+    else:
+        # Timeout reached
+        remaining_instances = _filter_instances(cluster_name_on_cloud, None)
+        still_exist = [
+            inst_id for inst_id in terminated_instances
+            if inst_id in remaining_instances
+        ]
+        if still_exist:
+            logger.warning(
+                f'Timeout reached. {len(still_exist)} instances may still be '
+                f'terminating: {still_exist}')
+        else:
+            logger.info('All instances have been successfully terminated')
+
+
+def get_cluster_info(
+        region: str,
+        cluster_name_on_cloud: str,
+        provider_config: Optional[Dict[str, Any]] = None) -> common.ClusterInfo:
+    del region  # unused
+    running_instances = _filter_instances(cluster_name_on_cloud, ['ACTIVE'])
+    instances: Dict[str, List[common.InstanceInfo]] = {}
+    head_instance_id = None
+    head_ssh_user = None
+    for instance_id, instance in running_instances.items():
+        retry_count = 0
+        max_retries = SSH_CONN_MAX_RETRIES
+        while (instance.get('sshConnection') is None and
+               retry_count < max_retries):
+            name = instance.get('name')
+            print(f'SSH connection to {name} is not ready, waiting '
+                  f'{SSH_CONN_RETRY_INTERVAL_SECONDS} seconds... '
+                  f'(attempt {retry_count + 1}/{max_retries})')
+            time.sleep(SSH_CONN_RETRY_INTERVAL_SECONDS)
+            retry_count += 1
+            running_instances[instance_id] = _get_instance_info(instance_id)
+
+        if instance.get('sshConnection') is not None:
+            print('SSH connection is ready!')
+        else:
+            raise Exception(
+                f'Failed to establish SSH connection after {max_retries} '
+                f'attempts')
+
+        assert instance.get(
+            'sshConnection'), 'sshConnection cannot be null anymore'
+
+        ssh_connection = instance['sshConnection']
+        _, ssh_port = utils.parse_ssh_connection(ssh_connection)
+
+        external_ip = instance['ip']
+        if isinstance(external_ip, list):
+            external_ip = external_ip[0]
+
+        instances[instance_id] = [
+            common.InstanceInfo(
+                instance_id=instance_id,
+                internal_ip='NOT_SUPPORTED',
+                external_ip=external_ip,
+                ssh_port=ssh_port,
+                tags={'provider': instance['providerType']},
+            )
+        ]
+        if instance['name'].endswith('-head'):
+            head_instance_id = instance_id
+            parsed_user_for_user, _ = utils.parse_ssh_connection(ssh_connection)
+            head_ssh_user = parsed_user_for_user or 'ubuntu'
+
+    return common.ClusterInfo(
+        instances=instances,
+        head_instance_id=head_instance_id,
+        provider_name='primeintellect',
+        provider_config=provider_config,
+        ssh_user=head_ssh_user,
+    )
+
+
+def query_instances(
+    cluster_name_on_cloud: str,
+    provider_config: Optional[Dict[str, Any]] = None,
+    non_terminated_only: bool = True,
+) -> Dict[str, Tuple[Optional['status_lib.ClusterStatus'], Optional[str]]]:
+    """See sky/provision/__init__.py"""
+    assert provider_config is not None, (cluster_name_on_cloud, provider_config)
+    instances = _filter_instances(cluster_name_on_cloud, None)
+
+    status_map = {
+        'PENDING': status_lib.ClusterStatus.INIT,
+        'ERROR': status_lib.ClusterStatus.INIT,
+        'ACTIVE': status_lib.ClusterStatus.UP,
+        'STOPPED': status_lib.ClusterStatus.STOPPED,
+        'DELETING': None,  # Being deleted - should be filtered out
+        'TERMINATED': None,  # Already terminated - should be filtered out
+    }
+    statuses: Dict[str, Tuple[Optional[status_lib.ClusterStatus],
+                              Optional[str]]] = {}
+    for inst_id, inst in instances.items():
+        status = status_map[inst['status']]
+        if non_terminated_only and status is None:
+            continue
+        statuses[inst_id] = (status, None)
+    return statuses
+
+
+def cleanup_ports(
+    cluster_name_on_cloud: str,
+    ports: List[str],
+    provider_config: Optional[Dict[str, Any]] = None,
+) -> None:
+    del cluster_name_on_cloud, ports, provider_config  # Unused.
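
Note on the hunk above: the launch path derives vCPU and memory counts from the catalog instance-type name itself (format provider__gpu_info__vcpus__memory[_SPOT], per the in-code comments) rather than from a separate API lookup. The following is a minimal, standalone Python sketch of that parsing convention; the example instance-type string is hypothetical and the snippet is not part of the package.

    # Hypothetical instance-type name following the convention documented above.
    instance_type = 'primecompute__8xH100_80GB__104__752'

    parts = instance_type.split('__')
    # parts == ['primecompute', '8xH100_80GB', '104', '752']
    vcpus, memory = -1, -1  # sentinel values, as in the provisioner
    if len(parts) >= 4:
        try:
            vcpus = int(parts[2])   # 104
            memory = int(parts[3])  # 752
        except ValueError:
            # A field that fails int() parsing (e.g. '752_SPOT') leaves its
            # sentinel in place, mirroring the fallback in run_instances().
            pass
    print(vcpus, memory)  # -> 104 752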