skypilot-nightly 1.0.0.dev20250905__py3-none-any.whl → 1.0.0.dev20251210__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sky/__init__.py +12 -2
- sky/adaptors/aws.py +27 -22
- sky/adaptors/common.py +25 -2
- sky/adaptors/coreweave.py +278 -0
- sky/adaptors/do.py +8 -2
- sky/adaptors/gcp.py +11 -0
- sky/adaptors/ibm.py +5 -2
- sky/adaptors/kubernetes.py +64 -0
- sky/adaptors/nebius.py +3 -1
- sky/adaptors/primeintellect.py +1 -0
- sky/adaptors/seeweb.py +183 -0
- sky/adaptors/shadeform.py +89 -0
- sky/adaptors/slurm.py +478 -0
- sky/admin_policy.py +20 -0
- sky/authentication.py +157 -263
- sky/backends/__init__.py +3 -2
- sky/backends/backend.py +11 -3
- sky/backends/backend_utils.py +630 -185
- sky/backends/cloud_vm_ray_backend.py +1111 -928
- sky/backends/local_docker_backend.py +9 -5
- sky/backends/task_codegen.py +971 -0
- sky/backends/wheel_utils.py +18 -0
- sky/catalog/__init__.py +8 -3
- sky/catalog/aws_catalog.py +4 -0
- sky/catalog/common.py +19 -1
- sky/catalog/data_fetchers/fetch_aws.py +102 -80
- sky/catalog/data_fetchers/fetch_gcp.py +30 -3
- sky/catalog/data_fetchers/fetch_nebius.py +9 -6
- sky/catalog/data_fetchers/fetch_runpod.py +698 -0
- sky/catalog/data_fetchers/fetch_seeweb.py +329 -0
- sky/catalog/data_fetchers/fetch_shadeform.py +142 -0
- sky/catalog/kubernetes_catalog.py +36 -32
- sky/catalog/primeintellect_catalog.py +95 -0
- sky/catalog/runpod_catalog.py +5 -1
- sky/catalog/seeweb_catalog.py +184 -0
- sky/catalog/shadeform_catalog.py +165 -0
- sky/catalog/slurm_catalog.py +243 -0
- sky/check.py +87 -46
- sky/client/cli/command.py +1004 -434
- sky/client/cli/flags.py +4 -2
- sky/{volumes/utils.py → client/cli/table_utils.py} +111 -13
- sky/client/cli/utils.py +79 -0
- sky/client/common.py +12 -2
- sky/client/sdk.py +188 -65
- sky/client/sdk_async.py +34 -33
- sky/cloud_stores.py +82 -3
- sky/clouds/__init__.py +8 -0
- sky/clouds/aws.py +337 -129
- sky/clouds/azure.py +24 -18
- sky/clouds/cloud.py +47 -13
- sky/clouds/cudo.py +16 -13
- sky/clouds/do.py +9 -7
- sky/clouds/fluidstack.py +12 -5
- sky/clouds/gcp.py +14 -7
- sky/clouds/hyperbolic.py +12 -5
- sky/clouds/ibm.py +12 -5
- sky/clouds/kubernetes.py +80 -45
- sky/clouds/lambda_cloud.py +12 -5
- sky/clouds/nebius.py +23 -9
- sky/clouds/oci.py +19 -12
- sky/clouds/paperspace.py +4 -1
- sky/clouds/primeintellect.py +317 -0
- sky/clouds/runpod.py +85 -24
- sky/clouds/scp.py +12 -8
- sky/clouds/seeweb.py +477 -0
- sky/clouds/shadeform.py +400 -0
- sky/clouds/slurm.py +578 -0
- sky/clouds/ssh.py +6 -3
- sky/clouds/utils/scp_utils.py +61 -50
- sky/clouds/vast.py +43 -27
- sky/clouds/vsphere.py +14 -16
- sky/core.py +296 -195
- sky/dashboard/out/404.html +1 -1
- sky/dashboard/out/_next/static/KYAhEFa3FTfq4JyKVgo-s/_buildManifest.js +1 -0
- sky/dashboard/out/_next/static/chunks/1141-9c810f01ff4f398a.js +11 -0
- sky/dashboard/out/_next/static/chunks/1871-7e202677c42f43fe.js +6 -0
- sky/dashboard/out/_next/static/chunks/2260-7703229c33c5ebd5.js +1 -0
- sky/dashboard/out/_next/static/chunks/2369.fc20f0c2c8ed9fe7.js +15 -0
- sky/dashboard/out/_next/static/chunks/2755.edd818326d489a1d.js +26 -0
- sky/dashboard/out/_next/static/chunks/3294.ddda8c6c6f9f24dc.js +1 -0
- sky/dashboard/out/_next/static/chunks/3785.7e245f318f9d1121.js +1 -0
- sky/dashboard/out/_next/static/chunks/{6601-06114c982db410b6.js → 3800-b589397dc09c5b4e.js} +1 -1
- sky/dashboard/out/_next/static/chunks/3850-fd5696f3bbbaddae.js +1 -0
- sky/dashboard/out/_next/static/chunks/4725.172ede95d1b21022.js +1 -0
- sky/dashboard/out/_next/static/chunks/4937.a2baa2df5572a276.js +15 -0
- sky/dashboard/out/_next/static/chunks/6212-7bd06f60ba693125.js +13 -0
- sky/dashboard/out/_next/static/chunks/6856-da20c5fd999f319c.js +1 -0
- sky/dashboard/out/_next/static/chunks/6990-09cbf02d3cd518c3.js +1 -0
- sky/dashboard/out/_next/static/chunks/7359-c8d04e06886000b3.js +30 -0
- sky/dashboard/out/_next/static/chunks/7615-019513abc55b3b47.js +1 -0
- sky/dashboard/out/_next/static/chunks/8640.5b9475a2d18c5416.js +16 -0
- sky/dashboard/out/_next/static/chunks/8969-452f9d5cbdd2dc73.js +1 -0
- sky/dashboard/out/_next/static/chunks/9025.fa408f3242e9028d.js +6 -0
- sky/dashboard/out/_next/static/chunks/9353-8369df1cf105221c.js +1 -0
- sky/dashboard/out/_next/static/chunks/9360.a536cf6b1fa42355.js +31 -0
- sky/dashboard/out/_next/static/chunks/9847.3aaca6bb33455140.js +30 -0
- sky/dashboard/out/_next/static/chunks/pages/_app-68b647e26f9d2793.js +34 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-33f525539665fdfd.js +16 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-a7565f586ef86467.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters-9e5d47818b9bdadd.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/{config-dfb9bf07b13045f4.js → config-718cdc365de82689.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/infra/{[context]-6563820e094f68ca.js → [context]-12c559ec4d81fdbd.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/{infra-aabba60d57826e0f.js → infra-d187cd0413d72475.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-895847b6cf200b04.js +16 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs/pools/[pool]-8d0f4655400b4eb9.js +21 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs-e5a98f17f8513a96.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/plugins/[...slug]-4f46050ca065d8f8.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/users-2f7646eb77785a2c.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/volumes-ef19d49c6d0e8500.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/workspaces/{[name]-af76bb06dbb3954f.js → [name]-96e0f298308da7e2.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/{workspaces-7598c33a746cdc91.js → workspaces-cb4da3abe08ebf19.js} +1 -1
- sky/dashboard/out/_next/static/chunks/webpack-fba3de387ff6bb08.js +1 -0
- sky/dashboard/out/_next/static/css/c5a4cfd2600fc715.css +3 -0
- sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
- sky/dashboard/out/clusters/[cluster].html +1 -1
- sky/dashboard/out/clusters.html +1 -1
- sky/dashboard/out/config.html +1 -1
- sky/dashboard/out/index.html +1 -1
- sky/dashboard/out/infra/[context].html +1 -1
- sky/dashboard/out/infra.html +1 -1
- sky/dashboard/out/jobs/[job].html +1 -1
- sky/dashboard/out/jobs/pools/[pool].html +1 -1
- sky/dashboard/out/jobs.html +1 -1
- sky/dashboard/out/plugins/[...slug].html +1 -0
- sky/dashboard/out/users.html +1 -1
- sky/dashboard/out/volumes.html +1 -1
- sky/dashboard/out/workspace/new.html +1 -1
- sky/dashboard/out/workspaces/[name].html +1 -1
- sky/dashboard/out/workspaces.html +1 -1
- sky/data/data_utils.py +92 -1
- sky/data/mounting_utils.py +177 -30
- sky/data/storage.py +200 -19
- sky/data/storage_utils.py +10 -45
- sky/exceptions.py +18 -7
- sky/execution.py +74 -31
- sky/global_user_state.py +605 -191
- sky/jobs/__init__.py +2 -0
- sky/jobs/client/sdk.py +101 -4
- sky/jobs/client/sdk_async.py +31 -5
- sky/jobs/constants.py +15 -8
- sky/jobs/controller.py +726 -284
- sky/jobs/file_content_utils.py +128 -0
- sky/jobs/log_gc.py +193 -0
- sky/jobs/recovery_strategy.py +250 -100
- sky/jobs/scheduler.py +271 -173
- sky/jobs/server/core.py +367 -114
- sky/jobs/server/server.py +81 -35
- sky/jobs/server/utils.py +89 -35
- sky/jobs/state.py +1498 -620
- sky/jobs/utils.py +771 -306
- sky/logs/agent.py +40 -5
- sky/logs/aws.py +9 -19
- sky/metrics/utils.py +282 -39
- sky/models.py +2 -0
- sky/optimizer.py +7 -6
- sky/provision/__init__.py +38 -1
- sky/provision/aws/config.py +34 -13
- sky/provision/aws/instance.py +5 -2
- sky/provision/azure/instance.py +5 -3
- sky/provision/common.py +22 -0
- sky/provision/cudo/instance.py +4 -3
- sky/provision/do/instance.py +4 -3
- sky/provision/docker_utils.py +112 -28
- sky/provision/fluidstack/instance.py +6 -5
- sky/provision/gcp/config.py +6 -1
- sky/provision/gcp/instance.py +4 -2
- sky/provision/hyperbolic/instance.py +4 -2
- sky/provision/instance_setup.py +66 -20
- sky/provision/kubernetes/__init__.py +2 -0
- sky/provision/kubernetes/config.py +7 -44
- sky/provision/kubernetes/constants.py +0 -1
- sky/provision/kubernetes/instance.py +609 -213
- sky/provision/kubernetes/manifests/fusermount-server-daemonset.yaml +1 -2
- sky/provision/kubernetes/network.py +12 -8
- sky/provision/kubernetes/network_utils.py +8 -25
- sky/provision/kubernetes/utils.py +422 -422
- sky/provision/kubernetes/volume.py +150 -18
- sky/provision/lambda_cloud/instance.py +16 -13
- sky/provision/nebius/instance.py +6 -2
- sky/provision/nebius/utils.py +103 -86
- sky/provision/oci/instance.py +4 -2
- sky/provision/paperspace/instance.py +4 -3
- sky/provision/primeintellect/__init__.py +10 -0
- sky/provision/primeintellect/config.py +11 -0
- sky/provision/primeintellect/instance.py +454 -0
- sky/provision/primeintellect/utils.py +398 -0
- sky/provision/provisioner.py +45 -15
- sky/provision/runpod/__init__.py +2 -0
- sky/provision/runpod/instance.py +4 -3
- sky/provision/runpod/volume.py +69 -13
- sky/provision/scp/instance.py +307 -130
- sky/provision/seeweb/__init__.py +11 -0
- sky/provision/seeweb/config.py +13 -0
- sky/provision/seeweb/instance.py +812 -0
- sky/provision/shadeform/__init__.py +11 -0
- sky/provision/shadeform/config.py +12 -0
- sky/provision/shadeform/instance.py +351 -0
- sky/provision/shadeform/shadeform_utils.py +83 -0
- sky/provision/slurm/__init__.py +12 -0
- sky/provision/slurm/config.py +13 -0
- sky/provision/slurm/instance.py +572 -0
- sky/provision/slurm/utils.py +583 -0
- sky/provision/vast/instance.py +9 -4
- sky/provision/vast/utils.py +10 -6
- sky/provision/volume.py +164 -0
- sky/provision/vsphere/common/ssl_helper.py +1 -1
- sky/provision/vsphere/common/vapiconnect.py +2 -1
- sky/provision/vsphere/common/vim_utils.py +3 -2
- sky/provision/vsphere/instance.py +8 -6
- sky/provision/vsphere/vsphere_utils.py +8 -1
- sky/resources.py +11 -3
- sky/schemas/api/responses.py +107 -6
- sky/schemas/db/global_user_state/008_skylet_ssh_tunnel_metadata.py +34 -0
- sky/schemas/db/global_user_state/009_last_activity_and_launched_at.py +89 -0
- sky/schemas/db/global_user_state/010_save_ssh_key.py +66 -0
- sky/schemas/db/global_user_state/011_is_ephemeral.py +34 -0
- sky/schemas/db/kv_cache/001_initial_schema.py +29 -0
- sky/schemas/db/serve_state/002_yaml_content.py +34 -0
- sky/schemas/db/skypilot_config/001_initial_schema.py +30 -0
- sky/schemas/db/spot_jobs/002_cluster_pool.py +3 -3
- sky/schemas/db/spot_jobs/004_job_file_contents.py +42 -0
- sky/schemas/db/spot_jobs/005_logs_gc.py +38 -0
- sky/schemas/db/spot_jobs/006_controller_pid_started_at.py +34 -0
- sky/schemas/db/spot_jobs/007_config_file_content.py +34 -0
- sky/schemas/generated/jobsv1_pb2.py +86 -0
- sky/schemas/generated/jobsv1_pb2.pyi +254 -0
- sky/schemas/generated/jobsv1_pb2_grpc.py +542 -0
- sky/schemas/generated/managed_jobsv1_pb2.py +76 -0
- sky/schemas/generated/managed_jobsv1_pb2.pyi +278 -0
- sky/schemas/generated/managed_jobsv1_pb2_grpc.py +278 -0
- sky/schemas/generated/servev1_pb2.py +58 -0
- sky/schemas/generated/servev1_pb2.pyi +115 -0
- sky/schemas/generated/servev1_pb2_grpc.py +322 -0
- sky/serve/autoscalers.py +2 -0
- sky/serve/client/impl.py +55 -21
- sky/serve/constants.py +4 -3
- sky/serve/controller.py +17 -11
- sky/serve/load_balancing_policies.py +1 -1
- sky/serve/replica_managers.py +219 -142
- sky/serve/serve_rpc_utils.py +179 -0
- sky/serve/serve_state.py +63 -54
- sky/serve/serve_utils.py +145 -109
- sky/serve/server/core.py +46 -25
- sky/serve/server/impl.py +311 -162
- sky/serve/server/server.py +21 -19
- sky/serve/service.py +84 -68
- sky/serve/service_spec.py +45 -7
- sky/server/auth/loopback.py +38 -0
- sky/server/auth/oauth2_proxy.py +12 -7
- sky/server/common.py +47 -24
- sky/server/config.py +62 -28
- sky/server/constants.py +9 -1
- sky/server/daemons.py +109 -38
- sky/server/metrics.py +76 -96
- sky/server/middleware_utils.py +166 -0
- sky/server/plugins.py +222 -0
- sky/server/requests/executor.py +384 -145
- sky/server/requests/payloads.py +83 -19
- sky/server/requests/preconditions.py +15 -13
- sky/server/requests/request_names.py +123 -0
- sky/server/requests/requests.py +511 -157
- sky/server/requests/serializers/decoders.py +48 -17
- sky/server/requests/serializers/encoders.py +102 -20
- sky/server/requests/serializers/return_value_serializers.py +60 -0
- sky/server/requests/threads.py +117 -0
- sky/server/rest.py +116 -24
- sky/server/server.py +497 -179
- sky/server/server_utils.py +30 -0
- sky/server/stream_utils.py +219 -45
- sky/server/uvicorn.py +30 -19
- sky/setup_files/MANIFEST.in +6 -1
- sky/setup_files/alembic.ini +8 -0
- sky/setup_files/dependencies.py +64 -19
- sky/setup_files/setup.py +44 -44
- sky/sky_logging.py +13 -5
- sky/skylet/attempt_skylet.py +116 -24
- sky/skylet/configs.py +3 -1
- sky/skylet/constants.py +139 -29
- sky/skylet/events.py +74 -14
- sky/skylet/executor/__init__.py +1 -0
- sky/skylet/executor/slurm.py +189 -0
- sky/skylet/job_lib.py +143 -105
- sky/skylet/log_lib.py +252 -8
- sky/skylet/log_lib.pyi +47 -7
- sky/skylet/providers/ibm/node_provider.py +12 -8
- sky/skylet/providers/ibm/vpc_provider.py +13 -12
- sky/skylet/runtime_utils.py +21 -0
- sky/skylet/services.py +524 -0
- sky/skylet/skylet.py +27 -2
- sky/skylet/subprocess_daemon.py +104 -28
- sky/skypilot_config.py +99 -79
- sky/ssh_node_pools/constants.py +12 -0
- sky/ssh_node_pools/core.py +40 -3
- sky/ssh_node_pools/deploy/__init__.py +4 -0
- sky/ssh_node_pools/deploy/deploy.py +952 -0
- sky/ssh_node_pools/deploy/tunnel_utils.py +199 -0
- sky/ssh_node_pools/deploy/utils.py +173 -0
- sky/ssh_node_pools/server.py +20 -21
- sky/{utils/kubernetes/ssh_utils.py → ssh_node_pools/utils.py} +9 -6
- sky/task.py +221 -104
- sky/templates/aws-ray.yml.j2 +1 -0
- sky/templates/azure-ray.yml.j2 +1 -0
- sky/templates/cudo-ray.yml.j2 +1 -0
- sky/templates/do-ray.yml.j2 +1 -0
- sky/templates/fluidstack-ray.yml.j2 +1 -0
- sky/templates/gcp-ray.yml.j2 +1 -0
- sky/templates/hyperbolic-ray.yml.j2 +1 -0
- sky/templates/ibm-ray.yml.j2 +2 -1
- sky/templates/jobs-controller.yaml.j2 +3 -0
- sky/templates/kubernetes-ray.yml.j2 +204 -55
- sky/templates/lambda-ray.yml.j2 +1 -0
- sky/templates/nebius-ray.yml.j2 +3 -0
- sky/templates/oci-ray.yml.j2 +1 -0
- sky/templates/paperspace-ray.yml.j2 +1 -0
- sky/templates/primeintellect-ray.yml.j2 +72 -0
- sky/templates/runpod-ray.yml.j2 +1 -0
- sky/templates/scp-ray.yml.j2 +1 -0
- sky/templates/seeweb-ray.yml.j2 +171 -0
- sky/templates/shadeform-ray.yml.j2 +73 -0
- sky/templates/slurm-ray.yml.j2 +85 -0
- sky/templates/vast-ray.yml.j2 +2 -0
- sky/templates/vsphere-ray.yml.j2 +1 -0
- sky/templates/websocket_proxy.py +188 -43
- sky/usage/usage_lib.py +16 -4
- sky/users/model.conf +1 -1
- sky/users/permission.py +84 -44
- sky/users/rbac.py +31 -3
- sky/utils/accelerator_registry.py +6 -3
- sky/utils/admin_policy_utils.py +18 -5
- sky/utils/annotations.py +128 -6
- sky/utils/asyncio_utils.py +78 -0
- sky/utils/atomic.py +1 -1
- sky/utils/auth_utils.py +153 -0
- sky/utils/cli_utils/status_utils.py +12 -7
- sky/utils/cluster_utils.py +28 -6
- sky/utils/command_runner.py +283 -30
- sky/utils/command_runner.pyi +63 -7
- sky/utils/common.py +3 -1
- sky/utils/common_utils.py +55 -7
- sky/utils/config_utils.py +1 -14
- sky/utils/context.py +127 -40
- sky/utils/context_utils.py +73 -18
- sky/utils/controller_utils.py +229 -70
- sky/utils/db/db_utils.py +95 -18
- sky/utils/db/kv_cache.py +149 -0
- sky/utils/db/migration_utils.py +24 -7
- sky/utils/env_options.py +4 -0
- sky/utils/git.py +559 -1
- sky/utils/kubernetes/create_cluster.sh +15 -30
- sky/utils/kubernetes/delete_cluster.sh +10 -7
- sky/utils/kubernetes/generate_kind_config.py +6 -66
- sky/utils/kubernetes/gpu_labeler.py +13 -3
- sky/utils/kubernetes/k8s_gpu_labeler_job.yaml +2 -1
- sky/utils/kubernetes/k8s_gpu_labeler_setup.yaml +16 -16
- sky/utils/kubernetes/kubernetes_deploy_utils.py +187 -260
- sky/utils/kubernetes/rsync_helper.sh +11 -3
- sky/utils/kubernetes/ssh-tunnel.sh +7 -376
- sky/utils/kubernetes_enums.py +7 -15
- sky/utils/lock_events.py +4 -4
- sky/utils/locks.py +128 -31
- sky/utils/log_utils.py +0 -319
- sky/utils/resource_checker.py +13 -10
- sky/utils/resources_utils.py +53 -29
- sky/utils/rich_utils.py +8 -4
- sky/utils/schemas.py +138 -52
- sky/utils/subprocess_utils.py +17 -4
- sky/utils/thread_utils.py +91 -0
- sky/utils/timeline.py +2 -1
- sky/utils/ux_utils.py +35 -1
- sky/utils/volume.py +88 -4
- sky/utils/yaml_utils.py +9 -0
- sky/volumes/client/sdk.py +48 -10
- sky/volumes/server/core.py +59 -22
- sky/volumes/server/server.py +46 -17
- sky/volumes/volume.py +54 -42
- sky/workspaces/core.py +57 -21
- sky/workspaces/server.py +13 -12
- sky_templates/README.md +3 -0
- sky_templates/__init__.py +3 -0
- sky_templates/ray/__init__.py +0 -0
- sky_templates/ray/start_cluster +183 -0
- sky_templates/ray/stop_cluster +75 -0
- {skypilot_nightly-1.0.0.dev20250905.dist-info → skypilot_nightly-1.0.0.dev20251210.dist-info}/METADATA +343 -65
- skypilot_nightly-1.0.0.dev20251210.dist-info/RECORD +629 -0
- skypilot_nightly-1.0.0.dev20251210.dist-info/top_level.txt +2 -0
- sky/client/cli/git.py +0 -549
- sky/dashboard/out/_next/static/chunks/1121-408ed10b2f9fce17.js +0 -1
- sky/dashboard/out/_next/static/chunks/1141-943efc7aff0f0c06.js +0 -1
- sky/dashboard/out/_next/static/chunks/1836-37fede578e2da5f8.js +0 -40
- sky/dashboard/out/_next/static/chunks/3015-86cabed5d4669ad0.js +0 -1
- sky/dashboard/out/_next/static/chunks/3294.c80326aec9bfed40.js +0 -6
- sky/dashboard/out/_next/static/chunks/3785.4872a2f3aa489880.js +0 -1
- sky/dashboard/out/_next/static/chunks/3850-ff4a9a69d978632b.js +0 -1
- sky/dashboard/out/_next/static/chunks/4045.b30465273dc5e468.js +0 -21
- sky/dashboard/out/_next/static/chunks/4676-9da7fdbde90b5549.js +0 -10
- sky/dashboard/out/_next/static/chunks/4725.10f7a9a5d3ea8208.js +0 -1
- sky/dashboard/out/_next/static/chunks/5339.3fda4a4010ff4e06.js +0 -51
- sky/dashboard/out/_next/static/chunks/6135-4b4d5e824b7f9d3c.js +0 -1
- sky/dashboard/out/_next/static/chunks/649.b9d7f7d10c1b8c53.js +0 -45
- sky/dashboard/out/_next/static/chunks/6856-dca7962af4814e1b.js +0 -1
- sky/dashboard/out/_next/static/chunks/6990-08b2a1cae076a943.js +0 -1
- sky/dashboard/out/_next/static/chunks/7325.b4bc99ce0892dcd5.js +0 -6
- sky/dashboard/out/_next/static/chunks/754-d0da8ab45f9509e9.js +0 -18
- sky/dashboard/out/_next/static/chunks/7669.1f5d9a402bf5cc42.js +0 -36
- sky/dashboard/out/_next/static/chunks/8969-0be3036bf86f8256.js +0 -1
- sky/dashboard/out/_next/static/chunks/9025.c12318fb6a1a9093.js +0 -6
- sky/dashboard/out/_next/static/chunks/9037-fa1737818d0a0969.js +0 -6
- sky/dashboard/out/_next/static/chunks/pages/_app-ce361c6959bc2001.js +0 -34
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-1cbba24bd1bd35f8.js +0 -16
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-0b4b35dc1dfe046c.js +0 -16
- sky/dashboard/out/_next/static/chunks/pages/clusters-469814d711d63b1b.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-dd64309c3fe67ed2.js +0 -11
- sky/dashboard/out/_next/static/chunks/pages/jobs/pools/[pool]-07349868f7905d37.js +0 -16
- sky/dashboard/out/_next/static/chunks/pages/jobs-1f70d9faa564804f.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/users-018bf31cda52e11b.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/volumes-739726d6b823f532.js +0 -1
- sky/dashboard/out/_next/static/chunks/webpack-4fe903277b57b523.js +0 -1
- sky/dashboard/out/_next/static/css/4614e06482d7309e.css +0 -3
- sky/dashboard/out/_next/static/mS-4qZPSkRuA1u-g2wQhg/_buildManifest.js +0 -1
- sky/templates/kubernetes-ssh-jump.yml.j2 +0 -94
- sky/utils/kubernetes/cleanup-tunnel.sh +0 -62
- sky/utils/kubernetes/deploy_remote_cluster.py +0 -1299
- sky/utils/kubernetes/ssh_jump_lifecycle_manager.py +0 -191
- skypilot_nightly-1.0.0.dev20250905.dist-info/RECORD +0 -547
- skypilot_nightly-1.0.0.dev20250905.dist-info/top_level.txt +0 -1
- /sky/dashboard/out/_next/static/{mS-4qZPSkRuA1u-g2wQhg → KYAhEFa3FTfq4JyKVgo-s}/_ssgManifest.js +0 -0
- {skypilot_nightly-1.0.0.dev20250905.dist-info → skypilot_nightly-1.0.0.dev20251210.dist-info}/WHEEL +0 -0
- {skypilot_nightly-1.0.0.dev20250905.dist-info → skypilot_nightly-1.0.0.dev20251210.dist-info}/entry_points.txt +0 -0
- {skypilot_nightly-1.0.0.dev20250905.dist-info → skypilot_nightly-1.0.0.dev20251210.dist-info}/licenses/LICENSE +0 -0
sky/provision/primeintellect/utils.py
ADDED
@@ -0,0 +1,398 @@
+"""Prime Intellect library wrapper for SkyPilot."""
+
+import json
+import os
+import shlex
+import time
+from typing import Any, Dict, List, Optional, Tuple, Union
+import uuid
+
+import requests
+
+from sky.catalog import common as catalog_common
+from sky.utils import common_utils
+
+_df = None
+_lookup_dict = None
+
+# Base URL for Prime Intellect API (used as default if not configured).
+DEFAULT_BASE_URL = 'https://api.primeintellect.ai'
+CREDENTIALS_PATH = '~/.prime/config.json'
+INITIAL_BACKOFF_SECONDS = 10
+MAX_BACKOFF_FACTOR = 10
+MAX_ATTEMPTS = 6
+
+
+class PrimeintellectAPIError(Exception):
+    """Base exception for Prime Intellect API errors."""
+
+    def __init__(self,
+                 message: str,
+                 status_code: Optional[int] = None,
+                 response_data: Optional[Dict[str, Any]] = None):
+        super().__init__(message)
+        self.status_code = status_code
+        self.response_data = response_data
+
+
+class PrimeintellectResourcesUnavailableError(PrimeintellectAPIError):
+    """Exception for when resources are unavailable on Prime Intellect."""
+    pass
+
+
+def _parse_api_error(response: Any) -> Tuple[str, bool]:
+    """Parse API error response to extract meaningful error messages.
+
+    Returns:
+        Tuple[str, bool]:
+        - str: A human-readable error message parsed from the API response.
+        - bool: True if the error indicates resource unavailability (e.g.,
+          capacity issues or quota/limit exceeded), otherwise False.
+    """
+    try:
+        if hasattr(response, 'json'):
+            error_data = response.json()
+        else:
+            error_data = response
+
+        if isinstance(error_data, dict):
+            # Try to extract error message from common error response fields
+            error_message = error_data.get('message', '')
+            if not error_message:
+                error_message = error_data.get('error', '')
+            if not error_message:
+                error_message = error_data.get('detail', '')
+
+            # Check if it's a resource unavailability error
+            if any(keyword in error_message.lower() for keyword in [
+                    'no capacity', 'capacity', 'unavailable', 'out of stock',
+                    'insufficient', 'not available', 'quota exceeded',
+                    'limit exceeded'
+            ]):
+                return error_message, True
+
+            return error_message, False
+
+        return str(error_data), False
+    except Exception:  # pylint: disable=broad-except
+        return f'HTTP {response.status_code} {response.reason}', False
+
+
+def _try_request_with_backoff(
+        method: str,
+        url: str,
+        headers: Dict[str, str],
+        data: Optional[Union[str, Dict[str, Any]]] = None) -> Dict[str, Any]:
+    backoff = common_utils.Backoff(initial_backoff=INITIAL_BACKOFF_SECONDS,
+                                   max_backoff_factor=MAX_BACKOFF_FACTOR)
+    for i in range(MAX_ATTEMPTS):
+        timeout = 30
+        if method == 'get':
+            response = requests.get(url,
+                                    headers=headers,
+                                    params=data,
+                                    timeout=timeout)
+        elif method == 'post':
+            response = requests.post(url,
+                                     headers=headers,
+                                     json=data,
+                                     timeout=timeout)
+        elif method == 'put':
+            response = requests.put(url,
+                                    headers=headers,
+                                    json=data,
+                                    timeout=timeout)
+        elif method == 'patch':
+            response = requests.patch(url,
+                                      headers=headers,
+                                      json=data,
+                                      timeout=timeout)
+        elif method == 'delete':
+            response = requests.delete(url, headers=headers, timeout=timeout)
+        else:
+            raise ValueError(f'Unsupported requests method: {method}')
+        # If rate limited, wait and try again
+        if response.status_code == 429 and i != MAX_ATTEMPTS - 1:
+            time.sleep(backoff.current_backoff())
+            continue
+        if response.ok:
+            return response.json()
+        else:
+            # Parse the error response for meaningful messages
+            err, is_resource_unavailable = _parse_api_error(response)
+
+            # Create a more informative error message
+            if not err:
+                err = (f'API request failed: {method} {url}: '
+                       f'{response.status_code} {response.reason}')
+            else:
+                err = f'API request failed: {err}'
+
+            # Raise appropriate exception based on error type
+            if is_resource_unavailable:
+                raise PrimeintellectResourcesUnavailableError(
+                    err,
+                    status_code=response.status_code,
+                    response_data=response.json()
+                    if hasattr(response, 'json') else None)
+            else:
+                raise PrimeintellectAPIError(
+                    err,
+                    status_code=response.status_code,
+                    response_data=response.json()
+                    if hasattr(response, 'json') else None)
+    return {}
+
+
+def get_upstream_cloud_id(instance_type: str) -> Optional[str]:
+    global _df, _lookup_dict
+    if _df is None:
+        _df = catalog_common.read_catalog('primeintellect/vms.csv')
+        _lookup_dict = (
+            _df.set_index('InstanceType')['UpstreamCloudId'].to_dict())
+    return _lookup_dict.get(instance_type)
+
+
+class PrimeIntellectAPIClient:
+    """Client for interacting with Prime Intellect API."""
+
+    def __init__(self) -> None:
+        self.credentials = os.path.expanduser(CREDENTIALS_PATH)
+        assert os.path.exists(self.credentials), 'Credentials not found'
+        with open(self.credentials, 'r', encoding='utf-8') as f:
+            self._credentials = json.load(f)
+        self.api_key = self._credentials.get('api_key')
+        self.team_id = self._credentials.get('team_id')
+        self.base_url = self._credentials.get('base_url', DEFAULT_BASE_URL)
+        self.headers = {
+            'Authorization': f'Bearer {self.api_key}',
+            'Content-Type': 'application/json'
+        }
+
+    def list_instances(self, **search_kwargs) -> List[Dict[str, Any]]:
+        response = _try_request_with_backoff('get',
+                                             f'{self.base_url}/api/v1/pods',
+                                             headers=self.headers,
+                                             data=search_kwargs)
+        return response['data']
+
+    def get_instance_details(self, instance_id: str) -> Dict[str, Any]:
+        return _try_request_with_backoff(
+            'get',
+            f'{self.base_url}/api/v1/pods/{instance_id}',
+            headers=self.headers)
+
+    def launch(self,
+               name: str,
+               instance_type: str,
+               region: str,
+               availability_zone: str,
+               disk_size: int,
+               vcpus: int = 0,
+               memory: int = 0) -> Dict[str, Any]:
+        """Create a pod/instance via Prime Intellect API.
+
+        Args:
+            name: User-visible name of the pod.
+            instance_type: A catalog instance type string. The expected format
+                is:
+                "<provider>__<accelerator>__<vcpus>__<memory>[_SPOT]".
+
+                - <provider>: Upstream provider tag (e.g., "primecompute").
+                - <accelerator>:
+                    * GPU nodes: "<N>x<GPU_MODEL>", e.g., "8xH100_80GB".
+                    * CPU-only nodes: the literal "CPU_NODE".
+                - <vcpus>: Integer string for vCPU count (e.g., "104").
+                - <memory>: Integer string for memory in GB (e.g., "752").
+                - Optional suffix "_SPOT" may be present in the full string
+                  (ignored here; pricing/spot behavior is not controlled by
+                  this method).
+
+                Notes:
+                    - Parsing: only the first two components (provider,
+                      accelerator) are needed to build the payload. The vCPU
+                      and memory values are provided via the ``vcpus`` and
+                      ``memory`` arguments.
+                    - Catalog lookup: the full instance_type string is used to
+                      map to the catalog's UpstreamCloudId.
+                    - CPU-only: accelerator "CPU_NODE" is a sentinel for
+                      "no GPU". We set gpuType='CPU_NODE' and gpuCount=1 to
+                      represent CPU-only pods.
+                    - Spot: the optional "__SPOT" suffix (if present) is ignored
+                      here; pricing/spot behavior is handled elsewhere.
+
+            region: Country/region code used by Prime Intellect.
+            availability_zone: Data center ID (zone) within the region.
+            disk_size: Boot disk size in GB.
+            vcpus: Optional explicit vCPU override; if >0 it will be sent.
+            memory: Optional explicit memory override in GB; if >0 it will be
+                sent.
+
+        Returns:
+            The API response JSON as a dict.
+        """
+        cloud_id = get_upstream_cloud_id(instance_type)
+        assert cloud_id, 'cloudId cannot be None'
+        assert availability_zone, 'availability_zone cannot be None'
+
+        # Parse the instance_type. We only need the first two components:
+        # provider and accelerator info (see docstring above).
+        provider, gpu_parts, _, _ = instance_type.split('__', 3)
+        if 'CPU_NODE' in gpu_parts:
+            # Prime Intellect API uses the same schema for CPU-only and GPU
+            # pods. For CPU-only instances, we set gpuType='CPU_NODE' and
+            # gpuCount=1 as a sentinel to indicate "no GPU". This is how CPU
+            # instances are represented internally on our platform; the
+            # backend does not interpret this as having a physical GPU.
+            gpu_type = 'CPU_NODE'
+            gpu_count = 1
+        else:
+            parts = gpu_parts.split('x', 1)
+            gpu_count = int(parts[0])
+            gpu_type = parts[1]
+
+        payload: Dict[str, Any] = {
+            'pod': {
+                'name': name,
+                'cloudId': cloud_id,
+                'socket': 'PCIe',
+                'gpuType': gpu_type,
+                'gpuCount': int(gpu_count),
+                'diskSize': disk_size,
+                # Prime Intellect API historically required maxPrice.
+                # Set to 0 to indicate on-demand/non-spot pricing.
+                'maxPrice': 0,
+            },
+            'provider': {
+                'type': provider,
+            }
+        }
+
+        if vcpus > 0:
+            payload['pod']['vcpus'] = vcpus
+        if memory > 0:
+            payload['pod']['memory'] = memory
+
+        if region != 'UNSPECIFIED':
+            payload['pod']['country'] = region
+        if availability_zone != 'UNSPECIFIED':
+            payload['pod']['dataCenterId'] = availability_zone
+
+        if self.team_id is not None and self.team_id != '':
+            payload['team'] = {'teamId': self.team_id}
+
+        response = _try_request_with_backoff(
+            'post',
+            f'{self.base_url}/api/v1/pods',
+            headers=self.headers,
+            data=payload,
+        )
+        return response
+
+    def remove(self, instance_id: str) -> Dict[str, Any]:
+        return _try_request_with_backoff(
+            'delete',
+            f'{self.base_url}/api/v1/pods/{instance_id}',
+            headers=self.headers,
+        )
+
+    def list_ssh_keys(self) -> List[Dict[str, Any]]:
+        response = _try_request_with_backoff('get',
+                                             f'{self.base_url}/api/v1/ssh_keys',
+                                             headers=self.headers)
+        return response['data']
+
+    def get_or_add_ssh_key(self, ssh_pub_key: str = '') -> Dict[str, str]:
+        """Add ssh key if not already added."""
+        # Check if the public key is already added
+        ssh_keys = self.list_ssh_keys()
+        for key in ssh_keys:
+            if key['publicKey'].strip().split()[:2] == ssh_pub_key.strip(
+            ).split()[:2]:
+                return {'name': key['name'], 'ssh_key': ssh_pub_key}
+
+        # Add the public key to Prime Intellect account if not already added
+        ssh_key_name = 'skypilot-' + str(uuid.uuid4()).replace('-', '')[:8]
+        _try_request_with_backoff(
+            'post',
+            f'{self.base_url}/api/v1/ssh_keys',
+            headers=self.headers,
+            data={
+                'name': ssh_key_name,
+                'publicKey': ssh_pub_key
+            },
+        )
+        return {'name': ssh_key_name, 'ssh_key': ssh_pub_key}
+
+
+def parse_ssh_connection(ssh_connection: Any) -> Tuple[Optional[str], int]:
+    """Parse and extract SSH username and port from a connection field.
+
+    The provider may return the SSH connection in multiple shapes. This helper
+    robustly extracts the SSH username and port while tolerating extra flags or
+    various tokenizations.
+
+    Accepted formats (examples):
+    - String with port flag:
+      "ubuntu@1.2.3.4 -p 2222 [-o <flag> ...]"
+    - String without explicit port (defaults to 22):
+      "ubuntu@1.2.3.4"
+    - String with host:port:
+      "ubuntu@1.2.3.4:2222"
+    - List with a single target:
+      ["ubuntu@1.2.3.4"]
+    - List of tokens (e.g., split form):
+      ["ubuntu@1.2.3.4", "-p", "2222"]
+
+    Args:
+        ssh_connection: The raw field from the API; can be a string or a list
+            of strings.
+
+    Returns:
+        (ssh_user, ssh_port): username if found, else None; port if found,
+        else 22.
+    """
+    ssh_user: Optional[str] = None
+    ssh_port: int = 22
+
+    # Normalize into a list of tokens for easier processing.
+    tokens: List[str] = []
+    if isinstance(ssh_connection, str):
+        try:
+            tokens = shlex.split(ssh_connection)
+        except Exception:  # pylint: disable=broad-except
+            tokens = [ssh_connection]
+    elif isinstance(ssh_connection, list):
+        for elem in ssh_connection:
+            if isinstance(elem, str):
+                try:
+                    tokens.extend(shlex.split(elem))
+                except Exception:  # pylint: disable=broad-except
+                    tokens.append(elem)
+    else:
+        # Unknown type; return defaults.
+        return ssh_user, ssh_port
+
+    # Find the first token containing '@' as the user@host candidate.
+    user_host: Optional[str] = next((t for t in tokens if '@' in t), None)
+    if user_host:
+        ssh_user = user_host.split('@', 1)[0].strip()
+        # Try host:port format (after '@').
+        host_part = user_host.split('@', 1)[1]
+        if ':' in host_part:
+            _, maybe_port = host_part.rsplit(':', 1)
+            try:
+                ssh_port = int(maybe_port)
+            except ValueError:
+                pass
+
+    # Check for '-p <port>' pair anywhere in the tokens. This takes priority.
+    if '-p' in tokens:
+        idx = tokens.index('-p')
+        if idx + 1 < len(tokens):
+            try:
+                ssh_port = int(tokens[idx + 1])
+            except ValueError:
+                pass

+    return ssh_user, ssh_port
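
The docstrings above define two string conventions worth making concrete. A quick illustration of how both parse (the values are hypothetical, chosen only to demonstrate the formats; this snippet is not part of the wheel diff):

    # Instance type format: "<provider>__<accelerator>__<vcpus>__<memory>"
    provider, accel, _, _ = 'primecompute__8xH100_80GB__104__752'.split('__', 3)
    # provider == 'primecompute', accel == '8xH100_80GB'
    count, model = accel.split('x', 1)  # ('8', 'H100_80GB')

    # All of these SSH connection shapes yield ('ubuntu', 2222):
    parse_ssh_connection('ubuntu@1.2.3.4 -p 2222')
    parse_ssh_connection('ubuntu@1.2.3.4:2222')
    parse_ssh_connection(['ubuntu@1.2.3.4', '-p', '2222'])
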
sky/provision/provisioner.py
CHANGED
@@ -18,6 +18,7 @@ from sky import exceptions
 from sky import global_user_state
 from sky import logs
 from sky import provision
+from sky import resources as resources_lib
 from sky import sky_logging
 from sky import skypilot_config
 from sky.adaptors import aws
@@ -27,6 +28,7 @@ from sky.provision import common as provision_common
 from sky.provision import instance_setup
 from sky.provision import logging as provision_logging
 from sky.provision import metadata_utils
+from sky.provision import volume as provision_volume
 from sky.skylet import constants
 from sky.utils import common
 from sky.utils import common_utils
@@ -58,6 +60,11 @@ def _bulk_provision(
     region_name = region.name
 
     start = time.time()
+
+    provision_volume.provision_ephemeral_volumes(cloud, region_name,
+                                                 cluster_name.name_on_cloud,
+                                                 bootstrap_config)
+
     # TODO(suquark): Should we cache the bootstrapped result?
     # Currently it is not necessary as bootstrapping takes
     # only ~3s, caching it seems over-engineering and could
@@ -69,6 +76,7 @@ def _bulk_provision(
 
     provision_record = provision.run_instances(provider_name,
                                                region_name,
+                                               str(cluster_name),
                                                cluster_name.name_on_cloud,
                                                config=config)
 
@@ -149,9 +157,9 @@ def bulk_provision(
         logger.debug(f'SkyPilot version: {sky.__version__}; '
                      f'commit: {sky.__commit__}')
         logger.debug(_TITLE.format('Provisioning'))
-
-
-
+        redacted_config = bootstrap_config.get_redacted_config()
+        logger.debug('Provision config:\n'
+                     f'{json.dumps(redacted_config, indent=2)}')
         return _bulk_provision(cloud, region, cluster_name,
                                bootstrap_config)
     except exceptions.NoClusterLaunchedError:
@@ -235,6 +243,7 @@ def teardown_cluster(cloud_name: str, cluster_name: resources_utils.ClusterName,
         provision.terminate_instances(cloud_name, cluster_name.name_on_cloud,
                                       provider_config)
         metadata_utils.remove_cluster_metadata(cluster_name.name_on_cloud)
+        provision_volume.delete_ephemeral_volumes(provider_config)
     else:
         provision.stop_instances(cloud_name, cluster_name.name_on_cloud,
                                  provider_config)
@@ -427,18 +436,27 @@ def wait_for_ssh(cluster_info: provision_common.ClusterInfo,
 
 
 def _post_provision_setup(
-        cloud_name: str, cluster_name: resources_utils.ClusterName,
-        handle_cluster_yaml: str,
+        launched_resources: resources_lib.Resources,
+        cluster_name: resources_utils.ClusterName, handle_cluster_yaml: str,
        provision_record: provision_common.ProvisionRecord,
        custom_resource: Optional[str]) -> provision_common.ClusterInfo:
    config_from_yaml = global_user_state.get_cluster_yaml_dict(
        handle_cluster_yaml)
    provider_config = config_from_yaml.get('provider')
+    cloud_name = repr(launched_resources.cloud)
    cluster_info = provision.get_cluster_info(cloud_name,
                                              provision_record.region,
                                              cluster_name.name_on_cloud,
                                              provider_config=provider_config)
 
+    # Update cluster info in handle so cluster instance ids are set. This
+    # allows us to expose provision logs to debug nodes that failed during post
+    # provision setup.
+    handle = global_user_state.get_handle_from_cluster_name(
+        cluster_name.display_name)
+    handle.cached_cluster_info = cluster_info
+    global_user_state.update_cluster_handle(cluster_name.display_name, handle)
+
     if cluster_info.num_instances > 1:
         # Only worker nodes have logs in the per-instance log directory. Head
         # node's log will be redirected to the main log file.
@@ -474,12 +492,13 @@
     # ready by the provisioner, and we use kubectl instead of SSH to run the
     # commands and rsync on the pods. SSH will still be ready after a while
     # for the users to SSH into the pod.
-
+    is_k8s_cloud = cloud_name.lower() in ['kubernetes', 'ssh']
+    if not is_k8s_cloud:
         logger.debug(
             f'\nWaiting for SSH to be available for {cluster_name!r} ...')
         wait_for_ssh(cluster_info, ssh_credentials)
         logger.debug(f'SSH Connection ready for {cluster_name!r}')
-    vm_str = 'Instance' if
+    vm_str = 'Instance' if not is_k8s_cloud else 'Pod'
     plural = '' if len(cluster_info.instances) == 1 else 's'
     verb = 'is' if len(cluster_info.instances) == 1 else 'are'
     indent_str = (ux_utils.INDENT_SYMBOL
@@ -526,6 +545,7 @@ def _post_provision_setup(
             status.update(
                 ux_utils.spinner_message(
                     'Checking controller version compatibility'))
+
             try:
                 server_jobs_utils.check_version_mismatch_and_non_terminal_jobs()
             except exceptions.ClusterNotUpError:
@@ -615,10 +635,15 @@ def _post_provision_setup(
         status.update(
             runtime_preparation_str.format(step=3, step_name='runtime'))
 
+    skip_ray_setup = False
     ray_port = constants.SKY_REMOTE_RAY_PORT
     head_ray_needs_restart = True
     ray_cluster_healthy = False
-    if (not provision_record.is_instance_just_booted(
+    if (launched_resources.cloud is not None and
+            not launched_resources.cloud.uses_ray()):
+        skip_ray_setup = True
+        logger.debug('Skip Ray cluster setup as cloud does not use Ray.')
+    elif (not provision_record.is_instance_just_booted(
             head_instance.instance_id)):
         # Check if head node Ray is alive
         (ray_port, ray_cluster_healthy,
@@ -643,7 +668,9 @@ def _post_provision_setup(
                      'async setup to complete...')
         time.sleep(1)
 
-    if head_ray_needs_restart:
+    if skip_ray_setup:
+        logger.debug('Skip Ray cluster setup on the head node.')
+    elif head_ray_needs_restart:
         logger.debug('Starting Ray on the entire cluster.')
         instance_setup.start_ray_on_head_node(
             cluster_name.name_on_cloud,
@@ -666,7 +693,9 @@ def _post_provision_setup(
     # We don't need to restart ray on worker nodes if the ray cluster is
     # already healthy, i.e. the head node has expected number of nodes
     # connected to the ray cluster.
-    if cluster_info.num_instances > 1 and not ray_cluster_healthy:
+    if skip_ray_setup:
+        logger.debug('Skip Ray cluster setup on the worker nodes.')
+    elif cluster_info.num_instances > 1 and not ray_cluster_healthy:
         instance_setup.start_ray_on_worker_nodes(
             cluster_name.name_on_cloud,
             no_restart=not head_ray_needs_restart,
@@ -692,8 +721,9 @@ def _post_provision_setup(
                                          cluster_info,
                                          ssh_credentials)
 
-    instance_setup.start_skylet_on_head_node(cluster_name.name_on_cloud,
-                                             cluster_info, ssh_credentials)
+    instance_setup.start_skylet_on_head_node(cluster_name, cluster_info,
+                                             ssh_credentials,
+                                             launched_resources)
 
     logger.info(
         ux_utils.finishing_message(f'Cluster launched: {cluster_name}.',
@@ -704,8 +734,8 @@ def _post_provision_setup(
 
 @timeline.event
 def post_provision_runtime_setup(
-        cloud_name: str, cluster_name: resources_utils.ClusterName,
-        handle_cluster_yaml: str,
+        launched_resources: resources_lib.Resources,
+        cluster_name: resources_utils.ClusterName, handle_cluster_yaml: str,
         provision_record: provision_common.ProvisionRecord,
         custom_resource: Optional[str],
         log_dir: str) -> provision_common.ClusterInfo:
|
|
|
726
756
|
try:
|
|
727
757
|
logger.debug(_TITLE.format('System Setup After Provision'))
|
|
728
758
|
return _post_provision_setup(
|
|
729
|
-
|
|
759
|
+
launched_resources,
|
|
730
760
|
cluster_name,
|
|
731
761
|
handle_cluster_yaml=handle_cluster_yaml,
|
|
732
762
|
provision_record=provision_record,
|
sky/provision/runpod/__init__.py
CHANGED
@@ -11,4 +11,6 @@ from sky.provision.runpod.instance import terminate_instances
 from sky.provision.runpod.instance import wait_instances
 from sky.provision.runpod.volume import apply_volume
 from sky.provision.runpod.volume import delete_volume
+from sky.provision.runpod.volume import get_all_volumes_usedby
 from sky.provision.runpod.volume import get_volume_usedby
+from sky.provision.runpod.volume import map_all_volumes_usedby
sky/provision/runpod/instance.py
CHANGED
@@ -44,10 +44,10 @@ def _get_head_instance_id(instances: Dict[str, Any]) -> Optional[str]:
     return head_instance_id
 
 
-def run_instances(region: str, cluster_name_on_cloud: str,
+def run_instances(region: str, cluster_name: str, cluster_name_on_cloud: str,
                   config: common.ProvisionConfig) -> common.ProvisionRecord:
     """Runs instances for the given cluster."""
-
+    del cluster_name  # unused
     pending_status = ['CREATED', 'RESTARTING']
 
     while True:
@@ -222,9 +222,10 @@ def query_instances(
     cluster_name_on_cloud: str,
     provider_config: Optional[Dict[str, Any]] = None,
     non_terminated_only: bool = True,
+    retry_if_missing: bool = False,
 ) -> Dict[str, Tuple[Optional['status_lib.ClusterStatus'], Optional[str]]]:
     """See sky/provision/__init__.py"""
-    del cluster_name  # unused
+    del cluster_name, retry_if_missing  # unused
     assert provider_config is not None, (cluster_name_on_cloud, provider_config)
     instances = _filter_instances(cluster_name_on_cloud, None)
 
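
Both RunPod changes mirror provider-interface updates made elsewhere in this release: `run_instances` now also receives the display cluster name (see the `str(cluster_name)` argument added in `sky/provision/provisioner.py` above), and `query_instances` gained a `retry_if_missing` flag; a provider that needs neither simply `del`s the arguments. An illustrative stub of the updated signatures (types abbreviated here; the authoritative definitions live in `sky/provision/__init__.py`):

    from typing import Any, Dict, Optional

    def run_instances(region: str, cluster_name: str,
                      cluster_name_on_cloud: str, config: Any) -> Any:
        del cluster_name  # unused by providers keyed on the on-cloud name
        ...

    def query_instances(cluster_name: str,
                        cluster_name_on_cloud: str,
                        provider_config: Optional[Dict[str, Any]] = None,
                        non_terminated_only: bool = True,
                        retry_if_missing: bool = False) -> Dict[str, Any]:
        del cluster_name, retry_if_missing  # unused in this provider
        ...
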