skypilot-nightly 1.0.0.dev20250905__py3-none-any.whl → 1.0.0.dev20251203__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sky/__init__.py +10 -2
- sky/adaptors/aws.py +81 -16
- sky/adaptors/common.py +25 -2
- sky/adaptors/coreweave.py +278 -0
- sky/adaptors/do.py +8 -2
- sky/adaptors/gcp.py +11 -0
- sky/adaptors/ibm.py +5 -2
- sky/adaptors/kubernetes.py +64 -0
- sky/adaptors/nebius.py +3 -1
- sky/adaptors/primeintellect.py +1 -0
- sky/adaptors/seeweb.py +183 -0
- sky/adaptors/shadeform.py +89 -0
- sky/admin_policy.py +20 -0
- sky/authentication.py +157 -263
- sky/backends/__init__.py +3 -2
- sky/backends/backend.py +11 -3
- sky/backends/backend_utils.py +588 -184
- sky/backends/cloud_vm_ray_backend.py +1088 -904
- sky/backends/local_docker_backend.py +9 -5
- sky/backends/task_codegen.py +633 -0
- sky/backends/wheel_utils.py +18 -0
- sky/catalog/__init__.py +8 -0
- sky/catalog/aws_catalog.py +4 -0
- sky/catalog/common.py +19 -1
- sky/catalog/data_fetchers/fetch_aws.py +102 -80
- sky/catalog/data_fetchers/fetch_gcp.py +30 -3
- sky/catalog/data_fetchers/fetch_nebius.py +9 -6
- sky/catalog/data_fetchers/fetch_runpod.py +698 -0
- sky/catalog/data_fetchers/fetch_seeweb.py +329 -0
- sky/catalog/data_fetchers/fetch_shadeform.py +142 -0
- sky/catalog/kubernetes_catalog.py +24 -28
- sky/catalog/primeintellect_catalog.py +95 -0
- sky/catalog/runpod_catalog.py +5 -1
- sky/catalog/seeweb_catalog.py +184 -0
- sky/catalog/shadeform_catalog.py +165 -0
- sky/check.py +73 -43
- sky/client/cli/command.py +675 -412
- sky/client/cli/flags.py +4 -2
- sky/{volumes/utils.py → client/cli/table_utils.py} +111 -13
- sky/client/cli/utils.py +79 -0
- sky/client/common.py +12 -2
- sky/client/sdk.py +132 -63
- sky/client/sdk_async.py +34 -33
- sky/cloud_stores.py +82 -3
- sky/clouds/__init__.py +6 -0
- sky/clouds/aws.py +337 -129
- sky/clouds/azure.py +24 -18
- sky/clouds/cloud.py +40 -13
- sky/clouds/cudo.py +16 -13
- sky/clouds/do.py +9 -7
- sky/clouds/fluidstack.py +12 -5
- sky/clouds/gcp.py +14 -7
- sky/clouds/hyperbolic.py +12 -5
- sky/clouds/ibm.py +12 -5
- sky/clouds/kubernetes.py +80 -45
- sky/clouds/lambda_cloud.py +12 -5
- sky/clouds/nebius.py +23 -9
- sky/clouds/oci.py +19 -12
- sky/clouds/paperspace.py +4 -1
- sky/clouds/primeintellect.py +317 -0
- sky/clouds/runpod.py +85 -24
- sky/clouds/scp.py +12 -8
- sky/clouds/seeweb.py +477 -0
- sky/clouds/shadeform.py +400 -0
- sky/clouds/ssh.py +4 -2
- sky/clouds/utils/scp_utils.py +61 -50
- sky/clouds/vast.py +33 -27
- sky/clouds/vsphere.py +14 -16
- sky/core.py +174 -165
- sky/dashboard/out/404.html +1 -1
- sky/dashboard/out/_next/static/96_E2yl3QAiIJGOYCkSpB/_buildManifest.js +1 -0
- sky/dashboard/out/_next/static/chunks/1141-e6aa9ab418717c59.js +11 -0
- sky/dashboard/out/_next/static/chunks/1871-7e202677c42f43fe.js +6 -0
- sky/dashboard/out/_next/static/chunks/2260-7703229c33c5ebd5.js +1 -0
- sky/dashboard/out/_next/static/chunks/2369.fc20f0c2c8ed9fe7.js +15 -0
- sky/dashboard/out/_next/static/chunks/2755.edd818326d489a1d.js +26 -0
- sky/dashboard/out/_next/static/chunks/3294.20a8540fe697d5ee.js +1 -0
- sky/dashboard/out/_next/static/chunks/3785.7e245f318f9d1121.js +1 -0
- sky/dashboard/out/_next/static/chunks/{6601-06114c982db410b6.js → 3800-7b45f9fbb6308557.js} +1 -1
- sky/dashboard/out/_next/static/chunks/4725.172ede95d1b21022.js +1 -0
- sky/dashboard/out/_next/static/chunks/4937.a2baa2df5572a276.js +15 -0
- sky/dashboard/out/_next/static/chunks/6212-7bd06f60ba693125.js +13 -0
- sky/dashboard/out/_next/static/chunks/6856-8f27d1c10c98def8.js +1 -0
- sky/dashboard/out/_next/static/chunks/6990-9146207c4567fdfd.js +1 -0
- sky/dashboard/out/_next/static/chunks/7359-c8d04e06886000b3.js +30 -0
- sky/dashboard/out/_next/static/chunks/7615-019513abc55b3b47.js +1 -0
- sky/dashboard/out/_next/static/chunks/8640.5b9475a2d18c5416.js +16 -0
- sky/dashboard/out/_next/static/chunks/8969-452f9d5cbdd2dc73.js +1 -0
- sky/dashboard/out/_next/static/chunks/9025.fa408f3242e9028d.js +6 -0
- sky/dashboard/out/_next/static/chunks/9353-cff34f7e773b2e2b.js +1 -0
- sky/dashboard/out/_next/static/chunks/9360.a536cf6b1fa42355.js +31 -0
- sky/dashboard/out/_next/static/chunks/9847.3aaca6bb33455140.js +30 -0
- sky/dashboard/out/_next/static/chunks/pages/{_app-ce361c6959bc2001.js → _app-bde01e4a2beec258.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-792db96d918c98c9.js +16 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-abfcac9c137aa543.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters-ee39056f9851a3ff.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/infra/{[context]-6563820e094f68ca.js → [context]-c0b5935149902e6f.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/{infra-aabba60d57826e0f.js → infra-aed0ea19df7cf961.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-d66997e2bfc837cf.js +16 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs/pools/[pool]-9faf940b253e3e06.js +21 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs-2072b48b617989c9.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/users-f42674164aa73423.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/volumes-b84b948ff357c43e.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/workspaces/{[name]-af76bb06dbb3954f.js → [name]-84a40f8c7c627fe4.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/{workspaces-7598c33a746cdc91.js → workspaces-531b2f8c4bf89f82.js} +1 -1
- sky/dashboard/out/_next/static/chunks/webpack-64e05f17bf2cf8ce.js +1 -0
- sky/dashboard/out/_next/static/css/0748ce22df867032.css +3 -0
- sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
- sky/dashboard/out/clusters/[cluster].html +1 -1
- sky/dashboard/out/clusters.html +1 -1
- sky/dashboard/out/config.html +1 -1
- sky/dashboard/out/index.html +1 -1
- sky/dashboard/out/infra/[context].html +1 -1
- sky/dashboard/out/infra.html +1 -1
- sky/dashboard/out/jobs/[job].html +1 -1
- sky/dashboard/out/jobs/pools/[pool].html +1 -1
- sky/dashboard/out/jobs.html +1 -1
- sky/dashboard/out/users.html +1 -1
- sky/dashboard/out/volumes.html +1 -1
- sky/dashboard/out/workspace/new.html +1 -1
- sky/dashboard/out/workspaces/[name].html +1 -1
- sky/dashboard/out/workspaces.html +1 -1
- sky/data/data_utils.py +92 -1
- sky/data/mounting_utils.py +162 -29
- sky/data/storage.py +200 -19
- sky/data/storage_utils.py +10 -45
- sky/exceptions.py +18 -7
- sky/execution.py +74 -31
- sky/global_user_state.py +605 -191
- sky/jobs/__init__.py +2 -0
- sky/jobs/client/sdk.py +101 -4
- sky/jobs/client/sdk_async.py +31 -5
- sky/jobs/constants.py +15 -8
- sky/jobs/controller.py +726 -284
- sky/jobs/file_content_utils.py +128 -0
- sky/jobs/log_gc.py +193 -0
- sky/jobs/recovery_strategy.py +250 -100
- sky/jobs/scheduler.py +271 -173
- sky/jobs/server/core.py +367 -114
- sky/jobs/server/server.py +81 -35
- sky/jobs/server/utils.py +89 -35
- sky/jobs/state.py +1498 -620
- sky/jobs/utils.py +771 -306
- sky/logs/agent.py +40 -5
- sky/logs/aws.py +9 -19
- sky/metrics/utils.py +282 -39
- sky/optimizer.py +1 -1
- sky/provision/__init__.py +37 -1
- sky/provision/aws/config.py +34 -13
- sky/provision/aws/instance.py +5 -2
- sky/provision/azure/instance.py +5 -3
- sky/provision/common.py +2 -0
- sky/provision/cudo/instance.py +4 -3
- sky/provision/do/instance.py +4 -3
- sky/provision/docker_utils.py +97 -26
- sky/provision/fluidstack/instance.py +6 -5
- sky/provision/gcp/config.py +6 -1
- sky/provision/gcp/instance.py +4 -2
- sky/provision/hyperbolic/instance.py +4 -2
- sky/provision/instance_setup.py +66 -20
- sky/provision/kubernetes/__init__.py +2 -0
- sky/provision/kubernetes/config.py +7 -44
- sky/provision/kubernetes/constants.py +0 -1
- sky/provision/kubernetes/instance.py +609 -213
- sky/provision/kubernetes/manifests/fusermount-server-daemonset.yaml +1 -2
- sky/provision/kubernetes/network.py +12 -8
- sky/provision/kubernetes/network_utils.py +8 -25
- sky/provision/kubernetes/utils.py +382 -418
- sky/provision/kubernetes/volume.py +150 -18
- sky/provision/lambda_cloud/instance.py +16 -13
- sky/provision/nebius/instance.py +6 -2
- sky/provision/nebius/utils.py +103 -86
- sky/provision/oci/instance.py +4 -2
- sky/provision/paperspace/instance.py +4 -3
- sky/provision/primeintellect/__init__.py +10 -0
- sky/provision/primeintellect/config.py +11 -0
- sky/provision/primeintellect/instance.py +454 -0
- sky/provision/primeintellect/utils.py +398 -0
- sky/provision/provisioner.py +30 -9
- sky/provision/runpod/__init__.py +2 -0
- sky/provision/runpod/instance.py +4 -3
- sky/provision/runpod/volume.py +69 -13
- sky/provision/scp/instance.py +307 -130
- sky/provision/seeweb/__init__.py +11 -0
- sky/provision/seeweb/config.py +13 -0
- sky/provision/seeweb/instance.py +812 -0
- sky/provision/shadeform/__init__.py +11 -0
- sky/provision/shadeform/config.py +12 -0
- sky/provision/shadeform/instance.py +351 -0
- sky/provision/shadeform/shadeform_utils.py +83 -0
- sky/provision/vast/instance.py +5 -3
- sky/provision/volume.py +164 -0
- sky/provision/vsphere/common/ssl_helper.py +1 -1
- sky/provision/vsphere/common/vapiconnect.py +2 -1
- sky/provision/vsphere/common/vim_utils.py +3 -2
- sky/provision/vsphere/instance.py +8 -6
- sky/provision/vsphere/vsphere_utils.py +8 -1
- sky/resources.py +11 -3
- sky/schemas/api/responses.py +107 -6
- sky/schemas/db/global_user_state/008_skylet_ssh_tunnel_metadata.py +34 -0
- sky/schemas/db/global_user_state/009_last_activity_and_launched_at.py +89 -0
- sky/schemas/db/global_user_state/010_save_ssh_key.py +66 -0
- sky/schemas/db/global_user_state/011_is_ephemeral.py +34 -0
- sky/schemas/db/kv_cache/001_initial_schema.py +29 -0
- sky/schemas/db/serve_state/002_yaml_content.py +34 -0
- sky/schemas/db/skypilot_config/001_initial_schema.py +30 -0
- sky/schemas/db/spot_jobs/002_cluster_pool.py +3 -3
- sky/schemas/db/spot_jobs/004_job_file_contents.py +42 -0
- sky/schemas/db/spot_jobs/005_logs_gc.py +38 -0
- sky/schemas/db/spot_jobs/006_controller_pid_started_at.py +34 -0
- sky/schemas/db/spot_jobs/007_config_file_content.py +34 -0
- sky/schemas/generated/jobsv1_pb2.py +86 -0
- sky/schemas/generated/jobsv1_pb2.pyi +254 -0
- sky/schemas/generated/jobsv1_pb2_grpc.py +542 -0
- sky/schemas/generated/managed_jobsv1_pb2.py +76 -0
- sky/schemas/generated/managed_jobsv1_pb2.pyi +278 -0
- sky/schemas/generated/managed_jobsv1_pb2_grpc.py +278 -0
- sky/schemas/generated/servev1_pb2.py +58 -0
- sky/schemas/generated/servev1_pb2.pyi +115 -0
- sky/schemas/generated/servev1_pb2_grpc.py +322 -0
- sky/serve/autoscalers.py +2 -0
- sky/serve/client/impl.py +55 -21
- sky/serve/constants.py +4 -3
- sky/serve/controller.py +17 -11
- sky/serve/load_balancing_policies.py +1 -1
- sky/serve/replica_managers.py +219 -142
- sky/serve/serve_rpc_utils.py +179 -0
- sky/serve/serve_state.py +63 -54
- sky/serve/serve_utils.py +145 -109
- sky/serve/server/core.py +46 -25
- sky/serve/server/impl.py +311 -162
- sky/serve/server/server.py +21 -19
- sky/serve/service.py +84 -68
- sky/serve/service_spec.py +45 -7
- sky/server/auth/loopback.py +38 -0
- sky/server/auth/oauth2_proxy.py +12 -7
- sky/server/common.py +47 -24
- sky/server/config.py +62 -28
- sky/server/constants.py +9 -1
- sky/server/daemons.py +109 -38
- sky/server/metrics.py +76 -96
- sky/server/middleware_utils.py +166 -0
- sky/server/requests/executor.py +381 -145
- sky/server/requests/payloads.py +71 -18
- sky/server/requests/preconditions.py +15 -13
- sky/server/requests/request_names.py +121 -0
- sky/server/requests/requests.py +507 -157
- sky/server/requests/serializers/decoders.py +48 -17
- sky/server/requests/serializers/encoders.py +85 -20
- sky/server/requests/threads.py +117 -0
- sky/server/rest.py +116 -24
- sky/server/server.py +420 -172
- sky/server/stream_utils.py +219 -45
- sky/server/uvicorn.py +30 -19
- sky/setup_files/MANIFEST.in +6 -1
- sky/setup_files/alembic.ini +8 -0
- sky/setup_files/dependencies.py +62 -19
- sky/setup_files/setup.py +44 -44
- sky/sky_logging.py +13 -5
- sky/skylet/attempt_skylet.py +106 -24
- sky/skylet/configs.py +3 -1
- sky/skylet/constants.py +111 -26
- sky/skylet/events.py +64 -10
- sky/skylet/job_lib.py +141 -104
- sky/skylet/log_lib.py +233 -5
- sky/skylet/log_lib.pyi +40 -2
- sky/skylet/providers/ibm/node_provider.py +12 -8
- sky/skylet/providers/ibm/vpc_provider.py +13 -12
- sky/skylet/runtime_utils.py +21 -0
- sky/skylet/services.py +524 -0
- sky/skylet/skylet.py +22 -1
- sky/skylet/subprocess_daemon.py +104 -29
- sky/skypilot_config.py +99 -79
- sky/ssh_node_pools/server.py +9 -8
- sky/task.py +221 -104
- sky/templates/aws-ray.yml.j2 +1 -0
- sky/templates/azure-ray.yml.j2 +1 -0
- sky/templates/cudo-ray.yml.j2 +1 -0
- sky/templates/do-ray.yml.j2 +1 -0
- sky/templates/fluidstack-ray.yml.j2 +1 -0
- sky/templates/gcp-ray.yml.j2 +1 -0
- sky/templates/hyperbolic-ray.yml.j2 +1 -0
- sky/templates/ibm-ray.yml.j2 +2 -1
- sky/templates/jobs-controller.yaml.j2 +3 -0
- sky/templates/kubernetes-ray.yml.j2 +196 -55
- sky/templates/lambda-ray.yml.j2 +1 -0
- sky/templates/nebius-ray.yml.j2 +3 -0
- sky/templates/oci-ray.yml.j2 +1 -0
- sky/templates/paperspace-ray.yml.j2 +1 -0
- sky/templates/primeintellect-ray.yml.j2 +72 -0
- sky/templates/runpod-ray.yml.j2 +1 -0
- sky/templates/scp-ray.yml.j2 +1 -0
- sky/templates/seeweb-ray.yml.j2 +171 -0
- sky/templates/shadeform-ray.yml.j2 +73 -0
- sky/templates/vast-ray.yml.j2 +1 -0
- sky/templates/vsphere-ray.yml.j2 +1 -0
- sky/templates/websocket_proxy.py +188 -43
- sky/usage/usage_lib.py +16 -4
- sky/users/permission.py +60 -43
- sky/utils/accelerator_registry.py +6 -3
- sky/utils/admin_policy_utils.py +18 -5
- sky/utils/annotations.py +22 -0
- sky/utils/asyncio_utils.py +78 -0
- sky/utils/atomic.py +1 -1
- sky/utils/auth_utils.py +153 -0
- sky/utils/cli_utils/status_utils.py +12 -7
- sky/utils/cluster_utils.py +28 -6
- sky/utils/command_runner.py +88 -27
- sky/utils/command_runner.pyi +36 -3
- sky/utils/common.py +3 -1
- sky/utils/common_utils.py +37 -4
- sky/utils/config_utils.py +1 -14
- sky/utils/context.py +127 -40
- sky/utils/context_utils.py +73 -18
- sky/utils/controller_utils.py +229 -70
- sky/utils/db/db_utils.py +95 -18
- sky/utils/db/kv_cache.py +149 -0
- sky/utils/db/migration_utils.py +24 -7
- sky/utils/env_options.py +4 -0
- sky/utils/git.py +559 -1
- sky/utils/kubernetes/create_cluster.sh +15 -30
- sky/utils/kubernetes/delete_cluster.sh +10 -7
- sky/utils/kubernetes/{deploy_remote_cluster.py → deploy_ssh_node_pools.py} +258 -380
- sky/utils/kubernetes/generate_kind_config.py +6 -66
- sky/utils/kubernetes/gpu_labeler.py +13 -3
- sky/utils/kubernetes/k8s_gpu_labeler_job.yaml +2 -1
- sky/utils/kubernetes/k8s_gpu_labeler_setup.yaml +16 -16
- sky/utils/kubernetes/kubernetes_deploy_utils.py +213 -194
- sky/utils/kubernetes/rsync_helper.sh +11 -3
- sky/utils/kubernetes_enums.py +7 -15
- sky/utils/lock_events.py +4 -4
- sky/utils/locks.py +128 -31
- sky/utils/log_utils.py +0 -319
- sky/utils/resource_checker.py +13 -10
- sky/utils/resources_utils.py +53 -29
- sky/utils/rich_utils.py +8 -4
- sky/utils/schemas.py +107 -52
- sky/utils/subprocess_utils.py +17 -4
- sky/utils/thread_utils.py +91 -0
- sky/utils/timeline.py +2 -1
- sky/utils/ux_utils.py +35 -1
- sky/utils/volume.py +88 -4
- sky/utils/yaml_utils.py +9 -0
- sky/volumes/client/sdk.py +48 -10
- sky/volumes/server/core.py +59 -22
- sky/volumes/server/server.py +46 -17
- sky/volumes/volume.py +54 -42
- sky/workspaces/core.py +57 -21
- sky/workspaces/server.py +13 -12
- sky_templates/README.md +3 -0
- sky_templates/__init__.py +3 -0
- sky_templates/ray/__init__.py +0 -0
- sky_templates/ray/start_cluster +183 -0
- sky_templates/ray/stop_cluster +75 -0
- {skypilot_nightly-1.0.0.dev20250905.dist-info → skypilot_nightly-1.0.0.dev20251203.dist-info}/METADATA +331 -65
- skypilot_nightly-1.0.0.dev20251203.dist-info/RECORD +611 -0
- skypilot_nightly-1.0.0.dev20251203.dist-info/top_level.txt +2 -0
- sky/client/cli/git.py +0 -549
- sky/dashboard/out/_next/static/chunks/1121-408ed10b2f9fce17.js +0 -1
- sky/dashboard/out/_next/static/chunks/1141-943efc7aff0f0c06.js +0 -1
- sky/dashboard/out/_next/static/chunks/1836-37fede578e2da5f8.js +0 -40
- sky/dashboard/out/_next/static/chunks/3015-86cabed5d4669ad0.js +0 -1
- sky/dashboard/out/_next/static/chunks/3294.c80326aec9bfed40.js +0 -6
- sky/dashboard/out/_next/static/chunks/3785.4872a2f3aa489880.js +0 -1
- sky/dashboard/out/_next/static/chunks/4045.b30465273dc5e468.js +0 -21
- sky/dashboard/out/_next/static/chunks/4676-9da7fdbde90b5549.js +0 -10
- sky/dashboard/out/_next/static/chunks/4725.10f7a9a5d3ea8208.js +0 -1
- sky/dashboard/out/_next/static/chunks/5339.3fda4a4010ff4e06.js +0 -51
- sky/dashboard/out/_next/static/chunks/6135-4b4d5e824b7f9d3c.js +0 -1
- sky/dashboard/out/_next/static/chunks/649.b9d7f7d10c1b8c53.js +0 -45
- sky/dashboard/out/_next/static/chunks/6856-dca7962af4814e1b.js +0 -1
- sky/dashboard/out/_next/static/chunks/6990-08b2a1cae076a943.js +0 -1
- sky/dashboard/out/_next/static/chunks/7325.b4bc99ce0892dcd5.js +0 -6
- sky/dashboard/out/_next/static/chunks/754-d0da8ab45f9509e9.js +0 -18
- sky/dashboard/out/_next/static/chunks/7669.1f5d9a402bf5cc42.js +0 -36
- sky/dashboard/out/_next/static/chunks/8969-0be3036bf86f8256.js +0 -1
- sky/dashboard/out/_next/static/chunks/9025.c12318fb6a1a9093.js +0 -6
- sky/dashboard/out/_next/static/chunks/9037-fa1737818d0a0969.js +0 -6
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-1cbba24bd1bd35f8.js +0 -16
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-0b4b35dc1dfe046c.js +0 -16
- sky/dashboard/out/_next/static/chunks/pages/clusters-469814d711d63b1b.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-dd64309c3fe67ed2.js +0 -11
- sky/dashboard/out/_next/static/chunks/pages/jobs/pools/[pool]-07349868f7905d37.js +0 -16
- sky/dashboard/out/_next/static/chunks/pages/jobs-1f70d9faa564804f.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/users-018bf31cda52e11b.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/volumes-739726d6b823f532.js +0 -1
- sky/dashboard/out/_next/static/chunks/webpack-4fe903277b57b523.js +0 -1
- sky/dashboard/out/_next/static/css/4614e06482d7309e.css +0 -3
- sky/dashboard/out/_next/static/mS-4qZPSkRuA1u-g2wQhg/_buildManifest.js +0 -1
- sky/templates/kubernetes-ssh-jump.yml.j2 +0 -94
- sky/utils/kubernetes/ssh_jump_lifecycle_manager.py +0 -191
- skypilot_nightly-1.0.0.dev20250905.dist-info/RECORD +0 -547
- skypilot_nightly-1.0.0.dev20250905.dist-info/top_level.txt +0 -1
- /sky/dashboard/out/_next/static/{mS-4qZPSkRuA1u-g2wQhg → 96_E2yl3QAiIJGOYCkSpB}/_ssgManifest.js +0 -0
- {skypilot_nightly-1.0.0.dev20250905.dist-info → skypilot_nightly-1.0.0.dev20251203.dist-info}/WHEEL +0 -0
- {skypilot_nightly-1.0.0.dev20250905.dist-info → skypilot_nightly-1.0.0.dev20251203.dist-info}/entry_points.txt +0 -0
- {skypilot_nightly-1.0.0.dev20250905.dist-info → skypilot_nightly-1.0.0.dev20251203.dist-info}/licenses/LICENSE +0 -0
sky/provision/scp/instance.py
CHANGED
|
@@ -1,5 +1,9 @@
|
|
|
1
1
|
"""SCP instance provisioning."""
|
|
2
2
|
|
|
3
|
+
from concurrent.futures import as_completed
|
|
4
|
+
from concurrent.futures import ThreadPoolExecutor
|
|
5
|
+
from copy import deepcopy
|
|
6
|
+
import hashlib
|
|
3
7
|
import logging
|
|
4
8
|
import random
|
|
5
9
|
import string
|
|
@@ -13,25 +17,29 @@ from sky.utils import status_lib
|
|
|
13
17
|
logger = logging.getLogger(__name__)
|
|
14
18
|
|
|
15
19
|
|
|
16
|
-
def run_instances(region: str, cluster_name_on_cloud: str,
|
|
20
|
+
def run_instances(region: str, cluster_name: str, cluster_name_on_cloud: str,
|
|
17
21
|
config: common.ProvisionConfig) -> common.ProvisionRecord:
|
|
18
|
-
|
|
22
|
+
del cluster_name # unused
|
|
19
23
|
zone_id = config.node_config['zone_id']
|
|
24
|
+
|
|
20
25
|
running_instances = _filter_instances(cluster_name_on_cloud, ['RUNNING'])
|
|
21
|
-
head_instance_id = _get_head_instance_id(running_instances)
|
|
22
26
|
|
|
23
27
|
to_start_count = config.count - len(running_instances)
|
|
28
|
+
|
|
24
29
|
if to_start_count < 0:
|
|
25
30
|
raise RuntimeError(
|
|
26
31
|
f'Cluster {cluster_name_on_cloud} already has '
|
|
27
|
-
f'{len(running_instances)}
|
|
32
|
+
f'{len(running_instances)} instances, but {config.count} '
|
|
33
|
+
'are required')
|
|
28
34
|
|
|
29
35
|
if to_start_count == 0:
|
|
36
|
+
head_instance_id = _get_head_instance_id(running_instances)
|
|
30
37
|
if head_instance_id is None:
|
|
31
38
|
raise RuntimeError(
|
|
32
|
-
f'Cluster {cluster_name_on_cloud} has no head
|
|
33
|
-
logger.info(
|
|
34
|
-
|
|
39
|
+
f'Cluster {cluster_name_on_cloud} has no head instance')
|
|
40
|
+
logger.info(
|
|
41
|
+
f'Cluster {cluster_name_on_cloud} already has '
|
|
42
|
+
f'{len(running_instances)} instances, no need to start more')
|
|
35
43
|
return common.ProvisionRecord(provider_name='scp',
|
|
36
44
|
cluster_name=cluster_name_on_cloud,
|
|
37
45
|
region=region,
|
|
@@ -40,64 +48,192 @@ def run_instances(region: str, cluster_name_on_cloud: str,
|
|
|
40
48
|
resumed_instance_ids=[],
|
|
41
49
|
created_instance_ids=[])
|
|
42
50
|
|
|
43
|
-
|
|
44
|
-
|
|
45
|
-
|
|
46
|
-
|
|
51
|
+
existing_instances = _filter_instances(cluster_name_on_cloud, None)
|
|
52
|
+
stopped_instances = _filter_instances(cluster_name_on_cloud,
|
|
53
|
+
['STOPPED', 'STOPPING'])
|
|
54
|
+
|
|
55
|
+
def _detect_naming_version(existing_instances,
|
|
56
|
+
cluster_name_on_cloud) -> str:
|
|
57
|
+
v2_head = _head(cluster_name_on_cloud)
|
|
58
|
+
v2_worker_prefix = _worker(cluster_name_on_cloud)
|
|
59
|
+
has_v2 = any(instance['virtualServerName'] == v2_head or
|
|
60
|
+
instance['virtualServerName'].startswith(v2_worker_prefix)
|
|
61
|
+
for instance in existing_instances)
|
|
62
|
+
if has_v2:
|
|
63
|
+
return 'v2'
|
|
64
|
+
has_v1 = any(instance['virtualServerName'] == cluster_name_on_cloud
|
|
65
|
+
for instance in existing_instances)
|
|
66
|
+
if has_v1:
|
|
67
|
+
return 'v1'
|
|
68
|
+
|
|
69
|
+
if not existing_instances:
|
|
70
|
+
logger.debug(
|
|
71
|
+
'detect_naming_version: no instances for cluster %s; '
|
|
72
|
+
'defaulting to v2.', cluster_name_on_cloud)
|
|
73
|
+
else:
|
|
74
|
+
logger.error(
|
|
75
|
+
'detect_naming_version: unexpected instance names for cluster '
|
|
76
|
+
'%s: %s; defaulting to v2.', cluster_name_on_cloud, [
|
|
77
|
+
instance['virtualServerName']
|
|
78
|
+
for instance in existing_instances
|
|
79
|
+
])
|
|
80
|
+
return 'v2'
|
|
81
|
+
|
|
82
|
+
naming_version = _detect_naming_version(existing_instances,
|
|
83
|
+
cluster_name_on_cloud)
|
|
84
|
+
|
|
85
|
+
if naming_version == 'v2':
|
|
86
|
+
cluster_instance_names = [_head(cluster_name_on_cloud)] + [
|
|
87
|
+
f'{_worker(cluster_name_on_cloud)}-{i:02d}'
|
|
88
|
+
for i in range(1, config.count)
|
|
89
|
+
]
|
|
90
|
+
else:
|
|
91
|
+
if config.count > 1:
|
|
92
|
+
raise RuntimeError(
|
|
93
|
+
'This cluster uses the legacy naming scheme and cannot be '
|
|
94
|
+
'scaled to multi-node automatically. '
|
|
95
|
+
'Please `sky down` and relaunch.')
|
|
96
|
+
cluster_instance_names = [cluster_name_on_cloud]
|
|
97
|
+
|
|
98
|
+
existing_instance_names = [
|
|
99
|
+
instance['virtualServerName'] for instance in existing_instances
|
|
100
|
+
]
|
|
101
|
+
resume_instance_names = [
|
|
102
|
+
instance['virtualServerName'] for instance in stopped_instances
|
|
103
|
+
]
|
|
104
|
+
create_instance_names = [
|
|
105
|
+
instance_name for instance_name in cluster_instance_names
|
|
106
|
+
if instance_name not in existing_instance_names
|
|
107
|
+
]
|
|
108
|
+
|
|
109
|
+
vpc_subnets = _get_or_create_vpc_subnets(zone_id)
|
|
110
|
+
|
|
111
|
+
def _resume(instance_name):
|
|
112
|
+
instance_id = _get_instance_id(instance_name, cluster_name_on_cloud)
|
|
47
113
|
while True:
|
|
48
|
-
|
|
49
|
-
|
|
50
|
-
if
|
|
114
|
+
state = scp_utils.SCPClient().get_instance_info(
|
|
115
|
+
instance_id)['virtualServerState']
|
|
116
|
+
if state == 'RUNNING':
|
|
117
|
+
return instance_id, 'resumed'
|
|
118
|
+
if state == 'STOPPED':
|
|
51
119
|
break
|
|
52
120
|
time.sleep(2)
|
|
53
|
-
resumed_instance_ids = [head_instance_id]
|
|
54
|
-
return common.ProvisionRecord(provider_name='scp',
|
|
55
|
-
cluster_name=cluster_name_on_cloud,
|
|
56
|
-
region=region,
|
|
57
|
-
zone=None,
|
|
58
|
-
head_instance_id=head_instance_id,
|
|
59
|
-
resumed_instance_ids=resumed_instance_ids,
|
|
60
|
-
created_instance_ids=[])
|
|
61
121
|
|
|
62
|
-
|
|
63
|
-
|
|
64
|
-
|
|
122
|
+
scp_utils.SCPClient().start_instance(instance_id)
|
|
123
|
+
while True:
|
|
124
|
+
info = scp_utils.SCPClient().get_instance_info(instance_id)
|
|
125
|
+
if info['virtualServerState'] == 'RUNNING':
|
|
126
|
+
return instance_id, 'resumed'
|
|
127
|
+
time.sleep(2)
|
|
65
128
|
|
|
66
|
-
|
|
67
|
-
|
|
68
|
-
|
|
69
|
-
|
|
70
|
-
|
|
71
|
-
|
|
72
|
-
|
|
73
|
-
|
|
74
|
-
|
|
75
|
-
|
|
76
|
-
|
|
77
|
-
|
|
78
|
-
|
|
79
|
-
|
|
80
|
-
|
|
81
|
-
|
|
82
|
-
|
|
129
|
+
def _create(instance_name):
|
|
130
|
+
instance_config = deepcopy(config.docker_config)
|
|
131
|
+
instance_config['virtualServerName'] = instance_name
|
|
132
|
+
cnt = config.count
|
|
133
|
+
|
|
134
|
+
for vpc, subnets in vpc_subnets.items():
|
|
135
|
+
sg_id = _create_security_group(zone_id, vpc, cnt)
|
|
136
|
+
if not sg_id:
|
|
137
|
+
continue
|
|
138
|
+
|
|
139
|
+
created_in_this_vpc = False
|
|
140
|
+
try:
|
|
141
|
+
instance_config['securityGroupIds'] = [sg_id]
|
|
142
|
+
for subnet in subnets:
|
|
143
|
+
instance_config['nic']['subnetId'] = subnet
|
|
144
|
+
instance_id = _create_instance(vpc, instance_config, cnt)
|
|
145
|
+
if instance_id:
|
|
146
|
+
created_in_this_vpc = True
|
|
147
|
+
return instance_id, 'created'
|
|
148
|
+
except Exception as e: # pylint: disable=broad-except
|
|
149
|
+
logger.error(f'run_instances error ({instance_name}): {e}')
|
|
150
|
+
finally:
|
|
151
|
+
if not created_in_this_vpc:
|
|
152
|
+
try:
|
|
153
|
+
_delete_security_group(sg_id)
|
|
154
|
+
except Exception: # pylint: disable=broad-except
|
|
155
|
+
pass
|
|
156
|
+
|
|
157
|
+
raise RuntimeError(f'instance creation error: {instance_name}')
|
|
158
|
+
|
|
159
|
+
tasks = (
|
|
160
|
+
[(_resume, instance_name) for instance_name in resume_instance_names] +
|
|
161
|
+
[(_create, instance_name) for instance_name in create_instance_names])
|
|
162
|
+
|
|
163
|
+
instance_ids_statuses = []
|
|
164
|
+
if tasks:
|
|
165
|
+
with ThreadPoolExecutor(max_workers=min(len(tasks), 32)) as ex:
|
|
166
|
+
execution = [
|
|
167
|
+
ex.submit(function, instance_name)
|
|
168
|
+
for function, instance_name in tasks
|
|
169
|
+
]
|
|
170
|
+
for e in as_completed(execution):
|
|
171
|
+
try:
|
|
172
|
+
instance_ids_statuses.append(e.result())
|
|
173
|
+
except Exception as e: # pylint: disable=broad-except
|
|
174
|
+
logger.error(f'run_instances error: {e}')
|
|
175
|
+
|
|
176
|
+
wait_time = time.time() + 600
|
|
177
|
+
while time.time() < wait_time:
|
|
178
|
+
running_instances = _filter_instances(cluster_name_on_cloud,
|
|
179
|
+
['RUNNING'])
|
|
180
|
+
if len(running_instances) == config.count:
|
|
181
|
+
break
|
|
182
|
+
pending_instances = _filter_instances(
|
|
183
|
+
cluster_name_on_cloud,
|
|
184
|
+
['CREATING', 'EDITING', 'STARTING', 'RESTARTING', 'STOPPING'])
|
|
185
|
+
if not pending_instances:
|
|
186
|
+
break
|
|
187
|
+
time.sleep(3)
|
|
83
188
|
|
|
84
|
-
|
|
85
|
-
|
|
189
|
+
running_instances = _filter_instances(cluster_name_on_cloud, ['RUNNING'])
|
|
190
|
+
if len(running_instances) != config.count:
|
|
191
|
+
raise RuntimeError(f'Expected {config.count} running instances, '
|
|
192
|
+
f'but got {len(running_instances)} instances')
|
|
86
193
|
|
|
194
|
+
head_instance_id = _get_head_instance_id(running_instances)
|
|
87
195
|
if head_instance_id is None:
|
|
88
|
-
|
|
196
|
+
raise RuntimeError('Head instance is not running')
|
|
89
197
|
|
|
90
|
-
|
|
198
|
+
resumed_instance_ids = []
|
|
199
|
+
created_instance_ids = []
|
|
200
|
+
for instance_id, status in instance_ids_statuses:
|
|
201
|
+
if status == 'resumed':
|
|
202
|
+
resumed_instance_ids.append(instance_id)
|
|
203
|
+
elif status == 'created':
|
|
204
|
+
created_instance_ids.append(instance_id)
|
|
91
205
|
|
|
92
206
|
return common.ProvisionRecord(provider_name='scp',
|
|
93
207
|
cluster_name=cluster_name_on_cloud,
|
|
94
208
|
region=region,
|
|
95
209
|
zone=None,
|
|
96
210
|
head_instance_id=head_instance_id,
|
|
97
|
-
resumed_instance_ids=
|
|
211
|
+
resumed_instance_ids=resumed_instance_ids,
|
|
98
212
|
created_instance_ids=created_instance_ids)
|
|
99
213
|
|
|
100
214
|
|
|
215
|
+
def _head(cluster_name_on_cloud: str):
|
|
216
|
+
return (f'{cluster_name_on_cloud[:8]}-'
|
|
217
|
+
f'{_suffix(cluster_name_on_cloud)}-head')
|
|
218
|
+
|
|
219
|
+
|
|
220
|
+
def _worker(cluster_name_on_cloud: str):
|
|
221
|
+
return (f'{cluster_name_on_cloud[:8]}-'
|
|
222
|
+
f'{_suffix(cluster_name_on_cloud)}-worker')
|
|
223
|
+
|
|
224
|
+
|
|
225
|
+
def _suffix(name: str, n: int = 5):
|
|
226
|
+
return hashlib.sha1(name.encode()).hexdigest()[:n]
|
|
227
|
+
|
|
228
|
+
|
|
229
|
+
def _get_instance_id(instance_name, cluster_name_on_cloud):
|
|
230
|
+
instances = _filter_instances(cluster_name_on_cloud, None)
|
|
231
|
+
for instance in instances:
|
|
232
|
+
if instance_name == instance['virtualServerName']:
|
|
233
|
+
return instance['virtualServerId']
|
|
234
|
+
return None
|
|
235
|
+
|
|
236
|
+
|
|
101
237
|
def _get_or_create_vpc_subnets(zone_id):
|
|
102
238
|
while len(_get_vcp_subnets(zone_id)) == 0:
|
|
103
239
|
try:
|
|
@@ -182,28 +318,36 @@ def _get_vcp_subnets(zone_id):
|
|
|
182
318
|
def _filter_instances(cluster_name_on_cloud,
|
|
183
319
|
status_filter: Optional[List[str]]):
|
|
184
320
|
instances = scp_utils.SCPClient().get_instances()
|
|
185
|
-
|
|
186
|
-
|
|
187
|
-
|
|
188
|
-
|
|
189
|
-
|
|
190
|
-
|
|
191
|
-
|
|
192
|
-
|
|
193
|
-
|
|
194
|
-
|
|
321
|
+
v2_head_instance_name = _head(cluster_name_on_cloud)
|
|
322
|
+
v2_worker_prefix = _worker(cluster_name_on_cloud)
|
|
323
|
+
v1_head_instance_name = cluster_name_on_cloud
|
|
324
|
+
|
|
325
|
+
cluster_instances = [
|
|
326
|
+
instance for instance in instances
|
|
327
|
+
if instance['virtualServerName'] == v2_head_instance_name or
|
|
328
|
+
instance['virtualServerName'].startswith(v2_worker_prefix) or
|
|
329
|
+
instance['virtualServerName'] == v1_head_instance_name
|
|
330
|
+
]
|
|
331
|
+
|
|
332
|
+
if status_filter is None:
|
|
333
|
+
return cluster_instances
|
|
334
|
+
return [
|
|
335
|
+
instance for instance in cluster_instances
|
|
336
|
+
if instance['virtualServerState'] in status_filter
|
|
337
|
+
]
|
|
195
338
|
|
|
196
339
|
|
|
197
340
|
def _get_head_instance_id(instances):
|
|
198
|
-
head_instance_id = None
|
|
199
341
|
if len(instances) > 0:
|
|
200
|
-
|
|
201
|
-
|
|
342
|
+
for instance in instances:
|
|
343
|
+
if instance['virtualServerName'].endswith('-head'):
|
|
344
|
+
return instance['virtualServerId']
|
|
345
|
+
return instances[0]['virtualServerId']
|
|
346
|
+
return None
|
|
202
347
|
|
|
203
348
|
|
|
204
|
-
def _create_security_group(zone_id, vpc):
|
|
349
|
+
def _create_security_group(zone_id, vpc, cnt):
|
|
205
350
|
sg_name = 'sky' + ''.join(random.choices(string.ascii_lowercase, k=8))
|
|
206
|
-
|
|
207
351
|
undo_func_stack = []
|
|
208
352
|
try:
|
|
209
353
|
response = scp_utils.SCPClient().create_security_group(
|
|
@@ -222,8 +366,8 @@ def _create_security_group(zone_id, vpc):
|
|
|
222
366
|
break
|
|
223
367
|
time.sleep(5)
|
|
224
368
|
|
|
225
|
-
scp_utils.SCPClient().add_security_group_rule(sg_id, 'IN', None)
|
|
226
|
-
scp_utils.SCPClient().add_security_group_rule(sg_id, 'OUT', None)
|
|
369
|
+
scp_utils.SCPClient().add_security_group_rule(sg_id, 'IN', None, cnt)
|
|
370
|
+
scp_utils.SCPClient().add_security_group_rule(sg_id, 'OUT', None, cnt)
|
|
227
371
|
|
|
228
372
|
return sg_id
|
|
229
373
|
except Exception as e: # pylint: disable=broad-except
|
|
@@ -252,7 +396,7 @@ def _undo_functions(undo_func_list):
|
|
|
252
396
|
func()
|
|
253
397
|
|
|
254
398
|
|
|
255
|
-
def _create_instance(vpc_id, instance_config):
|
|
399
|
+
def _create_instance(vpc_id, instance_config, cnt):
|
|
256
400
|
undo_func_stack = []
|
|
257
401
|
try:
|
|
258
402
|
instance = scp_utils.SCPClient().create_instance(instance_config)
|
|
@@ -265,10 +409,12 @@ def _create_instance(vpc_id, instance_config):
|
|
|
265
409
|
undo_func_stack.append(lambda: _delete_instance(instance_id))
|
|
266
410
|
firewall_id = _get_firewall_id(vpc_id)
|
|
267
411
|
internal_ip = instance_info['ip']
|
|
268
|
-
in_rule_id = _add_firewall_rule(firewall_id, internal_ip, 'IN', None
|
|
412
|
+
in_rule_id = _add_firewall_rule(firewall_id, internal_ip, 'IN', None,
|
|
413
|
+
cnt)
|
|
269
414
|
undo_func_stack.append(
|
|
270
415
|
lambda: _delete_firewall_rule(firewall_id, in_rule_id))
|
|
271
|
-
out_rule_id = _add_firewall_rule(firewall_id, internal_ip, 'OUT', None
|
|
416
|
+
out_rule_id = _add_firewall_rule(firewall_id, internal_ip, 'OUT', None,
|
|
417
|
+
cnt)
|
|
272
418
|
undo_func_stack.append(
|
|
273
419
|
lambda: _delete_firewall_rule(firewall_id, out_rule_id))
|
|
274
420
|
return instance_id
|
|
@@ -305,20 +451,22 @@ def _get_firewall_id(vpc_id):
|
|
|
305
451
|
|
|
306
452
|
|
|
307
453
|
def _add_firewall_rule(firewall_id, internal_ip, direction,
|
|
308
|
-
ports: Optional[List[str]]):
|
|
454
|
+
ports: Optional[List[str]], cnt: Optional[int]):
|
|
309
455
|
attempts = 0
|
|
310
456
|
max_attempts = 300
|
|
311
|
-
|
|
312
457
|
while attempts < max_attempts:
|
|
313
458
|
try:
|
|
314
459
|
rule_info = scp_utils.SCPClient().add_firewall_rule(
|
|
315
|
-
firewall_id, internal_ip, direction, ports)
|
|
316
|
-
|
|
317
|
-
|
|
318
|
-
|
|
319
|
-
|
|
320
|
-
|
|
321
|
-
|
|
460
|
+
firewall_id, internal_ip, direction, ports, cnt)
|
|
461
|
+
if rule_info is not None:
|
|
462
|
+
rule_id = rule_info['resourceId']
|
|
463
|
+
while True:
|
|
464
|
+
rule_info = scp_utils.SCPClient().get_firewall_rule_info(
|
|
465
|
+
firewall_id, rule_id)
|
|
466
|
+
if rule_info['ruleState'] == 'ACTIVE':
|
|
467
|
+
return rule_id
|
|
468
|
+
else:
|
|
469
|
+
return None
|
|
322
470
|
except Exception as e: # pylint: disable=broad-except
|
|
323
471
|
attempts += 1
|
|
324
472
|
time.sleep(10)
|
|
@@ -330,13 +478,12 @@ def _add_firewall_rule(firewall_id, internal_ip, direction,
|
|
|
330
478
|
def _delete_firewall_rule(firewall_id, rule_ids):
|
|
331
479
|
if not isinstance(rule_ids, list):
|
|
332
480
|
rule_ids = [rule_ids]
|
|
333
|
-
|
|
334
481
|
attempts = 0
|
|
335
482
|
max_attempts = 300
|
|
336
483
|
while attempts < max_attempts:
|
|
337
484
|
try:
|
|
338
485
|
scp_utils.SCPClient().delete_firewall_rule(firewall_id, rule_ids)
|
|
339
|
-
if _remaining_firewall_rule(firewall_id, rule_ids)
|
|
486
|
+
if not _remaining_firewall_rule(firewall_id, rule_ids):
|
|
340
487
|
return
|
|
341
488
|
except Exception as e: # pylint: disable=broad-except
|
|
342
489
|
attempts += 1
|
|
@@ -385,19 +532,35 @@ def stop_instances(
|
|
|
385
532
|
provider_config: Optional[Dict[str, Any]] = None,
|
|
386
533
|
worker_only: bool = False,
|
|
387
534
|
) -> None:
|
|
388
|
-
del provider_config
|
|
389
|
-
instances =
|
|
535
|
+
del provider_config
|
|
536
|
+
instances = _filter_instances(cluster_name_on_cloud, ['RUNNING'])
|
|
390
537
|
|
|
391
|
-
|
|
392
|
-
|
|
538
|
+
if worker_only:
|
|
539
|
+
head_instance_name = _head(cluster_name_on_cloud)
|
|
540
|
+
instances = [
|
|
541
|
+
instance for instance in instances
|
|
542
|
+
if instance['virtualServerName'] != head_instance_name
|
|
543
|
+
]
|
|
544
|
+
|
|
545
|
+
if not instances:
|
|
546
|
+
return
|
|
547
|
+
|
|
548
|
+
def _stop(instance):
|
|
549
|
+
try:
|
|
393
550
|
instance_id = instance['virtualServerId']
|
|
394
551
|
scp_utils.SCPClient().stop_instance(instance_id)
|
|
395
552
|
while True:
|
|
396
|
-
|
|
397
|
-
|
|
553
|
+
info = scp_utils.SCPClient().get_instance_info(instance_id)
|
|
554
|
+
if info['virtualServerState'] == 'STOPPED':
|
|
555
|
+
return instance_id
|
|
398
556
|
time.sleep(2)
|
|
399
|
-
|
|
400
|
-
|
|
557
|
+
except Exception as e: # pylint: disable=broad-except
|
|
558
|
+
logger.error(f'stop_instances error: {e}')
|
|
559
|
+
|
|
560
|
+
with ThreadPoolExecutor(max_workers=min(len(instances), 32)) as ex:
|
|
561
|
+
execution = [ex.submit(_stop, instance) for instance in instances]
|
|
562
|
+
for e in as_completed(execution):
|
|
563
|
+
e.result()
|
|
401
564
|
|
|
402
565
|
|
|
403
566
|
def terminate_instances(
|
|
@@ -405,25 +568,37 @@ def terminate_instances(
|
|
|
405
568
|
provider_config: Optional[Dict[str, Any]] = None,
|
|
406
569
|
worker_only: bool = False,
|
|
407
570
|
) -> None:
|
|
408
|
-
del provider_config
|
|
409
|
-
instances =
|
|
571
|
+
del provider_config
|
|
572
|
+
instances = _filter_instances(cluster_name_on_cloud, ['RUNNING', 'STOPPED'])
|
|
410
573
|
|
|
411
|
-
|
|
412
|
-
|
|
413
|
-
|
|
414
|
-
|
|
415
|
-
|
|
416
|
-
|
|
417
|
-
|
|
418
|
-
|
|
419
|
-
|
|
420
|
-
|
|
421
|
-
|
|
422
|
-
|
|
423
|
-
|
|
424
|
-
|
|
425
|
-
|
|
426
|
-
|
|
574
|
+
if worker_only:
|
|
575
|
+
head_instance_name = _head(cluster_name_on_cloud)
|
|
576
|
+
instances = [
|
|
577
|
+
instance for instance in instances
|
|
578
|
+
if instance['virtualServerName'] != head_instance_name
|
|
579
|
+
]
|
|
580
|
+
|
|
581
|
+
if not instances:
|
|
582
|
+
return
|
|
583
|
+
|
|
584
|
+
def _terminate(instance):
|
|
585
|
+
try:
|
|
586
|
+
instance_id = instance['virtualServerId']
|
|
587
|
+
instance_info = scp_utils.SCPClient().get_instance_info(instance_id)
|
|
588
|
+
vpc_id = instance_info['vpcId']
|
|
589
|
+
sg_id = instance_info['securityGroupIds'][0]['securityGroupId']
|
|
590
|
+
firewall_id = _get_firewall_id(vpc_id)
|
|
591
|
+
rule_ids = _get_firewall_rule_ids(instance_info, firewall_id, None)
|
|
592
|
+
_delete_firewall_rule(firewall_id, rule_ids)
|
|
593
|
+
_delete_instance(instance_id)
|
|
594
|
+
_delete_security_group(sg_id)
|
|
595
|
+
except Exception as e: # pylint: disable=broad-except
|
|
596
|
+
logger.error(f'terminate_instances error: {e}')
|
|
597
|
+
|
|
598
|
+
with ThreadPoolExecutor(max_workers=min(len(instances), 32)) as ex:
|
|
599
|
+
execution = [ex.submit(_terminate, instance) for instance in instances]
|
|
600
|
+
for e in as_completed(execution):
|
|
601
|
+
e.result()
|
|
427
602
|
|
|
428
603
|
|
|
429
604
|
def query_instances(
|
|
@@ -431,8 +606,9 @@ def query_instances(
|
|
|
431
606
|
cluster_name_on_cloud: str,
|
|
432
607
|
provider_config: Optional[Dict[str, Any]] = None,
|
|
433
608
|
non_terminated_only: bool = True,
|
|
609
|
+
retry_if_missing: bool = False,
|
|
434
610
|
) -> Dict[str, Tuple[Optional['status_lib.ClusterStatus'], Optional[str]]]:
|
|
435
|
-
del cluster_name # unused
|
|
611
|
+
del cluster_name, retry_if_missing # unused
|
|
436
612
|
assert provider_config is not None, (cluster_name_on_cloud, provider_config)
|
|
437
613
|
instances = _filter_instances(cluster_name_on_cloud, None)
|
|
438
614
|
|
|
@@ -467,7 +643,6 @@ def get_cluster_info(
|
|
|
467
643
|
cluster_name_on_cloud: str,
|
|
468
644
|
provider_config: Optional[Dict[str, Any]] = None) -> common.ClusterInfo:
|
|
469
645
|
del region
|
|
470
|
-
|
|
471
646
|
running_instances = _filter_instances(cluster_name_on_cloud, ['RUNNING'])
|
|
472
647
|
head_instance_id = _get_head_instance_id(running_instances)
|
|
473
648
|
|
|
@@ -482,9 +657,19 @@ def get_cluster_info(
|
|
|
482
657
|
tags={})
|
|
483
658
|
]
|
|
484
659
|
|
|
660
|
+
# max-worker-port - min-worker-port should be at least 3 * nproc
|
|
661
|
+
# RAY_worker_maximum_startup_concurrency for the performance
|
|
662
|
+
custom_ray_options = {
|
|
663
|
+
'node-manager-port': 11001,
|
|
664
|
+
'min-worker-port': 11002,
|
|
665
|
+
'max-worker-port': 11200,
|
|
666
|
+
'ray-client-server-port': 10001
|
|
667
|
+
}
|
|
668
|
+
|
|
485
669
|
return common.ClusterInfo(
|
|
486
670
|
instances=instances,
|
|
487
671
|
head_instance_id=head_instance_id,
|
|
672
|
+
custom_ray_options=custom_ray_options,
|
|
488
673
|
provider_name='scp',
|
|
489
674
|
provider_config=provider_config,
|
|
490
675
|
)
|
|
@@ -495,20 +680,16 @@ def open_ports(
|
|
|
495
680
|
ports: List[str],
|
|
496
681
|
provider_config: Optional[Dict[str, Any]] = None,
|
|
497
682
|
) -> None:
|
|
498
|
-
|
|
499
683
|
del provider_config
|
|
500
|
-
instances =
|
|
501
|
-
|
|
502
|
-
|
|
503
|
-
|
|
504
|
-
|
|
505
|
-
|
|
506
|
-
|
|
507
|
-
|
|
508
|
-
|
|
509
|
-
internal_ip = instance_info['ip']
|
|
510
|
-
firewall_id = _get_firewall_id(vpc_id)
|
|
511
|
-
_add_firewall_rule(firewall_id, internal_ip, 'IN', ports)
|
|
684
|
+
instances = _filter_instances(cluster_name_on_cloud, ['RUNNING'])
|
|
685
|
+
head_instance_id = _get_head_instance_id(instances)
|
|
686
|
+
instance_info = scp_utils.SCPClient().get_instance_info(head_instance_id)
|
|
687
|
+
sg_id = instance_info['securityGroupIds'][0]['securityGroupId']
|
|
688
|
+
scp_utils.SCPClient().add_security_group_rule(sg_id, 'IN', ports, None)
|
|
689
|
+
vpc_id = instance_info['vpcId']
|
|
690
|
+
internal_ip = instance_info['ip']
|
|
691
|
+
firewall_id = _get_firewall_id(vpc_id)
|
|
692
|
+
_add_firewall_rule(firewall_id, internal_ip, 'IN', ports, None)
|
|
512
693
|
|
|
513
694
|
|
|
514
695
|
def cleanup_ports(
|
|
@@ -516,15 +697,11 @@ def cleanup_ports(
|
|
|
516
697
|
ports: List[str],
|
|
517
698
|
provider_config: Optional[Dict[str, Any]] = None,
|
|
518
699
|
) -> None:
|
|
519
|
-
|
|
520
700
|
del provider_config
|
|
521
|
-
instances =
|
|
522
|
-
|
|
523
|
-
|
|
524
|
-
|
|
525
|
-
|
|
526
|
-
|
|
527
|
-
|
|
528
|
-
firewall_id = _get_firewall_id(vpc_id)
|
|
529
|
-
rule_ids = _get_firewall_rule_ids(instance_info, firewall_id, ports)
|
|
530
|
-
_delete_firewall_rule(firewall_id, rule_ids)
|
|
701
|
+
instances = _filter_instances(cluster_name_on_cloud, ['RUNNING'])
|
|
702
|
+
head_instance_id = _get_head_instance_id(instances)
|
|
703
|
+
instance_info = scp_utils.SCPClient().get_instance_info(head_instance_id)
|
|
704
|
+
vpc_id = instance_info['vpcId']
|
|
705
|
+
firewall_id = _get_firewall_id(vpc_id)
|
|
706
|
+
rule_ids = _get_firewall_rule_ids(instance_info, firewall_id, ports)
|
|
707
|
+
_delete_firewall_rule(firewall_id, rule_ids)
|
|
@@ -0,0 +1,11 @@
|
|
|
1
|
+
"""Seeweb provisioner for SkyPilot."""
|
|
2
|
+
|
|
3
|
+
from sky.provision.seeweb.config import bootstrap_instances
|
|
4
|
+
from sky.provision.seeweb.instance import cleanup_ports
|
|
5
|
+
from sky.provision.seeweb.instance import get_cluster_info
|
|
6
|
+
from sky.provision.seeweb.instance import open_ports
|
|
7
|
+
from sky.provision.seeweb.instance import query_instances
|
|
8
|
+
from sky.provision.seeweb.instance import run_instances
|
|
9
|
+
from sky.provision.seeweb.instance import stop_instances
|
|
10
|
+
from sky.provision.seeweb.instance import terminate_instances
|
|
11
|
+
from sky.provision.seeweb.instance import wait_instances
|
|
@@ -0,0 +1,13 @@
|
|
|
1
|
+
"""Configuration for Seeweb provisioning."""
|
|
2
|
+
|
|
3
|
+
from typing import Any, Dict
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
def bootstrap_instances(*args, **kwargs) -> Dict[str, Any]:
    """Bootstrap instances for Seeweb.

    Seeweb doesn't require any special configuration bootstrapping,
    so the provision config is returned unchanged.

    Args:
        *args: Positional arguments; the provision config is expected as
            the third one (index 2).
        **kwargs: Keyword arguments; a ``config`` entry is used when the
            config was not passed positionally. NOTE(review): the keyword
            name ``config`` is assumed -- confirm against the callers.

    Returns:
        The provision config object that was passed in, unmodified.

    Raises:
        TypeError: If no config was supplied either way.
    """
    # The positional slot wins, which keeps the original call convention.
    if len(args) > 2:
        return args[2]
    if 'config' in kwargs:
        return kwargs['config']
    raise TypeError(
        'bootstrap_instances() expects the provision config as the third '
        'positional argument or as the `config` keyword argument.')
|