skypilot-nightly 1.0.0.dev20250905__py3-none-any.whl → 1.0.0.dev20251210__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sky/__init__.py +12 -2
- sky/adaptors/aws.py +27 -22
- sky/adaptors/common.py +25 -2
- sky/adaptors/coreweave.py +278 -0
- sky/adaptors/do.py +8 -2
- sky/adaptors/gcp.py +11 -0
- sky/adaptors/ibm.py +5 -2
- sky/adaptors/kubernetes.py +64 -0
- sky/adaptors/nebius.py +3 -1
- sky/adaptors/primeintellect.py +1 -0
- sky/adaptors/seeweb.py +183 -0
- sky/adaptors/shadeform.py +89 -0
- sky/adaptors/slurm.py +478 -0
- sky/admin_policy.py +20 -0
- sky/authentication.py +157 -263
- sky/backends/__init__.py +3 -2
- sky/backends/backend.py +11 -3
- sky/backends/backend_utils.py +630 -185
- sky/backends/cloud_vm_ray_backend.py +1111 -928
- sky/backends/local_docker_backend.py +9 -5
- sky/backends/task_codegen.py +971 -0
- sky/backends/wheel_utils.py +18 -0
- sky/catalog/__init__.py +8 -3
- sky/catalog/aws_catalog.py +4 -0
- sky/catalog/common.py +19 -1
- sky/catalog/data_fetchers/fetch_aws.py +102 -80
- sky/catalog/data_fetchers/fetch_gcp.py +30 -3
- sky/catalog/data_fetchers/fetch_nebius.py +9 -6
- sky/catalog/data_fetchers/fetch_runpod.py +698 -0
- sky/catalog/data_fetchers/fetch_seeweb.py +329 -0
- sky/catalog/data_fetchers/fetch_shadeform.py +142 -0
- sky/catalog/kubernetes_catalog.py +36 -32
- sky/catalog/primeintellect_catalog.py +95 -0
- sky/catalog/runpod_catalog.py +5 -1
- sky/catalog/seeweb_catalog.py +184 -0
- sky/catalog/shadeform_catalog.py +165 -0
- sky/catalog/slurm_catalog.py +243 -0
- sky/check.py +87 -46
- sky/client/cli/command.py +1004 -434
- sky/client/cli/flags.py +4 -2
- sky/{volumes/utils.py → client/cli/table_utils.py} +111 -13
- sky/client/cli/utils.py +79 -0
- sky/client/common.py +12 -2
- sky/client/sdk.py +188 -65
- sky/client/sdk_async.py +34 -33
- sky/cloud_stores.py +82 -3
- sky/clouds/__init__.py +8 -0
- sky/clouds/aws.py +337 -129
- sky/clouds/azure.py +24 -18
- sky/clouds/cloud.py +47 -13
- sky/clouds/cudo.py +16 -13
- sky/clouds/do.py +9 -7
- sky/clouds/fluidstack.py +12 -5
- sky/clouds/gcp.py +14 -7
- sky/clouds/hyperbolic.py +12 -5
- sky/clouds/ibm.py +12 -5
- sky/clouds/kubernetes.py +80 -45
- sky/clouds/lambda_cloud.py +12 -5
- sky/clouds/nebius.py +23 -9
- sky/clouds/oci.py +19 -12
- sky/clouds/paperspace.py +4 -1
- sky/clouds/primeintellect.py +317 -0
- sky/clouds/runpod.py +85 -24
- sky/clouds/scp.py +12 -8
- sky/clouds/seeweb.py +477 -0
- sky/clouds/shadeform.py +400 -0
- sky/clouds/slurm.py +578 -0
- sky/clouds/ssh.py +6 -3
- sky/clouds/utils/scp_utils.py +61 -50
- sky/clouds/vast.py +43 -27
- sky/clouds/vsphere.py +14 -16
- sky/core.py +296 -195
- sky/dashboard/out/404.html +1 -1
- sky/dashboard/out/_next/static/KYAhEFa3FTfq4JyKVgo-s/_buildManifest.js +1 -0
- sky/dashboard/out/_next/static/chunks/1141-9c810f01ff4f398a.js +11 -0
- sky/dashboard/out/_next/static/chunks/1871-7e202677c42f43fe.js +6 -0
- sky/dashboard/out/_next/static/chunks/2260-7703229c33c5ebd5.js +1 -0
- sky/dashboard/out/_next/static/chunks/2369.fc20f0c2c8ed9fe7.js +15 -0
- sky/dashboard/out/_next/static/chunks/2755.edd818326d489a1d.js +26 -0
- sky/dashboard/out/_next/static/chunks/3294.ddda8c6c6f9f24dc.js +1 -0
- sky/dashboard/out/_next/static/chunks/3785.7e245f318f9d1121.js +1 -0
- sky/dashboard/out/_next/static/chunks/{6601-06114c982db410b6.js → 3800-b589397dc09c5b4e.js} +1 -1
- sky/dashboard/out/_next/static/chunks/3850-fd5696f3bbbaddae.js +1 -0
- sky/dashboard/out/_next/static/chunks/4725.172ede95d1b21022.js +1 -0
- sky/dashboard/out/_next/static/chunks/4937.a2baa2df5572a276.js +15 -0
- sky/dashboard/out/_next/static/chunks/6212-7bd06f60ba693125.js +13 -0
- sky/dashboard/out/_next/static/chunks/6856-da20c5fd999f319c.js +1 -0
- sky/dashboard/out/_next/static/chunks/6990-09cbf02d3cd518c3.js +1 -0
- sky/dashboard/out/_next/static/chunks/7359-c8d04e06886000b3.js +30 -0
- sky/dashboard/out/_next/static/chunks/7615-019513abc55b3b47.js +1 -0
- sky/dashboard/out/_next/static/chunks/8640.5b9475a2d18c5416.js +16 -0
- sky/dashboard/out/_next/static/chunks/8969-452f9d5cbdd2dc73.js +1 -0
- sky/dashboard/out/_next/static/chunks/9025.fa408f3242e9028d.js +6 -0
- sky/dashboard/out/_next/static/chunks/9353-8369df1cf105221c.js +1 -0
- sky/dashboard/out/_next/static/chunks/9360.a536cf6b1fa42355.js +31 -0
- sky/dashboard/out/_next/static/chunks/9847.3aaca6bb33455140.js +30 -0
- sky/dashboard/out/_next/static/chunks/pages/_app-68b647e26f9d2793.js +34 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-33f525539665fdfd.js +16 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-a7565f586ef86467.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters-9e5d47818b9bdadd.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/{config-dfb9bf07b13045f4.js → config-718cdc365de82689.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/infra/{[context]-6563820e094f68ca.js → [context]-12c559ec4d81fdbd.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/{infra-aabba60d57826e0f.js → infra-d187cd0413d72475.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-895847b6cf200b04.js +16 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs/pools/[pool]-8d0f4655400b4eb9.js +21 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs-e5a98f17f8513a96.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/plugins/[...slug]-4f46050ca065d8f8.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/users-2f7646eb77785a2c.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/volumes-ef19d49c6d0e8500.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/workspaces/{[name]-af76bb06dbb3954f.js → [name]-96e0f298308da7e2.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/{workspaces-7598c33a746cdc91.js → workspaces-cb4da3abe08ebf19.js} +1 -1
- sky/dashboard/out/_next/static/chunks/webpack-fba3de387ff6bb08.js +1 -0
- sky/dashboard/out/_next/static/css/c5a4cfd2600fc715.css +3 -0
- sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
- sky/dashboard/out/clusters/[cluster].html +1 -1
- sky/dashboard/out/clusters.html +1 -1
- sky/dashboard/out/config.html +1 -1
- sky/dashboard/out/index.html +1 -1
- sky/dashboard/out/infra/[context].html +1 -1
- sky/dashboard/out/infra.html +1 -1
- sky/dashboard/out/jobs/[job].html +1 -1
- sky/dashboard/out/jobs/pools/[pool].html +1 -1
- sky/dashboard/out/jobs.html +1 -1
- sky/dashboard/out/plugins/[...slug].html +1 -0
- sky/dashboard/out/users.html +1 -1
- sky/dashboard/out/volumes.html +1 -1
- sky/dashboard/out/workspace/new.html +1 -1
- sky/dashboard/out/workspaces/[name].html +1 -1
- sky/dashboard/out/workspaces.html +1 -1
- sky/data/data_utils.py +92 -1
- sky/data/mounting_utils.py +177 -30
- sky/data/storage.py +200 -19
- sky/data/storage_utils.py +10 -45
- sky/exceptions.py +18 -7
- sky/execution.py +74 -31
- sky/global_user_state.py +605 -191
- sky/jobs/__init__.py +2 -0
- sky/jobs/client/sdk.py +101 -4
- sky/jobs/client/sdk_async.py +31 -5
- sky/jobs/constants.py +15 -8
- sky/jobs/controller.py +726 -284
- sky/jobs/file_content_utils.py +128 -0
- sky/jobs/log_gc.py +193 -0
- sky/jobs/recovery_strategy.py +250 -100
- sky/jobs/scheduler.py +271 -173
- sky/jobs/server/core.py +367 -114
- sky/jobs/server/server.py +81 -35
- sky/jobs/server/utils.py +89 -35
- sky/jobs/state.py +1498 -620
- sky/jobs/utils.py +771 -306
- sky/logs/agent.py +40 -5
- sky/logs/aws.py +9 -19
- sky/metrics/utils.py +282 -39
- sky/models.py +2 -0
- sky/optimizer.py +7 -6
- sky/provision/__init__.py +38 -1
- sky/provision/aws/config.py +34 -13
- sky/provision/aws/instance.py +5 -2
- sky/provision/azure/instance.py +5 -3
- sky/provision/common.py +22 -0
- sky/provision/cudo/instance.py +4 -3
- sky/provision/do/instance.py +4 -3
- sky/provision/docker_utils.py +112 -28
- sky/provision/fluidstack/instance.py +6 -5
- sky/provision/gcp/config.py +6 -1
- sky/provision/gcp/instance.py +4 -2
- sky/provision/hyperbolic/instance.py +4 -2
- sky/provision/instance_setup.py +66 -20
- sky/provision/kubernetes/__init__.py +2 -0
- sky/provision/kubernetes/config.py +7 -44
- sky/provision/kubernetes/constants.py +0 -1
- sky/provision/kubernetes/instance.py +609 -213
- sky/provision/kubernetes/manifests/fusermount-server-daemonset.yaml +1 -2
- sky/provision/kubernetes/network.py +12 -8
- sky/provision/kubernetes/network_utils.py +8 -25
- sky/provision/kubernetes/utils.py +422 -422
- sky/provision/kubernetes/volume.py +150 -18
- sky/provision/lambda_cloud/instance.py +16 -13
- sky/provision/nebius/instance.py +6 -2
- sky/provision/nebius/utils.py +103 -86
- sky/provision/oci/instance.py +4 -2
- sky/provision/paperspace/instance.py +4 -3
- sky/provision/primeintellect/__init__.py +10 -0
- sky/provision/primeintellect/config.py +11 -0
- sky/provision/primeintellect/instance.py +454 -0
- sky/provision/primeintellect/utils.py +398 -0
- sky/provision/provisioner.py +45 -15
- sky/provision/runpod/__init__.py +2 -0
- sky/provision/runpod/instance.py +4 -3
- sky/provision/runpod/volume.py +69 -13
- sky/provision/scp/instance.py +307 -130
- sky/provision/seeweb/__init__.py +11 -0
- sky/provision/seeweb/config.py +13 -0
- sky/provision/seeweb/instance.py +812 -0
- sky/provision/shadeform/__init__.py +11 -0
- sky/provision/shadeform/config.py +12 -0
- sky/provision/shadeform/instance.py +351 -0
- sky/provision/shadeform/shadeform_utils.py +83 -0
- sky/provision/slurm/__init__.py +12 -0
- sky/provision/slurm/config.py +13 -0
- sky/provision/slurm/instance.py +572 -0
- sky/provision/slurm/utils.py +583 -0
- sky/provision/vast/instance.py +9 -4
- sky/provision/vast/utils.py +10 -6
- sky/provision/volume.py +164 -0
- sky/provision/vsphere/common/ssl_helper.py +1 -1
- sky/provision/vsphere/common/vapiconnect.py +2 -1
- sky/provision/vsphere/common/vim_utils.py +3 -2
- sky/provision/vsphere/instance.py +8 -6
- sky/provision/vsphere/vsphere_utils.py +8 -1
- sky/resources.py +11 -3
- sky/schemas/api/responses.py +107 -6
- sky/schemas/db/global_user_state/008_skylet_ssh_tunnel_metadata.py +34 -0
- sky/schemas/db/global_user_state/009_last_activity_and_launched_at.py +89 -0
- sky/schemas/db/global_user_state/010_save_ssh_key.py +66 -0
- sky/schemas/db/global_user_state/011_is_ephemeral.py +34 -0
- sky/schemas/db/kv_cache/001_initial_schema.py +29 -0
- sky/schemas/db/serve_state/002_yaml_content.py +34 -0
- sky/schemas/db/skypilot_config/001_initial_schema.py +30 -0
- sky/schemas/db/spot_jobs/002_cluster_pool.py +3 -3
- sky/schemas/db/spot_jobs/004_job_file_contents.py +42 -0
- sky/schemas/db/spot_jobs/005_logs_gc.py +38 -0
- sky/schemas/db/spot_jobs/006_controller_pid_started_at.py +34 -0
- sky/schemas/db/spot_jobs/007_config_file_content.py +34 -0
- sky/schemas/generated/jobsv1_pb2.py +86 -0
- sky/schemas/generated/jobsv1_pb2.pyi +254 -0
- sky/schemas/generated/jobsv1_pb2_grpc.py +542 -0
- sky/schemas/generated/managed_jobsv1_pb2.py +76 -0
- sky/schemas/generated/managed_jobsv1_pb2.pyi +278 -0
- sky/schemas/generated/managed_jobsv1_pb2_grpc.py +278 -0
- sky/schemas/generated/servev1_pb2.py +58 -0
- sky/schemas/generated/servev1_pb2.pyi +115 -0
- sky/schemas/generated/servev1_pb2_grpc.py +322 -0
- sky/serve/autoscalers.py +2 -0
- sky/serve/client/impl.py +55 -21
- sky/serve/constants.py +4 -3
- sky/serve/controller.py +17 -11
- sky/serve/load_balancing_policies.py +1 -1
- sky/serve/replica_managers.py +219 -142
- sky/serve/serve_rpc_utils.py +179 -0
- sky/serve/serve_state.py +63 -54
- sky/serve/serve_utils.py +145 -109
- sky/serve/server/core.py +46 -25
- sky/serve/server/impl.py +311 -162
- sky/serve/server/server.py +21 -19
- sky/serve/service.py +84 -68
- sky/serve/service_spec.py +45 -7
- sky/server/auth/loopback.py +38 -0
- sky/server/auth/oauth2_proxy.py +12 -7
- sky/server/common.py +47 -24
- sky/server/config.py +62 -28
- sky/server/constants.py +9 -1
- sky/server/daemons.py +109 -38
- sky/server/metrics.py +76 -96
- sky/server/middleware_utils.py +166 -0
- sky/server/plugins.py +222 -0
- sky/server/requests/executor.py +384 -145
- sky/server/requests/payloads.py +83 -19
- sky/server/requests/preconditions.py +15 -13
- sky/server/requests/request_names.py +123 -0
- sky/server/requests/requests.py +511 -157
- sky/server/requests/serializers/decoders.py +48 -17
- sky/server/requests/serializers/encoders.py +102 -20
- sky/server/requests/serializers/return_value_serializers.py +60 -0
- sky/server/requests/threads.py +117 -0
- sky/server/rest.py +116 -24
- sky/server/server.py +497 -179
- sky/server/server_utils.py +30 -0
- sky/server/stream_utils.py +219 -45
- sky/server/uvicorn.py +30 -19
- sky/setup_files/MANIFEST.in +6 -1
- sky/setup_files/alembic.ini +8 -0
- sky/setup_files/dependencies.py +64 -19
- sky/setup_files/setup.py +44 -44
- sky/sky_logging.py +13 -5
- sky/skylet/attempt_skylet.py +116 -24
- sky/skylet/configs.py +3 -1
- sky/skylet/constants.py +139 -29
- sky/skylet/events.py +74 -14
- sky/skylet/executor/__init__.py +1 -0
- sky/skylet/executor/slurm.py +189 -0
- sky/skylet/job_lib.py +143 -105
- sky/skylet/log_lib.py +252 -8
- sky/skylet/log_lib.pyi +47 -7
- sky/skylet/providers/ibm/node_provider.py +12 -8
- sky/skylet/providers/ibm/vpc_provider.py +13 -12
- sky/skylet/runtime_utils.py +21 -0
- sky/skylet/services.py +524 -0
- sky/skylet/skylet.py +27 -2
- sky/skylet/subprocess_daemon.py +104 -28
- sky/skypilot_config.py +99 -79
- sky/ssh_node_pools/constants.py +12 -0
- sky/ssh_node_pools/core.py +40 -3
- sky/ssh_node_pools/deploy/__init__.py +4 -0
- sky/ssh_node_pools/deploy/deploy.py +952 -0
- sky/ssh_node_pools/deploy/tunnel_utils.py +199 -0
- sky/ssh_node_pools/deploy/utils.py +173 -0
- sky/ssh_node_pools/server.py +20 -21
- sky/{utils/kubernetes/ssh_utils.py → ssh_node_pools/utils.py} +9 -6
- sky/task.py +221 -104
- sky/templates/aws-ray.yml.j2 +1 -0
- sky/templates/azure-ray.yml.j2 +1 -0
- sky/templates/cudo-ray.yml.j2 +1 -0
- sky/templates/do-ray.yml.j2 +1 -0
- sky/templates/fluidstack-ray.yml.j2 +1 -0
- sky/templates/gcp-ray.yml.j2 +1 -0
- sky/templates/hyperbolic-ray.yml.j2 +1 -0
- sky/templates/ibm-ray.yml.j2 +2 -1
- sky/templates/jobs-controller.yaml.j2 +3 -0
- sky/templates/kubernetes-ray.yml.j2 +204 -55
- sky/templates/lambda-ray.yml.j2 +1 -0
- sky/templates/nebius-ray.yml.j2 +3 -0
- sky/templates/oci-ray.yml.j2 +1 -0
- sky/templates/paperspace-ray.yml.j2 +1 -0
- sky/templates/primeintellect-ray.yml.j2 +72 -0
- sky/templates/runpod-ray.yml.j2 +1 -0
- sky/templates/scp-ray.yml.j2 +1 -0
- sky/templates/seeweb-ray.yml.j2 +171 -0
- sky/templates/shadeform-ray.yml.j2 +73 -0
- sky/templates/slurm-ray.yml.j2 +85 -0
- sky/templates/vast-ray.yml.j2 +2 -0
- sky/templates/vsphere-ray.yml.j2 +1 -0
- sky/templates/websocket_proxy.py +188 -43
- sky/usage/usage_lib.py +16 -4
- sky/users/model.conf +1 -1
- sky/users/permission.py +84 -44
- sky/users/rbac.py +31 -3
- sky/utils/accelerator_registry.py +6 -3
- sky/utils/admin_policy_utils.py +18 -5
- sky/utils/annotations.py +128 -6
- sky/utils/asyncio_utils.py +78 -0
- sky/utils/atomic.py +1 -1
- sky/utils/auth_utils.py +153 -0
- sky/utils/cli_utils/status_utils.py +12 -7
- sky/utils/cluster_utils.py +28 -6
- sky/utils/command_runner.py +283 -30
- sky/utils/command_runner.pyi +63 -7
- sky/utils/common.py +3 -1
- sky/utils/common_utils.py +55 -7
- sky/utils/config_utils.py +1 -14
- sky/utils/context.py +127 -40
- sky/utils/context_utils.py +73 -18
- sky/utils/controller_utils.py +229 -70
- sky/utils/db/db_utils.py +95 -18
- sky/utils/db/kv_cache.py +149 -0
- sky/utils/db/migration_utils.py +24 -7
- sky/utils/env_options.py +4 -0
- sky/utils/git.py +559 -1
- sky/utils/kubernetes/create_cluster.sh +15 -30
- sky/utils/kubernetes/delete_cluster.sh +10 -7
- sky/utils/kubernetes/generate_kind_config.py +6 -66
- sky/utils/kubernetes/gpu_labeler.py +13 -3
- sky/utils/kubernetes/k8s_gpu_labeler_job.yaml +2 -1
- sky/utils/kubernetes/k8s_gpu_labeler_setup.yaml +16 -16
- sky/utils/kubernetes/kubernetes_deploy_utils.py +187 -260
- sky/utils/kubernetes/rsync_helper.sh +11 -3
- sky/utils/kubernetes/ssh-tunnel.sh +7 -376
- sky/utils/kubernetes_enums.py +7 -15
- sky/utils/lock_events.py +4 -4
- sky/utils/locks.py +128 -31
- sky/utils/log_utils.py +0 -319
- sky/utils/resource_checker.py +13 -10
- sky/utils/resources_utils.py +53 -29
- sky/utils/rich_utils.py +8 -4
- sky/utils/schemas.py +138 -52
- sky/utils/subprocess_utils.py +17 -4
- sky/utils/thread_utils.py +91 -0
- sky/utils/timeline.py +2 -1
- sky/utils/ux_utils.py +35 -1
- sky/utils/volume.py +88 -4
- sky/utils/yaml_utils.py +9 -0
- sky/volumes/client/sdk.py +48 -10
- sky/volumes/server/core.py +59 -22
- sky/volumes/server/server.py +46 -17
- sky/volumes/volume.py +54 -42
- sky/workspaces/core.py +57 -21
- sky/workspaces/server.py +13 -12
- sky_templates/README.md +3 -0
- sky_templates/__init__.py +3 -0
- sky_templates/ray/__init__.py +0 -0
- sky_templates/ray/start_cluster +183 -0
- sky_templates/ray/stop_cluster +75 -0
- {skypilot_nightly-1.0.0.dev20250905.dist-info → skypilot_nightly-1.0.0.dev20251210.dist-info}/METADATA +343 -65
- skypilot_nightly-1.0.0.dev20251210.dist-info/RECORD +629 -0
- skypilot_nightly-1.0.0.dev20251210.dist-info/top_level.txt +2 -0
- sky/client/cli/git.py +0 -549
- sky/dashboard/out/_next/static/chunks/1121-408ed10b2f9fce17.js +0 -1
- sky/dashboard/out/_next/static/chunks/1141-943efc7aff0f0c06.js +0 -1
- sky/dashboard/out/_next/static/chunks/1836-37fede578e2da5f8.js +0 -40
- sky/dashboard/out/_next/static/chunks/3015-86cabed5d4669ad0.js +0 -1
- sky/dashboard/out/_next/static/chunks/3294.c80326aec9bfed40.js +0 -6
- sky/dashboard/out/_next/static/chunks/3785.4872a2f3aa489880.js +0 -1
- sky/dashboard/out/_next/static/chunks/3850-ff4a9a69d978632b.js +0 -1
- sky/dashboard/out/_next/static/chunks/4045.b30465273dc5e468.js +0 -21
- sky/dashboard/out/_next/static/chunks/4676-9da7fdbde90b5549.js +0 -10
- sky/dashboard/out/_next/static/chunks/4725.10f7a9a5d3ea8208.js +0 -1
- sky/dashboard/out/_next/static/chunks/5339.3fda4a4010ff4e06.js +0 -51
- sky/dashboard/out/_next/static/chunks/6135-4b4d5e824b7f9d3c.js +0 -1
- sky/dashboard/out/_next/static/chunks/649.b9d7f7d10c1b8c53.js +0 -45
- sky/dashboard/out/_next/static/chunks/6856-dca7962af4814e1b.js +0 -1
- sky/dashboard/out/_next/static/chunks/6990-08b2a1cae076a943.js +0 -1
- sky/dashboard/out/_next/static/chunks/7325.b4bc99ce0892dcd5.js +0 -6
- sky/dashboard/out/_next/static/chunks/754-d0da8ab45f9509e9.js +0 -18
- sky/dashboard/out/_next/static/chunks/7669.1f5d9a402bf5cc42.js +0 -36
- sky/dashboard/out/_next/static/chunks/8969-0be3036bf86f8256.js +0 -1
- sky/dashboard/out/_next/static/chunks/9025.c12318fb6a1a9093.js +0 -6
- sky/dashboard/out/_next/static/chunks/9037-fa1737818d0a0969.js +0 -6
- sky/dashboard/out/_next/static/chunks/pages/_app-ce361c6959bc2001.js +0 -34
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-1cbba24bd1bd35f8.js +0 -16
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-0b4b35dc1dfe046c.js +0 -16
- sky/dashboard/out/_next/static/chunks/pages/clusters-469814d711d63b1b.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-dd64309c3fe67ed2.js +0 -11
- sky/dashboard/out/_next/static/chunks/pages/jobs/pools/[pool]-07349868f7905d37.js +0 -16
- sky/dashboard/out/_next/static/chunks/pages/jobs-1f70d9faa564804f.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/users-018bf31cda52e11b.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/volumes-739726d6b823f532.js +0 -1
- sky/dashboard/out/_next/static/chunks/webpack-4fe903277b57b523.js +0 -1
- sky/dashboard/out/_next/static/css/4614e06482d7309e.css +0 -3
- sky/dashboard/out/_next/static/mS-4qZPSkRuA1u-g2wQhg/_buildManifest.js +0 -1
- sky/templates/kubernetes-ssh-jump.yml.j2 +0 -94
- sky/utils/kubernetes/cleanup-tunnel.sh +0 -62
- sky/utils/kubernetes/deploy_remote_cluster.py +0 -1299
- sky/utils/kubernetes/ssh_jump_lifecycle_manager.py +0 -191
- skypilot_nightly-1.0.0.dev20250905.dist-info/RECORD +0 -547
- skypilot_nightly-1.0.0.dev20250905.dist-info/top_level.txt +0 -1
- /sky/dashboard/out/_next/static/{mS-4qZPSkRuA1u-g2wQhg → KYAhEFa3FTfq4JyKVgo-s}/_ssgManifest.js +0 -0
- {skypilot_nightly-1.0.0.dev20250905.dist-info → skypilot_nightly-1.0.0.dev20251210.dist-info}/WHEEL +0 -0
- {skypilot_nightly-1.0.0.dev20250905.dist-info → skypilot_nightly-1.0.0.dev20251210.dist-info}/entry_points.txt +0 -0
- {skypilot_nightly-1.0.0.dev20250905.dist-info → skypilot_nightly-1.0.0.dev20251210.dist-info}/licenses/LICENSE +0 -0
sky/provision/aws/config.py
CHANGED
|
@@ -305,7 +305,10 @@ def _get_route_tables(ec2: 'mypy_boto3_ec2.ServiceResource',
|
|
|
305
305
|
Returns:
|
|
306
306
|
A list of route tables associated with the options VPC and region
|
|
307
307
|
"""
|
|
308
|
-
filters
|
|
308
|
+
filters: List['ec2_type_defs.FilterTypeDef'] = [{
|
|
309
|
+
'Name': 'association.main',
|
|
310
|
+
'Values': [str(main).lower()],
|
|
311
|
+
}]
|
|
309
312
|
if vpc_id is not None:
|
|
310
313
|
filters.append({'Name': 'vpc-id', 'Values': [vpc_id]})
|
|
311
314
|
logger.debug(
|
|
@@ -406,10 +409,26 @@ def _usable_subnets(
|
|
|
406
409
|
s for s in candidate_subnets if s.vpc_id == vpc_id_of_sg
|
|
407
410
|
]
|
|
408
411
|
|
|
412
|
+
if not candidate_subnets:
|
|
413
|
+
_skypilot_log_error_and_exit_for_failover(
|
|
414
|
+
'No candidate subnets found in specified VPC '
|
|
415
|
+
f'{vpc_id_of_sg}.')
|
|
416
|
+
|
|
409
417
|
available_subnets = [
|
|
410
418
|
s for s in candidate_subnets if s.state == 'available'
|
|
411
419
|
]
|
|
412
420
|
|
|
421
|
+
if not available_subnets:
|
|
422
|
+
_skypilot_log_error_and_exit_for_failover(
|
|
423
|
+
'All candidate subnets are pending in specified VPC '
|
|
424
|
+
f'{vpc_id_of_sg}.')
|
|
425
|
+
|
|
426
|
+
if len(candidate_subnets) > len(available_subnets):
|
|
427
|
+
num_pruned = len(candidate_subnets) - len(available_subnets)
|
|
428
|
+
logger.debug(
|
|
429
|
+
f'{num_pruned} candidate subnets pruned since they are not '
|
|
430
|
+
'available.')
|
|
431
|
+
|
|
413
432
|
if use_internal_ips:
|
|
414
433
|
# Get private subnets.
|
|
415
434
|
#
|
|
@@ -421,6 +440,10 @@ def _usable_subnets(
|
|
|
421
440
|
if not _is_subnet_public(ec2, s.subnet_id, vpc_id_of_sg) and
|
|
422
441
|
not s.map_public_ip_on_launch
|
|
423
442
|
]
|
|
443
|
+
if not subnets:
|
|
444
|
+
_skypilot_log_error_and_exit_for_failover(
|
|
445
|
+
'The use_internal_ips option is set to True, but all '
|
|
446
|
+
'candidate subnets are public.')
|
|
424
447
|
else:
|
|
425
448
|
# Get public subnets.
|
|
426
449
|
#
|
|
@@ -436,6 +459,10 @@ def _usable_subnets(
|
|
|
436
459
|
s for s in available_subnets
|
|
437
460
|
if _is_subnet_public(ec2, s.subnet_id, vpc_id_of_sg)
|
|
438
461
|
]
|
|
462
|
+
if not subnets:
|
|
463
|
+
_skypilot_log_error_and_exit_for_failover(
|
|
464
|
+
'All candidate subnets are private, did you mean to '
|
|
465
|
+
'set use_internal_ips to True?')
|
|
439
466
|
|
|
440
467
|
subnets = sorted(
|
|
441
468
|
subnets,
|
|
@@ -449,18 +476,7 @@ def _usable_subnets(
|
|
|
449
476
|
'Failed to fetch available subnets from AWS.')
|
|
450
477
|
raise exc
|
|
451
478
|
|
|
452
|
-
if
|
|
453
|
-
vpc_msg = (f'Does a default VPC exist in region '
|
|
454
|
-
f'{ec2.meta.client.meta.region_name}? ') if (
|
|
455
|
-
vpc_id_of_sg is None) else ''
|
|
456
|
-
_skypilot_log_error_and_exit_for_failover(
|
|
457
|
-
f'No usable subnets found. {vpc_msg}'
|
|
458
|
-
'Try manually creating an instance in your specified region to '
|
|
459
|
-
'populate the list of subnets and try again. '
|
|
460
|
-
'Note that the subnet must map public IPs '
|
|
461
|
-
'on instance launch unless you set `use_internal_ips: true` in '
|
|
462
|
-
'the `provider` config.')
|
|
463
|
-
elif _are_user_subnets_pruned(subnets):
|
|
479
|
+
if _are_user_subnets_pruned(subnets):
|
|
464
480
|
_skypilot_log_error_and_exit_for_failover(
|
|
465
481
|
f'The specified subnets are not '
|
|
466
482
|
f'usable: {_get_pruned_subnets(subnets)}')
|
|
@@ -579,6 +595,11 @@ def _get_subnet_and_vpc_id(ec2: 'mypy_boto3_ec2.ServiceResource',
|
|
|
579
595
|
# not want SkyPilot to use.
|
|
580
596
|
if vpc_id_of_sg is None:
|
|
581
597
|
all_subnets = [s for s in all_subnets if s.vpc.is_default]
|
|
598
|
+
if not all_subnets:
|
|
599
|
+
_skypilot_log_error_and_exit_for_failover(
|
|
600
|
+
f'The default VPC in {region} either does not exist or '
|
|
601
|
+
'has no subnets.')
|
|
602
|
+
|
|
582
603
|
subnets, vpc_id = _usable_subnets(
|
|
583
604
|
ec2,
|
|
584
605
|
user_specified_subnets=None,
|
sky/provision/aws/instance.py
CHANGED
|
@@ -311,9 +311,10 @@ def _get_head_instance_id(instances: List) -> Optional[str]:
|
|
|
311
311
|
return head_instance_id
|
|
312
312
|
|
|
313
313
|
|
|
314
|
-
def run_instances(region: str, cluster_name_on_cloud: str,
|
|
314
|
+
def run_instances(region: str, cluster_name: str, cluster_name_on_cloud: str,
|
|
315
315
|
config: common.ProvisionConfig) -> common.ProvisionRecord:
|
|
316
316
|
"""See sky/provision/__init__.py"""
|
|
317
|
+
del cluster_name # unused
|
|
317
318
|
ec2 = _default_ec2_resource(region)
|
|
318
319
|
# NOTE: We set max_attempts=0 for fast failing when the resource is not
|
|
319
320
|
# available (although the doc says it will only retry for network
|
|
@@ -629,9 +630,10 @@ def query_instances(
|
|
|
629
630
|
cluster_name_on_cloud: str,
|
|
630
631
|
provider_config: Optional[Dict[str, Any]] = None,
|
|
631
632
|
non_terminated_only: bool = True,
|
|
633
|
+
retry_if_missing: bool = False,
|
|
632
634
|
) -> Dict[str, Tuple[Optional['status_lib.ClusterStatus'], Optional[str]]]:
|
|
633
635
|
"""See sky/provision/__init__.py"""
|
|
634
|
-
del cluster_name # unused
|
|
636
|
+
del cluster_name, retry_if_missing # unused
|
|
635
637
|
assert provider_config is not None, (cluster_name_on_cloud, provider_config)
|
|
636
638
|
region = provider_config['region']
|
|
637
639
|
ec2 = _default_ec2_resource(region)
|
|
@@ -743,6 +745,7 @@ def terminate_instances(
|
|
|
743
745
|
|
|
744
746
|
# Make this multithreaded: modify all instances' SGs in parallel.
|
|
745
747
|
def modify_instance_sg(instance):
|
|
748
|
+
assert default_sg is not None # Type narrowing for mypy
|
|
746
749
|
instance.modify_attribute(Groups=[default_sg.id])
|
|
747
750
|
logger.debug(f'Instance {instance.id} modified to use default SG:'
|
|
748
751
|
f'{default_sg.id} for quick deletion.')
|
sky/provision/azure/instance.py
CHANGED
|
@@ -214,7 +214,7 @@ def _create_network_interface(
|
|
|
214
214
|
location=provider_config['location'],
|
|
215
215
|
public_ip_allocation_method='Static',
|
|
216
216
|
public_ip_address_version='IPv4',
|
|
217
|
-
sku=network.PublicIPAddressSku(name='
|
|
217
|
+
sku=network.PublicIPAddressSku(name='Standard', tier='Regional'))
|
|
218
218
|
ip_poller = network_client.public_ip_addresses.begin_create_or_update(
|
|
219
219
|
resource_group_name=provider_config['resource_group'],
|
|
220
220
|
public_ip_address_name=f'{vm_name}-ip',
|
|
@@ -362,9 +362,10 @@ def _create_instances(compute_client: 'azure_compute.ComputeManagementClient',
|
|
|
362
362
|
return instances
|
|
363
363
|
|
|
364
364
|
|
|
365
|
-
def run_instances(region: str, cluster_name_on_cloud: str,
|
|
365
|
+
def run_instances(region: str, cluster_name: str, cluster_name_on_cloud: str,
|
|
366
366
|
config: common.ProvisionConfig) -> common.ProvisionRecord:
|
|
367
367
|
"""See sky/provision/__init__.py"""
|
|
368
|
+
del cluster_name # unused
|
|
368
369
|
# TODO(zhwu): This function is too long. We should refactor it.
|
|
369
370
|
provider_config = config.provider_config
|
|
370
371
|
resource_group = provider_config['resource_group']
|
|
@@ -956,9 +957,10 @@ def query_instances(
|
|
|
956
957
|
cluster_name_on_cloud: str,
|
|
957
958
|
provider_config: Optional[Dict[str, Any]] = None,
|
|
958
959
|
non_terminated_only: bool = True,
|
|
960
|
+
retry_if_missing: bool = False,
|
|
959
961
|
) -> Dict[str, Tuple[Optional['status_lib.ClusterStatus'], Optional[str]]]:
|
|
960
962
|
"""See sky/provision/__init__.py"""
|
|
961
|
-
del cluster_name # unused
|
|
963
|
+
del cluster_name, retry_if_missing # unused
|
|
962
964
|
assert provider_config is not None, cluster_name_on_cloud
|
|
963
965
|
|
|
964
966
|
subscription_id = provider_config['subscription_id']
|
sky/provision/common.py
CHANGED
|
@@ -6,6 +6,7 @@ import os
|
|
|
6
6
|
from typing import Any, Dict, List, Optional, Tuple
|
|
7
7
|
|
|
8
8
|
from sky import sky_logging
|
|
9
|
+
from sky.utils import config_utils
|
|
9
10
|
from sky.utils import env_options
|
|
10
11
|
from sky.utils import resources_utils
|
|
11
12
|
|
|
@@ -36,6 +37,13 @@ class StopFailoverError(Exception):
|
|
|
36
37
|
"""
|
|
37
38
|
|
|
38
39
|
|
|
40
|
+
# These fields are sensitive and should be redacted from the config for logging
|
|
41
|
+
# purposes.
|
|
42
|
+
SENSITIVE_FIELDS = [
|
|
43
|
+
('docker_config', 'docker_login_config', 'password'),
|
|
44
|
+
]
|
|
45
|
+
|
|
46
|
+
|
|
39
47
|
@dataclasses.dataclass
|
|
40
48
|
class ProvisionConfig:
|
|
41
49
|
"""Configuration for provisioning."""
|
|
@@ -56,6 +64,18 @@ class ProvisionConfig:
|
|
|
56
64
|
# Optional ports to open on launch of the cluster.
|
|
57
65
|
ports_to_open_on_launch: Optional[List[int]]
|
|
58
66
|
|
|
67
|
+
def get_redacted_config(self) -> Dict[str, Any]:
|
|
68
|
+
"""Get the redacted config."""
|
|
69
|
+
config = dataclasses.asdict(self)
|
|
70
|
+
|
|
71
|
+
config_copy = config_utils.Config(config)
|
|
72
|
+
|
|
73
|
+
for field_list in SENSITIVE_FIELDS:
|
|
74
|
+
val = config_copy.get_nested(field_list, default_value=None)
|
|
75
|
+
if val is not None:
|
|
76
|
+
config_copy.set_nested(field_list, '<redacted>')
|
|
77
|
+
return dict(**config_copy)
|
|
78
|
+
|
|
59
79
|
|
|
60
80
|
# -------------------- output data model -------------------- #
|
|
61
81
|
|
|
@@ -97,6 +117,8 @@ class InstanceInfo:
|
|
|
97
117
|
external_ip: Optional[str]
|
|
98
118
|
tags: Dict[str, str]
|
|
99
119
|
ssh_port: int = 22
|
|
120
|
+
# The internal service address of the instance on Kubernetes.
|
|
121
|
+
internal_svc: Optional[str] = None
|
|
100
122
|
|
|
101
123
|
def get_feasible_ip(self) -> str:
|
|
102
124
|
"""Get the most feasible IPs of the instance. This function returns
|
sky/provision/cudo/instance.py
CHANGED
|
@@ -40,10 +40,10 @@ def _get_head_instance_id(instances: Dict[str, Any]) -> Optional[str]:
|
|
|
40
40
|
return head_instance_id
|
|
41
41
|
|
|
42
42
|
|
|
43
|
-
def run_instances(region: str, cluster_name_on_cloud: str,
|
|
43
|
+
def run_instances(region: str, cluster_name: str, cluster_name_on_cloud: str,
|
|
44
44
|
config: common.ProvisionConfig) -> common.ProvisionRecord:
|
|
45
45
|
"""Runs instances for the given cluster."""
|
|
46
|
-
|
|
46
|
+
del cluster_name # unused
|
|
47
47
|
pending_status = ['pend', 'init', 'prol', 'boot']
|
|
48
48
|
|
|
49
49
|
while True:
|
|
@@ -195,9 +195,10 @@ def query_instances(
|
|
|
195
195
|
cluster_name_on_cloud: str,
|
|
196
196
|
provider_config: Optional[Dict[str, Any]] = None,
|
|
197
197
|
non_terminated_only: bool = True,
|
|
198
|
+
retry_if_missing: bool = False,
|
|
198
199
|
) -> Dict[str, Tuple[Optional['status_lib.ClusterStatus'], Optional[str]]]:
|
|
199
200
|
"""See sky/provision/__init__.py"""
|
|
200
|
-
del cluster_name # unused
|
|
201
|
+
del cluster_name, retry_if_missing # unused
|
|
201
202
|
assert provider_config is not None, (cluster_name_on_cloud, provider_config)
|
|
202
203
|
instances = _filter_instances(cluster_name_on_cloud, None)
|
|
203
204
|
|
sky/provision/do/instance.py
CHANGED
|
@@ -26,10 +26,10 @@ def _get_head_instance(
|
|
|
26
26
|
return None
|
|
27
27
|
|
|
28
28
|
|
|
29
|
-
def run_instances(region: str, cluster_name_on_cloud: str,
|
|
29
|
+
def run_instances(region: str, cluster_name: str, cluster_name_on_cloud: str,
|
|
30
30
|
config: common.ProvisionConfig) -> common.ProvisionRecord:
|
|
31
31
|
"""Runs instances for the given cluster."""
|
|
32
|
-
|
|
32
|
+
del cluster_name # unused
|
|
33
33
|
pending_status = ['new']
|
|
34
34
|
newly_started_instances = utils.filter_instances(cluster_name_on_cloud,
|
|
35
35
|
pending_status + ['off'])
|
|
@@ -246,9 +246,10 @@ def query_instances(
|
|
|
246
246
|
cluster_name_on_cloud: str,
|
|
247
247
|
provider_config: Optional[Dict[str, Any]] = None,
|
|
248
248
|
non_terminated_only: bool = True,
|
|
249
|
+
retry_if_missing: bool = False,
|
|
249
250
|
) -> Dict[str, Tuple[Optional['status_lib.ClusterStatus'], Optional[str]]]:
|
|
250
251
|
"""See sky/provision/__init__.py"""
|
|
251
|
-
del cluster_name # unused
|
|
252
|
+
del cluster_name, retry_if_missing # unused
|
|
252
253
|
# terminated instances are not retrieved by the
|
|
253
254
|
# API making `non_terminated_only` argument moot.
|
|
254
255
|
del non_terminated_only
|
sky/provision/docker_utils.py
CHANGED
|
@@ -3,7 +3,7 @@
|
|
|
3
3
|
import dataclasses
|
|
4
4
|
import shlex
|
|
5
5
|
import time
|
|
6
|
-
from typing import Any, Dict, List
|
|
6
|
+
from typing import Any, Dict, List, Optional
|
|
7
7
|
|
|
8
8
|
from sky import sky_logging
|
|
9
9
|
from sky.skylet import constants
|
|
@@ -15,23 +15,52 @@ logger = sky_logging.init_logger(__name__)
|
|
|
15
15
|
# Configure environment variables. A docker image can have environment variables
|
|
16
16
|
# set in the Dockerfile with `ENV``. We need to export these variables to the
|
|
17
17
|
# shell environment, so that our ssh session can access them.
|
|
18
|
+
# Filter out RAY_RUNTIME_ENV_HOOK to prevent Ray version conflicts.
|
|
19
|
+
# Docker images with Ray 2.48.0+ set this for UV package manager support,
|
|
20
|
+
# but it causes FAILED_DRIVER errors with SkyPilot's Ray 2.9.3.
|
|
21
|
+
# See: https://github.com/skypilot-org/skypilot/pull/7181
|
|
18
22
|
SETUP_ENV_VARS_CMD = (
|
|
19
23
|
'prefix_cmd() '
|
|
20
24
|
'{ if [ $(id -u) -ne 0 ]; then echo "sudo"; else echo ""; fi; } && '
|
|
21
|
-
'export -p > ~/container_env_var.sh && '
|
|
25
|
+
'export -p | grep -v RAY_RUNTIME_ENV_HOOK > ~/container_env_var.sh && '
|
|
22
26
|
'$(prefix_cmd) '
|
|
23
27
|
'mv ~/container_env_var.sh /etc/profile.d/container_env_var.sh;')
|
|
24
28
|
|
|
25
29
|
# Docker daemon may not be ready when the machine is firstly started. The error
|
|
26
30
|
# message starts with the following string. We should wait for a while and retry
|
|
27
31
|
# the command.
|
|
28
|
-
DOCKER_PERMISSION_DENIED_STR = ('permission denied while trying to connect to '
|
|
29
|
-
'the Docker daemon socket')
|
|
32
|
+
DOCKER_PERMISSION_DENIED_STR = ('permission denied while trying to connect to ')
|
|
30
33
|
|
|
31
34
|
DOCKER_SOCKET_NOT_READY_STR = ('Is the docker daemon running?')
|
|
35
|
+
DOCKER_SOCKET_NOT_READY_STR_2 = (
|
|
36
|
+
'check if the path is correct and if the daemon is running')
|
|
32
37
|
|
|
33
38
|
_DOCKER_SOCKET_WAIT_TIMEOUT_SECONDS = 30
|
|
34
39
|
|
|
40
|
+
# Install AWS CLI v2 (not v1 from pip) as it's required for ECR authentication
|
|
41
|
+
# AWS CLI v2 is installed as a standalone binary, not a Python package. See:
|
|
42
|
+
# https://docs.aws.amazon.com/cli/latest/userguide/getting-started-install.html
|
|
43
|
+
INSTALL_AWS_CLI_CMD = (
|
|
44
|
+
'which aws || ((command -v unzip >/dev/null 2>&1 || '
|
|
45
|
+
'(sudo apt-get update && sudo apt-get install -y unzip)) && '
|
|
46
|
+
'curl -fsSL "https://awscli.amazonaws.com/awscli-exe-linux-x86_64.zip" '
|
|
47
|
+
'-o "/tmp/awscliv2.zip" && '
|
|
48
|
+
'unzip -q /tmp/awscliv2.zip -d /tmp && sudo /tmp/aws/install '
|
|
49
|
+
'&& rm -rf /tmp/awscliv2.zip /tmp/aws)')
|
|
50
|
+
|
|
51
|
+
|
|
52
|
+
def _extract_region_from_ecr_server(server: str) -> str:
|
|
53
|
+
"""Extract AWS region from ECR server URL.
|
|
54
|
+
|
|
55
|
+
ECR server format: <account-id>.dkr.ecr.<region>.amazonaws.com
|
|
56
|
+
Returns the region part from the URL.
|
|
57
|
+
"""
|
|
58
|
+
# Split: ['<account-id>', 'dkr', 'ecr', '<region>', 'amazonaws', 'com']
|
|
59
|
+
parts = server.split('.')
|
|
60
|
+
if len(parts) >= 6 and parts[1] == 'dkr' and parts[2] == 'ecr':
|
|
61
|
+
return parts[3]
|
|
62
|
+
raise ValueError(f'Invalid ECR server format: {server}')
|
|
63
|
+
|
|
35
64
|
|
|
36
65
|
@dataclasses.dataclass
|
|
37
66
|
class DockerLoginConfig:
|
|
@@ -147,6 +176,17 @@ def _with_interactive(cmd):
|
|
|
147
176
|
return ['bash', '--login', '-c', '-i', shlex.quote(force_interactive)]
|
|
148
177
|
|
|
149
178
|
|
|
179
|
+
def _redact_docker_password(cmd: str) -> str:
|
|
180
|
+
parts = shlex.split(cmd)
|
|
181
|
+
for i, part in enumerate(parts):
|
|
182
|
+
if part.startswith('--password'):
|
|
183
|
+
if part.startswith('--password='):
|
|
184
|
+
parts[i] = '--password=<redacted>'
|
|
185
|
+
elif i + 1 < len(parts):
|
|
186
|
+
parts[i + 1] = '<redacted>'
|
|
187
|
+
return ' '.join(parts)
|
|
188
|
+
|
|
189
|
+
|
|
150
190
|
# SkyPilot: New class to initialize docker containers on a remote node.
|
|
151
191
|
# Adopted from ray.autoscaler._private.command_runner.DockerCommandRunner.
|
|
152
192
|
class DockerInitializer:
|
|
@@ -157,19 +197,23 @@ class DockerInitializer:
|
|
|
157
197
|
self.docker_config = docker_config
|
|
158
198
|
self.container_name = docker_config['container_name']
|
|
159
199
|
self.runner = runner
|
|
160
|
-
self.home_dir = None
|
|
200
|
+
self.home_dir: Optional[str] = None
|
|
161
201
|
self.initialized = False
|
|
162
202
|
# podman is not fully tested yet.
|
|
163
203
|
use_podman = docker_config.get('use_podman', False)
|
|
164
204
|
self.docker_cmd = 'podman' if use_podman else 'docker'
|
|
165
205
|
self.log_path = log_path
|
|
166
206
|
|
|
167
|
-
def _run(
|
|
168
|
-
|
|
169
|
-
|
|
170
|
-
|
|
171
|
-
|
|
172
|
-
|
|
207
|
+
def _run(
|
|
208
|
+
self,
|
|
209
|
+
cmd,
|
|
210
|
+
run_env='host',
|
|
211
|
+
wait_for_docker_daemon: bool = False,
|
|
212
|
+
separate_stderr: bool = False,
|
|
213
|
+
log_err_when_fail: bool = True,
|
|
214
|
+
flock_name: Optional[str] = None,
|
|
215
|
+
flock_args: Optional[str] = None,
|
|
216
|
+
) -> str:
|
|
173
217
|
|
|
174
218
|
if run_env == 'docker':
|
|
175
219
|
cmd = self._docker_expand_user(cmd, any_char=True)
|
|
@@ -178,10 +222,17 @@ class DockerInitializer:
|
|
|
178
222
|
# an error: `the input device is not a TTY`, and it works without
|
|
179
223
|
# `-it` flag.
|
|
180
224
|
# TODO(zhwu): ray use the `-it` flag, we need to check why.
|
|
181
|
-
cmd = (f'{self.docker_cmd} exec {self.container_name}
|
|
182
|
-
f' {shlex.quote(cmd)} ')
|
|
225
|
+
cmd = (f'{self.docker_cmd} exec -u 0 {self.container_name}'
|
|
226
|
+
f' /bin/bash -c {shlex.quote(cmd)} ')
|
|
227
|
+
|
|
228
|
+
if flock_name is not None:
|
|
229
|
+
flock_args = flock_args or ''
|
|
230
|
+
cmd = (f'flock {flock_args} /tmp/{flock_name} '
|
|
231
|
+
f'-c {shlex.quote(cmd)}')
|
|
183
232
|
|
|
184
|
-
|
|
233
|
+
# Redact the password in the login command.
|
|
234
|
+
redacted_cmd = _redact_docker_password(cmd)
|
|
235
|
+
logger.debug(f'+ {redacted_cmd}')
|
|
185
236
|
start = time.time()
|
|
186
237
|
while True:
|
|
187
238
|
rc, stdout, stderr = self.runner.run(
|
|
@@ -191,7 +242,8 @@ class DockerInitializer:
|
|
|
191
242
|
separate_stderr=separate_stderr,
|
|
192
243
|
log_path=self.log_path)
|
|
193
244
|
if (DOCKER_PERMISSION_DENIED_STR in stdout + stderr or
|
|
194
|
-
DOCKER_SOCKET_NOT_READY_STR in stdout + stderr
|
|
245
|
+
DOCKER_SOCKET_NOT_READY_STR in stdout + stderr or
|
|
246
|
+
DOCKER_SOCKET_NOT_READY_STR_2 in stdout + stderr):
|
|
195
247
|
if wait_for_docker_daemon:
|
|
196
248
|
if time.time(
|
|
197
249
|
) - start > _DOCKER_SOCKET_WAIT_TIMEOUT_SECONDS:
|
|
@@ -212,7 +264,7 @@ class DockerInitializer:
|
|
|
212
264
|
break
|
|
213
265
|
subprocess_utils.handle_returncode(
|
|
214
266
|
rc,
|
|
215
|
-
|
|
267
|
+
redacted_cmd,
|
|
216
268
|
error_msg='Failed to run docker setup commands.',
|
|
217
269
|
stderr=stdout + stderr,
|
|
218
270
|
# Print out the error message if the command failed.
|
|
@@ -231,14 +283,17 @@ class DockerInitializer:
|
|
|
231
283
|
if self._check_container_exited():
|
|
232
284
|
self.initialized = True
|
|
233
285
|
self._run(f'{self.docker_cmd} start {self.container_name}')
|
|
234
|
-
self._run('sudo service ssh start',
|
|
286
|
+
self._run('sudo service ssh start',
|
|
287
|
+
run_env='docker',
|
|
288
|
+
flock_name=f'{self.container_name}.sky.lifecycle.lock',
|
|
289
|
+
flock_args='-s -w 1')
|
|
235
290
|
return self._run('whoami', run_env='docker')
|
|
236
291
|
|
|
237
292
|
# SkyPilot: Docker login if user specified a private docker registry.
|
|
238
293
|
if 'docker_login_config' in self.docker_config:
|
|
239
|
-
# TODO(tian): Maybe support a command to get the login password?
|
|
240
294
|
docker_login_config = DockerLoginConfig(
|
|
241
295
|
**self.docker_config['docker_login_config'])
|
|
296
|
+
|
|
242
297
|
if docker_login_config.password:
|
|
243
298
|
# Password is allowed to be empty, in that case, we will not run
|
|
244
299
|
# the login command, and assume that the image pulling is
|
|
@@ -249,6 +304,25 @@ class DockerInitializer:
|
|
|
249
304
|
f'--password {shlex.quote(docker_login_config.password)} '
|
|
250
305
|
f'{shlex.quote(docker_login_config.server)}',
|
|
251
306
|
wait_for_docker_daemon=True)
|
|
307
|
+
elif (docker_login_config.server.endswith('.amazonaws.com') and
|
|
308
|
+
'.dkr.ecr.' in docker_login_config.server):
|
|
309
|
+
# AWS ECR: Use aws ecr get-login-password for authentication
|
|
310
|
+
# ECR format: <account-id>.dkr.ecr.<region>.amazonaws.com
|
|
311
|
+
# This command uses the IAM credentials from the EC2 instance
|
|
312
|
+
# Ref: https://docs.aws.amazon.com/AmazonECR/latest/userguide/registry_auth.html # pylint: disable=line-too-long
|
|
313
|
+
region = _extract_region_from_ecr_server(
|
|
314
|
+
docker_login_config.server)
|
|
315
|
+
|
|
316
|
+
# AWS CLI is not pre-installed on AWS instances, unlike gcloud
|
|
317
|
+
# on GCP instances, so we need to install it first
|
|
318
|
+
self._run(INSTALL_AWS_CLI_CMD, wait_for_docker_daemon=False)
|
|
319
|
+
|
|
320
|
+
self._run(
|
|
321
|
+
f'aws ecr get-login-password --region {region} | '
|
|
322
|
+
f'{self.docker_cmd} login --username AWS '
|
|
323
|
+
f'--password-stdin '
|
|
324
|
+
f'{shlex.quote(docker_login_config.server)}',
|
|
325
|
+
wait_for_docker_daemon=True)
|
|
252
326
|
elif docker_login_config.server.endswith('-docker.pkg.dev'):
|
|
253
327
|
# Docker image server is on GCR, we need to do additional setup
|
|
254
328
|
# to pull the image.
|
|
@@ -311,7 +385,9 @@ class DockerInitializer:
|
|
|
311
385
|
self._auto_configure_shm(user_docker_run_options)),
|
|
312
386
|
self.docker_cmd,
|
|
313
387
|
)
|
|
314
|
-
self._run(f'{remove_container_cmd}
|
|
388
|
+
self._run(f'{remove_container_cmd} && {start_command}',
|
|
389
|
+
flock_name=f'{self.container_name}.sky.lifecycle.lock',
|
|
390
|
+
flock_args='-x -w 10')
|
|
315
391
|
|
|
316
392
|
# SkyPilot: Setup Commands.
|
|
317
393
|
# TODO(zhwu): the following setups should be aligned with the kubernetes
|
|
@@ -329,14 +405,18 @@ class DockerInitializer:
|
|
|
329
405
|
'echo "export DEBIAN_FRONTEND=noninteractive" >> ~/.bashrc;',
|
|
330
406
|
run_env='docker')
|
|
331
407
|
# Install dependencies.
|
|
332
|
-
|
|
333
|
-
'
|
|
408
|
+
cmd = (
|
|
409
|
+
'bash -lc \''
|
|
410
|
+
'exec 200>/var/tmp/sky_apt.lock; '
|
|
411
|
+
'flock -x -w 120 200 || exit 1; '
|
|
412
|
+
'export DEBIAN_FRONTEND=noninteractive; '
|
|
413
|
+
'apt-get -yq update && '
|
|
334
414
|
# Our mount script will install gcsfuse without fuse package.
|
|
335
415
|
# We need to install fuse package first to enable storage mount.
|
|
336
416
|
# The dpkg option is to suppress the prompt for fuse installation.
|
|
337
|
-
'
|
|
338
|
-
'rsync curl wget patch openssh-server python3-pip fuse
|
|
339
|
-
|
|
417
|
+
'apt-get -o DPkg::Options::=--force-confnew install -y '
|
|
418
|
+
'rsync curl wget patch openssh-server python3-pip fuse\'')
|
|
419
|
+
self._run(cmd, run_env='docker')
|
|
340
420
|
|
|
341
421
|
# Copy local authorized_keys to docker container.
|
|
342
422
|
# Stop and disable jupyter service. This is to avoid port conflict on
|
|
@@ -367,7 +447,7 @@ class DockerInitializer:
|
|
|
367
447
|
# pylint: disable=anomalous-backslash-in-string
|
|
368
448
|
self._run(
|
|
369
449
|
'sudo sed -i "/^Port .*/d" /etc/ssh/sshd_config;'
|
|
370
|
-
f'
|
|
450
|
+
f'echo "Port {port}" | sudo tee -a /etc/ssh/sshd_config > /dev/null;'
|
|
371
451
|
'mkdir -p ~/.ssh;'
|
|
372
452
|
'cat /tmp/host_ssh_authorized_keys >> ~/.ssh/authorized_keys;'
|
|
373
453
|
'sudo service ssh start;'
|
|
@@ -412,9 +492,13 @@ class DockerInitializer:
|
|
|
412
492
|
user_pos = string.find('~')
|
|
413
493
|
if user_pos > -1:
|
|
414
494
|
if self.home_dir is None:
|
|
415
|
-
cmd = (f'{self.docker_cmd} exec {self.container_name}
|
|
416
|
-
'printenv HOME')
|
|
417
|
-
self.home_dir = self._run(
|
|
495
|
+
cmd = (f'{self.docker_cmd} exec {self.container_name}'
|
|
496
|
+
' printenv HOME')
|
|
497
|
+
self.home_dir = self._run(
|
|
498
|
+
cmd,
|
|
499
|
+
separate_stderr=True,
|
|
500
|
+
flock_name=f'{self.container_name}.sky.lifecycle.lock',
|
|
501
|
+
flock_args='-s -w 1')
|
|
418
502
|
# Check for unexpected newline in home directory, which can be
|
|
419
503
|
# a common issue when the output is mixed with stderr.
|
|
420
504
|
assert '\n' not in self.home_dir, (
|
|
@@ -3,11 +3,11 @@ import os
|
|
|
3
3
|
import time
|
|
4
4
|
from typing import Any, Dict, List, Optional, Tuple
|
|
5
5
|
|
|
6
|
-
from sky import authentication as auth
|
|
7
6
|
from sky import exceptions
|
|
8
7
|
from sky import sky_logging
|
|
9
8
|
from sky.provision import common
|
|
10
9
|
from sky.provision.fluidstack import fluidstack_utils as utils
|
|
10
|
+
from sky.utils import auth_utils
|
|
11
11
|
from sky.utils import command_runner
|
|
12
12
|
from sky.utils import common_utils
|
|
13
13
|
from sky.utils import status_lib
|
|
@@ -27,7 +27,7 @@ logger = sky_logging.init_logger(__name__)
|
|
|
27
27
|
def get_internal_ip(node_info: Dict[str, Any]) -> None:
|
|
28
28
|
node_info['internal_ip'] = node_info['ip_address']
|
|
29
29
|
|
|
30
|
-
private_key_path, _ =
|
|
30
|
+
private_key_path, _ = auth_utils.get_or_generate_keys()
|
|
31
31
|
runner = command_runner.SSHCommandRunner(
|
|
32
32
|
(node_info['ip_address'], 22),
|
|
33
33
|
ssh_user='ubuntu',
|
|
@@ -78,10 +78,10 @@ def _get_head_instance_id(instances: Dict[str, Any]) -> Optional[str]:
|
|
|
78
78
|
return head_instance_id
|
|
79
79
|
|
|
80
80
|
|
|
81
|
-
def run_instances(region: str, cluster_name_on_cloud: str,
|
|
81
|
+
def run_instances(region: str, cluster_name: str, cluster_name_on_cloud: str,
|
|
82
82
|
config: common.ProvisionConfig) -> common.ProvisionRecord:
|
|
83
83
|
"""Runs instances for the given cluster."""
|
|
84
|
-
|
|
84
|
+
del cluster_name # unused
|
|
85
85
|
pending_status = ['pending', 'provisioning']
|
|
86
86
|
while True:
|
|
87
87
|
instances = _filter_instances(cluster_name_on_cloud, pending_status)
|
|
@@ -291,9 +291,10 @@ def query_instances(
|
|
|
291
291
|
cluster_name_on_cloud: str,
|
|
292
292
|
provider_config: Optional[Dict[str, Any]] = None,
|
|
293
293
|
non_terminated_only: bool = True,
|
|
294
|
+
retry_if_missing: bool = False,
|
|
294
295
|
) -> Dict[str, Tuple[Optional['status_lib.ClusterStatus'], Optional[str]]]:
|
|
295
296
|
"""See sky/provision/__init__.py"""
|
|
296
|
-
del cluster_name # unused
|
|
297
|
+
del cluster_name, retry_if_missing # unused
|
|
297
298
|
assert provider_config is not None, (cluster_name_on_cloud, provider_config)
|
|
298
299
|
instances = _filter_instances(cluster_name_on_cloud, None)
|
|
299
300
|
instances = _filter_instances(cluster_name_on_cloud, None)
|
sky/provision/gcp/config.py
CHANGED
|
@@ -5,6 +5,8 @@ import time
|
|
|
5
5
|
import typing
|
|
6
6
|
from typing import Any, Dict, List, Set, Tuple
|
|
7
7
|
|
|
8
|
+
from typing_extensions import TypedDict
|
|
9
|
+
|
|
8
10
|
from sky.adaptors import gcp
|
|
9
11
|
from sky.clouds.utils import gcp_utils
|
|
10
12
|
from sky.provision import common
|
|
@@ -415,6 +417,9 @@ def _configure_iam_role(config: common.ProvisionConfig, crm, iam) -> dict:
|
|
|
415
417
|
return iam_role
|
|
416
418
|
|
|
417
419
|
|
|
420
|
+
AllowedList = TypedDict('AllowedList', {'IPProtocol': str, 'ports': List[str]})
|
|
421
|
+
|
|
422
|
+
|
|
418
423
|
def _check_firewall_rules(cluster_name: str, vpc_name: str, project_id: str,
|
|
419
424
|
compute):
|
|
420
425
|
"""Check if the firewall rules in the VPC are sufficient."""
|
|
@@ -466,7 +471,7 @@ def _check_firewall_rules(cluster_name: str, vpc_name: str, project_id: str,
|
|
|
466
471
|
}
|
|
467
472
|
"""
|
|
468
473
|
source2rules: Dict[Tuple[str, str], Dict[str, Set[int]]] = {}
|
|
469
|
-
source2allowed_list: Dict[Tuple[str, str], List[
|
|
474
|
+
source2allowed_list: Dict[Tuple[str, str], List[AllowedList]] = {}
|
|
470
475
|
for rule in rules:
|
|
471
476
|
# Rules applied to specific VM (targetTags) may not work for the
|
|
472
477
|
# current VM, so should be skipped.
|
sky/provision/gcp/instance.py
CHANGED
|
@@ -62,9 +62,10 @@ def query_instances(
|
|
|
62
62
|
cluster_name_on_cloud: str,
|
|
63
63
|
provider_config: Optional[Dict[str, Any]] = None,
|
|
64
64
|
non_terminated_only: bool = True,
|
|
65
|
+
retry_if_missing: bool = False,
|
|
65
66
|
) -> Dict[str, Tuple[Optional['status_lib.ClusterStatus'], Optional[str]]]:
|
|
66
67
|
"""See sky/provision/__init__.py"""
|
|
67
|
-
del cluster_name # unused
|
|
68
|
+
del cluster_name, retry_if_missing # unused
|
|
68
69
|
assert provider_config is not None, (cluster_name_on_cloud, provider_config)
|
|
69
70
|
zone = provider_config['availability_zone']
|
|
70
71
|
project_id = provider_config['project_id']
|
|
@@ -360,9 +361,10 @@ def _run_instances(region: str, cluster_name_on_cloud: str,
|
|
|
360
361
|
created_instance_ids=created_instance_ids)
|
|
361
362
|
|
|
362
363
|
|
|
363
|
-
def run_instances(region: str, cluster_name_on_cloud: str,
|
|
364
|
+
def run_instances(region: str, cluster_name: str, cluster_name_on_cloud: str,
|
|
364
365
|
config: common.ProvisionConfig) -> common.ProvisionRecord:
|
|
365
366
|
"""See sky/provision/__init__.py"""
|
|
367
|
+
del cluster_name # unused
|
|
366
368
|
try:
|
|
367
369
|
return _run_instances(region, cluster_name_on_cloud, config)
|
|
368
370
|
except gcp.http_error_exception() as e:
|
|
@@ -64,8 +64,9 @@ def _get_head_instance_id(instances: Dict[str, Any]) -> Optional[str]:
|
|
|
64
64
|
return next(iter(instances.keys()))
|
|
65
65
|
|
|
66
66
|
|
|
67
|
-
def run_instances(region: str, cluster_name_on_cloud: str,
|
|
67
|
+
def run_instances(region: str, cluster_name: str, cluster_name_on_cloud: str,
|
|
68
68
|
config: common.ProvisionConfig) -> common.ProvisionRecord:
|
|
69
|
+
del cluster_name # unused
|
|
69
70
|
logger.info(f'Starting run_instances with region={region}, '
|
|
70
71
|
f'cluster={cluster_name_on_cloud}')
|
|
71
72
|
logger.debug(f'Config: {config}')
|
|
@@ -308,9 +309,10 @@ def query_instances(
|
|
|
308
309
|
cluster_name_on_cloud: str,
|
|
309
310
|
provider_config: Optional[dict] = None,
|
|
310
311
|
non_terminated_only: bool = True,
|
|
312
|
+
retry_if_missing: bool = False,
|
|
311
313
|
) -> Dict[str, Tuple[Optional['status_lib.ClusterStatus'], Optional[str]]]:
|
|
312
314
|
"""Returns the status of the specified instances for Hyperbolic."""
|
|
313
|
-
del cluster_name, provider_config # unused
|
|
315
|
+
del cluster_name, provider_config, retry_if_missing # unused
|
|
314
316
|
# Fetch all instances for this cluster
|
|
315
317
|
instances = utils.list_instances(
|
|
316
318
|
metadata={'skypilot': {
|