skypilot-nightly 1.0.0.dev20250905__py3-none-any.whl → 1.0.0.dev20251203__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sky/__init__.py +10 -2
- sky/adaptors/aws.py +81 -16
- sky/adaptors/common.py +25 -2
- sky/adaptors/coreweave.py +278 -0
- sky/adaptors/do.py +8 -2
- sky/adaptors/gcp.py +11 -0
- sky/adaptors/ibm.py +5 -2
- sky/adaptors/kubernetes.py +64 -0
- sky/adaptors/nebius.py +3 -1
- sky/adaptors/primeintellect.py +1 -0
- sky/adaptors/seeweb.py +183 -0
- sky/adaptors/shadeform.py +89 -0
- sky/admin_policy.py +20 -0
- sky/authentication.py +157 -263
- sky/backends/__init__.py +3 -2
- sky/backends/backend.py +11 -3
- sky/backends/backend_utils.py +588 -184
- sky/backends/cloud_vm_ray_backend.py +1088 -904
- sky/backends/local_docker_backend.py +9 -5
- sky/backends/task_codegen.py +633 -0
- sky/backends/wheel_utils.py +18 -0
- sky/catalog/__init__.py +8 -0
- sky/catalog/aws_catalog.py +4 -0
- sky/catalog/common.py +19 -1
- sky/catalog/data_fetchers/fetch_aws.py +102 -80
- sky/catalog/data_fetchers/fetch_gcp.py +30 -3
- sky/catalog/data_fetchers/fetch_nebius.py +9 -6
- sky/catalog/data_fetchers/fetch_runpod.py +698 -0
- sky/catalog/data_fetchers/fetch_seeweb.py +329 -0
- sky/catalog/data_fetchers/fetch_shadeform.py +142 -0
- sky/catalog/kubernetes_catalog.py +24 -28
- sky/catalog/primeintellect_catalog.py +95 -0
- sky/catalog/runpod_catalog.py +5 -1
- sky/catalog/seeweb_catalog.py +184 -0
- sky/catalog/shadeform_catalog.py +165 -0
- sky/check.py +73 -43
- sky/client/cli/command.py +675 -412
- sky/client/cli/flags.py +4 -2
- sky/{volumes/utils.py → client/cli/table_utils.py} +111 -13
- sky/client/cli/utils.py +79 -0
- sky/client/common.py +12 -2
- sky/client/sdk.py +132 -63
- sky/client/sdk_async.py +34 -33
- sky/cloud_stores.py +82 -3
- sky/clouds/__init__.py +6 -0
- sky/clouds/aws.py +337 -129
- sky/clouds/azure.py +24 -18
- sky/clouds/cloud.py +40 -13
- sky/clouds/cudo.py +16 -13
- sky/clouds/do.py +9 -7
- sky/clouds/fluidstack.py +12 -5
- sky/clouds/gcp.py +14 -7
- sky/clouds/hyperbolic.py +12 -5
- sky/clouds/ibm.py +12 -5
- sky/clouds/kubernetes.py +80 -45
- sky/clouds/lambda_cloud.py +12 -5
- sky/clouds/nebius.py +23 -9
- sky/clouds/oci.py +19 -12
- sky/clouds/paperspace.py +4 -1
- sky/clouds/primeintellect.py +317 -0
- sky/clouds/runpod.py +85 -24
- sky/clouds/scp.py +12 -8
- sky/clouds/seeweb.py +477 -0
- sky/clouds/shadeform.py +400 -0
- sky/clouds/ssh.py +4 -2
- sky/clouds/utils/scp_utils.py +61 -50
- sky/clouds/vast.py +33 -27
- sky/clouds/vsphere.py +14 -16
- sky/core.py +174 -165
- sky/dashboard/out/404.html +1 -1
- sky/dashboard/out/_next/static/96_E2yl3QAiIJGOYCkSpB/_buildManifest.js +1 -0
- sky/dashboard/out/_next/static/chunks/1141-e6aa9ab418717c59.js +11 -0
- sky/dashboard/out/_next/static/chunks/1871-7e202677c42f43fe.js +6 -0
- sky/dashboard/out/_next/static/chunks/2260-7703229c33c5ebd5.js +1 -0
- sky/dashboard/out/_next/static/chunks/2369.fc20f0c2c8ed9fe7.js +15 -0
- sky/dashboard/out/_next/static/chunks/2755.edd818326d489a1d.js +26 -0
- sky/dashboard/out/_next/static/chunks/3294.20a8540fe697d5ee.js +1 -0
- sky/dashboard/out/_next/static/chunks/3785.7e245f318f9d1121.js +1 -0
- sky/dashboard/out/_next/static/chunks/{6601-06114c982db410b6.js → 3800-7b45f9fbb6308557.js} +1 -1
- sky/dashboard/out/_next/static/chunks/4725.172ede95d1b21022.js +1 -0
- sky/dashboard/out/_next/static/chunks/4937.a2baa2df5572a276.js +15 -0
- sky/dashboard/out/_next/static/chunks/6212-7bd06f60ba693125.js +13 -0
- sky/dashboard/out/_next/static/chunks/6856-8f27d1c10c98def8.js +1 -0
- sky/dashboard/out/_next/static/chunks/6990-9146207c4567fdfd.js +1 -0
- sky/dashboard/out/_next/static/chunks/7359-c8d04e06886000b3.js +30 -0
- sky/dashboard/out/_next/static/chunks/7615-019513abc55b3b47.js +1 -0
- sky/dashboard/out/_next/static/chunks/8640.5b9475a2d18c5416.js +16 -0
- sky/dashboard/out/_next/static/chunks/8969-452f9d5cbdd2dc73.js +1 -0
- sky/dashboard/out/_next/static/chunks/9025.fa408f3242e9028d.js +6 -0
- sky/dashboard/out/_next/static/chunks/9353-cff34f7e773b2e2b.js +1 -0
- sky/dashboard/out/_next/static/chunks/9360.a536cf6b1fa42355.js +31 -0
- sky/dashboard/out/_next/static/chunks/9847.3aaca6bb33455140.js +30 -0
- sky/dashboard/out/_next/static/chunks/pages/{_app-ce361c6959bc2001.js → _app-bde01e4a2beec258.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-792db96d918c98c9.js +16 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-abfcac9c137aa543.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters-ee39056f9851a3ff.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/infra/{[context]-6563820e094f68ca.js → [context]-c0b5935149902e6f.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/{infra-aabba60d57826e0f.js → infra-aed0ea19df7cf961.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-d66997e2bfc837cf.js +16 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs/pools/[pool]-9faf940b253e3e06.js +21 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs-2072b48b617989c9.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/users-f42674164aa73423.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/volumes-b84b948ff357c43e.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/workspaces/{[name]-af76bb06dbb3954f.js → [name]-84a40f8c7c627fe4.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/{workspaces-7598c33a746cdc91.js → workspaces-531b2f8c4bf89f82.js} +1 -1
- sky/dashboard/out/_next/static/chunks/webpack-64e05f17bf2cf8ce.js +1 -0
- sky/dashboard/out/_next/static/css/0748ce22df867032.css +3 -0
- sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
- sky/dashboard/out/clusters/[cluster].html +1 -1
- sky/dashboard/out/clusters.html +1 -1
- sky/dashboard/out/config.html +1 -1
- sky/dashboard/out/index.html +1 -1
- sky/dashboard/out/infra/[context].html +1 -1
- sky/dashboard/out/infra.html +1 -1
- sky/dashboard/out/jobs/[job].html +1 -1
- sky/dashboard/out/jobs/pools/[pool].html +1 -1
- sky/dashboard/out/jobs.html +1 -1
- sky/dashboard/out/users.html +1 -1
- sky/dashboard/out/volumes.html +1 -1
- sky/dashboard/out/workspace/new.html +1 -1
- sky/dashboard/out/workspaces/[name].html +1 -1
- sky/dashboard/out/workspaces.html +1 -1
- sky/data/data_utils.py +92 -1
- sky/data/mounting_utils.py +162 -29
- sky/data/storage.py +200 -19
- sky/data/storage_utils.py +10 -45
- sky/exceptions.py +18 -7
- sky/execution.py +74 -31
- sky/global_user_state.py +605 -191
- sky/jobs/__init__.py +2 -0
- sky/jobs/client/sdk.py +101 -4
- sky/jobs/client/sdk_async.py +31 -5
- sky/jobs/constants.py +15 -8
- sky/jobs/controller.py +726 -284
- sky/jobs/file_content_utils.py +128 -0
- sky/jobs/log_gc.py +193 -0
- sky/jobs/recovery_strategy.py +250 -100
- sky/jobs/scheduler.py +271 -173
- sky/jobs/server/core.py +367 -114
- sky/jobs/server/server.py +81 -35
- sky/jobs/server/utils.py +89 -35
- sky/jobs/state.py +1498 -620
- sky/jobs/utils.py +771 -306
- sky/logs/agent.py +40 -5
- sky/logs/aws.py +9 -19
- sky/metrics/utils.py +282 -39
- sky/optimizer.py +1 -1
- sky/provision/__init__.py +37 -1
- sky/provision/aws/config.py +34 -13
- sky/provision/aws/instance.py +5 -2
- sky/provision/azure/instance.py +5 -3
- sky/provision/common.py +2 -0
- sky/provision/cudo/instance.py +4 -3
- sky/provision/do/instance.py +4 -3
- sky/provision/docker_utils.py +97 -26
- sky/provision/fluidstack/instance.py +6 -5
- sky/provision/gcp/config.py +6 -1
- sky/provision/gcp/instance.py +4 -2
- sky/provision/hyperbolic/instance.py +4 -2
- sky/provision/instance_setup.py +66 -20
- sky/provision/kubernetes/__init__.py +2 -0
- sky/provision/kubernetes/config.py +7 -44
- sky/provision/kubernetes/constants.py +0 -1
- sky/provision/kubernetes/instance.py +609 -213
- sky/provision/kubernetes/manifests/fusermount-server-daemonset.yaml +1 -2
- sky/provision/kubernetes/network.py +12 -8
- sky/provision/kubernetes/network_utils.py +8 -25
- sky/provision/kubernetes/utils.py +382 -418
- sky/provision/kubernetes/volume.py +150 -18
- sky/provision/lambda_cloud/instance.py +16 -13
- sky/provision/nebius/instance.py +6 -2
- sky/provision/nebius/utils.py +103 -86
- sky/provision/oci/instance.py +4 -2
- sky/provision/paperspace/instance.py +4 -3
- sky/provision/primeintellect/__init__.py +10 -0
- sky/provision/primeintellect/config.py +11 -0
- sky/provision/primeintellect/instance.py +454 -0
- sky/provision/primeintellect/utils.py +398 -0
- sky/provision/provisioner.py +30 -9
- sky/provision/runpod/__init__.py +2 -0
- sky/provision/runpod/instance.py +4 -3
- sky/provision/runpod/volume.py +69 -13
- sky/provision/scp/instance.py +307 -130
- sky/provision/seeweb/__init__.py +11 -0
- sky/provision/seeweb/config.py +13 -0
- sky/provision/seeweb/instance.py +812 -0
- sky/provision/shadeform/__init__.py +11 -0
- sky/provision/shadeform/config.py +12 -0
- sky/provision/shadeform/instance.py +351 -0
- sky/provision/shadeform/shadeform_utils.py +83 -0
- sky/provision/vast/instance.py +5 -3
- sky/provision/volume.py +164 -0
- sky/provision/vsphere/common/ssl_helper.py +1 -1
- sky/provision/vsphere/common/vapiconnect.py +2 -1
- sky/provision/vsphere/common/vim_utils.py +3 -2
- sky/provision/vsphere/instance.py +8 -6
- sky/provision/vsphere/vsphere_utils.py +8 -1
- sky/resources.py +11 -3
- sky/schemas/api/responses.py +107 -6
- sky/schemas/db/global_user_state/008_skylet_ssh_tunnel_metadata.py +34 -0
- sky/schemas/db/global_user_state/009_last_activity_and_launched_at.py +89 -0
- sky/schemas/db/global_user_state/010_save_ssh_key.py +66 -0
- sky/schemas/db/global_user_state/011_is_ephemeral.py +34 -0
- sky/schemas/db/kv_cache/001_initial_schema.py +29 -0
- sky/schemas/db/serve_state/002_yaml_content.py +34 -0
- sky/schemas/db/skypilot_config/001_initial_schema.py +30 -0
- sky/schemas/db/spot_jobs/002_cluster_pool.py +3 -3
- sky/schemas/db/spot_jobs/004_job_file_contents.py +42 -0
- sky/schemas/db/spot_jobs/005_logs_gc.py +38 -0
- sky/schemas/db/spot_jobs/006_controller_pid_started_at.py +34 -0
- sky/schemas/db/spot_jobs/007_config_file_content.py +34 -0
- sky/schemas/generated/jobsv1_pb2.py +86 -0
- sky/schemas/generated/jobsv1_pb2.pyi +254 -0
- sky/schemas/generated/jobsv1_pb2_grpc.py +542 -0
- sky/schemas/generated/managed_jobsv1_pb2.py +76 -0
- sky/schemas/generated/managed_jobsv1_pb2.pyi +278 -0
- sky/schemas/generated/managed_jobsv1_pb2_grpc.py +278 -0
- sky/schemas/generated/servev1_pb2.py +58 -0
- sky/schemas/generated/servev1_pb2.pyi +115 -0
- sky/schemas/generated/servev1_pb2_grpc.py +322 -0
- sky/serve/autoscalers.py +2 -0
- sky/serve/client/impl.py +55 -21
- sky/serve/constants.py +4 -3
- sky/serve/controller.py +17 -11
- sky/serve/load_balancing_policies.py +1 -1
- sky/serve/replica_managers.py +219 -142
- sky/serve/serve_rpc_utils.py +179 -0
- sky/serve/serve_state.py +63 -54
- sky/serve/serve_utils.py +145 -109
- sky/serve/server/core.py +46 -25
- sky/serve/server/impl.py +311 -162
- sky/serve/server/server.py +21 -19
- sky/serve/service.py +84 -68
- sky/serve/service_spec.py +45 -7
- sky/server/auth/loopback.py +38 -0
- sky/server/auth/oauth2_proxy.py +12 -7
- sky/server/common.py +47 -24
- sky/server/config.py +62 -28
- sky/server/constants.py +9 -1
- sky/server/daemons.py +109 -38
- sky/server/metrics.py +76 -96
- sky/server/middleware_utils.py +166 -0
- sky/server/requests/executor.py +381 -145
- sky/server/requests/payloads.py +71 -18
- sky/server/requests/preconditions.py +15 -13
- sky/server/requests/request_names.py +121 -0
- sky/server/requests/requests.py +507 -157
- sky/server/requests/serializers/decoders.py +48 -17
- sky/server/requests/serializers/encoders.py +85 -20
- sky/server/requests/threads.py +117 -0
- sky/server/rest.py +116 -24
- sky/server/server.py +420 -172
- sky/server/stream_utils.py +219 -45
- sky/server/uvicorn.py +30 -19
- sky/setup_files/MANIFEST.in +6 -1
- sky/setup_files/alembic.ini +8 -0
- sky/setup_files/dependencies.py +62 -19
- sky/setup_files/setup.py +44 -44
- sky/sky_logging.py +13 -5
- sky/skylet/attempt_skylet.py +106 -24
- sky/skylet/configs.py +3 -1
- sky/skylet/constants.py +111 -26
- sky/skylet/events.py +64 -10
- sky/skylet/job_lib.py +141 -104
- sky/skylet/log_lib.py +233 -5
- sky/skylet/log_lib.pyi +40 -2
- sky/skylet/providers/ibm/node_provider.py +12 -8
- sky/skylet/providers/ibm/vpc_provider.py +13 -12
- sky/skylet/runtime_utils.py +21 -0
- sky/skylet/services.py +524 -0
- sky/skylet/skylet.py +22 -1
- sky/skylet/subprocess_daemon.py +104 -29
- sky/skypilot_config.py +99 -79
- sky/ssh_node_pools/server.py +9 -8
- sky/task.py +221 -104
- sky/templates/aws-ray.yml.j2 +1 -0
- sky/templates/azure-ray.yml.j2 +1 -0
- sky/templates/cudo-ray.yml.j2 +1 -0
- sky/templates/do-ray.yml.j2 +1 -0
- sky/templates/fluidstack-ray.yml.j2 +1 -0
- sky/templates/gcp-ray.yml.j2 +1 -0
- sky/templates/hyperbolic-ray.yml.j2 +1 -0
- sky/templates/ibm-ray.yml.j2 +2 -1
- sky/templates/jobs-controller.yaml.j2 +3 -0
- sky/templates/kubernetes-ray.yml.j2 +196 -55
- sky/templates/lambda-ray.yml.j2 +1 -0
- sky/templates/nebius-ray.yml.j2 +3 -0
- sky/templates/oci-ray.yml.j2 +1 -0
- sky/templates/paperspace-ray.yml.j2 +1 -0
- sky/templates/primeintellect-ray.yml.j2 +72 -0
- sky/templates/runpod-ray.yml.j2 +1 -0
- sky/templates/scp-ray.yml.j2 +1 -0
- sky/templates/seeweb-ray.yml.j2 +171 -0
- sky/templates/shadeform-ray.yml.j2 +73 -0
- sky/templates/vast-ray.yml.j2 +1 -0
- sky/templates/vsphere-ray.yml.j2 +1 -0
- sky/templates/websocket_proxy.py +188 -43
- sky/usage/usage_lib.py +16 -4
- sky/users/permission.py +60 -43
- sky/utils/accelerator_registry.py +6 -3
- sky/utils/admin_policy_utils.py +18 -5
- sky/utils/annotations.py +22 -0
- sky/utils/asyncio_utils.py +78 -0
- sky/utils/atomic.py +1 -1
- sky/utils/auth_utils.py +153 -0
- sky/utils/cli_utils/status_utils.py +12 -7
- sky/utils/cluster_utils.py +28 -6
- sky/utils/command_runner.py +88 -27
- sky/utils/command_runner.pyi +36 -3
- sky/utils/common.py +3 -1
- sky/utils/common_utils.py +37 -4
- sky/utils/config_utils.py +1 -14
- sky/utils/context.py +127 -40
- sky/utils/context_utils.py +73 -18
- sky/utils/controller_utils.py +229 -70
- sky/utils/db/db_utils.py +95 -18
- sky/utils/db/kv_cache.py +149 -0
- sky/utils/db/migration_utils.py +24 -7
- sky/utils/env_options.py +4 -0
- sky/utils/git.py +559 -1
- sky/utils/kubernetes/create_cluster.sh +15 -30
- sky/utils/kubernetes/delete_cluster.sh +10 -7
- sky/utils/kubernetes/{deploy_remote_cluster.py → deploy_ssh_node_pools.py} +258 -380
- sky/utils/kubernetes/generate_kind_config.py +6 -66
- sky/utils/kubernetes/gpu_labeler.py +13 -3
- sky/utils/kubernetes/k8s_gpu_labeler_job.yaml +2 -1
- sky/utils/kubernetes/k8s_gpu_labeler_setup.yaml +16 -16
- sky/utils/kubernetes/kubernetes_deploy_utils.py +213 -194
- sky/utils/kubernetes/rsync_helper.sh +11 -3
- sky/utils/kubernetes_enums.py +7 -15
- sky/utils/lock_events.py +4 -4
- sky/utils/locks.py +128 -31
- sky/utils/log_utils.py +0 -319
- sky/utils/resource_checker.py +13 -10
- sky/utils/resources_utils.py +53 -29
- sky/utils/rich_utils.py +8 -4
- sky/utils/schemas.py +107 -52
- sky/utils/subprocess_utils.py +17 -4
- sky/utils/thread_utils.py +91 -0
- sky/utils/timeline.py +2 -1
- sky/utils/ux_utils.py +35 -1
- sky/utils/volume.py +88 -4
- sky/utils/yaml_utils.py +9 -0
- sky/volumes/client/sdk.py +48 -10
- sky/volumes/server/core.py +59 -22
- sky/volumes/server/server.py +46 -17
- sky/volumes/volume.py +54 -42
- sky/workspaces/core.py +57 -21
- sky/workspaces/server.py +13 -12
- sky_templates/README.md +3 -0
- sky_templates/__init__.py +3 -0
- sky_templates/ray/__init__.py +0 -0
- sky_templates/ray/start_cluster +183 -0
- sky_templates/ray/stop_cluster +75 -0
- {skypilot_nightly-1.0.0.dev20250905.dist-info → skypilot_nightly-1.0.0.dev20251203.dist-info}/METADATA +331 -65
- skypilot_nightly-1.0.0.dev20251203.dist-info/RECORD +611 -0
- skypilot_nightly-1.0.0.dev20251203.dist-info/top_level.txt +2 -0
- sky/client/cli/git.py +0 -549
- sky/dashboard/out/_next/static/chunks/1121-408ed10b2f9fce17.js +0 -1
- sky/dashboard/out/_next/static/chunks/1141-943efc7aff0f0c06.js +0 -1
- sky/dashboard/out/_next/static/chunks/1836-37fede578e2da5f8.js +0 -40
- sky/dashboard/out/_next/static/chunks/3015-86cabed5d4669ad0.js +0 -1
- sky/dashboard/out/_next/static/chunks/3294.c80326aec9bfed40.js +0 -6
- sky/dashboard/out/_next/static/chunks/3785.4872a2f3aa489880.js +0 -1
- sky/dashboard/out/_next/static/chunks/4045.b30465273dc5e468.js +0 -21
- sky/dashboard/out/_next/static/chunks/4676-9da7fdbde90b5549.js +0 -10
- sky/dashboard/out/_next/static/chunks/4725.10f7a9a5d3ea8208.js +0 -1
- sky/dashboard/out/_next/static/chunks/5339.3fda4a4010ff4e06.js +0 -51
- sky/dashboard/out/_next/static/chunks/6135-4b4d5e824b7f9d3c.js +0 -1
- sky/dashboard/out/_next/static/chunks/649.b9d7f7d10c1b8c53.js +0 -45
- sky/dashboard/out/_next/static/chunks/6856-dca7962af4814e1b.js +0 -1
- sky/dashboard/out/_next/static/chunks/6990-08b2a1cae076a943.js +0 -1
- sky/dashboard/out/_next/static/chunks/7325.b4bc99ce0892dcd5.js +0 -6
- sky/dashboard/out/_next/static/chunks/754-d0da8ab45f9509e9.js +0 -18
- sky/dashboard/out/_next/static/chunks/7669.1f5d9a402bf5cc42.js +0 -36
- sky/dashboard/out/_next/static/chunks/8969-0be3036bf86f8256.js +0 -1
- sky/dashboard/out/_next/static/chunks/9025.c12318fb6a1a9093.js +0 -6
- sky/dashboard/out/_next/static/chunks/9037-fa1737818d0a0969.js +0 -6
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-1cbba24bd1bd35f8.js +0 -16
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-0b4b35dc1dfe046c.js +0 -16
- sky/dashboard/out/_next/static/chunks/pages/clusters-469814d711d63b1b.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-dd64309c3fe67ed2.js +0 -11
- sky/dashboard/out/_next/static/chunks/pages/jobs/pools/[pool]-07349868f7905d37.js +0 -16
- sky/dashboard/out/_next/static/chunks/pages/jobs-1f70d9faa564804f.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/users-018bf31cda52e11b.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/volumes-739726d6b823f532.js +0 -1
- sky/dashboard/out/_next/static/chunks/webpack-4fe903277b57b523.js +0 -1
- sky/dashboard/out/_next/static/css/4614e06482d7309e.css +0 -3
- sky/dashboard/out/_next/static/mS-4qZPSkRuA1u-g2wQhg/_buildManifest.js +0 -1
- sky/templates/kubernetes-ssh-jump.yml.j2 +0 -94
- sky/utils/kubernetes/ssh_jump_lifecycle_manager.py +0 -191
- skypilot_nightly-1.0.0.dev20250905.dist-info/RECORD +0 -547
- skypilot_nightly-1.0.0.dev20250905.dist-info/top_level.txt +0 -1
- /sky/dashboard/out/_next/static/{mS-4qZPSkRuA1u-g2wQhg → 96_E2yl3QAiIJGOYCkSpB}/_ssgManifest.js +0 -0
- {skypilot_nightly-1.0.0.dev20250905.dist-info → skypilot_nightly-1.0.0.dev20251203.dist-info}/WHEEL +0 -0
- {skypilot_nightly-1.0.0.dev20250905.dist-info → skypilot_nightly-1.0.0.dev20251203.dist-info}/entry_points.txt +0 -0
- {skypilot_nightly-1.0.0.dev20250905.dist-info → skypilot_nightly-1.0.0.dev20251203.dist-info}/licenses/LICENSE +0 -0
sky/provision/aws/instance.py
CHANGED
|
@@ -311,9 +311,10 @@ def _get_head_instance_id(instances: List) -> Optional[str]:
|
|
|
311
311
|
return head_instance_id
|
|
312
312
|
|
|
313
313
|
|
|
314
|
-
def run_instances(region: str, cluster_name_on_cloud: str,
|
|
314
|
+
def run_instances(region: str, cluster_name: str, cluster_name_on_cloud: str,
|
|
315
315
|
config: common.ProvisionConfig) -> common.ProvisionRecord:
|
|
316
316
|
"""See sky/provision/__init__.py"""
|
|
317
|
+
del cluster_name # unused
|
|
317
318
|
ec2 = _default_ec2_resource(region)
|
|
318
319
|
# NOTE: We set max_attempts=0 for fast failing when the resource is not
|
|
319
320
|
# available (although the doc says it will only retry for network
|
|
@@ -629,9 +630,10 @@ def query_instances(
|
|
|
629
630
|
cluster_name_on_cloud: str,
|
|
630
631
|
provider_config: Optional[Dict[str, Any]] = None,
|
|
631
632
|
non_terminated_only: bool = True,
|
|
633
|
+
retry_if_missing: bool = False,
|
|
632
634
|
) -> Dict[str, Tuple[Optional['status_lib.ClusterStatus'], Optional[str]]]:
|
|
633
635
|
"""See sky/provision/__init__.py"""
|
|
634
|
-
del cluster_name # unused
|
|
636
|
+
del cluster_name, retry_if_missing # unused
|
|
635
637
|
assert provider_config is not None, (cluster_name_on_cloud, provider_config)
|
|
636
638
|
region = provider_config['region']
|
|
637
639
|
ec2 = _default_ec2_resource(region)
|
|
@@ -743,6 +745,7 @@ def terminate_instances(
|
|
|
743
745
|
|
|
744
746
|
# Make this multithreaded: modify all instances' SGs in parallel.
|
|
745
747
|
def modify_instance_sg(instance):
|
|
748
|
+
assert default_sg is not None # Type narrowing for mypy
|
|
746
749
|
instance.modify_attribute(Groups=[default_sg.id])
|
|
747
750
|
logger.debug(f'Instance {instance.id} modified to use default SG:'
|
|
748
751
|
f'{default_sg.id} for quick deletion.')
|
sky/provision/azure/instance.py
CHANGED
|
@@ -214,7 +214,7 @@ def _create_network_interface(
|
|
|
214
214
|
location=provider_config['location'],
|
|
215
215
|
public_ip_allocation_method='Static',
|
|
216
216
|
public_ip_address_version='IPv4',
|
|
217
|
-
sku=network.PublicIPAddressSku(name='
|
|
217
|
+
sku=network.PublicIPAddressSku(name='Standard', tier='Regional'))
|
|
218
218
|
ip_poller = network_client.public_ip_addresses.begin_create_or_update(
|
|
219
219
|
resource_group_name=provider_config['resource_group'],
|
|
220
220
|
public_ip_address_name=f'{vm_name}-ip',
|
|
@@ -362,9 +362,10 @@ def _create_instances(compute_client: 'azure_compute.ComputeManagementClient',
|
|
|
362
362
|
return instances
|
|
363
363
|
|
|
364
364
|
|
|
365
|
-
def run_instances(region: str, cluster_name_on_cloud: str,
|
|
365
|
+
def run_instances(region: str, cluster_name: str, cluster_name_on_cloud: str,
|
|
366
366
|
config: common.ProvisionConfig) -> common.ProvisionRecord:
|
|
367
367
|
"""See sky/provision/__init__.py"""
|
|
368
|
+
del cluster_name # unused
|
|
368
369
|
# TODO(zhwu): This function is too long. We should refactor it.
|
|
369
370
|
provider_config = config.provider_config
|
|
370
371
|
resource_group = provider_config['resource_group']
|
|
@@ -956,9 +957,10 @@ def query_instances(
|
|
|
956
957
|
cluster_name_on_cloud: str,
|
|
957
958
|
provider_config: Optional[Dict[str, Any]] = None,
|
|
958
959
|
non_terminated_only: bool = True,
|
|
960
|
+
retry_if_missing: bool = False,
|
|
959
961
|
) -> Dict[str, Tuple[Optional['status_lib.ClusterStatus'], Optional[str]]]:
|
|
960
962
|
"""See sky/provision/__init__.py"""
|
|
961
|
-
del cluster_name # unused
|
|
963
|
+
del cluster_name, retry_if_missing # unused
|
|
962
964
|
assert provider_config is not None, cluster_name_on_cloud
|
|
963
965
|
|
|
964
966
|
subscription_id = provider_config['subscription_id']
|
sky/provision/common.py
CHANGED
|
@@ -97,6 +97,8 @@ class InstanceInfo:
|
|
|
97
97
|
external_ip: Optional[str]
|
|
98
98
|
tags: Dict[str, str]
|
|
99
99
|
ssh_port: int = 22
|
|
100
|
+
# The internal service address of the instance on Kubernetes.
|
|
101
|
+
internal_svc: Optional[str] = None
|
|
100
102
|
|
|
101
103
|
def get_feasible_ip(self) -> str:
|
|
102
104
|
"""Get the most feasible IPs of the instance. This function returns
|
sky/provision/cudo/instance.py
CHANGED
|
@@ -40,10 +40,10 @@ def _get_head_instance_id(instances: Dict[str, Any]) -> Optional[str]:
|
|
|
40
40
|
return head_instance_id
|
|
41
41
|
|
|
42
42
|
|
|
43
|
-
def run_instances(region: str, cluster_name_on_cloud: str,
|
|
43
|
+
def run_instances(region: str, cluster_name: str, cluster_name_on_cloud: str,
|
|
44
44
|
config: common.ProvisionConfig) -> common.ProvisionRecord:
|
|
45
45
|
"""Runs instances for the given cluster."""
|
|
46
|
-
|
|
46
|
+
del cluster_name # unused
|
|
47
47
|
pending_status = ['pend', 'init', 'prol', 'boot']
|
|
48
48
|
|
|
49
49
|
while True:
|
|
@@ -195,9 +195,10 @@ def query_instances(
|
|
|
195
195
|
cluster_name_on_cloud: str,
|
|
196
196
|
provider_config: Optional[Dict[str, Any]] = None,
|
|
197
197
|
non_terminated_only: bool = True,
|
|
198
|
+
retry_if_missing: bool = False,
|
|
198
199
|
) -> Dict[str, Tuple[Optional['status_lib.ClusterStatus'], Optional[str]]]:
|
|
199
200
|
"""See sky/provision/__init__.py"""
|
|
200
|
-
del cluster_name # unused
|
|
201
|
+
del cluster_name, retry_if_missing # unused
|
|
201
202
|
assert provider_config is not None, (cluster_name_on_cloud, provider_config)
|
|
202
203
|
instances = _filter_instances(cluster_name_on_cloud, None)
|
|
203
204
|
|
sky/provision/do/instance.py
CHANGED
|
@@ -26,10 +26,10 @@ def _get_head_instance(
|
|
|
26
26
|
return None
|
|
27
27
|
|
|
28
28
|
|
|
29
|
-
def run_instances(region: str, cluster_name_on_cloud: str,
|
|
29
|
+
def run_instances(region: str, cluster_name: str, cluster_name_on_cloud: str,
|
|
30
30
|
config: common.ProvisionConfig) -> common.ProvisionRecord:
|
|
31
31
|
"""Runs instances for the given cluster."""
|
|
32
|
-
|
|
32
|
+
del cluster_name # unused
|
|
33
33
|
pending_status = ['new']
|
|
34
34
|
newly_started_instances = utils.filter_instances(cluster_name_on_cloud,
|
|
35
35
|
pending_status + ['off'])
|
|
@@ -246,9 +246,10 @@ def query_instances(
|
|
|
246
246
|
cluster_name_on_cloud: str,
|
|
247
247
|
provider_config: Optional[Dict[str, Any]] = None,
|
|
248
248
|
non_terminated_only: bool = True,
|
|
249
|
+
retry_if_missing: bool = False,
|
|
249
250
|
) -> Dict[str, Tuple[Optional['status_lib.ClusterStatus'], Optional[str]]]:
|
|
250
251
|
"""See sky/provision/__init__.py"""
|
|
251
|
-
del cluster_name # unused
|
|
252
|
+
del cluster_name, retry_if_missing # unused
|
|
252
253
|
# terminated instances are not retrieved by the
|
|
253
254
|
# API making `non_terminated_only` argument moot.
|
|
254
255
|
del non_terminated_only
|
sky/provision/docker_utils.py
CHANGED
|
@@ -3,7 +3,7 @@
|
|
|
3
3
|
import dataclasses
|
|
4
4
|
import shlex
|
|
5
5
|
import time
|
|
6
|
-
from typing import Any, Dict, List
|
|
6
|
+
from typing import Any, Dict, List, Optional
|
|
7
7
|
|
|
8
8
|
from sky import sky_logging
|
|
9
9
|
from sky.skylet import constants
|
|
@@ -15,23 +15,52 @@ logger = sky_logging.init_logger(__name__)
|
|
|
15
15
|
# Configure environment variables. A docker image can have environment variables
|
|
16
16
|
# set in the Dockerfile with `ENV``. We need to export these variables to the
|
|
17
17
|
# shell environment, so that our ssh session can access them.
|
|
18
|
+
# Filter out RAY_RUNTIME_ENV_HOOK to prevent Ray version conflicts.
|
|
19
|
+
# Docker images with Ray 2.48.0+ set this for UV package manager support,
|
|
20
|
+
# but it causes FAILED_DRIVER errors with SkyPilot's Ray 2.9.3.
|
|
21
|
+
# See: https://github.com/skypilot-org/skypilot/pull/7181
|
|
18
22
|
SETUP_ENV_VARS_CMD = (
|
|
19
23
|
'prefix_cmd() '
|
|
20
24
|
'{ if [ $(id -u) -ne 0 ]; then echo "sudo"; else echo ""; fi; } && '
|
|
21
|
-
'export -p > ~/container_env_var.sh && '
|
|
25
|
+
'export -p | grep -v RAY_RUNTIME_ENV_HOOK > ~/container_env_var.sh && '
|
|
22
26
|
'$(prefix_cmd) '
|
|
23
27
|
'mv ~/container_env_var.sh /etc/profile.d/container_env_var.sh;')
|
|
24
28
|
|
|
25
29
|
# Docker daemon may not be ready when the machine is firstly started. The error
|
|
26
30
|
# message starts with the following string. We should wait for a while and retry
|
|
27
31
|
# the command.
|
|
28
|
-
DOCKER_PERMISSION_DENIED_STR = ('permission denied while trying to connect to '
|
|
29
|
-
'the Docker daemon socket')
|
|
32
|
+
DOCKER_PERMISSION_DENIED_STR = ('permission denied while trying to connect to ')
|
|
30
33
|
|
|
31
34
|
DOCKER_SOCKET_NOT_READY_STR = ('Is the docker daemon running?')
|
|
35
|
+
DOCKER_SOCKET_NOT_READY_STR_2 = (
|
|
36
|
+
'check if the path is correct and if the daemon is running')
|
|
32
37
|
|
|
33
38
|
_DOCKER_SOCKET_WAIT_TIMEOUT_SECONDS = 30
|
|
34
39
|
|
|
40
|
+
# Install AWS CLI v2 (not v1 from pip) as it's required for ECR authentication
|
|
41
|
+
# AWS CLI v2 is installed as a standalone binary, not a Python package. See:
|
|
42
|
+
# https://docs.aws.amazon.com/cli/latest/userguide/getting-started-install.html
|
|
43
|
+
INSTALL_AWS_CLI_CMD = (
|
|
44
|
+
'which aws || ((command -v unzip >/dev/null 2>&1 || '
|
|
45
|
+
'(sudo apt-get update && sudo apt-get install -y unzip)) && '
|
|
46
|
+
'curl -fsSL "https://awscli.amazonaws.com/awscli-exe-linux-x86_64.zip" '
|
|
47
|
+
'-o "/tmp/awscliv2.zip" && '
|
|
48
|
+
'unzip -q /tmp/awscliv2.zip -d /tmp && sudo /tmp/aws/install '
|
|
49
|
+
'&& rm -rf /tmp/awscliv2.zip /tmp/aws)')
|
|
50
|
+
|
|
51
|
+
|
|
52
|
+
def _extract_region_from_ecr_server(server: str) -> str:
|
|
53
|
+
"""Extract AWS region from ECR server URL.
|
|
54
|
+
|
|
55
|
+
ECR server format: <account-id>.dkr.ecr.<region>.amazonaws.com
|
|
56
|
+
Returns the region part from the URL.
|
|
57
|
+
"""
|
|
58
|
+
# Split: ['<account-id>', 'dkr', 'ecr', '<region>', 'amazonaws', 'com']
|
|
59
|
+
parts = server.split('.')
|
|
60
|
+
if len(parts) >= 6 and parts[1] == 'dkr' and parts[2] == 'ecr':
|
|
61
|
+
return parts[3]
|
|
62
|
+
raise ValueError(f'Invalid ECR server format: {server}')
|
|
63
|
+
|
|
35
64
|
|
|
36
65
|
@dataclasses.dataclass
|
|
37
66
|
class DockerLoginConfig:
|
|
@@ -157,19 +186,23 @@ class DockerInitializer:
|
|
|
157
186
|
self.docker_config = docker_config
|
|
158
187
|
self.container_name = docker_config['container_name']
|
|
159
188
|
self.runner = runner
|
|
160
|
-
self.home_dir = None
|
|
189
|
+
self.home_dir: Optional[str] = None
|
|
161
190
|
self.initialized = False
|
|
162
191
|
# podman is not fully tested yet.
|
|
163
192
|
use_podman = docker_config.get('use_podman', False)
|
|
164
193
|
self.docker_cmd = 'podman' if use_podman else 'docker'
|
|
165
194
|
self.log_path = log_path
|
|
166
195
|
|
|
167
|
-
def _run(
|
|
168
|
-
|
|
169
|
-
|
|
170
|
-
|
|
171
|
-
|
|
172
|
-
|
|
196
|
+
def _run(
|
|
197
|
+
self,
|
|
198
|
+
cmd,
|
|
199
|
+
run_env='host',
|
|
200
|
+
wait_for_docker_daemon: bool = False,
|
|
201
|
+
separate_stderr: bool = False,
|
|
202
|
+
log_err_when_fail: bool = True,
|
|
203
|
+
flock_name: Optional[str] = None,
|
|
204
|
+
flock_args: Optional[str] = None,
|
|
205
|
+
) -> str:
|
|
173
206
|
|
|
174
207
|
if run_env == 'docker':
|
|
175
208
|
cmd = self._docker_expand_user(cmd, any_char=True)
|
|
@@ -178,8 +211,13 @@ class DockerInitializer:
|
|
|
178
211
|
# an error: `the input device is not a TTY`, and it works without
|
|
179
212
|
# `-it` flag.
|
|
180
213
|
# TODO(zhwu): ray use the `-it` flag, we need to check why.
|
|
181
|
-
cmd = (f'{self.docker_cmd} exec {self.container_name}
|
|
182
|
-
f' {shlex.quote(cmd)} ')
|
|
214
|
+
cmd = (f'{self.docker_cmd} exec -u 0 {self.container_name}'
|
|
215
|
+
f' /bin/bash -c {shlex.quote(cmd)} ')
|
|
216
|
+
|
|
217
|
+
if flock_name is not None:
|
|
218
|
+
flock_args = flock_args or ''
|
|
219
|
+
cmd = (f'flock {flock_args} /tmp/{flock_name} '
|
|
220
|
+
f'-c {shlex.quote(cmd)}')
|
|
183
221
|
|
|
184
222
|
logger.debug(f'+ {cmd}')
|
|
185
223
|
start = time.time()
|
|
@@ -191,7 +229,8 @@ class DockerInitializer:
|
|
|
191
229
|
separate_stderr=separate_stderr,
|
|
192
230
|
log_path=self.log_path)
|
|
193
231
|
if (DOCKER_PERMISSION_DENIED_STR in stdout + stderr or
|
|
194
|
-
DOCKER_SOCKET_NOT_READY_STR in stdout + stderr
|
|
232
|
+
DOCKER_SOCKET_NOT_READY_STR in stdout + stderr or
|
|
233
|
+
DOCKER_SOCKET_NOT_READY_STR_2 in stdout + stderr):
|
|
195
234
|
if wait_for_docker_daemon:
|
|
196
235
|
if time.time(
|
|
197
236
|
) - start > _DOCKER_SOCKET_WAIT_TIMEOUT_SECONDS:
|
|
@@ -231,14 +270,17 @@ class DockerInitializer:
|
|
|
231
270
|
if self._check_container_exited():
|
|
232
271
|
self.initialized = True
|
|
233
272
|
self._run(f'{self.docker_cmd} start {self.container_name}')
|
|
234
|
-
self._run('sudo service ssh start',
|
|
273
|
+
self._run('sudo service ssh start',
|
|
274
|
+
run_env='docker',
|
|
275
|
+
flock_name=f'{self.container_name}.sky.lifecycle.lock',
|
|
276
|
+
flock_args='-s -w 1')
|
|
235
277
|
return self._run('whoami', run_env='docker')
|
|
236
278
|
|
|
237
279
|
# SkyPilot: Docker login if user specified a private docker registry.
|
|
238
280
|
if 'docker_login_config' in self.docker_config:
|
|
239
|
-
# TODO(tian): Maybe support a command to get the login password?
|
|
240
281
|
docker_login_config = DockerLoginConfig(
|
|
241
282
|
**self.docker_config['docker_login_config'])
|
|
283
|
+
|
|
242
284
|
if docker_login_config.password:
|
|
243
285
|
# Password is allowed to be empty, in that case, we will not run
|
|
244
286
|
# the login command, and assume that the image pulling is
|
|
@@ -249,6 +291,25 @@ class DockerInitializer:
|
|
|
249
291
|
f'--password {shlex.quote(docker_login_config.password)} '
|
|
250
292
|
f'{shlex.quote(docker_login_config.server)}',
|
|
251
293
|
wait_for_docker_daemon=True)
|
|
294
|
+
elif (docker_login_config.server.endswith('.amazonaws.com') and
|
|
295
|
+
'.dkr.ecr.' in docker_login_config.server):
|
|
296
|
+
# AWS ECR: Use aws ecr get-login-password for authentication
|
|
297
|
+
# ECR format: <account-id>.dkr.ecr.<region>.amazonaws.com
|
|
298
|
+
# This command uses the IAM credentials from the EC2 instance
|
|
299
|
+
# Ref: https://docs.aws.amazon.com/AmazonECR/latest/userguide/registry_auth.html # pylint: disable=line-too-long
|
|
300
|
+
region = _extract_region_from_ecr_server(
|
|
301
|
+
docker_login_config.server)
|
|
302
|
+
|
|
303
|
+
# AWS CLI is not pre-installed on AWS instances, unlike gcloud
|
|
304
|
+
# on GCP instances, so we need to install it first
|
|
305
|
+
self._run(INSTALL_AWS_CLI_CMD, wait_for_docker_daemon=False)
|
|
306
|
+
|
|
307
|
+
self._run(
|
|
308
|
+
f'aws ecr get-login-password --region {region} | '
|
|
309
|
+
f'{self.docker_cmd} login --username AWS '
|
|
310
|
+
f'--password-stdin '
|
|
311
|
+
f'{shlex.quote(docker_login_config.server)}',
|
|
312
|
+
wait_for_docker_daemon=True)
|
|
252
313
|
elif docker_login_config.server.endswith('-docker.pkg.dev'):
|
|
253
314
|
# Docker image server is on GCR, we need to do additional setup
|
|
254
315
|
# to pull the image.
|
|
@@ -311,7 +372,9 @@ class DockerInitializer:
|
|
|
311
372
|
self._auto_configure_shm(user_docker_run_options)),
|
|
312
373
|
self.docker_cmd,
|
|
313
374
|
)
|
|
314
|
-
self._run(f'{remove_container_cmd}
|
|
375
|
+
self._run(f'{remove_container_cmd} && {start_command}',
|
|
376
|
+
flock_name=f'{self.container_name}.sky.lifecycle.lock',
|
|
377
|
+
flock_args='-x -w 10')
|
|
315
378
|
|
|
316
379
|
# SkyPilot: Setup Commands.
|
|
317
380
|
# TODO(zhwu): the following setups should be aligned with the kubernetes
|
|
@@ -329,14 +392,18 @@ class DockerInitializer:
|
|
|
329
392
|
'echo "export DEBIAN_FRONTEND=noninteractive" >> ~/.bashrc;',
|
|
330
393
|
run_env='docker')
|
|
331
394
|
# Install dependencies.
|
|
332
|
-
|
|
333
|
-
'
|
|
395
|
+
cmd = (
|
|
396
|
+
'bash -lc \''
|
|
397
|
+
'exec 200>/var/tmp/sky_apt.lock; '
|
|
398
|
+
'flock -x -w 120 200 || exit 1; '
|
|
399
|
+
'export DEBIAN_FRONTEND=noninteractive; '
|
|
400
|
+
'apt-get -yq update && '
|
|
334
401
|
# Our mount script will install gcsfuse without fuse package.
|
|
335
402
|
# We need to install fuse package first to enable storage mount.
|
|
336
403
|
# The dpkg option is to suppress the prompt for fuse installation.
|
|
337
|
-
'
|
|
338
|
-
'rsync curl wget patch openssh-server python3-pip fuse
|
|
339
|
-
|
|
404
|
+
'apt-get -o DPkg::Options::=--force-confnew install -y '
|
|
405
|
+
'rsync curl wget patch openssh-server python3-pip fuse\'')
|
|
406
|
+
self._run(cmd, run_env='docker')
|
|
340
407
|
|
|
341
408
|
# Copy local authorized_keys to docker container.
|
|
342
409
|
# Stop and disable jupyter service. This is to avoid port conflict on
|
|
@@ -367,7 +434,7 @@ class DockerInitializer:
|
|
|
367
434
|
# pylint: disable=anomalous-backslash-in-string
|
|
368
435
|
self._run(
|
|
369
436
|
'sudo sed -i "/^Port .*/d" /etc/ssh/sshd_config;'
|
|
370
|
-
f'
|
|
437
|
+
f'echo "Port {port}" | sudo tee -a /etc/ssh/sshd_config > /dev/null;'
|
|
371
438
|
'mkdir -p ~/.ssh;'
|
|
372
439
|
'cat /tmp/host_ssh_authorized_keys >> ~/.ssh/authorized_keys;'
|
|
373
440
|
'sudo service ssh start;'
|
|
@@ -412,9 +479,13 @@ class DockerInitializer:
|
|
|
412
479
|
user_pos = string.find('~')
|
|
413
480
|
if user_pos > -1:
|
|
414
481
|
if self.home_dir is None:
|
|
415
|
-
cmd = (f'{self.docker_cmd} exec {self.container_name}
|
|
416
|
-
'printenv HOME')
|
|
417
|
-
self.home_dir = self._run(
|
|
482
|
+
cmd = (f'{self.docker_cmd} exec {self.container_name}'
|
|
483
|
+
' printenv HOME')
|
|
484
|
+
self.home_dir = self._run(
|
|
485
|
+
cmd,
|
|
486
|
+
separate_stderr=True,
|
|
487
|
+
flock_name=f'{self.container_name}.sky.lifecycle.lock',
|
|
488
|
+
flock_args='-s -w 1')
|
|
418
489
|
# Check for unexpected newline in home directory, which can be
|
|
419
490
|
# a common issue when the output is mixed with stderr.
|
|
420
491
|
assert '\n' not in self.home_dir, (
|
|
@@ -3,11 +3,11 @@ import os
|
|
|
3
3
|
import time
|
|
4
4
|
from typing import Any, Dict, List, Optional, Tuple
|
|
5
5
|
|
|
6
|
-
from sky import authentication as auth
|
|
7
6
|
from sky import exceptions
|
|
8
7
|
from sky import sky_logging
|
|
9
8
|
from sky.provision import common
|
|
10
9
|
from sky.provision.fluidstack import fluidstack_utils as utils
|
|
10
|
+
from sky.utils import auth_utils
|
|
11
11
|
from sky.utils import command_runner
|
|
12
12
|
from sky.utils import common_utils
|
|
13
13
|
from sky.utils import status_lib
|
|
@@ -27,7 +27,7 @@ logger = sky_logging.init_logger(__name__)
|
|
|
27
27
|
def get_internal_ip(node_info: Dict[str, Any]) -> None:
|
|
28
28
|
node_info['internal_ip'] = node_info['ip_address']
|
|
29
29
|
|
|
30
|
-
private_key_path, _ =
|
|
30
|
+
private_key_path, _ = auth_utils.get_or_generate_keys()
|
|
31
31
|
runner = command_runner.SSHCommandRunner(
|
|
32
32
|
(node_info['ip_address'], 22),
|
|
33
33
|
ssh_user='ubuntu',
|
|
@@ -78,10 +78,10 @@ def _get_head_instance_id(instances: Dict[str, Any]) -> Optional[str]:
|
|
|
78
78
|
return head_instance_id
|
|
79
79
|
|
|
80
80
|
|
|
81
|
-
def run_instances(region: str, cluster_name_on_cloud: str,
|
|
81
|
+
def run_instances(region: str, cluster_name: str, cluster_name_on_cloud: str,
|
|
82
82
|
config: common.ProvisionConfig) -> common.ProvisionRecord:
|
|
83
83
|
"""Runs instances for the given cluster."""
|
|
84
|
-
|
|
84
|
+
del cluster_name # unused
|
|
85
85
|
pending_status = ['pending', 'provisioning']
|
|
86
86
|
while True:
|
|
87
87
|
instances = _filter_instances(cluster_name_on_cloud, pending_status)
|
|
@@ -291,9 +291,10 @@ def query_instances(
|
|
|
291
291
|
cluster_name_on_cloud: str,
|
|
292
292
|
provider_config: Optional[Dict[str, Any]] = None,
|
|
293
293
|
non_terminated_only: bool = True,
|
|
294
|
+
retry_if_missing: bool = False,
|
|
294
295
|
) -> Dict[str, Tuple[Optional['status_lib.ClusterStatus'], Optional[str]]]:
|
|
295
296
|
"""See sky/provision/__init__.py"""
|
|
296
|
-
del cluster_name # unused
|
|
297
|
+
del cluster_name, retry_if_missing # unused
|
|
297
298
|
assert provider_config is not None, (cluster_name_on_cloud, provider_config)
|
|
298
299
|
instances = _filter_instances(cluster_name_on_cloud, None)
|
|
299
300
|
instances = _filter_instances(cluster_name_on_cloud, None)
|
sky/provision/gcp/config.py
CHANGED
|
@@ -5,6 +5,8 @@ import time
|
|
|
5
5
|
import typing
|
|
6
6
|
from typing import Any, Dict, List, Set, Tuple
|
|
7
7
|
|
|
8
|
+
from typing_extensions import TypedDict
|
|
9
|
+
|
|
8
10
|
from sky.adaptors import gcp
|
|
9
11
|
from sky.clouds.utils import gcp_utils
|
|
10
12
|
from sky.provision import common
|
|
@@ -415,6 +417,9 @@ def _configure_iam_role(config: common.ProvisionConfig, crm, iam) -> dict:
|
|
|
415
417
|
return iam_role
|
|
416
418
|
|
|
417
419
|
|
|
420
|
+
AllowedList = TypedDict('AllowedList', {'IPProtocol': str, 'ports': List[str]})
|
|
421
|
+
|
|
422
|
+
|
|
418
423
|
def _check_firewall_rules(cluster_name: str, vpc_name: str, project_id: str,
|
|
419
424
|
compute):
|
|
420
425
|
"""Check if the firewall rules in the VPC are sufficient."""
|
|
@@ -466,7 +471,7 @@ def _check_firewall_rules(cluster_name: str, vpc_name: str, project_id: str,
|
|
|
466
471
|
}
|
|
467
472
|
"""
|
|
468
473
|
source2rules: Dict[Tuple[str, str], Dict[str, Set[int]]] = {}
|
|
469
|
-
source2allowed_list: Dict[Tuple[str, str], List[
|
|
474
|
+
source2allowed_list: Dict[Tuple[str, str], List[AllowedList]] = {}
|
|
470
475
|
for rule in rules:
|
|
471
476
|
# Rules applied to specific VM (targetTags) may not work for the
|
|
472
477
|
# current VM, so should be skipped.
|
sky/provision/gcp/instance.py
CHANGED
|
@@ -62,9 +62,10 @@ def query_instances(
|
|
|
62
62
|
cluster_name_on_cloud: str,
|
|
63
63
|
provider_config: Optional[Dict[str, Any]] = None,
|
|
64
64
|
non_terminated_only: bool = True,
|
|
65
|
+
retry_if_missing: bool = False,
|
|
65
66
|
) -> Dict[str, Tuple[Optional['status_lib.ClusterStatus'], Optional[str]]]:
|
|
66
67
|
"""See sky/provision/__init__.py"""
|
|
67
|
-
del cluster_name # unused
|
|
68
|
+
del cluster_name, retry_if_missing # unused
|
|
68
69
|
assert provider_config is not None, (cluster_name_on_cloud, provider_config)
|
|
69
70
|
zone = provider_config['availability_zone']
|
|
70
71
|
project_id = provider_config['project_id']
|
|
@@ -360,9 +361,10 @@ def _run_instances(region: str, cluster_name_on_cloud: str,
|
|
|
360
361
|
created_instance_ids=created_instance_ids)
|
|
361
362
|
|
|
362
363
|
|
|
363
|
-
def run_instances(region: str, cluster_name_on_cloud: str,
|
|
364
|
+
def run_instances(region: str, cluster_name: str, cluster_name_on_cloud: str,
|
|
364
365
|
config: common.ProvisionConfig) -> common.ProvisionRecord:
|
|
365
366
|
"""See sky/provision/__init__.py"""
|
|
367
|
+
del cluster_name # unused
|
|
366
368
|
try:
|
|
367
369
|
return _run_instances(region, cluster_name_on_cloud, config)
|
|
368
370
|
except gcp.http_error_exception() as e:
|
|
@@ -64,8 +64,9 @@ def _get_head_instance_id(instances: Dict[str, Any]) -> Optional[str]:
|
|
|
64
64
|
return next(iter(instances.keys()))
|
|
65
65
|
|
|
66
66
|
|
|
67
|
-
def run_instances(region: str, cluster_name_on_cloud: str,
|
|
67
|
+
def run_instances(region: str, cluster_name: str, cluster_name_on_cloud: str,
|
|
68
68
|
config: common.ProvisionConfig) -> common.ProvisionRecord:
|
|
69
|
+
del cluster_name # unused
|
|
69
70
|
logger.info(f'Starting run_instances with region={region}, '
|
|
70
71
|
f'cluster={cluster_name_on_cloud}')
|
|
71
72
|
logger.debug(f'Config: {config}')
|
|
@@ -308,9 +309,10 @@ def query_instances(
|
|
|
308
309
|
cluster_name_on_cloud: str,
|
|
309
310
|
provider_config: Optional[dict] = None,
|
|
310
311
|
non_terminated_only: bool = True,
|
|
312
|
+
retry_if_missing: bool = False,
|
|
311
313
|
) -> Dict[str, Tuple[Optional['status_lib.ClusterStatus'], Optional[str]]]:
|
|
312
314
|
"""Returns the status of the specified instances for Hyperbolic."""
|
|
313
|
-
del cluster_name, provider_config # unused
|
|
315
|
+
del cluster_name, provider_config, retry_if_missing # unused
|
|
314
316
|
# Fetch all instances for this cluster
|
|
315
317
|
instances = utils.list_instances(
|
|
316
318
|
metadata={'skypilot': {
|
sky/provision/instance_setup.py
CHANGED
|
@@ -10,6 +10,7 @@ from typing import Any, Callable, Dict, List, Optional, Tuple
|
|
|
10
10
|
from sky import exceptions
|
|
11
11
|
from sky import logs
|
|
12
12
|
from sky import provision
|
|
13
|
+
from sky import resources as resources_lib
|
|
13
14
|
from sky import sky_logging
|
|
14
15
|
from sky.provision import common
|
|
15
16
|
from sky.provision import docker_utils
|
|
@@ -38,11 +39,13 @@ _RAY_PRLIMIT = (
|
|
|
38
39
|
'which prlimit && for id in $(pgrep -f raylet/raylet); '
|
|
39
40
|
'do sudo prlimit --nofile=1048576:1048576 --pid=$id || true; done;')
|
|
40
41
|
|
|
41
|
-
|
|
42
|
-
|
|
43
|
-
|
|
44
|
-
|
|
45
|
-
|
|
42
|
+
DUMP_RAY_PORTS = (f'{constants.SKY_PYTHON_CMD} -c \'import json, os; '
|
|
43
|
+
f'runtime_dir = os.path.expanduser(os.environ.get('
|
|
44
|
+
f'"{constants.SKY_RUNTIME_DIR_ENV_VAR_KEY}", "~")); '
|
|
45
|
+
f'json.dump({constants.SKY_REMOTE_RAY_PORT_DICT_STR}, '
|
|
46
|
+
f'open(os.path.join(runtime_dir, '
|
|
47
|
+
f'"{constants.SKY_REMOTE_RAY_PORT_FILE}"), "w", '
|
|
48
|
+
'encoding="utf-8"))\';')
|
|
46
49
|
|
|
47
50
|
_RAY_PORT_COMMAND = (
|
|
48
51
|
f'RAY_PORT=$({constants.SKY_PYTHON_CMD} -c '
|
|
@@ -84,7 +87,7 @@ def _set_usage_run_id_cmd() -> str:
|
|
|
84
87
|
latest one when the function is called.
|
|
85
88
|
"""
|
|
86
89
|
return (
|
|
87
|
-
f'cat {usage_constants.USAGE_RUN_ID_FILE} || '
|
|
90
|
+
f'cat {usage_constants.USAGE_RUN_ID_FILE} 2> /dev/null || '
|
|
88
91
|
# The run id is retrieved locally for the current run, so that the
|
|
89
92
|
# remote cluster will be set with the same run id as the initial
|
|
90
93
|
# launch operation.
|
|
@@ -92,12 +95,6 @@ def _set_usage_run_id_cmd() -> str:
|
|
|
92
95
|
f'{usage_constants.USAGE_RUN_ID_FILE}')
|
|
93
96
|
|
|
94
97
|
|
|
95
|
-
def _set_skypilot_env_var_cmd() -> str:
|
|
96
|
-
"""Sets the skypilot environment variables on the remote machine."""
|
|
97
|
-
env_vars = env_options.Options.all_options()
|
|
98
|
-
return '; '.join([f'export {k}={v}' for k, v in env_vars.items()])
|
|
99
|
-
|
|
100
|
-
|
|
101
98
|
def _auto_retry(should_retry: Callable[[Exception], bool] = lambda _: True):
|
|
102
99
|
"""Decorator that retries the function if it fails.
|
|
103
100
|
|
|
@@ -136,6 +133,20 @@ def _hint_worker_log_path(cluster_name: str, cluster_info: common.ClusterInfo,
|
|
|
136
133
|
logger.info(f'Logs of worker nodes can be found at: {worker_log_path}')
|
|
137
134
|
|
|
138
135
|
|
|
136
|
+
class SSHThreadPoolExecutor(futures.ThreadPoolExecutor):
|
|
137
|
+
"""ThreadPoolExecutor that kills children processes on exit."""
|
|
138
|
+
|
|
139
|
+
def __exit__(self, exc_type, exc_val, exc_tb):
|
|
140
|
+
# ssh command runner eventually calls
|
|
141
|
+
# log_lib.run_with_log, which will spawn
|
|
142
|
+
# subprocesses. If we are exiting the context
|
|
143
|
+
# we need to kill the children processes
|
|
144
|
+
# to avoid leakage.
|
|
145
|
+
subprocess_utils.kill_children_processes()
|
|
146
|
+
self.shutdown()
|
|
147
|
+
return False
|
|
148
|
+
|
|
149
|
+
|
|
139
150
|
def _parallel_ssh_with_cache(func,
|
|
140
151
|
cluster_name: str,
|
|
141
152
|
stage_name: str,
|
|
@@ -148,7 +159,7 @@ def _parallel_ssh_with_cache(func,
|
|
|
148
159
|
# as 32 is too large for some machines.
|
|
149
160
|
max_workers = subprocess_utils.get_parallel_threads(
|
|
150
161
|
cluster_info.provider_name)
|
|
151
|
-
with
|
|
162
|
+
with SSHThreadPoolExecutor(max_workers=max_workers) as pool:
|
|
152
163
|
results = []
|
|
153
164
|
runners = provision.get_command_runners(cluster_info.provider_name,
|
|
154
165
|
cluster_info, **ssh_credentials)
|
|
@@ -317,7 +328,7 @@ def ray_head_start_command(custom_resource: Optional[str],
|
|
|
317
328
|
# the warning when the worker count is >12x CPUs.
|
|
318
329
|
'RAY_worker_maximum_startup_concurrency=$(( 3 * $(nproc --all) )) '
|
|
319
330
|
f'{constants.SKY_RAY_CMD} start --head {ray_options} || exit 1;' +
|
|
320
|
-
_RAY_PRLIMIT +
|
|
331
|
+
_RAY_PRLIMIT + DUMP_RAY_PORTS + RAY_HEAD_WAIT_INITIALIZED_COMMAND)
|
|
321
332
|
return cmd
|
|
322
333
|
|
|
323
334
|
|
|
@@ -425,8 +436,16 @@ def start_ray_on_worker_nodes(cluster_name: str, no_restart: bool,
|
|
|
425
436
|
# use the external IP of the head node.
|
|
426
437
|
use_external_ip = cluster_info.custom_ray_options.pop(
|
|
427
438
|
'use_external_ip', False)
|
|
428
|
-
|
|
429
|
-
|
|
439
|
+
|
|
440
|
+
if use_external_ip:
|
|
441
|
+
head_ip = head_instance.external_ip
|
|
442
|
+
else:
|
|
443
|
+
# For Kubernetes, use the internal service address of the head node.
|
|
444
|
+
# Keep this consistent with the logic in kubernetes-ray.yml.j2
|
|
445
|
+
if head_instance.internal_svc:
|
|
446
|
+
head_ip = head_instance.internal_svc
|
|
447
|
+
else:
|
|
448
|
+
head_ip = head_instance.internal_ip
|
|
430
449
|
|
|
431
450
|
ray_cmd = ray_worker_start_command(custom_resource,
|
|
432
451
|
cluster_info.custom_ray_options,
|
|
@@ -468,11 +487,38 @@ def start_ray_on_worker_nodes(cluster_name: str, no_restart: bool,
|
|
|
468
487
|
@common.log_function_start_end
|
|
469
488
|
@_auto_retry()
|
|
470
489
|
@timeline.event
|
|
471
|
-
def start_skylet_on_head_node(
|
|
472
|
-
|
|
473
|
-
|
|
490
|
+
def start_skylet_on_head_node(
|
|
491
|
+
cluster_name: resources_utils.ClusterName,
|
|
492
|
+
cluster_info: common.ClusterInfo, ssh_credentials: Dict[str, Any],
|
|
493
|
+
launched_resources: resources_lib.Resources) -> None:
|
|
474
494
|
"""Start skylet on the head node."""
|
|
475
|
-
|
|
495
|
+
# Avoid circular import.
|
|
496
|
+
# pylint: disable=import-outside-toplevel
|
|
497
|
+
from sky.utils import controller_utils
|
|
498
|
+
|
|
499
|
+
def _set_skypilot_env_var_cmd() -> str:
|
|
500
|
+
"""Sets the skypilot environment variables on the remote machine."""
|
|
501
|
+
env_vars = {
|
|
502
|
+
k: str(v) for (k, v) in env_options.Options.all_options().items()
|
|
503
|
+
}
|
|
504
|
+
is_controller = controller_utils.Controllers.from_name(
|
|
505
|
+
cluster_name.display_name) is not None
|
|
506
|
+
is_kubernetes = cluster_info.provider_name == 'kubernetes'
|
|
507
|
+
if is_controller and is_kubernetes:
|
|
508
|
+
# For jobs/serve controller, we pass in the CPU and memory limits
|
|
509
|
+
# when starting the skylet to handle cases where these env vars
|
|
510
|
+
# are not set on the cluster's pod spec. The skylet will read
|
|
511
|
+
# these env vars when starting (ManagedJobEvent.start()) and write
|
|
512
|
+
# it to disk.
|
|
513
|
+
resources = launched_resources.assert_launchable()
|
|
514
|
+
vcpus, mem = resources.cloud.get_vcpus_mem_from_instance_type(
|
|
515
|
+
resources.instance_type)
|
|
516
|
+
if vcpus is not None:
|
|
517
|
+
env_vars['SKYPILOT_POD_CPU_CORE_LIMIT'] = str(vcpus)
|
|
518
|
+
if mem is not None:
|
|
519
|
+
env_vars['SKYPILOT_POD_MEMORY_GB_LIMIT'] = str(mem)
|
|
520
|
+
return '; '.join([f'export {k}={v}' for k, v in env_vars.items()])
|
|
521
|
+
|
|
476
522
|
runners = provision.get_command_runners(cluster_info.provider_name,
|
|
477
523
|
cluster_info, **ssh_credentials)
|
|
478
524
|
head_runner = runners[0]
|
|
@@ -13,4 +13,6 @@ from sky.provision.kubernetes.network import open_ports
|
|
|
13
13
|
from sky.provision.kubernetes.network import query_ports
|
|
14
14
|
from sky.provision.kubernetes.volume import apply_volume
|
|
15
15
|
from sky.provision.kubernetes.volume import delete_volume
|
|
16
|
+
from sky.provision.kubernetes.volume import get_all_volumes_usedby
|
|
16
17
|
from sky.provision.kubernetes.volume import get_volume_usedby
|
|
18
|
+
from sky.provision.kubernetes.volume import map_all_volumes_usedby
|