skypilot-nightly 1.0.0.dev20250905__py3-none-any.whl → 1.0.0.dev20251210__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sky/__init__.py +12 -2
- sky/adaptors/aws.py +27 -22
- sky/adaptors/common.py +25 -2
- sky/adaptors/coreweave.py +278 -0
- sky/adaptors/do.py +8 -2
- sky/adaptors/gcp.py +11 -0
- sky/adaptors/ibm.py +5 -2
- sky/adaptors/kubernetes.py +64 -0
- sky/adaptors/nebius.py +3 -1
- sky/adaptors/primeintellect.py +1 -0
- sky/adaptors/seeweb.py +183 -0
- sky/adaptors/shadeform.py +89 -0
- sky/adaptors/slurm.py +478 -0
- sky/admin_policy.py +20 -0
- sky/authentication.py +157 -263
- sky/backends/__init__.py +3 -2
- sky/backends/backend.py +11 -3
- sky/backends/backend_utils.py +630 -185
- sky/backends/cloud_vm_ray_backend.py +1111 -928
- sky/backends/local_docker_backend.py +9 -5
- sky/backends/task_codegen.py +971 -0
- sky/backends/wheel_utils.py +18 -0
- sky/catalog/__init__.py +8 -3
- sky/catalog/aws_catalog.py +4 -0
- sky/catalog/common.py +19 -1
- sky/catalog/data_fetchers/fetch_aws.py +102 -80
- sky/catalog/data_fetchers/fetch_gcp.py +30 -3
- sky/catalog/data_fetchers/fetch_nebius.py +9 -6
- sky/catalog/data_fetchers/fetch_runpod.py +698 -0
- sky/catalog/data_fetchers/fetch_seeweb.py +329 -0
- sky/catalog/data_fetchers/fetch_shadeform.py +142 -0
- sky/catalog/kubernetes_catalog.py +36 -32
- sky/catalog/primeintellect_catalog.py +95 -0
- sky/catalog/runpod_catalog.py +5 -1
- sky/catalog/seeweb_catalog.py +184 -0
- sky/catalog/shadeform_catalog.py +165 -0
- sky/catalog/slurm_catalog.py +243 -0
- sky/check.py +87 -46
- sky/client/cli/command.py +1004 -434
- sky/client/cli/flags.py +4 -2
- sky/{volumes/utils.py → client/cli/table_utils.py} +111 -13
- sky/client/cli/utils.py +79 -0
- sky/client/common.py +12 -2
- sky/client/sdk.py +188 -65
- sky/client/sdk_async.py +34 -33
- sky/cloud_stores.py +82 -3
- sky/clouds/__init__.py +8 -0
- sky/clouds/aws.py +337 -129
- sky/clouds/azure.py +24 -18
- sky/clouds/cloud.py +47 -13
- sky/clouds/cudo.py +16 -13
- sky/clouds/do.py +9 -7
- sky/clouds/fluidstack.py +12 -5
- sky/clouds/gcp.py +14 -7
- sky/clouds/hyperbolic.py +12 -5
- sky/clouds/ibm.py +12 -5
- sky/clouds/kubernetes.py +80 -45
- sky/clouds/lambda_cloud.py +12 -5
- sky/clouds/nebius.py +23 -9
- sky/clouds/oci.py +19 -12
- sky/clouds/paperspace.py +4 -1
- sky/clouds/primeintellect.py +317 -0
- sky/clouds/runpod.py +85 -24
- sky/clouds/scp.py +12 -8
- sky/clouds/seeweb.py +477 -0
- sky/clouds/shadeform.py +400 -0
- sky/clouds/slurm.py +578 -0
- sky/clouds/ssh.py +6 -3
- sky/clouds/utils/scp_utils.py +61 -50
- sky/clouds/vast.py +43 -27
- sky/clouds/vsphere.py +14 -16
- sky/core.py +296 -195
- sky/dashboard/out/404.html +1 -1
- sky/dashboard/out/_next/static/KYAhEFa3FTfq4JyKVgo-s/_buildManifest.js +1 -0
- sky/dashboard/out/_next/static/chunks/1141-9c810f01ff4f398a.js +11 -0
- sky/dashboard/out/_next/static/chunks/1871-7e202677c42f43fe.js +6 -0
- sky/dashboard/out/_next/static/chunks/2260-7703229c33c5ebd5.js +1 -0
- sky/dashboard/out/_next/static/chunks/2369.fc20f0c2c8ed9fe7.js +15 -0
- sky/dashboard/out/_next/static/chunks/2755.edd818326d489a1d.js +26 -0
- sky/dashboard/out/_next/static/chunks/3294.ddda8c6c6f9f24dc.js +1 -0
- sky/dashboard/out/_next/static/chunks/3785.7e245f318f9d1121.js +1 -0
- sky/dashboard/out/_next/static/chunks/{6601-06114c982db410b6.js → 3800-b589397dc09c5b4e.js} +1 -1
- sky/dashboard/out/_next/static/chunks/3850-fd5696f3bbbaddae.js +1 -0
- sky/dashboard/out/_next/static/chunks/4725.172ede95d1b21022.js +1 -0
- sky/dashboard/out/_next/static/chunks/4937.a2baa2df5572a276.js +15 -0
- sky/dashboard/out/_next/static/chunks/6212-7bd06f60ba693125.js +13 -0
- sky/dashboard/out/_next/static/chunks/6856-da20c5fd999f319c.js +1 -0
- sky/dashboard/out/_next/static/chunks/6990-09cbf02d3cd518c3.js +1 -0
- sky/dashboard/out/_next/static/chunks/7359-c8d04e06886000b3.js +30 -0
- sky/dashboard/out/_next/static/chunks/7615-019513abc55b3b47.js +1 -0
- sky/dashboard/out/_next/static/chunks/8640.5b9475a2d18c5416.js +16 -0
- sky/dashboard/out/_next/static/chunks/8969-452f9d5cbdd2dc73.js +1 -0
- sky/dashboard/out/_next/static/chunks/9025.fa408f3242e9028d.js +6 -0
- sky/dashboard/out/_next/static/chunks/9353-8369df1cf105221c.js +1 -0
- sky/dashboard/out/_next/static/chunks/9360.a536cf6b1fa42355.js +31 -0
- sky/dashboard/out/_next/static/chunks/9847.3aaca6bb33455140.js +30 -0
- sky/dashboard/out/_next/static/chunks/pages/_app-68b647e26f9d2793.js +34 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-33f525539665fdfd.js +16 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-a7565f586ef86467.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters-9e5d47818b9bdadd.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/{config-dfb9bf07b13045f4.js → config-718cdc365de82689.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/infra/{[context]-6563820e094f68ca.js → [context]-12c559ec4d81fdbd.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/{infra-aabba60d57826e0f.js → infra-d187cd0413d72475.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-895847b6cf200b04.js +16 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs/pools/[pool]-8d0f4655400b4eb9.js +21 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs-e5a98f17f8513a96.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/plugins/[...slug]-4f46050ca065d8f8.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/users-2f7646eb77785a2c.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/volumes-ef19d49c6d0e8500.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/workspaces/{[name]-af76bb06dbb3954f.js → [name]-96e0f298308da7e2.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/{workspaces-7598c33a746cdc91.js → workspaces-cb4da3abe08ebf19.js} +1 -1
- sky/dashboard/out/_next/static/chunks/webpack-fba3de387ff6bb08.js +1 -0
- sky/dashboard/out/_next/static/css/c5a4cfd2600fc715.css +3 -0
- sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
- sky/dashboard/out/clusters/[cluster].html +1 -1
- sky/dashboard/out/clusters.html +1 -1
- sky/dashboard/out/config.html +1 -1
- sky/dashboard/out/index.html +1 -1
- sky/dashboard/out/infra/[context].html +1 -1
- sky/dashboard/out/infra.html +1 -1
- sky/dashboard/out/jobs/[job].html +1 -1
- sky/dashboard/out/jobs/pools/[pool].html +1 -1
- sky/dashboard/out/jobs.html +1 -1
- sky/dashboard/out/plugins/[...slug].html +1 -0
- sky/dashboard/out/users.html +1 -1
- sky/dashboard/out/volumes.html +1 -1
- sky/dashboard/out/workspace/new.html +1 -1
- sky/dashboard/out/workspaces/[name].html +1 -1
- sky/dashboard/out/workspaces.html +1 -1
- sky/data/data_utils.py +92 -1
- sky/data/mounting_utils.py +177 -30
- sky/data/storage.py +200 -19
- sky/data/storage_utils.py +10 -45
- sky/exceptions.py +18 -7
- sky/execution.py +74 -31
- sky/global_user_state.py +605 -191
- sky/jobs/__init__.py +2 -0
- sky/jobs/client/sdk.py +101 -4
- sky/jobs/client/sdk_async.py +31 -5
- sky/jobs/constants.py +15 -8
- sky/jobs/controller.py +726 -284
- sky/jobs/file_content_utils.py +128 -0
- sky/jobs/log_gc.py +193 -0
- sky/jobs/recovery_strategy.py +250 -100
- sky/jobs/scheduler.py +271 -173
- sky/jobs/server/core.py +367 -114
- sky/jobs/server/server.py +81 -35
- sky/jobs/server/utils.py +89 -35
- sky/jobs/state.py +1498 -620
- sky/jobs/utils.py +771 -306
- sky/logs/agent.py +40 -5
- sky/logs/aws.py +9 -19
- sky/metrics/utils.py +282 -39
- sky/models.py +2 -0
- sky/optimizer.py +7 -6
- sky/provision/__init__.py +38 -1
- sky/provision/aws/config.py +34 -13
- sky/provision/aws/instance.py +5 -2
- sky/provision/azure/instance.py +5 -3
- sky/provision/common.py +22 -0
- sky/provision/cudo/instance.py +4 -3
- sky/provision/do/instance.py +4 -3
- sky/provision/docker_utils.py +112 -28
- sky/provision/fluidstack/instance.py +6 -5
- sky/provision/gcp/config.py +6 -1
- sky/provision/gcp/instance.py +4 -2
- sky/provision/hyperbolic/instance.py +4 -2
- sky/provision/instance_setup.py +66 -20
- sky/provision/kubernetes/__init__.py +2 -0
- sky/provision/kubernetes/config.py +7 -44
- sky/provision/kubernetes/constants.py +0 -1
- sky/provision/kubernetes/instance.py +609 -213
- sky/provision/kubernetes/manifests/fusermount-server-daemonset.yaml +1 -2
- sky/provision/kubernetes/network.py +12 -8
- sky/provision/kubernetes/network_utils.py +8 -25
- sky/provision/kubernetes/utils.py +422 -422
- sky/provision/kubernetes/volume.py +150 -18
- sky/provision/lambda_cloud/instance.py +16 -13
- sky/provision/nebius/instance.py +6 -2
- sky/provision/nebius/utils.py +103 -86
- sky/provision/oci/instance.py +4 -2
- sky/provision/paperspace/instance.py +4 -3
- sky/provision/primeintellect/__init__.py +10 -0
- sky/provision/primeintellect/config.py +11 -0
- sky/provision/primeintellect/instance.py +454 -0
- sky/provision/primeintellect/utils.py +398 -0
- sky/provision/provisioner.py +45 -15
- sky/provision/runpod/__init__.py +2 -0
- sky/provision/runpod/instance.py +4 -3
- sky/provision/runpod/volume.py +69 -13
- sky/provision/scp/instance.py +307 -130
- sky/provision/seeweb/__init__.py +11 -0
- sky/provision/seeweb/config.py +13 -0
- sky/provision/seeweb/instance.py +812 -0
- sky/provision/shadeform/__init__.py +11 -0
- sky/provision/shadeform/config.py +12 -0
- sky/provision/shadeform/instance.py +351 -0
- sky/provision/shadeform/shadeform_utils.py +83 -0
- sky/provision/slurm/__init__.py +12 -0
- sky/provision/slurm/config.py +13 -0
- sky/provision/slurm/instance.py +572 -0
- sky/provision/slurm/utils.py +583 -0
- sky/provision/vast/instance.py +9 -4
- sky/provision/vast/utils.py +10 -6
- sky/provision/volume.py +164 -0
- sky/provision/vsphere/common/ssl_helper.py +1 -1
- sky/provision/vsphere/common/vapiconnect.py +2 -1
- sky/provision/vsphere/common/vim_utils.py +3 -2
- sky/provision/vsphere/instance.py +8 -6
- sky/provision/vsphere/vsphere_utils.py +8 -1
- sky/resources.py +11 -3
- sky/schemas/api/responses.py +107 -6
- sky/schemas/db/global_user_state/008_skylet_ssh_tunnel_metadata.py +34 -0
- sky/schemas/db/global_user_state/009_last_activity_and_launched_at.py +89 -0
- sky/schemas/db/global_user_state/010_save_ssh_key.py +66 -0
- sky/schemas/db/global_user_state/011_is_ephemeral.py +34 -0
- sky/schemas/db/kv_cache/001_initial_schema.py +29 -0
- sky/schemas/db/serve_state/002_yaml_content.py +34 -0
- sky/schemas/db/skypilot_config/001_initial_schema.py +30 -0
- sky/schemas/db/spot_jobs/002_cluster_pool.py +3 -3
- sky/schemas/db/spot_jobs/004_job_file_contents.py +42 -0
- sky/schemas/db/spot_jobs/005_logs_gc.py +38 -0
- sky/schemas/db/spot_jobs/006_controller_pid_started_at.py +34 -0
- sky/schemas/db/spot_jobs/007_config_file_content.py +34 -0
- sky/schemas/generated/jobsv1_pb2.py +86 -0
- sky/schemas/generated/jobsv1_pb2.pyi +254 -0
- sky/schemas/generated/jobsv1_pb2_grpc.py +542 -0
- sky/schemas/generated/managed_jobsv1_pb2.py +76 -0
- sky/schemas/generated/managed_jobsv1_pb2.pyi +278 -0
- sky/schemas/generated/managed_jobsv1_pb2_grpc.py +278 -0
- sky/schemas/generated/servev1_pb2.py +58 -0
- sky/schemas/generated/servev1_pb2.pyi +115 -0
- sky/schemas/generated/servev1_pb2_grpc.py +322 -0
- sky/serve/autoscalers.py +2 -0
- sky/serve/client/impl.py +55 -21
- sky/serve/constants.py +4 -3
- sky/serve/controller.py +17 -11
- sky/serve/load_balancing_policies.py +1 -1
- sky/serve/replica_managers.py +219 -142
- sky/serve/serve_rpc_utils.py +179 -0
- sky/serve/serve_state.py +63 -54
- sky/serve/serve_utils.py +145 -109
- sky/serve/server/core.py +46 -25
- sky/serve/server/impl.py +311 -162
- sky/serve/server/server.py +21 -19
- sky/serve/service.py +84 -68
- sky/serve/service_spec.py +45 -7
- sky/server/auth/loopback.py +38 -0
- sky/server/auth/oauth2_proxy.py +12 -7
- sky/server/common.py +47 -24
- sky/server/config.py +62 -28
- sky/server/constants.py +9 -1
- sky/server/daemons.py +109 -38
- sky/server/metrics.py +76 -96
- sky/server/middleware_utils.py +166 -0
- sky/server/plugins.py +222 -0
- sky/server/requests/executor.py +384 -145
- sky/server/requests/payloads.py +83 -19
- sky/server/requests/preconditions.py +15 -13
- sky/server/requests/request_names.py +123 -0
- sky/server/requests/requests.py +511 -157
- sky/server/requests/serializers/decoders.py +48 -17
- sky/server/requests/serializers/encoders.py +102 -20
- sky/server/requests/serializers/return_value_serializers.py +60 -0
- sky/server/requests/threads.py +117 -0
- sky/server/rest.py +116 -24
- sky/server/server.py +497 -179
- sky/server/server_utils.py +30 -0
- sky/server/stream_utils.py +219 -45
- sky/server/uvicorn.py +30 -19
- sky/setup_files/MANIFEST.in +6 -1
- sky/setup_files/alembic.ini +8 -0
- sky/setup_files/dependencies.py +64 -19
- sky/setup_files/setup.py +44 -44
- sky/sky_logging.py +13 -5
- sky/skylet/attempt_skylet.py +116 -24
- sky/skylet/configs.py +3 -1
- sky/skylet/constants.py +139 -29
- sky/skylet/events.py +74 -14
- sky/skylet/executor/__init__.py +1 -0
- sky/skylet/executor/slurm.py +189 -0
- sky/skylet/job_lib.py +143 -105
- sky/skylet/log_lib.py +252 -8
- sky/skylet/log_lib.pyi +47 -7
- sky/skylet/providers/ibm/node_provider.py +12 -8
- sky/skylet/providers/ibm/vpc_provider.py +13 -12
- sky/skylet/runtime_utils.py +21 -0
- sky/skylet/services.py +524 -0
- sky/skylet/skylet.py +27 -2
- sky/skylet/subprocess_daemon.py +104 -28
- sky/skypilot_config.py +99 -79
- sky/ssh_node_pools/constants.py +12 -0
- sky/ssh_node_pools/core.py +40 -3
- sky/ssh_node_pools/deploy/__init__.py +4 -0
- sky/ssh_node_pools/deploy/deploy.py +952 -0
- sky/ssh_node_pools/deploy/tunnel_utils.py +199 -0
- sky/ssh_node_pools/deploy/utils.py +173 -0
- sky/ssh_node_pools/server.py +20 -21
- sky/{utils/kubernetes/ssh_utils.py → ssh_node_pools/utils.py} +9 -6
- sky/task.py +221 -104
- sky/templates/aws-ray.yml.j2 +1 -0
- sky/templates/azure-ray.yml.j2 +1 -0
- sky/templates/cudo-ray.yml.j2 +1 -0
- sky/templates/do-ray.yml.j2 +1 -0
- sky/templates/fluidstack-ray.yml.j2 +1 -0
- sky/templates/gcp-ray.yml.j2 +1 -0
- sky/templates/hyperbolic-ray.yml.j2 +1 -0
- sky/templates/ibm-ray.yml.j2 +2 -1
- sky/templates/jobs-controller.yaml.j2 +3 -0
- sky/templates/kubernetes-ray.yml.j2 +204 -55
- sky/templates/lambda-ray.yml.j2 +1 -0
- sky/templates/nebius-ray.yml.j2 +3 -0
- sky/templates/oci-ray.yml.j2 +1 -0
- sky/templates/paperspace-ray.yml.j2 +1 -0
- sky/templates/primeintellect-ray.yml.j2 +72 -0
- sky/templates/runpod-ray.yml.j2 +1 -0
- sky/templates/scp-ray.yml.j2 +1 -0
- sky/templates/seeweb-ray.yml.j2 +171 -0
- sky/templates/shadeform-ray.yml.j2 +73 -0
- sky/templates/slurm-ray.yml.j2 +85 -0
- sky/templates/vast-ray.yml.j2 +2 -0
- sky/templates/vsphere-ray.yml.j2 +1 -0
- sky/templates/websocket_proxy.py +188 -43
- sky/usage/usage_lib.py +16 -4
- sky/users/model.conf +1 -1
- sky/users/permission.py +84 -44
- sky/users/rbac.py +31 -3
- sky/utils/accelerator_registry.py +6 -3
- sky/utils/admin_policy_utils.py +18 -5
- sky/utils/annotations.py +128 -6
- sky/utils/asyncio_utils.py +78 -0
- sky/utils/atomic.py +1 -1
- sky/utils/auth_utils.py +153 -0
- sky/utils/cli_utils/status_utils.py +12 -7
- sky/utils/cluster_utils.py +28 -6
- sky/utils/command_runner.py +283 -30
- sky/utils/command_runner.pyi +63 -7
- sky/utils/common.py +3 -1
- sky/utils/common_utils.py +55 -7
- sky/utils/config_utils.py +1 -14
- sky/utils/context.py +127 -40
- sky/utils/context_utils.py +73 -18
- sky/utils/controller_utils.py +229 -70
- sky/utils/db/db_utils.py +95 -18
- sky/utils/db/kv_cache.py +149 -0
- sky/utils/db/migration_utils.py +24 -7
- sky/utils/env_options.py +4 -0
- sky/utils/git.py +559 -1
- sky/utils/kubernetes/create_cluster.sh +15 -30
- sky/utils/kubernetes/delete_cluster.sh +10 -7
- sky/utils/kubernetes/generate_kind_config.py +6 -66
- sky/utils/kubernetes/gpu_labeler.py +13 -3
- sky/utils/kubernetes/k8s_gpu_labeler_job.yaml +2 -1
- sky/utils/kubernetes/k8s_gpu_labeler_setup.yaml +16 -16
- sky/utils/kubernetes/kubernetes_deploy_utils.py +187 -260
- sky/utils/kubernetes/rsync_helper.sh +11 -3
- sky/utils/kubernetes/ssh-tunnel.sh +7 -376
- sky/utils/kubernetes_enums.py +7 -15
- sky/utils/lock_events.py +4 -4
- sky/utils/locks.py +128 -31
- sky/utils/log_utils.py +0 -319
- sky/utils/resource_checker.py +13 -10
- sky/utils/resources_utils.py +53 -29
- sky/utils/rich_utils.py +8 -4
- sky/utils/schemas.py +138 -52
- sky/utils/subprocess_utils.py +17 -4
- sky/utils/thread_utils.py +91 -0
- sky/utils/timeline.py +2 -1
- sky/utils/ux_utils.py +35 -1
- sky/utils/volume.py +88 -4
- sky/utils/yaml_utils.py +9 -0
- sky/volumes/client/sdk.py +48 -10
- sky/volumes/server/core.py +59 -22
- sky/volumes/server/server.py +46 -17
- sky/volumes/volume.py +54 -42
- sky/workspaces/core.py +57 -21
- sky/workspaces/server.py +13 -12
- sky_templates/README.md +3 -0
- sky_templates/__init__.py +3 -0
- sky_templates/ray/__init__.py +0 -0
- sky_templates/ray/start_cluster +183 -0
- sky_templates/ray/stop_cluster +75 -0
- {skypilot_nightly-1.0.0.dev20250905.dist-info → skypilot_nightly-1.0.0.dev20251210.dist-info}/METADATA +343 -65
- skypilot_nightly-1.0.0.dev20251210.dist-info/RECORD +629 -0
- skypilot_nightly-1.0.0.dev20251210.dist-info/top_level.txt +2 -0
- sky/client/cli/git.py +0 -549
- sky/dashboard/out/_next/static/chunks/1121-408ed10b2f9fce17.js +0 -1
- sky/dashboard/out/_next/static/chunks/1141-943efc7aff0f0c06.js +0 -1
- sky/dashboard/out/_next/static/chunks/1836-37fede578e2da5f8.js +0 -40
- sky/dashboard/out/_next/static/chunks/3015-86cabed5d4669ad0.js +0 -1
- sky/dashboard/out/_next/static/chunks/3294.c80326aec9bfed40.js +0 -6
- sky/dashboard/out/_next/static/chunks/3785.4872a2f3aa489880.js +0 -1
- sky/dashboard/out/_next/static/chunks/3850-ff4a9a69d978632b.js +0 -1
- sky/dashboard/out/_next/static/chunks/4045.b30465273dc5e468.js +0 -21
- sky/dashboard/out/_next/static/chunks/4676-9da7fdbde90b5549.js +0 -10
- sky/dashboard/out/_next/static/chunks/4725.10f7a9a5d3ea8208.js +0 -1
- sky/dashboard/out/_next/static/chunks/5339.3fda4a4010ff4e06.js +0 -51
- sky/dashboard/out/_next/static/chunks/6135-4b4d5e824b7f9d3c.js +0 -1
- sky/dashboard/out/_next/static/chunks/649.b9d7f7d10c1b8c53.js +0 -45
- sky/dashboard/out/_next/static/chunks/6856-dca7962af4814e1b.js +0 -1
- sky/dashboard/out/_next/static/chunks/6990-08b2a1cae076a943.js +0 -1
- sky/dashboard/out/_next/static/chunks/7325.b4bc99ce0892dcd5.js +0 -6
- sky/dashboard/out/_next/static/chunks/754-d0da8ab45f9509e9.js +0 -18
- sky/dashboard/out/_next/static/chunks/7669.1f5d9a402bf5cc42.js +0 -36
- sky/dashboard/out/_next/static/chunks/8969-0be3036bf86f8256.js +0 -1
- sky/dashboard/out/_next/static/chunks/9025.c12318fb6a1a9093.js +0 -6
- sky/dashboard/out/_next/static/chunks/9037-fa1737818d0a0969.js +0 -6
- sky/dashboard/out/_next/static/chunks/pages/_app-ce361c6959bc2001.js +0 -34
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-1cbba24bd1bd35f8.js +0 -16
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-0b4b35dc1dfe046c.js +0 -16
- sky/dashboard/out/_next/static/chunks/pages/clusters-469814d711d63b1b.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-dd64309c3fe67ed2.js +0 -11
- sky/dashboard/out/_next/static/chunks/pages/jobs/pools/[pool]-07349868f7905d37.js +0 -16
- sky/dashboard/out/_next/static/chunks/pages/jobs-1f70d9faa564804f.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/users-018bf31cda52e11b.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/volumes-739726d6b823f532.js +0 -1
- sky/dashboard/out/_next/static/chunks/webpack-4fe903277b57b523.js +0 -1
- sky/dashboard/out/_next/static/css/4614e06482d7309e.css +0 -3
- sky/dashboard/out/_next/static/mS-4qZPSkRuA1u-g2wQhg/_buildManifest.js +0 -1
- sky/templates/kubernetes-ssh-jump.yml.j2 +0 -94
- sky/utils/kubernetes/cleanup-tunnel.sh +0 -62
- sky/utils/kubernetes/deploy_remote_cluster.py +0 -1299
- sky/utils/kubernetes/ssh_jump_lifecycle_manager.py +0 -191
- skypilot_nightly-1.0.0.dev20250905.dist-info/RECORD +0 -547
- skypilot_nightly-1.0.0.dev20250905.dist-info/top_level.txt +0 -1
- /sky/dashboard/out/_next/static/{mS-4qZPSkRuA1u-g2wQhg → KYAhEFa3FTfq4JyKVgo-s}/_ssgManifest.js +0 -0
- {skypilot_nightly-1.0.0.dev20250905.dist-info → skypilot_nightly-1.0.0.dev20251210.dist-info}/WHEEL +0 -0
- {skypilot_nightly-1.0.0.dev20250905.dist-info → skypilot_nightly-1.0.0.dev20251210.dist-info}/entry_points.txt +0 -0
- {skypilot_nightly-1.0.0.dev20250905.dist-info → skypilot_nightly-1.0.0.dev20251210.dist-info}/licenses/LICENSE +0 -0
sky/provision/instance_setup.py
CHANGED
|
@@ -10,6 +10,7 @@ from typing import Any, Callable, Dict, List, Optional, Tuple
|
|
|
10
10
|
from sky import exceptions
|
|
11
11
|
from sky import logs
|
|
12
12
|
from sky import provision
|
|
13
|
+
from sky import resources as resources_lib
|
|
13
14
|
from sky import sky_logging
|
|
14
15
|
from sky.provision import common
|
|
15
16
|
from sky.provision import docker_utils
|
|
@@ -38,11 +39,13 @@ _RAY_PRLIMIT = (
|
|
|
38
39
|
'which prlimit && for id in $(pgrep -f raylet/raylet); '
|
|
39
40
|
'do sudo prlimit --nofile=1048576:1048576 --pid=$id || true; done;')
|
|
40
41
|
|
|
41
|
-
|
|
42
|
-
|
|
43
|
-
|
|
44
|
-
|
|
45
|
-
|
|
42
|
+
DUMP_RAY_PORTS = (f'{constants.SKY_PYTHON_CMD} -c \'import json, os; '
|
|
43
|
+
f'runtime_dir = os.path.expanduser(os.environ.get('
|
|
44
|
+
f'"{constants.SKY_RUNTIME_DIR_ENV_VAR_KEY}", "~")); '
|
|
45
|
+
f'json.dump({constants.SKY_REMOTE_RAY_PORT_DICT_STR}, '
|
|
46
|
+
f'open(os.path.join(runtime_dir, '
|
|
47
|
+
f'"{constants.SKY_REMOTE_RAY_PORT_FILE}"), "w", '
|
|
48
|
+
'encoding="utf-8"))\';')
|
|
46
49
|
|
|
47
50
|
_RAY_PORT_COMMAND = (
|
|
48
51
|
f'RAY_PORT=$({constants.SKY_PYTHON_CMD} -c '
|
|
@@ -84,7 +87,7 @@ def _set_usage_run_id_cmd() -> str:
|
|
|
84
87
|
latest one when the function is called.
|
|
85
88
|
"""
|
|
86
89
|
return (
|
|
87
|
-
f'cat {usage_constants.USAGE_RUN_ID_FILE} || '
|
|
90
|
+
f'cat {usage_constants.USAGE_RUN_ID_FILE} 2> /dev/null || '
|
|
88
91
|
# The run id is retrieved locally for the current run, so that the
|
|
89
92
|
# remote cluster will be set with the same run id as the initial
|
|
90
93
|
# launch operation.
|
|
@@ -92,12 +95,6 @@ def _set_usage_run_id_cmd() -> str:
|
|
|
92
95
|
f'{usage_constants.USAGE_RUN_ID_FILE}')
|
|
93
96
|
|
|
94
97
|
|
|
95
|
-
def _set_skypilot_env_var_cmd() -> str:
|
|
96
|
-
"""Sets the skypilot environment variables on the remote machine."""
|
|
97
|
-
env_vars = env_options.Options.all_options()
|
|
98
|
-
return '; '.join([f'export {k}={v}' for k, v in env_vars.items()])
|
|
99
|
-
|
|
100
|
-
|
|
101
98
|
def _auto_retry(should_retry: Callable[[Exception], bool] = lambda _: True):
|
|
102
99
|
"""Decorator that retries the function if it fails.
|
|
103
100
|
|
|
@@ -136,6 +133,20 @@ def _hint_worker_log_path(cluster_name: str, cluster_info: common.ClusterInfo,
|
|
|
136
133
|
logger.info(f'Logs of worker nodes can be found at: {worker_log_path}')
|
|
137
134
|
|
|
138
135
|
|
|
136
|
+
class SSHThreadPoolExecutor(futures.ThreadPoolExecutor):
|
|
137
|
+
"""ThreadPoolExecutor that kills children processes on exit."""
|
|
138
|
+
|
|
139
|
+
def __exit__(self, exc_type, exc_val, exc_tb):
|
|
140
|
+
# ssh command runner eventually calls
|
|
141
|
+
# log_lib.run_with_log, which will spawn
|
|
142
|
+
# subprocesses. If we are exiting the context
|
|
143
|
+
# we need to kill the children processes
|
|
144
|
+
# to avoid leakage.
|
|
145
|
+
subprocess_utils.kill_children_processes()
|
|
146
|
+
self.shutdown()
|
|
147
|
+
return False
|
|
148
|
+
|
|
149
|
+
|
|
139
150
|
def _parallel_ssh_with_cache(func,
|
|
140
151
|
cluster_name: str,
|
|
141
152
|
stage_name: str,
|
|
@@ -148,7 +159,7 @@ def _parallel_ssh_with_cache(func,
|
|
|
148
159
|
# as 32 is too large for some machines.
|
|
149
160
|
max_workers = subprocess_utils.get_parallel_threads(
|
|
150
161
|
cluster_info.provider_name)
|
|
151
|
-
with futures.ThreadPoolExecutor(max_workers=max_workers) as pool:
|
|
162
|
+
with SSHThreadPoolExecutor(max_workers=max_workers) as pool:
|
|
152
163
|
results = []
|
|
153
164
|
runners = provision.get_command_runners(cluster_info.provider_name,
|
|
154
165
|
cluster_info, **ssh_credentials)
|
|
@@ -317,7 +328,7 @@ def ray_head_start_command(custom_resource: Optional[str],
|
|
|
317
328
|
# the warning when the worker count is >12x CPUs.
|
|
318
329
|
'RAY_worker_maximum_startup_concurrency=$(( 3 * $(nproc --all) )) '
|
|
319
330
|
f'{constants.SKY_RAY_CMD} start --head {ray_options} || exit 1;' +
|
|
320
|
-
_RAY_PRLIMIT +
|
|
331
|
+
_RAY_PRLIMIT + DUMP_RAY_PORTS + RAY_HEAD_WAIT_INITIALIZED_COMMAND)
|
|
321
332
|
return cmd
|
|
322
333
|
|
|
323
334
|
|
|
@@ -425,8 +436,16 @@ def start_ray_on_worker_nodes(cluster_name: str, no_restart: bool,
|
|
|
425
436
|
# use the external IP of the head node.
|
|
426
437
|
use_external_ip = cluster_info.custom_ray_options.pop(
|
|
427
438
|
'use_external_ip', False)
|
|
428
|
-
|
|
429
|
-
|
|
439
|
+
|
|
440
|
+
if use_external_ip:
|
|
441
|
+
head_ip = head_instance.external_ip
|
|
442
|
+
else:
|
|
443
|
+
# For Kubernetes, use the internal service address of the head node.
|
|
444
|
+
# Keep this consistent with the logic in kubernetes-ray.yml.j2
|
|
445
|
+
if head_instance.internal_svc:
|
|
446
|
+
head_ip = head_instance.internal_svc
|
|
447
|
+
else:
|
|
448
|
+
head_ip = head_instance.internal_ip
|
|
430
449
|
|
|
431
450
|
ray_cmd = ray_worker_start_command(custom_resource,
|
|
432
451
|
cluster_info.custom_ray_options,
|
|
@@ -468,11 +487,38 @@ def start_ray_on_worker_nodes(cluster_name: str, no_restart: bool,
|
|
|
468
487
|
@common.log_function_start_end
|
|
469
488
|
@_auto_retry()
|
|
470
489
|
@timeline.event
|
|
471
|
-
def start_skylet_on_head_node(
|
|
472
|
-
|
|
473
|
-
|
|
490
|
+
def start_skylet_on_head_node(
|
|
491
|
+
cluster_name: resources_utils.ClusterName,
|
|
492
|
+
cluster_info: common.ClusterInfo, ssh_credentials: Dict[str, Any],
|
|
493
|
+
launched_resources: resources_lib.Resources) -> None:
|
|
474
494
|
"""Start skylet on the head node."""
|
|
475
|
-
|
|
495
|
+
# Avoid circular import.
|
|
496
|
+
# pylint: disable=import-outside-toplevel
|
|
497
|
+
from sky.utils import controller_utils
|
|
498
|
+
|
|
499
|
+
def _set_skypilot_env_var_cmd() -> str:
|
|
500
|
+
"""Sets the skypilot environment variables on the remote machine."""
|
|
501
|
+
env_vars = {
|
|
502
|
+
k: str(v) for (k, v) in env_options.Options.all_options().items()
|
|
503
|
+
}
|
|
504
|
+
is_controller = controller_utils.Controllers.from_name(
|
|
505
|
+
cluster_name.display_name) is not None
|
|
506
|
+
is_kubernetes = cluster_info.provider_name == 'kubernetes'
|
|
507
|
+
if is_controller and is_kubernetes:
|
|
508
|
+
# For jobs/serve controller, we pass in the CPU and memory limits
|
|
509
|
+
# when starting the skylet to handle cases where these env vars
|
|
510
|
+
# are not set on the cluster's pod spec. The skylet will read
|
|
511
|
+
# these env vars when starting (ManagedJobEvent.start()) and write
|
|
512
|
+
# it to disk.
|
|
513
|
+
resources = launched_resources.assert_launchable()
|
|
514
|
+
vcpus, mem = resources.cloud.get_vcpus_mem_from_instance_type(
|
|
515
|
+
resources.instance_type)
|
|
516
|
+
if vcpus is not None:
|
|
517
|
+
env_vars['SKYPILOT_POD_CPU_CORE_LIMIT'] = str(vcpus)
|
|
518
|
+
if mem is not None:
|
|
519
|
+
env_vars['SKYPILOT_POD_MEMORY_GB_LIMIT'] = str(mem)
|
|
520
|
+
return '; '.join([f'export {k}={v}' for k, v in env_vars.items()])
|
|
521
|
+
|
|
476
522
|
runners = provision.get_command_runners(cluster_info.provider_name,
|
|
477
523
|
cluster_info, **ssh_credentials)
|
|
478
524
|
head_runner = runners[0]
|
|
sky/provision/kubernetes/__init__.py
CHANGED
|
@@ -13,4 +13,6 @@ from sky.provision.kubernetes.network import open_ports
|
|
|
13
13
|
from sky.provision.kubernetes.network import query_ports
|
|
14
14
|
from sky.provision.kubernetes.volume import apply_volume
|
|
15
15
|
from sky.provision.kubernetes.volume import delete_volume
|
|
16
|
+
from sky.provision.kubernetes.volume import get_all_volumes_usedby
|
|
16
17
|
from sky.provision.kubernetes.volume import get_volume_usedby
|
|
18
|
+
from sky.provision.kubernetes.volume import map_all_volumes_usedby
|
|
sky/provision/kubernetes/config.py
CHANGED
|
@@ -3,13 +3,11 @@ import copy
|
|
|
3
3
|
import logging
|
|
4
4
|
import math
|
|
5
5
|
import os
|
|
6
|
-
from typing import Any, Dict, Optional, Union
|
|
6
|
+
from typing import Any, Dict, List, Optional, Union
|
|
7
7
|
|
|
8
8
|
from sky.adaptors import kubernetes
|
|
9
9
|
from sky.provision import common
|
|
10
|
-
from sky.provision.kubernetes import network_utils
|
|
11
10
|
from sky.provision.kubernetes import utils as kubernetes_utils
|
|
12
|
-
from sky.utils import kubernetes_enums
|
|
13
11
|
from sky.utils import yaml_utils
|
|
14
12
|
|
|
15
13
|
logger = logging.getLogger(__name__)
|
|
@@ -28,11 +26,6 @@ def bootstrap_instances(
|
|
|
28
26
|
|
|
29
27
|
_configure_services(namespace, context, config.provider_config)
|
|
30
28
|
|
|
31
|
-
networking_mode = network_utils.get_networking_mode(
|
|
32
|
-
config.provider_config.get('networking_mode'), context)
|
|
33
|
-
if networking_mode == kubernetes_enums.KubernetesNetworkingMode.NODEPORT:
|
|
34
|
-
config = _configure_ssh_jump(namespace, context, config)
|
|
35
|
-
|
|
36
29
|
requested_service_account = config.node_config['spec']['serviceAccountName']
|
|
37
30
|
if (requested_service_account ==
|
|
38
31
|
kubernetes_utils.DEFAULT_SERVICE_ACCOUNT_NAME):
|
|
@@ -481,41 +474,6 @@ def _configure_autoscaler_cluster_role_binding(
|
|
|
481
474
|
f'{created_msg(binding_field, name)}')
|
|
482
475
|
|
|
483
476
|
|
|
484
|
-
def _configure_ssh_jump(namespace, context, config: common.ProvisionConfig):
|
|
485
|
-
"""Creates a SSH jump pod to connect to the cluster.
|
|
486
|
-
|
|
487
|
-
Also updates config['auth']['ssh_proxy_command'] to use the newly created
|
|
488
|
-
jump pod.
|
|
489
|
-
"""
|
|
490
|
-
provider_config = config.provider_config
|
|
491
|
-
pod_cfg = config.node_config
|
|
492
|
-
|
|
493
|
-
ssh_jump_name = pod_cfg['metadata']['labels']['skypilot-ssh-jump']
|
|
494
|
-
ssh_jump_image = provider_config['ssh_jump_image']
|
|
495
|
-
|
|
496
|
-
volumes = pod_cfg['spec']['volumes']
|
|
497
|
-
# find 'secret-volume' and get the secret name
|
|
498
|
-
secret_volume = next(filter(lambda x: x['name'] == 'secret-volume',
|
|
499
|
-
volumes))
|
|
500
|
-
ssh_key_secret_name = secret_volume['secret']['secretName']
|
|
501
|
-
|
|
502
|
-
# TODO(romilb): We currently split SSH jump pod and svc creation. Service
|
|
503
|
-
# is first created in authentication.py::setup_kubernetes_authentication
|
|
504
|
-
# and then SSH jump pod creation happens here. This is because we need to
|
|
505
|
-
# set the ssh_proxy_command in the ray YAML before we pass it to the
|
|
506
|
-
# autoscaler. If in the future if we can write the ssh_proxy_command to the
|
|
507
|
-
# cluster yaml through this method, then we should move the service
|
|
508
|
-
# creation here.
|
|
509
|
-
|
|
510
|
-
# TODO(romilb): We should add a check here to make sure the service is up
|
|
511
|
-
# and available before we create the SSH jump pod. If for any reason the
|
|
512
|
-
# service is missing, we should raise an error.
|
|
513
|
-
|
|
514
|
-
kubernetes_utils.setup_ssh_jump_pod(ssh_jump_name, ssh_jump_image,
|
|
515
|
-
ssh_key_secret_name, namespace, context)
|
|
516
|
-
return config
|
|
517
|
-
|
|
518
|
-
|
|
519
477
|
def _configure_skypilot_system_namespace(
|
|
520
478
|
provider_config: Dict[str, Any]) -> None:
|
|
521
479
|
"""Creates the namespace for skypilot-system mounting if it does not exist.
|
|
@@ -666,4 +624,9 @@ def _configure_services(namespace: str, context: Optional[str],
|
|
|
666
624
|
|
|
667
625
|
|
|
668
626
|
class KubernetesError(Exception):
|
|
669
|
-
|
|
627
|
+
|
|
628
|
+
def __init__(self,
|
|
629
|
+
*args,
|
|
630
|
+
insufficent_resources: Optional[List[str]] = None):
|
|
631
|
+
self.insufficent_resources = insufficent_resources
|
|
632
|
+
super().__init__(*args)
|
|
sky/provision/kubernetes/constants.py
CHANGED
|
@@ -18,7 +18,6 @@ SKY_K8S_EXEC_AUTH_KUBECONFIG_CACHE = '~/.sky/generated/kubeconfigs'
|
|
|
18
18
|
|
|
19
19
|
# Labels for the Pods created by SkyPilot
|
|
20
20
|
TAG_RAY_CLUSTER_NAME = 'ray-cluster-name'
|
|
21
|
-
TAG_SKYPILOT_CLUSTER_NAME = 'skypilot-cluster-name'
|
|
22
21
|
TAG_POD_INITIALIZED = 'skypilot-initialized'
|
|
23
22
|
TAG_SKYPILOT_DEPLOYMENT_NAME = 'skypilot-deployment-name'
|
|
24
23
|
|