skypilot-nightly 1.0.0.dev20250905-py3-none-any.whl → 1.0.0.dev20251210-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sky/__init__.py +12 -2
- sky/adaptors/aws.py +27 -22
- sky/adaptors/common.py +25 -2
- sky/adaptors/coreweave.py +278 -0
- sky/adaptors/do.py +8 -2
- sky/adaptors/gcp.py +11 -0
- sky/adaptors/ibm.py +5 -2
- sky/adaptors/kubernetes.py +64 -0
- sky/adaptors/nebius.py +3 -1
- sky/adaptors/primeintellect.py +1 -0
- sky/adaptors/seeweb.py +183 -0
- sky/adaptors/shadeform.py +89 -0
- sky/adaptors/slurm.py +478 -0
- sky/admin_policy.py +20 -0
- sky/authentication.py +157 -263
- sky/backends/__init__.py +3 -2
- sky/backends/backend.py +11 -3
- sky/backends/backend_utils.py +630 -185
- sky/backends/cloud_vm_ray_backend.py +1111 -928
- sky/backends/local_docker_backend.py +9 -5
- sky/backends/task_codegen.py +971 -0
- sky/backends/wheel_utils.py +18 -0
- sky/catalog/__init__.py +8 -3
- sky/catalog/aws_catalog.py +4 -0
- sky/catalog/common.py +19 -1
- sky/catalog/data_fetchers/fetch_aws.py +102 -80
- sky/catalog/data_fetchers/fetch_gcp.py +30 -3
- sky/catalog/data_fetchers/fetch_nebius.py +9 -6
- sky/catalog/data_fetchers/fetch_runpod.py +698 -0
- sky/catalog/data_fetchers/fetch_seeweb.py +329 -0
- sky/catalog/data_fetchers/fetch_shadeform.py +142 -0
- sky/catalog/kubernetes_catalog.py +36 -32
- sky/catalog/primeintellect_catalog.py +95 -0
- sky/catalog/runpod_catalog.py +5 -1
- sky/catalog/seeweb_catalog.py +184 -0
- sky/catalog/shadeform_catalog.py +165 -0
- sky/catalog/slurm_catalog.py +243 -0
- sky/check.py +87 -46
- sky/client/cli/command.py +1004 -434
- sky/client/cli/flags.py +4 -2
- sky/{volumes/utils.py → client/cli/table_utils.py} +111 -13
- sky/client/cli/utils.py +79 -0
- sky/client/common.py +12 -2
- sky/client/sdk.py +188 -65
- sky/client/sdk_async.py +34 -33
- sky/cloud_stores.py +82 -3
- sky/clouds/__init__.py +8 -0
- sky/clouds/aws.py +337 -129
- sky/clouds/azure.py +24 -18
- sky/clouds/cloud.py +47 -13
- sky/clouds/cudo.py +16 -13
- sky/clouds/do.py +9 -7
- sky/clouds/fluidstack.py +12 -5
- sky/clouds/gcp.py +14 -7
- sky/clouds/hyperbolic.py +12 -5
- sky/clouds/ibm.py +12 -5
- sky/clouds/kubernetes.py +80 -45
- sky/clouds/lambda_cloud.py +12 -5
- sky/clouds/nebius.py +23 -9
- sky/clouds/oci.py +19 -12
- sky/clouds/paperspace.py +4 -1
- sky/clouds/primeintellect.py +317 -0
- sky/clouds/runpod.py +85 -24
- sky/clouds/scp.py +12 -8
- sky/clouds/seeweb.py +477 -0
- sky/clouds/shadeform.py +400 -0
- sky/clouds/slurm.py +578 -0
- sky/clouds/ssh.py +6 -3
- sky/clouds/utils/scp_utils.py +61 -50
- sky/clouds/vast.py +43 -27
- sky/clouds/vsphere.py +14 -16
- sky/core.py +296 -195
- sky/dashboard/out/404.html +1 -1
- sky/dashboard/out/_next/static/KYAhEFa3FTfq4JyKVgo-s/_buildManifest.js +1 -0
- sky/dashboard/out/_next/static/chunks/1141-9c810f01ff4f398a.js +11 -0
- sky/dashboard/out/_next/static/chunks/1871-7e202677c42f43fe.js +6 -0
- sky/dashboard/out/_next/static/chunks/2260-7703229c33c5ebd5.js +1 -0
- sky/dashboard/out/_next/static/chunks/2369.fc20f0c2c8ed9fe7.js +15 -0
- sky/dashboard/out/_next/static/chunks/2755.edd818326d489a1d.js +26 -0
- sky/dashboard/out/_next/static/chunks/3294.ddda8c6c6f9f24dc.js +1 -0
- sky/dashboard/out/_next/static/chunks/3785.7e245f318f9d1121.js +1 -0
- sky/dashboard/out/_next/static/chunks/{6601-06114c982db410b6.js → 3800-b589397dc09c5b4e.js} +1 -1
- sky/dashboard/out/_next/static/chunks/3850-fd5696f3bbbaddae.js +1 -0
- sky/dashboard/out/_next/static/chunks/4725.172ede95d1b21022.js +1 -0
- sky/dashboard/out/_next/static/chunks/4937.a2baa2df5572a276.js +15 -0
- sky/dashboard/out/_next/static/chunks/6212-7bd06f60ba693125.js +13 -0
- sky/dashboard/out/_next/static/chunks/6856-da20c5fd999f319c.js +1 -0
- sky/dashboard/out/_next/static/chunks/6990-09cbf02d3cd518c3.js +1 -0
- sky/dashboard/out/_next/static/chunks/7359-c8d04e06886000b3.js +30 -0
- sky/dashboard/out/_next/static/chunks/7615-019513abc55b3b47.js +1 -0
- sky/dashboard/out/_next/static/chunks/8640.5b9475a2d18c5416.js +16 -0
- sky/dashboard/out/_next/static/chunks/8969-452f9d5cbdd2dc73.js +1 -0
- sky/dashboard/out/_next/static/chunks/9025.fa408f3242e9028d.js +6 -0
- sky/dashboard/out/_next/static/chunks/9353-8369df1cf105221c.js +1 -0
- sky/dashboard/out/_next/static/chunks/9360.a536cf6b1fa42355.js +31 -0
- sky/dashboard/out/_next/static/chunks/9847.3aaca6bb33455140.js +30 -0
- sky/dashboard/out/_next/static/chunks/pages/_app-68b647e26f9d2793.js +34 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-33f525539665fdfd.js +16 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-a7565f586ef86467.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters-9e5d47818b9bdadd.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/{config-dfb9bf07b13045f4.js → config-718cdc365de82689.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/infra/{[context]-6563820e094f68ca.js → [context]-12c559ec4d81fdbd.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/{infra-aabba60d57826e0f.js → infra-d187cd0413d72475.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-895847b6cf200b04.js +16 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs/pools/[pool]-8d0f4655400b4eb9.js +21 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs-e5a98f17f8513a96.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/plugins/[...slug]-4f46050ca065d8f8.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/users-2f7646eb77785a2c.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/volumes-ef19d49c6d0e8500.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/workspaces/{[name]-af76bb06dbb3954f.js → [name]-96e0f298308da7e2.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/{workspaces-7598c33a746cdc91.js → workspaces-cb4da3abe08ebf19.js} +1 -1
- sky/dashboard/out/_next/static/chunks/webpack-fba3de387ff6bb08.js +1 -0
- sky/dashboard/out/_next/static/css/c5a4cfd2600fc715.css +3 -0
- sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
- sky/dashboard/out/clusters/[cluster].html +1 -1
- sky/dashboard/out/clusters.html +1 -1
- sky/dashboard/out/config.html +1 -1
- sky/dashboard/out/index.html +1 -1
- sky/dashboard/out/infra/[context].html +1 -1
- sky/dashboard/out/infra.html +1 -1
- sky/dashboard/out/jobs/[job].html +1 -1
- sky/dashboard/out/jobs/pools/[pool].html +1 -1
- sky/dashboard/out/jobs.html +1 -1
- sky/dashboard/out/plugins/[...slug].html +1 -0
- sky/dashboard/out/users.html +1 -1
- sky/dashboard/out/volumes.html +1 -1
- sky/dashboard/out/workspace/new.html +1 -1
- sky/dashboard/out/workspaces/[name].html +1 -1
- sky/dashboard/out/workspaces.html +1 -1
- sky/data/data_utils.py +92 -1
- sky/data/mounting_utils.py +177 -30
- sky/data/storage.py +200 -19
- sky/data/storage_utils.py +10 -45
- sky/exceptions.py +18 -7
- sky/execution.py +74 -31
- sky/global_user_state.py +605 -191
- sky/jobs/__init__.py +2 -0
- sky/jobs/client/sdk.py +101 -4
- sky/jobs/client/sdk_async.py +31 -5
- sky/jobs/constants.py +15 -8
- sky/jobs/controller.py +726 -284
- sky/jobs/file_content_utils.py +128 -0
- sky/jobs/log_gc.py +193 -0
- sky/jobs/recovery_strategy.py +250 -100
- sky/jobs/scheduler.py +271 -173
- sky/jobs/server/core.py +367 -114
- sky/jobs/server/server.py +81 -35
- sky/jobs/server/utils.py +89 -35
- sky/jobs/state.py +1498 -620
- sky/jobs/utils.py +771 -306
- sky/logs/agent.py +40 -5
- sky/logs/aws.py +9 -19
- sky/metrics/utils.py +282 -39
- sky/models.py +2 -0
- sky/optimizer.py +7 -6
- sky/provision/__init__.py +38 -1
- sky/provision/aws/config.py +34 -13
- sky/provision/aws/instance.py +5 -2
- sky/provision/azure/instance.py +5 -3
- sky/provision/common.py +22 -0
- sky/provision/cudo/instance.py +4 -3
- sky/provision/do/instance.py +4 -3
- sky/provision/docker_utils.py +112 -28
- sky/provision/fluidstack/instance.py +6 -5
- sky/provision/gcp/config.py +6 -1
- sky/provision/gcp/instance.py +4 -2
- sky/provision/hyperbolic/instance.py +4 -2
- sky/provision/instance_setup.py +66 -20
- sky/provision/kubernetes/__init__.py +2 -0
- sky/provision/kubernetes/config.py +7 -44
- sky/provision/kubernetes/constants.py +0 -1
- sky/provision/kubernetes/instance.py +609 -213
- sky/provision/kubernetes/manifests/fusermount-server-daemonset.yaml +1 -2
- sky/provision/kubernetes/network.py +12 -8
- sky/provision/kubernetes/network_utils.py +8 -25
- sky/provision/kubernetes/utils.py +422 -422
- sky/provision/kubernetes/volume.py +150 -18
- sky/provision/lambda_cloud/instance.py +16 -13
- sky/provision/nebius/instance.py +6 -2
- sky/provision/nebius/utils.py +103 -86
- sky/provision/oci/instance.py +4 -2
- sky/provision/paperspace/instance.py +4 -3
- sky/provision/primeintellect/__init__.py +10 -0
- sky/provision/primeintellect/config.py +11 -0
- sky/provision/primeintellect/instance.py +454 -0
- sky/provision/primeintellect/utils.py +398 -0
- sky/provision/provisioner.py +45 -15
- sky/provision/runpod/__init__.py +2 -0
- sky/provision/runpod/instance.py +4 -3
- sky/provision/runpod/volume.py +69 -13
- sky/provision/scp/instance.py +307 -130
- sky/provision/seeweb/__init__.py +11 -0
- sky/provision/seeweb/config.py +13 -0
- sky/provision/seeweb/instance.py +812 -0
- sky/provision/shadeform/__init__.py +11 -0
- sky/provision/shadeform/config.py +12 -0
- sky/provision/shadeform/instance.py +351 -0
- sky/provision/shadeform/shadeform_utils.py +83 -0
- sky/provision/slurm/__init__.py +12 -0
- sky/provision/slurm/config.py +13 -0
- sky/provision/slurm/instance.py +572 -0
- sky/provision/slurm/utils.py +583 -0
- sky/provision/vast/instance.py +9 -4
- sky/provision/vast/utils.py +10 -6
- sky/provision/volume.py +164 -0
- sky/provision/vsphere/common/ssl_helper.py +1 -1
- sky/provision/vsphere/common/vapiconnect.py +2 -1
- sky/provision/vsphere/common/vim_utils.py +3 -2
- sky/provision/vsphere/instance.py +8 -6
- sky/provision/vsphere/vsphere_utils.py +8 -1
- sky/resources.py +11 -3
- sky/schemas/api/responses.py +107 -6
- sky/schemas/db/global_user_state/008_skylet_ssh_tunnel_metadata.py +34 -0
- sky/schemas/db/global_user_state/009_last_activity_and_launched_at.py +89 -0
- sky/schemas/db/global_user_state/010_save_ssh_key.py +66 -0
- sky/schemas/db/global_user_state/011_is_ephemeral.py +34 -0
- sky/schemas/db/kv_cache/001_initial_schema.py +29 -0
- sky/schemas/db/serve_state/002_yaml_content.py +34 -0
- sky/schemas/db/skypilot_config/001_initial_schema.py +30 -0
- sky/schemas/db/spot_jobs/002_cluster_pool.py +3 -3
- sky/schemas/db/spot_jobs/004_job_file_contents.py +42 -0
- sky/schemas/db/spot_jobs/005_logs_gc.py +38 -0
- sky/schemas/db/spot_jobs/006_controller_pid_started_at.py +34 -0
- sky/schemas/db/spot_jobs/007_config_file_content.py +34 -0
- sky/schemas/generated/jobsv1_pb2.py +86 -0
- sky/schemas/generated/jobsv1_pb2.pyi +254 -0
- sky/schemas/generated/jobsv1_pb2_grpc.py +542 -0
- sky/schemas/generated/managed_jobsv1_pb2.py +76 -0
- sky/schemas/generated/managed_jobsv1_pb2.pyi +278 -0
- sky/schemas/generated/managed_jobsv1_pb2_grpc.py +278 -0
- sky/schemas/generated/servev1_pb2.py +58 -0
- sky/schemas/generated/servev1_pb2.pyi +115 -0
- sky/schemas/generated/servev1_pb2_grpc.py +322 -0
- sky/serve/autoscalers.py +2 -0
- sky/serve/client/impl.py +55 -21
- sky/serve/constants.py +4 -3
- sky/serve/controller.py +17 -11
- sky/serve/load_balancing_policies.py +1 -1
- sky/serve/replica_managers.py +219 -142
- sky/serve/serve_rpc_utils.py +179 -0
- sky/serve/serve_state.py +63 -54
- sky/serve/serve_utils.py +145 -109
- sky/serve/server/core.py +46 -25
- sky/serve/server/impl.py +311 -162
- sky/serve/server/server.py +21 -19
- sky/serve/service.py +84 -68
- sky/serve/service_spec.py +45 -7
- sky/server/auth/loopback.py +38 -0
- sky/server/auth/oauth2_proxy.py +12 -7
- sky/server/common.py +47 -24
- sky/server/config.py +62 -28
- sky/server/constants.py +9 -1
- sky/server/daemons.py +109 -38
- sky/server/metrics.py +76 -96
- sky/server/middleware_utils.py +166 -0
- sky/server/plugins.py +222 -0
- sky/server/requests/executor.py +384 -145
- sky/server/requests/payloads.py +83 -19
- sky/server/requests/preconditions.py +15 -13
- sky/server/requests/request_names.py +123 -0
- sky/server/requests/requests.py +511 -157
- sky/server/requests/serializers/decoders.py +48 -17
- sky/server/requests/serializers/encoders.py +102 -20
- sky/server/requests/serializers/return_value_serializers.py +60 -0
- sky/server/requests/threads.py +117 -0
- sky/server/rest.py +116 -24
- sky/server/server.py +497 -179
- sky/server/server_utils.py +30 -0
- sky/server/stream_utils.py +219 -45
- sky/server/uvicorn.py +30 -19
- sky/setup_files/MANIFEST.in +6 -1
- sky/setup_files/alembic.ini +8 -0
- sky/setup_files/dependencies.py +64 -19
- sky/setup_files/setup.py +44 -44
- sky/sky_logging.py +13 -5
- sky/skylet/attempt_skylet.py +116 -24
- sky/skylet/configs.py +3 -1
- sky/skylet/constants.py +139 -29
- sky/skylet/events.py +74 -14
- sky/skylet/executor/__init__.py +1 -0
- sky/skylet/executor/slurm.py +189 -0
- sky/skylet/job_lib.py +143 -105
- sky/skylet/log_lib.py +252 -8
- sky/skylet/log_lib.pyi +47 -7
- sky/skylet/providers/ibm/node_provider.py +12 -8
- sky/skylet/providers/ibm/vpc_provider.py +13 -12
- sky/skylet/runtime_utils.py +21 -0
- sky/skylet/services.py +524 -0
- sky/skylet/skylet.py +27 -2
- sky/skylet/subprocess_daemon.py +104 -28
- sky/skypilot_config.py +99 -79
- sky/ssh_node_pools/constants.py +12 -0
- sky/ssh_node_pools/core.py +40 -3
- sky/ssh_node_pools/deploy/__init__.py +4 -0
- sky/ssh_node_pools/deploy/deploy.py +952 -0
- sky/ssh_node_pools/deploy/tunnel_utils.py +199 -0
- sky/ssh_node_pools/deploy/utils.py +173 -0
- sky/ssh_node_pools/server.py +20 -21
- sky/{utils/kubernetes/ssh_utils.py → ssh_node_pools/utils.py} +9 -6
- sky/task.py +221 -104
- sky/templates/aws-ray.yml.j2 +1 -0
- sky/templates/azure-ray.yml.j2 +1 -0
- sky/templates/cudo-ray.yml.j2 +1 -0
- sky/templates/do-ray.yml.j2 +1 -0
- sky/templates/fluidstack-ray.yml.j2 +1 -0
- sky/templates/gcp-ray.yml.j2 +1 -0
- sky/templates/hyperbolic-ray.yml.j2 +1 -0
- sky/templates/ibm-ray.yml.j2 +2 -1
- sky/templates/jobs-controller.yaml.j2 +3 -0
- sky/templates/kubernetes-ray.yml.j2 +204 -55
- sky/templates/lambda-ray.yml.j2 +1 -0
- sky/templates/nebius-ray.yml.j2 +3 -0
- sky/templates/oci-ray.yml.j2 +1 -0
- sky/templates/paperspace-ray.yml.j2 +1 -0
- sky/templates/primeintellect-ray.yml.j2 +72 -0
- sky/templates/runpod-ray.yml.j2 +1 -0
- sky/templates/scp-ray.yml.j2 +1 -0
- sky/templates/seeweb-ray.yml.j2 +171 -0
- sky/templates/shadeform-ray.yml.j2 +73 -0
- sky/templates/slurm-ray.yml.j2 +85 -0
- sky/templates/vast-ray.yml.j2 +2 -0
- sky/templates/vsphere-ray.yml.j2 +1 -0
- sky/templates/websocket_proxy.py +188 -43
- sky/usage/usage_lib.py +16 -4
- sky/users/model.conf +1 -1
- sky/users/permission.py +84 -44
- sky/users/rbac.py +31 -3
- sky/utils/accelerator_registry.py +6 -3
- sky/utils/admin_policy_utils.py +18 -5
- sky/utils/annotations.py +128 -6
- sky/utils/asyncio_utils.py +78 -0
- sky/utils/atomic.py +1 -1
- sky/utils/auth_utils.py +153 -0
- sky/utils/cli_utils/status_utils.py +12 -7
- sky/utils/cluster_utils.py +28 -6
- sky/utils/command_runner.py +283 -30
- sky/utils/command_runner.pyi +63 -7
- sky/utils/common.py +3 -1
- sky/utils/common_utils.py +55 -7
- sky/utils/config_utils.py +1 -14
- sky/utils/context.py +127 -40
- sky/utils/context_utils.py +73 -18
- sky/utils/controller_utils.py +229 -70
- sky/utils/db/db_utils.py +95 -18
- sky/utils/db/kv_cache.py +149 -0
- sky/utils/db/migration_utils.py +24 -7
- sky/utils/env_options.py +4 -0
- sky/utils/git.py +559 -1
- sky/utils/kubernetes/create_cluster.sh +15 -30
- sky/utils/kubernetes/delete_cluster.sh +10 -7
- sky/utils/kubernetes/generate_kind_config.py +6 -66
- sky/utils/kubernetes/gpu_labeler.py +13 -3
- sky/utils/kubernetes/k8s_gpu_labeler_job.yaml +2 -1
- sky/utils/kubernetes/k8s_gpu_labeler_setup.yaml +16 -16
- sky/utils/kubernetes/kubernetes_deploy_utils.py +187 -260
- sky/utils/kubernetes/rsync_helper.sh +11 -3
- sky/utils/kubernetes/ssh-tunnel.sh +7 -376
- sky/utils/kubernetes_enums.py +7 -15
- sky/utils/lock_events.py +4 -4
- sky/utils/locks.py +128 -31
- sky/utils/log_utils.py +0 -319
- sky/utils/resource_checker.py +13 -10
- sky/utils/resources_utils.py +53 -29
- sky/utils/rich_utils.py +8 -4
- sky/utils/schemas.py +138 -52
- sky/utils/subprocess_utils.py +17 -4
- sky/utils/thread_utils.py +91 -0
- sky/utils/timeline.py +2 -1
- sky/utils/ux_utils.py +35 -1
- sky/utils/volume.py +88 -4
- sky/utils/yaml_utils.py +9 -0
- sky/volumes/client/sdk.py +48 -10
- sky/volumes/server/core.py +59 -22
- sky/volumes/server/server.py +46 -17
- sky/volumes/volume.py +54 -42
- sky/workspaces/core.py +57 -21
- sky/workspaces/server.py +13 -12
- sky_templates/README.md +3 -0
- sky_templates/__init__.py +3 -0
- sky_templates/ray/__init__.py +0 -0
- sky_templates/ray/start_cluster +183 -0
- sky_templates/ray/stop_cluster +75 -0
- {skypilot_nightly-1.0.0.dev20250905.dist-info → skypilot_nightly-1.0.0.dev20251210.dist-info}/METADATA +343 -65
- skypilot_nightly-1.0.0.dev20251210.dist-info/RECORD +629 -0
- skypilot_nightly-1.0.0.dev20251210.dist-info/top_level.txt +2 -0
- sky/client/cli/git.py +0 -549
- sky/dashboard/out/_next/static/chunks/1121-408ed10b2f9fce17.js +0 -1
- sky/dashboard/out/_next/static/chunks/1141-943efc7aff0f0c06.js +0 -1
- sky/dashboard/out/_next/static/chunks/1836-37fede578e2da5f8.js +0 -40
- sky/dashboard/out/_next/static/chunks/3015-86cabed5d4669ad0.js +0 -1
- sky/dashboard/out/_next/static/chunks/3294.c80326aec9bfed40.js +0 -6
- sky/dashboard/out/_next/static/chunks/3785.4872a2f3aa489880.js +0 -1
- sky/dashboard/out/_next/static/chunks/3850-ff4a9a69d978632b.js +0 -1
- sky/dashboard/out/_next/static/chunks/4045.b30465273dc5e468.js +0 -21
- sky/dashboard/out/_next/static/chunks/4676-9da7fdbde90b5549.js +0 -10
- sky/dashboard/out/_next/static/chunks/4725.10f7a9a5d3ea8208.js +0 -1
- sky/dashboard/out/_next/static/chunks/5339.3fda4a4010ff4e06.js +0 -51
- sky/dashboard/out/_next/static/chunks/6135-4b4d5e824b7f9d3c.js +0 -1
- sky/dashboard/out/_next/static/chunks/649.b9d7f7d10c1b8c53.js +0 -45
- sky/dashboard/out/_next/static/chunks/6856-dca7962af4814e1b.js +0 -1
- sky/dashboard/out/_next/static/chunks/6990-08b2a1cae076a943.js +0 -1
- sky/dashboard/out/_next/static/chunks/7325.b4bc99ce0892dcd5.js +0 -6
- sky/dashboard/out/_next/static/chunks/754-d0da8ab45f9509e9.js +0 -18
- sky/dashboard/out/_next/static/chunks/7669.1f5d9a402bf5cc42.js +0 -36
- sky/dashboard/out/_next/static/chunks/8969-0be3036bf86f8256.js +0 -1
- sky/dashboard/out/_next/static/chunks/9025.c12318fb6a1a9093.js +0 -6
- sky/dashboard/out/_next/static/chunks/9037-fa1737818d0a0969.js +0 -6
- sky/dashboard/out/_next/static/chunks/pages/_app-ce361c6959bc2001.js +0 -34
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-1cbba24bd1bd35f8.js +0 -16
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-0b4b35dc1dfe046c.js +0 -16
- sky/dashboard/out/_next/static/chunks/pages/clusters-469814d711d63b1b.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-dd64309c3fe67ed2.js +0 -11
- sky/dashboard/out/_next/static/chunks/pages/jobs/pools/[pool]-07349868f7905d37.js +0 -16
- sky/dashboard/out/_next/static/chunks/pages/jobs-1f70d9faa564804f.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/users-018bf31cda52e11b.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/volumes-739726d6b823f532.js +0 -1
- sky/dashboard/out/_next/static/chunks/webpack-4fe903277b57b523.js +0 -1
- sky/dashboard/out/_next/static/css/4614e06482d7309e.css +0 -3
- sky/dashboard/out/_next/static/mS-4qZPSkRuA1u-g2wQhg/_buildManifest.js +0 -1
- sky/templates/kubernetes-ssh-jump.yml.j2 +0 -94
- sky/utils/kubernetes/cleanup-tunnel.sh +0 -62
- sky/utils/kubernetes/deploy_remote_cluster.py +0 -1299
- sky/utils/kubernetes/ssh_jump_lifecycle_manager.py +0 -191
- skypilot_nightly-1.0.0.dev20250905.dist-info/RECORD +0 -547
- skypilot_nightly-1.0.0.dev20250905.dist-info/top_level.txt +0 -1
- /sky/dashboard/out/_next/static/{mS-4qZPSkRuA1u-g2wQhg → KYAhEFa3FTfq4JyKVgo-s}/_ssgManifest.js +0 -0
- {skypilot_nightly-1.0.0.dev20250905.dist-info → skypilot_nightly-1.0.0.dev20251210.dist-info}/WHEEL +0 -0
- {skypilot_nightly-1.0.0.dev20250905.dist-info → skypilot_nightly-1.0.0.dev20251210.dist-info}/entry_points.txt +0 -0
- {skypilot_nightly-1.0.0.dev20250905.dist-info → skypilot_nightly-1.0.0.dev20251210.dist-info}/licenses/LICENSE +0 -0

sky/templates/kubernetes-ray.yml.j2
CHANGED

@@ -33,14 +33,11 @@ provider:
   networking_mode: {{k8s_networking_mode}}
 
   # We use internal IPs since we set up a port-forward between the kubernetes
-  # cluster and the local machine
-  # head node.
+  # cluster and the local machine.
   use_internal_ips: true
 
   timeout: {{timeout}}
 
-  ssh_jump_image: {{k8s_ssh_jump_image}}
-
   # Namespace used to host SkyPilot system components, such as fuse device
   # manager.
   skypilot_system_namespace: {{k8s_skypilot_system_namespace}}
@@ -49,6 +46,10 @@ provider:
   # Used to set up the necessary permissions and sidecars.
   fuse_device_required: {{k8s_fuse_device_required}}
 
+  {% if ephemeral_volume_mounts %}
+  ephemeral_volume_specs: {{ephemeral_volume_mounts | tojson}}
+  {% endif %}
+
   # ServiceAccount created by the autoscaler for the head node pod that it
   # runs in. If this field isn't provided, the head pod config below must
   # contain a user-created service account with the proper permissions.

@@ -212,7 +213,9 @@ provider:
       metadata:
         labels:
           parent: skypilot
+          # TODO (kyuds): remove this label for v0.12.0, as skypilot-cluster label is deprecated in favor of skypilot-cluster-name.
           skypilot-cluster: {{cluster_name_on_cloud}}
+          skypilot-cluster-name: {{cluster_name_on_cloud}}
           skypilot-user: {{ user }}
         name: {{cluster_name_on_cloud}}-head-ssh
       spec:

@@ -230,7 +233,9 @@ provider:
       metadata:
         labels:
           parent: skypilot
+          # TODO (kyuds): remove this label for v0.12.0, as skypilot-cluster label is deprecated in favor of skypilot-cluster-name.
           skypilot-cluster: {{cluster_name_on_cloud}}
+          skypilot-cluster-name: {{cluster_name_on_cloud}}
           skypilot-user: {{ user }}
         # NOTE: If you're running multiple Ray clusters with services
         # on one Kubernetes cluster, they must have unique service

@@ -250,7 +255,9 @@ provider:
       metadata:
         labels:
           parent: skypilot
+          # TODO (kyuds): remove this label for v0.12.0, as skypilot-cluster label is deprecated in favor of skypilot-cluster-name.
           skypilot-cluster: {{cluster_name_on_cloud}}
+          skypilot-cluster-name: {{cluster_name_on_cloud}}
           skypilot-user: {{ user }}
         name: {{cluster_name_on_cloud}}-worker{{ worker_id }}
       spec:

@@ -275,9 +282,8 @@ available_node_types:
         labels:
           parent: skypilot
           # component will be set for the head node pod to be the same as the head node service selector above if a
+          # TODO (kyuds): remove this label for v0.12.0, as skypilot-cluster label is deprecated in favor of skypilot-cluster-name.
           skypilot-cluster: {{cluster_name_on_cloud}}
-          # Identifies the SSH jump pod used by this pod. Used in life cycle management of the ssh jump pod.
-          skypilot-ssh-jump: {{k8s_ssh_jump_name}}
           skypilot-user: {{ user }}
           # Custom tags for the pods
           {%- for label_key, label_value in labels.items() %}

@@ -444,9 +450,6 @@ available_node_types:
         # object store. If you do not provide this, Ray will fall back to
         # /tmp which cause slowdowns if is not a shared memory volume.
         volumes:
-          - name: secret-volume
-            secret:
-              secretName: {{k8s_ssh_key_secret_name}}
          - name: dshm
            emptyDir:
              medium: Memory
@@ -510,6 +513,24 @@ available_node_types:
            valueFrom:
              fieldRef:
                fieldPath: metadata.labels['ray-node-type']
+          - name: SKYPILOT_POD_CPU_CORE_LIMIT
+            valueFrom:
+              resourceFieldRef:
+                containerName: ray-node
+                resource: requests.cpu
+          - name: SKYPILOT_POD_MEMORY_BYTES_LIMIT
+            valueFrom:
+              resourceFieldRef:
+                containerName: ray-node
+                resource: requests.memory
+          # Disable Ray memory monitor to prevent Ray's memory manager
+          # from interfering with kubernetes resource manager.
+          # If ray memory monitor is enabled, the ray memory monitor kills
+          # the running job is the job uses more than 95% of allocated memory,
+          # even if the job is not misbehaving or using its full allocated memory.
+          # This behavior does not give a chance for k8s scheduler to evict the pod.
+          - name: RAY_memory_monitor_refresh_ms
+            value: "0"
          {% for key, value in k8s_env_vars.items() if k8s_env_vars is not none %}
          - name: {{ key }}
            value: {{ value }}
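
A note on the `resourceFieldRef` additions above: the Kubernetes downward API exposes the container's resource requests as plain environment variables, so a process in the pod can size itself without querying the API server. A minimal bash sketch of consuming them inside the container; the `:-unset` fallbacks and the echo are illustrative, not part of the template:

    # Read the CPU/memory requests injected via the downward API above.
    cpu="${SKYPILOT_POD_CPU_CORE_LIMIT:-unset}"       # requests.cpu, in cores
    mem="${SKYPILOT_POD_MEMORY_BYTES_LIMIT:-unset}"   # requests.memory, in bytes
    echo "CPU request: ${cpu} cores; memory request: ${mem} bytes"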
@@ -630,12 +651,17 @@ available_node_types:
          command: ["/bin/bash", "-c", "--"]
          args:
            - |
-              #
-              #
-
-
-              #
-
+              # Set -x to print the commands and their arguments as they are executed.
+              # Useful for debugging.
+              set -x
+
+              # Execute user-provided post-provision runcmd
+              # before any of the SkyPilot setup commands.
+              {%- if runcmd %}
+              {%- for cmd in runcmd %}
+              {{cmd}}
+              {%- endfor %}
+              {%- endif %}
 
               # Helper function to conditionally use sudo
               # TODO(zhwu): consolidate the two prefix_cmd and sudo replacements
@@ -647,15 +673,125 @@ available_node_types:
              # STEP 1: Run apt update, install missing packages, and set up ssh.
              (
                (
-                  #
-
-
-
-
-
-
-
-
+                  # Helper: run apt-get update with retries
+                  apt_update_with_retries() {
+                    # do not fail the whole shell; we handle return codes
+                    set +e
+                    local log=/tmp/apt-update.log
+                    local tries=3
+                    local delay=1
+                    local i
+                    for i in $(seq 1 $tries); do
+                      DEBIAN_FRONTEND=noninteractive $(prefix_cmd) apt-get update >> "$log" 2>&1 && { set -e; return 0; }
+                      echo "apt-get update attempt $i/$tries failed; retrying in ${delay}s" >> "$log"
+                      sleep $delay
+                      delay=$((delay * 2))
+                    done
+                    set -e
+                    return 1
+                  }
+                  apt_install_with_retries() {
+                    local packages="$@"
+                    [ -z "$packages" ] && return 0
+                    set +e
+                    local log=/tmp/apt-update.log
+                    local tries=3
+                    local delay=1
+                    local i
+                    for i in $(seq 1 $tries); do
+                      DEBIAN_FRONTEND=noninteractive $(prefix_cmd) apt-get install -y -o Dpkg::Options::="--force-confdef" -o Dpkg::Options::="--force-confold" $packages && { set -e; return 0; }
+                      echo "apt-get install failed for: $packages (attempt $i/$tries). Running -f install and retrying..." >> "$log"
+                      DEBIAN_FRONTEND=noninteractive $(prefix_cmd) apt-get -f install -y >> "$log" 2>&1 || true
+                      DEBIAN_FRONTEND=noninteractive $(prefix_cmd) apt-get clean >> "$log" 2>&1 || true
+                      sleep $delay
+                      delay=$((delay * 2))
+                    done
+                    set -e
+                    return 1
+                  }
+                  apt_update_install_with_retries() {
+                    apt_update_with_retries
+                    apt_install_with_retries "$@"
+                  }
+                  backup_dir=/etc/apt/sources.list.backup_skypilot
+                  backup_source() {
+                    $(prefix_cmd) mkdir -p "$backup_dir"
+                    if [ -f /etc/apt/sources.list ] && [ ! -f "$backup_dir/sources.list" ]; then
+                      $(prefix_cmd) cp -a /etc/apt/sources.list "$backup_dir/sources.list" || true
+                    fi
+                  }
+                  restore_source() {
+                    if [ -f "$backup_dir/sources.list" ]; then
+                      $(prefix_cmd) cp -a "$backup_dir/sources.list" /etc/apt/sources.list || true
+                    fi
+                  }
+                  update_apt_sources() {
+                    local host=$1
+                    local apt_file=$2
+                    $(prefix_cmd) sed -i -E "s|https?://[a-zA-Z0-9.-]+\.ubuntu\.com/ubuntu|http://$host/ubuntu|g" $apt_file
+                  }
+                  # Helper: install packages across mirrors with retries
+                  apt_install_with_mirrors() {
+                    local required=$1; shift
+                    local packages="$@"
+                    [ -z "$packages" ] && return 0
+                    set +e
+                    # Install packages with default sources first
+                    local log=/tmp/apt-update.log
+                    echo "$(date +%Y-%m-%d\ %H:%M:%S) Installing packages: $packages" >> "$log"
+                    restore_source
+                    apt_update_install_with_retries $packages >> "$log" 2>&1 && { set -e; return 0; }
+                    echo "Install failed with default sources: $packages" >> "$log"
+                    # Detect distro (ubuntu/debian)
+                    local APT_OS="unknown"
+                    if [ -f /etc/os-release ]; then
+                      . /etc/os-release
+                      case "$ID" in
+                        debian) APT_OS="debian" ;;
+                        ubuntu) APT_OS="ubuntu" ;;
+                        *)
+                          if [ -n "$ID_LIKE" ]; then
+                            case " $ID $ID_LIKE " in
+                              *ubuntu*) APT_OS="ubuntu" ;;
+                              *debian*) APT_OS="debian" ;;
+                            esac
+                          fi
+                          ;;
+                      esac
+                    fi
+                    # Build mirror candidates
+                    # deb.debian.org is a CDN endpoint, if one backend goes down,
+                    # the CDN automatically fails over to another mirror,
+                    # so we only retry for ubuntu here.
+                    if [ "$APT_OS" = "ubuntu" ]; then
+                      # Backup current sources once
+                      backup_source
+                      # Selected from https://launchpad.net/ubuntu/+archivemirrors
+                      # and results from apt-select
+                      local MIRROR_CANDIDATES="mirrors.wikimedia.org mirror.umd.edu"
+                      for host in $MIRROR_CANDIDATES; do
+                        echo "Trying APT mirror ($APT_OS): $host" >> "$log"
+                        if [ -f /etc/apt/sources.list ]; then
+                          update_apt_sources $host /etc/apt/sources.list
+                        else
+                          echo "Error: /etc/apt/sources.list not found" >> "$log"
+                          break
+                        fi
+                        apt_update_install_with_retries $packages >> "$log" 2>&1 && { set -e; return 0; }
+                        echo "Install failed with mirror ($APT_OS): $host" >> "$log"
+                        # Restore to default sources
+                        restore_source
+                      done
+                    fi
+                    set -e
+                    if [ "$required" = "1" ]; then
+                      echo "Error: required package install failed across all mirrors: $packages" >> "$log"
+                      return 1
+                    else
+                      echo "Optional package install failed across all mirrors: $packages; skipping." >> "$log"
+                      return 0
+                    fi
+                  }
                  # Install both fuse2 and fuse3 for compatibility for all possible fuse adapters in advance,
                  # so that both fusemount and fusermount3 can be masked before enabling SSH access.
                  PACKAGES="rsync curl wget netcat gcc patch pciutils fuse fuse3 openssh-server";
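
The helpers added in this hunk combine two patterns: retry with exponential backoff (`apt_update_with_retries`, `apt_install_with_retries`) and mirror failover with sources-list backup/restore (`apt_install_with_mirrors`). A standalone sketch of the core backoff loop, distilled from the template; the `run_with_retries` name and its defaults are illustrative, not part of SkyPilot:

    # Retry a command, doubling the sleep after each failed attempt.
    run_with_retries() {
      local tries=${2:-3} delay=1 i
      for i in $(seq 1 "$tries"); do
        "$1" && return 0                                 # success: stop retrying
        echo "attempt $i/$tries failed; retrying in ${delay}s" >&2
        sleep "$delay"
        delay=$((delay * 2))                             # backoff: 1s, 2s, 4s, ...
      done
      return 1
    }
    # Usage (apt_update is a stand-in for any function or command name):
    # run_with_retries apt_update 3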
@@ -682,7 +818,7 @@ available_node_types:
                  done;
                  if [ ! -z "$INSTALL_FIRST" ]; then
                    echo "Installing core packages: $INSTALL_FIRST";
-
+                    apt_install_with_mirrors 1 $INSTALL_FIRST || { echo "Error: core package installation failed." >> /tmp/apt-update.log; exit 1; }
                  fi;
                  # SSH and other packages are not necessary, so we disable set -e
                  set +e

@@ -706,7 +842,8 @@ available_node_types:
                  fi
                  $(prefix_cmd) cp -p "$FUSERMOUNT_PATH" "${FUSERMOUNT_PATH}-original"
                  $(prefix_cmd) ln -sf {{k8s_fusermount_shared_dir}}/fusermount-shim "$FUSERMOUNT_PATH"
-
+                  # "|| true" because fusermount3 is not always available
+                  FUSERMOUNT3_PATH=$(which fusermount3) || true
                  if [ -z "$FUSERMOUNT3_PATH" ]; then
                    FUSERMOUNT3_PATH="${FUSERMOUNT_PATH}3"
                  fi

@@ -748,18 +885,23 @@ available_node_types:
                  $(prefix_cmd) mkdir -p ~/.ssh;
                  $(prefix_cmd) chown -R $(whoami) ~/.ssh;
                  $(prefix_cmd) chmod 700 ~/.ssh;
-                  $(prefix_cmd) cat
+                  $(prefix_cmd) cat > ~/.ssh/authorized_keys <<'SKYPILOT_SSH_KEY_EOF'
+                  skypilot:ssh_public_key_content
+                  SKYPILOT_SSH_KEY_EOF
                  $(prefix_cmd) chmod 644 ~/.ssh/authorized_keys;
                  $(prefix_cmd) service ssh restart;
                  $(prefix_cmd) sed -i "s/mesg n/tty -s \&\& mesg n/" ~/.profile;
 
                  touch /tmp/apt_ssh_setup_complete
                  echo "=== SSH setup completed ==="
-                ) > /tmp/${STEPS[0]}.log 2>&1
-
+                ) > /tmp/${STEPS[0]}.log 2>&1
+                if [ "$?" -ne "0" ]; then
+                  {
+                    echo "Error: ${STEPS[0]} failed. Continuing anyway..." > /tmp/${STEPS[0]}.failed 2>&1
                    cat /tmp/${STEPS[0]}.log
                    exit 1
-
+                  }
+                fi
              ) &
 
              # STEP 2: Install conda, ray and skypilot (for dependencies); start

@@ -777,15 +919,20 @@ available_node_types:
                  {{ conda_installation_commands }}
                  {{ ray_installation_commands }}
 
-
+                  # set UV_SYSTEM_PYTHON to false in case the user provided docker image set it to true.
+                  # unset PYTHONPATH in case the user provided docker image set it.
+                  VIRTUAL_ENV=~/skypilot-runtime UV_SYSTEM_PYTHON=false env -u PYTHONPATH ~/.local/bin/uv pip install skypilot[kubernetes,remote]
                  # Wait for `patch` package to be installed before applying ray patches
                  until dpkg -l | grep -q "^ii  patch "; do
                    sleep 0.1
                    echo "Waiting for patch package to be installed..."
                  done
                  # Apply Ray patches for progress bar fix
-
-
+                  # set UV_SYSTEM_PYTHON to false in case the user provided docker image set it to true.
+                  # unset PYTHONPATH in case the user provided docker image set it.
+                  # ~/.sky/python_path is seeded by conda_installation_commands
+                  VIRTUAL_ENV=~/skypilot-runtime UV_SYSTEM_PYTHON=false env -u PYTHONPATH ~/.local/bin/uv pip list | grep "ray " | grep 2.9.3 2>&1 > /dev/null && {
+                    env -u PYTHONPATH $(cat ~/.sky/python_path) -c "from sky.skylet.ray_patches import patch; patch()" || exit 1;
                  }
                  touch /tmp/ray_skypilot_installation_complete
                  echo "=== Ray and skypilot installation completed ==="

@@ -814,11 +961,14 @@ available_node_types:
                  set +e
                  {{ ray_worker_start_command }}
                fi
-              ) > /tmp/${STEPS[1]}.log 2>&1
-
+              ) > /tmp/${STEPS[1]}.log 2>&1
+              if [ "$?" -ne "0" ]; then
+                {
+                  echo "Error: ${STEPS[1]} failed. Continuing anyway..." > /tmp/${STEPS[1]}.failed 2>&1
                  cat /tmp/${STEPS[1]}.log
                  exit 1
-
+                }
+              fi
            ) &
 
 

@@ -836,11 +986,14 @@ available_node_types:
                fi;
              fi;
              export -p > ~/container_env_var.sh && $(prefix_cmd) mv ~/container_env_var.sh /etc/profile.d/container_env_var.sh
-            ) > /tmp/${STEPS[2]}.log 2>&1
-
+            ) > /tmp/${STEPS[2]}.log 2>&1
+            if [ "$?" -ne "0" ]; then
+              {
+                echo "Error: ${STEPS[2]} failed. Continuing anyway..." > /tmp/${STEPS[2]}.failed 2>&1
                cat /tmp/${STEPS[2]}.log
                exit 1
-
+              }
+            fi
            ) &
 
            function mylsof { p=$(for pid in /proc/{0..9}*; do i=$(basename "$pid"); for file in "$pid"/fd/*; do link=$(readlink -e "$file"); if [ "$link" = "$1" ]; then echo "$i"; fi; done; done); echo "$p"; };

@@ -927,7 +1080,7 @@ available_node_types:
            # Also, skip the jobs that are waiting to be scheduled as those does not have a controller process running.
            # For SkyServe, this will be None and every service will be recovered. This is because SkyServe
            # will delete the service from the database after it is terminated so everything in the database is running.
-            ALL_IN_PROGRESS_JOBS=$({{sky_python_cmd}} -c "from sky.jobs import state; jobs = state.
+            ALL_IN_PROGRESS_JOBS=$({{sky_python_cmd}} -c "from sky.jobs import state; jobs, _ = state.get_managed_jobs_with_filters(fields=['job_id', 'schedule_state']); print(' '.join({str(job['job_id']) for job in jobs if job['schedule_state'] not in [state.ManagedJobScheduleState.DONE, state.ManagedJobScheduleState.WAITING]}) if jobs else None)")
            if [ "$ALL_IN_PROGRESS_JOBS" != "None" ]; then
              read -ra ALL_IN_PROGRESS_JOBS_SEQ <<< "$ALL_IN_PROGRESS_JOBS"
            fi
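
The replacement line above has the Python one-liner print either a space-separated list of in-progress job IDs or the literal string `None`; the shell then splits the list into an array with `read -ra`. A small sketch of that split-and-guard pattern (the sample `jobs` value is made up):

    jobs="3 7 12"                       # stand-in for the one-liner's output
    if [ "$jobs" != "None" ]; then
      read -ra job_ids <<< "$jobs"      # word-split into a bash array
      for id in "${job_ids[@]}"; do echo "would recover job $id"; done
    fi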
@@ -957,6 +1110,8 @@ available_node_types:
 
            touch {{k8s_high_availability_deployment_volume_mount_path}}/k8s_container_ready
            {% endif %}
+            # Set +x to stop printing the commands and their arguments as they are executed.
+            set +x
 
            trap : TERM INT; log_tail || sleep infinity & wait
 

@@ -970,9 +1125,6 @@ available_node_types:
        # object store. If you do not provide this, Ray will fall back to
        # /tmp which cause slowdowns if is not a shared memory volume.
        volumeMounts:
-          - name: secret-volume
-            readOnly: true
-            mountPath: "/etc/secret-volume"
          - mountPath: /dev/shm
            name: dshm
        {% if k8s_enable_gpudirect_tcpx %}

@@ -1204,24 +1356,21 @@ setup_commands:
    start_epoch=$(date +%s);
 
    # Wait for SSH setup to complete before proceeding
-
-
-
-
-    [ -f /tmp/${STEPS[0]}.failed ] && { echo "Error: ${STEPS[0]} failed. Exiting."; exit 1; } || true;
-    fi
+    echo "=== Logs for asynchronous SSH setup ===";
+    ([ -f /tmp/apt_ssh_setup_complete ]|| [ -f /tmp/${STEPS[0]}.failed ]) && cat /tmp/${STEPS[0]}.log ||
+    { tail -f -n +1 /tmp/${STEPS[0]}.log & TAIL_PID=$!; echo "Tail PID: $TAIL_PID"; sleep 0.5; until [ -f /tmp/apt_ssh_setup_complete ] || [ -f /tmp/${STEPS[0]}.failed ]; do sleep 0.5; done; kill $TAIL_PID || true; };
+    [ -f /tmp/${STEPS[0]}.failed ] && { echo "Error: ${STEPS[0]} failed. Exiting."; exit 1; } || true;
 
    echo "=== Logs for asynchronous ray and skypilot installation ===";
-
-
-
-
-    [ -f /tmp/${STEPS[1]}.failed ] && { echo "Error: ${STEPS[1]} failed. Exiting."; exit 1; } || true;
-    fi
+    ([ -f /tmp/ray_skypilot_installation_complete ]|| [ -f /tmp/${STEPS[1]}.failed ]) && cat /tmp/${STEPS[1]}.log ||
+    { tail -f -n +1 /tmp/${STEPS[1]}.log & TAIL_PID=$!; echo "Tail PID: $TAIL_PID"; sleep 0.5; until [ -f /tmp/ray_skypilot_installation_complete ] || [ -f /tmp/${STEPS[1]}.failed ]; do sleep 0.5; done; kill $TAIL_PID || true; };
+    [ -f /tmp/${STEPS[1]}.failed ] && { echo "Error: ${STEPS[1]} failed. Exiting."; exit 1; } || true;
+
    end_epoch=$(date +%s);
    echo "=== Ray and skypilot dependencies installation completed in $(($end_epoch - $start_epoch)) secs ===";
    start_epoch=$(date +%s);
    {{ skypilot_wheel_installation_commands }}
+    {{ copy_skypilot_templates_commands }}
    end_epoch=$(date +%s);
    echo "=== Skypilot wheel installation completed in $(($end_epoch - $start_epoch)) secs ===";
    start_epoch=$(date +%s);
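
The setup_commands rewrite above streams each asynchronous step's log instead of waiting silently: if the step already finished (a sentinel file exists), it just cats the log; otherwise it tails the log in the background until a `*_complete` or `*.failed` sentinel appears, then kills the tail and fails fast on `*.failed`. A minimal sketch of the same pattern with hypothetical file names:

    ( sleep 2; echo "step output" >> step.log; touch step.done ) &  # stand-in for the real async step
    touch step.log
    tail -f -n +1 step.log & TAIL_PID=$!
    until [ -f step.done ] || [ -f step.failed ]; do sleep 0.5; done
    kill "$TAIL_PID" 2>/dev/null || true
    [ -f step.failed ] && { echo "step failed"; exit 1; }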

sky/templates/lambda-ray.yml.j2
CHANGED

@@ -91,6 +91,7 @@ setup_commands:
     rm ~/.local/bin/pip ~/.local/bin/pip3 ~/.local/bin/pip3.8 ~/.local/bin/pip3.10;
     {{ conda_installation_commands }}
     {{ ray_skypilot_installation_commands }}
+    {{ copy_skypilot_templates_commands }}
     touch ~/.sudo_as_admin_successful;
     sudo bash -c 'rm -rf /etc/security/limits.d; echo "* soft nofile 1048576" >> /etc/security/limits.conf; echo "* hard nofile 1048576" >> /etc/security/limits.conf';
     sudo grep -e '^DefaultTasksMax' /etc/systemd/system.conf || (sudo bash -c 'echo "DefaultTasksMax=infinity" >> /etc/systemd/system.conf'); sudo systemctl set-property user-$(id -u $(whoami)).slice TasksMax=infinity; sudo systemctl daemon-reload;

sky/templates/nebius-ray.yml.j2
CHANGED

@@ -10,6 +10,7 @@ provider:
   module: sky.provision.nebius
   region: "{{region}}"
   use_internal_ips: {{use_internal_ips}}
+  use_static_ip_address: {{ use_static_ip_address }}
 
   {%- if docker_image is not none %}
   docker:

@@ -150,11 +151,13 @@ setup_commands:
     mkdir -p ~/.ssh; touch ~/.ssh/config;
     {{ conda_installation_commands }}
     {{ ray_skypilot_installation_commands }}
+    {{ copy_skypilot_templates_commands }}
     {%- if env_vars is defined %}
     {%- for env_var, env_value in env_vars.items() %}
     echo '{{env_var}}={{env_value}}' | sudo tee -a /etc/environment;
     {%- endfor %}
     {%- endif %}
+    IP=$(hostname -I | awk '{print $1}'); echo "$IP $(hostname)" | sudo tee -a /etc/hosts;
     sudo bash -c 'rm -rf /etc/security/limits.d; echo "* soft nofile 1048576" >> /etc/security/limits.conf; echo "* hard nofile 1048576" >> /etc/security/limits.conf';
     sudo grep -e '^DefaultTasksMax' /etc/systemd/system.conf || (sudo bash -c 'echo "DefaultTasksMax=infinity" >> /etc/systemd/system.conf'); sudo systemctl set-property user-$(id -u $(whoami)).slice TasksMax=infinity; sudo systemctl daemon-reload;
     mkdir -p ~/.ssh; (grep -Pzo -q "Host \*\n  StrictHostKeyChecking no\n  IdentityFile ~/.ssh/sky-cluster-key\n  IdentityFile ~/.ssh/id_rsa" ~/.ssh/config) || printf "Host *\n  StrictHostKeyChecking no\n  IdentityFile ~/.ssh/sky-cluster-key\n  IdentityFile ~/.ssh/id_rsa\n" >> ~/.ssh/config;

sky/templates/oci-ray.yml.j2
CHANGED

@@ -85,6 +85,7 @@ setup_commands:
     mkdir -p ~/.ssh; touch ~/.ssh/config;
     {{ conda_installation_commands }}
     {{ ray_skypilot_installation_commands }}
+    {{ copy_skypilot_templates_commands }}
     touch ~/.sudo_as_admin_successful;
     sudo bash -c 'rm -rf /etc/security/limits.d; echo "* soft nofile 1048576" >> /etc/security/limits.conf; echo "* hard nofile 1048576" >> /etc/security/limits.conf';
     sudo grep -e '^DefaultTasksMax' /etc/systemd/system.conf || (sudo bash -c 'echo "DefaultTasksMax=infinity" >> /etc/systemd/system.conf'); sudo systemctl set-property user-$(id -u $(whoami)).slice TasksMax=infinity; sudo systemctl daemon-reload;

sky/templates/paperspace-ray.yml.j2
CHANGED

@@ -87,6 +87,7 @@ setup_commands:
     mkdir -p ~/.ssh; touch ~/.ssh/config;
     {{ conda_installation_commands }}
     {{ ray_skypilot_installation_commands }}
+    {{ copy_skypilot_templates_commands }}
     sudo bash -c 'rm -rf /etc/security/limits.d; echo "* soft nofile 1048576" >> /etc/security/limits.conf; echo "* hard nofile 1048576" >> /etc/security/limits.conf';
     sudo grep -e '^DefaultTasksMax' /etc/systemd/system.conf || (sudo bash -c 'echo "DefaultTasksMax=infinity" >> /etc/systemd/system.conf'); sudo systemctl set-property user-$(id -u $(whoami)).slice TasksMax=infinity; sudo systemctl daemon-reload;
     mkdir -p ~/.ssh; (grep -Pzo -q "Host \*\n  StrictHostKeyChecking no\n  IdentityFile ~/.ssh/sky-cluster-key\n  IdentityFile ~/.ssh/id_rsa" ~/.ssh/config) || printf "Host *\n  StrictHostKeyChecking no\n  IdentityFile ~/.ssh/sky-cluster-key\n  IdentityFile ~/.ssh/id_rsa\n" >> ~/.ssh/config;

sky/templates/primeintellect-ray.yml.j2
ADDED

@@ -0,0 +1,72 @@
+cluster_name: {{cluster_name_on_cloud}}
+
+# The maximum number of workers nodes to launch in addition to the head node.
+max_workers: {{num_nodes - 1}}
+upscaling_speed: {{num_nodes - 1}}
+idle_timeout_minutes: 60
+
+provider:
+  type: external
+  module: sky.provision.primeintellect
+  region: "{{region}}"
+  zones: "{{zones}}"
+
+auth:
+  ssh_user: skypilot:ssh_user
+  ssh_private_key: {{ssh_private_key}}
+
+available_node_types:
+  ray_head_default:
+    resources: {}
+    node_config:
+      InstanceType: {{instance_type}}
+      DiskSize: {{disk_size}}
+      ImageId: {{image_id}}
+      PublicKey: |-
+        skypilot:ssh_public_key_content
+
+head_node_type: ray_head_default
+
+# Format: `REMOTE_PATH : LOCAL_PATH`
+file_mounts: {
+  "{{sky_ray_yaml_remote_path}}": "{{sky_ray_yaml_local_path}}",
+  "{{sky_remote_path}}/{{sky_wheel_hash}}": "{{sky_local_path}}",
+{%- for remote_path, local_path in credentials.items() %}
+  "{{remote_path}}": "{{local_path}}",
+  "~/.ssh/sky-cluster-key": "{{ssh_private_key}}",
+{%- endfor %}
+}
+
+rsync_exclude: []
+
+initialization_commands: []
+
+# List of shell commands to run to set up nodes.
+# NOTE: these are very performance-sensitive. Each new item opens/closes an SSH
+# connection, which is expensive. Try your best to co-locate commands into fewer
+# items!
+#
+# Increment the following for catching performance bugs easier:
+# current num items (num SSH connections): 1
+setup_commands:
+  # Disable unattended-upgrades and handle apt-get locks
+  # Install patch utility for Ray
+  # Install conda and Ray
+  # Set system limits for Ray performance (nofile and TasksMax)
+  - {%- for initial_setup_command in initial_setup_commands %}
+    {{ initial_setup_command }}
+    {%- endfor %}
+    sudo systemctl stop unattended-upgrades || true;
+    sudo systemctl disable unattended-upgrades || true;
+    sudo sed -i 's/Unattended-Upgrade "1"/Unattended-Upgrade "0"/g' /etc/apt/apt.conf.d/20auto-upgrades || true;
+    sudo kill -9 `sudo lsof /var/lib/dpkg/lock-frontend | awk '{print $2}' | tail -n 1` || true;
+    sudo pkill -9 apt-get;
+    sudo pkill -9 dpkg;
+    sudo dpkg --configure -a;
+    which patch > /dev/null || sudo apt install -y patch;
+    {{ conda_installation_commands }}
+    {{ ray_skypilot_installation_commands }}
+    {{ copy_skypilot_templates_commands }}
+    sudo bash -c 'rm -rf /etc/security/limits.d; echo "* soft nofile 1048576" >> /etc/security/limits.conf; echo "* hard nofile 1048576" >> /etc/security/limits.conf';
+    sudo grep -e '^DefaultTasksMax' /etc/systemd/system.conf || (sudo bash -c 'echo "DefaultTasksMax=infinity" >> /etc/systemd/system.conf'); sudo systemctl set-property user-$(id -u $(whoami)).slice TasksMax=infinity; sudo systemctl daemon-reload;
+    {{ ssh_max_sessions_config }}

sky/templates/runpod-ray.yml.j2
CHANGED

@@ -93,6 +93,7 @@ setup_commands:
     mkdir -p ~/.ssh; touch ~/.ssh/config;
     {{ conda_installation_commands }}
     {{ ray_skypilot_installation_commands }}
+    {{ copy_skypilot_templates_commands }}
     touch ~/.sudo_as_admin_successful;
     sudo bash -c 'rm -rf /etc/security/limits.d; echo "* soft nofile 1048576" >> /etc/security/limits.conf; echo "* hard nofile 1048576" >> /etc/security/limits.conf';
     sudo grep -e '^DefaultTasksMax' /etc/systemd/system.conf || (sudo bash -c 'echo "DefaultTasksMax=infinity" >> /etc/systemd/system.conf'); sudo systemctl set-property user-$(id -u $(whoami)).slice TasksMax=infinity; sudo systemctl daemon-reload;

sky/templates/scp-ray.yml.j2
CHANGED

@@ -56,6 +56,7 @@ setup_commands:
   - mkdir -p ~/.ssh; touch ~/.ssh/config;
     {{ conda_installation_commands }}
     {{ ray_skypilot_installation_commands }}
+    {{ copy_skypilot_templates_commands }}
     sudo bash -c 'rm -rf /etc/security/limits.d; echo "* soft nofile 1048576" >> /etc/security/limits.conf; echo "* hard nofile 1048576" >> /etc/security/limits.conf';
     sudo grep -e '^DefaultTasksMax' /etc/systemd/system.conf || (sudo bash -c 'echo "DefaultTasksMax=infinity" >> /etc/systemd/system.conf'); sudo systemctl set-property user-$(id -u $(whoami)).slice TasksMax=infinity; sudo systemctl daemon-reload;
     mkdir -p ~/.ssh; (grep -Pzo -q "Host \*\n  StrictHostKeyChecking no\n  IdentityFile ~/.ssh/sky-cluster-key\n  IdentityFile ~/.ssh/id_rsa" ~/.ssh/config) || printf "Host *\n  StrictHostKeyChecking no\n  IdentityFile ~/.ssh/sky-cluster-key\n  IdentityFile ~/.ssh/id_rsa\n" >> ~/.ssh/config;