skypilot-nightly 1.0.0.dev20250905__py3-none-any.whl → 1.0.0.dev20251203__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sky/__init__.py +10 -2
- sky/adaptors/aws.py +81 -16
- sky/adaptors/common.py +25 -2
- sky/adaptors/coreweave.py +278 -0
- sky/adaptors/do.py +8 -2
- sky/adaptors/gcp.py +11 -0
- sky/adaptors/ibm.py +5 -2
- sky/adaptors/kubernetes.py +64 -0
- sky/adaptors/nebius.py +3 -1
- sky/adaptors/primeintellect.py +1 -0
- sky/adaptors/seeweb.py +183 -0
- sky/adaptors/shadeform.py +89 -0
- sky/admin_policy.py +20 -0
- sky/authentication.py +157 -263
- sky/backends/__init__.py +3 -2
- sky/backends/backend.py +11 -3
- sky/backends/backend_utils.py +588 -184
- sky/backends/cloud_vm_ray_backend.py +1088 -904
- sky/backends/local_docker_backend.py +9 -5
- sky/backends/task_codegen.py +633 -0
- sky/backends/wheel_utils.py +18 -0
- sky/catalog/__init__.py +8 -0
- sky/catalog/aws_catalog.py +4 -0
- sky/catalog/common.py +19 -1
- sky/catalog/data_fetchers/fetch_aws.py +102 -80
- sky/catalog/data_fetchers/fetch_gcp.py +30 -3
- sky/catalog/data_fetchers/fetch_nebius.py +9 -6
- sky/catalog/data_fetchers/fetch_runpod.py +698 -0
- sky/catalog/data_fetchers/fetch_seeweb.py +329 -0
- sky/catalog/data_fetchers/fetch_shadeform.py +142 -0
- sky/catalog/kubernetes_catalog.py +24 -28
- sky/catalog/primeintellect_catalog.py +95 -0
- sky/catalog/runpod_catalog.py +5 -1
- sky/catalog/seeweb_catalog.py +184 -0
- sky/catalog/shadeform_catalog.py +165 -0
- sky/check.py +73 -43
- sky/client/cli/command.py +675 -412
- sky/client/cli/flags.py +4 -2
- sky/{volumes/utils.py → client/cli/table_utils.py} +111 -13
- sky/client/cli/utils.py +79 -0
- sky/client/common.py +12 -2
- sky/client/sdk.py +132 -63
- sky/client/sdk_async.py +34 -33
- sky/cloud_stores.py +82 -3
- sky/clouds/__init__.py +6 -0
- sky/clouds/aws.py +337 -129
- sky/clouds/azure.py +24 -18
- sky/clouds/cloud.py +40 -13
- sky/clouds/cudo.py +16 -13
- sky/clouds/do.py +9 -7
- sky/clouds/fluidstack.py +12 -5
- sky/clouds/gcp.py +14 -7
- sky/clouds/hyperbolic.py +12 -5
- sky/clouds/ibm.py +12 -5
- sky/clouds/kubernetes.py +80 -45
- sky/clouds/lambda_cloud.py +12 -5
- sky/clouds/nebius.py +23 -9
- sky/clouds/oci.py +19 -12
- sky/clouds/paperspace.py +4 -1
- sky/clouds/primeintellect.py +317 -0
- sky/clouds/runpod.py +85 -24
- sky/clouds/scp.py +12 -8
- sky/clouds/seeweb.py +477 -0
- sky/clouds/shadeform.py +400 -0
- sky/clouds/ssh.py +4 -2
- sky/clouds/utils/scp_utils.py +61 -50
- sky/clouds/vast.py +33 -27
- sky/clouds/vsphere.py +14 -16
- sky/core.py +174 -165
- sky/dashboard/out/404.html +1 -1
- sky/dashboard/out/_next/static/96_E2yl3QAiIJGOYCkSpB/_buildManifest.js +1 -0
- sky/dashboard/out/_next/static/chunks/1141-e6aa9ab418717c59.js +11 -0
- sky/dashboard/out/_next/static/chunks/1871-7e202677c42f43fe.js +6 -0
- sky/dashboard/out/_next/static/chunks/2260-7703229c33c5ebd5.js +1 -0
- sky/dashboard/out/_next/static/chunks/2369.fc20f0c2c8ed9fe7.js +15 -0
- sky/dashboard/out/_next/static/chunks/2755.edd818326d489a1d.js +26 -0
- sky/dashboard/out/_next/static/chunks/3294.20a8540fe697d5ee.js +1 -0
- sky/dashboard/out/_next/static/chunks/3785.7e245f318f9d1121.js +1 -0
- sky/dashboard/out/_next/static/chunks/{6601-06114c982db410b6.js → 3800-7b45f9fbb6308557.js} +1 -1
- sky/dashboard/out/_next/static/chunks/4725.172ede95d1b21022.js +1 -0
- sky/dashboard/out/_next/static/chunks/4937.a2baa2df5572a276.js +15 -0
- sky/dashboard/out/_next/static/chunks/6212-7bd06f60ba693125.js +13 -0
- sky/dashboard/out/_next/static/chunks/6856-8f27d1c10c98def8.js +1 -0
- sky/dashboard/out/_next/static/chunks/6990-9146207c4567fdfd.js +1 -0
- sky/dashboard/out/_next/static/chunks/7359-c8d04e06886000b3.js +30 -0
- sky/dashboard/out/_next/static/chunks/7615-019513abc55b3b47.js +1 -0
- sky/dashboard/out/_next/static/chunks/8640.5b9475a2d18c5416.js +16 -0
- sky/dashboard/out/_next/static/chunks/8969-452f9d5cbdd2dc73.js +1 -0
- sky/dashboard/out/_next/static/chunks/9025.fa408f3242e9028d.js +6 -0
- sky/dashboard/out/_next/static/chunks/9353-cff34f7e773b2e2b.js +1 -0
- sky/dashboard/out/_next/static/chunks/9360.a536cf6b1fa42355.js +31 -0
- sky/dashboard/out/_next/static/chunks/9847.3aaca6bb33455140.js +30 -0
- sky/dashboard/out/_next/static/chunks/pages/{_app-ce361c6959bc2001.js → _app-bde01e4a2beec258.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-792db96d918c98c9.js +16 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-abfcac9c137aa543.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters-ee39056f9851a3ff.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/infra/{[context]-6563820e094f68ca.js → [context]-c0b5935149902e6f.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/{infra-aabba60d57826e0f.js → infra-aed0ea19df7cf961.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-d66997e2bfc837cf.js +16 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs/pools/[pool]-9faf940b253e3e06.js +21 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs-2072b48b617989c9.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/users-f42674164aa73423.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/volumes-b84b948ff357c43e.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/workspaces/{[name]-af76bb06dbb3954f.js → [name]-84a40f8c7c627fe4.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/{workspaces-7598c33a746cdc91.js → workspaces-531b2f8c4bf89f82.js} +1 -1
- sky/dashboard/out/_next/static/chunks/webpack-64e05f17bf2cf8ce.js +1 -0
- sky/dashboard/out/_next/static/css/0748ce22df867032.css +3 -0
- sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
- sky/dashboard/out/clusters/[cluster].html +1 -1
- sky/dashboard/out/clusters.html +1 -1
- sky/dashboard/out/config.html +1 -1
- sky/dashboard/out/index.html +1 -1
- sky/dashboard/out/infra/[context].html +1 -1
- sky/dashboard/out/infra.html +1 -1
- sky/dashboard/out/jobs/[job].html +1 -1
- sky/dashboard/out/jobs/pools/[pool].html +1 -1
- sky/dashboard/out/jobs.html +1 -1
- sky/dashboard/out/users.html +1 -1
- sky/dashboard/out/volumes.html +1 -1
- sky/dashboard/out/workspace/new.html +1 -1
- sky/dashboard/out/workspaces/[name].html +1 -1
- sky/dashboard/out/workspaces.html +1 -1
- sky/data/data_utils.py +92 -1
- sky/data/mounting_utils.py +162 -29
- sky/data/storage.py +200 -19
- sky/data/storage_utils.py +10 -45
- sky/exceptions.py +18 -7
- sky/execution.py +74 -31
- sky/global_user_state.py +605 -191
- sky/jobs/__init__.py +2 -0
- sky/jobs/client/sdk.py +101 -4
- sky/jobs/client/sdk_async.py +31 -5
- sky/jobs/constants.py +15 -8
- sky/jobs/controller.py +726 -284
- sky/jobs/file_content_utils.py +128 -0
- sky/jobs/log_gc.py +193 -0
- sky/jobs/recovery_strategy.py +250 -100
- sky/jobs/scheduler.py +271 -173
- sky/jobs/server/core.py +367 -114
- sky/jobs/server/server.py +81 -35
- sky/jobs/server/utils.py +89 -35
- sky/jobs/state.py +1498 -620
- sky/jobs/utils.py +771 -306
- sky/logs/agent.py +40 -5
- sky/logs/aws.py +9 -19
- sky/metrics/utils.py +282 -39
- sky/optimizer.py +1 -1
- sky/provision/__init__.py +37 -1
- sky/provision/aws/config.py +34 -13
- sky/provision/aws/instance.py +5 -2
- sky/provision/azure/instance.py +5 -3
- sky/provision/common.py +2 -0
- sky/provision/cudo/instance.py +4 -3
- sky/provision/do/instance.py +4 -3
- sky/provision/docker_utils.py +97 -26
- sky/provision/fluidstack/instance.py +6 -5
- sky/provision/gcp/config.py +6 -1
- sky/provision/gcp/instance.py +4 -2
- sky/provision/hyperbolic/instance.py +4 -2
- sky/provision/instance_setup.py +66 -20
- sky/provision/kubernetes/__init__.py +2 -0
- sky/provision/kubernetes/config.py +7 -44
- sky/provision/kubernetes/constants.py +0 -1
- sky/provision/kubernetes/instance.py +609 -213
- sky/provision/kubernetes/manifests/fusermount-server-daemonset.yaml +1 -2
- sky/provision/kubernetes/network.py +12 -8
- sky/provision/kubernetes/network_utils.py +8 -25
- sky/provision/kubernetes/utils.py +382 -418
- sky/provision/kubernetes/volume.py +150 -18
- sky/provision/lambda_cloud/instance.py +16 -13
- sky/provision/nebius/instance.py +6 -2
- sky/provision/nebius/utils.py +103 -86
- sky/provision/oci/instance.py +4 -2
- sky/provision/paperspace/instance.py +4 -3
- sky/provision/primeintellect/__init__.py +10 -0
- sky/provision/primeintellect/config.py +11 -0
- sky/provision/primeintellect/instance.py +454 -0
- sky/provision/primeintellect/utils.py +398 -0
- sky/provision/provisioner.py +30 -9
- sky/provision/runpod/__init__.py +2 -0
- sky/provision/runpod/instance.py +4 -3
- sky/provision/runpod/volume.py +69 -13
- sky/provision/scp/instance.py +307 -130
- sky/provision/seeweb/__init__.py +11 -0
- sky/provision/seeweb/config.py +13 -0
- sky/provision/seeweb/instance.py +812 -0
- sky/provision/shadeform/__init__.py +11 -0
- sky/provision/shadeform/config.py +12 -0
- sky/provision/shadeform/instance.py +351 -0
- sky/provision/shadeform/shadeform_utils.py +83 -0
- sky/provision/vast/instance.py +5 -3
- sky/provision/volume.py +164 -0
- sky/provision/vsphere/common/ssl_helper.py +1 -1
- sky/provision/vsphere/common/vapiconnect.py +2 -1
- sky/provision/vsphere/common/vim_utils.py +3 -2
- sky/provision/vsphere/instance.py +8 -6
- sky/provision/vsphere/vsphere_utils.py +8 -1
- sky/resources.py +11 -3
- sky/schemas/api/responses.py +107 -6
- sky/schemas/db/global_user_state/008_skylet_ssh_tunnel_metadata.py +34 -0
- sky/schemas/db/global_user_state/009_last_activity_and_launched_at.py +89 -0
- sky/schemas/db/global_user_state/010_save_ssh_key.py +66 -0
- sky/schemas/db/global_user_state/011_is_ephemeral.py +34 -0
- sky/schemas/db/kv_cache/001_initial_schema.py +29 -0
- sky/schemas/db/serve_state/002_yaml_content.py +34 -0
- sky/schemas/db/skypilot_config/001_initial_schema.py +30 -0
- sky/schemas/db/spot_jobs/002_cluster_pool.py +3 -3
- sky/schemas/db/spot_jobs/004_job_file_contents.py +42 -0
- sky/schemas/db/spot_jobs/005_logs_gc.py +38 -0
- sky/schemas/db/spot_jobs/006_controller_pid_started_at.py +34 -0
- sky/schemas/db/spot_jobs/007_config_file_content.py +34 -0
- sky/schemas/generated/jobsv1_pb2.py +86 -0
- sky/schemas/generated/jobsv1_pb2.pyi +254 -0
- sky/schemas/generated/jobsv1_pb2_grpc.py +542 -0
- sky/schemas/generated/managed_jobsv1_pb2.py +76 -0
- sky/schemas/generated/managed_jobsv1_pb2.pyi +278 -0
- sky/schemas/generated/managed_jobsv1_pb2_grpc.py +278 -0
- sky/schemas/generated/servev1_pb2.py +58 -0
- sky/schemas/generated/servev1_pb2.pyi +115 -0
- sky/schemas/generated/servev1_pb2_grpc.py +322 -0
- sky/serve/autoscalers.py +2 -0
- sky/serve/client/impl.py +55 -21
- sky/serve/constants.py +4 -3
- sky/serve/controller.py +17 -11
- sky/serve/load_balancing_policies.py +1 -1
- sky/serve/replica_managers.py +219 -142
- sky/serve/serve_rpc_utils.py +179 -0
- sky/serve/serve_state.py +63 -54
- sky/serve/serve_utils.py +145 -109
- sky/serve/server/core.py +46 -25
- sky/serve/server/impl.py +311 -162
- sky/serve/server/server.py +21 -19
- sky/serve/service.py +84 -68
- sky/serve/service_spec.py +45 -7
- sky/server/auth/loopback.py +38 -0
- sky/server/auth/oauth2_proxy.py +12 -7
- sky/server/common.py +47 -24
- sky/server/config.py +62 -28
- sky/server/constants.py +9 -1
- sky/server/daemons.py +109 -38
- sky/server/metrics.py +76 -96
- sky/server/middleware_utils.py +166 -0
- sky/server/requests/executor.py +381 -145
- sky/server/requests/payloads.py +71 -18
- sky/server/requests/preconditions.py +15 -13
- sky/server/requests/request_names.py +121 -0
- sky/server/requests/requests.py +507 -157
- sky/server/requests/serializers/decoders.py +48 -17
- sky/server/requests/serializers/encoders.py +85 -20
- sky/server/requests/threads.py +117 -0
- sky/server/rest.py +116 -24
- sky/server/server.py +420 -172
- sky/server/stream_utils.py +219 -45
- sky/server/uvicorn.py +30 -19
- sky/setup_files/MANIFEST.in +6 -1
- sky/setup_files/alembic.ini +8 -0
- sky/setup_files/dependencies.py +62 -19
- sky/setup_files/setup.py +44 -44
- sky/sky_logging.py +13 -5
- sky/skylet/attempt_skylet.py +106 -24
- sky/skylet/configs.py +3 -1
- sky/skylet/constants.py +111 -26
- sky/skylet/events.py +64 -10
- sky/skylet/job_lib.py +141 -104
- sky/skylet/log_lib.py +233 -5
- sky/skylet/log_lib.pyi +40 -2
- sky/skylet/providers/ibm/node_provider.py +12 -8
- sky/skylet/providers/ibm/vpc_provider.py +13 -12
- sky/skylet/runtime_utils.py +21 -0
- sky/skylet/services.py +524 -0
- sky/skylet/skylet.py +22 -1
- sky/skylet/subprocess_daemon.py +104 -29
- sky/skypilot_config.py +99 -79
- sky/ssh_node_pools/server.py +9 -8
- sky/task.py +221 -104
- sky/templates/aws-ray.yml.j2 +1 -0
- sky/templates/azure-ray.yml.j2 +1 -0
- sky/templates/cudo-ray.yml.j2 +1 -0
- sky/templates/do-ray.yml.j2 +1 -0
- sky/templates/fluidstack-ray.yml.j2 +1 -0
- sky/templates/gcp-ray.yml.j2 +1 -0
- sky/templates/hyperbolic-ray.yml.j2 +1 -0
- sky/templates/ibm-ray.yml.j2 +2 -1
- sky/templates/jobs-controller.yaml.j2 +3 -0
- sky/templates/kubernetes-ray.yml.j2 +196 -55
- sky/templates/lambda-ray.yml.j2 +1 -0
- sky/templates/nebius-ray.yml.j2 +3 -0
- sky/templates/oci-ray.yml.j2 +1 -0
- sky/templates/paperspace-ray.yml.j2 +1 -0
- sky/templates/primeintellect-ray.yml.j2 +72 -0
- sky/templates/runpod-ray.yml.j2 +1 -0
- sky/templates/scp-ray.yml.j2 +1 -0
- sky/templates/seeweb-ray.yml.j2 +171 -0
- sky/templates/shadeform-ray.yml.j2 +73 -0
- sky/templates/vast-ray.yml.j2 +1 -0
- sky/templates/vsphere-ray.yml.j2 +1 -0
- sky/templates/websocket_proxy.py +188 -43
- sky/usage/usage_lib.py +16 -4
- sky/users/permission.py +60 -43
- sky/utils/accelerator_registry.py +6 -3
- sky/utils/admin_policy_utils.py +18 -5
- sky/utils/annotations.py +22 -0
- sky/utils/asyncio_utils.py +78 -0
- sky/utils/atomic.py +1 -1
- sky/utils/auth_utils.py +153 -0
- sky/utils/cli_utils/status_utils.py +12 -7
- sky/utils/cluster_utils.py +28 -6
- sky/utils/command_runner.py +88 -27
- sky/utils/command_runner.pyi +36 -3
- sky/utils/common.py +3 -1
- sky/utils/common_utils.py +37 -4
- sky/utils/config_utils.py +1 -14
- sky/utils/context.py +127 -40
- sky/utils/context_utils.py +73 -18
- sky/utils/controller_utils.py +229 -70
- sky/utils/db/db_utils.py +95 -18
- sky/utils/db/kv_cache.py +149 -0
- sky/utils/db/migration_utils.py +24 -7
- sky/utils/env_options.py +4 -0
- sky/utils/git.py +559 -1
- sky/utils/kubernetes/create_cluster.sh +15 -30
- sky/utils/kubernetes/delete_cluster.sh +10 -7
- sky/utils/kubernetes/{deploy_remote_cluster.py → deploy_ssh_node_pools.py} +258 -380
- sky/utils/kubernetes/generate_kind_config.py +6 -66
- sky/utils/kubernetes/gpu_labeler.py +13 -3
- sky/utils/kubernetes/k8s_gpu_labeler_job.yaml +2 -1
- sky/utils/kubernetes/k8s_gpu_labeler_setup.yaml +16 -16
- sky/utils/kubernetes/kubernetes_deploy_utils.py +213 -194
- sky/utils/kubernetes/rsync_helper.sh +11 -3
- sky/utils/kubernetes_enums.py +7 -15
- sky/utils/lock_events.py +4 -4
- sky/utils/locks.py +128 -31
- sky/utils/log_utils.py +0 -319
- sky/utils/resource_checker.py +13 -10
- sky/utils/resources_utils.py +53 -29
- sky/utils/rich_utils.py +8 -4
- sky/utils/schemas.py +107 -52
- sky/utils/subprocess_utils.py +17 -4
- sky/utils/thread_utils.py +91 -0
- sky/utils/timeline.py +2 -1
- sky/utils/ux_utils.py +35 -1
- sky/utils/volume.py +88 -4
- sky/utils/yaml_utils.py +9 -0
- sky/volumes/client/sdk.py +48 -10
- sky/volumes/server/core.py +59 -22
- sky/volumes/server/server.py +46 -17
- sky/volumes/volume.py +54 -42
- sky/workspaces/core.py +57 -21
- sky/workspaces/server.py +13 -12
- sky_templates/README.md +3 -0
- sky_templates/__init__.py +3 -0
- sky_templates/ray/__init__.py +0 -0
- sky_templates/ray/start_cluster +183 -0
- sky_templates/ray/stop_cluster +75 -0
- {skypilot_nightly-1.0.0.dev20250905.dist-info → skypilot_nightly-1.0.0.dev20251203.dist-info}/METADATA +331 -65
- skypilot_nightly-1.0.0.dev20251203.dist-info/RECORD +611 -0
- skypilot_nightly-1.0.0.dev20251203.dist-info/top_level.txt +2 -0
- sky/client/cli/git.py +0 -549
- sky/dashboard/out/_next/static/chunks/1121-408ed10b2f9fce17.js +0 -1
- sky/dashboard/out/_next/static/chunks/1141-943efc7aff0f0c06.js +0 -1
- sky/dashboard/out/_next/static/chunks/1836-37fede578e2da5f8.js +0 -40
- sky/dashboard/out/_next/static/chunks/3015-86cabed5d4669ad0.js +0 -1
- sky/dashboard/out/_next/static/chunks/3294.c80326aec9bfed40.js +0 -6
- sky/dashboard/out/_next/static/chunks/3785.4872a2f3aa489880.js +0 -1
- sky/dashboard/out/_next/static/chunks/4045.b30465273dc5e468.js +0 -21
- sky/dashboard/out/_next/static/chunks/4676-9da7fdbde90b5549.js +0 -10
- sky/dashboard/out/_next/static/chunks/4725.10f7a9a5d3ea8208.js +0 -1
- sky/dashboard/out/_next/static/chunks/5339.3fda4a4010ff4e06.js +0 -51
- sky/dashboard/out/_next/static/chunks/6135-4b4d5e824b7f9d3c.js +0 -1
- sky/dashboard/out/_next/static/chunks/649.b9d7f7d10c1b8c53.js +0 -45
- sky/dashboard/out/_next/static/chunks/6856-dca7962af4814e1b.js +0 -1
- sky/dashboard/out/_next/static/chunks/6990-08b2a1cae076a943.js +0 -1
- sky/dashboard/out/_next/static/chunks/7325.b4bc99ce0892dcd5.js +0 -6
- sky/dashboard/out/_next/static/chunks/754-d0da8ab45f9509e9.js +0 -18
- sky/dashboard/out/_next/static/chunks/7669.1f5d9a402bf5cc42.js +0 -36
- sky/dashboard/out/_next/static/chunks/8969-0be3036bf86f8256.js +0 -1
- sky/dashboard/out/_next/static/chunks/9025.c12318fb6a1a9093.js +0 -6
- sky/dashboard/out/_next/static/chunks/9037-fa1737818d0a0969.js +0 -6
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-1cbba24bd1bd35f8.js +0 -16
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-0b4b35dc1dfe046c.js +0 -16
- sky/dashboard/out/_next/static/chunks/pages/clusters-469814d711d63b1b.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-dd64309c3fe67ed2.js +0 -11
- sky/dashboard/out/_next/static/chunks/pages/jobs/pools/[pool]-07349868f7905d37.js +0 -16
- sky/dashboard/out/_next/static/chunks/pages/jobs-1f70d9faa564804f.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/users-018bf31cda52e11b.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/volumes-739726d6b823f532.js +0 -1
- sky/dashboard/out/_next/static/chunks/webpack-4fe903277b57b523.js +0 -1
- sky/dashboard/out/_next/static/css/4614e06482d7309e.css +0 -3
- sky/dashboard/out/_next/static/mS-4qZPSkRuA1u-g2wQhg/_buildManifest.js +0 -1
- sky/templates/kubernetes-ssh-jump.yml.j2 +0 -94
- sky/utils/kubernetes/ssh_jump_lifecycle_manager.py +0 -191
- skypilot_nightly-1.0.0.dev20250905.dist-info/RECORD +0 -547
- skypilot_nightly-1.0.0.dev20250905.dist-info/top_level.txt +0 -1
- /sky/dashboard/out/_next/static/{mS-4qZPSkRuA1u-g2wQhg → 96_E2yl3QAiIJGOYCkSpB}/_ssgManifest.js +0 -0
- {skypilot_nightly-1.0.0.dev20250905.dist-info → skypilot_nightly-1.0.0.dev20251203.dist-info}/WHEEL +0 -0
- {skypilot_nightly-1.0.0.dev20250905.dist-info → skypilot_nightly-1.0.0.dev20251203.dist-info}/entry_points.txt +0 -0
- {skypilot_nightly-1.0.0.dev20250905.dist-info → skypilot_nightly-1.0.0.dev20251203.dist-info}/licenses/LICENSE +0 -0
|
@@ -33,14 +33,11 @@ provider:
|
|
|
33
33
|
networking_mode: {{k8s_networking_mode}}
|
|
34
34
|
|
|
35
35
|
# We use internal IPs since we set up a port-forward between the kubernetes
|
|
36
|
-
# cluster and the local machine
|
|
37
|
-
# head node.
|
|
36
|
+
# cluster and the local machine.
|
|
38
37
|
use_internal_ips: true
|
|
39
38
|
|
|
40
39
|
timeout: {{timeout}}
|
|
41
40
|
|
|
42
|
-
ssh_jump_image: {{k8s_ssh_jump_image}}
|
|
43
|
-
|
|
44
41
|
# Namespace used to host SkyPilot system components, such as fuse device
|
|
45
42
|
# manager.
|
|
46
43
|
skypilot_system_namespace: {{k8s_skypilot_system_namespace}}
|
|
@@ -49,6 +46,10 @@ provider:
|
|
|
49
46
|
# Used to set up the necessary permissions and sidecars.
|
|
50
47
|
fuse_device_required: {{k8s_fuse_device_required}}
|
|
51
48
|
|
|
49
|
+
{% if ephemeral_volume_mounts %}
|
|
50
|
+
ephemeral_volume_specs: {{ephemeral_volume_mounts | tojson}}
|
|
51
|
+
{% endif %}
|
|
52
|
+
|
|
52
53
|
# ServiceAccount created by the autoscaler for the head node pod that it
|
|
53
54
|
# runs in. If this field isn't provided, the head pod config below must
|
|
54
55
|
# contain a user-created service account with the proper permissions.
|
|
@@ -212,7 +213,9 @@ provider:
|
|
|
212
213
|
metadata:
|
|
213
214
|
labels:
|
|
214
215
|
parent: skypilot
|
|
216
|
+
# TODO (kyuds): remove this label for v0.12.0, as skypilot-cluster label is deprecated in favor of skypilot-cluster-name.
|
|
215
217
|
skypilot-cluster: {{cluster_name_on_cloud}}
|
|
218
|
+
skypilot-cluster-name: {{cluster_name_on_cloud}}
|
|
216
219
|
skypilot-user: {{ user }}
|
|
217
220
|
name: {{cluster_name_on_cloud}}-head-ssh
|
|
218
221
|
spec:
|
|
@@ -230,7 +233,9 @@ provider:
|
|
|
230
233
|
metadata:
|
|
231
234
|
labels:
|
|
232
235
|
parent: skypilot
|
|
236
|
+
# TODO (kyuds): remove this label for v0.12.0, as skypilot-cluster label is deprecated in favor of skypilot-cluster-name.
|
|
233
237
|
skypilot-cluster: {{cluster_name_on_cloud}}
|
|
238
|
+
skypilot-cluster-name: {{cluster_name_on_cloud}}
|
|
234
239
|
skypilot-user: {{ user }}
|
|
235
240
|
# NOTE: If you're running multiple Ray clusters with services
|
|
236
241
|
# on one Kubernetes cluster, they must have unique service
|
|
@@ -250,7 +255,9 @@ provider:
|
|
|
250
255
|
metadata:
|
|
251
256
|
labels:
|
|
252
257
|
parent: skypilot
|
|
258
|
+
# TODO (kyuds): remove this label for v0.12.0, as skypilot-cluster label is deprecated in favor of skypilot-cluster-name.
|
|
253
259
|
skypilot-cluster: {{cluster_name_on_cloud}}
|
|
260
|
+
skypilot-cluster-name: {{cluster_name_on_cloud}}
|
|
254
261
|
skypilot-user: {{ user }}
|
|
255
262
|
name: {{cluster_name_on_cloud}}-worker{{ worker_id }}
|
|
256
263
|
spec:
|
|
@@ -275,9 +282,8 @@ available_node_types:
|
|
|
275
282
|
labels:
|
|
276
283
|
parent: skypilot
|
|
277
284
|
# component will be set for the head node pod to be the same as the head node service selector above if a
|
|
285
|
+
# TODO (kyuds): remove this label for v0.12.0, as skypilot-cluster label is deprecated in favor of skypilot-cluster-name.
|
|
278
286
|
skypilot-cluster: {{cluster_name_on_cloud}}
|
|
279
|
-
# Identifies the SSH jump pod used by this pod. Used in life cycle management of the ssh jump pod.
|
|
280
|
-
skypilot-ssh-jump: {{k8s_ssh_jump_name}}
|
|
281
287
|
skypilot-user: {{ user }}
|
|
282
288
|
# Custom tags for the pods
|
|
283
289
|
{%- for label_key, label_value in labels.items() %}
|
|
@@ -444,9 +450,6 @@ available_node_types:
|
|
|
444
450
|
# object store. If you do not provide this, Ray will fall back to
|
|
445
451
|
# /tmp which cause slowdowns if is not a shared memory volume.
|
|
446
452
|
volumes:
|
|
447
|
-
- name: secret-volume
|
|
448
|
-
secret:
|
|
449
|
-
secretName: {{k8s_ssh_key_secret_name}}
|
|
450
453
|
- name: dshm
|
|
451
454
|
emptyDir:
|
|
452
455
|
medium: Memory
|
|
@@ -510,6 +513,16 @@ available_node_types:
|
|
|
510
513
|
valueFrom:
|
|
511
514
|
fieldRef:
|
|
512
515
|
fieldPath: metadata.labels['ray-node-type']
|
|
516
|
+
- name: SKYPILOT_POD_CPU_CORE_LIMIT
|
|
517
|
+
valueFrom:
|
|
518
|
+
resourceFieldRef:
|
|
519
|
+
containerName: ray-node
|
|
520
|
+
resource: requests.cpu
|
|
521
|
+
- name: SKYPILOT_POD_MEMORY_BYTES_LIMIT
|
|
522
|
+
valueFrom:
|
|
523
|
+
resourceFieldRef:
|
|
524
|
+
containerName: ray-node
|
|
525
|
+
resource: requests.memory
|
|
513
526
|
{% for key, value in k8s_env_vars.items() if k8s_env_vars is not none %}
|
|
514
527
|
- name: {{ key }}
|
|
515
528
|
value: {{ value }}
|
|
@@ -630,12 +643,17 @@ available_node_types:
|
|
|
630
643
|
command: ["/bin/bash", "-c", "--"]
|
|
631
644
|
args:
|
|
632
645
|
- |
|
|
633
|
-
#
|
|
634
|
-
#
|
|
635
|
-
|
|
636
|
-
|
|
637
|
-
#
|
|
638
|
-
|
|
646
|
+
# Set -x to print the commands and their arguments as they are executed.
|
|
647
|
+
# Useful for debugging.
|
|
648
|
+
set -x
|
|
649
|
+
|
|
650
|
+
# Execute user-provided post-provision runcmd
|
|
651
|
+
# before any of the SkyPilot setup commands.
|
|
652
|
+
{%- if runcmd %}
|
|
653
|
+
{%- for cmd in runcmd %}
|
|
654
|
+
{{cmd}}
|
|
655
|
+
{%- endfor %}
|
|
656
|
+
{%- endif %}
|
|
639
657
|
|
|
640
658
|
# Helper function to conditionally use sudo
|
|
641
659
|
# TODO(zhwu): consolidate the two prefix_cmd and sudo replacements
|
|
@@ -647,15 +665,125 @@ available_node_types:
|
|
|
647
665
|
# STEP 1: Run apt update, install missing packages, and set up ssh.
|
|
648
666
|
(
|
|
649
667
|
(
|
|
650
|
-
#
|
|
651
|
-
|
|
652
|
-
|
|
653
|
-
|
|
654
|
-
|
|
655
|
-
|
|
656
|
-
|
|
657
|
-
|
|
658
|
-
|
|
668
|
+
# Helper: run apt-get update with retries
|
|
669
|
+
apt_update_with_retries() {
|
|
670
|
+
# do not fail the whole shell; we handle return codes
|
|
671
|
+
set +e
|
|
672
|
+
local log=/tmp/apt-update.log
|
|
673
|
+
local tries=3
|
|
674
|
+
local delay=1
|
|
675
|
+
local i
|
|
676
|
+
for i in $(seq 1 $tries); do
|
|
677
|
+
DEBIAN_FRONTEND=noninteractive $(prefix_cmd) apt-get update >> "$log" 2>&1 && { set -e; return 0; }
|
|
678
|
+
echo "apt-get update attempt $i/$tries failed; retrying in ${delay}s" >> "$log"
|
|
679
|
+
sleep $delay
|
|
680
|
+
delay=$((delay * 2))
|
|
681
|
+
done
|
|
682
|
+
set -e
|
|
683
|
+
return 1
|
|
684
|
+
}
|
|
685
|
+
apt_install_with_retries() {
|
|
686
|
+
local packages="$@"
|
|
687
|
+
[ -z "$packages" ] && return 0
|
|
688
|
+
set +e
|
|
689
|
+
local log=/tmp/apt-update.log
|
|
690
|
+
local tries=3
|
|
691
|
+
local delay=1
|
|
692
|
+
local i
|
|
693
|
+
for i in $(seq 1 $tries); do
|
|
694
|
+
DEBIAN_FRONTEND=noninteractive $(prefix_cmd) apt-get install -y -o Dpkg::Options::="--force-confdef" -o Dpkg::Options::="--force-confold" $packages && { set -e; return 0; }
|
|
695
|
+
echo "apt-get install failed for: $packages (attempt $i/$tries). Running -f install and retrying..." >> "$log"
|
|
696
|
+
DEBIAN_FRONTEND=noninteractive $(prefix_cmd) apt-get -f install -y >> "$log" 2>&1 || true
|
|
697
|
+
DEBIAN_FRONTEND=noninteractive $(prefix_cmd) apt-get clean >> "$log" 2>&1 || true
|
|
698
|
+
sleep $delay
|
|
699
|
+
delay=$((delay * 2))
|
|
700
|
+
done
|
|
701
|
+
set -e
|
|
702
|
+
return 1
|
|
703
|
+
}
|
|
704
|
+
apt_update_install_with_retries() {
|
|
705
|
+
apt_update_with_retries
|
|
706
|
+
apt_install_with_retries "$@"
|
|
707
|
+
}
|
|
708
|
+
backup_dir=/etc/apt/sources.list.backup_skypilot
|
|
709
|
+
backup_source() {
|
|
710
|
+
$(prefix_cmd) mkdir -p "$backup_dir"
|
|
711
|
+
if [ -f /etc/apt/sources.list ] && [ ! -f "$backup_dir/sources.list" ]; then
|
|
712
|
+
$(prefix_cmd) cp -a /etc/apt/sources.list "$backup_dir/sources.list" || true
|
|
713
|
+
fi
|
|
714
|
+
}
|
|
715
|
+
restore_source() {
|
|
716
|
+
if [ -f "$backup_dir/sources.list" ]; then
|
|
717
|
+
$(prefix_cmd) cp -a "$backup_dir/sources.list" /etc/apt/sources.list || true
|
|
718
|
+
fi
|
|
719
|
+
}
|
|
720
|
+
update_apt_sources() {
|
|
721
|
+
local host=$1
|
|
722
|
+
local apt_file=$2
|
|
723
|
+
$(prefix_cmd) sed -i -E "s|https?://[a-zA-Z0-9.-]+\.ubuntu\.com/ubuntu|http://$host/ubuntu|g" $apt_file
|
|
724
|
+
}
|
|
725
|
+
# Helper: install packages across mirrors with retries
|
|
726
|
+
apt_install_with_mirrors() {
|
|
727
|
+
local required=$1; shift
|
|
728
|
+
local packages="$@"
|
|
729
|
+
[ -z "$packages" ] && return 0
|
|
730
|
+
set +e
|
|
731
|
+
# Install packages with default sources first
|
|
732
|
+
local log=/tmp/apt-update.log
|
|
733
|
+
echo "$(date +%Y-%m-%d\ %H:%M:%S) Installing packages: $packages" >> "$log"
|
|
734
|
+
restore_source
|
|
735
|
+
apt_update_install_with_retries $packages >> "$log" 2>&1 && { set -e; return 0; }
|
|
736
|
+
echo "Install failed with default sources: $packages" >> "$log"
|
|
737
|
+
# Detect distro (ubuntu/debian)
|
|
738
|
+
local APT_OS="unknown"
|
|
739
|
+
if [ -f /etc/os-release ]; then
|
|
740
|
+
. /etc/os-release
|
|
741
|
+
case "$ID" in
|
|
742
|
+
debian) APT_OS="debian" ;;
|
|
743
|
+
ubuntu) APT_OS="ubuntu" ;;
|
|
744
|
+
*)
|
|
745
|
+
if [ -n "$ID_LIKE" ]; then
|
|
746
|
+
case " $ID $ID_LIKE " in
|
|
747
|
+
*ubuntu*) APT_OS="ubuntu" ;;
|
|
748
|
+
*debian*) APT_OS="debian" ;;
|
|
749
|
+
esac
|
|
750
|
+
fi
|
|
751
|
+
;;
|
|
752
|
+
esac
|
|
753
|
+
fi
|
|
754
|
+
# Build mirror candidates
|
|
755
|
+
# deb.debian.org is a CDN endpoint, if one backend goes down,
|
|
756
|
+
# the CDN automatically fails over to another mirror,
|
|
757
|
+
# so we only retry for ubuntu here.
|
|
758
|
+
if [ "$APT_OS" = "ubuntu" ]; then
|
|
759
|
+
# Backup current sources once
|
|
760
|
+
backup_source
|
|
761
|
+
# Selected from https://launchpad.net/ubuntu/+archivemirrors
|
|
762
|
+
# and results from apt-select
|
|
763
|
+
local MIRROR_CANDIDATES="mirrors.wikimedia.org mirror.umd.edu"
|
|
764
|
+
for host in $MIRROR_CANDIDATES; do
|
|
765
|
+
echo "Trying APT mirror ($APT_OS): $host" >> "$log"
|
|
766
|
+
if [ -f /etc/apt/sources.list ]; then
|
|
767
|
+
update_apt_sources $host /etc/apt/sources.list
|
|
768
|
+
else
|
|
769
|
+
echo "Error: /etc/apt/sources.list not found" >> "$log"
|
|
770
|
+
break
|
|
771
|
+
fi
|
|
772
|
+
apt_update_install_with_retries $packages >> "$log" 2>&1 && { set -e; return 0; }
|
|
773
|
+
echo "Install failed with mirror ($APT_OS): $host" >> "$log"
|
|
774
|
+
# Restore to default sources
|
|
775
|
+
restore_source
|
|
776
|
+
done
|
|
777
|
+
fi
|
|
778
|
+
set -e
|
|
779
|
+
if [ "$required" = "1" ]; then
|
|
780
|
+
echo "Error: required package install failed across all mirrors: $packages" >> "$log"
|
|
781
|
+
return 1
|
|
782
|
+
else
|
|
783
|
+
echo "Optional package install failed across all mirrors: $packages; skipping." >> "$log"
|
|
784
|
+
return 0
|
|
785
|
+
fi
|
|
786
|
+
}
|
|
659
787
|
# Install both fuse2 and fuse3 for compatibility for all possible fuse adapters in advance,
|
|
660
788
|
# so that both fusemount and fusermount3 can be masked before enabling SSH access.
|
|
661
789
|
PACKAGES="rsync curl wget netcat gcc patch pciutils fuse fuse3 openssh-server";
|
|
@@ -682,7 +810,7 @@ available_node_types:
|
|
|
682
810
|
done;
|
|
683
811
|
if [ ! -z "$INSTALL_FIRST" ]; then
|
|
684
812
|
echo "Installing core packages: $INSTALL_FIRST";
|
|
685
|
-
|
|
813
|
+
apt_install_with_mirrors 1 $INSTALL_FIRST || { echo "Error: core package installation failed." >> /tmp/apt-update.log; exit 1; }
|
|
686
814
|
fi;
|
|
687
815
|
# SSH and other packages are not necessary, so we disable set -e
|
|
688
816
|
set +e
|
|
@@ -706,7 +834,8 @@ available_node_types:
|
|
|
706
834
|
fi
|
|
707
835
|
$(prefix_cmd) cp -p "$FUSERMOUNT_PATH" "${FUSERMOUNT_PATH}-original"
|
|
708
836
|
$(prefix_cmd) ln -sf {{k8s_fusermount_shared_dir}}/fusermount-shim "$FUSERMOUNT_PATH"
|
|
709
|
-
|
|
837
|
+
# "|| true" because fusermount3 is not always available
|
|
838
|
+
FUSERMOUNT3_PATH=$(which fusermount3) || true
|
|
710
839
|
if [ -z "$FUSERMOUNT3_PATH" ]; then
|
|
711
840
|
FUSERMOUNT3_PATH="${FUSERMOUNT_PATH}3"
|
|
712
841
|
fi
|
|
@@ -748,18 +877,23 @@ available_node_types:
|
|
|
748
877
|
$(prefix_cmd) mkdir -p ~/.ssh;
|
|
749
878
|
$(prefix_cmd) chown -R $(whoami) ~/.ssh;
|
|
750
879
|
$(prefix_cmd) chmod 700 ~/.ssh;
|
|
751
|
-
$(prefix_cmd) cat
|
|
880
|
+
$(prefix_cmd) cat > ~/.ssh/authorized_keys <<'SKYPILOT_SSH_KEY_EOF'
|
|
881
|
+
skypilot:ssh_public_key_content
|
|
882
|
+
SKYPILOT_SSH_KEY_EOF
|
|
752
883
|
$(prefix_cmd) chmod 644 ~/.ssh/authorized_keys;
|
|
753
884
|
$(prefix_cmd) service ssh restart;
|
|
754
885
|
$(prefix_cmd) sed -i "s/mesg n/tty -s \&\& mesg n/" ~/.profile;
|
|
755
886
|
|
|
756
887
|
touch /tmp/apt_ssh_setup_complete
|
|
757
888
|
echo "=== SSH setup completed ==="
|
|
758
|
-
) > /tmp/${STEPS[0]}.log 2>&1
|
|
759
|
-
|
|
889
|
+
) > /tmp/${STEPS[0]}.log 2>&1
|
|
890
|
+
if [ "$?" -ne "0" ]; then
|
|
891
|
+
{
|
|
892
|
+
echo "Error: ${STEPS[0]} failed. Continuing anyway..." > /tmp/${STEPS[0]}.failed 2>&1
|
|
760
893
|
cat /tmp/${STEPS[0]}.log
|
|
761
894
|
exit 1
|
|
762
|
-
|
|
895
|
+
}
|
|
896
|
+
fi
|
|
763
897
|
) &
|
|
764
898
|
|
|
765
899
|
# STEP 2: Install conda, ray and skypilot (for dependencies); start
|
|
@@ -777,15 +911,20 @@ available_node_types:
|
|
|
777
911
|
{{ conda_installation_commands }}
|
|
778
912
|
{{ ray_installation_commands }}
|
|
779
913
|
|
|
780
|
-
|
|
914
|
+
# set UV_SYSTEM_PYTHON to false in case the user provided docker image set it to true.
|
|
915
|
+
# unset PYTHONPATH in case the user provided docker image set it.
|
|
916
|
+
VIRTUAL_ENV=~/skypilot-runtime UV_SYSTEM_PYTHON=false env -u PYTHONPATH ~/.local/bin/uv pip install skypilot[kubernetes,remote]
|
|
781
917
|
# Wait for `patch` package to be installed before applying ray patches
|
|
782
918
|
until dpkg -l | grep -q "^ii patch "; do
|
|
783
919
|
sleep 0.1
|
|
784
920
|
echo "Waiting for patch package to be installed..."
|
|
785
921
|
done
|
|
786
922
|
# Apply Ray patches for progress bar fix
|
|
787
|
-
|
|
788
|
-
|
|
923
|
+
# set UV_SYSTEM_PYTHON to false in case the user provided docker image set it to true.
|
|
924
|
+
# unset PYTHONPATH in case the user provided docker image set it.
|
|
925
|
+
# ~/.sky/python_path is seeded by conda_installation_commands
|
|
926
|
+
VIRTUAL_ENV=~/skypilot-runtime UV_SYSTEM_PYTHON=false env -u PYTHONPATH ~/.local/bin/uv pip list | grep "ray " | grep 2.9.3 2>&1 > /dev/null && {
|
|
927
|
+
env -u PYTHONPATH $(cat ~/.sky/python_path) -c "from sky.skylet.ray_patches import patch; patch()" || exit 1;
|
|
789
928
|
}
|
|
790
929
|
touch /tmp/ray_skypilot_installation_complete
|
|
791
930
|
echo "=== Ray and skypilot installation completed ==="
|
|
@@ -814,11 +953,14 @@ available_node_types:
|
|
|
814
953
|
set +e
|
|
815
954
|
{{ ray_worker_start_command }}
|
|
816
955
|
fi
|
|
817
|
-
) > /tmp/${STEPS[1]}.log 2>&1
|
|
818
|
-
|
|
956
|
+
) > /tmp/${STEPS[1]}.log 2>&1
|
|
957
|
+
if [ "$?" -ne "0" ]; then
|
|
958
|
+
{
|
|
959
|
+
echo "Error: ${STEPS[1]} failed. Continuing anyway..." > /tmp/${STEPS[1]}.failed 2>&1
|
|
819
960
|
cat /tmp/${STEPS[1]}.log
|
|
820
961
|
exit 1
|
|
821
|
-
|
|
962
|
+
}
|
|
963
|
+
fi
|
|
822
964
|
) &
|
|
823
965
|
|
|
824
966
|
|
|
@@ -836,11 +978,14 @@ available_node_types:
|
|
|
836
978
|
fi;
|
|
837
979
|
fi;
|
|
838
980
|
export -p > ~/container_env_var.sh && $(prefix_cmd) mv ~/container_env_var.sh /etc/profile.d/container_env_var.sh
|
|
839
|
-
) > /tmp/${STEPS[2]}.log 2>&1
|
|
840
|
-
|
|
981
|
+
) > /tmp/${STEPS[2]}.log 2>&1
|
|
982
|
+
if [ "$?" -ne "0" ]; then
|
|
983
|
+
{
|
|
984
|
+
echo "Error: ${STEPS[2]} failed. Continuing anyway..." > /tmp/${STEPS[2]}.failed 2>&1
|
|
841
985
|
cat /tmp/${STEPS[2]}.log
|
|
842
986
|
exit 1
|
|
843
|
-
|
|
987
|
+
}
|
|
988
|
+
fi
|
|
844
989
|
) &
|
|
845
990
|
|
|
846
991
|
function mylsof { p=$(for pid in /proc/{0..9}*; do i=$(basename "$pid"); for file in "$pid"/fd/*; do link=$(readlink -e "$file"); if [ "$link" = "$1" ]; then echo "$i"; fi; done; done); echo "$p"; };
|
|
@@ -927,7 +1072,7 @@ available_node_types:
|
|
|
927
1072
|
# Also, skip the jobs that are waiting to be scheduled, as those do not have a controller process running.
|
|
928
1073
|
# For SkyServe, this will be None and every service will be recovered. This is because SkyServe
|
|
929
1074
|
# will delete the service from the database after it is terminated so everything in the database is running.
|
|
930
|
-
ALL_IN_PROGRESS_JOBS=$({{sky_python_cmd}} -c "from sky.jobs import state; jobs = state.
|
|
1075
|
+
ALL_IN_PROGRESS_JOBS=$({{sky_python_cmd}} -c "from sky.jobs import state; jobs, _ = state.get_managed_jobs_with_filters(fields=['job_id', 'schedule_state']); print(' '.join({str(job['job_id']) for job in jobs if job['schedule_state'] not in [state.ManagedJobScheduleState.DONE, state.ManagedJobScheduleState.WAITING]}) if jobs else None)")
|
|
931
1076
|
if [ "$ALL_IN_PROGRESS_JOBS" != "None" ]; then
|
|
932
1077
|
read -ra ALL_IN_PROGRESS_JOBS_SEQ <<< "$ALL_IN_PROGRESS_JOBS"
|
|
933
1078
|
fi
|
|
@@ -957,6 +1102,8 @@ available_node_types:
|
|
|
957
1102
|
|
|
958
1103
|
touch {{k8s_high_availability_deployment_volume_mount_path}}/k8s_container_ready
|
|
959
1104
|
{% endif %}
|
|
1105
|
+
# Set +x to stop printing the commands and their arguments as they are executed.
|
|
1106
|
+
set +x
|
|
960
1107
|
|
|
961
1108
|
trap : TERM INT; log_tail || sleep infinity & wait
|
|
962
1109
|
|
|
@@ -970,9 +1117,6 @@ available_node_types:
|
|
|
970
1117
|
# object store. If you do not provide this, Ray will fall back to
|
|
971
1118
|
# /tmp, which causes slowdowns if it is not a shared memory volume.
|
|
972
1119
|
volumeMounts:
|
|
973
|
-
- name: secret-volume
|
|
974
|
-
readOnly: true
|
|
975
|
-
mountPath: "/etc/secret-volume"
|
|
976
1120
|
- mountPath: /dev/shm
|
|
977
1121
|
name: dshm
|
|
978
1122
|
{% if k8s_enable_gpudirect_tcpx %}
|
|
@@ -1204,24 +1348,21 @@ setup_commands:
|
|
|
1204
1348
|
start_epoch=$(date +%s);
|
|
1205
1349
|
|
|
1206
1350
|
# Wait for SSH setup to complete before proceeding
|
|
1207
|
-
|
|
1208
|
-
|
|
1209
|
-
|
|
1210
|
-
|
|
1211
|
-
[ -f /tmp/${STEPS[0]}.failed ] && { echo "Error: ${STEPS[0]} failed. Exiting."; exit 1; } || true;
|
|
1212
|
-
fi
|
|
1351
|
+
echo "=== Logs for asynchronous SSH setup ===";
|
|
1352
|
+
([ -f /tmp/apt_ssh_setup_complete ]|| [ -f /tmp/${STEPS[0]}.failed ]) && cat /tmp/${STEPS[0]}.log ||
|
|
1353
|
+
{ tail -f -n +1 /tmp/${STEPS[0]}.log & TAIL_PID=$!; echo "Tail PID: $TAIL_PID"; sleep 0.5; until [ -f /tmp/apt_ssh_setup_complete ] || [ -f /tmp/${STEPS[0]}.failed ]; do sleep 0.5; done; kill $TAIL_PID || true; };
|
|
1354
|
+
[ -f /tmp/${STEPS[0]}.failed ] && { echo "Error: ${STEPS[0]} failed. Exiting."; exit 1; } || true;
|
|
1213
1355
|
|
|
1214
1356
|
echo "=== Logs for asynchronous ray and skypilot installation ===";
|
|
1215
|
-
|
|
1216
|
-
|
|
1217
|
-
|
|
1218
|
-
|
|
1219
|
-
[ -f /tmp/${STEPS[1]}.failed ] && { echo "Error: ${STEPS[1]} failed. Exiting."; exit 1; } || true;
|
|
1220
|
-
fi
|
|
1357
|
+
([ -f /tmp/ray_skypilot_installation_complete ]|| [ -f /tmp/${STEPS[1]}.failed ]) && cat /tmp/${STEPS[1]}.log ||
|
|
1358
|
+
{ tail -f -n +1 /tmp/${STEPS[1]}.log & TAIL_PID=$!; echo "Tail PID: $TAIL_PID"; sleep 0.5; until [ -f /tmp/ray_skypilot_installation_complete ] || [ -f /tmp/${STEPS[1]}.failed ]; do sleep 0.5; done; kill $TAIL_PID || true; };
|
|
1359
|
+
[ -f /tmp/${STEPS[1]}.failed ] && { echo "Error: ${STEPS[1]} failed. Exiting."; exit 1; } || true;
|
|
1360
|
+
|
|
1221
1361
|
end_epoch=$(date +%s);
|
|
1222
1362
|
echo "=== Ray and skypilot dependencies installation completed in $(($end_epoch - $start_epoch)) secs ===";
|
|
1223
1363
|
start_epoch=$(date +%s);
|
|
1224
1364
|
{{ skypilot_wheel_installation_commands }}
|
|
1365
|
+
{{ copy_skypilot_templates_commands }}
|
|
1225
1366
|
end_epoch=$(date +%s);
|
|
1226
1367
|
echo "=== Skypilot wheel installation completed in $(($end_epoch - $start_epoch)) secs ===";
|
|
1227
1368
|
start_epoch=$(date +%s);
|
sky/templates/lambda-ray.yml.j2
CHANGED
|
@@ -91,6 +91,7 @@ setup_commands:
|
|
|
91
91
|
rm ~/.local/bin/pip ~/.local/bin/pip3 ~/.local/bin/pip3.8 ~/.local/bin/pip3.10;
|
|
92
92
|
{{ conda_installation_commands }}
|
|
93
93
|
{{ ray_skypilot_installation_commands }}
|
|
94
|
+
{{ copy_skypilot_templates_commands }}
|
|
94
95
|
touch ~/.sudo_as_admin_successful;
|
|
95
96
|
sudo bash -c 'rm -rf /etc/security/limits.d; echo "* soft nofile 1048576" >> /etc/security/limits.conf; echo "* hard nofile 1048576" >> /etc/security/limits.conf';
|
|
96
97
|
sudo grep -e '^DefaultTasksMax' /etc/systemd/system.conf || (sudo bash -c 'echo "DefaultTasksMax=infinity" >> /etc/systemd/system.conf'); sudo systemctl set-property user-$(id -u $(whoami)).slice TasksMax=infinity; sudo systemctl daemon-reload;
|
sky/templates/nebius-ray.yml.j2
CHANGED
|
@@ -10,6 +10,7 @@ provider:
|
|
|
10
10
|
module: sky.provision.nebius
|
|
11
11
|
region: "{{region}}"
|
|
12
12
|
use_internal_ips: {{use_internal_ips}}
|
|
13
|
+
use_static_ip_address: {{ use_static_ip_address }}
|
|
13
14
|
|
|
14
15
|
{%- if docker_image is not none %}
|
|
15
16
|
docker:
|
|
@@ -150,11 +151,13 @@ setup_commands:
|
|
|
150
151
|
mkdir -p ~/.ssh; touch ~/.ssh/config;
|
|
151
152
|
{{ conda_installation_commands }}
|
|
152
153
|
{{ ray_skypilot_installation_commands }}
|
|
154
|
+
{{ copy_skypilot_templates_commands }}
|
|
153
155
|
{%- if env_vars is defined %}
|
|
154
156
|
{%- for env_var, env_value in env_vars.items() %}
|
|
155
157
|
echo '{{env_var}}={{env_value}}' | sudo tee -a /etc/environment;
|
|
156
158
|
{%- endfor %}
|
|
157
159
|
{%- endif %}
|
|
160
|
+
IP=$(hostname -I | awk '{print $1}'); echo "$IP $(hostname)" | sudo tee -a /etc/hosts;
|
|
158
161
|
sudo bash -c 'rm -rf /etc/security/limits.d; echo "* soft nofile 1048576" >> /etc/security/limits.conf; echo "* hard nofile 1048576" >> /etc/security/limits.conf';
|
|
159
162
|
sudo grep -e '^DefaultTasksMax' /etc/systemd/system.conf || (sudo bash -c 'echo "DefaultTasksMax=infinity" >> /etc/systemd/system.conf'); sudo systemctl set-property user-$(id -u $(whoami)).slice TasksMax=infinity; sudo systemctl daemon-reload;
|
|
160
163
|
mkdir -p ~/.ssh; (grep -Pzo -q "Host \*\n StrictHostKeyChecking no\n IdentityFile ~/.ssh/sky-cluster-key\n IdentityFile ~/.ssh/id_rsa" ~/.ssh/config) || printf "Host *\n StrictHostKeyChecking no\n IdentityFile ~/.ssh/sky-cluster-key\n IdentityFile ~/.ssh/id_rsa\n" >> ~/.ssh/config;
|
sky/templates/oci-ray.yml.j2
CHANGED
|
@@ -85,6 +85,7 @@ setup_commands:
|
|
|
85
85
|
mkdir -p ~/.ssh; touch ~/.ssh/config;
|
|
86
86
|
{{ conda_installation_commands }}
|
|
87
87
|
{{ ray_skypilot_installation_commands }}
|
|
88
|
+
{{ copy_skypilot_templates_commands }}
|
|
88
89
|
touch ~/.sudo_as_admin_successful;
|
|
89
90
|
sudo bash -c 'rm -rf /etc/security/limits.d; echo "* soft nofile 1048576" >> /etc/security/limits.conf; echo "* hard nofile 1048576" >> /etc/security/limits.conf';
|
|
90
91
|
sudo grep -e '^DefaultTasksMax' /etc/systemd/system.conf || (sudo bash -c 'echo "DefaultTasksMax=infinity" >> /etc/systemd/system.conf'); sudo systemctl set-property user-$(id -u $(whoami)).slice TasksMax=infinity; sudo systemctl daemon-reload;
|
|
@@ -87,6 +87,7 @@ setup_commands:
|
|
|
87
87
|
mkdir -p ~/.ssh; touch ~/.ssh/config;
|
|
88
88
|
{{ conda_installation_commands }}
|
|
89
89
|
{{ ray_skypilot_installation_commands }}
|
|
90
|
+
{{ copy_skypilot_templates_commands }}
|
|
90
91
|
sudo bash -c 'rm -rf /etc/security/limits.d; echo "* soft nofile 1048576" >> /etc/security/limits.conf; echo "* hard nofile 1048576" >> /etc/security/limits.conf';
|
|
91
92
|
sudo grep -e '^DefaultTasksMax' /etc/systemd/system.conf || (sudo bash -c 'echo "DefaultTasksMax=infinity" >> /etc/systemd/system.conf'); sudo systemctl set-property user-$(id -u $(whoami)).slice TasksMax=infinity; sudo systemctl daemon-reload;
|
|
92
93
|
mkdir -p ~/.ssh; (grep -Pzo -q "Host \*\n StrictHostKeyChecking no\n IdentityFile ~/.ssh/sky-cluster-key\n IdentityFile ~/.ssh/id_rsa" ~/.ssh/config) || printf "Host *\n StrictHostKeyChecking no\n IdentityFile ~/.ssh/sky-cluster-key\n IdentityFile ~/.ssh/id_rsa\n" >> ~/.ssh/config;
|
|
@@ -0,0 +1,72 @@
|
|
|
1
|
+
cluster_name: {{cluster_name_on_cloud}}
|
|
2
|
+
|
|
3
|
+
# The maximum number of worker nodes to launch in addition to the head node.
|
|
4
|
+
max_workers: {{num_nodes - 1}}
|
|
5
|
+
upscaling_speed: {{num_nodes - 1}}
|
|
6
|
+
idle_timeout_minutes: 60
|
|
7
|
+
|
|
8
|
+
provider:
|
|
9
|
+
type: external
|
|
10
|
+
module: sky.provision.primeintellect
|
|
11
|
+
region: "{{region}}"
|
|
12
|
+
zones: "{{zones}}"
|
|
13
|
+
|
|
14
|
+
auth:
|
|
15
|
+
ssh_user: skypilot:ssh_user
|
|
16
|
+
ssh_private_key: {{ssh_private_key}}
|
|
17
|
+
|
|
18
|
+
available_node_types:
|
|
19
|
+
ray_head_default:
|
|
20
|
+
resources: {}
|
|
21
|
+
node_config:
|
|
22
|
+
InstanceType: {{instance_type}}
|
|
23
|
+
DiskSize: {{disk_size}}
|
|
24
|
+
ImageId: {{image_id}}
|
|
25
|
+
PublicKey: |-
|
|
26
|
+
skypilot:ssh_public_key_content
|
|
27
|
+
|
|
28
|
+
head_node_type: ray_head_default
|
|
29
|
+
|
|
30
|
+
# Format: `REMOTE_PATH : LOCAL_PATH`
|
|
31
|
+
file_mounts: {
|
|
32
|
+
"{{sky_ray_yaml_remote_path}}": "{{sky_ray_yaml_local_path}}",
|
|
33
|
+
"{{sky_remote_path}}/{{sky_wheel_hash}}": "{{sky_local_path}}",
|
|
34
|
+
{%- for remote_path, local_path in credentials.items() %}
|
|
35
|
+
"{{remote_path}}": "{{local_path}}",
|
|
36
|
+
"~/.ssh/sky-cluster-key": "{{ssh_private_key}}",
|
|
37
|
+
{%- endfor %}
|
|
38
|
+
}
|
|
39
|
+
|
|
40
|
+
rsync_exclude: []
|
|
41
|
+
|
|
42
|
+
initialization_commands: []
|
|
43
|
+
|
|
44
|
+
# List of shell commands to run to set up nodes.
|
|
45
|
+
# NOTE: these are very performance-sensitive. Each new item opens/closes an SSH
|
|
46
|
+
# connection, which is expensive. Try your best to co-locate commands into fewer
|
|
47
|
+
# items!
|
|
48
|
+
#
|
|
49
|
+
# Increment the following to catch performance bugs more easily:
|
|
50
|
+
# current num items (num SSH connections): 1
|
|
51
|
+
setup_commands:
|
|
52
|
+
# Disable unattended-upgrades and handle apt-get locks
|
|
53
|
+
# Install patch utility for Ray
|
|
54
|
+
# Install conda and Ray
|
|
55
|
+
# Set system limits for Ray performance (nofile and TasksMax)
|
|
56
|
+
- {%- for initial_setup_command in initial_setup_commands %}
|
|
57
|
+
{{ initial_setup_command }}
|
|
58
|
+
{%- endfor %}
|
|
59
|
+
sudo systemctl stop unattended-upgrades || true;
|
|
60
|
+
sudo systemctl disable unattended-upgrades || true;
|
|
61
|
+
sudo sed -i 's/Unattended-Upgrade "1"/Unattended-Upgrade "0"/g' /etc/apt/apt.conf.d/20auto-upgrades || true;
|
|
62
|
+
sudo kill -9 `sudo lsof /var/lib/dpkg/lock-frontend | awk '{print $2}' | tail -n 1` || true;
|
|
63
|
+
sudo pkill -9 apt-get;
|
|
64
|
+
sudo pkill -9 dpkg;
|
|
65
|
+
sudo dpkg --configure -a;
|
|
66
|
+
which patch > /dev/null || sudo apt install -y patch;
|
|
67
|
+
{{ conda_installation_commands }}
|
|
68
|
+
{{ ray_skypilot_installation_commands }}
|
|
69
|
+
{{ copy_skypilot_templates_commands }}
|
|
70
|
+
sudo bash -c 'rm -rf /etc/security/limits.d; echo "* soft nofile 1048576" >> /etc/security/limits.conf; echo "* hard nofile 1048576" >> /etc/security/limits.conf';
|
|
71
|
+
sudo grep -e '^DefaultTasksMax' /etc/systemd/system.conf || (sudo bash -c 'echo "DefaultTasksMax=infinity" >> /etc/systemd/system.conf'); sudo systemctl set-property user-$(id -u $(whoami)).slice TasksMax=infinity; sudo systemctl daemon-reload;
|
|
72
|
+
{{ ssh_max_sessions_config }}
|
sky/templates/runpod-ray.yml.j2
CHANGED
|
@@ -93,6 +93,7 @@ setup_commands:
|
|
|
93
93
|
mkdir -p ~/.ssh; touch ~/.ssh/config;
|
|
94
94
|
{{ conda_installation_commands }}
|
|
95
95
|
{{ ray_skypilot_installation_commands }}
|
|
96
|
+
{{ copy_skypilot_templates_commands }}
|
|
96
97
|
touch ~/.sudo_as_admin_successful;
|
|
97
98
|
sudo bash -c 'rm -rf /etc/security/limits.d; echo "* soft nofile 1048576" >> /etc/security/limits.conf; echo "* hard nofile 1048576" >> /etc/security/limits.conf';
|
|
98
99
|
sudo grep -e '^DefaultTasksMax' /etc/systemd/system.conf || (sudo bash -c 'echo "DefaultTasksMax=infinity" >> /etc/systemd/system.conf'); sudo systemctl set-property user-$(id -u $(whoami)).slice TasksMax=infinity; sudo systemctl daemon-reload;
|
sky/templates/scp-ray.yml.j2
CHANGED
|
@@ -56,6 +56,7 @@ setup_commands:
|
|
|
56
56
|
- mkdir -p ~/.ssh; touch ~/.ssh/config;
|
|
57
57
|
{{ conda_installation_commands }}
|
|
58
58
|
{{ ray_skypilot_installation_commands }}
|
|
59
|
+
{{ copy_skypilot_templates_commands }}
|
|
59
60
|
sudo bash -c 'rm -rf /etc/security/limits.d; echo "* soft nofile 1048576" >> /etc/security/limits.conf; echo "* hard nofile 1048576" >> /etc/security/limits.conf';
|
|
60
61
|
sudo grep -e '^DefaultTasksMax' /etc/systemd/system.conf || (sudo bash -c 'echo "DefaultTasksMax=infinity" >> /etc/systemd/system.conf'); sudo systemctl set-property user-$(id -u $(whoami)).slice TasksMax=infinity; sudo systemctl daemon-reload;
|
|
61
62
|
mkdir -p ~/.ssh; (grep -Pzo -q "Host \*\n StrictHostKeyChecking no\n IdentityFile ~/.ssh/sky-cluster-key\n IdentityFile ~/.ssh/id_rsa" ~/.ssh/config) || printf "Host *\n StrictHostKeyChecking no\n IdentityFile ~/.ssh/sky-cluster-key\n IdentityFile ~/.ssh/id_rsa\n" >> ~/.ssh/config;
|