skypilot-nightly 1.0.0.dev20250502__py3-none-any.whl → 1.0.0.dev20251203__py3-none-any.whl
This diff compares the contents of two publicly released versions of the package, as published to a supported registry. It is provided for informational purposes only and reflects the package versions exactly as they appear in their public registries.
- sky/__init__.py +22 -6
- sky/adaptors/aws.py +81 -16
- sky/adaptors/common.py +25 -2
- sky/adaptors/coreweave.py +278 -0
- sky/adaptors/do.py +8 -2
- sky/adaptors/gcp.py +11 -0
- sky/adaptors/hyperbolic.py +8 -0
- sky/adaptors/ibm.py +5 -2
- sky/adaptors/kubernetes.py +149 -18
- sky/adaptors/nebius.py +173 -30
- sky/adaptors/primeintellect.py +1 -0
- sky/adaptors/runpod.py +68 -0
- sky/adaptors/seeweb.py +183 -0
- sky/adaptors/shadeform.py +89 -0
- sky/admin_policy.py +187 -4
- sky/authentication.py +179 -225
- sky/backends/__init__.py +4 -2
- sky/backends/backend.py +22 -9
- sky/backends/backend_utils.py +1323 -397
- sky/backends/cloud_vm_ray_backend.py +1749 -1029
- sky/backends/docker_utils.py +1 -1
- sky/backends/local_docker_backend.py +11 -6
- sky/backends/task_codegen.py +633 -0
- sky/backends/wheel_utils.py +55 -9
- sky/{clouds/service_catalog → catalog}/__init__.py +21 -19
- sky/{clouds/service_catalog → catalog}/aws_catalog.py +27 -8
- sky/{clouds/service_catalog → catalog}/azure_catalog.py +10 -7
- sky/{clouds/service_catalog → catalog}/common.py +90 -49
- sky/{clouds/service_catalog → catalog}/cudo_catalog.py +8 -5
- sky/{clouds/service_catalog → catalog}/data_fetchers/analyze.py +1 -1
- sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_aws.py +116 -80
- sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_cudo.py +38 -38
- sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_gcp.py +70 -16
- sky/catalog/data_fetchers/fetch_hyperbolic.py +136 -0
- sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_lambda_cloud.py +1 -0
- sky/catalog/data_fetchers/fetch_nebius.py +338 -0
- sky/catalog/data_fetchers/fetch_runpod.py +698 -0
- sky/catalog/data_fetchers/fetch_seeweb.py +329 -0
- sky/catalog/data_fetchers/fetch_shadeform.py +142 -0
- sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_vast.py +1 -1
- sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_vsphere.py +1 -1
- sky/{clouds/service_catalog → catalog}/do_catalog.py +5 -2
- sky/{clouds/service_catalog → catalog}/fluidstack_catalog.py +6 -3
- sky/{clouds/service_catalog → catalog}/gcp_catalog.py +41 -15
- sky/catalog/hyperbolic_catalog.py +136 -0
- sky/{clouds/service_catalog → catalog}/ibm_catalog.py +9 -6
- sky/{clouds/service_catalog → catalog}/kubernetes_catalog.py +36 -24
- sky/{clouds/service_catalog → catalog}/lambda_catalog.py +9 -6
- sky/{clouds/service_catalog → catalog}/nebius_catalog.py +9 -7
- sky/{clouds/service_catalog → catalog}/oci_catalog.py +9 -6
- sky/{clouds/service_catalog → catalog}/paperspace_catalog.py +5 -2
- sky/catalog/primeintellect_catalog.py +95 -0
- sky/{clouds/service_catalog → catalog}/runpod_catalog.py +11 -4
- sky/{clouds/service_catalog → catalog}/scp_catalog.py +9 -6
- sky/catalog/seeweb_catalog.py +184 -0
- sky/catalog/shadeform_catalog.py +165 -0
- sky/catalog/ssh_catalog.py +167 -0
- sky/{clouds/service_catalog → catalog}/vast_catalog.py +6 -3
- sky/{clouds/service_catalog → catalog}/vsphere_catalog.py +5 -2
- sky/check.py +533 -185
- sky/cli.py +5 -5975
- sky/client/{cli.py → cli/command.py} +2591 -1956
- sky/client/cli/deprecation_utils.py +99 -0
- sky/client/cli/flags.py +359 -0
- sky/client/cli/table_utils.py +322 -0
- sky/client/cli/utils.py +79 -0
- sky/client/common.py +78 -32
- sky/client/oauth.py +82 -0
- sky/client/sdk.py +1219 -319
- sky/client/sdk_async.py +827 -0
- sky/client/service_account_auth.py +47 -0
- sky/cloud_stores.py +82 -3
- sky/clouds/__init__.py +13 -0
- sky/clouds/aws.py +564 -164
- sky/clouds/azure.py +105 -83
- sky/clouds/cloud.py +140 -40
- sky/clouds/cudo.py +68 -50
- sky/clouds/do.py +66 -48
- sky/clouds/fluidstack.py +63 -44
- sky/clouds/gcp.py +339 -110
- sky/clouds/hyperbolic.py +293 -0
- sky/clouds/ibm.py +70 -49
- sky/clouds/kubernetes.py +570 -162
- sky/clouds/lambda_cloud.py +74 -54
- sky/clouds/nebius.py +210 -81
- sky/clouds/oci.py +88 -66
- sky/clouds/paperspace.py +61 -44
- sky/clouds/primeintellect.py +317 -0
- sky/clouds/runpod.py +164 -74
- sky/clouds/scp.py +89 -86
- sky/clouds/seeweb.py +477 -0
- sky/clouds/shadeform.py +400 -0
- sky/clouds/ssh.py +263 -0
- sky/clouds/utils/aws_utils.py +10 -4
- sky/clouds/utils/gcp_utils.py +87 -11
- sky/clouds/utils/oci_utils.py +38 -14
- sky/clouds/utils/scp_utils.py +231 -167
- sky/clouds/vast.py +99 -77
- sky/clouds/vsphere.py +51 -40
- sky/core.py +375 -173
- sky/dag.py +15 -0
- sky/dashboard/out/404.html +1 -1
- sky/dashboard/out/_next/static/96_E2yl3QAiIJGOYCkSpB/_buildManifest.js +1 -0
- sky/dashboard/out/_next/static/chunks/1141-e6aa9ab418717c59.js +11 -0
- sky/dashboard/out/_next/static/chunks/1272-1ef0bf0237faccdb.js +1 -0
- sky/dashboard/out/_next/static/chunks/1871-7e202677c42f43fe.js +6 -0
- sky/dashboard/out/_next/static/chunks/2260-7703229c33c5ebd5.js +1 -0
- sky/dashboard/out/_next/static/chunks/2350.fab69e61bac57b23.js +1 -0
- sky/dashboard/out/_next/static/chunks/2369.fc20f0c2c8ed9fe7.js +15 -0
- sky/dashboard/out/_next/static/chunks/2755.edd818326d489a1d.js +26 -0
- sky/dashboard/out/_next/static/chunks/3294.20a8540fe697d5ee.js +1 -0
- sky/dashboard/out/_next/static/chunks/3785.7e245f318f9d1121.js +1 -0
- sky/dashboard/out/_next/static/chunks/3800-7b45f9fbb6308557.js +1 -0
- sky/dashboard/out/_next/static/chunks/3850-ff4a9a69d978632b.js +1 -0
- sky/dashboard/out/_next/static/chunks/3937.210053269f121201.js +1 -0
- sky/dashboard/out/_next/static/chunks/4725.172ede95d1b21022.js +1 -0
- sky/dashboard/out/_next/static/chunks/4937.a2baa2df5572a276.js +15 -0
- sky/dashboard/out/_next/static/chunks/5739-d67458fcb1386c92.js +8 -0
- sky/dashboard/out/_next/static/chunks/6130-2be46d70a38f1e82.js +1 -0
- sky/dashboard/out/_next/static/chunks/616-3d59f75e2ccf9321.js +39 -0
- sky/dashboard/out/_next/static/chunks/6212-7bd06f60ba693125.js +13 -0
- sky/dashboard/out/_next/static/chunks/6856-8f27d1c10c98def8.js +1 -0
- sky/dashboard/out/_next/static/chunks/6989-01359c57e018caa4.js +1 -0
- sky/dashboard/out/_next/static/chunks/6990-9146207c4567fdfd.js +1 -0
- sky/dashboard/out/_next/static/chunks/7359-c8d04e06886000b3.js +30 -0
- sky/dashboard/out/_next/static/chunks/7411-b15471acd2cba716.js +41 -0
- sky/dashboard/out/_next/static/chunks/7615-019513abc55b3b47.js +1 -0
- sky/dashboard/out/_next/static/chunks/8640.5b9475a2d18c5416.js +16 -0
- sky/dashboard/out/_next/static/chunks/8969-452f9d5cbdd2dc73.js +1 -0
- sky/dashboard/out/_next/static/chunks/9025.fa408f3242e9028d.js +6 -0
- sky/dashboard/out/_next/static/chunks/9353-cff34f7e773b2e2b.js +1 -0
- sky/dashboard/out/_next/static/chunks/9360.a536cf6b1fa42355.js +31 -0
- sky/dashboard/out/_next/static/chunks/9847.3aaca6bb33455140.js +30 -0
- sky/dashboard/out/_next/static/chunks/fd9d1056-86323a29a8f7e46a.js +1 -0
- sky/dashboard/out/_next/static/chunks/framework-cf60a09ccd051a10.js +33 -0
- sky/dashboard/out/_next/static/chunks/main-app-587214043926b3cc.js +1 -0
- sky/dashboard/out/_next/static/chunks/main-f15ccb73239a3bf1.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/_app-bde01e4a2beec258.js +34 -0
- sky/dashboard/out/_next/static/chunks/pages/_error-c66a4e8afc46f17b.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-792db96d918c98c9.js +16 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-abfcac9c137aa543.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters-ee39056f9851a3ff.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/config-dfb9bf07b13045f4.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/index-444f1804401f04ea.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/infra/[context]-c0b5935149902e6f.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/infra-aed0ea19df7cf961.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-d66997e2bfc837cf.js +16 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs/pools/[pool]-9faf940b253e3e06.js +21 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs-2072b48b617989c9.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/users-f42674164aa73423.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/volumes-b84b948ff357c43e.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/workspace/new-3f88a1c7e86a3f86.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/workspaces/[name]-84a40f8c7c627fe4.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/workspaces-531b2f8c4bf89f82.js +1 -0
- sky/dashboard/out/_next/static/chunks/webpack-64e05f17bf2cf8ce.js +1 -0
- sky/dashboard/out/_next/static/css/0748ce22df867032.css +3 -0
- sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
- sky/dashboard/out/clusters/[cluster].html +1 -1
- sky/dashboard/out/clusters.html +1 -1
- sky/dashboard/out/config.html +1 -0
- sky/dashboard/out/index.html +1 -1
- sky/dashboard/out/infra/[context].html +1 -0
- sky/dashboard/out/infra.html +1 -0
- sky/dashboard/out/jobs/[job].html +1 -1
- sky/dashboard/out/jobs/pools/[pool].html +1 -0
- sky/dashboard/out/jobs.html +1 -1
- sky/dashboard/out/users.html +1 -0
- sky/dashboard/out/volumes.html +1 -0
- sky/dashboard/out/workspace/new.html +1 -0
- sky/dashboard/out/workspaces/[name].html +1 -0
- sky/dashboard/out/workspaces.html +1 -0
- sky/data/data_utils.py +137 -1
- sky/data/mounting_utils.py +269 -84
- sky/data/storage.py +1460 -1807
- sky/data/storage_utils.py +43 -57
- sky/exceptions.py +126 -2
- sky/execution.py +216 -63
- sky/global_user_state.py +2390 -586
- sky/jobs/__init__.py +7 -0
- sky/jobs/client/sdk.py +300 -58
- sky/jobs/client/sdk_async.py +161 -0
- sky/jobs/constants.py +15 -8
- sky/jobs/controller.py +848 -275
- sky/jobs/file_content_utils.py +128 -0
- sky/jobs/log_gc.py +193 -0
- sky/jobs/recovery_strategy.py +402 -152
- sky/jobs/scheduler.py +314 -189
- sky/jobs/server/core.py +836 -255
- sky/jobs/server/server.py +156 -115
- sky/jobs/server/utils.py +136 -0
- sky/jobs/state.py +2109 -706
- sky/jobs/utils.py +1306 -215
- sky/logs/__init__.py +21 -0
- sky/logs/agent.py +108 -0
- sky/logs/aws.py +243 -0
- sky/logs/gcp.py +91 -0
- sky/metrics/__init__.py +0 -0
- sky/metrics/utils.py +453 -0
- sky/models.py +78 -1
- sky/optimizer.py +164 -70
- sky/provision/__init__.py +90 -4
- sky/provision/aws/config.py +147 -26
- sky/provision/aws/instance.py +136 -50
- sky/provision/azure/instance.py +11 -6
- sky/provision/common.py +13 -1
- sky/provision/cudo/cudo_machine_type.py +1 -1
- sky/provision/cudo/cudo_utils.py +14 -8
- sky/provision/cudo/cudo_wrapper.py +72 -71
- sky/provision/cudo/instance.py +10 -6
- sky/provision/do/instance.py +10 -6
- sky/provision/do/utils.py +4 -3
- sky/provision/docker_utils.py +140 -33
- sky/provision/fluidstack/instance.py +13 -8
- sky/provision/gcp/__init__.py +1 -0
- sky/provision/gcp/config.py +301 -19
- sky/provision/gcp/constants.py +218 -0
- sky/provision/gcp/instance.py +36 -8
- sky/provision/gcp/instance_utils.py +18 -4
- sky/provision/gcp/volume_utils.py +247 -0
- sky/provision/hyperbolic/__init__.py +12 -0
- sky/provision/hyperbolic/config.py +10 -0
- sky/provision/hyperbolic/instance.py +437 -0
- sky/provision/hyperbolic/utils.py +373 -0
- sky/provision/instance_setup.py +101 -20
- sky/provision/kubernetes/__init__.py +5 -0
- sky/provision/kubernetes/config.py +9 -52
- sky/provision/kubernetes/constants.py +17 -0
- sky/provision/kubernetes/instance.py +919 -280
- sky/provision/kubernetes/manifests/fusermount-server-daemonset.yaml +1 -2
- sky/provision/kubernetes/network.py +27 -17
- sky/provision/kubernetes/network_utils.py +44 -43
- sky/provision/kubernetes/utils.py +1221 -534
- sky/provision/kubernetes/volume.py +343 -0
- sky/provision/lambda_cloud/instance.py +22 -16
- sky/provision/nebius/constants.py +50 -0
- sky/provision/nebius/instance.py +19 -6
- sky/provision/nebius/utils.py +237 -137
- sky/provision/oci/instance.py +10 -5
- sky/provision/paperspace/instance.py +10 -7
- sky/provision/paperspace/utils.py +1 -1
- sky/provision/primeintellect/__init__.py +10 -0
- sky/provision/primeintellect/config.py +11 -0
- sky/provision/primeintellect/instance.py +454 -0
- sky/provision/primeintellect/utils.py +398 -0
- sky/provision/provisioner.py +117 -36
- sky/provision/runpod/__init__.py +5 -0
- sky/provision/runpod/instance.py +27 -6
- sky/provision/runpod/utils.py +51 -18
- sky/provision/runpod/volume.py +214 -0
- sky/provision/scp/__init__.py +15 -0
- sky/provision/scp/config.py +93 -0
- sky/provision/scp/instance.py +707 -0
- sky/provision/seeweb/__init__.py +11 -0
- sky/provision/seeweb/config.py +13 -0
- sky/provision/seeweb/instance.py +812 -0
- sky/provision/shadeform/__init__.py +11 -0
- sky/provision/shadeform/config.py +12 -0
- sky/provision/shadeform/instance.py +351 -0
- sky/provision/shadeform/shadeform_utils.py +83 -0
- sky/provision/ssh/__init__.py +18 -0
- sky/provision/vast/instance.py +13 -8
- sky/provision/vast/utils.py +10 -7
- sky/provision/volume.py +164 -0
- sky/provision/vsphere/common/ssl_helper.py +1 -1
- sky/provision/vsphere/common/vapiconnect.py +2 -1
- sky/provision/vsphere/common/vim_utils.py +4 -4
- sky/provision/vsphere/instance.py +15 -10
- sky/provision/vsphere/vsphere_utils.py +17 -20
- sky/py.typed +0 -0
- sky/resources.py +845 -119
- sky/schemas/__init__.py +0 -0
- sky/schemas/api/__init__.py +0 -0
- sky/schemas/api/responses.py +227 -0
- sky/schemas/db/README +4 -0
- sky/schemas/db/env.py +90 -0
- sky/schemas/db/global_user_state/001_initial_schema.py +124 -0
- sky/schemas/db/global_user_state/002_add_workspace_to_cluster_history.py +35 -0
- sky/schemas/db/global_user_state/003_fix_initial_revision.py +61 -0
- sky/schemas/db/global_user_state/004_is_managed.py +34 -0
- sky/schemas/db/global_user_state/005_cluster_event.py +32 -0
- sky/schemas/db/global_user_state/006_provision_log.py +41 -0
- sky/schemas/db/global_user_state/007_cluster_event_request_id.py +34 -0
- sky/schemas/db/global_user_state/008_skylet_ssh_tunnel_metadata.py +34 -0
- sky/schemas/db/global_user_state/009_last_activity_and_launched_at.py +89 -0
- sky/schemas/db/global_user_state/010_save_ssh_key.py +66 -0
- sky/schemas/db/global_user_state/011_is_ephemeral.py +34 -0
- sky/schemas/db/kv_cache/001_initial_schema.py +29 -0
- sky/schemas/db/script.py.mako +28 -0
- sky/schemas/db/serve_state/001_initial_schema.py +67 -0
- sky/schemas/db/serve_state/002_yaml_content.py +34 -0
- sky/schemas/db/skypilot_config/001_initial_schema.py +30 -0
- sky/schemas/db/spot_jobs/001_initial_schema.py +97 -0
- sky/schemas/db/spot_jobs/002_cluster_pool.py +42 -0
- sky/schemas/db/spot_jobs/003_pool_hash.py +34 -0
- sky/schemas/db/spot_jobs/004_job_file_contents.py +42 -0
- sky/schemas/db/spot_jobs/005_logs_gc.py +38 -0
- sky/schemas/db/spot_jobs/006_controller_pid_started_at.py +34 -0
- sky/schemas/db/spot_jobs/007_config_file_content.py +34 -0
- sky/schemas/generated/__init__.py +0 -0
- sky/schemas/generated/autostopv1_pb2.py +36 -0
- sky/schemas/generated/autostopv1_pb2.pyi +43 -0
- sky/schemas/generated/autostopv1_pb2_grpc.py +146 -0
- sky/schemas/generated/jobsv1_pb2.py +86 -0
- sky/schemas/generated/jobsv1_pb2.pyi +254 -0
- sky/schemas/generated/jobsv1_pb2_grpc.py +542 -0
- sky/schemas/generated/managed_jobsv1_pb2.py +76 -0
- sky/schemas/generated/managed_jobsv1_pb2.pyi +278 -0
- sky/schemas/generated/managed_jobsv1_pb2_grpc.py +278 -0
- sky/schemas/generated/servev1_pb2.py +58 -0
- sky/schemas/generated/servev1_pb2.pyi +115 -0
- sky/schemas/generated/servev1_pb2_grpc.py +322 -0
- sky/serve/autoscalers.py +357 -5
- sky/serve/client/impl.py +310 -0
- sky/serve/client/sdk.py +47 -139
- sky/serve/client/sdk_async.py +130 -0
- sky/serve/constants.py +12 -9
- sky/serve/controller.py +68 -17
- sky/serve/load_balancer.py +106 -60
- sky/serve/load_balancing_policies.py +116 -2
- sky/serve/replica_managers.py +434 -249
- sky/serve/serve_rpc_utils.py +179 -0
- sky/serve/serve_state.py +569 -257
- sky/serve/serve_utils.py +775 -265
- sky/serve/server/core.py +66 -711
- sky/serve/server/impl.py +1093 -0
- sky/serve/server/server.py +21 -18
- sky/serve/service.py +192 -89
- sky/serve/service_spec.py +144 -20
- sky/serve/spot_placer.py +3 -0
- sky/server/auth/__init__.py +0 -0
- sky/server/auth/authn.py +50 -0
- sky/server/auth/loopback.py +38 -0
- sky/server/auth/oauth2_proxy.py +202 -0
- sky/server/common.py +478 -182
- sky/server/config.py +85 -23
- sky/server/constants.py +44 -6
- sky/server/daemons.py +295 -0
- sky/server/html/token_page.html +185 -0
- sky/server/metrics.py +160 -0
- sky/server/middleware_utils.py +166 -0
- sky/server/requests/executor.py +558 -138
- sky/server/requests/payloads.py +364 -24
- sky/server/requests/preconditions.py +21 -17
- sky/server/requests/process.py +112 -29
- sky/server/requests/request_names.py +121 -0
- sky/server/requests/requests.py +822 -226
- sky/server/requests/serializers/decoders.py +82 -31
- sky/server/requests/serializers/encoders.py +140 -22
- sky/server/requests/threads.py +117 -0
- sky/server/rest.py +455 -0
- sky/server/server.py +1309 -285
- sky/server/state.py +20 -0
- sky/server/stream_utils.py +327 -61
- sky/server/uvicorn.py +217 -3
- sky/server/versions.py +270 -0
- sky/setup_files/MANIFEST.in +11 -1
- sky/setup_files/alembic.ini +160 -0
- sky/setup_files/dependencies.py +139 -31
- sky/setup_files/setup.py +44 -42
- sky/sky_logging.py +114 -7
- sky/skylet/attempt_skylet.py +106 -24
- sky/skylet/autostop_lib.py +129 -8
- sky/skylet/configs.py +29 -20
- sky/skylet/constants.py +216 -25
- sky/skylet/events.py +101 -21
- sky/skylet/job_lib.py +345 -164
- sky/skylet/log_lib.py +297 -18
- sky/skylet/log_lib.pyi +44 -1
- sky/skylet/providers/ibm/node_provider.py +12 -8
- sky/skylet/providers/ibm/vpc_provider.py +13 -12
- sky/skylet/ray_patches/__init__.py +17 -3
- sky/skylet/ray_patches/autoscaler.py.diff +18 -0
- sky/skylet/ray_patches/cli.py.diff +19 -0
- sky/skylet/ray_patches/command_runner.py.diff +17 -0
- sky/skylet/ray_patches/log_monitor.py.diff +20 -0
- sky/skylet/ray_patches/resource_demand_scheduler.py.diff +32 -0
- sky/skylet/ray_patches/updater.py.diff +18 -0
- sky/skylet/ray_patches/worker.py.diff +41 -0
- sky/skylet/runtime_utils.py +21 -0
- sky/skylet/services.py +568 -0
- sky/skylet/skylet.py +72 -4
- sky/skylet/subprocess_daemon.py +104 -29
- sky/skypilot_config.py +506 -99
- sky/ssh_node_pools/__init__.py +1 -0
- sky/ssh_node_pools/core.py +135 -0
- sky/ssh_node_pools/server.py +233 -0
- sky/task.py +685 -163
- sky/templates/aws-ray.yml.j2 +11 -3
- sky/templates/azure-ray.yml.j2 +2 -1
- sky/templates/cudo-ray.yml.j2 +1 -0
- sky/templates/do-ray.yml.j2 +2 -1
- sky/templates/fluidstack-ray.yml.j2 +1 -0
- sky/templates/gcp-ray.yml.j2 +62 -1
- sky/templates/hyperbolic-ray.yml.j2 +68 -0
- sky/templates/ibm-ray.yml.j2 +2 -1
- sky/templates/jobs-controller.yaml.j2 +27 -24
- sky/templates/kubernetes-loadbalancer.yml.j2 +2 -0
- sky/templates/kubernetes-ray.yml.j2 +611 -50
- sky/templates/lambda-ray.yml.j2 +2 -1
- sky/templates/nebius-ray.yml.j2 +34 -12
- sky/templates/oci-ray.yml.j2 +1 -0
- sky/templates/paperspace-ray.yml.j2 +2 -1
- sky/templates/primeintellect-ray.yml.j2 +72 -0
- sky/templates/runpod-ray.yml.j2 +10 -1
- sky/templates/scp-ray.yml.j2 +4 -50
- sky/templates/seeweb-ray.yml.j2 +171 -0
- sky/templates/shadeform-ray.yml.j2 +73 -0
- sky/templates/sky-serve-controller.yaml.j2 +22 -2
- sky/templates/vast-ray.yml.j2 +1 -0
- sky/templates/vsphere-ray.yml.j2 +1 -0
- sky/templates/websocket_proxy.py +212 -37
- sky/usage/usage_lib.py +31 -15
- sky/users/__init__.py +0 -0
- sky/users/model.conf +15 -0
- sky/users/permission.py +397 -0
- sky/users/rbac.py +121 -0
- sky/users/server.py +720 -0
- sky/users/token_service.py +218 -0
- sky/utils/accelerator_registry.py +35 -5
- sky/utils/admin_policy_utils.py +84 -38
- sky/utils/annotations.py +38 -5
- sky/utils/asyncio_utils.py +78 -0
- sky/utils/atomic.py +1 -1
- sky/utils/auth_utils.py +153 -0
- sky/utils/benchmark_utils.py +60 -0
- sky/utils/cli_utils/status_utils.py +159 -86
- sky/utils/cluster_utils.py +31 -9
- sky/utils/command_runner.py +354 -68
- sky/utils/command_runner.pyi +93 -3
- sky/utils/common.py +35 -8
- sky/utils/common_utils.py +314 -91
- sky/utils/config_utils.py +74 -5
- sky/utils/context.py +403 -0
- sky/utils/context_utils.py +242 -0
- sky/utils/controller_utils.py +383 -89
- sky/utils/dag_utils.py +31 -12
- sky/utils/db/__init__.py +0 -0
- sky/utils/db/db_utils.py +485 -0
- sky/utils/db/kv_cache.py +149 -0
- sky/utils/db/migration_utils.py +137 -0
- sky/utils/directory_utils.py +12 -0
- sky/utils/env_options.py +13 -0
- sky/utils/git.py +567 -0
- sky/utils/git_clone.sh +460 -0
- sky/utils/infra_utils.py +195 -0
- sky/utils/kubernetes/cleanup-tunnel.sh +62 -0
- sky/utils/kubernetes/config_map_utils.py +133 -0
- sky/utils/kubernetes/create_cluster.sh +15 -29
- sky/utils/kubernetes/delete_cluster.sh +10 -7
- sky/utils/kubernetes/deploy_ssh_node_pools.py +1177 -0
- sky/utils/kubernetes/exec_kubeconfig_converter.py +22 -31
- sky/utils/kubernetes/generate_kind_config.py +6 -66
- sky/utils/kubernetes/generate_kubeconfig.sh +4 -1
- sky/utils/kubernetes/gpu_labeler.py +18 -8
- sky/utils/kubernetes/k8s_gpu_labeler_job.yaml +2 -1
- sky/utils/kubernetes/k8s_gpu_labeler_setup.yaml +16 -16
- sky/utils/kubernetes/kubernetes_deploy_utils.py +284 -114
- sky/utils/kubernetes/rsync_helper.sh +11 -3
- sky/utils/kubernetes/ssh-tunnel.sh +379 -0
- sky/utils/kubernetes/ssh_utils.py +221 -0
- sky/utils/kubernetes_enums.py +8 -15
- sky/utils/lock_events.py +94 -0
- sky/utils/locks.py +416 -0
- sky/utils/log_utils.py +82 -107
- sky/utils/perf_utils.py +22 -0
- sky/utils/resource_checker.py +298 -0
- sky/utils/resources_utils.py +249 -32
- sky/utils/rich_utils.py +217 -39
- sky/utils/schemas.py +955 -160
- sky/utils/serialize_utils.py +16 -0
- sky/utils/status_lib.py +10 -0
- sky/utils/subprocess_utils.py +29 -15
- sky/utils/tempstore.py +70 -0
- sky/utils/thread_utils.py +91 -0
- sky/utils/timeline.py +26 -53
- sky/utils/ux_utils.py +84 -15
- sky/utils/validator.py +11 -1
- sky/utils/volume.py +165 -0
- sky/utils/yaml_utils.py +111 -0
- sky/volumes/__init__.py +13 -0
- sky/volumes/client/__init__.py +0 -0
- sky/volumes/client/sdk.py +150 -0
- sky/volumes/server/__init__.py +0 -0
- sky/volumes/server/core.py +270 -0
- sky/volumes/server/server.py +124 -0
- sky/volumes/volume.py +215 -0
- sky/workspaces/__init__.py +0 -0
- sky/workspaces/core.py +655 -0
- sky/workspaces/server.py +101 -0
- sky/workspaces/utils.py +56 -0
- sky_templates/README.md +3 -0
- sky_templates/__init__.py +3 -0
- sky_templates/ray/__init__.py +0 -0
- sky_templates/ray/start_cluster +183 -0
- sky_templates/ray/stop_cluster +75 -0
- skypilot_nightly-1.0.0.dev20251203.dist-info/METADATA +676 -0
- skypilot_nightly-1.0.0.dev20251203.dist-info/RECORD +611 -0
- {skypilot_nightly-1.0.0.dev20250502.dist-info → skypilot_nightly-1.0.0.dev20251203.dist-info}/WHEEL +1 -1
- skypilot_nightly-1.0.0.dev20251203.dist-info/top_level.txt +2 -0
- sky/benchmark/benchmark_state.py +0 -256
- sky/benchmark/benchmark_utils.py +0 -641
- sky/clouds/service_catalog/constants.py +0 -7
- sky/dashboard/out/_next/static/GWvVBSCS7FmUiVmjaL1a7/_buildManifest.js +0 -1
- sky/dashboard/out/_next/static/chunks/236-2db3ee3fba33dd9e.js +0 -6
- sky/dashboard/out/_next/static/chunks/312-c3c8845990db8ffc.js +0 -15
- sky/dashboard/out/_next/static/chunks/37-0a572fe0dbb89c4d.js +0 -6
- sky/dashboard/out/_next/static/chunks/678-206dddca808e6d16.js +0 -59
- sky/dashboard/out/_next/static/chunks/845-9e60713e0c441abc.js +0 -1
- sky/dashboard/out/_next/static/chunks/979-7bf73a4c7cea0f5c.js +0 -1
- sky/dashboard/out/_next/static/chunks/fd9d1056-2821b0f0cabcd8bd.js +0 -1
- sky/dashboard/out/_next/static/chunks/framework-87d061ee6ed71b28.js +0 -33
- sky/dashboard/out/_next/static/chunks/main-app-241eb28595532291.js +0 -1
- sky/dashboard/out/_next/static/chunks/main-e0e2335212e72357.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/_app-e6b013bc3f77ad60.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/_error-1be831200e60c5c0.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-6ac338bc2239cb45.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-f383db7389368ea7.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/clusters-a93b93e10b8b074e.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/index-f9f039532ca8cbc4.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-1c519e1afc523dc9.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/jobs-a75029b67aab6a2e.js +0 -1
- sky/dashboard/out/_next/static/chunks/webpack-830f59b8404e96b8.js +0 -1
- sky/dashboard/out/_next/static/css/c6933bbb2ce7f4dd.css +0 -3
- sky/jobs/dashboard/dashboard.py +0 -223
- sky/jobs/dashboard/static/favicon.ico +0 -0
- sky/jobs/dashboard/templates/index.html +0 -831
- sky/jobs/server/dashboard_utils.py +0 -69
- sky/skylet/providers/scp/__init__.py +0 -2
- sky/skylet/providers/scp/config.py +0 -149
- sky/skylet/providers/scp/node_provider.py +0 -578
- sky/templates/kubernetes-ssh-jump.yml.j2 +0 -94
- sky/utils/db_utils.py +0 -100
- sky/utils/kubernetes/deploy_remote_cluster.sh +0 -308
- sky/utils/kubernetes/ssh_jump_lifecycle_manager.py +0 -191
- skypilot_nightly-1.0.0.dev20250502.dist-info/METADATA +0 -361
- skypilot_nightly-1.0.0.dev20250502.dist-info/RECORD +0 -396
- skypilot_nightly-1.0.0.dev20250502.dist-info/top_level.txt +0 -1
- /sky/{clouds/service_catalog → catalog}/config.py +0 -0
- /sky/{benchmark → catalog/data_fetchers}/__init__.py +0 -0
- /sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_azure.py +0 -0
- /sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_fluidstack.py +0 -0
- /sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_ibm.py +0 -0
- /sky/{clouds/service_catalog/data_fetchers → client/cli}/__init__.py +0 -0
- /sky/dashboard/out/_next/static/{GWvVBSCS7FmUiVmjaL1a7 → 96_E2yl3QAiIJGOYCkSpB}/_ssgManifest.js +0 -0
- {skypilot_nightly-1.0.0.dev20250502.dist-info → skypilot_nightly-1.0.0.dev20251203.dist-info}/entry_points.txt +0 -0
- {skypilot_nightly-1.0.0.dev20250502.dist-info → skypilot_nightly-1.0.0.dev20251203.dist-info}/licenses/LICENSE +0 -0
sky/templates/kubernetes-ray.yml.j2 (+611 -50, excerpt):

@@ -33,14 +33,11 @@ provider:
   networking_mode: {{k8s_networking_mode}}
 
   # We use internal IPs since we set up a port-forward between the kubernetes
-  # cluster and the local machine
-  # head node.
+  # cluster and the local machine.
   use_internal_ips: true
 
   timeout: {{timeout}}
 
-  ssh_jump_image: {{k8s_ssh_jump_image}}
-
   # Namespace used to host SkyPilot system components, such as fuse device
   # manager.
   skypilot_system_namespace: {{k8s_skypilot_system_namespace}}

@@ -49,6 +46,10 @@ provider:
   # Used to set up the necessary permissions and sidecars.
   fuse_device_required: {{k8s_fuse_device_required}}
 
+  {% if ephemeral_volume_mounts %}
+  ephemeral_volume_specs: {{ephemeral_volume_mounts | tojson}}
+  {% endif %}
+
   # ServiceAccount created by the autoscaler for the head node pod that it
   # runs in. If this field isn't provided, the head pod config below must
   # contain a user-created service account with the proper permissions.

@@ -212,7 +213,9 @@ provider:
       metadata:
        labels:
          parent: skypilot
+          # TODO (kyuds): remove this label for v0.12.0, as skypilot-cluster label is deprecated in favor of skypilot-cluster-name.
          skypilot-cluster: {{cluster_name_on_cloud}}
+          skypilot-cluster-name: {{cluster_name_on_cloud}}
          skypilot-user: {{ user }}
        name: {{cluster_name_on_cloud}}-head-ssh
      spec:

@@ -230,7 +233,9 @@ provider:
      metadata:
        labels:
          parent: skypilot
+          # TODO (kyuds): remove this label for v0.12.0, as skypilot-cluster label is deprecated in favor of skypilot-cluster-name.
          skypilot-cluster: {{cluster_name_on_cloud}}
+          skypilot-cluster-name: {{cluster_name_on_cloud}}
          skypilot-user: {{ user }}
        # NOTE: If you're running multiple Ray clusters with services
        # on one Kubernetes cluster, they must have unique service

@@ -243,6 +248,24 @@ provider:
        # This selector must match the head node pod's selector below.
        selector:
          component: {{cluster_name_on_cloud}}-head
+    # Headless service mapping hostnames to rest of the worker nodes
+    {% for worker_id in range(1, num_nodes) %}
+    - apiVersion: v1
+      kind: Service
+      metadata:
+        labels:
+          parent: skypilot
+          # TODO (kyuds): remove this label for v0.12.0, as skypilot-cluster label is deprecated in favor of skypilot-cluster-name.
+          skypilot-cluster: {{cluster_name_on_cloud}}
+          skypilot-cluster-name: {{cluster_name_on_cloud}}
+          skypilot-user: {{ user }}
+        name: {{cluster_name_on_cloud}}-worker{{ worker_id }}
+      spec:
+        selector:
+          component: {{cluster_name_on_cloud}}-worker{{ worker_id }}
+        clusterIP: None
+    {% endfor %}
+
 
  # Specify the pod type for the ray head node (as configured below).
  head_node_type: ray_head_default
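The headless `Service` entries added above are generated once per worker by the Jinja loop. As a quick illustration (not part of the package), rendering a trimmed fragment of the template with hypothetical values shows what the loop expands to:

```python
import jinja2  # pip install jinja2

# Trimmed fragment of the worker headless-service loop added in this hunk;
# the `num_nodes` and `cluster_name_on_cloud` values below are hypothetical.
fragment = """\
{% for worker_id in range(1, num_nodes) %}
- apiVersion: v1
  kind: Service
  metadata:
    name: {{cluster_name_on_cloud}}-worker{{ worker_id }}
  spec:
    selector:
      component: {{cluster_name_on_cloud}}-worker{{ worker_id }}
    clusterIP: None
{% endfor %}
"""

# A 3-node cluster (1 head + 2 workers) yields services -worker1 and -worker2.
print(jinja2.Template(fragment).render(num_nodes=3,
                                       cluster_name_on_cloud='demo'))
```

`clusterIP: None` is what makes each service headless, so the generated hostname resolves directly to the corresponding worker pod.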
@@ -255,13 +278,12 @@ available_node_types:
      metadata:
        # name will be filled in the provisioner
        # head node name will be {{cluster_name_on_cloud}}-head, which will match the head node service selector above if a head node
-        # service is required.
+        # service is required. Worker nodes are named {{cluster_name_on_cloud}}-worker{{ node_id }}
        labels:
          parent: skypilot
          # component will be set for the head node pod to be the same as the head node service selector above if a
+          # TODO (kyuds): remove this label for v0.12.0, as skypilot-cluster label is deprecated in favor of skypilot-cluster-name.
          skypilot-cluster: {{cluster_name_on_cloud}}
-          # Identifies the SSH jump pod used by this pod. Used in life cycle management of the ssh jump pod.
-          skypilot-ssh-jump: {{k8s_ssh_jump_name}}
          skypilot-user: {{ user }}
          # Custom tags for the pods
          {%- for label_key, label_value in labels.items() %}
@@ -273,14 +295,100 @@ available_node_types:
          {% if (k8s_acc_label_key is not none and k8s_acc_label_values is not none) %}
          skypilot-binpack: "gpu"
          {% endif %}
+          {% if k8s_kueue_local_queue_name %}
+          kueue.x-k8s.io/queue-name: {{k8s_kueue_local_queue_name}}
+          kueue.x-k8s.io/pod-group-name: {{cluster_name_on_cloud}}
+          {% endif %}
+        {% if k8s_kueue_local_queue_name or k8s_enable_gpudirect_tcpx or k8s_enable_gpudirect_tcpxo or k8s_enable_gpudirect_rdma %}
+        annotations:
+          {% if k8s_kueue_local_queue_name %}
+          kueue.x-k8s.io/retriable-in-group: "false"
+          kueue.x-k8s.io/pod-group-total-count: "{{ num_nodes|string }}"
+          {% if k8s_max_run_duration_seconds %}
+          provreq.kueue.x-k8s.io/maxRunDurationSeconds: "{{k8s_max_run_duration_seconds|string}}"
+          {% endif %}
+          {% endif %}
+          # https://cloud.google.com/kubernetes-engine/docs/how-to/gpu-bandwidth-gpudirect-tcpx
+          # Values from google cloud guide
+          {% if k8s_enable_gpudirect_tcpx %}
+          devices.gke.io/container.tcpx-daemon: |+
+            - path: /dev/nvidia0
+            - path: /dev/nvidia1
+            - path: /dev/nvidia2
+            - path: /dev/nvidia3
+            - path: /dev/nvidia4
+            - path: /dev/nvidia5
+            - path: /dev/nvidia6
+            - path: /dev/nvidia7
+            - path: /dev/nvidiactl
+            - path: /dev/nvidia-uvm
+          networking.gke.io/default-interface: 'eth0'
+          networking.gke.io/interfaces: |
+            [
+              {"interfaceName":"eth0","network":"default"},
+              {"interfaceName":"eth1","network":"vpc1"},
+              {"interfaceName":"eth2","network":"vpc2"},
+              {"interfaceName":"eth3","network":"vpc3"},
+              {"interfaceName":"eth4","network":"vpc4"}
+            ]
+          {% endif %}
+          {% if k8s_enable_gpudirect_tcpxo %}
+          devices.gke.io/container.tcpxo-daemon: |+
+            - path: /dev/nvidia0
+            - path: /dev/nvidia1
+            - path: /dev/nvidia2
+            - path: /dev/nvidia3
+            - path: /dev/nvidia4
+            - path: /dev/nvidia5
+            - path: /dev/nvidia6
+            - path: /dev/nvidia7
+            - path: /dev/nvidiactl
+            - path: /dev/nvidia-uvm
+            - path: /dev/dmabuf_import_helper
+          networking.gke.io/default-interface: 'eth0'
+          networking.gke.io/interfaces: |
+            [
+              {"interfaceName":"eth0","network":"default"},
+              {"interfaceName":"eth1","network":"vpc1"},
+              {"interfaceName":"eth2","network":"vpc2"},
+              {"interfaceName":"eth3","network":"vpc3"},
+              {"interfaceName":"eth4","network":"vpc4"},
+              {"interfaceName":"eth5","network":"vpc5"},
+              {"interfaceName":"eth6","network":"vpc6"},
+              {"interfaceName":"eth7","network":"vpc7"},
+              {"interfaceName":"eth8","network":"vpc8"}
+            ]
+          {% endif %}
+          {% if k8s_enable_gpudirect_rdma %}
+          networking.gke.io/default-interface: 'eth0'
+          networking.gke.io/interfaces: |
+            [
+              {"interfaceName":"eth0","network":"default"},
+              {"interfaceName":"eth1","network":"gvnic-1"},
+              {"interfaceName":"eth2","network":"rdma-0"},
+              {"interfaceName":"eth3","network":"rdma-1"},
+              {"interfaceName":"eth4","network":"rdma-2"},
+              {"interfaceName":"eth5","network":"rdma-3"},
+              {"interfaceName":"eth6","network":"rdma-4"},
+              {"interfaceName":"eth7","network":"rdma-5"},
+              {"interfaceName":"eth8","network":"rdma-6"},
+              {"interfaceName":"eth9","network":"rdma-7"}
+            ]
+          {% endif %}
+        {% endif %}
      spec:
        # serviceAccountName: skypilot-service-account
        serviceAccountName: {{k8s_service_account_name}}
        automountServiceAccountToken: {{k8s_automount_sa_token}}
        restartPolicy: {{ "Always" if high_availability else "Never" }}
+        {% if volume_mounts %}
+        securityContext:
+          fsGroup: 1000
+          fsGroupChangePolicy: OnRootMismatch
+        {% endif %}
 
        # Add node selector if GPU/TPUs are requested:
-        {% if (k8s_topology_label_key is not none and k8s_topology_label_value is not none) or (k8s_spot_label_key is not none) %}
+        {% if (k8s_topology_label_key is not none and k8s_topology_label_value is not none) or (k8s_spot_label_key is not none) or (k8s_enable_flex_start) %}
        nodeSelector:
        {% if k8s_topology_label_key is not none and k8s_topology_label_value is not none %}
          {{k8s_topology_label_key}}: {{k8s_topology_label_value}}
@@ -288,6 +396,9 @@ available_node_types:
        {% if k8s_spot_label_key is not none %}
          {{k8s_spot_label_key}}: {{k8s_spot_label_value|tojson}}
        {% endif %}
+        {% if k8s_enable_flex_start %}
+          cloud.google.com/gke-flex-start: "true"
+        {% endif %}
        {% endif %}
        {% if (k8s_acc_label_key is not none and k8s_acc_label_values is not none) or (avoid_label_keys is not none) %}
        affinity:
@@ -339,9 +450,6 @@ available_node_types:
        # object store. If you do not provide this, Ray will fall back to
        # /tmp which cause slowdowns if is not a shared memory volume.
        volumes:
-          - name: secret-volume
-            secret:
-              secretName: {{k8s_ssh_key_secret_name}}
          - name: dshm
            emptyDir:
              medium: Memory
@@ -356,19 +464,176 @@ available_node_types:
            persistentVolumeClaim:
              claimName: {{cluster_name_on_cloud}}-{{k8s_high_availability_deployment_volume_mount_name}}
          {% endif %}
+          {% for volume_mount in volume_mounts %}
+          - name: {{volume_mount.name}}
+            persistentVolumeClaim:
+              claimName: {{volume_mount.volume_name_on_cloud}}
+          {% endfor %}
+          {% if k8s_enable_gpudirect_tcpx %}
+          - name: libraries
+            hostPath:
+              path: /home/kubernetes/bin/nvidia/lib64
+          - name: tcpx-socket
+            emptyDir: {}
+          - name: sys
+            hostPath:
+              path: /sys
+          - name: proc-sys
+            hostPath:
+              path: /proc/sys
+          {% endif %}
+          {% if k8s_enable_gpudirect_tcpxo %}
+          - name: libraries
+            hostPath:
+              path: /home/kubernetes/bin/nvidia
+          - name: sys
+            hostPath:
+              path: /sys
+          - name: proc-sys
+            hostPath:
+              path: /proc/sys
+          - name: aperture-devices
+            hostPath:
+              path: /dev/aperture_devices
+          {% endif %}
+          {% if k8s_enable_gpudirect_rdma %}
+          - name: library-dir-host
+            hostPath:
+              path: /home/kubernetes/bin/nvidia
+          - name: gib
+            hostPath:
+              path: /home/kubernetes/bin/gib
+          {% endif %}
        containers:
          - name: ray-node
-            imagePullPolicy:
+            imagePullPolicy: Always
            image: {{image_id}}
            env:
              - name: SKYPILOT_POD_NODE_TYPE
                valueFrom:
                  fieldRef:
                    fieldPath: metadata.labels['ray-node-type']
+              - name: SKYPILOT_POD_CPU_CORE_LIMIT
+                valueFrom:
+                  resourceFieldRef:
+                    containerName: ray-node
+                    resource: requests.cpu
+              - name: SKYPILOT_POD_MEMORY_BYTES_LIMIT
+                valueFrom:
+                  resourceFieldRef:
+                    containerName: ray-node
+                    resource: requests.memory
              {% for key, value in k8s_env_vars.items() if k8s_env_vars is not none %}
              - name: {{ key }}
                value: {{ value }}
              {% endfor %}
+              # https://cloud.google.com/kubernetes-engine/docs/how-to/gpu-bandwidth-gpudirect-tcpx#environment-variables-nccl
+              # Page recommends setting NCCL values for GPUDirect TCPX for best performance.
+              {% if k8s_enable_gpudirect_tcpx %}
+              - name: LD_LIBRARY_PATH
+                value: /usr/local/nvidia/lib64:/usr/local/tcpx/lib64
+              - name: NCCL_GPUDIRECTTCPX_SOCKET_IFNAME
+                value: eth1,eth2,eth3,eth4
+              - name: NCCL_GPUDIRECTTCPX_CTRL_DEV
+                value: eth0
+              - name: NCCL_GPUDIRECTTCPX_TX_BINDINGS
+                value: "eth1:8-21,112-125;eth2:8-21,112-125;eth3:60-73,164-177;eth4:60-73,164-177"
+              - name: NCCL_GPUDIRECTTCPX_RX_BINDINGS
+                value: "eth1:22-35,126-139;eth2:22-35,126-139;eth3:74-87,178-191;eth4:74-87,178-191"
+              - name: NCCL_GPUDIRECTTCPX_PROGRAM_FLOW_STEERING_WAIT_MICROS
+                value: "500000"
+              - name: NCCL_GPUDIRECTTCPX_UNIX_CLIENT_PREFIX
+                value: "/tmp"
+              - name: NCCL_GPUDIRECTTCPX_FORCE_ACK
+                value: "0"
+              - name: NCCL_SOCKET_IFNAME
+                value: eth0
+              - name: NCCL_CROSS_NIC
+                value: "0"
+              - name: NCCL_ALGO
+                value: Ring
+              - name: NCCL_PROTO
+                value: Simple
+              - name: NCCL_NSOCKS_PERTHREAD
+                value: "4"
+              - name: NCCL_SOCKET_NTHREADS
+                value: "1"
+              - name: NCCL_NET_GDR_LEVEL
+                value: PIX
+              - name: NCCL_DYNAMIC_CHUNK_SIZE
+                value: "524288"
+              - name: NCCL_P2P_PXN_LEVEL
+                value: "0"
+              - name: NCCL_P2P_NET_CHUNKSIZE
+                value: "524288"
+              - name: NCCL_P2P_PCI_CHUNKSIZE
+                value: "524288"
+              - name: NCCL_P2P_NVL_CHUNKSIZE
+                value: "1048576"
+              - name: NCCL_BUFFSIZE
+                value: "4194304"
+              - name: NCCL_MAX_NCHANNELS
+                value: "8"
+              - name: NCCL_MIN_NCHANNELS
+                value: "8"
+              - name: CUDA_VISIBLE_DEVICES
+                value: "0,1,2,3,4,5,6,7"
+              {% endif %}
+              {% if k8s_enable_gpudirect_tcpxo %}
+              - name: LD_LIBRARY_PATH
+                value: /usr/local/nvidia/lib64
+              - name: NCCL_FASTRAK_LLCM_DEVICE_DIRECTORY
+                value: /dev/aperture_devices
+              - name: NCCL_FASTRAK_CTRL_DEV
+                value: eth0
+              - name: NCCL_FASTRAK_IFNAME
+                value: eth1,eth2,eth3,eth4,eth5,eth6,eth7,eth8
+              - name: NCCL_SOCKET_IFNAME
+                value: eth0
+              - name: NCCL_CROSS_NIC
+                value: "0"
+              - name: NCCL_ALGO
+                value: Ring,Tree
+              - name: NCCL_PROTO
+                value: Simple,LL128
+              - name: NCCL_MIN_NCHANNELS
+                value: "4"
+              - name: NCCL_TUNER_PLUGIN
+                value: libnccl-tuner.so
+              - name: NCCL_TUNER_CONFIG_PATH
+                value: /usr/local/nvidia/lib64/a3plus_tuner_config.textproto
+              - name: CUDA_VISIBLE_DEVICES
+                value: "0,1,2,3,4,5,6,7"
+              {% endif %}
+              {% if k8s_enable_gpudirect_rdma %}
+              - name: LD_LIBRARY_PATH
+                value: /usr/local/nvidia/lib64
+              - name: NCCL_NET
+                value: gIB
+              - name: NCCL_CROSS_NIC
+                value: "0"
+              - name: NCCL_NET_GDR_LEVEL
+                value: PIX
+              - name: NCCL_P2P_NET_CHUNKSIZE
+                value: "131072"
+              - name: NCCL_NVLS_CHUNKSIZE
+                value: "524288"
+              - name: NCCL_IB_ADAPTIVE_ROUTING
+                value: "1"
+              - name: NCCL_IB_QPS_PER_CONNECTION
+                value: "4"
+              - name: NCCL_IB_TC
+                value: "52"
+              - name: NCCL_IB_FIFO_TC
+                value: "84"
+              {% if k8s_enable_gpudirect_rdma_a4 %}
+              - name: NCCL_TUNER_CONFIG_PATH
+                value: /usr/local/gib/configs/tuner_config_a4.txtpb
+              {% else %}
+              - name: NCCL_TUNER_CONFIG_PATH
+                value: /usr/local/gib/configs/tuner_config_a3u.txtpb
+              {% endif %}
+              {% endif %}
            {% if k8s_fuse_device_required %}
              - name: FUSERMOUNT_SHARED_DIR
                value: {{k8s_fusermount_shared_dir}}
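The two new `resourceFieldRef` entries above use the Kubernetes downward API to expose the ray-node container's CPU and memory requests as environment variables. A small illustrative sketch (not part of the package) of how code running inside the pod could consume them:

```python
import os

# Env var names come from the entries added in this hunk; values are
# populated by Kubernetes from the ray-node container's resource requests.
cpu_cores = os.environ.get('SKYPILOT_POD_CPU_CORE_LIMIT')
memory_bytes = os.environ.get('SKYPILOT_POD_MEMORY_BYTES_LIMIT')
print(f'requested CPU cores: {cpu_cores}, memory bytes: {memory_bytes}')
```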
@@ -378,12 +643,17 @@ available_node_types:
            command: ["/bin/bash", "-c", "--"]
            args:
              - |
-                #
-                #
-
-
-                #
-
+                # Set -x to print the commands and their arguments as they are executed.
+                # Useful for debugging.
+                set -x
+
+                # Execute user-provided post-provision runcmd
+                # before any of the SkyPilot setup commands.
+                {%- if runcmd %}
+                {%- for cmd in runcmd %}
+                {{cmd}}
+                {%- endfor %}
+                {%- endif %}
 
                # Helper function to conditionally use sudo
                # TODO(zhwu): consolidate the two prefix_cmd and sudo replacements
@@ -395,14 +665,131 @@ available_node_types:
                # STEP 1: Run apt update, install missing packages, and set up ssh.
                (
                  (
-
-
+                    # Helper: run apt-get update with retries
+                    apt_update_with_retries() {
+                      # do not fail the whole shell; we handle return codes
+                      set +e
+                      local log=/tmp/apt-update.log
+                      local tries=3
+                      local delay=1
+                      local i
+                      for i in $(seq 1 $tries); do
+                        DEBIAN_FRONTEND=noninteractive $(prefix_cmd) apt-get update >> "$log" 2>&1 && { set -e; return 0; }
+                        echo "apt-get update attempt $i/$tries failed; retrying in ${delay}s" >> "$log"
+                        sleep $delay
+                        delay=$((delay * 2))
+                      done
+                      set -e
+                      return 1
+                    }
+                    apt_install_with_retries() {
+                      local packages="$@"
+                      [ -z "$packages" ] && return 0
+                      set +e
+                      local log=/tmp/apt-update.log
+                      local tries=3
+                      local delay=1
+                      local i
+                      for i in $(seq 1 $tries); do
+                        DEBIAN_FRONTEND=noninteractive $(prefix_cmd) apt-get install -y -o Dpkg::Options::="--force-confdef" -o Dpkg::Options::="--force-confold" $packages && { set -e; return 0; }
+                        echo "apt-get install failed for: $packages (attempt $i/$tries). Running -f install and retrying..." >> "$log"
+                        DEBIAN_FRONTEND=noninteractive $(prefix_cmd) apt-get -f install -y >> "$log" 2>&1 || true
+                        DEBIAN_FRONTEND=noninteractive $(prefix_cmd) apt-get clean >> "$log" 2>&1 || true
+                        sleep $delay
+                        delay=$((delay * 2))
+                      done
+                      set -e
+                      return 1
+                    }
+                    apt_update_install_with_retries() {
+                      apt_update_with_retries
+                      apt_install_with_retries "$@"
+                    }
+                    backup_dir=/etc/apt/sources.list.backup_skypilot
+                    backup_source() {
+                      $(prefix_cmd) mkdir -p "$backup_dir"
+                      if [ -f /etc/apt/sources.list ] && [ ! -f "$backup_dir/sources.list" ]; then
+                        $(prefix_cmd) cp -a /etc/apt/sources.list "$backup_dir/sources.list" || true
+                      fi
+                    }
+                    restore_source() {
+                      if [ -f "$backup_dir/sources.list" ]; then
+                        $(prefix_cmd) cp -a "$backup_dir/sources.list" /etc/apt/sources.list || true
+                      fi
+                    }
+                    update_apt_sources() {
+                      local host=$1
+                      local apt_file=$2
+                      $(prefix_cmd) sed -i -E "s|https?://[a-zA-Z0-9.-]+\.ubuntu\.com/ubuntu|http://$host/ubuntu|g" $apt_file
+                    }
+                    # Helper: install packages across mirrors with retries
+                    apt_install_with_mirrors() {
+                      local required=$1; shift
+                      local packages="$@"
+                      [ -z "$packages" ] && return 0
+                      set +e
+                      # Install packages with default sources first
+                      local log=/tmp/apt-update.log
+                      echo "$(date +%Y-%m-%d\ %H:%M:%S) Installing packages: $packages" >> "$log"
+                      restore_source
+                      apt_update_install_with_retries $packages >> "$log" 2>&1 && { set -e; return 0; }
+                      echo "Install failed with default sources: $packages" >> "$log"
+                      # Detect distro (ubuntu/debian)
+                      local APT_OS="unknown"
+                      if [ -f /etc/os-release ]; then
+                        . /etc/os-release
+                        case "$ID" in
+                          debian) APT_OS="debian" ;;
+                          ubuntu) APT_OS="ubuntu" ;;
+                          *)
+                            if [ -n "$ID_LIKE" ]; then
+                              case " $ID $ID_LIKE " in
+                                *ubuntu*) APT_OS="ubuntu" ;;
+                                *debian*) APT_OS="debian" ;;
+                              esac
+                            fi
+                            ;;
+                        esac
+                      fi
+                      # Build mirror candidates
+                      # deb.debian.org is a CDN endpoint, if one backend goes down,
+                      # the CDN automatically fails over to another mirror,
+                      # so we only retry for ubuntu here.
+                      if [ "$APT_OS" = "ubuntu" ]; then
+                        # Backup current sources once
+                        backup_source
+                        # Selected from https://launchpad.net/ubuntu/+archivemirrors
+                        # and results from apt-select
+                        local MIRROR_CANDIDATES="mirrors.wikimedia.org mirror.umd.edu"
+                        for host in $MIRROR_CANDIDATES; do
+                          echo "Trying APT mirror ($APT_OS): $host" >> "$log"
+                          if [ -f /etc/apt/sources.list ]; then
+                            update_apt_sources $host /etc/apt/sources.list
+                          else
+                            echo "Error: /etc/apt/sources.list not found" >> "$log"
+                            break
+                          fi
+                          apt_update_install_with_retries $packages >> "$log" 2>&1 && { set -e; return 0; }
+                          echo "Install failed with mirror ($APT_OS): $host" >> "$log"
+                          # Restore to default sources
+                          restore_source
+                        done
+                      fi
+                      set -e
+                      if [ "$required" = "1" ]; then
+                        echo "Error: required package install failed across all mirrors: $packages" >> "$log"
+                        return 1
+                      else
+                        echo "Optional package install failed across all mirrors: $packages; skipping." >> "$log"
+                        return 0
+                      fi
+                    }
                    # Install both fuse2 and fuse3 for compatibility for all possible fuse adapters in advance,
                    # so that both fusemount and fusermount3 can be masked before enabling SSH access.
                    PACKAGES="rsync curl wget netcat gcc patch pciutils fuse fuse3 openssh-server";
 
                    # Separate packages into two groups: packages that are installed first
-                    # so that curl, rsync and wget are available sooner to unblock the following
+                    # so that curl, rsync, ssh and wget are available sooner to unblock the following
                    # conda installation and rsync.
                    # Also, we install fuse first to avoid confliction with fuse3.
                    set -e
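The apt helpers added above all follow the same retry-with-exponential-backoff pattern: attempt the command, and on failure sleep for a delay that doubles each round. A minimal Python sketch of that pattern (a hypothetical helper for illustration only; the template itself implements this in bash):

```python
import subprocess
import time

def run_with_retries(cmd, tries=3, delay=1):
    """Run `cmd`, retrying on failure with the delay doubling each time,
    mirroring the apt_update_with_retries shell function above."""
    for attempt in range(1, tries + 1):
        if subprocess.run(cmd).returncode == 0:
            return True
        print(f'attempt {attempt}/{tries} failed; retrying in {delay}s')
        time.sleep(delay)
        delay *= 2
    return False

# Example: retry `apt-get update` up to 3 times (1s, 2s, 4s pauses).
# run_with_retries(['apt-get', 'update'])
```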
@@ -423,7 +810,7 @@ available_node_types:
                    done;
                    if [ ! -z "$INSTALL_FIRST" ]; then
                      echo "Installing core packages: $INSTALL_FIRST";
-
+                      apt_install_with_mirrors 1 $INSTALL_FIRST || { echo "Error: core package installation failed." >> /tmp/apt-update.log; exit 1; }
                    fi;
                    # SSH and other packages are not necessary, so we disable set -e
                    set +e
@@ -447,7 +834,8 @@ available_node_types:
                    fi
                    $(prefix_cmd) cp -p "$FUSERMOUNT_PATH" "${FUSERMOUNT_PATH}-original"
                    $(prefix_cmd) ln -sf {{k8s_fusermount_shared_dir}}/fusermount-shim "$FUSERMOUNT_PATH"
-
+                    # "|| true" because fusermount3 is not always available
+                    FUSERMOUNT3_PATH=$(which fusermount3) || true
                    if [ -z "$FUSERMOUNT3_PATH" ]; then
                      FUSERMOUNT3_PATH="${FUSERMOUNT_PATH}3"
                    fi
@@ -489,16 +877,23 @@ available_node_types:
                    $(prefix_cmd) mkdir -p ~/.ssh;
                    $(prefix_cmd) chown -R $(whoami) ~/.ssh;
                    $(prefix_cmd) chmod 700 ~/.ssh;
-                    $(prefix_cmd) cat
+                    $(prefix_cmd) cat > ~/.ssh/authorized_keys <<'SKYPILOT_SSH_KEY_EOF'
+                    skypilot:ssh_public_key_content
+                    SKYPILOT_SSH_KEY_EOF
                    $(prefix_cmd) chmod 644 ~/.ssh/authorized_keys;
                    $(prefix_cmd) service ssh restart;
                    $(prefix_cmd) sed -i "s/mesg n/tty -s \&\& mesg n/" ~/.profile;
 
-
-
+                    touch /tmp/apt_ssh_setup_complete
+                    echo "=== SSH setup completed ==="
+                  ) > /tmp/${STEPS[0]}.log 2>&1
+                  if [ "$?" -ne "0" ]; then
+                    {
+                      echo "Error: ${STEPS[0]} failed. Continuing anyway..." > /tmp/${STEPS[0]}.failed 2>&1
                      cat /tmp/${STEPS[0]}.log
                      exit 1
-
+                    }
+                  fi
                ) &
 
                # STEP 2: Install conda, ray and skypilot (for dependencies); start
@@ -516,7 +911,21 @@ available_node_types:
                    {{ conda_installation_commands }}
                    {{ ray_installation_commands }}
 
-
+                    # set UV_SYSTEM_PYTHON to false in case the user provided docker image set it to true.
+                    # unset PYTHONPATH in case the user provided docker image set it.
+                    VIRTUAL_ENV=~/skypilot-runtime UV_SYSTEM_PYTHON=false env -u PYTHONPATH ~/.local/bin/uv pip install skypilot[kubernetes,remote]
+                    # Wait for `patch` package to be installed before applying ray patches
+                    until dpkg -l | grep -q "^ii patch "; do
+                      sleep 0.1
+                      echo "Waiting for patch package to be installed..."
+                    done
+                    # Apply Ray patches for progress bar fix
+                    # set UV_SYSTEM_PYTHON to false in case the user provided docker image set it to true.
+                    # unset PYTHONPATH in case the user provided docker image set it.
+                    # ~/.sky/python_path is seeded by conda_installation_commands
+                    VIRTUAL_ENV=~/skypilot-runtime UV_SYSTEM_PYTHON=false env -u PYTHONPATH ~/.local/bin/uv pip list | grep "ray " | grep 2.9.3 2>&1 > /dev/null && {
+                      env -u PYTHONPATH $(cat ~/.sky/python_path) -c "from sky.skylet.ray_patches import patch; patch()" || exit 1;
+                    }
                    touch /tmp/ray_skypilot_installation_complete
                    echo "=== Ray and skypilot installation completed ==="
 

@@ -544,11 +953,14 @@ available_node_types:
                    set +e
                    {{ ray_worker_start_command }}
                  fi
-                ) > /tmp/${STEPS[1]}.log 2>&1
-
+                ) > /tmp/${STEPS[1]}.log 2>&1
+                if [ "$?" -ne "0" ]; then
+                  {
+                    echo "Error: ${STEPS[1]} failed. Continuing anyway..." > /tmp/${STEPS[1]}.failed 2>&1
                    cat /tmp/${STEPS[1]}.log
                    exit 1
-
+                  }
+                fi
              ) &
 
 

@@ -566,11 +978,14 @@ available_node_types:
                    fi;
                  fi;
                  export -p > ~/container_env_var.sh && $(prefix_cmd) mv ~/container_env_var.sh /etc/profile.d/container_env_var.sh
-                ) > /tmp/${STEPS[2]}.log 2>&1
-
+                ) > /tmp/${STEPS[2]}.log 2>&1
+                if [ "$?" -ne "0" ]; then
+                  {
+                    echo "Error: ${STEPS[2]} failed. Continuing anyway..." > /tmp/${STEPS[2]}.failed 2>&1
                    cat /tmp/${STEPS[2]}.log
                    exit 1
-
+                  }
+                fi
              ) &
 
              function mylsof { p=$(for pid in /proc/{0..9}*; do i=$(basename "$pid"); for file in "$pid"/fd/*; do link=$(readlink -e "$file"); if [ "$link" = "$1" ]; then echo "$i"; fi; done; done); echo "$p"; };
@@ -623,23 +1038,72 @@ available_node_types:
     {% if high_availability %}
     mkdir -p {{k8s_high_availability_deployment_run_script_dir}}
     if [ -f {{k8s_high_availability_deployment_volume_mount_path}}/k8s_container_ready ]; then
+      SKYPILOT_HA_RECOVERY_LOG="{{ha_recovery_log_path}}"
+      echo "Starting HA recovery at $(date)" >> $SKYPILOT_HA_RECOVERY_LOG
+      start_time=$SECONDS
+      retry_count=0
+
+      # Wait for Ray to be ready, as the following commands depend on Ray.
+      GET_RAY_STATUS_CMD=$({{sky_python_cmd}} -c 'from sky.provision import instance_setup; print(instance_setup.RAY_STATUS_WITH_SKY_RAY_PORT_COMMAND)')
+      while true; do
+        retry_count=$((retry_count + 1))
+        current_duration=$(( SECONDS - start_time ))
+        echo "Attempt $retry_count to get Ray status after $current_duration seconds..." >> $SKYPILOT_HA_RECOVERY_LOG
+
+        bash --login -c "$GET_RAY_STATUS_CMD"
+        if [ $? -eq 0 ]; then
+          wait_duration=$(( SECONDS - start_time ))
+          echo "Ray ready after waiting $wait_duration seconds (took $retry_count attempts)" >> $SKYPILOT_HA_RECOVERY_LOG
+          break
+        fi
+        echo "Waiting for Ray to be ready..." >> $SKYPILOT_HA_RECOVERY_LOG
+        sleep 2
+      done
+
       # ! Keep this aligned with `CloudVmRayBackend._setup()`
-      # Suppose all `task.setup` are the same for
+      # Suppose all `task.setup` are the same for sky serve / managed jobs controller task.
       # So be careful for compatibility issue once you change it.
       chmod +x {{k8s_high_availability_deployment_setup_script_path}}
       /bin/bash --login -c "true && export OMP_NUM_THREADS=1 PYTHONWARNINGS='ignore' && {{k8s_high_availability_deployment_setup_script_path}} > /tmp/controller_recovery_setup_commands.log 2>&1"
-      echo "=== Controller setup commands completed for recovery ==="
+      echo "=== Controller setup commands completed for recovery at $(date) ===" >> $SKYPILOT_HA_RECOVERY_LOG
 
+      touch {{k8s_high_availability_restarting_signal_file}}
+      # Get all in-progress jobs from managed jobs controller. We skip any jobs that are already done.
+      # Also, skip the jobs that are waiting to be scheduled, as those do not have a controller process running.
+      # For SkyServe, this will be None and every service will be recovered. This is because SkyServe
+      # will delete the service from the database after it is terminated, so everything in the database is running.
+      ALL_IN_PROGRESS_JOBS=$({{sky_python_cmd}} -c "from sky.jobs import state; jobs, _ = state.get_managed_jobs_with_filters(fields=['job_id', 'schedule_state']); print(' '.join({str(job['job_id']) for job in jobs if job['schedule_state'] not in [state.ManagedJobScheduleState.DONE, state.ManagedJobScheduleState.WAITING]}) if jobs else None)")
+      if [ "$ALL_IN_PROGRESS_JOBS" != "None" ]; then
+        read -ra ALL_IN_PROGRESS_JOBS_SEQ <<< "$ALL_IN_PROGRESS_JOBS"
+      fi
       for file in {{k8s_high_availability_deployment_run_script_dir}}/*; do
+        # This is the cluster job id on the managed jobs controller, but it is guaranteed to be the same as the managed job id,
+        # so we directly use it here. See `CloudVmRayBackend._exec_code_on_head::_dump_code_to_file` for more details.
+        JOB_ID=$(basename $file | sed 's/sky_job_//')
+        # If the list of in-progress jobs is not None (meaning this is a managed jobs HA controller) and the job is not in progress, skip it.
+        if [ "$ALL_IN_PROGRESS_JOBS" != "None" ]; then
+          if [[ ! " ${ALL_IN_PROGRESS_JOBS_SEQ[@]} " =~ " ${JOB_ID} " ]]; then
+            continue
+          fi
+        fi
         # ! Keep this aligned with `CloudVmRayBackend._execute()`
         chmod +x $file
+        # TODO(tian): This logic may run a lot of things if the jobs controller previously had many jobs.
+        # We should do more tests and make sure it will scale well.
         /bin/bash --login -c "true && export OMP_NUM_THREADS=1 PYTHONWARNINGS='ignore' && $file > /tmp/task_run_$(basename $file).log 2>&1"
-        echo "=== Controller task run for service (file: $file) completed for recovery ==="
+        echo "=== Controller task run for service / job (file: $file) completed for recovery at $(date) ===" >> $SKYPILOT_HA_RECOVERY_LOG
       done
+      rm {{k8s_high_availability_restarting_signal_file}}
+
+      duration=$(( SECONDS - start_time ))
+      echo "HA recovery completed at $(date)" >> $SKYPILOT_HA_RECOVERY_LOG
+      echo "Total recovery time: $duration seconds" >> $SKYPILOT_HA_RECOVERY_LOG
     fi
 
     touch {{k8s_high_availability_deployment_volume_mount_path}}/k8s_container_ready
     {% endif %}
+    # Set +x to stop printing the commands and their arguments as they are executed.
+    set +x
 
     trap : TERM INT; log_tail || sleep infinity & wait
 
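
The recovery loop above derives each job id from its run-script filename (sky_job_<id>) and skips jobs that are no longer in progress. A sketch of that filtering in Python (function name and signature are illustrative, not SkyPilot's own code):

    import os
    import re
    from typing import List, Optional, Set

    def scripts_to_recover(run_script_dir: str,
                           in_progress: Optional[Set[str]]) -> List[str]:
        """Select run scripts to replay, mirroring the recovery loop above:
        each script is named sky_job_<id>; None means 'recover everything'
        (the SkyServe case), otherwise only ids still in progress are kept."""
        selected = []
        for fname in sorted(os.listdir(run_script_dir)):
            match = re.fullmatch(r'sky_job_(\d+)', fname)
            if match is None:
                continue
            job_id = match.group(1)
            # Jobs that are DONE or WAITING have no controller process to restore.
            if in_progress is not None and job_id not in in_progress:
                continue
            selected.append(os.path.join(run_script_dir, fname))
        return selected
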
@@ -653,14 +1117,27 @@ available_node_types:
             # object store. If you do not provide this, Ray will fall back to
             # /tmp which cause slowdowns if is not a shared memory volume.
             volumeMounts:
-            - name: secret-volume
-              readOnly: true
-              mountPath: "/etc/secret-volume"
-            # This volume allocates shared memory for Ray to use for its plasma
-            # object store. If you do not provide this, Ray will fall back to
-            # /tmp which cause slowdowns if is not a shared memory volume.
             - mountPath: /dev/shm
               name: dshm
+            {% if k8s_enable_gpudirect_tcpx %}
+            - name: tcpx-socket
+              mountPath: /tmp
+            - name: libraries
+              mountPath: /usr/local/nvidia/lib64
+              readOnly: true
+            {% endif %}
+            {% if k8s_enable_gpudirect_tcpxo %}
+            - name: libraries
+              mountPath: /usr/local/nvidia
+            - name: aperture-devices
+              mountPath: /dev/aperture_devices
+            {% endif %}
+            {% if k8s_enable_gpudirect_rdma %}
+            - name: library-dir-host
+              mountPath: /usr/local/nvidia
+            - name: gib
+              mountPath: /usr/local/gib
+            {% endif %}
             {% if high_availability %}
             - name: {{k8s_high_availability_deployment_volume_mount_name}}
               mountPath: {{k8s_high_availability_deployment_volume_mount_path}}
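
The new mounts are gated by Jinja flags such as k8s_enable_gpudirect_tcpx. A quick way to see what a branch renders is to evaluate a reduced excerpt with jinja2 (the excerpt below trims the template to the dshm and TCPX mounts only; it is a sketch, not the file's actual rendering pipeline):

    import jinja2

    # Reduced excerpt of the volumeMounts block above.
    EXCERPT = """\
    volumeMounts:
    - mountPath: /dev/shm
      name: dshm
    {% if k8s_enable_gpudirect_tcpx %}
    - name: tcpx-socket
      mountPath: /tmp
    - name: libraries
      mountPath: /usr/local/nvidia/lib64
      readOnly: true
    {% endif %}
    """

    template = jinja2.Template(EXCERPT, trim_blocks=True, lstrip_blocks=True)
    print(template.render(k8s_enable_gpudirect_tcpx=True))   # TCPX mounts included
    print(template.render(k8s_enable_gpudirect_tcpx=False))  # only dshm remains
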
@@ -669,6 +1146,10 @@ available_node_types:
             - name: fusermount-shared-dir
               mountPath: {{k8s_fusermount_shared_dir}}
             {% endif %}
+            {% for volume_mount in volume_mounts %}
+            - name: {{volume_mount.name}}
+              mountPath: {{volume_mount.path}}
+            {% endfor %}
             resources:
               requests:
                 cpu: {{cpus}}
@@ -681,13 +1162,87 @@ available_node_types:
                 # https://cloud.google.com/kubernetes-engine/docs/concepts/tpus#how_tpus_work
                 {{k8s_resource_key}}: {{accelerator_count}}
                 {% endif %}
+                {% if k8s_network_type == 'coreweave' %}
+                rdma/ib: 1
+                {% endif %}
               {% if k8s_resource_key is not none %}
               limits:
                 # Limits need to be defined for GPU/TPU requests
                 {% if k8s_resource_key is not none %}
                 {{k8s_resource_key}}: {{accelerator_count}}
                 {% endif %}
+                {% if k8s_network_type == 'coreweave' %}
+                rdma/ib: 1
+                {% endif %}
               {% endif %}
+            {% if k8s_ipc_lock_capability %}
+            securityContext:
+              capabilities:
+                add:
+                - IPC_LOCK
+            {% endif %}
+          {% if k8s_enable_gpudirect_tcpx %}
+          # GPUDirect TCPX daemon sidecar container
+          - name: tcpx-daemon
+            image: us-docker.pkg.dev/gce-ai-infra/gpudirect-tcpx/tcpgpudmarxd-dev:v2.0.11
+            imagePullPolicy: Always
+            command:
+            - /tcpgpudmarxd/build/app/tcpgpudmarxd
+            - --gpu_nic_preset
+            - a3vm
+            - --gpu_shmem_type
+            - fd
+            - --uds_path
+            - /run/tcpx
+            - --setup_param
+            - --verbose
+            - "128"
+            - "2"
+            - "0"
+            securityContext:
+              capabilities:
+                add:
+                - NET_ADMIN
+            volumeMounts:
+            - name: libraries
+              mountPath: /usr/local/nvidia/lib64
+              readOnly: true
+            - name: tcpx-socket
+              mountPath: /run/tcpx
+            - name: sys
+              mountPath: /hostsysfs
+            - name: proc-sys
+              mountPath: /hostprocsysfs
+            env:
+            - name: LD_LIBRARY_PATH
+              value: /usr/local/nvidia/lib64
+          {% endif %}
+          {% if k8s_enable_gpudirect_tcpxo %}
+          - name: tcpxo-daemon
+            image: us-docker.pkg.dev/gce-ai-infra/gpudirect-tcpxo/tcpgpudmarxd-dev:v1.0.17
+            imagePullPolicy: Always
+            command: ["/bin/sh", "-c"]
+            args:
+            - |
+              set -ex
+              chmod 755 /fts/entrypoint_rxdm_container.sh
+              /fts/entrypoint_rxdm_container.sh --num_hops=2 --num_nics=8 --uid= --alsologtostderr
+            securityContext:
+              capabilities:
+                add:
+                - NET_ADMIN
+                - NET_BIND_SERVICE
+            volumeMounts:
+            - name: libraries
+              mountPath: /usr/local/nvidia
+            - name: sys
+              mountPath: /hostsysfs
+            - name: proc-sys
+              mountPath: /hostprocsysfs
+            env:
+            - name: LD_LIBRARY_PATH
+              value: /usr/local/nvidia/lib64
+          {% endif %}
 
          {% if high_availability %}
          pvc_spec:
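
The k8s_ipc_lock_capability branch above adds the IPC_LOCK capability, which RDMA-style networking generally needs so the NIC driver can pin (mlock) registered memory. For reference, a minimal sketch of the equivalent object built with the official kubernetes Python client (container name and image are placeholders, not SkyPilot's):

    from kubernetes import client

    # Equivalent of the rendered securityContext with IPC_LOCK added.
    sec_ctx = client.V1SecurityContext(
        capabilities=client.V1Capabilities(add=['IPC_LOCK']))

    container = client.V1Container(
        name='ray-node',                        # placeholder name
        image='example/skypilot-image:latest',  # hypothetical image
        security_context=sec_ctx)

    # Serialize to the same dict shape the YAML template produces.
    print(client.ApiClient().sanitize_for_serialization(container))
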
@@ -724,7 +1279,7 @@ available_node_types:
        spec:
          securityContext:
            fsGroup: 1000
-          # To prevent the home dir provided by the docker image from being
+          # To prevent the home dir provided by the docker image from being overridden by pvc mounting,
          # we use initContainers to copy it first to /mnt/home, which will later be mounted to home dir.
          initContainers:
          - name: init-copy-home
@@ -791,17 +1346,23 @@ setup_commands:
  {%- endfor %}
  STEPS=("apt-ssh-setup" "runtime-setup" "env-setup")
  start_epoch=$(date +%s);
+
+  # Wait for SSH setup to complete before proceeding
+  echo "=== Logs for asynchronous SSH setup ===";
+  ([ -f /tmp/apt_ssh_setup_complete ]|| [ -f /tmp/${STEPS[0]}.failed ]) && cat /tmp/${STEPS[0]}.log ||
+  { tail -f -n +1 /tmp/${STEPS[0]}.log & TAIL_PID=$!; echo "Tail PID: $TAIL_PID"; sleep 0.5; until [ -f /tmp/apt_ssh_setup_complete ] || [ -f /tmp/${STEPS[0]}.failed ]; do sleep 0.5; done; kill $TAIL_PID || true; };
+  [ -f /tmp/${STEPS[0]}.failed ] && { echo "Error: ${STEPS[0]} failed. Exiting."; exit 1; } || true;
+
  echo "=== Logs for asynchronous ray and skypilot installation ===";
-
-
-
-
-  [ -f /tmp/${STEPS[1]}.failed ] && { echo "Error: ${STEPS[1]} failed. Exiting."; exit 1; } || true;
-  fi
+  ([ -f /tmp/ray_skypilot_installation_complete ]|| [ -f /tmp/${STEPS[1]}.failed ]) && cat /tmp/${STEPS[1]}.log ||
+  { tail -f -n +1 /tmp/${STEPS[1]}.log & TAIL_PID=$!; echo "Tail PID: $TAIL_PID"; sleep 0.5; until [ -f /tmp/ray_skypilot_installation_complete ] || [ -f /tmp/${STEPS[1]}.failed ]; do sleep 0.5; done; kill $TAIL_PID || true; };
+  [ -f /tmp/${STEPS[1]}.failed ] && { echo "Error: ${STEPS[1]} failed. Exiting."; exit 1; } || true;
+
  end_epoch=$(date +%s);
  echo "=== Ray and skypilot dependencies installation completed in $(($end_epoch - $start_epoch)) secs ===";
  start_epoch=$(date +%s);
  {{ skypilot_wheel_installation_commands }}
+  {{ copy_skypilot_templates_commands }}
  end_epoch=$(date +%s);
  echo "=== Skypilot wheel installation completed in $(($end_epoch - $start_epoch)) secs ===";
  start_epoch=$(date +%s);
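
Both rewritten waiters above follow one pattern: if the step already finished, just cat its log; otherwise tail the log live until the completion or failure sentinel appears, then kill the tail, and fail hard if the .failed sentinel exists. A Python rendering of that pattern (stream_step_log and its arguments are illustrative names):

    import pathlib
    import subprocess
    import time

    def stream_step_log(name: str, complete_marker: str) -> None:
        """If the step already finished, print its log; otherwise tail it live
        until the completion or failure sentinel appears, then stop the tail.
        Exits nonzero when the .failed sentinel is present."""
        log = pathlib.Path(f'/tmp/{name}.log')
        done = pathlib.Path(complete_marker)
        failed = pathlib.Path(f'/tmp/{name}.failed')
        if done.exists() or failed.exists():
            print(log.read_text())
        else:
            tail = subprocess.Popen(['tail', '-f', '-n', '+1', str(log)])
            while not (done.exists() or failed.exists()):
                time.sleep(0.5)
            tail.terminate()
        if failed.exists():
            raise SystemExit(f'Error: {name} failed. Exiting.')

    # e.g. stream_step_log('runtime-setup',
    #                      '/tmp/ray_skypilot_installation_complete')
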