skypilot-nightly 1.0.0.dev20250509__py3-none-any.whl → 1.0.0.dev20251107__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of skypilot-nightly might be problematic. Click here for more details.
- sky/__init__.py +22 -6
- sky/adaptors/aws.py +25 -7
- sky/adaptors/common.py +24 -1
- sky/adaptors/coreweave.py +278 -0
- sky/adaptors/do.py +8 -2
- sky/adaptors/hyperbolic.py +8 -0
- sky/adaptors/kubernetes.py +149 -18
- sky/adaptors/nebius.py +170 -17
- sky/adaptors/primeintellect.py +1 -0
- sky/adaptors/runpod.py +68 -0
- sky/adaptors/seeweb.py +167 -0
- sky/adaptors/shadeform.py +89 -0
- sky/admin_policy.py +187 -4
- sky/authentication.py +179 -225
- sky/backends/__init__.py +4 -2
- sky/backends/backend.py +22 -9
- sky/backends/backend_utils.py +1299 -380
- sky/backends/cloud_vm_ray_backend.py +1715 -518
- sky/backends/docker_utils.py +1 -1
- sky/backends/local_docker_backend.py +11 -6
- sky/backends/wheel_utils.py +37 -9
- sky/{clouds/service_catalog → catalog}/__init__.py +21 -19
- sky/{clouds/service_catalog → catalog}/aws_catalog.py +27 -8
- sky/{clouds/service_catalog → catalog}/azure_catalog.py +10 -7
- sky/{clouds/service_catalog → catalog}/common.py +89 -48
- sky/{clouds/service_catalog → catalog}/cudo_catalog.py +8 -5
- sky/{clouds/service_catalog → catalog}/data_fetchers/analyze.py +1 -1
- sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_aws.py +30 -40
- sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_cudo.py +38 -38
- sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_gcp.py +42 -15
- sky/catalog/data_fetchers/fetch_hyperbolic.py +136 -0
- sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_lambda_cloud.py +1 -0
- sky/catalog/data_fetchers/fetch_nebius.py +335 -0
- sky/catalog/data_fetchers/fetch_runpod.py +698 -0
- sky/catalog/data_fetchers/fetch_seeweb.py +329 -0
- sky/catalog/data_fetchers/fetch_shadeform.py +142 -0
- sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_vast.py +1 -1
- sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_vsphere.py +1 -1
- sky/{clouds/service_catalog → catalog}/do_catalog.py +5 -2
- sky/{clouds/service_catalog → catalog}/fluidstack_catalog.py +6 -3
- sky/{clouds/service_catalog → catalog}/gcp_catalog.py +41 -15
- sky/catalog/hyperbolic_catalog.py +136 -0
- sky/{clouds/service_catalog → catalog}/ibm_catalog.py +9 -6
- sky/{clouds/service_catalog → catalog}/kubernetes_catalog.py +36 -24
- sky/{clouds/service_catalog → catalog}/lambda_catalog.py +9 -6
- sky/{clouds/service_catalog → catalog}/nebius_catalog.py +9 -7
- sky/{clouds/service_catalog → catalog}/oci_catalog.py +9 -6
- sky/{clouds/service_catalog → catalog}/paperspace_catalog.py +5 -2
- sky/catalog/primeintellect_catalog.py +95 -0
- sky/{clouds/service_catalog → catalog}/runpod_catalog.py +11 -4
- sky/{clouds/service_catalog → catalog}/scp_catalog.py +9 -6
- sky/catalog/seeweb_catalog.py +184 -0
- sky/catalog/shadeform_catalog.py +165 -0
- sky/catalog/ssh_catalog.py +167 -0
- sky/{clouds/service_catalog → catalog}/vast_catalog.py +6 -3
- sky/{clouds/service_catalog → catalog}/vsphere_catalog.py +5 -2
- sky/check.py +491 -203
- sky/cli.py +5 -6005
- sky/client/{cli.py → cli/command.py} +2477 -1885
- sky/client/cli/deprecation_utils.py +99 -0
- sky/client/cli/flags.py +359 -0
- sky/client/cli/table_utils.py +320 -0
- sky/client/common.py +70 -32
- sky/client/oauth.py +82 -0
- sky/client/sdk.py +1203 -297
- sky/client/sdk_async.py +833 -0
- sky/client/service_account_auth.py +47 -0
- sky/cloud_stores.py +73 -0
- sky/clouds/__init__.py +13 -0
- sky/clouds/aws.py +358 -93
- sky/clouds/azure.py +105 -83
- sky/clouds/cloud.py +127 -36
- sky/clouds/cudo.py +68 -50
- sky/clouds/do.py +66 -48
- sky/clouds/fluidstack.py +63 -44
- sky/clouds/gcp.py +339 -110
- sky/clouds/hyperbolic.py +293 -0
- sky/clouds/ibm.py +70 -49
- sky/clouds/kubernetes.py +563 -162
- sky/clouds/lambda_cloud.py +74 -54
- sky/clouds/nebius.py +206 -80
- sky/clouds/oci.py +88 -66
- sky/clouds/paperspace.py +61 -44
- sky/clouds/primeintellect.py +317 -0
- sky/clouds/runpod.py +164 -74
- sky/clouds/scp.py +89 -83
- sky/clouds/seeweb.py +466 -0
- sky/clouds/shadeform.py +400 -0
- sky/clouds/ssh.py +263 -0
- sky/clouds/utils/aws_utils.py +10 -4
- sky/clouds/utils/gcp_utils.py +87 -11
- sky/clouds/utils/oci_utils.py +38 -14
- sky/clouds/utils/scp_utils.py +177 -124
- sky/clouds/vast.py +99 -77
- sky/clouds/vsphere.py +51 -40
- sky/core.py +349 -139
- sky/dag.py +15 -0
- sky/dashboard/out/404.html +1 -1
- sky/dashboard/out/_next/static/chunks/1141-e6aa9ab418717c59.js +11 -0
- sky/dashboard/out/_next/static/chunks/1272-1ef0bf0237faccdb.js +1 -0
- sky/dashboard/out/_next/static/chunks/1871-74503c8e80fd253b.js +6 -0
- sky/dashboard/out/_next/static/chunks/2260-7703229c33c5ebd5.js +1 -0
- sky/dashboard/out/_next/static/chunks/2350.fab69e61bac57b23.js +1 -0
- sky/dashboard/out/_next/static/chunks/2369.fc20f0c2c8ed9fe7.js +15 -0
- sky/dashboard/out/_next/static/chunks/2755.fff53c4a3fcae910.js +26 -0
- sky/dashboard/out/_next/static/chunks/3294.72362fa129305b19.js +1 -0
- sky/dashboard/out/_next/static/chunks/3785.ad6adaa2a0fa9768.js +1 -0
- sky/dashboard/out/_next/static/chunks/3850-ff4a9a69d978632b.js +1 -0
- sky/dashboard/out/_next/static/chunks/3937.210053269f121201.js +1 -0
- sky/dashboard/out/_next/static/chunks/4725.a830b5c9e7867c92.js +1 -0
- sky/dashboard/out/_next/static/chunks/4937.a2baa2df5572a276.js +15 -0
- sky/dashboard/out/_next/static/chunks/5739-d67458fcb1386c92.js +8 -0
- sky/dashboard/out/_next/static/chunks/6130-2be46d70a38f1e82.js +1 -0
- sky/dashboard/out/_next/static/chunks/616-3d59f75e2ccf9321.js +39 -0
- sky/dashboard/out/_next/static/chunks/6212-7bd06f60ba693125.js +13 -0
- sky/dashboard/out/_next/static/chunks/6601-06114c982db410b6.js +1 -0
- sky/dashboard/out/_next/static/chunks/6856-ef8ba11f96d8c4a3.js +1 -0
- sky/dashboard/out/_next/static/chunks/6989-01359c57e018caa4.js +1 -0
- sky/dashboard/out/_next/static/chunks/6990-32b6e2d3822301fa.js +1 -0
- sky/dashboard/out/_next/static/chunks/7359-c8d04e06886000b3.js +30 -0
- sky/dashboard/out/_next/static/chunks/7411-b15471acd2cba716.js +41 -0
- sky/dashboard/out/_next/static/chunks/7615-3301e838e5f25772.js +1 -0
- sky/dashboard/out/_next/static/chunks/8640.5b9475a2d18c5416.js +16 -0
- sky/dashboard/out/_next/static/chunks/8969-1e4613c651bf4051.js +1 -0
- sky/dashboard/out/_next/static/chunks/9025.fa408f3242e9028d.js +6 -0
- sky/dashboard/out/_next/static/chunks/9353-cff34f7e773b2e2b.js +1 -0
- sky/dashboard/out/_next/static/chunks/9360.7310982cf5a0dc79.js +31 -0
- sky/dashboard/out/_next/static/chunks/9847.3aaca6bb33455140.js +30 -0
- sky/dashboard/out/_next/static/chunks/fd9d1056-86323a29a8f7e46a.js +1 -0
- sky/dashboard/out/_next/static/chunks/framework-cf60a09ccd051a10.js +33 -0
- sky/dashboard/out/_next/static/chunks/main-app-587214043926b3cc.js +1 -0
- sky/dashboard/out/_next/static/chunks/main-f15ccb73239a3bf1.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/_app-bde01e4a2beec258.js +34 -0
- sky/dashboard/out/_next/static/chunks/pages/_error-c66a4e8afc46f17b.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-c736ead69c2d86ec.js +16 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-a37d2063af475a1c.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters-d44859594e6f8064.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/config-dfb9bf07b13045f4.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/index-444f1804401f04ea.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/infra/[context]-c0b5935149902e6f.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/infra-aed0ea19df7cf961.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-5796e8d6aea291a0.js +16 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs/pools/[pool]-6edeb7d06032adfc.js +21 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs-479dde13399cf270.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/users-5ab3b907622cf0fe.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/volumes-b84b948ff357c43e.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/workspace/new-3f88a1c7e86a3f86.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/workspaces/[name]-c5a3eeee1c218af1.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/workspaces-22b23febb3e89ce1.js +1 -0
- sky/dashboard/out/_next/static/chunks/webpack-2679be77fc08a2f8.js +1 -0
- sky/dashboard/out/_next/static/css/0748ce22df867032.css +3 -0
- sky/dashboard/out/_next/static/zB0ed6ge_W1MDszVHhijS/_buildManifest.js +1 -0
- sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
- sky/dashboard/out/clusters/[cluster].html +1 -1
- sky/dashboard/out/clusters.html +1 -1
- sky/dashboard/out/config.html +1 -0
- sky/dashboard/out/index.html +1 -1
- sky/dashboard/out/infra/[context].html +1 -0
- sky/dashboard/out/infra.html +1 -0
- sky/dashboard/out/jobs/[job].html +1 -1
- sky/dashboard/out/jobs/pools/[pool].html +1 -0
- sky/dashboard/out/jobs.html +1 -1
- sky/dashboard/out/users.html +1 -0
- sky/dashboard/out/volumes.html +1 -0
- sky/dashboard/out/workspace/new.html +1 -0
- sky/dashboard/out/workspaces/[name].html +1 -0
- sky/dashboard/out/workspaces.html +1 -0
- sky/data/data_utils.py +137 -1
- sky/data/mounting_utils.py +269 -84
- sky/data/storage.py +1451 -1807
- sky/data/storage_utils.py +43 -57
- sky/exceptions.py +132 -2
- sky/execution.py +206 -63
- sky/global_user_state.py +2374 -586
- sky/jobs/__init__.py +5 -0
- sky/jobs/client/sdk.py +242 -65
- sky/jobs/client/sdk_async.py +143 -0
- sky/jobs/constants.py +9 -8
- sky/jobs/controller.py +839 -277
- sky/jobs/file_content_utils.py +80 -0
- sky/jobs/log_gc.py +201 -0
- sky/jobs/recovery_strategy.py +398 -152
- sky/jobs/scheduler.py +315 -189
- sky/jobs/server/core.py +829 -255
- sky/jobs/server/server.py +156 -115
- sky/jobs/server/utils.py +136 -0
- sky/jobs/state.py +2092 -701
- sky/jobs/utils.py +1242 -160
- sky/logs/__init__.py +21 -0
- sky/logs/agent.py +108 -0
- sky/logs/aws.py +243 -0
- sky/logs/gcp.py +91 -0
- sky/metrics/__init__.py +0 -0
- sky/metrics/utils.py +443 -0
- sky/models.py +78 -1
- sky/optimizer.py +164 -70
- sky/provision/__init__.py +90 -4
- sky/provision/aws/config.py +147 -26
- sky/provision/aws/instance.py +135 -50
- sky/provision/azure/instance.py +10 -5
- sky/provision/common.py +13 -1
- sky/provision/cudo/cudo_machine_type.py +1 -1
- sky/provision/cudo/cudo_utils.py +14 -8
- sky/provision/cudo/cudo_wrapper.py +72 -71
- sky/provision/cudo/instance.py +10 -6
- sky/provision/do/instance.py +10 -6
- sky/provision/do/utils.py +4 -3
- sky/provision/docker_utils.py +114 -23
- sky/provision/fluidstack/instance.py +13 -8
- sky/provision/gcp/__init__.py +1 -0
- sky/provision/gcp/config.py +301 -19
- sky/provision/gcp/constants.py +218 -0
- sky/provision/gcp/instance.py +36 -8
- sky/provision/gcp/instance_utils.py +18 -4
- sky/provision/gcp/volume_utils.py +247 -0
- sky/provision/hyperbolic/__init__.py +12 -0
- sky/provision/hyperbolic/config.py +10 -0
- sky/provision/hyperbolic/instance.py +437 -0
- sky/provision/hyperbolic/utils.py +373 -0
- sky/provision/instance_setup.py +93 -14
- sky/provision/kubernetes/__init__.py +5 -0
- sky/provision/kubernetes/config.py +9 -52
- sky/provision/kubernetes/constants.py +17 -0
- sky/provision/kubernetes/instance.py +789 -247
- sky/provision/kubernetes/manifests/fusermount-server-daemonset.yaml +1 -2
- sky/provision/kubernetes/network.py +27 -17
- sky/provision/kubernetes/network_utils.py +40 -43
- sky/provision/kubernetes/utils.py +1192 -531
- sky/provision/kubernetes/volume.py +282 -0
- sky/provision/lambda_cloud/instance.py +22 -16
- sky/provision/nebius/constants.py +50 -0
- sky/provision/nebius/instance.py +19 -6
- sky/provision/nebius/utils.py +196 -91
- sky/provision/oci/instance.py +10 -5
- sky/provision/paperspace/instance.py +10 -7
- sky/provision/paperspace/utils.py +1 -1
- sky/provision/primeintellect/__init__.py +10 -0
- sky/provision/primeintellect/config.py +11 -0
- sky/provision/primeintellect/instance.py +454 -0
- sky/provision/primeintellect/utils.py +398 -0
- sky/provision/provisioner.py +110 -36
- sky/provision/runpod/__init__.py +5 -0
- sky/provision/runpod/instance.py +27 -6
- sky/provision/runpod/utils.py +51 -18
- sky/provision/runpod/volume.py +180 -0
- sky/provision/scp/__init__.py +15 -0
- sky/provision/scp/config.py +93 -0
- sky/provision/scp/instance.py +531 -0
- sky/provision/seeweb/__init__.py +11 -0
- sky/provision/seeweb/config.py +13 -0
- sky/provision/seeweb/instance.py +807 -0
- sky/provision/shadeform/__init__.py +11 -0
- sky/provision/shadeform/config.py +12 -0
- sky/provision/shadeform/instance.py +351 -0
- sky/provision/shadeform/shadeform_utils.py +83 -0
- sky/provision/ssh/__init__.py +18 -0
- sky/provision/vast/instance.py +13 -8
- sky/provision/vast/utils.py +10 -7
- sky/provision/vsphere/common/vim_utils.py +1 -2
- sky/provision/vsphere/instance.py +15 -10
- sky/provision/vsphere/vsphere_utils.py +9 -19
- sky/py.typed +0 -0
- sky/resources.py +844 -118
- sky/schemas/__init__.py +0 -0
- sky/schemas/api/__init__.py +0 -0
- sky/schemas/api/responses.py +225 -0
- sky/schemas/db/README +4 -0
- sky/schemas/db/env.py +90 -0
- sky/schemas/db/global_user_state/001_initial_schema.py +124 -0
- sky/schemas/db/global_user_state/002_add_workspace_to_cluster_history.py +35 -0
- sky/schemas/db/global_user_state/003_fix_initial_revision.py +61 -0
- sky/schemas/db/global_user_state/004_is_managed.py +34 -0
- sky/schemas/db/global_user_state/005_cluster_event.py +32 -0
- sky/schemas/db/global_user_state/006_provision_log.py +41 -0
- sky/schemas/db/global_user_state/007_cluster_event_request_id.py +34 -0
- sky/schemas/db/global_user_state/008_skylet_ssh_tunnel_metadata.py +34 -0
- sky/schemas/db/global_user_state/009_last_activity_and_launched_at.py +89 -0
- sky/schemas/db/global_user_state/010_save_ssh_key.py +66 -0
- sky/schemas/db/script.py.mako +28 -0
- sky/schemas/db/serve_state/001_initial_schema.py +67 -0
- sky/schemas/db/skypilot_config/001_initial_schema.py +30 -0
- sky/schemas/db/spot_jobs/001_initial_schema.py +97 -0
- sky/schemas/db/spot_jobs/002_cluster_pool.py +42 -0
- sky/schemas/db/spot_jobs/003_pool_hash.py +34 -0
- sky/schemas/db/spot_jobs/004_job_file_contents.py +42 -0
- sky/schemas/db/spot_jobs/005_logs_gc.py +38 -0
- sky/schemas/generated/__init__.py +0 -0
- sky/schemas/generated/autostopv1_pb2.py +36 -0
- sky/schemas/generated/autostopv1_pb2.pyi +43 -0
- sky/schemas/generated/autostopv1_pb2_grpc.py +146 -0
- sky/schemas/generated/jobsv1_pb2.py +86 -0
- sky/schemas/generated/jobsv1_pb2.pyi +254 -0
- sky/schemas/generated/jobsv1_pb2_grpc.py +542 -0
- sky/schemas/generated/managed_jobsv1_pb2.py +74 -0
- sky/schemas/generated/managed_jobsv1_pb2.pyi +278 -0
- sky/schemas/generated/managed_jobsv1_pb2_grpc.py +278 -0
- sky/schemas/generated/servev1_pb2.py +58 -0
- sky/schemas/generated/servev1_pb2.pyi +115 -0
- sky/schemas/generated/servev1_pb2_grpc.py +322 -0
- sky/serve/autoscalers.py +357 -5
- sky/serve/client/impl.py +310 -0
- sky/serve/client/sdk.py +47 -139
- sky/serve/client/sdk_async.py +130 -0
- sky/serve/constants.py +10 -8
- sky/serve/controller.py +64 -19
- sky/serve/load_balancer.py +106 -60
- sky/serve/load_balancing_policies.py +115 -1
- sky/serve/replica_managers.py +273 -162
- sky/serve/serve_rpc_utils.py +179 -0
- sky/serve/serve_state.py +554 -251
- sky/serve/serve_utils.py +733 -220
- sky/serve/server/core.py +66 -711
- sky/serve/server/impl.py +1093 -0
- sky/serve/server/server.py +21 -18
- sky/serve/service.py +133 -48
- sky/serve/service_spec.py +135 -16
- sky/serve/spot_placer.py +3 -0
- sky/server/auth/__init__.py +0 -0
- sky/server/auth/authn.py +50 -0
- sky/server/auth/loopback.py +38 -0
- sky/server/auth/oauth2_proxy.py +200 -0
- sky/server/common.py +475 -181
- sky/server/config.py +81 -23
- sky/server/constants.py +44 -6
- sky/server/daemons.py +229 -0
- sky/server/html/token_page.html +185 -0
- sky/server/metrics.py +160 -0
- sky/server/requests/executor.py +528 -138
- sky/server/requests/payloads.py +351 -17
- sky/server/requests/preconditions.py +21 -17
- sky/server/requests/process.py +112 -29
- sky/server/requests/request_names.py +120 -0
- sky/server/requests/requests.py +817 -224
- sky/server/requests/serializers/decoders.py +82 -31
- sky/server/requests/serializers/encoders.py +140 -22
- sky/server/requests/threads.py +106 -0
- sky/server/rest.py +417 -0
- sky/server/server.py +1290 -284
- sky/server/state.py +20 -0
- sky/server/stream_utils.py +345 -57
- sky/server/uvicorn.py +217 -3
- sky/server/versions.py +270 -0
- sky/setup_files/MANIFEST.in +5 -0
- sky/setup_files/alembic.ini +156 -0
- sky/setup_files/dependencies.py +136 -31
- sky/setup_files/setup.py +44 -42
- sky/sky_logging.py +102 -5
- sky/skylet/attempt_skylet.py +1 -0
- sky/skylet/autostop_lib.py +129 -8
- sky/skylet/configs.py +27 -20
- sky/skylet/constants.py +171 -19
- sky/skylet/events.py +105 -21
- sky/skylet/job_lib.py +335 -104
- sky/skylet/log_lib.py +297 -18
- sky/skylet/log_lib.pyi +44 -1
- sky/skylet/ray_patches/__init__.py +17 -3
- sky/skylet/ray_patches/autoscaler.py.diff +18 -0
- sky/skylet/ray_patches/cli.py.diff +19 -0
- sky/skylet/ray_patches/command_runner.py.diff +17 -0
- sky/skylet/ray_patches/log_monitor.py.diff +20 -0
- sky/skylet/ray_patches/resource_demand_scheduler.py.diff +32 -0
- sky/skylet/ray_patches/updater.py.diff +18 -0
- sky/skylet/ray_patches/worker.py.diff +41 -0
- sky/skylet/services.py +564 -0
- sky/skylet/skylet.py +63 -4
- sky/skylet/subprocess_daemon.py +103 -29
- sky/skypilot_config.py +506 -99
- sky/ssh_node_pools/__init__.py +1 -0
- sky/ssh_node_pools/core.py +135 -0
- sky/ssh_node_pools/server.py +233 -0
- sky/task.py +621 -137
- sky/templates/aws-ray.yml.j2 +10 -3
- sky/templates/azure-ray.yml.j2 +1 -1
- sky/templates/do-ray.yml.j2 +1 -1
- sky/templates/gcp-ray.yml.j2 +57 -0
- sky/templates/hyperbolic-ray.yml.j2 +67 -0
- sky/templates/jobs-controller.yaml.j2 +27 -24
- sky/templates/kubernetes-loadbalancer.yml.j2 +2 -0
- sky/templates/kubernetes-ray.yml.j2 +607 -51
- sky/templates/lambda-ray.yml.j2 +1 -1
- sky/templates/nebius-ray.yml.j2 +33 -12
- sky/templates/paperspace-ray.yml.j2 +1 -1
- sky/templates/primeintellect-ray.yml.j2 +71 -0
- sky/templates/runpod-ray.yml.j2 +9 -1
- sky/templates/scp-ray.yml.j2 +3 -50
- sky/templates/seeweb-ray.yml.j2 +108 -0
- sky/templates/shadeform-ray.yml.j2 +72 -0
- sky/templates/sky-serve-controller.yaml.j2 +22 -2
- sky/templates/websocket_proxy.py +178 -18
- sky/usage/usage_lib.py +18 -11
- sky/users/__init__.py +0 -0
- sky/users/model.conf +15 -0
- sky/users/permission.py +387 -0
- sky/users/rbac.py +121 -0
- sky/users/server.py +720 -0
- sky/users/token_service.py +218 -0
- sky/utils/accelerator_registry.py +34 -5
- sky/utils/admin_policy_utils.py +84 -38
- sky/utils/annotations.py +16 -5
- sky/utils/asyncio_utils.py +78 -0
- sky/utils/auth_utils.py +153 -0
- sky/utils/benchmark_utils.py +60 -0
- sky/utils/cli_utils/status_utils.py +159 -86
- sky/utils/cluster_utils.py +31 -9
- sky/utils/command_runner.py +354 -68
- sky/utils/command_runner.pyi +93 -3
- sky/utils/common.py +35 -8
- sky/utils/common_utils.py +310 -87
- sky/utils/config_utils.py +87 -5
- sky/utils/context.py +402 -0
- sky/utils/context_utils.py +222 -0
- sky/utils/controller_utils.py +264 -89
- sky/utils/dag_utils.py +31 -12
- sky/utils/db/__init__.py +0 -0
- sky/utils/db/db_utils.py +470 -0
- sky/utils/db/migration_utils.py +133 -0
- sky/utils/directory_utils.py +12 -0
- sky/utils/env_options.py +13 -0
- sky/utils/git.py +567 -0
- sky/utils/git_clone.sh +460 -0
- sky/utils/infra_utils.py +195 -0
- sky/utils/kubernetes/cleanup-tunnel.sh +62 -0
- sky/utils/kubernetes/config_map_utils.py +133 -0
- sky/utils/kubernetes/create_cluster.sh +13 -27
- sky/utils/kubernetes/delete_cluster.sh +10 -7
- sky/utils/kubernetes/deploy_remote_cluster.py +1299 -0
- sky/utils/kubernetes/exec_kubeconfig_converter.py +22 -31
- sky/utils/kubernetes/generate_kind_config.py +6 -66
- sky/utils/kubernetes/generate_kubeconfig.sh +4 -1
- sky/utils/kubernetes/gpu_labeler.py +5 -5
- sky/utils/kubernetes/kubernetes_deploy_utils.py +354 -47
- sky/utils/kubernetes/ssh-tunnel.sh +379 -0
- sky/utils/kubernetes/ssh_utils.py +221 -0
- sky/utils/kubernetes_enums.py +8 -15
- sky/utils/lock_events.py +94 -0
- sky/utils/locks.py +368 -0
- sky/utils/log_utils.py +300 -6
- sky/utils/perf_utils.py +22 -0
- sky/utils/resource_checker.py +298 -0
- sky/utils/resources_utils.py +249 -32
- sky/utils/rich_utils.py +213 -37
- sky/utils/schemas.py +905 -147
- sky/utils/serialize_utils.py +16 -0
- sky/utils/status_lib.py +10 -0
- sky/utils/subprocess_utils.py +38 -15
- sky/utils/tempstore.py +70 -0
- sky/utils/timeline.py +24 -52
- sky/utils/ux_utils.py +84 -15
- sky/utils/validator.py +11 -1
- sky/utils/volume.py +86 -0
- sky/utils/yaml_utils.py +111 -0
- sky/volumes/__init__.py +13 -0
- sky/volumes/client/__init__.py +0 -0
- sky/volumes/client/sdk.py +149 -0
- sky/volumes/server/__init__.py +0 -0
- sky/volumes/server/core.py +258 -0
- sky/volumes/server/server.py +122 -0
- sky/volumes/volume.py +212 -0
- sky/workspaces/__init__.py +0 -0
- sky/workspaces/core.py +655 -0
- sky/workspaces/server.py +101 -0
- sky/workspaces/utils.py +56 -0
- skypilot_nightly-1.0.0.dev20251107.dist-info/METADATA +675 -0
- skypilot_nightly-1.0.0.dev20251107.dist-info/RECORD +594 -0
- {skypilot_nightly-1.0.0.dev20250509.dist-info → skypilot_nightly-1.0.0.dev20251107.dist-info}/WHEEL +1 -1
- sky/benchmark/benchmark_state.py +0 -256
- sky/benchmark/benchmark_utils.py +0 -641
- sky/clouds/service_catalog/constants.py +0 -7
- sky/dashboard/out/_next/static/LksQgChY5izXjokL3LcEu/_buildManifest.js +0 -1
- sky/dashboard/out/_next/static/chunks/236-f49500b82ad5392d.js +0 -6
- sky/dashboard/out/_next/static/chunks/312-c3c8845990db8ffc.js +0 -15
- sky/dashboard/out/_next/static/chunks/37-0a572fe0dbb89c4d.js +0 -6
- sky/dashboard/out/_next/static/chunks/678-206dddca808e6d16.js +0 -59
- sky/dashboard/out/_next/static/chunks/845-0f8017370869e269.js +0 -1
- sky/dashboard/out/_next/static/chunks/979-7bf73a4c7cea0f5c.js +0 -1
- sky/dashboard/out/_next/static/chunks/fd9d1056-2821b0f0cabcd8bd.js +0 -1
- sky/dashboard/out/_next/static/chunks/framework-87d061ee6ed71b28.js +0 -33
- sky/dashboard/out/_next/static/chunks/main-app-241eb28595532291.js +0 -1
- sky/dashboard/out/_next/static/chunks/main-e0e2335212e72357.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/_app-e6b013bc3f77ad60.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/_error-1be831200e60c5c0.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-e15db85d0ea1fbe1.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-f383db7389368ea7.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/clusters-a93b93e10b8b074e.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/index-f9f039532ca8cbc4.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-03f279c6741fb48b.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/jobs-a75029b67aab6a2e.js +0 -1
- sky/dashboard/out/_next/static/chunks/webpack-830f59b8404e96b8.js +0 -1
- sky/dashboard/out/_next/static/css/c6933bbb2ce7f4dd.css +0 -3
- sky/jobs/dashboard/dashboard.py +0 -223
- sky/jobs/dashboard/static/favicon.ico +0 -0
- sky/jobs/dashboard/templates/index.html +0 -831
- sky/jobs/server/dashboard_utils.py +0 -69
- sky/skylet/providers/scp/__init__.py +0 -2
- sky/skylet/providers/scp/config.py +0 -149
- sky/skylet/providers/scp/node_provider.py +0 -578
- sky/templates/kubernetes-ssh-jump.yml.j2 +0 -94
- sky/utils/db_utils.py +0 -100
- sky/utils/kubernetes/deploy_remote_cluster.sh +0 -308
- sky/utils/kubernetes/ssh_jump_lifecycle_manager.py +0 -191
- skypilot_nightly-1.0.0.dev20250509.dist-info/METADATA +0 -361
- skypilot_nightly-1.0.0.dev20250509.dist-info/RECORD +0 -396
- /sky/{clouds/service_catalog → catalog}/config.py +0 -0
- /sky/{benchmark → catalog/data_fetchers}/__init__.py +0 -0
- /sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_azure.py +0 -0
- /sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_fluidstack.py +0 -0
- /sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_ibm.py +0 -0
- /sky/{clouds/service_catalog/data_fetchers → client/cli}/__init__.py +0 -0
- /sky/dashboard/out/_next/static/{LksQgChY5izXjokL3LcEu → zB0ed6ge_W1MDszVHhijS}/_ssgManifest.js +0 -0
- {skypilot_nightly-1.0.0.dev20250509.dist-info → skypilot_nightly-1.0.0.dev20251107.dist-info}/entry_points.txt +0 -0
- {skypilot_nightly-1.0.0.dev20250509.dist-info → skypilot_nightly-1.0.0.dev20251107.dist-info}/licenses/LICENSE +0 -0
- {skypilot_nightly-1.0.0.dev20250509.dist-info → skypilot_nightly-1.0.0.dev20251107.dist-info}/top_level.txt +0 -0
|
@@ -33,14 +33,11 @@ provider:
|
|
|
33
33
|
networking_mode: {{k8s_networking_mode}}
|
|
34
34
|
|
|
35
35
|
# We use internal IPs since we set up a port-forward between the kubernetes
|
|
36
|
-
# cluster and the local machine
|
|
37
|
-
# head node.
|
|
36
|
+
# cluster and the local machine.
|
|
38
37
|
use_internal_ips: true
|
|
39
38
|
|
|
40
39
|
timeout: {{timeout}}
|
|
41
40
|
|
|
42
|
-
ssh_jump_image: {{k8s_ssh_jump_image}}
|
|
43
|
-
|
|
44
41
|
# Namespace used to host SkyPilot system components, such as fuse device
|
|
45
42
|
# manager.
|
|
46
43
|
skypilot_system_namespace: {{k8s_skypilot_system_namespace}}
|
|
@@ -212,7 +209,9 @@ provider:
|
|
|
212
209
|
metadata:
|
|
213
210
|
labels:
|
|
214
211
|
parent: skypilot
|
|
212
|
+
# TODO (kyuds): remove this label for v0.12.0, as skypilot-cluster label is deprecated in favor of skypilot-cluster-name.
|
|
215
213
|
skypilot-cluster: {{cluster_name_on_cloud}}
|
|
214
|
+
skypilot-cluster-name: {{cluster_name_on_cloud}}
|
|
216
215
|
skypilot-user: {{ user }}
|
|
217
216
|
name: {{cluster_name_on_cloud}}-head-ssh
|
|
218
217
|
spec:
|
|
@@ -230,7 +229,9 @@ provider:
|
|
|
230
229
|
metadata:
|
|
231
230
|
labels:
|
|
232
231
|
parent: skypilot
|
|
232
|
+
# TODO (kyuds): remove this label for v0.12.0, as skypilot-cluster label is deprecated in favor of skypilot-cluster-name.
|
|
233
233
|
skypilot-cluster: {{cluster_name_on_cloud}}
|
|
234
|
+
skypilot-cluster-name: {{cluster_name_on_cloud}}
|
|
234
235
|
skypilot-user: {{ user }}
|
|
235
236
|
# NOTE: If you're running multiple Ray clusters with services
|
|
236
237
|
# on one Kubernetes cluster, they must have unique service
|
|
@@ -243,6 +244,24 @@ provider:
|
|
|
243
244
|
# This selector must match the head node pod's selector below.
|
|
244
245
|
selector:
|
|
245
246
|
component: {{cluster_name_on_cloud}}-head
|
|
247
|
+
# Headless services mapping hostnames to the rest of the worker nodes (one service per worker)
|
|
248
|
+
{% for worker_id in range(1, num_nodes) %}
|
|
249
|
+
- apiVersion: v1
|
|
250
|
+
kind: Service
|
|
251
|
+
metadata:
|
|
252
|
+
labels:
|
|
253
|
+
parent: skypilot
|
|
254
|
+
# TODO (kyuds): remove this label for v0.12.0, as skypilot-cluster label is deprecated in favor of skypilot-cluster-name.
|
|
255
|
+
skypilot-cluster: {{cluster_name_on_cloud}}
|
|
256
|
+
skypilot-cluster-name: {{cluster_name_on_cloud}}
|
|
257
|
+
skypilot-user: {{ user }}
|
|
258
|
+
name: {{cluster_name_on_cloud}}-worker{{ worker_id }}
|
|
259
|
+
spec:
|
|
260
|
+
selector:
|
|
261
|
+
component: {{cluster_name_on_cloud}}-worker{{ worker_id }}
|
|
262
|
+
clusterIP: None
|
|
263
|
+
{% endfor %}
|
|
264
|
+
|
|
246
265
|
|
|
247
266
|
# Specify the pod type for the ray head node (as configured below).
|
|
248
267
|
head_node_type: ray_head_default
|
|
@@ -255,13 +274,12 @@ available_node_types:
|
|
|
255
274
|
metadata:
|
|
256
275
|
# name will be filled in the provisioner
|
|
257
276
|
# head node name will be {{cluster_name_on_cloud}}-head, which will match the head node service selector above if a head node
|
|
258
|
-
# service is required.
|
|
277
|
+
# service is required. Worker nodes are named {{cluster_name_on_cloud}}-worker{{ node_id }}
|
|
259
278
|
labels:
|
|
260
279
|
parent: skypilot
|
|
261
280
|
# component will be set for the head node pod to be the same as the head node service selector above if a
|
|
281
|
+
# TODO (kyuds): remove this label for v0.12.0, as skypilot-cluster label is deprecated in favor of skypilot-cluster-name.
|
|
262
282
|
skypilot-cluster: {{cluster_name_on_cloud}}
|
|
263
|
-
# Identifies the SSH jump pod used by this pod. Used in life cycle management of the ssh jump pod.
|
|
264
|
-
skypilot-ssh-jump: {{k8s_ssh_jump_name}}
|
|
265
283
|
skypilot-user: {{ user }}
|
|
266
284
|
# Custom tags for the pods
|
|
267
285
|
{%- for label_key, label_value in labels.items() %}
|
|
@@ -273,14 +291,100 @@ available_node_types:
|
|
|
273
291
|
{% if (k8s_acc_label_key is not none and k8s_acc_label_values is not none) %}
|
|
274
292
|
skypilot-binpack: "gpu"
|
|
275
293
|
{% endif %}
|
|
294
|
+
{% if k8s_kueue_local_queue_name %}
|
|
295
|
+
kueue.x-k8s.io/queue-name: {{k8s_kueue_local_queue_name}}
|
|
296
|
+
kueue.x-k8s.io/pod-group-name: {{cluster_name_on_cloud}}
|
|
297
|
+
{% endif %}
|
|
298
|
+
{% if k8s_kueue_local_queue_name or k8s_enable_gpudirect_tcpx or k8s_enable_gpudirect_tcpxo or k8s_enable_gpudirect_rdma %}
|
|
299
|
+
annotations:
|
|
300
|
+
{% if k8s_kueue_local_queue_name %}
|
|
301
|
+
kueue.x-k8s.io/retriable-in-group: "false"
|
|
302
|
+
kueue.x-k8s.io/pod-group-total-count: "{{ num_nodes|string }}"
|
|
303
|
+
{% if k8s_max_run_duration_seconds %}
|
|
304
|
+
provreq.kueue.x-k8s.io/maxRunDurationSeconds: "{{k8s_max_run_duration_seconds|string}}"
|
|
305
|
+
{% endif %}
|
|
306
|
+
{% endif %}
|
|
307
|
+
# https://cloud.google.com/kubernetes-engine/docs/how-to/gpu-bandwidth-gpudirect-tcpx
|
|
308
|
+
# Values from google cloud guide
|
|
309
|
+
{% if k8s_enable_gpudirect_tcpx %}
|
|
310
|
+
devices.gke.io/container.tcpx-daemon: |+
|
|
311
|
+
- path: /dev/nvidia0
|
|
312
|
+
- path: /dev/nvidia1
|
|
313
|
+
- path: /dev/nvidia2
|
|
314
|
+
- path: /dev/nvidia3
|
|
315
|
+
- path: /dev/nvidia4
|
|
316
|
+
- path: /dev/nvidia5
|
|
317
|
+
- path: /dev/nvidia6
|
|
318
|
+
- path: /dev/nvidia7
|
|
319
|
+
- path: /dev/nvidiactl
|
|
320
|
+
- path: /dev/nvidia-uvm
|
|
321
|
+
networking.gke.io/default-interface: 'eth0'
|
|
322
|
+
networking.gke.io/interfaces: |
|
|
323
|
+
[
|
|
324
|
+
{"interfaceName":"eth0","network":"default"},
|
|
325
|
+
{"interfaceName":"eth1","network":"vpc1"},
|
|
326
|
+
{"interfaceName":"eth2","network":"vpc2"},
|
|
327
|
+
{"interfaceName":"eth3","network":"vpc3"},
|
|
328
|
+
{"interfaceName":"eth4","network":"vpc4"}
|
|
329
|
+
]
|
|
330
|
+
{% endif %}
|
|
331
|
+
{% if k8s_enable_gpudirect_tcpxo %}
|
|
332
|
+
devices.gke.io/container.tcpxo-daemon: |+
|
|
333
|
+
- path: /dev/nvidia0
|
|
334
|
+
- path: /dev/nvidia1
|
|
335
|
+
- path: /dev/nvidia2
|
|
336
|
+
- path: /dev/nvidia3
|
|
337
|
+
- path: /dev/nvidia4
|
|
338
|
+
- path: /dev/nvidia5
|
|
339
|
+
- path: /dev/nvidia6
|
|
340
|
+
- path: /dev/nvidia7
|
|
341
|
+
- path: /dev/nvidiactl
|
|
342
|
+
- path: /dev/nvidia-uvm
|
|
343
|
+
- path: /dev/dmabuf_import_helper
|
|
344
|
+
networking.gke.io/default-interface: 'eth0'
|
|
345
|
+
networking.gke.io/interfaces: |
|
|
346
|
+
[
|
|
347
|
+
{"interfaceName":"eth0","network":"default"},
|
|
348
|
+
{"interfaceName":"eth1","network":"vpc1"},
|
|
349
|
+
{"interfaceName":"eth2","network":"vpc2"},
|
|
350
|
+
{"interfaceName":"eth3","network":"vpc3"},
|
|
351
|
+
{"interfaceName":"eth4","network":"vpc4"},
|
|
352
|
+
{"interfaceName":"eth5","network":"vpc5"},
|
|
353
|
+
{"interfaceName":"eth6","network":"vpc6"},
|
|
354
|
+
{"interfaceName":"eth7","network":"vpc7"},
|
|
355
|
+
{"interfaceName":"eth8","network":"vpc8"}
|
|
356
|
+
]
|
|
357
|
+
{% endif %}
|
|
358
|
+
{% if k8s_enable_gpudirect_rdma %}
|
|
359
|
+
networking.gke.io/default-interface: 'eth0'
|
|
360
|
+
networking.gke.io/interfaces: |
|
|
361
|
+
[
|
|
362
|
+
{"interfaceName":"eth0","network":"default"},
|
|
363
|
+
{"interfaceName":"eth1","network":"gvnic-1"},
|
|
364
|
+
{"interfaceName":"eth2","network":"rdma-0"},
|
|
365
|
+
{"interfaceName":"eth3","network":"rdma-1"},
|
|
366
|
+
{"interfaceName":"eth4","network":"rdma-2"},
|
|
367
|
+
{"interfaceName":"eth5","network":"rdma-3"},
|
|
368
|
+
{"interfaceName":"eth6","network":"rdma-4"},
|
|
369
|
+
{"interfaceName":"eth7","network":"rdma-5"},
|
|
370
|
+
{"interfaceName":"eth8","network":"rdma-6"},
|
|
371
|
+
{"interfaceName":"eth9","network":"rdma-7"}
|
|
372
|
+
]
|
|
373
|
+
{% endif %}
|
|
374
|
+
{% endif %}
|
|
276
375
|
spec:
|
|
277
376
|
# serviceAccountName: skypilot-service-account
|
|
278
377
|
serviceAccountName: {{k8s_service_account_name}}
|
|
279
378
|
automountServiceAccountToken: {{k8s_automount_sa_token}}
|
|
280
379
|
restartPolicy: {{ "Always" if high_availability else "Never" }}
|
|
380
|
+
{% if volume_mounts %}
|
|
381
|
+
securityContext:
|
|
382
|
+
fsGroup: 1000
|
|
383
|
+
fsGroupChangePolicy: OnRootMismatch
|
|
384
|
+
{% endif %}
|
|
281
385
|
|
|
282
386
|
# Add node selector if GPU/TPUs are requested:
|
|
283
|
-
{% if (k8s_topology_label_key is not none and k8s_topology_label_value is not none) or (k8s_spot_label_key is not none) %}
|
|
387
|
+
{% if (k8s_topology_label_key is not none and k8s_topology_label_value is not none) or (k8s_spot_label_key is not none) or (k8s_enable_flex_start) %}
|
|
284
388
|
nodeSelector:
|
|
285
389
|
{% if k8s_topology_label_key is not none and k8s_topology_label_value is not none %}
|
|
286
390
|
{{k8s_topology_label_key}}: {{k8s_topology_label_value}}
|
|
@@ -288,6 +392,9 @@ available_node_types:
|
|
|
288
392
|
{% if k8s_spot_label_key is not none %}
|
|
289
393
|
{{k8s_spot_label_key}}: {{k8s_spot_label_value|tojson}}
|
|
290
394
|
{% endif %}
|
|
395
|
+
{% if k8s_enable_flex_start %}
|
|
396
|
+
cloud.google.com/gke-flex-start: "true"
|
|
397
|
+
{% endif %}
|
|
291
398
|
{% endif %}
|
|
292
399
|
{% if (k8s_acc_label_key is not none and k8s_acc_label_values is not none) or (avoid_label_keys is not none) %}
|
|
293
400
|
affinity:
|
|
@@ -339,9 +446,6 @@ available_node_types:
|
|
|
339
446
|
# object store. If you do not provide this, Ray will fall back to
|
|
340
447
|
# /tmp, which causes slowdowns if it is not a shared memory volume.
|
|
341
448
|
volumes:
|
|
342
|
-
- name: secret-volume
|
|
343
|
-
secret:
|
|
344
|
-
secretName: {{k8s_ssh_key_secret_name}}
|
|
345
449
|
- name: dshm
|
|
346
450
|
emptyDir:
|
|
347
451
|
medium: Memory
|
|
@@ -356,19 +460,176 @@ available_node_types:
|
|
|
356
460
|
persistentVolumeClaim:
|
|
357
461
|
claimName: {{cluster_name_on_cloud}}-{{k8s_high_availability_deployment_volume_mount_name}}
|
|
358
462
|
{% endif %}
|
|
463
|
+
{% for volume_mount in volume_mounts %}
|
|
464
|
+
- name: {{volume_mount.name}}
|
|
465
|
+
persistentVolumeClaim:
|
|
466
|
+
claimName: {{volume_mount.volume_name_on_cloud}}
|
|
467
|
+
{% endfor %}
|
|
468
|
+
{% if k8s_enable_gpudirect_tcpx %}
|
|
469
|
+
- name: libraries
|
|
470
|
+
hostPath:
|
|
471
|
+
path: /home/kubernetes/bin/nvidia/lib64
|
|
472
|
+
- name: tcpx-socket
|
|
473
|
+
emptyDir: {}
|
|
474
|
+
- name: sys
|
|
475
|
+
hostPath:
|
|
476
|
+
path: /sys
|
|
477
|
+
- name: proc-sys
|
|
478
|
+
hostPath:
|
|
479
|
+
path: /proc/sys
|
|
480
|
+
{% endif %}
|
|
481
|
+
{% if k8s_enable_gpudirect_tcpxo %}
|
|
482
|
+
- name: libraries
|
|
483
|
+
hostPath:
|
|
484
|
+
path: /home/kubernetes/bin/nvidia
|
|
485
|
+
- name: sys
|
|
486
|
+
hostPath:
|
|
487
|
+
path: /sys
|
|
488
|
+
- name: proc-sys
|
|
489
|
+
hostPath:
|
|
490
|
+
path: /proc/sys
|
|
491
|
+
- name: aperture-devices
|
|
492
|
+
hostPath:
|
|
493
|
+
path: /dev/aperture_devices
|
|
494
|
+
{% endif %}
|
|
495
|
+
{% if k8s_enable_gpudirect_rdma %}
|
|
496
|
+
- name: library-dir-host
|
|
497
|
+
hostPath:
|
|
498
|
+
path: /home/kubernetes/bin/nvidia
|
|
499
|
+
- name: gib
|
|
500
|
+
hostPath:
|
|
501
|
+
path: /home/kubernetes/bin/gib
|
|
502
|
+
{% endif %}
|
|
359
503
|
containers:
|
|
360
504
|
- name: ray-node
|
|
361
|
-
imagePullPolicy:
|
|
505
|
+
imagePullPolicy: Always
|
|
362
506
|
image: {{image_id}}
|
|
363
507
|
env:
|
|
364
508
|
- name: SKYPILOT_POD_NODE_TYPE
|
|
365
509
|
valueFrom:
|
|
366
510
|
fieldRef:
|
|
367
511
|
fieldPath: metadata.labels['ray-node-type']
|
|
512
|
+
- name: SKYPILOT_POD_CPU_CORE_LIMIT
|
|
513
|
+
valueFrom:
|
|
514
|
+
resourceFieldRef:
|
|
515
|
+
containerName: ray-node
|
|
516
|
+
resource: requests.cpu
|
|
517
|
+
- name: SKYPILOT_POD_MEMORY_BYTES_LIMIT
|
|
518
|
+
valueFrom:
|
|
519
|
+
resourceFieldRef:
|
|
520
|
+
containerName: ray-node
|
|
521
|
+
resource: requests.memory
|
|
368
522
|
{% for key, value in k8s_env_vars.items() if k8s_env_vars is not none %}
|
|
369
523
|
- name: {{ key }}
|
|
370
524
|
value: {{ value }}
|
|
371
525
|
{% endfor %}
|
|
526
|
+
# https://cloud.google.com/kubernetes-engine/docs/how-to/gpu-bandwidth-gpudirect-tcpx#environment-variables-nccl
|
|
527
|
+
# Page recommends setting NCCL values for GPUDirect TCPX for best performance.
|
|
528
|
+
{% if k8s_enable_gpudirect_tcpx %}
|
|
529
|
+
- name: LD_LIBRARY_PATH
|
|
530
|
+
value: /usr/local/nvidia/lib64:/usr/local/tcpx/lib64
|
|
531
|
+
- name: NCCL_GPUDIRECTTCPX_SOCKET_IFNAME
|
|
532
|
+
value: eth1,eth2,eth3,eth4
|
|
533
|
+
- name: NCCL_GPUDIRECTTCPX_CTRL_DEV
|
|
534
|
+
value: eth0
|
|
535
|
+
- name: NCCL_GPUDIRECTTCPX_TX_BINDINGS
|
|
536
|
+
value: "eth1:8-21,112-125;eth2:8-21,112-125;eth3:60-73,164-177;eth4:60-73,164-177"
|
|
537
|
+
- name: NCCL_GPUDIRECTTCPX_RX_BINDINGS
|
|
538
|
+
value: "eth1:22-35,126-139;eth2:22-35,126-139;eth3:74-87,178-191;eth4:74-87,178-191"
|
|
539
|
+
- name: NCCL_GPUDIRECTTCPX_PROGRAM_FLOW_STEERING_WAIT_MICROS
|
|
540
|
+
value: "500000"
|
|
541
|
+
- name: NCCL_GPUDIRECTTCPX_UNIX_CLIENT_PREFIX
|
|
542
|
+
value: "/tmp"
|
|
543
|
+
- name: NCCL_GPUDIRECTTCPX_FORCE_ACK
|
|
544
|
+
value: "0"
|
|
545
|
+
- name: NCCL_SOCKET_IFNAME
|
|
546
|
+
value: eth0
|
|
547
|
+
- name: NCCL_CROSS_NIC
|
|
548
|
+
value: "0"
|
|
549
|
+
- name: NCCL_ALGO
|
|
550
|
+
value: Ring
|
|
551
|
+
- name: NCCL_PROTO
|
|
552
|
+
value: Simple
|
|
553
|
+
- name: NCCL_NSOCKS_PERTHREAD
|
|
554
|
+
value: "4"
|
|
555
|
+
- name: NCCL_SOCKET_NTHREADS
|
|
556
|
+
value: "1"
|
|
557
|
+
- name: NCCL_NET_GDR_LEVEL
|
|
558
|
+
value: PIX
|
|
559
|
+
- name: NCCL_DYNAMIC_CHUNK_SIZE
|
|
560
|
+
value: "524288"
|
|
561
|
+
- name: NCCL_P2P_PXN_LEVEL
|
|
562
|
+
value: "0"
|
|
563
|
+
- name: NCCL_P2P_NET_CHUNKSIZE
|
|
564
|
+
value: "524288"
|
|
565
|
+
- name: NCCL_P2P_PCI_CHUNKSIZE
|
|
566
|
+
value: "524288"
|
|
567
|
+
- name: NCCL_P2P_NVL_CHUNKSIZE
|
|
568
|
+
value: "1048576"
|
|
569
|
+
- name: NCCL_BUFFSIZE
|
|
570
|
+
value: "4194304"
|
|
571
|
+
- name: NCCL_MAX_NCHANNELS
|
|
572
|
+
value: "8"
|
|
573
|
+
- name: NCCL_MIN_NCHANNELS
|
|
574
|
+
value: "8"
|
|
575
|
+
- name: CUDA_VISIBLE_DEVICES
|
|
576
|
+
value: "0,1,2,3,4,5,6,7"
|
|
577
|
+
{% endif %}
|
|
578
|
+
{% if k8s_enable_gpudirect_tcpxo %}
|
|
579
|
+
- name: LD_LIBRARY_PATH
|
|
580
|
+
value: /usr/local/nvidia/lib64
|
|
581
|
+
- name: NCCL_FASTRAK_LLCM_DEVICE_DIRECTORY
|
|
582
|
+
value: /dev/aperture_devices
|
|
583
|
+
- name: NCCL_FASTRAK_CTRL_DEV
|
|
584
|
+
value: eth0
|
|
585
|
+
- name: NCCL_FASTRAK_IFNAME
|
|
586
|
+
value: eth1,eth2,eth3,eth4,eth5,eth6,eth7,eth8
|
|
587
|
+
- name: NCCL_SOCKET_IFNAME
|
|
588
|
+
value: eth0
|
|
589
|
+
- name: NCCL_CROSS_NIC
|
|
590
|
+
value: "0"
|
|
591
|
+
- name: NCCL_ALGO
|
|
592
|
+
value: Ring,Tree
|
|
593
|
+
- name: NCCL_PROTO
|
|
594
|
+
value: Simple,LL128
|
|
595
|
+
- name: NCCL_MIN_NCHANNELS
|
|
596
|
+
value: "4"
|
|
597
|
+
- name: NCCL_TUNER_PLUGIN
|
|
598
|
+
value: libnccl-tuner.so
|
|
599
|
+
- name: NCCL_TUNER_CONFIG_PATH
|
|
600
|
+
value: /usr/local/nvidia/lib64/a3plus_tuner_config.textproto
|
|
601
|
+
- name: CUDA_VISIBLE_DEVICES
|
|
602
|
+
value: "0,1,2,3,4,5,6,7"
|
|
603
|
+
{% endif %}
|
|
604
|
+
{% if k8s_enable_gpudirect_rdma %}
|
|
605
|
+
- name: LD_LIBRARY_PATH
|
|
606
|
+
value: /usr/local/nvidia/lib64
|
|
607
|
+
- name: NCCL_NET
|
|
608
|
+
value: gIB
|
|
609
|
+
- name: NCCL_CROSS_NIC
|
|
610
|
+
value: "0"
|
|
611
|
+
- name: NCCL_NET_GDR_LEVEL
|
|
612
|
+
value: PIX
|
|
613
|
+
- name: NCCL_P2P_NET_CHUNKSIZE
|
|
614
|
+
value: "131072"
|
|
615
|
+
- name: NCCL_NVLS_CHUNKSIZE
|
|
616
|
+
value: "524288"
|
|
617
|
+
- name: NCCL_IB_ADAPTIVE_ROUTING
|
|
618
|
+
value: "1"
|
|
619
|
+
- name: NCCL_IB_QPS_PER_CONNECTION
|
|
620
|
+
value: "4"
|
|
621
|
+
- name: NCCL_IB_TC
|
|
622
|
+
value: "52"
|
|
623
|
+
- name: NCCL_IB_FIFO_TC
|
|
624
|
+
value: "84"
|
|
625
|
+
{% if k8s_enable_gpudirect_rdma_a4 %}
|
|
626
|
+
- name: NCCL_TUNER_CONFIG_PATH
|
|
627
|
+
value: /usr/local/gib/configs/tuner_config_a4.txtpb
|
|
628
|
+
{% else %}
|
|
629
|
+
- name: NCCL_TUNER_CONFIG_PATH
|
|
630
|
+
value: /usr/local/gib/configs/tuner_config_a3u.txtpb
|
|
631
|
+
{% endif %}
|
|
632
|
+
{% endif %}
|
|
372
633
|
{% if k8s_fuse_device_required %}
|
|
373
634
|
- name: FUSERMOUNT_SHARED_DIR
|
|
374
635
|
value: {{k8s_fusermount_shared_dir}}
|
|
@@ -378,13 +639,9 @@ available_node_types:
|
|
|
378
639
|
command: ["/bin/bash", "-c", "--"]
|
|
379
640
|
args:
|
|
380
641
|
- |
|
|
381
|
-
#
|
|
382
|
-
#
|
|
383
|
-
|
|
384
|
-
# TODO: Remove this marker file and it's usage in setup_commands
|
|
385
|
-
# after v0.10.0 release.
|
|
386
|
-
touch /tmp/skypilot_is_nimbus
|
|
387
|
-
|
|
642
|
+
# Set -x to print the commands and their arguments as they are executed.
|
|
643
|
+
# Useful for debugging.
|
|
644
|
+
set -x
|
|
388
645
|
# Helper function to conditionally use sudo
|
|
389
646
|
# TODO(zhwu): consolidate the two prefix_cmd and sudo replacements
|
|
390
647
|
prefix_cmd() { if [ $(id -u) -ne 0 ]; then echo "sudo"; else echo ""; fi; }
|
|
@@ -395,14 +652,138 @@ available_node_types:
|
|
|
395
652
|
# STEP 1: Run apt update, install missing packages, and set up ssh.
|
|
396
653
|
(
|
|
397
654
|
(
|
|
398
|
-
|
|
399
|
-
|
|
655
|
+
# For backwards compatibility, we put a marker file in the pod
|
|
656
|
+
# to indicate that the apt ssh setup step will write a completion
|
|
657
|
+
# marker file (/tmp/apt_ssh_setup_complete) to the pod.
|
|
658
|
+
# TODO: Remove this marker file and its usage in setup_commands
|
|
659
|
+
# after v0.11.0 release.
|
|
660
|
+
touch /tmp/apt_ssh_setup_started
|
|
661
|
+
|
|
662
|
+
# Helper: run apt-get update with retries
|
|
663
|
+
apt_update_with_retries() {
  # Run `apt-get update` up to 3 times with exponential backoff (1s, 2s).
  # All output is appended to /tmp/apt-update.log.
  # Returns 0 as soon as one attempt succeeds, 1 if every attempt fails.
  local log=/tmp/apt-update.log
  local tries=3
  local delay=1
  local i
  local status=1
  # Remember whether errexit was active so it is restored on return instead
  # of being forced on for callers that deliberately disabled it.
  local errexit_was_on=0
  case "$-" in *e*) errexit_was_on=1 ;; esac
  # do not fail the whole shell; we handle return codes
  set +e
  for i in $(seq 1 "$tries"); do
    if DEBIAN_FRONTEND=noninteractive $(prefix_cmd) apt-get update >> "$log" 2>&1; then
      status=0
      break
    fi
    if [ "$i" -lt "$tries" ]; then
      echo "apt-get update attempt $i/$tries failed; retrying in ${delay}s" >> "$log"
      sleep "$delay"
      delay=$((delay * 2))
    else
      # Last attempt: nothing left to retry, so do not sleep.
      echo "apt-get update attempt $i/$tries failed; giving up" >> "$log"
    fi
  done
  if [ "$errexit_was_on" = "1" ]; then set -e; fi
  return "$status"
}
|
|
679
|
+
apt_install_with_retries() {
  # Install the packages given as arguments with `apt-get install -y`,
  # trying up to 3 times with exponential backoff between attempts.
  # After each failed attempt it runs `apt-get -f install` and `apt-get clean`
  # (best effort) and logs to /tmp/apt-update.log.
  # Returns 0 on success or when no packages are given, 1 if all attempts fail.
  local packages="$*"
  if [ -z "$packages" ]; then
    return 0
  fi
  local log=/tmp/apt-update.log
  local tries=3
  local delay=1
  local i
  local status=1
  # Remember whether errexit was active so it is restored on return instead
  # of being forced on for callers that deliberately disabled it.
  local errexit_was_on=0
  case "$-" in *e*) errexit_was_on=1 ;; esac
  set +e
  for i in $(seq 1 "$tries"); do
    # $packages is intentionally unquoted: it must split into one word per package.
    if DEBIAN_FRONTEND=noninteractive $(prefix_cmd) apt-get install -y -o Dpkg::Options::="--force-confdef" -o Dpkg::Options::="--force-confold" $packages; then
      status=0
      break
    fi
    echo "apt-get install failed for: $packages (attempt $i/$tries). Running -f install and retrying..." >> "$log"
    DEBIAN_FRONTEND=noninteractive $(prefix_cmd) apt-get -f install -y >> "$log" 2>&1 || true
    DEBIAN_FRONTEND=noninteractive $(prefix_cmd) apt-get clean >> "$log" 2>&1 || true
    # Only back off when another attempt will follow.
    if [ "$i" -lt "$tries" ]; then
      sleep "$delay"
      delay=$((delay * 2))
    fi
  done
  if [ "$errexit_was_on" = "1" ]; then set -e; fi
  return "$status"
}
|
|
698
|
+
apt_update_install_with_retries() {
  # Refresh the package index, then install the packages passed as arguments.
  # The update's status is not checked explicitly here; the function's exit
  # status is that of the install step.
  apt_update_with_retries
  apt_install_with_retries "$@"
}
|
|
702
|
+
# Directory holding the pristine copy of /etc/apt/sources.list.
backup_dir=/etc/apt/sources.list.backup_skypilot
backup_source() {
  # Save /etc/apt/sources.list into $backup_dir once; an existing backup is
  # never overwritten. A failed copy is ignored on purpose (best effort).
  $(prefix_cmd) mkdir -p "$backup_dir"
  if [ ! -f /etc/apt/sources.list ] || [ -f "$backup_dir/sources.list" ]; then
    return 0
  fi
  $(prefix_cmd) cp -a /etc/apt/sources.list "$backup_dir/sources.list" || true
}
|
|
709
|
+
restore_source() {
  # Copy the saved sources.list back over /etc/apt/sources.list.
  # Does nothing when no backup exists; a failed copy is ignored (best effort).
  [ -f "$backup_dir/sources.list" ] || return 0
  $(prefix_cmd) cp -a "$backup_dir/sources.list" /etc/apt/sources.list || true
}
|
|
714
|
+
update_apt_sources() {
  # Rewrite every "http(s)://<name>.ubuntu.com/ubuntu" URL in an apt sources
  # file so that it points at http://<host>/ubuntu instead (edited in place).
  #   $1: mirror host name to switch to
  #   $2: path of the apt sources file to edit
  local host=$1
  local apt_file=$2
  # Quote the path so it is passed to sed as a single argument.
  $(prefix_cmd) sed -i -E "s|https?://[a-zA-Z0-9.-]+\.ubuntu\.com/ubuntu|http://$host/ubuntu|g" "$apt_file"
}
|
|
719
|
+
# Helper: install packages across mirrors with retries
|
|
720
|
+
apt_install_with_mirrors() {
  # Usage: apt_install_with_mirrors <required> <package>...
  #   required: "1" -> return 1 when every attempt fails;
  #             anything else -> log the failure and return 0.
  # Tries the default apt sources first. On Ubuntu (or Ubuntu-like) systems it
  # then points /etc/apt/sources.list at each fallback mirror in turn and
  # restores the backed-up sources after every failed mirror.
  # Progress is appended to /tmp/apt-update.log.
  # Note: errexit is enabled on return whenever packages were given.
  local required=$1; shift
  local packages="$*"
  if [ -z "$packages" ]; then
    return 0
  fi
  local log=/tmp/apt-update.log
  # Keep the loop variable out of the global scope.
  local host
  set +e
  # Install packages with default sources first
  echo "$(date +%Y-%m-%d\ %H:%M:%S) Installing packages: $packages" >> "$log"
  restore_source
  apt_update_install_with_retries $packages >> "$log" 2>&1 && { set -e; return 0; }
  # The retry helpers may change the errexit flag; pin it off again so the
  # fallback logic below always runs to completion.
  set +e
  echo "Install failed with default sources: $packages" >> "$log"
  # Detect distro (ubuntu/debian)
  local APT_OS="unknown"
  if [ -f /etc/os-release ]; then
    local detected
    # Source os-release in a subshell so its variables (NAME, VERSION, ID, ...)
    # do not overwrite variables of the provisioning script.
    detected=$(
      . /etc/os-release
      case "$ID" in
        debian|ubuntu) echo "$ID" ;;
        *)
          if [ -n "$ID_LIKE" ]; then
            case " $ID $ID_LIKE " in
              *ubuntu*) echo "ubuntu" ;;
              *debian*) echo "debian" ;;
            esac
          fi
          ;;
      esac
    )
    if [ -n "$detected" ]; then
      APT_OS=$detected
    fi
  fi
  # Build mirror candidates
  # deb.debian.org is a CDN endpoint, if one backend goes down,
  # the CDN automatically fails over to another mirror,
  # so we only retry for ubuntu here.
  if [ "$APT_OS" = "ubuntu" ]; then
    # Backup current sources once
    backup_source
    # Selected from https://launchpad.net/ubuntu/+archivemirrors
    # and results from apt-select
    local MIRROR_CANDIDATES="mirrors.wikimedia.org mirror.umd.edu"
    for host in $MIRROR_CANDIDATES; do
      echo "Trying APT mirror ($APT_OS): $host" >> "$log"
      if [ -f /etc/apt/sources.list ]; then
        update_apt_sources "$host" /etc/apt/sources.list
      else
        echo "Error: /etc/apt/sources.list not found" >> "$log"
        break
      fi
      apt_update_install_with_retries $packages >> "$log" 2>&1 && { set -e; return 0; }
      set +e
      echo "Install failed with mirror ($APT_OS): $host" >> "$log"
      # Restore to default sources
      restore_source
    done
  fi
  set -e
  if [ "$required" = "1" ]; then
    echo "Error: required package install failed across all mirrors: $packages" >> "$log"
    return 1
  else
    echo "Optional package install failed across all mirrors: $packages; skipping." >> "$log"
    return 0
  fi
}
|
|
400
781
|
# Install both fuse2 and fuse3 for compatibility for all possible fuse adapters in advance,
|
|
401
782
|
# so that both fusermount and fusermount3 can be masked before enabling SSH access.
|
|
402
783
|
PACKAGES="rsync curl wget netcat gcc patch pciutils fuse fuse3 openssh-server";
|
|
403
784
|
|
|
404
785
|
# Separate packages into two groups: packages that are installed first
|
|
405
|
-
# so that curl, rsync and wget are available sooner to unblock the following
|
|
786
|
+
# so that curl, rsync, ssh and wget are available sooner to unblock the following
|
|
406
787
|
# conda installation and rsync.
|
|
407
788
|
# Also, we install fuse first to avoid conflicts with fuse3.
|
|
408
789
|
set -e
|
|
@@ -423,7 +804,7 @@ available_node_types:
|
|
|
423
804
|
done;
|
|
424
805
|
if [ ! -z "$INSTALL_FIRST" ]; then
|
|
425
806
|
echo "Installing core packages: $INSTALL_FIRST";
|
|
426
|
-
|
|
807
|
+
apt_install_with_mirrors 1 $INSTALL_FIRST || { echo "Error: core package installation failed." >> /tmp/apt-update.log; exit 1; }
|
|
427
808
|
fi;
|
|
428
809
|
# SSH and other packages are not necessary, so we disable set -e
|
|
429
810
|
set +e
|
|
@@ -447,7 +828,8 @@ available_node_types:
|
|
|
447
828
|
fi
|
|
448
829
|
$(prefix_cmd) cp -p "$FUSERMOUNT_PATH" "${FUSERMOUNT_PATH}-original"
|
|
449
830
|
$(prefix_cmd) ln -sf {{k8s_fusermount_shared_dir}}/fusermount-shim "$FUSERMOUNT_PATH"
|
|
450
|
-
|
|
831
|
+
# "|| true" because fusermount3 is not always available
|
|
832
|
+
FUSERMOUNT3_PATH=$(which fusermount3) || true
|
|
451
833
|
if [ -z "$FUSERMOUNT3_PATH" ]; then
|
|
452
834
|
FUSERMOUNT3_PATH="${FUSERMOUNT_PATH}3"
|
|
453
835
|
fi
|
|
@@ -489,16 +871,23 @@ available_node_types:
|
|
|
489
871
|
$(prefix_cmd) mkdir -p ~/.ssh;
|
|
490
872
|
$(prefix_cmd) chown -R $(whoami) ~/.ssh;
|
|
491
873
|
$(prefix_cmd) chmod 700 ~/.ssh;
|
|
492
|
-
$(prefix_cmd) cat
|
|
874
|
+
$(prefix_cmd) cat > ~/.ssh/authorized_keys <<'SKYPILOT_SSH_KEY_EOF'
|
|
875
|
+
skypilot:ssh_public_key_content
|
|
876
|
+
SKYPILOT_SSH_KEY_EOF
|
|
493
877
|
$(prefix_cmd) chmod 644 ~/.ssh/authorized_keys;
|
|
494
878
|
$(prefix_cmd) service ssh restart;
|
|
495
879
|
$(prefix_cmd) sed -i "s/mesg n/tty -s \&\& mesg n/" ~/.profile;
|
|
496
880
|
|
|
497
|
-
|
|
498
|
-
|
|
881
|
+
touch /tmp/apt_ssh_setup_complete
|
|
882
|
+
echo "=== SSH setup completed ==="
|
|
883
|
+
) > /tmp/${STEPS[0]}.log 2>&1
|
|
884
|
+
if [ "$?" -ne "0" ]; then
|
|
885
|
+
{
|
|
886
|
+
echo "Error: ${STEPS[0]} failed. Continuing anyway..." > /tmp/${STEPS[0]}.failed 2>&1
|
|
499
887
|
cat /tmp/${STEPS[0]}.log
|
|
500
888
|
exit 1
|
|
501
|
-
|
|
889
|
+
}
|
|
890
|
+
fi
|
|
502
891
|
) &
|
|
503
892
|
|
|
504
893
|
# STEP 2: Install conda, ray and skypilot (for dependencies); start
|
|
@@ -516,7 +905,21 @@ available_node_types:
|
|
|
516
905
|
{{ conda_installation_commands }}
|
|
517
906
|
{{ ray_installation_commands }}
|
|
518
907
|
|
|
519
|
-
|
|
908
|
+
# set UV_SYSTEM_PYTHON to false in case the user provided docker image set it to true.
|
|
909
|
+
# unset PYTHONPATH in case the user provided docker image set it.
|
|
910
|
+
VIRTUAL_ENV=~/skypilot-runtime UV_SYSTEM_PYTHON=false env -u PYTHONPATH ~/.local/bin/uv pip install skypilot[kubernetes,remote]
|
|
911
|
+
# Wait for `patch` package to be installed before applying ray patches
|
|
912
|
+
until dpkg -l | grep -q "^ii patch "; do
|
|
913
|
+
sleep 0.1
|
|
914
|
+
echo "Waiting for patch package to be installed..."
|
|
915
|
+
done
|
|
916
|
+
# Apply Ray patches for progress bar fix
|
|
917
|
+
# set UV_SYSTEM_PYTHON to false in case the user provided docker image set it to true.
|
|
918
|
+
# unset PYTHONPATH in case the user provided docker image set it.
|
|
919
|
+
# ~/.sky/python_path is seeded by conda_installation_commands
|
|
920
|
+
VIRTUAL_ENV=~/skypilot-runtime UV_SYSTEM_PYTHON=false env -u PYTHONPATH ~/.local/bin/uv pip list | grep "ray " | grep -wF "2.9.3" > /dev/null 2>&1 && {
# Only reached when the installed ray is exactly version 2.9.3: the version is
# matched as a literal whole word (-wF) and both stdout and stderr of the
# check are discarded. A failing patch aborts this step.
env -u PYTHONPATH $(cat ~/.sky/python_path) -c "from sky.skylet.ray_patches import patch; patch()" || exit 1;
}
|
|
520
923
|
touch /tmp/ray_skypilot_installation_complete
|
|
521
924
|
echo "=== Ray and skypilot installation completed ==="
|
|
522
925
|
|
|
@@ -544,11 +947,14 @@ available_node_types:
|
|
|
544
947
|
set +e
|
|
545
948
|
{{ ray_worker_start_command }}
|
|
546
949
|
fi
|
|
547
|
-
) > /tmp/${STEPS[1]}.log 2>&1
|
|
548
|
-
|
|
950
|
+
) > /tmp/${STEPS[1]}.log 2>&1
|
|
951
|
+
if [ "$?" -ne "0" ]; then
|
|
952
|
+
{
|
|
953
|
+
echo "Error: ${STEPS[1]} failed. Continuing anyway..." > /tmp/${STEPS[1]}.failed 2>&1
|
|
549
954
|
cat /tmp/${STEPS[1]}.log
|
|
550
955
|
exit 1
|
|
551
|
-
|
|
956
|
+
}
|
|
957
|
+
fi
|
|
552
958
|
) &
|
|
553
959
|
|
|
554
960
|
|
|
@@ -566,11 +972,14 @@ available_node_types:
|
|
|
566
972
|
fi;
|
|
567
973
|
fi;
|
|
568
974
|
export -p > ~/container_env_var.sh && $(prefix_cmd) mv ~/container_env_var.sh /etc/profile.d/container_env_var.sh
|
|
569
|
-
) > /tmp/${STEPS[2]}.log 2>&1
|
|
570
|
-
|
|
975
|
+
) > /tmp/${STEPS[2]}.log 2>&1
|
|
976
|
+
if [ "$?" -ne "0" ]; then
|
|
977
|
+
{
|
|
978
|
+
echo "Error: ${STEPS[2]} failed. Continuing anyway..." > /tmp/${STEPS[2]}.failed 2>&1
|
|
571
979
|
cat /tmp/${STEPS[2]}.log
|
|
572
980
|
exit 1
|
|
573
|
-
|
|
981
|
+
}
|
|
982
|
+
fi
|
|
574
983
|
) &
|
|
575
984
|
|
|
576
985
|
function mylsof { p=$(for pid in /proc/{0..9}*; do i=$(basename "$pid"); for file in "$pid"/fd/*; do link=$(readlink -e "$file"); if [ "$link" = "$1" ]; then echo "$i"; fi; done; done); echo "$p"; };
|
|
@@ -623,23 +1032,72 @@ available_node_types:
|
|
|
623
1032
|
{% if high_availability %}
|
|
624
1033
|
mkdir -p {{k8s_high_availability_deployment_run_script_dir}}
|
|
625
1034
|
if [ -f {{k8s_high_availability_deployment_volume_mount_path}}/k8s_container_ready ]; then
|
|
1035
|
+
SKYPILOT_HA_RECOVERY_LOG="{{ha_recovery_log_path}}"
|
|
1036
|
+
echo "Starting HA recovery at $(date)" >> $SKYPILOT_HA_RECOVERY_LOG
|
|
1037
|
+
start_time=$SECONDS
|
|
1038
|
+
retry_count=0
|
|
1039
|
+
|
|
1040
|
+
# Wait for Ray to be ready, as the following commands depend on Ray.
|
|
1041
|
+
GET_RAY_STATUS_CMD=$({{sky_python_cmd}} -c 'from sky.provision import instance_setup; print(instance_setup.RAY_STATUS_WITH_SKY_RAY_PORT_COMMAND)')
|
|
1042
|
+
while true; do
|
|
1043
|
+
retry_count=$((retry_count + 1))
|
|
1044
|
+
current_duration=$(( SECONDS - start_time ))
|
|
1045
|
+
echo "Attempt $retry_count to get Ray status after $current_duration seconds..." >> $SKYPILOT_HA_RECOVERY_LOG
|
|
1046
|
+
|
|
1047
|
+
bash --login -c "$GET_RAY_STATUS_CMD"
|
|
1048
|
+
if [ $? -eq 0 ]; then
|
|
1049
|
+
wait_duration=$(( SECONDS - start_time ))
|
|
1050
|
+
echo "Ray ready after waiting $wait_duration seconds (took $retry_count attempts)" >> $SKYPILOT_HA_RECOVERY_LOG
|
|
1051
|
+
break
|
|
1052
|
+
fi
|
|
1053
|
+
echo "Waiting for Ray to be ready..." >> $SKYPILOT_HA_RECOVERY_LOG
|
|
1054
|
+
sleep 2
|
|
1055
|
+
done
|
|
1056
|
+
|
|
626
1057
|
# ! Keep this aligned with `CloudVmRayBackend._setup()`
|
|
627
|
-
# Suppose all `task.setup` are the same for
|
|
1058
|
+
# Suppose all `task.setup` are the same for sky serve / managed jobs controller task.
|
|
628
1059
|
# So be careful for compatibility issue once you change it.
|
|
629
1060
|
chmod +x {{k8s_high_availability_deployment_setup_script_path}}
|
|
630
1061
|
/bin/bash --login -c "true && export OMP_NUM_THREADS=1 PYTHONWARNINGS='ignore' && {{k8s_high_availability_deployment_setup_script_path}} > /tmp/controller_recovery_setup_commands.log 2>&1"
|
|
631
|
-
echo "=== Controller setup commands completed for recovery ==="
|
|
1062
|
+
echo "=== Controller setup commands completed for recovery at $(date) ===" >> $SKYPILOT_HA_RECOVERY_LOG
|
|
632
1063
|
|
|
1064
|
+
touch {{k8s_high_availability_restarting_signal_file}}
|
|
1065
|
+
# Get all in-progress jobs from managed jobs controller. We skip any jobs that are already done.
|
|
1066
|
+
# Also, skip the jobs that are waiting to be scheduled, as those do not have a controller process running.
|
|
1067
|
+
# For SkyServe, this will be None and every service will be recovered. This is because SkyServe
|
|
1068
|
+
# will delete the service from the database after it is terminated so everything in the database is running.
|
|
1069
|
+
ALL_IN_PROGRESS_JOBS=$({{sky_python_cmd}} -c "from sky.jobs import state; jobs, _ = state.get_managed_jobs_with_filters(fields=['job_id', 'schedule_state']); print(' '.join({str(job['job_id']) for job in jobs if job['schedule_state'] not in [state.ManagedJobScheduleState.DONE, state.ManagedJobScheduleState.WAITING]}) if jobs else None)")
|
|
1070
|
+
if [ "$ALL_IN_PROGRESS_JOBS" != "None" ]; then
|
|
1071
|
+
read -ra ALL_IN_PROGRESS_JOBS_SEQ <<< "$ALL_IN_PROGRESS_JOBS"
|
|
1072
|
+
fi
|
|
633
1073
|
for file in {{k8s_high_availability_deployment_run_script_dir}}/*; do
|
|
1074
|
+
# This is the cluster job id on managed jobs controller, but it is guaranteed to be the same as the managed job id,
|
|
1075
|
+
# so we directly use it here. See `CloudVmRayBackend._exec_code_on_head::_dump_code_to_file` for more details.
|
|
1076
|
+
JOB_ID=$(basename $file | sed 's/sky_job_//')
|
|
1077
|
+
# If the list of in-progress jobs is not None (meaning this is a managed job HA controller) and job is not in-progress, skip.
|
|
1078
|
+
if [ "$ALL_IN_PROGRESS_JOBS" != "None" ]; then
|
|
1079
|
+
if [[ ! " ${ALL_IN_PROGRESS_JOBS_SEQ[@]} " =~ " ${JOB_ID} " ]]; then
|
|
1080
|
+
continue
|
|
1081
|
+
fi
|
|
1082
|
+
fi
|
|
634
1083
|
# ! Keep this aligned with `CloudVmRayBackend._execute()`
|
|
635
1084
|
chmod +x $file
|
|
1085
|
+
# TODO(tian): This logic may run a lot of things if the jobs controller previously had many jobs.
|
|
1086
|
+
# We should do more tests and make sure it will scale well.
|
|
636
1087
|
/bin/bash --login -c "true && export OMP_NUM_THREADS=1 PYTHONWARNINGS='ignore' && $file > /tmp/task_run_$(basename $file).log 2>&1"
|
|
637
|
-
echo "=== Controller task run for service (file: $file) completed for recovery ==="
|
|
1088
|
+
echo "=== Controller task run for service / job (file: $file) completed for recovery at $(date) ===" >> $SKYPILOT_HA_RECOVERY_LOG
|
|
638
1089
|
done
|
|
1090
|
+
rm {{k8s_high_availability_restarting_signal_file}}
|
|
1091
|
+
|
|
1092
|
+
duration=$(( SECONDS - start_time ))
|
|
1093
|
+
echo "HA recovery completed at $(date)" >> $SKYPILOT_HA_RECOVERY_LOG
|
|
1094
|
+
echo "Total recovery time: $duration seconds" >> $SKYPILOT_HA_RECOVERY_LOG
|
|
639
1095
|
fi
|
|
640
1096
|
|
|
641
1097
|
touch {{k8s_high_availability_deployment_volume_mount_path}}/k8s_container_ready
|
|
642
1098
|
{% endif %}
|
|
1099
|
+
# Set +x to stop printing the commands and their arguments as they are executed.
|
|
1100
|
+
set +x
|
|
643
1101
|
|
|
644
1102
|
trap : TERM INT; log_tail || sleep infinity & wait
|
|
645
1103
|
|
|
@@ -653,14 +1111,27 @@ available_node_types:
|
|
|
653
1111
|
# object store. If you do not provide this, Ray will fall back to
|
|
654
1112
|
# /tmp, which causes slowdowns if it is not a shared memory volume.
|
|
655
1113
|
volumeMounts:
|
|
656
|
-
- name: secret-volume
|
|
657
|
-
readOnly: true
|
|
658
|
-
mountPath: "/etc/secret-volume"
|
|
659
|
-
# This volume allocates shared memory for Ray to use for its plasma
|
|
660
|
-
# object store. If you do not provide this, Ray will fall back to
|
|
661
|
-
# /tmp which cause slowdowns if is not a shared memory volume.
|
|
662
1114
|
- mountPath: /dev/shm
|
|
663
1115
|
name: dshm
|
|
1116
|
+
{% if k8s_enable_gpudirect_tcpx %}
|
|
1117
|
+
- name: tcpx-socket
|
|
1118
|
+
mountPath: /tmp
|
|
1119
|
+
- name: libraries
|
|
1120
|
+
mountPath: /usr/local/nvidia/lib64
|
|
1121
|
+
readOnly: true
|
|
1122
|
+
{% endif %}
|
|
1123
|
+
{% if k8s_enable_gpudirect_tcpxo %}
|
|
1124
|
+
- name: libraries
|
|
1125
|
+
mountPath: /usr/local/nvidia
|
|
1126
|
+
- name: aperture-devices
|
|
1127
|
+
mountPath: /dev/aperture_devices
|
|
1128
|
+
{% endif %}
|
|
1129
|
+
{% if k8s_enable_gpudirect_rdma %}
|
|
1130
|
+
- name: library-dir-host
|
|
1131
|
+
mountPath: /usr/local/nvidia
|
|
1132
|
+
- name: gib
|
|
1133
|
+
mountPath: /usr/local/gib
|
|
1134
|
+
{% endif %}
|
|
664
1135
|
{% if high_availability %}
|
|
665
1136
|
- name: {{k8s_high_availability_deployment_volume_mount_name}}
|
|
666
1137
|
mountPath: {{k8s_high_availability_deployment_volume_mount_path}}
|
|
@@ -669,6 +1140,10 @@ available_node_types:
|
|
|
669
1140
|
- name: fusermount-shared-dir
|
|
670
1141
|
mountPath: {{k8s_fusermount_shared_dir}}
|
|
671
1142
|
{% endif %}
|
|
1143
|
+
{% for volume_mount in volume_mounts %}
|
|
1144
|
+
- name: {{volume_mount.name}}
|
|
1145
|
+
mountPath: {{volume_mount.path}}
|
|
1146
|
+
{% endfor %}
|
|
672
1147
|
resources:
|
|
673
1148
|
requests:
|
|
674
1149
|
cpu: {{cpus}}
|
|
@@ -681,13 +1156,87 @@ available_node_types:
|
|
|
681
1156
|
# https://cloud.google.com/kubernetes-engine/docs/concepts/tpus#how_tpus_work
|
|
682
1157
|
{{k8s_resource_key}}: {{accelerator_count}}
|
|
683
1158
|
{% endif %}
|
|
1159
|
+
{% if k8s_network_type == 'coreweave' %}
|
|
1160
|
+
rdma/ib: 1
|
|
1161
|
+
{% endif %}
|
|
684
1162
|
{% if k8s_resource_key is not none %}
|
|
685
1163
|
limits:
|
|
686
1164
|
# Limits need to be defined for GPU/TPU requests
|
|
687
1165
|
{% if k8s_resource_key is not none %}
|
|
688
1166
|
{{k8s_resource_key}}: {{accelerator_count}}
|
|
689
1167
|
{% endif %}
|
|
1168
|
+
{% if k8s_network_type == 'coreweave' %}
|
|
1169
|
+
rdma/ib: 1
|
|
1170
|
+
{% endif %}
|
|
690
1171
|
{% endif %}
|
|
1172
|
+
{% if k8s_ipc_lock_capability %}
|
|
1173
|
+
securityContext:
|
|
1174
|
+
capabilities:
|
|
1175
|
+
add:
|
|
1176
|
+
- IPC_LOCK
|
|
1177
|
+
{% endif %}
|
|
1178
|
+
{% if k8s_enable_gpudirect_tcpx %}
|
|
1179
|
+
# GPUDirect TCPX daemon sidecar container
|
|
1180
|
+
- name: tcpx-daemon
|
|
1181
|
+
image: us-docker.pkg.dev/gce-ai-infra/gpudirect-tcpx/tcpgpudmarxd-dev:v2.0.11
|
|
1182
|
+
imagePullPolicy: Always
|
|
1183
|
+
command:
|
|
1184
|
+
- /tcpgpudmarxd/build/app/tcpgpudmarxd
|
|
1185
|
+
- --gpu_nic_preset
|
|
1186
|
+
- a3vm
|
|
1187
|
+
- --gpu_shmem_type
|
|
1188
|
+
- fd
|
|
1189
|
+
- --uds_path
|
|
1190
|
+
- /run/tcpx
|
|
1191
|
+
- --setup_param
|
|
1192
|
+
- --verbose
|
|
1193
|
+
- "128"
|
|
1194
|
+
- "2"
|
|
1195
|
+
- "0"
|
|
1196
|
+
securityContext:
|
|
1197
|
+
capabilities:
|
|
1198
|
+
add:
|
|
1199
|
+
- NET_ADMIN
|
|
1200
|
+
volumeMounts:
|
|
1201
|
+
- name: libraries
|
|
1202
|
+
mountPath: /usr/local/nvidia/lib64
|
|
1203
|
+
readOnly: true
|
|
1204
|
+
- name: tcpx-socket
|
|
1205
|
+
mountPath: /run/tcpx
|
|
1206
|
+
- name: sys
|
|
1207
|
+
mountPath: /hostsysfs
|
|
1208
|
+
- name: proc-sys
|
|
1209
|
+
mountPath: /hostprocsysfs
|
|
1210
|
+
env:
|
|
1211
|
+
- name: LD_LIBRARY_PATH
|
|
1212
|
+
value: /usr/local/nvidia/lib64
|
|
1213
|
+
{% endif %}
|
|
1214
|
+
{% if k8s_enable_gpudirect_tcpxo %}
|
|
1215
|
+
- name: tcpxo-daemon
|
|
1216
|
+
image: us-docker.pkg.dev/gce-ai-infra/gpudirect-tcpxo/tcpgpudmarxd-dev:v1.0.17
|
|
1217
|
+
imagePullPolicy: Always
|
|
1218
|
+
command: ["/bin/sh", "-c"]
|
|
1219
|
+
args:
|
|
1220
|
+
- |
|
|
1221
|
+
set -ex
|
|
1222
|
+
chmod 755 /fts/entrypoint_rxdm_container.sh
|
|
1223
|
+
/fts/entrypoint_rxdm_container.sh --num_hops=2 --num_nics=8 --uid= --alsologtostderr
|
|
1224
|
+
securityContext:
|
|
1225
|
+
capabilities:
|
|
1226
|
+
add:
|
|
1227
|
+
- NET_ADMIN
|
|
1228
|
+
- NET_BIND_SERVICE
|
|
1229
|
+
volumeMounts:
|
|
1230
|
+
- name: libraries
|
|
1231
|
+
mountPath: /usr/local/nvidia
|
|
1232
|
+
- name: sys
|
|
1233
|
+
mountPath: /hostsysfs
|
|
1234
|
+
- name: proc-sys
|
|
1235
|
+
mountPath: /hostprocsysfs
|
|
1236
|
+
env:
|
|
1237
|
+
- name: LD_LIBRARY_PATH
|
|
1238
|
+
value: /usr/local/nvidia/lib64
|
|
1239
|
+
{% endif %}
|
|
691
1240
|
|
|
692
1241
|
{% if high_availability %}
|
|
693
1242
|
pvc_spec:
|
|
@@ -724,7 +1273,7 @@ available_node_types:
|
|
|
724
1273
|
spec:
|
|
725
1274
|
securityContext:
|
|
726
1275
|
fsGroup: 1000
|
|
727
|
-
# To prevent the home dir provided by the docker image from being
|
|
1276
|
+
# To prevent the home dir provided by the docker image from being overridden by pvc mounting,
|
|
728
1277
|
# we use initContainers to copy it first to /mnt/home, which will later be mounted to home dir.
|
|
729
1278
|
initContainers:
|
|
730
1279
|
- name: init-copy-home
|
|
@@ -791,13 +1340,20 @@ setup_commands:
|
|
|
791
1340
|
{%- endfor %}
|
|
792
1341
|
STEPS=("apt-ssh-setup" "runtime-setup" "env-setup")
|
|
793
1342
|
start_epoch=$(date +%s);
|
|
794
|
-
|
|
795
|
-
|
|
796
|
-
|
|
797
|
-
|
|
798
|
-
|
|
799
|
-
|
|
1343
|
+
|
|
1344
|
+
# Wait for SSH setup to complete before proceeding
|
|
1345
|
+
if [ -f /tmp/apt_ssh_setup_started ]; then
|
|
1346
|
+
echo "=== Logs for asynchronous SSH setup ===";
|
|
1347
|
+
([ -f /tmp/apt_ssh_setup_complete ]|| [ -f /tmp/${STEPS[0]}.failed ]) && cat /tmp/${STEPS[0]}.log ||
|
|
1348
|
+
{ tail -f -n +1 /tmp/${STEPS[0]}.log & TAIL_PID=$!; echo "Tail PID: $TAIL_PID"; sleep 0.5; until [ -f /tmp/apt_ssh_setup_complete ] || [ -f /tmp/${STEPS[0]}.failed ]; do sleep 0.5; done; kill $TAIL_PID || true; };
|
|
1349
|
+
[ -f /tmp/${STEPS[0]}.failed ] && { echo "Error: ${STEPS[0]} failed. Exiting."; exit 1; } || true;
|
|
800
1350
|
fi
|
|
1351
|
+
|
|
1352
|
+
echo "=== Logs for asynchronous ray and skypilot installation ===";
|
|
1353
|
+
([ -f /tmp/ray_skypilot_installation_complete ]|| [ -f /tmp/${STEPS[1]}.failed ]) && cat /tmp/${STEPS[1]}.log ||
|
|
1354
|
+
{ tail -f -n +1 /tmp/${STEPS[1]}.log & TAIL_PID=$!; echo "Tail PID: $TAIL_PID"; sleep 0.5; until [ -f /tmp/ray_skypilot_installation_complete ] || [ -f /tmp/${STEPS[1]}.failed ]; do sleep 0.5; done; kill $TAIL_PID || true; };
|
|
1355
|
+
[ -f /tmp/${STEPS[1]}.failed ] && { echo "Error: ${STEPS[1]} failed. Exiting."; exit 1; } || true;
|
|
1356
|
+
|
|
801
1357
|
end_epoch=$(date +%s);
|
|
802
1358
|
echo "=== Ray and skypilot dependencies installation completed in $(($end_epoch - $start_epoch)) secs ===";
|
|
803
1359
|
start_epoch=$(date +%s);
|