skypilot-nightly 1.0.0.dev20250502__py3-none-any.whl → 1.0.0.dev20251203__py3-none-any.whl
This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
- sky/__init__.py +22 -6
- sky/adaptors/aws.py +81 -16
- sky/adaptors/common.py +25 -2
- sky/adaptors/coreweave.py +278 -0
- sky/adaptors/do.py +8 -2
- sky/adaptors/gcp.py +11 -0
- sky/adaptors/hyperbolic.py +8 -0
- sky/adaptors/ibm.py +5 -2
- sky/adaptors/kubernetes.py +149 -18
- sky/adaptors/nebius.py +173 -30
- sky/adaptors/primeintellect.py +1 -0
- sky/adaptors/runpod.py +68 -0
- sky/adaptors/seeweb.py +183 -0
- sky/adaptors/shadeform.py +89 -0
- sky/admin_policy.py +187 -4
- sky/authentication.py +179 -225
- sky/backends/__init__.py +4 -2
- sky/backends/backend.py +22 -9
- sky/backends/backend_utils.py +1323 -397
- sky/backends/cloud_vm_ray_backend.py +1749 -1029
- sky/backends/docker_utils.py +1 -1
- sky/backends/local_docker_backend.py +11 -6
- sky/backends/task_codegen.py +633 -0
- sky/backends/wheel_utils.py +55 -9
- sky/{clouds/service_catalog → catalog}/__init__.py +21 -19
- sky/{clouds/service_catalog → catalog}/aws_catalog.py +27 -8
- sky/{clouds/service_catalog → catalog}/azure_catalog.py +10 -7
- sky/{clouds/service_catalog → catalog}/common.py +90 -49
- sky/{clouds/service_catalog → catalog}/cudo_catalog.py +8 -5
- sky/{clouds/service_catalog → catalog}/data_fetchers/analyze.py +1 -1
- sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_aws.py +116 -80
- sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_cudo.py +38 -38
- sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_gcp.py +70 -16
- sky/catalog/data_fetchers/fetch_hyperbolic.py +136 -0
- sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_lambda_cloud.py +1 -0
- sky/catalog/data_fetchers/fetch_nebius.py +338 -0
- sky/catalog/data_fetchers/fetch_runpod.py +698 -0
- sky/catalog/data_fetchers/fetch_seeweb.py +329 -0
- sky/catalog/data_fetchers/fetch_shadeform.py +142 -0
- sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_vast.py +1 -1
- sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_vsphere.py +1 -1
- sky/{clouds/service_catalog → catalog}/do_catalog.py +5 -2
- sky/{clouds/service_catalog → catalog}/fluidstack_catalog.py +6 -3
- sky/{clouds/service_catalog → catalog}/gcp_catalog.py +41 -15
- sky/catalog/hyperbolic_catalog.py +136 -0
- sky/{clouds/service_catalog → catalog}/ibm_catalog.py +9 -6
- sky/{clouds/service_catalog → catalog}/kubernetes_catalog.py +36 -24
- sky/{clouds/service_catalog → catalog}/lambda_catalog.py +9 -6
- sky/{clouds/service_catalog → catalog}/nebius_catalog.py +9 -7
- sky/{clouds/service_catalog → catalog}/oci_catalog.py +9 -6
- sky/{clouds/service_catalog → catalog}/paperspace_catalog.py +5 -2
- sky/catalog/primeintellect_catalog.py +95 -0
- sky/{clouds/service_catalog → catalog}/runpod_catalog.py +11 -4
- sky/{clouds/service_catalog → catalog}/scp_catalog.py +9 -6
- sky/catalog/seeweb_catalog.py +184 -0
- sky/catalog/shadeform_catalog.py +165 -0
- sky/catalog/ssh_catalog.py +167 -0
- sky/{clouds/service_catalog → catalog}/vast_catalog.py +6 -3
- sky/{clouds/service_catalog → catalog}/vsphere_catalog.py +5 -2
- sky/check.py +533 -185
- sky/cli.py +5 -5975
- sky/client/{cli.py → cli/command.py} +2591 -1956
- sky/client/cli/deprecation_utils.py +99 -0
- sky/client/cli/flags.py +359 -0
- sky/client/cli/table_utils.py +322 -0
- sky/client/cli/utils.py +79 -0
- sky/client/common.py +78 -32
- sky/client/oauth.py +82 -0
- sky/client/sdk.py +1219 -319
- sky/client/sdk_async.py +827 -0
- sky/client/service_account_auth.py +47 -0
- sky/cloud_stores.py +82 -3
- sky/clouds/__init__.py +13 -0
- sky/clouds/aws.py +564 -164
- sky/clouds/azure.py +105 -83
- sky/clouds/cloud.py +140 -40
- sky/clouds/cudo.py +68 -50
- sky/clouds/do.py +66 -48
- sky/clouds/fluidstack.py +63 -44
- sky/clouds/gcp.py +339 -110
- sky/clouds/hyperbolic.py +293 -0
- sky/clouds/ibm.py +70 -49
- sky/clouds/kubernetes.py +570 -162
- sky/clouds/lambda_cloud.py +74 -54
- sky/clouds/nebius.py +210 -81
- sky/clouds/oci.py +88 -66
- sky/clouds/paperspace.py +61 -44
- sky/clouds/primeintellect.py +317 -0
- sky/clouds/runpod.py +164 -74
- sky/clouds/scp.py +89 -86
- sky/clouds/seeweb.py +477 -0
- sky/clouds/shadeform.py +400 -0
- sky/clouds/ssh.py +263 -0
- sky/clouds/utils/aws_utils.py +10 -4
- sky/clouds/utils/gcp_utils.py +87 -11
- sky/clouds/utils/oci_utils.py +38 -14
- sky/clouds/utils/scp_utils.py +231 -167
- sky/clouds/vast.py +99 -77
- sky/clouds/vsphere.py +51 -40
- sky/core.py +375 -173
- sky/dag.py +15 -0
- sky/dashboard/out/404.html +1 -1
- sky/dashboard/out/_next/static/96_E2yl3QAiIJGOYCkSpB/_buildManifest.js +1 -0
- sky/dashboard/out/_next/static/chunks/1141-e6aa9ab418717c59.js +11 -0
- sky/dashboard/out/_next/static/chunks/1272-1ef0bf0237faccdb.js +1 -0
- sky/dashboard/out/_next/static/chunks/1871-7e202677c42f43fe.js +6 -0
- sky/dashboard/out/_next/static/chunks/2260-7703229c33c5ebd5.js +1 -0
- sky/dashboard/out/_next/static/chunks/2350.fab69e61bac57b23.js +1 -0
- sky/dashboard/out/_next/static/chunks/2369.fc20f0c2c8ed9fe7.js +15 -0
- sky/dashboard/out/_next/static/chunks/2755.edd818326d489a1d.js +26 -0
- sky/dashboard/out/_next/static/chunks/3294.20a8540fe697d5ee.js +1 -0
- sky/dashboard/out/_next/static/chunks/3785.7e245f318f9d1121.js +1 -0
- sky/dashboard/out/_next/static/chunks/3800-7b45f9fbb6308557.js +1 -0
- sky/dashboard/out/_next/static/chunks/3850-ff4a9a69d978632b.js +1 -0
- sky/dashboard/out/_next/static/chunks/3937.210053269f121201.js +1 -0
- sky/dashboard/out/_next/static/chunks/4725.172ede95d1b21022.js +1 -0
- sky/dashboard/out/_next/static/chunks/4937.a2baa2df5572a276.js +15 -0
- sky/dashboard/out/_next/static/chunks/5739-d67458fcb1386c92.js +8 -0
- sky/dashboard/out/_next/static/chunks/6130-2be46d70a38f1e82.js +1 -0
- sky/dashboard/out/_next/static/chunks/616-3d59f75e2ccf9321.js +39 -0
- sky/dashboard/out/_next/static/chunks/6212-7bd06f60ba693125.js +13 -0
- sky/dashboard/out/_next/static/chunks/6856-8f27d1c10c98def8.js +1 -0
- sky/dashboard/out/_next/static/chunks/6989-01359c57e018caa4.js +1 -0
- sky/dashboard/out/_next/static/chunks/6990-9146207c4567fdfd.js +1 -0
- sky/dashboard/out/_next/static/chunks/7359-c8d04e06886000b3.js +30 -0
- sky/dashboard/out/_next/static/chunks/7411-b15471acd2cba716.js +41 -0
- sky/dashboard/out/_next/static/chunks/7615-019513abc55b3b47.js +1 -0
- sky/dashboard/out/_next/static/chunks/8640.5b9475a2d18c5416.js +16 -0
- sky/dashboard/out/_next/static/chunks/8969-452f9d5cbdd2dc73.js +1 -0
- sky/dashboard/out/_next/static/chunks/9025.fa408f3242e9028d.js +6 -0
- sky/dashboard/out/_next/static/chunks/9353-cff34f7e773b2e2b.js +1 -0
- sky/dashboard/out/_next/static/chunks/9360.a536cf6b1fa42355.js +31 -0
- sky/dashboard/out/_next/static/chunks/9847.3aaca6bb33455140.js +30 -0
- sky/dashboard/out/_next/static/chunks/fd9d1056-86323a29a8f7e46a.js +1 -0
- sky/dashboard/out/_next/static/chunks/framework-cf60a09ccd051a10.js +33 -0
- sky/dashboard/out/_next/static/chunks/main-app-587214043926b3cc.js +1 -0
- sky/dashboard/out/_next/static/chunks/main-f15ccb73239a3bf1.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/_app-bde01e4a2beec258.js +34 -0
- sky/dashboard/out/_next/static/chunks/pages/_error-c66a4e8afc46f17b.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-792db96d918c98c9.js +16 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-abfcac9c137aa543.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters-ee39056f9851a3ff.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/config-dfb9bf07b13045f4.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/index-444f1804401f04ea.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/infra/[context]-c0b5935149902e6f.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/infra-aed0ea19df7cf961.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-d66997e2bfc837cf.js +16 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs/pools/[pool]-9faf940b253e3e06.js +21 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs-2072b48b617989c9.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/users-f42674164aa73423.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/volumes-b84b948ff357c43e.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/workspace/new-3f88a1c7e86a3f86.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/workspaces/[name]-84a40f8c7c627fe4.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/workspaces-531b2f8c4bf89f82.js +1 -0
- sky/dashboard/out/_next/static/chunks/webpack-64e05f17bf2cf8ce.js +1 -0
- sky/dashboard/out/_next/static/css/0748ce22df867032.css +3 -0
- sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
- sky/dashboard/out/clusters/[cluster].html +1 -1
- sky/dashboard/out/clusters.html +1 -1
- sky/dashboard/out/config.html +1 -0
- sky/dashboard/out/index.html +1 -1
- sky/dashboard/out/infra/[context].html +1 -0
- sky/dashboard/out/infra.html +1 -0
- sky/dashboard/out/jobs/[job].html +1 -1
- sky/dashboard/out/jobs/pools/[pool].html +1 -0
- sky/dashboard/out/jobs.html +1 -1
- sky/dashboard/out/users.html +1 -0
- sky/dashboard/out/volumes.html +1 -0
- sky/dashboard/out/workspace/new.html +1 -0
- sky/dashboard/out/workspaces/[name].html +1 -0
- sky/dashboard/out/workspaces.html +1 -0
- sky/data/data_utils.py +137 -1
- sky/data/mounting_utils.py +269 -84
- sky/data/storage.py +1460 -1807
- sky/data/storage_utils.py +43 -57
- sky/exceptions.py +126 -2
- sky/execution.py +216 -63
- sky/global_user_state.py +2390 -586
- sky/jobs/__init__.py +7 -0
- sky/jobs/client/sdk.py +300 -58
- sky/jobs/client/sdk_async.py +161 -0
- sky/jobs/constants.py +15 -8
- sky/jobs/controller.py +848 -275
- sky/jobs/file_content_utils.py +128 -0
- sky/jobs/log_gc.py +193 -0
- sky/jobs/recovery_strategy.py +402 -152
- sky/jobs/scheduler.py +314 -189
- sky/jobs/server/core.py +836 -255
- sky/jobs/server/server.py +156 -115
- sky/jobs/server/utils.py +136 -0
- sky/jobs/state.py +2109 -706
- sky/jobs/utils.py +1306 -215
- sky/logs/__init__.py +21 -0
- sky/logs/agent.py +108 -0
- sky/logs/aws.py +243 -0
- sky/logs/gcp.py +91 -0
- sky/metrics/__init__.py +0 -0
- sky/metrics/utils.py +453 -0
- sky/models.py +78 -1
- sky/optimizer.py +164 -70
- sky/provision/__init__.py +90 -4
- sky/provision/aws/config.py +147 -26
- sky/provision/aws/instance.py +136 -50
- sky/provision/azure/instance.py +11 -6
- sky/provision/common.py +13 -1
- sky/provision/cudo/cudo_machine_type.py +1 -1
- sky/provision/cudo/cudo_utils.py +14 -8
- sky/provision/cudo/cudo_wrapper.py +72 -71
- sky/provision/cudo/instance.py +10 -6
- sky/provision/do/instance.py +10 -6
- sky/provision/do/utils.py +4 -3
- sky/provision/docker_utils.py +140 -33
- sky/provision/fluidstack/instance.py +13 -8
- sky/provision/gcp/__init__.py +1 -0
- sky/provision/gcp/config.py +301 -19
- sky/provision/gcp/constants.py +218 -0
- sky/provision/gcp/instance.py +36 -8
- sky/provision/gcp/instance_utils.py +18 -4
- sky/provision/gcp/volume_utils.py +247 -0
- sky/provision/hyperbolic/__init__.py +12 -0
- sky/provision/hyperbolic/config.py +10 -0
- sky/provision/hyperbolic/instance.py +437 -0
- sky/provision/hyperbolic/utils.py +373 -0
- sky/provision/instance_setup.py +101 -20
- sky/provision/kubernetes/__init__.py +5 -0
- sky/provision/kubernetes/config.py +9 -52
- sky/provision/kubernetes/constants.py +17 -0
- sky/provision/kubernetes/instance.py +919 -280
- sky/provision/kubernetes/manifests/fusermount-server-daemonset.yaml +1 -2
- sky/provision/kubernetes/network.py +27 -17
- sky/provision/kubernetes/network_utils.py +44 -43
- sky/provision/kubernetes/utils.py +1221 -534
- sky/provision/kubernetes/volume.py +343 -0
- sky/provision/lambda_cloud/instance.py +22 -16
- sky/provision/nebius/constants.py +50 -0
- sky/provision/nebius/instance.py +19 -6
- sky/provision/nebius/utils.py +237 -137
- sky/provision/oci/instance.py +10 -5
- sky/provision/paperspace/instance.py +10 -7
- sky/provision/paperspace/utils.py +1 -1
- sky/provision/primeintellect/__init__.py +10 -0
- sky/provision/primeintellect/config.py +11 -0
- sky/provision/primeintellect/instance.py +454 -0
- sky/provision/primeintellect/utils.py +398 -0
- sky/provision/provisioner.py +117 -36
- sky/provision/runpod/__init__.py +5 -0
- sky/provision/runpod/instance.py +27 -6
- sky/provision/runpod/utils.py +51 -18
- sky/provision/runpod/volume.py +214 -0
- sky/provision/scp/__init__.py +15 -0
- sky/provision/scp/config.py +93 -0
- sky/provision/scp/instance.py +707 -0
- sky/provision/seeweb/__init__.py +11 -0
- sky/provision/seeweb/config.py +13 -0
- sky/provision/seeweb/instance.py +812 -0
- sky/provision/shadeform/__init__.py +11 -0
- sky/provision/shadeform/config.py +12 -0
- sky/provision/shadeform/instance.py +351 -0
- sky/provision/shadeform/shadeform_utils.py +83 -0
- sky/provision/ssh/__init__.py +18 -0
- sky/provision/vast/instance.py +13 -8
- sky/provision/vast/utils.py +10 -7
- sky/provision/volume.py +164 -0
- sky/provision/vsphere/common/ssl_helper.py +1 -1
- sky/provision/vsphere/common/vapiconnect.py +2 -1
- sky/provision/vsphere/common/vim_utils.py +4 -4
- sky/provision/vsphere/instance.py +15 -10
- sky/provision/vsphere/vsphere_utils.py +17 -20
- sky/py.typed +0 -0
- sky/resources.py +845 -119
- sky/schemas/__init__.py +0 -0
- sky/schemas/api/__init__.py +0 -0
- sky/schemas/api/responses.py +227 -0
- sky/schemas/db/README +4 -0
- sky/schemas/db/env.py +90 -0
- sky/schemas/db/global_user_state/001_initial_schema.py +124 -0
- sky/schemas/db/global_user_state/002_add_workspace_to_cluster_history.py +35 -0
- sky/schemas/db/global_user_state/003_fix_initial_revision.py +61 -0
- sky/schemas/db/global_user_state/004_is_managed.py +34 -0
- sky/schemas/db/global_user_state/005_cluster_event.py +32 -0
- sky/schemas/db/global_user_state/006_provision_log.py +41 -0
- sky/schemas/db/global_user_state/007_cluster_event_request_id.py +34 -0
- sky/schemas/db/global_user_state/008_skylet_ssh_tunnel_metadata.py +34 -0
- sky/schemas/db/global_user_state/009_last_activity_and_launched_at.py +89 -0
- sky/schemas/db/global_user_state/010_save_ssh_key.py +66 -0
- sky/schemas/db/global_user_state/011_is_ephemeral.py +34 -0
- sky/schemas/db/kv_cache/001_initial_schema.py +29 -0
- sky/schemas/db/script.py.mako +28 -0
- sky/schemas/db/serve_state/001_initial_schema.py +67 -0
- sky/schemas/db/serve_state/002_yaml_content.py +34 -0
- sky/schemas/db/skypilot_config/001_initial_schema.py +30 -0
- sky/schemas/db/spot_jobs/001_initial_schema.py +97 -0
- sky/schemas/db/spot_jobs/002_cluster_pool.py +42 -0
- sky/schemas/db/spot_jobs/003_pool_hash.py +34 -0
- sky/schemas/db/spot_jobs/004_job_file_contents.py +42 -0
- sky/schemas/db/spot_jobs/005_logs_gc.py +38 -0
- sky/schemas/db/spot_jobs/006_controller_pid_started_at.py +34 -0
- sky/schemas/db/spot_jobs/007_config_file_content.py +34 -0
- sky/schemas/generated/__init__.py +0 -0
- sky/schemas/generated/autostopv1_pb2.py +36 -0
- sky/schemas/generated/autostopv1_pb2.pyi +43 -0
- sky/schemas/generated/autostopv1_pb2_grpc.py +146 -0
- sky/schemas/generated/jobsv1_pb2.py +86 -0
- sky/schemas/generated/jobsv1_pb2.pyi +254 -0
- sky/schemas/generated/jobsv1_pb2_grpc.py +542 -0
- sky/schemas/generated/managed_jobsv1_pb2.py +76 -0
- sky/schemas/generated/managed_jobsv1_pb2.pyi +278 -0
- sky/schemas/generated/managed_jobsv1_pb2_grpc.py +278 -0
- sky/schemas/generated/servev1_pb2.py +58 -0
- sky/schemas/generated/servev1_pb2.pyi +115 -0
- sky/schemas/generated/servev1_pb2_grpc.py +322 -0
- sky/serve/autoscalers.py +357 -5
- sky/serve/client/impl.py +310 -0
- sky/serve/client/sdk.py +47 -139
- sky/serve/client/sdk_async.py +130 -0
- sky/serve/constants.py +12 -9
- sky/serve/controller.py +68 -17
- sky/serve/load_balancer.py +106 -60
- sky/serve/load_balancing_policies.py +116 -2
- sky/serve/replica_managers.py +434 -249
- sky/serve/serve_rpc_utils.py +179 -0
- sky/serve/serve_state.py +569 -257
- sky/serve/serve_utils.py +775 -265
- sky/serve/server/core.py +66 -711
- sky/serve/server/impl.py +1093 -0
- sky/serve/server/server.py +21 -18
- sky/serve/service.py +192 -89
- sky/serve/service_spec.py +144 -20
- sky/serve/spot_placer.py +3 -0
- sky/server/auth/__init__.py +0 -0
- sky/server/auth/authn.py +50 -0
- sky/server/auth/loopback.py +38 -0
- sky/server/auth/oauth2_proxy.py +202 -0
- sky/server/common.py +478 -182
- sky/server/config.py +85 -23
- sky/server/constants.py +44 -6
- sky/server/daemons.py +295 -0
- sky/server/html/token_page.html +185 -0
- sky/server/metrics.py +160 -0
- sky/server/middleware_utils.py +166 -0
- sky/server/requests/executor.py +558 -138
- sky/server/requests/payloads.py +364 -24
- sky/server/requests/preconditions.py +21 -17
- sky/server/requests/process.py +112 -29
- sky/server/requests/request_names.py +121 -0
- sky/server/requests/requests.py +822 -226
- sky/server/requests/serializers/decoders.py +82 -31
- sky/server/requests/serializers/encoders.py +140 -22
- sky/server/requests/threads.py +117 -0
- sky/server/rest.py +455 -0
- sky/server/server.py +1309 -285
- sky/server/state.py +20 -0
- sky/server/stream_utils.py +327 -61
- sky/server/uvicorn.py +217 -3
- sky/server/versions.py +270 -0
- sky/setup_files/MANIFEST.in +11 -1
- sky/setup_files/alembic.ini +160 -0
- sky/setup_files/dependencies.py +139 -31
- sky/setup_files/setup.py +44 -42
- sky/sky_logging.py +114 -7
- sky/skylet/attempt_skylet.py +106 -24
- sky/skylet/autostop_lib.py +129 -8
- sky/skylet/configs.py +29 -20
- sky/skylet/constants.py +216 -25
- sky/skylet/events.py +101 -21
- sky/skylet/job_lib.py +345 -164
- sky/skylet/log_lib.py +297 -18
- sky/skylet/log_lib.pyi +44 -1
- sky/skylet/providers/ibm/node_provider.py +12 -8
- sky/skylet/providers/ibm/vpc_provider.py +13 -12
- sky/skylet/ray_patches/__init__.py +17 -3
- sky/skylet/ray_patches/autoscaler.py.diff +18 -0
- sky/skylet/ray_patches/cli.py.diff +19 -0
- sky/skylet/ray_patches/command_runner.py.diff +17 -0
- sky/skylet/ray_patches/log_monitor.py.diff +20 -0
- sky/skylet/ray_patches/resource_demand_scheduler.py.diff +32 -0
- sky/skylet/ray_patches/updater.py.diff +18 -0
- sky/skylet/ray_patches/worker.py.diff +41 -0
- sky/skylet/runtime_utils.py +21 -0
- sky/skylet/services.py +568 -0
- sky/skylet/skylet.py +72 -4
- sky/skylet/subprocess_daemon.py +104 -29
- sky/skypilot_config.py +506 -99
- sky/ssh_node_pools/__init__.py +1 -0
- sky/ssh_node_pools/core.py +135 -0
- sky/ssh_node_pools/server.py +233 -0
- sky/task.py +685 -163
- sky/templates/aws-ray.yml.j2 +11 -3
- sky/templates/azure-ray.yml.j2 +2 -1
- sky/templates/cudo-ray.yml.j2 +1 -0
- sky/templates/do-ray.yml.j2 +2 -1
- sky/templates/fluidstack-ray.yml.j2 +1 -0
- sky/templates/gcp-ray.yml.j2 +62 -1
- sky/templates/hyperbolic-ray.yml.j2 +68 -0
- sky/templates/ibm-ray.yml.j2 +2 -1
- sky/templates/jobs-controller.yaml.j2 +27 -24
- sky/templates/kubernetes-loadbalancer.yml.j2 +2 -0
- sky/templates/kubernetes-ray.yml.j2 +611 -50
- sky/templates/lambda-ray.yml.j2 +2 -1
- sky/templates/nebius-ray.yml.j2 +34 -12
- sky/templates/oci-ray.yml.j2 +1 -0
- sky/templates/paperspace-ray.yml.j2 +2 -1
- sky/templates/primeintellect-ray.yml.j2 +72 -0
- sky/templates/runpod-ray.yml.j2 +10 -1
- sky/templates/scp-ray.yml.j2 +4 -50
- sky/templates/seeweb-ray.yml.j2 +171 -0
- sky/templates/shadeform-ray.yml.j2 +73 -0
- sky/templates/sky-serve-controller.yaml.j2 +22 -2
- sky/templates/vast-ray.yml.j2 +1 -0
- sky/templates/vsphere-ray.yml.j2 +1 -0
- sky/templates/websocket_proxy.py +212 -37
- sky/usage/usage_lib.py +31 -15
- sky/users/__init__.py +0 -0
- sky/users/model.conf +15 -0
- sky/users/permission.py +397 -0
- sky/users/rbac.py +121 -0
- sky/users/server.py +720 -0
- sky/users/token_service.py +218 -0
- sky/utils/accelerator_registry.py +35 -5
- sky/utils/admin_policy_utils.py +84 -38
- sky/utils/annotations.py +38 -5
- sky/utils/asyncio_utils.py +78 -0
- sky/utils/atomic.py +1 -1
- sky/utils/auth_utils.py +153 -0
- sky/utils/benchmark_utils.py +60 -0
- sky/utils/cli_utils/status_utils.py +159 -86
- sky/utils/cluster_utils.py +31 -9
- sky/utils/command_runner.py +354 -68
- sky/utils/command_runner.pyi +93 -3
- sky/utils/common.py +35 -8
- sky/utils/common_utils.py +314 -91
- sky/utils/config_utils.py +74 -5
- sky/utils/context.py +403 -0
- sky/utils/context_utils.py +242 -0
- sky/utils/controller_utils.py +383 -89
- sky/utils/dag_utils.py +31 -12
- sky/utils/db/__init__.py +0 -0
- sky/utils/db/db_utils.py +485 -0
- sky/utils/db/kv_cache.py +149 -0
- sky/utils/db/migration_utils.py +137 -0
- sky/utils/directory_utils.py +12 -0
- sky/utils/env_options.py +13 -0
- sky/utils/git.py +567 -0
- sky/utils/git_clone.sh +460 -0
- sky/utils/infra_utils.py +195 -0
- sky/utils/kubernetes/cleanup-tunnel.sh +62 -0
- sky/utils/kubernetes/config_map_utils.py +133 -0
- sky/utils/kubernetes/create_cluster.sh +15 -29
- sky/utils/kubernetes/delete_cluster.sh +10 -7
- sky/utils/kubernetes/deploy_ssh_node_pools.py +1177 -0
- sky/utils/kubernetes/exec_kubeconfig_converter.py +22 -31
- sky/utils/kubernetes/generate_kind_config.py +6 -66
- sky/utils/kubernetes/generate_kubeconfig.sh +4 -1
- sky/utils/kubernetes/gpu_labeler.py +18 -8
- sky/utils/kubernetes/k8s_gpu_labeler_job.yaml +2 -1
- sky/utils/kubernetes/k8s_gpu_labeler_setup.yaml +16 -16
- sky/utils/kubernetes/kubernetes_deploy_utils.py +284 -114
- sky/utils/kubernetes/rsync_helper.sh +11 -3
- sky/utils/kubernetes/ssh-tunnel.sh +379 -0
- sky/utils/kubernetes/ssh_utils.py +221 -0
- sky/utils/kubernetes_enums.py +8 -15
- sky/utils/lock_events.py +94 -0
- sky/utils/locks.py +416 -0
- sky/utils/log_utils.py +82 -107
- sky/utils/perf_utils.py +22 -0
- sky/utils/resource_checker.py +298 -0
- sky/utils/resources_utils.py +249 -32
- sky/utils/rich_utils.py +217 -39
- sky/utils/schemas.py +955 -160
- sky/utils/serialize_utils.py +16 -0
- sky/utils/status_lib.py +10 -0
- sky/utils/subprocess_utils.py +29 -15
- sky/utils/tempstore.py +70 -0
- sky/utils/thread_utils.py +91 -0
- sky/utils/timeline.py +26 -53
- sky/utils/ux_utils.py +84 -15
- sky/utils/validator.py +11 -1
- sky/utils/volume.py +165 -0
- sky/utils/yaml_utils.py +111 -0
- sky/volumes/__init__.py +13 -0
- sky/volumes/client/__init__.py +0 -0
- sky/volumes/client/sdk.py +150 -0
- sky/volumes/server/__init__.py +0 -0
- sky/volumes/server/core.py +270 -0
- sky/volumes/server/server.py +124 -0
- sky/volumes/volume.py +215 -0
- sky/workspaces/__init__.py +0 -0
- sky/workspaces/core.py +655 -0
- sky/workspaces/server.py +101 -0
- sky/workspaces/utils.py +56 -0
- sky_templates/README.md +3 -0
- sky_templates/__init__.py +3 -0
- sky_templates/ray/__init__.py +0 -0
- sky_templates/ray/start_cluster +183 -0
- sky_templates/ray/stop_cluster +75 -0
- skypilot_nightly-1.0.0.dev20251203.dist-info/METADATA +676 -0
- skypilot_nightly-1.0.0.dev20251203.dist-info/RECORD +611 -0
- {skypilot_nightly-1.0.0.dev20250502.dist-info → skypilot_nightly-1.0.0.dev20251203.dist-info}/WHEEL +1 -1
- skypilot_nightly-1.0.0.dev20251203.dist-info/top_level.txt +2 -0
- sky/benchmark/benchmark_state.py +0 -256
- sky/benchmark/benchmark_utils.py +0 -641
- sky/clouds/service_catalog/constants.py +0 -7
- sky/dashboard/out/_next/static/GWvVBSCS7FmUiVmjaL1a7/_buildManifest.js +0 -1
- sky/dashboard/out/_next/static/chunks/236-2db3ee3fba33dd9e.js +0 -6
- sky/dashboard/out/_next/static/chunks/312-c3c8845990db8ffc.js +0 -15
- sky/dashboard/out/_next/static/chunks/37-0a572fe0dbb89c4d.js +0 -6
- sky/dashboard/out/_next/static/chunks/678-206dddca808e6d16.js +0 -59
- sky/dashboard/out/_next/static/chunks/845-9e60713e0c441abc.js +0 -1
- sky/dashboard/out/_next/static/chunks/979-7bf73a4c7cea0f5c.js +0 -1
- sky/dashboard/out/_next/static/chunks/fd9d1056-2821b0f0cabcd8bd.js +0 -1
- sky/dashboard/out/_next/static/chunks/framework-87d061ee6ed71b28.js +0 -33
- sky/dashboard/out/_next/static/chunks/main-app-241eb28595532291.js +0 -1
- sky/dashboard/out/_next/static/chunks/main-e0e2335212e72357.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/_app-e6b013bc3f77ad60.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/_error-1be831200e60c5c0.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-6ac338bc2239cb45.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-f383db7389368ea7.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/clusters-a93b93e10b8b074e.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/index-f9f039532ca8cbc4.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-1c519e1afc523dc9.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/jobs-a75029b67aab6a2e.js +0 -1
- sky/dashboard/out/_next/static/chunks/webpack-830f59b8404e96b8.js +0 -1
- sky/dashboard/out/_next/static/css/c6933bbb2ce7f4dd.css +0 -3
- sky/jobs/dashboard/dashboard.py +0 -223
- sky/jobs/dashboard/static/favicon.ico +0 -0
- sky/jobs/dashboard/templates/index.html +0 -831
- sky/jobs/server/dashboard_utils.py +0 -69
- sky/skylet/providers/scp/__init__.py +0 -2
- sky/skylet/providers/scp/config.py +0 -149
- sky/skylet/providers/scp/node_provider.py +0 -578
- sky/templates/kubernetes-ssh-jump.yml.j2 +0 -94
- sky/utils/db_utils.py +0 -100
- sky/utils/kubernetes/deploy_remote_cluster.sh +0 -308
- sky/utils/kubernetes/ssh_jump_lifecycle_manager.py +0 -191
- skypilot_nightly-1.0.0.dev20250502.dist-info/METADATA +0 -361
- skypilot_nightly-1.0.0.dev20250502.dist-info/RECORD +0 -396
- skypilot_nightly-1.0.0.dev20250502.dist-info/top_level.txt +0 -1
- /sky/{clouds/service_catalog → catalog}/config.py +0 -0
- /sky/{benchmark → catalog/data_fetchers}/__init__.py +0 -0
- /sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_azure.py +0 -0
- /sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_fluidstack.py +0 -0
- /sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_ibm.py +0 -0
- /sky/{clouds/service_catalog/data_fetchers → client/cli}/__init__.py +0 -0
- /sky/dashboard/out/_next/static/{GWvVBSCS7FmUiVmjaL1a7 → 96_E2yl3QAiIJGOYCkSpB}/_ssgManifest.js +0 -0
- {skypilot_nightly-1.0.0.dev20250502.dist-info → skypilot_nightly-1.0.0.dev20251203.dist-info}/entry_points.txt +0 -0
- {skypilot_nightly-1.0.0.dev20250502.dist-info → skypilot_nightly-1.0.0.dev20251203.dist-info}/licenses/LICENSE +0 -0
sky/templates/lambda-ray.yml.j2
CHANGED
|
@@ -19,7 +19,7 @@ docker:
|
|
|
19
19
|
username: |-
|
|
20
20
|
{{docker_login_config.username}}
|
|
21
21
|
password: |-
|
|
22
|
-
{{docker_login_config.password}}
|
|
22
|
+
{{docker_login_config.password | indent(6) }}
|
|
23
23
|
server: |-
|
|
24
24
|
{{docker_login_config.server}}
|
|
25
25
|
{%- endif %}
|
|
@@ -91,6 +91,7 @@ setup_commands:
|
|
|
91
91
|
rm ~/.local/bin/pip ~/.local/bin/pip3 ~/.local/bin/pip3.8 ~/.local/bin/pip3.10;
|
|
92
92
|
{{ conda_installation_commands }}
|
|
93
93
|
{{ ray_skypilot_installation_commands }}
|
|
94
|
+
{{ copy_skypilot_templates_commands }}
|
|
94
95
|
touch ~/.sudo_as_admin_successful;
|
|
95
96
|
sudo bash -c 'rm -rf /etc/security/limits.d; echo "* soft nofile 1048576" >> /etc/security/limits.conf; echo "* hard nofile 1048576" >> /etc/security/limits.conf';
|
|
96
97
|
sudo grep -e '^DefaultTasksMax' /etc/systemd/system.conf || (sudo bash -c 'echo "DefaultTasksMax=infinity" >> /etc/systemd/system.conf'); sudo systemctl set-property user-$(id -u $(whoami)).slice TasksMax=infinity; sudo systemctl daemon-reload;
|
sky/templates/nebius-ray.yml.j2
CHANGED
|
@@ -9,6 +9,8 @@ provider:
|
|
|
9
9
|
type: external
|
|
10
10
|
module: sky.provision.nebius
|
|
11
11
|
region: "{{region}}"
|
|
12
|
+
use_internal_ips: {{use_internal_ips}}
|
|
13
|
+
use_static_ip_address: {{ use_static_ip_address }}
|
|
12
14
|
|
|
13
15
|
{%- if docker_image is not none %}
|
|
14
16
|
docker:
|
|
@@ -24,7 +26,7 @@ docker:
|
|
|
24
26
|
username: |-
|
|
25
27
|
{{docker_login_config.username}}
|
|
26
28
|
password: |-
|
|
27
|
-
{{docker_login_config.password}}
|
|
29
|
+
{{docker_login_config.password | indent(6) }}
|
|
28
30
|
server: |-
|
|
29
31
|
{{docker_login_config.server}}
|
|
30
32
|
{%- endif %}
|
|
@@ -34,6 +36,9 @@ docker:
|
|
|
34
36
|
auth:
|
|
35
37
|
ssh_user: ubuntu
|
|
36
38
|
ssh_private_key: {{ssh_private_key}}
|
|
39
|
+
{% if ssh_proxy_command is not none %}
|
|
40
|
+
ssh_proxy_command: {{ssh_proxy_command}}
|
|
41
|
+
{% endif %}
|
|
37
42
|
|
|
38
43
|
available_node_types:
|
|
39
44
|
ray_head_default:
|
|
@@ -42,18 +47,21 @@ available_node_types:
|
|
|
42
47
|
InstanceType: {{instance_type}}
|
|
43
48
|
ImageId: {{image_id}}
|
|
44
49
|
DiskSize: {{disk_size}}
|
|
50
|
+
use_spot: {{ use_spot }}
|
|
51
|
+
network_tier: {{network_tier}}
|
|
52
|
+
filesystems:
|
|
53
|
+
{%- for fs in filesystems %}
|
|
54
|
+
- filesystem_id: {{ fs.filesystem_id }}
|
|
55
|
+
filesystem_mount_tag: {{ fs.filesystem_mount_tag }}
|
|
56
|
+
filesystem_attach_mode: {{ fs.filesystem_attach_mode }}
|
|
57
|
+
filesystem_mount_path: {{ fs.filesystem_mount_path }}
|
|
58
|
+
{%- endfor %}
|
|
45
59
|
UserData: |
|
|
46
|
-
{%- if docker_image is not none %}
|
|
47
|
-
runcmd:
|
|
48
|
-
- sudo sed -i 's/^#\?AllowTcpForwarding.*/AllowTcpForwarding yes/' /etc/ssh/sshd_config
|
|
49
|
-
- systemctl restart sshd
|
|
50
|
-
{%- endif %}
|
|
51
|
-
|
|
52
60
|
{# Two available OS images:
|
|
53
|
-
1.
|
|
54
|
-
2.
|
|
55
|
-
To optimize deployment speed, Docker is only installed when using
|
|
56
|
-
{%- if docker_image is not none and image_id
|
|
61
|
+
1. ubuntu24.04-driverless - requires Docker installation
|
|
62
|
+
2. ubuntu24.04-cuda12 - comes with Docker pre-installed
|
|
63
|
+
To optimize deployment speed, Docker is only installed when using ubuntu24.04-driverless #}
|
|
64
|
+
{%- if docker_image is not none and image_id.endswith('-driverless') %}
|
|
57
65
|
apt:
|
|
58
66
|
sources:
|
|
59
67
|
docker.list:
|
|
@@ -101,6 +109,7 @@ file_mounts: {
|
|
|
101
109
|
"{{sky_remote_path}}/{{sky_wheel_hash}}": "{{sky_local_path}}",
|
|
102
110
|
{%- for remote_path, local_path in credentials.items() %}
|
|
103
111
|
"{{remote_path}}": "{{local_path}}",
|
|
112
|
+
"~/.ssh/sky-cluster-key": "{{ssh_private_key}}",
|
|
104
113
|
{%- endfor %}
|
|
105
114
|
}
|
|
106
115
|
|
|
@@ -116,6 +125,7 @@ initialization_commands: []
|
|
|
116
125
|
# Increment the following for catching performance bugs easier:
|
|
117
126
|
# current num items (num SSH connections): 1
|
|
118
127
|
setup_commands:
|
|
128
|
+
# Add ~/.ssh/sky-cluster-key to SSH config to allow nodes within a cluster to connect to each other
|
|
119
129
|
# Disable `unattended-upgrades` to prevent apt-get from hanging. It should be called at the beginning before the process started to avoid being blocked. (This is a temporary fix.)
|
|
120
130
|
# Create ~/.ssh/config file in case the file does not exist in the image.
|
|
121
131
|
# Line 'rm ..': there is another installation of pip.
|
|
@@ -126,6 +136,11 @@ setup_commands:
|
|
|
126
136
|
- {%- for initial_setup_command in initial_setup_commands %}
|
|
127
137
|
{{ initial_setup_command }}
|
|
128
138
|
{%- endfor %}
|
|
139
|
+
{%- for fs in filesystems %}
|
|
140
|
+
sudo mkdir {{ fs.filesystem_mount_path }};
|
|
141
|
+
sudo mount -t virtiofs {{ fs.filesystem_mount_tag }} {{ fs.filesystem_mount_path }};
|
|
142
|
+
sudo chmod a+w {{ fs.filesystem_mount_path }};
|
|
143
|
+
{%- endfor %}
|
|
129
144
|
sudo systemctl stop unattended-upgrades || true;
|
|
130
145
|
sudo systemctl disable unattended-upgrades || true;
|
|
131
146
|
sudo sed -i 's/Unattended-Upgrade "1"/Unattended-Upgrade "0"/g' /etc/apt/apt.conf.d/20auto-upgrades || true;
|
|
@@ -136,8 +151,15 @@ setup_commands:
|
|
|
136
151
|
mkdir -p ~/.ssh; touch ~/.ssh/config;
|
|
137
152
|
{{ conda_installation_commands }}
|
|
138
153
|
{{ ray_skypilot_installation_commands }}
|
|
154
|
+
{{ copy_skypilot_templates_commands }}
|
|
155
|
+
{%- if env_vars is defined %}
|
|
156
|
+
{%- for env_var, env_value in env_vars.items() %}
|
|
157
|
+
echo '{{env_var}}={{env_value}}' | sudo tee -a /etc/environment;
|
|
158
|
+
{%- endfor %}
|
|
159
|
+
{%- endif %}
|
|
160
|
+
IP=$(hostname -I | awk '{print $1}'); echo "$IP $(hostname)" | sudo tee -a /etc/hosts;
|
|
139
161
|
sudo bash -c 'rm -rf /etc/security/limits.d; echo "* soft nofile 1048576" >> /etc/security/limits.conf; echo "* hard nofile 1048576" >> /etc/security/limits.conf';
|
|
140
162
|
sudo grep -e '^DefaultTasksMax' /etc/systemd/system.conf || (sudo bash -c 'echo "DefaultTasksMax=infinity" >> /etc/systemd/system.conf'); sudo systemctl set-property user-$(id -u $(whoami)).slice TasksMax=infinity; sudo systemctl daemon-reload;
|
|
141
|
-
mkdir -p ~/.ssh; (grep -Pzo -q "Host \*\n StrictHostKeyChecking no" ~/.ssh/config) || printf "Host *\n StrictHostKeyChecking no\n" >> ~/.ssh/config;
|
|
163
|
+
mkdir -p ~/.ssh; (grep -Pzo -q "Host \*\n StrictHostKeyChecking no\n IdentityFile ~/.ssh/sky-cluster-key\n IdentityFile ~/.ssh/id_rsa" ~/.ssh/config) || printf "Host *\n StrictHostKeyChecking no\n IdentityFile ~/.ssh/sky-cluster-key\n IdentityFile ~/.ssh/id_rsa\n" >> ~/.ssh/config;
|
|
142
164
|
[ -f /etc/fuse.conf ] && sudo sed -i 's/#user_allow_other/user_allow_other/g' /etc/fuse.conf || (sudo sh -c 'echo "user_allow_other" > /etc/fuse.conf');
|
|
143
165
|
{{ ssh_max_sessions_config }}
|
sky/templates/oci-ray.yml.j2
CHANGED
|
@@ -85,6 +85,7 @@ setup_commands:
|
|
|
85
85
|
mkdir -p ~/.ssh; touch ~/.ssh/config;
|
|
86
86
|
{{ conda_installation_commands }}
|
|
87
87
|
{{ ray_skypilot_installation_commands }}
|
|
88
|
+
{{ copy_skypilot_templates_commands }}
|
|
88
89
|
touch ~/.sudo_as_admin_successful;
|
|
89
90
|
sudo bash -c 'rm -rf /etc/security/limits.d; echo "* soft nofile 1048576" >> /etc/security/limits.conf; echo "* hard nofile 1048576" >> /etc/security/limits.conf';
|
|
90
91
|
sudo grep -e '^DefaultTasksMax' /etc/systemd/system.conf || (sudo bash -c 'echo "DefaultTasksMax=infinity" >> /etc/systemd/system.conf'); sudo systemctl set-property user-$(id -u $(whoami)).slice TasksMax=infinity; sudo systemctl daemon-reload;
|
|
@@ -19,7 +19,7 @@ docker:
|
|
|
19
19
|
username: |-
|
|
20
20
|
{{docker_login_config.username}}
|
|
21
21
|
password: |-
|
|
22
|
-
{{docker_login_config.password}}
|
|
22
|
+
{{docker_login_config.password | indent(6) }}
|
|
23
23
|
server: |-
|
|
24
24
|
{{docker_login_config.server}}
|
|
25
25
|
{%- endif %}
|
|
@@ -87,6 +87,7 @@ setup_commands:
|
|
|
87
87
|
mkdir -p ~/.ssh; touch ~/.ssh/config;
|
|
88
88
|
{{ conda_installation_commands }}
|
|
89
89
|
{{ ray_skypilot_installation_commands }}
|
|
90
|
+
{{ copy_skypilot_templates_commands }}
|
|
90
91
|
sudo bash -c 'rm -rf /etc/security/limits.d; echo "* soft nofile 1048576" >> /etc/security/limits.conf; echo "* hard nofile 1048576" >> /etc/security/limits.conf';
|
|
91
92
|
sudo grep -e '^DefaultTasksMax' /etc/systemd/system.conf || (sudo bash -c 'echo "DefaultTasksMax=infinity" >> /etc/systemd/system.conf'); sudo systemctl set-property user-$(id -u $(whoami)).slice TasksMax=infinity; sudo systemctl daemon-reload;
|
|
92
93
|
mkdir -p ~/.ssh; (grep -Pzo -q "Host \*\n StrictHostKeyChecking no\n IdentityFile ~/.ssh/sky-cluster-key\n IdentityFile ~/.ssh/id_rsa" ~/.ssh/config) || printf "Host *\n StrictHostKeyChecking no\n IdentityFile ~/.ssh/sky-cluster-key\n IdentityFile ~/.ssh/id_rsa\n" >> ~/.ssh/config;
|
|
@@ -0,0 +1,72 @@
|
|
|
1
|
+
cluster_name: {{cluster_name_on_cloud}}
|
|
2
|
+
|
|
3
|
+
# The maximum number of workers nodes to launch in addition to the head node.
|
|
4
|
+
max_workers: {{num_nodes - 1}}
|
|
5
|
+
upscaling_speed: {{num_nodes - 1}}
|
|
6
|
+
idle_timeout_minutes: 60
|
|
7
|
+
|
|
8
|
+
provider:
|
|
9
|
+
type: external
|
|
10
|
+
module: sky.provision.primeintellect
|
|
11
|
+
region: "{{region}}"
|
|
12
|
+
zones: "{{zones}}"
|
|
13
|
+
|
|
14
|
+
auth:
|
|
15
|
+
ssh_user: skypilot:ssh_user
|
|
16
|
+
ssh_private_key: {{ssh_private_key}}
|
|
17
|
+
|
|
18
|
+
available_node_types:
|
|
19
|
+
ray_head_default:
|
|
20
|
+
resources: {}
|
|
21
|
+
node_config:
|
|
22
|
+
InstanceType: {{instance_type}}
|
|
23
|
+
DiskSize: {{disk_size}}
|
|
24
|
+
ImageId: {{image_id}}
|
|
25
|
+
PublicKey: |-
|
|
26
|
+
skypilot:ssh_public_key_content
|
|
27
|
+
|
|
28
|
+
head_node_type: ray_head_default
|
|
29
|
+
|
|
30
|
+
# Format: `REMOTE_PATH : LOCAL_PATH`
|
|
31
|
+
file_mounts: {
|
|
32
|
+
"{{sky_ray_yaml_remote_path}}": "{{sky_ray_yaml_local_path}}",
|
|
33
|
+
"{{sky_remote_path}}/{{sky_wheel_hash}}": "{{sky_local_path}}",
|
|
34
|
+
{%- for remote_path, local_path in credentials.items() %}
|
|
35
|
+
"{{remote_path}}": "{{local_path}}",
|
|
36
|
+
"~/.ssh/sky-cluster-key": "{{ssh_private_key}}",
|
|
37
|
+
{%- endfor %}
|
|
38
|
+
}
|
|
39
|
+
|
|
40
|
+
rsync_exclude: []
|
|
41
|
+
|
|
42
|
+
initialization_commands: []
|
|
43
|
+
|
|
44
|
+
# List of shell commands to run to set up nodes.
|
|
45
|
+
# NOTE: these are very performance-sensitive. Each new item opens/closes an SSH
|
|
46
|
+
# connection, which is expensive. Try your best to co-locate commands into fewer
|
|
47
|
+
# items!
|
|
48
|
+
#
|
|
49
|
+
# Increment the following for catching performance bugs easier:
|
|
50
|
+
# current num items (num SSH connections): 1
|
|
51
|
+
setup_commands:
|
|
52
|
+
# Disable unattended-upgrades and handle apt-get locks
|
|
53
|
+
# Install patch utility for Ray
|
|
54
|
+
# Install conda and Ray
|
|
55
|
+
# Set system limits for Ray performance (nofile and TasksMax)
|
|
56
|
+
- {%- for initial_setup_command in initial_setup_commands %}
|
|
57
|
+
{{ initial_setup_command }}
|
|
58
|
+
{%- endfor %}
|
|
59
|
+
sudo systemctl stop unattended-upgrades || true;
|
|
60
|
+
sudo systemctl disable unattended-upgrades || true;
|
|
61
|
+
sudo sed -i 's/Unattended-Upgrade "1"/Unattended-Upgrade "0"/g' /etc/apt/apt.conf.d/20auto-upgrades || true;
|
|
62
|
+
sudo kill -9 `sudo lsof /var/lib/dpkg/lock-frontend | awk '{print $2}' | tail -n 1` || true;
|
|
63
|
+
sudo pkill -9 apt-get;
|
|
64
|
+
sudo pkill -9 dpkg;
|
|
65
|
+
sudo dpkg --configure -a;
|
|
66
|
+
which patch > /dev/null || sudo apt install -y patch;
|
|
67
|
+
{{ conda_installation_commands }}
|
|
68
|
+
{{ ray_skypilot_installation_commands }}
|
|
69
|
+
{{ copy_skypilot_templates_commands }}
|
|
70
|
+
sudo bash -c 'rm -rf /etc/security/limits.d; echo "* soft nofile 1048576" >> /etc/security/limits.conf; echo "* hard nofile 1048576" >> /etc/security/limits.conf';
|
|
71
|
+
sudo grep -e '^DefaultTasksMax' /etc/systemd/system.conf || (sudo bash -c 'echo "DefaultTasksMax=infinity" >> /etc/systemd/system.conf'); sudo systemctl set-property user-$(id -u $(whoami)).slice TasksMax=infinity; sudo systemctl daemon-reload;
|
|
72
|
+
{{ ssh_max_sessions_config }}
|
sky/templates/runpod-ray.yml.j2
CHANGED
|
@@ -20,7 +20,7 @@ provider:
|
|
|
20
20
|
username: |-
|
|
21
21
|
{{docker_login_config.username}}
|
|
22
22
|
password: |-
|
|
23
|
-
{{docker_login_config.password}}
|
|
23
|
+
{{docker_login_config.password | indent(6) }}
|
|
24
24
|
server: |-
|
|
25
25
|
{{docker_login_config.server}}
|
|
26
26
|
{%- endif %}
|
|
@@ -40,6 +40,14 @@ available_node_types:
|
|
|
40
40
|
skypilot:ssh_public_key_content
|
|
41
41
|
Preemptible: {{use_spot}}
|
|
42
42
|
BidPerGPU: {{bid_per_gpu}}
|
|
43
|
+
{%- if volume_mounts and volume_mounts|length > 0 %}
|
|
44
|
+
VolumeMounts:
|
|
45
|
+
{%- for vm in volume_mounts %}
|
|
46
|
+
- VolumeNameOnCloud: {{ vm.volume_name_on_cloud }}
|
|
47
|
+
VolumeIdOnCloud: {{ vm.volume_id_on_cloud }}
|
|
48
|
+
MountPath: {{ vm.path }}
|
|
49
|
+
{%- endfor %}
|
|
50
|
+
{%- endif %}
|
|
43
51
|
|
|
44
52
|
head_node_type: ray_head_default
|
|
45
53
|
|
|
@@ -85,6 +93,7 @@ setup_commands:
|
|
|
85
93
|
mkdir -p ~/.ssh; touch ~/.ssh/config;
|
|
86
94
|
{{ conda_installation_commands }}
|
|
87
95
|
{{ ray_skypilot_installation_commands }}
|
|
96
|
+
{{ copy_skypilot_templates_commands }}
|
|
88
97
|
touch ~/.sudo_as_admin_successful;
|
|
89
98
|
sudo bash -c 'rm -rf /etc/security/limits.d; echo "* soft nofile 1048576" >> /etc/security/limits.conf; echo "* hard nofile 1048576" >> /etc/security/limits.conf';
|
|
90
99
|
sudo grep -e '^DefaultTasksMax' /etc/systemd/system.conf || (sudo bash -c 'echo "DefaultTasksMax=infinity" >> /etc/systemd/system.conf'); sudo systemctl set-property user-$(id -u $(whoami)).slice TasksMax=infinity; sudo systemctl daemon-reload;
|
sky/templates/scp-ray.yml.j2
CHANGED
|
@@ -7,7 +7,7 @@ idle_timeout_minutes: 60
|
|
|
7
7
|
|
|
8
8
|
provider:
|
|
9
9
|
type: external
|
|
10
|
-
module: sky.
|
|
10
|
+
module: sky.provision.scp
|
|
11
11
|
region: {{region}}
|
|
12
12
|
cache_stopped_nodes: True
|
|
13
13
|
|
|
@@ -24,19 +24,6 @@ available_node_types:
|
|
|
24
24
|
InstanceType: {{instance_type}}
|
|
25
25
|
imageId: {{image_id}}
|
|
26
26
|
diskSize: {{disk_size}}
|
|
27
|
-
{% if num_nodes > 1 %}
|
|
28
|
-
ray_worker_default:
|
|
29
|
-
min_workers: {{num_nodes - 1}}
|
|
30
|
-
max_workers: {{num_nodes - 1}}
|
|
31
|
-
resources: {}
|
|
32
|
-
node_config:
|
|
33
|
-
AuthorizedKey: |
|
|
34
|
-
skypilot:ssh_public_key_content
|
|
35
|
-
InstanceType: {{instance_type}}
|
|
36
|
-
imageId: {{image_id}}
|
|
37
|
-
diskSize: {{disk_size}}
|
|
38
|
-
|
|
39
|
-
{%- endif %}
|
|
40
27
|
|
|
41
28
|
head_node_type: ray_head_default
|
|
42
29
|
|
|
@@ -50,10 +37,6 @@ file_mounts: {
|
|
|
50
37
|
{%- endfor %}
|
|
51
38
|
}
|
|
52
39
|
|
|
53
|
-
rsync_exclude: []
|
|
54
|
-
|
|
55
|
-
initialization_commands: []
|
|
56
|
-
|
|
57
40
|
# List of shell commands to run to set up nodes.
|
|
58
41
|
# NOTE: these are very performance-sensitive. Each new item opens/closes an SSH
|
|
59
42
|
# connection, which is expensive. Try your best to co-locate commands into fewer
|
|
@@ -73,40 +56,11 @@ setup_commands:
|
|
|
73
56
|
- mkdir -p ~/.ssh; touch ~/.ssh/config;
|
|
74
57
|
{{ conda_installation_commands }}
|
|
75
58
|
{{ ray_skypilot_installation_commands }}
|
|
59
|
+
{{ copy_skypilot_templates_commands }}
|
|
76
60
|
sudo bash -c 'rm -rf /etc/security/limits.d; echo "* soft nofile 1048576" >> /etc/security/limits.conf; echo "* hard nofile 1048576" >> /etc/security/limits.conf';
|
|
77
61
|
sudo grep -e '^DefaultTasksMax' /etc/systemd/system.conf || (sudo bash -c 'echo "DefaultTasksMax=infinity" >> /etc/systemd/system.conf'); sudo systemctl set-property user-$(id -u $(whoami)).slice TasksMax=infinity; sudo systemctl daemon-reload;
|
|
78
62
|
mkdir -p ~/.ssh; (grep -Pzo -q "Host \*\n StrictHostKeyChecking no\n IdentityFile ~/.ssh/sky-cluster-key\n IdentityFile ~/.ssh/id_rsa" ~/.ssh/config) || printf "Host *\n StrictHostKeyChecking no\n IdentityFile ~/.ssh/sky-cluster-key\n IdentityFile ~/.ssh/id_rsa\n" >> ~/.ssh/config;
|
|
79
63
|
[ -f /etc/fuse.conf ] && sudo sed -i 's/#user_allow_other/user_allow_other/g' /etc/fuse.conf || (sudo sh -c 'echo "user_allow_other" > /etc/fuse.conf'); # This is needed for `-o allow_other` option for `goofys`;
|
|
80
|
-
{{ ssh_max_sessions_config }}
|
|
81
|
-
|
|
82
|
-
# Command to start ray on the head node. You don't need to change this.
|
|
83
|
-
# NOTE: these are very performance-sensitive. Each new item opens/closes an SSH
|
|
84
|
-
# connection, which is expensive. Try your best to co-locate commands into fewer
|
|
85
|
-
# items! The same comment applies for worker_start_ray_commands.
|
|
86
|
-
#
|
|
87
|
-
# Increment the following for catching performance bugs easier:
|
|
88
|
-
# current num items (num SSH connections): 1
|
|
89
|
-
head_start_ray_commands:
|
|
90
|
-
# NOTE: --disable-usage-stats in `ray start` saves 10 seconds of idle wait.
|
|
91
|
-
# Line "which prlimit ..": increase the limit of the number of open files for the raylet process, as the `ulimit` may not take effect at this point, because it requires
|
|
92
|
-
# all the sessions to be reloaded. This is a workaround.
|
|
93
|
-
- {{ sky_activate_python_env }}; {{ sky_ray_cmd }} stop; RAY_SCHEDULER_EVENTS=0 RAY_DEDUP_LOGS=0 {{ sky_ray_cmd }} start --disable-usage-stats --head --port={{ray_port}} --dashboard-port={{ray_dashboard_port}} --object-manager-port=8076 --autoscaling-config=~/ray_bootstrap_config.yaml {{"--resources='%s'" % custom_resources if custom_resources}} --temp-dir {{ray_temp_dir}} || exit 1;
|
|
94
|
-
which prlimit && for id in $(pgrep -f raylet/raylet); do sudo prlimit --nofile=1048576:1048576 --pid=$id || true; done;
|
|
95
|
-
{{dump_port_command}}; {{ray_head_wait_initialized_command}}
|
|
96
|
-
|
|
97
|
-
{%- if num_nodes > 1 %}
|
|
98
|
-
worker_start_ray_commands:
|
|
99
|
-
- {{ sky_activate_python_env }}; {{ sky_ray_cmd }} stop; RAY_SCHEDULER_EVENTS=0 RAY_DEDUP_LOGS=0 {{ sky_ray_cmd }} start --disable-usage-stats --address=$RAY_HEAD_IP:{{ray_port}} --object-manager-port=8076 {{"--resources='%s'" % custom_resources if custom_resources}} --temp-dir {{ray_temp_dir}} || exit 1;
|
|
100
|
-
which prlimit && for id in $(pgrep -f raylet/raylet); do sudo prlimit --nofile=1048576:1048576 --pid=$id || true; done;
|
|
101
|
-
{%- else %}
|
|
102
|
-
worker_start_ray_commands: []
|
|
103
|
-
{%- endif %}
|
|
104
|
-
|
|
105
|
-
head_node: {}
|
|
106
|
-
worker_nodes: {}
|
|
107
64
|
|
|
108
|
-
#
|
|
109
|
-
|
|
110
|
-
worker_setup_commands: []
|
|
111
|
-
cluster_synced_files: []
|
|
112
|
-
file_mounts_sync_continuously: False
|
|
65
|
+
# Command to start ray clusters are now placed in `sky.provision.instance_setup`.
|
|
66
|
+
# We do not need to list it here anymore.
|
|
@@ -0,0 +1,171 @@
|
|
|
1
|
+
cluster_name: {{ cluster_name_on_cloud }}
|
|
2
|
+
|
|
3
|
+
max_workers: {{ num_nodes - 1 }}
|
|
4
|
+
upscaling_speed: {{ num_nodes - 1 }}
|
|
5
|
+
idle_timeout_minutes: 5
|
|
6
|
+
|
|
7
|
+
{%- if docker_image is not none %}
|
|
8
|
+
docker:
|
|
9
|
+
image: {{docker_image}}
|
|
10
|
+
container_name: {{docker_container_name}}
|
|
11
|
+
run_options:
|
|
12
|
+
- --ulimit nofile=1048576:1048576
|
|
13
|
+
{%- for run_option in docker_run_options %}
|
|
14
|
+
- {{run_option}}
|
|
15
|
+
{%- endfor %}
|
|
16
|
+
{%- if docker_login_config is not none %}
|
|
17
|
+
docker_login_config:
|
|
18
|
+
username: |-
|
|
19
|
+
{{docker_login_config.username}}
|
|
20
|
+
password: |-
|
|
21
|
+
{{docker_login_config.password | indent(6) }}
|
|
22
|
+
server: |-
|
|
23
|
+
{{docker_login_config.server}}
|
|
24
|
+
{%- endif %}
|
|
25
|
+
{%- endif %}
|
|
26
|
+
|
|
27
|
+
provider:
|
|
28
|
+
type: external
|
|
29
|
+
module: sky.provision.seeweb
|
|
30
|
+
region: "{{ region }}"
|
|
31
|
+
|
|
32
|
+
auth:
|
|
33
|
+
ssh_user: ecuser
|
|
34
|
+
ssh_private_key: {{ ssh_private_key }}
|
|
35
|
+
|
|
36
|
+
available_node_types:
|
|
37
|
+
ray_head_default:
|
|
38
|
+
resources: {}
|
|
39
|
+
node_config:
|
|
40
|
+
plan: {{ instance_type }}
|
|
41
|
+
image: {{ image_id }}
|
|
42
|
+
location: {{ region }}
|
|
43
|
+
{% if seeweb_gpu_config is not none %}
|
|
44
|
+
gpu: {{ seeweb_gpu_config.gpu }}
|
|
45
|
+
gpu_label: "{{ seeweb_gpu_config.gpu_label }}"
|
|
46
|
+
{% endif %}
|
|
47
|
+
disk: {{ disk_size }}
|
|
48
|
+
{% if docker_image is not none %}
|
|
49
|
+
user_customize: |
|
|
50
|
+
#!/bin/bash
|
|
51
|
+
# Auto-generated Docker installation script for Seeweb
|
|
52
|
+
LOG_FILE=/var/log/user_customize.log
|
|
53
|
+
sudo mkdir -p "$(dirname "$LOG_FILE")"
|
|
54
|
+
{
|
|
55
|
+
echo "[$(date -Is)] Cloud script: start"
|
|
56
|
+
sudo apt-get update
|
|
57
|
+
sudo apt-get install -y \
|
|
58
|
+
apt-transport-https \
|
|
59
|
+
ca-certificates \
|
|
60
|
+
curl \
|
|
61
|
+
gnupg-agent \
|
|
62
|
+
lsb-release \
|
|
63
|
+
software-properties-common
|
|
64
|
+
sudo mkdir -p /usr/share/keyrings
|
|
65
|
+
curl -fsSL https://download.docker.com/linux/ubuntu/gpg | \
|
|
66
|
+
sudo gpg --dearmor -o /usr/share/keyrings/docker-archive-keyring.gpg
|
|
67
|
+
UBU_CODENAME="$(. /etc/os-release && echo "$VERSION_CODENAME")"
|
|
68
|
+
echo "deb [arch=amd64 signed-by=/usr/share/keyrings/docker-archive-keyring.gpg] https://download.docker.com/linux/ubuntu ${UBU_CODENAME} stable" | \
|
|
69
|
+
sudo tee /etc/apt/sources.list.d/docker.list > /dev/null
|
|
70
|
+
sudo apt-get update
|
|
71
|
+
sudo apt-get install -y docker-ce docker-ce-cli containerd.io
|
|
72
|
+
echo "[$(date -Is)] Cloud script: docker installed"
|
|
73
|
+
sudo usermod -aG docker ecuser || true
|
|
74
|
+
sudo systemctl enable docker || true
|
|
75
|
+
sudo systemctl start docker || true
|
|
76
|
+
command -v docker && docker --version || echo "[$(date -Is)] docker still missing"
|
|
77
|
+
echo "[$(date -Is)] Cloud script: complete"
|
|
78
|
+
} | sudo tee -a "$LOG_FILE"
|
|
79
|
+
sudo touch /var/log/docker_install_done
|
|
80
|
+
{% endif %}
|
|
81
|
+
|
|
82
|
+
head_node_type: ray_head_default
|
|
83
|
+
|
|
84
|
+
# Format: `REMOTE_PATH : LOCAL_PATH`
|
|
85
|
+
file_mounts: {
|
|
86
|
+
"~/.seeweb_cloud/seeweb_keys": "~/.seeweb_cloud/seeweb_keys",
|
|
87
|
+
"{{sky_ray_yaml_remote_path}}": "{{sky_ray_yaml_local_path}}",
|
|
88
|
+
"{{sky_remote_path}}/{{sky_wheel_hash}}": "{{sky_local_path}}",
|
|
89
|
+
{%- for remote_path, local_path in credentials.items() %}
|
|
90
|
+
"{{remote_path}}": "{{local_path}}",
|
|
91
|
+
{%- endfor %}
|
|
92
|
+
"~/.ssh/sky-cluster-key": "{{ssh_private_key}}",
|
|
93
|
+
}
|
|
94
|
+
|
|
95
|
+
rsync_exclude: []
|
|
96
|
+
|
|
97
|
+
setup_commands:
|
|
98
|
+
- |
|
|
99
|
+
{%- for initial_setup_command in initial_setup_commands %}
|
|
100
|
+
{{ initial_setup_command }}
|
|
101
|
+
{%- endfor %}
|
|
102
|
+
touch ~/.bashrc;
|
|
103
|
+
echo "127.0.0.1 $(hostname)" | sudo tee -a /etc/hosts || true;
|
|
104
|
+
echo "127.0.0.1 localhost" | sudo tee -a /etc/hosts || true;
|
|
105
|
+
sudo systemctl stop unattended-upgrades || true;
|
|
106
|
+
sudo systemctl disable unattended-upgrades || true;
|
|
107
|
+
sudo apt update && sudo apt install -y patch || sudo yum install -y patch || true;
|
|
108
|
+
|
|
109
|
+
{%- if docker_image is not none %}
|
|
110
|
+
# Docker installed via cloud-init; ensure service will be started by cloud-init
|
|
111
|
+
{%- endif %}
|
|
112
|
+
|
|
113
|
+
{{ conda_installation_commands }}
|
|
114
|
+
{{ ray_skypilot_installation_commands }}
|
|
115
|
+
{{ copy_skypilot_templates_commands }}
|
|
116
|
+
|
|
117
|
+
head_start_ray_commands:
|
|
118
|
+
- |
|
|
119
|
+
retry_ray() {
|
|
120
|
+
local n=0; local max=30
|
|
121
|
+
until [ $n -ge $max ]; do
|
|
122
|
+
export SKYPILOT_NUM_GPUS=0
|
|
123
|
+
command -v nvidia-smi >/dev/null 2>&1 && \
|
|
124
|
+
SKYPILOT_NUM_GPUS=$(nvidia-smi --query-gpu=index --format=csv,noheader | wc -l)
|
|
125
|
+
|
|
126
|
+
ray stop || true
|
|
127
|
+
RAY_SCHEDULER_EVENTS=0 RAY_DEDUP_LOGS=0 \
|
|
128
|
+
ray start --disable-usage-stats --head \
|
|
129
|
+
--port={{ ray_port }} --dashboard-port={{ ray_dashboard_port }} \
|
|
130
|
+
--object-manager-port=8076 \
|
|
131
|
+
--autoscaling-config=~/ray_bootstrap_config.yaml \
|
|
132
|
+
--num-gpus=$SKYPILOT_NUM_GPUS --temp-dir {{ ray_temp_dir }} && break
|
|
133
|
+
|
|
134
|
+
echo "[head] Ray failed to start ($((++n))/$max), retrying in 5s..."
|
|
135
|
+
sleep 5
|
|
136
|
+
done
|
|
137
|
+
[ $n -eq $max ] && { echo "Ray head failed"; exit 1; }
|
|
138
|
+
}
|
|
139
|
+
retry_ray
|
|
140
|
+
|
|
141
|
+
worker_start_ray_commands:
|
|
142
|
+
- |
|
|
143
|
+
retry_ray() {
|
|
144
|
+
local n=0; local max=30
|
|
145
|
+
until [ $n -ge $max ]; do
|
|
146
|
+
SKYPILOT_NUM_GPUS=0
|
|
147
|
+
command -v nvidia-smi >/dev/null 2>&1 && \
|
|
148
|
+
SKYPILOT_NUM_GPUS=$(nvidia-smi --query-gpu=index --format=csv,noheader | wc -l)
|
|
149
|
+
|
|
150
|
+
ray stop || true
|
|
151
|
+
RAY_SCHEDULER_EVENTS=0 RAY_DEDUP_LOGS=0 \
|
|
152
|
+
ray start --disable-usage-stats \
|
|
153
|
+
--address=$RAY_HEAD_IP:{{ ray_port }} \
|
|
154
|
+
--object-manager-port=8076 \
|
|
155
|
+
--num-gpus=$SKYPILOT_NUM_GPUS --temp-dir {{ ray_temp_dir }} && break
|
|
156
|
+
|
|
157
|
+
echo "[worker] Ray failed to start ($((++n))/$max), retrying in 5s..."
|
|
158
|
+
sleep 5
|
|
159
|
+
done
|
|
160
|
+
[ $n -eq $max ] && { echo "Ray worker failed"; exit 1; }
|
|
161
|
+
}
|
|
162
|
+
retry_ray
|
|
163
|
+
|
|
164
|
+
head_node: {}
|
|
165
|
+
worker_nodes: {}
|
|
166
|
+
|
|
167
|
+
head_setup_commands: []
|
|
168
|
+
worker_setup_commands: []
|
|
169
|
+
|
|
170
|
+
cluster_synced_files: []
|
|
171
|
+
file_mounts_sync_continuously: False
|
|
@@ -0,0 +1,73 @@
|
|
|
1
|
+
cluster_name: {{cluster_name_on_cloud}}
|
|
2
|
+
|
|
3
|
+
# The maximum number of workers nodes to launch in addition to the head node.
|
|
4
|
+
max_workers: {{num_nodes - 1}}
|
|
5
|
+
upscaling_speed: {{num_nodes - 1}}
|
|
6
|
+
idle_timeout_minutes: 60
|
|
7
|
+
|
|
8
|
+
provider:
|
|
9
|
+
type: external
|
|
10
|
+
module: sky.provision.shadeform
|
|
11
|
+
region: "{{region}}"
|
|
12
|
+
disable_launch_config_check: true
|
|
13
|
+
|
|
14
|
+
auth:
|
|
15
|
+
ssh_user: shadeform
|
|
16
|
+
ssh_private_key: {{ssh_private_key}}
|
|
17
|
+
ssh_key_id: {{ssh_key_id}}
|
|
18
|
+
|
|
19
|
+
available_node_types:
|
|
20
|
+
ray_head_default:
|
|
21
|
+
{%- if custom_resources %}
|
|
22
|
+
resources: {{custom_resources}}
|
|
23
|
+
{%- else %}
|
|
24
|
+
resources: {}
|
|
25
|
+
{%- endif %}
|
|
26
|
+
node_config:
|
|
27
|
+
InstanceType: {{instance_type}}
|
|
28
|
+
PublicKey: |-
|
|
29
|
+
skypilot:ssh_public_key_content
|
|
30
|
+
|
|
31
|
+
head_node_type: ray_head_default
|
|
32
|
+
|
|
33
|
+
# Format: `REMOTE_PATH : LOCAL_PATH`
|
|
34
|
+
file_mounts: {
|
|
35
|
+
"{{sky_ray_yaml_remote_path}}": "{{sky_ray_yaml_local_path}}",
|
|
36
|
+
"{{sky_remote_path}}/{{sky_wheel_hash}}": "{{sky_local_path}}",
|
|
37
|
+
{%- for remote_path, local_path in credentials.items() %}
|
|
38
|
+
"{{remote_path}}": "{{local_path}}",
|
|
39
|
+
{%- endfor %}
|
|
40
|
+
}
|
|
41
|
+
|
|
42
|
+
rsync_exclude: []
|
|
43
|
+
|
|
44
|
+
initialization_commands: []
|
|
45
|
+
|
|
46
|
+
# List of shell commands to run to set up nodes.
|
|
47
|
+
# NOTE: these are very performance-sensitive. Each new item opens/closes an SSH
|
|
48
|
+
# connection, which is expensive. Try your best to co-locate commands into fewer
|
|
49
|
+
# items!
|
|
50
|
+
#
|
|
51
|
+
# Increment the following for catching performance bugs easier:
|
|
52
|
+
# current num items (num SSH connections): 1
|
|
53
|
+
setup_commands:
|
|
54
|
+
# Create ~/.ssh/config file in case the file does not exist in the image.
|
|
55
|
+
# Line 'rm ..': there is another installation of pip.
|
|
56
|
+
# Line 'sudo bash ..': set the ulimit as suggested by ray docs for performance. https://docs.ray.io/en/latest/cluster/vms/user-guides/large-cluster-best-practices.html#system-configuration
|
|
57
|
+
# Line 'sudo grep ..': set the number of threads per process to unlimited to avoid ray job submit stucking issue when the number of running ray jobs increase.
|
|
58
|
+
# Line 'mkdir -p ..': disable host key check
|
|
59
|
+
# Line 'python3 -c ..': patch the buggy ray files and enable `-o allow_other` option for `goofys`
|
|
60
|
+
- {%- for initial_setup_command in initial_setup_commands %}
|
|
61
|
+
{{ initial_setup_command }}
|
|
62
|
+
{%- endfor %}
|
|
63
|
+
mkdir -p ~/.ssh; touch ~/.ssh/config; which patch > /dev/null || sudo apt install -y patch;
|
|
64
|
+
{{ conda_installation_commands }}
|
|
65
|
+
{{ ray_skypilot_installation_commands }}
|
|
66
|
+
{{ copy_skypilot_templates_commands }}
|
|
67
|
+
sudo bash -c 'rm -rf /etc/security/limits.d; echo "* soft nofile 1048576" >> /etc/security/limits.conf; echo "* hard nofile 1048576" >> /etc/security/limits.conf';
|
|
68
|
+
sudo grep -e '^DefaultTasksMax' /etc/systemd/system.conf || (sudo bash -c 'echo "DefaultTasksMax=infinity" >> /etc/systemd/system.conf'); sudo systemctl set-property user-$(id -u $(whoami)).slice TasksMax=infinity; sudo systemctl daemon-reload;
|
|
69
|
+
(grep -Pzo -q "Host \*\n StrictHostKeyChecking no" ~/.ssh/config) || printf "Host *\n StrictHostKeyChecking no\n" >> ~/.ssh/config;
|
|
70
|
+
{{ ssh_max_sessions_config }}
|
|
71
|
+
|
|
72
|
+
# Command to start ray clusters are now placed in `sky.provision.instance_setup`.
|
|
73
|
+
# We do not need to list it here anymore.
|
|
@@ -34,6 +34,9 @@ file_mounts:
|
|
|
34
34
|
{%- for remote_catalog_path, local_catalog_path in modified_catalogs.items() %}
|
|
35
35
|
{{remote_catalog_path}}: {{local_catalog_path}}
|
|
36
36
|
{%- endfor %}
|
|
37
|
+
{%- for controller_file_mount_path, local_file_mount_path in local_to_controller_file_mounts.items() %}
|
|
38
|
+
{{controller_file_mount_path}}: {{local_file_mount_path}}
|
|
39
|
+
{%- endfor %}
|
|
37
40
|
{%- if use_tls %}
|
|
38
41
|
{{remote_tls_keyfile}}: {{local_tls_keyfile}}
|
|
39
42
|
{{remote_tls_certfile}}: {{local_tls_certfile}}
|
|
@@ -42,13 +45,30 @@ file_mounts:
|
|
|
42
45
|
run: |
|
|
43
46
|
# Activate the Python environment, so that cloud SDKs can be found in the
|
|
44
47
|
# PATH.
|
|
48
|
+
{%- if consolidation_mode_job_id is none %}
|
|
45
49
|
{{ sky_activate_python_env }}
|
|
50
|
+
{%- endif %}
|
|
46
51
|
# Start sky serve service.
|
|
47
|
-
|
|
52
|
+
{%- if consolidation_mode_job_id is not none %}
|
|
53
|
+
{{sky_python_cmd}} \
|
|
54
|
+
{%- else %}
|
|
55
|
+
python \
|
|
56
|
+
{%- endif %}
|
|
57
|
+
-u -m sky.serve.service \
|
|
48
58
|
--service-name {{service_name}} \
|
|
49
59
|
--task-yaml {{remote_task_yaml_path}} \
|
|
60
|
+
--entrypoint {{entrypoint}} \
|
|
61
|
+
{%- if consolidation_mode_job_id is not none %}
|
|
62
|
+
--job-id {{consolidation_mode_job_id}} \
|
|
63
|
+
{%- else %}
|
|
50
64
|
--job-id $SKYPILOT_INTERNAL_JOB_ID \
|
|
51
|
-
|
|
65
|
+
{%- endif %}
|
|
66
|
+
>> {{controller_log_file}} 2>&1 \
|
|
67
|
+
{%- if consolidation_mode_job_id is not none %}
|
|
68
|
+
&
|
|
69
|
+
{%- endif %}
|
|
70
|
+
# For consolidation mode, we need to run the service in the background so
|
|
71
|
+
# that it can immediately return in serve.core.up().
|
|
52
72
|
|
|
53
73
|
envs:
|
|
54
74
|
{%- for env_name, env_value in controller_envs.items() %}
|
sky/templates/vast-ray.yml.j2
CHANGED
|
@@ -61,6 +61,7 @@ setup_commands:
|
|
|
61
61
|
mkdir -p ~/.ssh; touch ~/.ssh/config; which patch > /dev/null || sudo apt install -y patch;
|
|
62
62
|
{{ conda_installation_commands }}
|
|
63
63
|
{{ ray_skypilot_installation_commands }}
|
|
64
|
+
{{ copy_skypilot_templates_commands }}
|
|
64
65
|
sudo bash -c 'rm -rf /etc/security/limits.d; echo "* soft nofile 1048576" >> /etc/security/limits.conf; echo "* hard nofile 1048576" >> /etc/security/limits.conf';
|
|
65
66
|
sudo grep -e '^DefaultTasksMax' /etc/systemd/system.conf || (sudo bash -c 'echo "DefaultTasksMax=infinity" >> /etc/systemd/system.conf'); sudo systemctl set-property user-$(id -u $(whoami)).slice TasksMax=infinity; sudo systemctl daemon-reload;
|
|
66
67
|
(grep -Pzo -q "Host \*\n StrictHostKeyChecking no" ~/.ssh/config) || printf "Host *\n StrictHostKeyChecking no\n" >> ~/.ssh/config;
|
sky/templates/vsphere-ray.yml.j2
CHANGED
|
@@ -67,6 +67,7 @@ setup_commands:
|
|
|
67
67
|
pip3 --version > /dev/null 2>&1 || (curl -sSL https://bootstrap.pypa.io/get-pip.py -o get-pip.py && python3 get-pip.py && echo "PATH=$HOME/.local/bin:$PATH" >> ~/.bashrc);
|
|
68
68
|
{{ conda_installation_commands }}
|
|
69
69
|
{{ ray_skypilot_installation_commands }}
|
|
70
|
+
{{ copy_skypilot_templates_commands }}
|
|
70
71
|
sudo bash -c 'rm -rf /etc/security/limits.d; echo "* soft nofile 1048576" >> /etc/security/limits.conf; echo "* hard nofile 1048576" >> /etc/security/limits.conf';
|
|
71
72
|
sudo grep -e '^DefaultTasksMax' /etc/systemd/system.conf || (sudo bash -c 'echo "DefaultTasksMax=infinity" >> /etc/systemd/system.conf'); sudo systemctl set-property user-$(id -u $(whoami)).slice TasksMax=infinity; sudo systemctl daemon-reload;
|
|
72
73
|
mkdir -p ~/.ssh; (grep -Pzo -q "Host \*\n StrictHostKeyChecking no\n IdentityFile ~/.ssh/sky-cluster-key\n IdentityFile ~/.ssh/id_rsa" ~/.ssh/config) || printf "Host *\n StrictHostKeyChecking no\n IdentityFile ~/.ssh/sky-cluster-key\n IdentityFile ~/.ssh/id_rsa\n" >> ~/.ssh/config;
|