skypilot-nightly 1.0.0.dev20250502__py3-none-any.whl → 1.0.0.dev20251203__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sky/__init__.py +22 -6
- sky/adaptors/aws.py +81 -16
- sky/adaptors/common.py +25 -2
- sky/adaptors/coreweave.py +278 -0
- sky/adaptors/do.py +8 -2
- sky/adaptors/gcp.py +11 -0
- sky/adaptors/hyperbolic.py +8 -0
- sky/adaptors/ibm.py +5 -2
- sky/adaptors/kubernetes.py +149 -18
- sky/adaptors/nebius.py +173 -30
- sky/adaptors/primeintellect.py +1 -0
- sky/adaptors/runpod.py +68 -0
- sky/adaptors/seeweb.py +183 -0
- sky/adaptors/shadeform.py +89 -0
- sky/admin_policy.py +187 -4
- sky/authentication.py +179 -225
- sky/backends/__init__.py +4 -2
- sky/backends/backend.py +22 -9
- sky/backends/backend_utils.py +1323 -397
- sky/backends/cloud_vm_ray_backend.py +1749 -1029
- sky/backends/docker_utils.py +1 -1
- sky/backends/local_docker_backend.py +11 -6
- sky/backends/task_codegen.py +633 -0
- sky/backends/wheel_utils.py +55 -9
- sky/{clouds/service_catalog → catalog}/__init__.py +21 -19
- sky/{clouds/service_catalog → catalog}/aws_catalog.py +27 -8
- sky/{clouds/service_catalog → catalog}/azure_catalog.py +10 -7
- sky/{clouds/service_catalog → catalog}/common.py +90 -49
- sky/{clouds/service_catalog → catalog}/cudo_catalog.py +8 -5
- sky/{clouds/service_catalog → catalog}/data_fetchers/analyze.py +1 -1
- sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_aws.py +116 -80
- sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_cudo.py +38 -38
- sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_gcp.py +70 -16
- sky/catalog/data_fetchers/fetch_hyperbolic.py +136 -0
- sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_lambda_cloud.py +1 -0
- sky/catalog/data_fetchers/fetch_nebius.py +338 -0
- sky/catalog/data_fetchers/fetch_runpod.py +698 -0
- sky/catalog/data_fetchers/fetch_seeweb.py +329 -0
- sky/catalog/data_fetchers/fetch_shadeform.py +142 -0
- sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_vast.py +1 -1
- sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_vsphere.py +1 -1
- sky/{clouds/service_catalog → catalog}/do_catalog.py +5 -2
- sky/{clouds/service_catalog → catalog}/fluidstack_catalog.py +6 -3
- sky/{clouds/service_catalog → catalog}/gcp_catalog.py +41 -15
- sky/catalog/hyperbolic_catalog.py +136 -0
- sky/{clouds/service_catalog → catalog}/ibm_catalog.py +9 -6
- sky/{clouds/service_catalog → catalog}/kubernetes_catalog.py +36 -24
- sky/{clouds/service_catalog → catalog}/lambda_catalog.py +9 -6
- sky/{clouds/service_catalog → catalog}/nebius_catalog.py +9 -7
- sky/{clouds/service_catalog → catalog}/oci_catalog.py +9 -6
- sky/{clouds/service_catalog → catalog}/paperspace_catalog.py +5 -2
- sky/catalog/primeintellect_catalog.py +95 -0
- sky/{clouds/service_catalog → catalog}/runpod_catalog.py +11 -4
- sky/{clouds/service_catalog → catalog}/scp_catalog.py +9 -6
- sky/catalog/seeweb_catalog.py +184 -0
- sky/catalog/shadeform_catalog.py +165 -0
- sky/catalog/ssh_catalog.py +167 -0
- sky/{clouds/service_catalog → catalog}/vast_catalog.py +6 -3
- sky/{clouds/service_catalog → catalog}/vsphere_catalog.py +5 -2
- sky/check.py +533 -185
- sky/cli.py +5 -5975
- sky/client/{cli.py → cli/command.py} +2591 -1956
- sky/client/cli/deprecation_utils.py +99 -0
- sky/client/cli/flags.py +359 -0
- sky/client/cli/table_utils.py +322 -0
- sky/client/cli/utils.py +79 -0
- sky/client/common.py +78 -32
- sky/client/oauth.py +82 -0
- sky/client/sdk.py +1219 -319
- sky/client/sdk_async.py +827 -0
- sky/client/service_account_auth.py +47 -0
- sky/cloud_stores.py +82 -3
- sky/clouds/__init__.py +13 -0
- sky/clouds/aws.py +564 -164
- sky/clouds/azure.py +105 -83
- sky/clouds/cloud.py +140 -40
- sky/clouds/cudo.py +68 -50
- sky/clouds/do.py +66 -48
- sky/clouds/fluidstack.py +63 -44
- sky/clouds/gcp.py +339 -110
- sky/clouds/hyperbolic.py +293 -0
- sky/clouds/ibm.py +70 -49
- sky/clouds/kubernetes.py +570 -162
- sky/clouds/lambda_cloud.py +74 -54
- sky/clouds/nebius.py +210 -81
- sky/clouds/oci.py +88 -66
- sky/clouds/paperspace.py +61 -44
- sky/clouds/primeintellect.py +317 -0
- sky/clouds/runpod.py +164 -74
- sky/clouds/scp.py +89 -86
- sky/clouds/seeweb.py +477 -0
- sky/clouds/shadeform.py +400 -0
- sky/clouds/ssh.py +263 -0
- sky/clouds/utils/aws_utils.py +10 -4
- sky/clouds/utils/gcp_utils.py +87 -11
- sky/clouds/utils/oci_utils.py +38 -14
- sky/clouds/utils/scp_utils.py +231 -167
- sky/clouds/vast.py +99 -77
- sky/clouds/vsphere.py +51 -40
- sky/core.py +375 -173
- sky/dag.py +15 -0
- sky/dashboard/out/404.html +1 -1
- sky/dashboard/out/_next/static/96_E2yl3QAiIJGOYCkSpB/_buildManifest.js +1 -0
- sky/dashboard/out/_next/static/chunks/1141-e6aa9ab418717c59.js +11 -0
- sky/dashboard/out/_next/static/chunks/1272-1ef0bf0237faccdb.js +1 -0
- sky/dashboard/out/_next/static/chunks/1871-7e202677c42f43fe.js +6 -0
- sky/dashboard/out/_next/static/chunks/2260-7703229c33c5ebd5.js +1 -0
- sky/dashboard/out/_next/static/chunks/2350.fab69e61bac57b23.js +1 -0
- sky/dashboard/out/_next/static/chunks/2369.fc20f0c2c8ed9fe7.js +15 -0
- sky/dashboard/out/_next/static/chunks/2755.edd818326d489a1d.js +26 -0
- sky/dashboard/out/_next/static/chunks/3294.20a8540fe697d5ee.js +1 -0
- sky/dashboard/out/_next/static/chunks/3785.7e245f318f9d1121.js +1 -0
- sky/dashboard/out/_next/static/chunks/3800-7b45f9fbb6308557.js +1 -0
- sky/dashboard/out/_next/static/chunks/3850-ff4a9a69d978632b.js +1 -0
- sky/dashboard/out/_next/static/chunks/3937.210053269f121201.js +1 -0
- sky/dashboard/out/_next/static/chunks/4725.172ede95d1b21022.js +1 -0
- sky/dashboard/out/_next/static/chunks/4937.a2baa2df5572a276.js +15 -0
- sky/dashboard/out/_next/static/chunks/5739-d67458fcb1386c92.js +8 -0
- sky/dashboard/out/_next/static/chunks/6130-2be46d70a38f1e82.js +1 -0
- sky/dashboard/out/_next/static/chunks/616-3d59f75e2ccf9321.js +39 -0
- sky/dashboard/out/_next/static/chunks/6212-7bd06f60ba693125.js +13 -0
- sky/dashboard/out/_next/static/chunks/6856-8f27d1c10c98def8.js +1 -0
- sky/dashboard/out/_next/static/chunks/6989-01359c57e018caa4.js +1 -0
- sky/dashboard/out/_next/static/chunks/6990-9146207c4567fdfd.js +1 -0
- sky/dashboard/out/_next/static/chunks/7359-c8d04e06886000b3.js +30 -0
- sky/dashboard/out/_next/static/chunks/7411-b15471acd2cba716.js +41 -0
- sky/dashboard/out/_next/static/chunks/7615-019513abc55b3b47.js +1 -0
- sky/dashboard/out/_next/static/chunks/8640.5b9475a2d18c5416.js +16 -0
- sky/dashboard/out/_next/static/chunks/8969-452f9d5cbdd2dc73.js +1 -0
- sky/dashboard/out/_next/static/chunks/9025.fa408f3242e9028d.js +6 -0
- sky/dashboard/out/_next/static/chunks/9353-cff34f7e773b2e2b.js +1 -0
- sky/dashboard/out/_next/static/chunks/9360.a536cf6b1fa42355.js +31 -0
- sky/dashboard/out/_next/static/chunks/9847.3aaca6bb33455140.js +30 -0
- sky/dashboard/out/_next/static/chunks/fd9d1056-86323a29a8f7e46a.js +1 -0
- sky/dashboard/out/_next/static/chunks/framework-cf60a09ccd051a10.js +33 -0
- sky/dashboard/out/_next/static/chunks/main-app-587214043926b3cc.js +1 -0
- sky/dashboard/out/_next/static/chunks/main-f15ccb73239a3bf1.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/_app-bde01e4a2beec258.js +34 -0
- sky/dashboard/out/_next/static/chunks/pages/_error-c66a4e8afc46f17b.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-792db96d918c98c9.js +16 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-abfcac9c137aa543.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters-ee39056f9851a3ff.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/config-dfb9bf07b13045f4.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/index-444f1804401f04ea.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/infra/[context]-c0b5935149902e6f.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/infra-aed0ea19df7cf961.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-d66997e2bfc837cf.js +16 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs/pools/[pool]-9faf940b253e3e06.js +21 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs-2072b48b617989c9.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/users-f42674164aa73423.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/volumes-b84b948ff357c43e.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/workspace/new-3f88a1c7e86a3f86.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/workspaces/[name]-84a40f8c7c627fe4.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/workspaces-531b2f8c4bf89f82.js +1 -0
- sky/dashboard/out/_next/static/chunks/webpack-64e05f17bf2cf8ce.js +1 -0
- sky/dashboard/out/_next/static/css/0748ce22df867032.css +3 -0
- sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
- sky/dashboard/out/clusters/[cluster].html +1 -1
- sky/dashboard/out/clusters.html +1 -1
- sky/dashboard/out/config.html +1 -0
- sky/dashboard/out/index.html +1 -1
- sky/dashboard/out/infra/[context].html +1 -0
- sky/dashboard/out/infra.html +1 -0
- sky/dashboard/out/jobs/[job].html +1 -1
- sky/dashboard/out/jobs/pools/[pool].html +1 -0
- sky/dashboard/out/jobs.html +1 -1
- sky/dashboard/out/users.html +1 -0
- sky/dashboard/out/volumes.html +1 -0
- sky/dashboard/out/workspace/new.html +1 -0
- sky/dashboard/out/workspaces/[name].html +1 -0
- sky/dashboard/out/workspaces.html +1 -0
- sky/data/data_utils.py +137 -1
- sky/data/mounting_utils.py +269 -84
- sky/data/storage.py +1460 -1807
- sky/data/storage_utils.py +43 -57
- sky/exceptions.py +126 -2
- sky/execution.py +216 -63
- sky/global_user_state.py +2390 -586
- sky/jobs/__init__.py +7 -0
- sky/jobs/client/sdk.py +300 -58
- sky/jobs/client/sdk_async.py +161 -0
- sky/jobs/constants.py +15 -8
- sky/jobs/controller.py +848 -275
- sky/jobs/file_content_utils.py +128 -0
- sky/jobs/log_gc.py +193 -0
- sky/jobs/recovery_strategy.py +402 -152
- sky/jobs/scheduler.py +314 -189
- sky/jobs/server/core.py +836 -255
- sky/jobs/server/server.py +156 -115
- sky/jobs/server/utils.py +136 -0
- sky/jobs/state.py +2109 -706
- sky/jobs/utils.py +1306 -215
- sky/logs/__init__.py +21 -0
- sky/logs/agent.py +108 -0
- sky/logs/aws.py +243 -0
- sky/logs/gcp.py +91 -0
- sky/metrics/__init__.py +0 -0
- sky/metrics/utils.py +453 -0
- sky/models.py +78 -1
- sky/optimizer.py +164 -70
- sky/provision/__init__.py +90 -4
- sky/provision/aws/config.py +147 -26
- sky/provision/aws/instance.py +136 -50
- sky/provision/azure/instance.py +11 -6
- sky/provision/common.py +13 -1
- sky/provision/cudo/cudo_machine_type.py +1 -1
- sky/provision/cudo/cudo_utils.py +14 -8
- sky/provision/cudo/cudo_wrapper.py +72 -71
- sky/provision/cudo/instance.py +10 -6
- sky/provision/do/instance.py +10 -6
- sky/provision/do/utils.py +4 -3
- sky/provision/docker_utils.py +140 -33
- sky/provision/fluidstack/instance.py +13 -8
- sky/provision/gcp/__init__.py +1 -0
- sky/provision/gcp/config.py +301 -19
- sky/provision/gcp/constants.py +218 -0
- sky/provision/gcp/instance.py +36 -8
- sky/provision/gcp/instance_utils.py +18 -4
- sky/provision/gcp/volume_utils.py +247 -0
- sky/provision/hyperbolic/__init__.py +12 -0
- sky/provision/hyperbolic/config.py +10 -0
- sky/provision/hyperbolic/instance.py +437 -0
- sky/provision/hyperbolic/utils.py +373 -0
- sky/provision/instance_setup.py +101 -20
- sky/provision/kubernetes/__init__.py +5 -0
- sky/provision/kubernetes/config.py +9 -52
- sky/provision/kubernetes/constants.py +17 -0
- sky/provision/kubernetes/instance.py +919 -280
- sky/provision/kubernetes/manifests/fusermount-server-daemonset.yaml +1 -2
- sky/provision/kubernetes/network.py +27 -17
- sky/provision/kubernetes/network_utils.py +44 -43
- sky/provision/kubernetes/utils.py +1221 -534
- sky/provision/kubernetes/volume.py +343 -0
- sky/provision/lambda_cloud/instance.py +22 -16
- sky/provision/nebius/constants.py +50 -0
- sky/provision/nebius/instance.py +19 -6
- sky/provision/nebius/utils.py +237 -137
- sky/provision/oci/instance.py +10 -5
- sky/provision/paperspace/instance.py +10 -7
- sky/provision/paperspace/utils.py +1 -1
- sky/provision/primeintellect/__init__.py +10 -0
- sky/provision/primeintellect/config.py +11 -0
- sky/provision/primeintellect/instance.py +454 -0
- sky/provision/primeintellect/utils.py +398 -0
- sky/provision/provisioner.py +117 -36
- sky/provision/runpod/__init__.py +5 -0
- sky/provision/runpod/instance.py +27 -6
- sky/provision/runpod/utils.py +51 -18
- sky/provision/runpod/volume.py +214 -0
- sky/provision/scp/__init__.py +15 -0
- sky/provision/scp/config.py +93 -0
- sky/provision/scp/instance.py +707 -0
- sky/provision/seeweb/__init__.py +11 -0
- sky/provision/seeweb/config.py +13 -0
- sky/provision/seeweb/instance.py +812 -0
- sky/provision/shadeform/__init__.py +11 -0
- sky/provision/shadeform/config.py +12 -0
- sky/provision/shadeform/instance.py +351 -0
- sky/provision/shadeform/shadeform_utils.py +83 -0
- sky/provision/ssh/__init__.py +18 -0
- sky/provision/vast/instance.py +13 -8
- sky/provision/vast/utils.py +10 -7
- sky/provision/volume.py +164 -0
- sky/provision/vsphere/common/ssl_helper.py +1 -1
- sky/provision/vsphere/common/vapiconnect.py +2 -1
- sky/provision/vsphere/common/vim_utils.py +4 -4
- sky/provision/vsphere/instance.py +15 -10
- sky/provision/vsphere/vsphere_utils.py +17 -20
- sky/py.typed +0 -0
- sky/resources.py +845 -119
- sky/schemas/__init__.py +0 -0
- sky/schemas/api/__init__.py +0 -0
- sky/schemas/api/responses.py +227 -0
- sky/schemas/db/README +4 -0
- sky/schemas/db/env.py +90 -0
- sky/schemas/db/global_user_state/001_initial_schema.py +124 -0
- sky/schemas/db/global_user_state/002_add_workspace_to_cluster_history.py +35 -0
- sky/schemas/db/global_user_state/003_fix_initial_revision.py +61 -0
- sky/schemas/db/global_user_state/004_is_managed.py +34 -0
- sky/schemas/db/global_user_state/005_cluster_event.py +32 -0
- sky/schemas/db/global_user_state/006_provision_log.py +41 -0
- sky/schemas/db/global_user_state/007_cluster_event_request_id.py +34 -0
- sky/schemas/db/global_user_state/008_skylet_ssh_tunnel_metadata.py +34 -0
- sky/schemas/db/global_user_state/009_last_activity_and_launched_at.py +89 -0
- sky/schemas/db/global_user_state/010_save_ssh_key.py +66 -0
- sky/schemas/db/global_user_state/011_is_ephemeral.py +34 -0
- sky/schemas/db/kv_cache/001_initial_schema.py +29 -0
- sky/schemas/db/script.py.mako +28 -0
- sky/schemas/db/serve_state/001_initial_schema.py +67 -0
- sky/schemas/db/serve_state/002_yaml_content.py +34 -0
- sky/schemas/db/skypilot_config/001_initial_schema.py +30 -0
- sky/schemas/db/spot_jobs/001_initial_schema.py +97 -0
- sky/schemas/db/spot_jobs/002_cluster_pool.py +42 -0
- sky/schemas/db/spot_jobs/003_pool_hash.py +34 -0
- sky/schemas/db/spot_jobs/004_job_file_contents.py +42 -0
- sky/schemas/db/spot_jobs/005_logs_gc.py +38 -0
- sky/schemas/db/spot_jobs/006_controller_pid_started_at.py +34 -0
- sky/schemas/db/spot_jobs/007_config_file_content.py +34 -0
- sky/schemas/generated/__init__.py +0 -0
- sky/schemas/generated/autostopv1_pb2.py +36 -0
- sky/schemas/generated/autostopv1_pb2.pyi +43 -0
- sky/schemas/generated/autostopv1_pb2_grpc.py +146 -0
- sky/schemas/generated/jobsv1_pb2.py +86 -0
- sky/schemas/generated/jobsv1_pb2.pyi +254 -0
- sky/schemas/generated/jobsv1_pb2_grpc.py +542 -0
- sky/schemas/generated/managed_jobsv1_pb2.py +76 -0
- sky/schemas/generated/managed_jobsv1_pb2.pyi +278 -0
- sky/schemas/generated/managed_jobsv1_pb2_grpc.py +278 -0
- sky/schemas/generated/servev1_pb2.py +58 -0
- sky/schemas/generated/servev1_pb2.pyi +115 -0
- sky/schemas/generated/servev1_pb2_grpc.py +322 -0
- sky/serve/autoscalers.py +357 -5
- sky/serve/client/impl.py +310 -0
- sky/serve/client/sdk.py +47 -139
- sky/serve/client/sdk_async.py +130 -0
- sky/serve/constants.py +12 -9
- sky/serve/controller.py +68 -17
- sky/serve/load_balancer.py +106 -60
- sky/serve/load_balancing_policies.py +116 -2
- sky/serve/replica_managers.py +434 -249
- sky/serve/serve_rpc_utils.py +179 -0
- sky/serve/serve_state.py +569 -257
- sky/serve/serve_utils.py +775 -265
- sky/serve/server/core.py +66 -711
- sky/serve/server/impl.py +1093 -0
- sky/serve/server/server.py +21 -18
- sky/serve/service.py +192 -89
- sky/serve/service_spec.py +144 -20
- sky/serve/spot_placer.py +3 -0
- sky/server/auth/__init__.py +0 -0
- sky/server/auth/authn.py +50 -0
- sky/server/auth/loopback.py +38 -0
- sky/server/auth/oauth2_proxy.py +202 -0
- sky/server/common.py +478 -182
- sky/server/config.py +85 -23
- sky/server/constants.py +44 -6
- sky/server/daemons.py +295 -0
- sky/server/html/token_page.html +185 -0
- sky/server/metrics.py +160 -0
- sky/server/middleware_utils.py +166 -0
- sky/server/requests/executor.py +558 -138
- sky/server/requests/payloads.py +364 -24
- sky/server/requests/preconditions.py +21 -17
- sky/server/requests/process.py +112 -29
- sky/server/requests/request_names.py +121 -0
- sky/server/requests/requests.py +822 -226
- sky/server/requests/serializers/decoders.py +82 -31
- sky/server/requests/serializers/encoders.py +140 -22
- sky/server/requests/threads.py +117 -0
- sky/server/rest.py +455 -0
- sky/server/server.py +1309 -285
- sky/server/state.py +20 -0
- sky/server/stream_utils.py +327 -61
- sky/server/uvicorn.py +217 -3
- sky/server/versions.py +270 -0
- sky/setup_files/MANIFEST.in +11 -1
- sky/setup_files/alembic.ini +160 -0
- sky/setup_files/dependencies.py +139 -31
- sky/setup_files/setup.py +44 -42
- sky/sky_logging.py +114 -7
- sky/skylet/attempt_skylet.py +106 -24
- sky/skylet/autostop_lib.py +129 -8
- sky/skylet/configs.py +29 -20
- sky/skylet/constants.py +216 -25
- sky/skylet/events.py +101 -21
- sky/skylet/job_lib.py +345 -164
- sky/skylet/log_lib.py +297 -18
- sky/skylet/log_lib.pyi +44 -1
- sky/skylet/providers/ibm/node_provider.py +12 -8
- sky/skylet/providers/ibm/vpc_provider.py +13 -12
- sky/skylet/ray_patches/__init__.py +17 -3
- sky/skylet/ray_patches/autoscaler.py.diff +18 -0
- sky/skylet/ray_patches/cli.py.diff +19 -0
- sky/skylet/ray_patches/command_runner.py.diff +17 -0
- sky/skylet/ray_patches/log_monitor.py.diff +20 -0
- sky/skylet/ray_patches/resource_demand_scheduler.py.diff +32 -0
- sky/skylet/ray_patches/updater.py.diff +18 -0
- sky/skylet/ray_patches/worker.py.diff +41 -0
- sky/skylet/runtime_utils.py +21 -0
- sky/skylet/services.py +568 -0
- sky/skylet/skylet.py +72 -4
- sky/skylet/subprocess_daemon.py +104 -29
- sky/skypilot_config.py +506 -99
- sky/ssh_node_pools/__init__.py +1 -0
- sky/ssh_node_pools/core.py +135 -0
- sky/ssh_node_pools/server.py +233 -0
- sky/task.py +685 -163
- sky/templates/aws-ray.yml.j2 +11 -3
- sky/templates/azure-ray.yml.j2 +2 -1
- sky/templates/cudo-ray.yml.j2 +1 -0
- sky/templates/do-ray.yml.j2 +2 -1
- sky/templates/fluidstack-ray.yml.j2 +1 -0
- sky/templates/gcp-ray.yml.j2 +62 -1
- sky/templates/hyperbolic-ray.yml.j2 +68 -0
- sky/templates/ibm-ray.yml.j2 +2 -1
- sky/templates/jobs-controller.yaml.j2 +27 -24
- sky/templates/kubernetes-loadbalancer.yml.j2 +2 -0
- sky/templates/kubernetes-ray.yml.j2 +611 -50
- sky/templates/lambda-ray.yml.j2 +2 -1
- sky/templates/nebius-ray.yml.j2 +34 -12
- sky/templates/oci-ray.yml.j2 +1 -0
- sky/templates/paperspace-ray.yml.j2 +2 -1
- sky/templates/primeintellect-ray.yml.j2 +72 -0
- sky/templates/runpod-ray.yml.j2 +10 -1
- sky/templates/scp-ray.yml.j2 +4 -50
- sky/templates/seeweb-ray.yml.j2 +171 -0
- sky/templates/shadeform-ray.yml.j2 +73 -0
- sky/templates/sky-serve-controller.yaml.j2 +22 -2
- sky/templates/vast-ray.yml.j2 +1 -0
- sky/templates/vsphere-ray.yml.j2 +1 -0
- sky/templates/websocket_proxy.py +212 -37
- sky/usage/usage_lib.py +31 -15
- sky/users/__init__.py +0 -0
- sky/users/model.conf +15 -0
- sky/users/permission.py +397 -0
- sky/users/rbac.py +121 -0
- sky/users/server.py +720 -0
- sky/users/token_service.py +218 -0
- sky/utils/accelerator_registry.py +35 -5
- sky/utils/admin_policy_utils.py +84 -38
- sky/utils/annotations.py +38 -5
- sky/utils/asyncio_utils.py +78 -0
- sky/utils/atomic.py +1 -1
- sky/utils/auth_utils.py +153 -0
- sky/utils/benchmark_utils.py +60 -0
- sky/utils/cli_utils/status_utils.py +159 -86
- sky/utils/cluster_utils.py +31 -9
- sky/utils/command_runner.py +354 -68
- sky/utils/command_runner.pyi +93 -3
- sky/utils/common.py +35 -8
- sky/utils/common_utils.py +314 -91
- sky/utils/config_utils.py +74 -5
- sky/utils/context.py +403 -0
- sky/utils/context_utils.py +242 -0
- sky/utils/controller_utils.py +383 -89
- sky/utils/dag_utils.py +31 -12
- sky/utils/db/__init__.py +0 -0
- sky/utils/db/db_utils.py +485 -0
- sky/utils/db/kv_cache.py +149 -0
- sky/utils/db/migration_utils.py +137 -0
- sky/utils/directory_utils.py +12 -0
- sky/utils/env_options.py +13 -0
- sky/utils/git.py +567 -0
- sky/utils/git_clone.sh +460 -0
- sky/utils/infra_utils.py +195 -0
- sky/utils/kubernetes/cleanup-tunnel.sh +62 -0
- sky/utils/kubernetes/config_map_utils.py +133 -0
- sky/utils/kubernetes/create_cluster.sh +15 -29
- sky/utils/kubernetes/delete_cluster.sh +10 -7
- sky/utils/kubernetes/deploy_ssh_node_pools.py +1177 -0
- sky/utils/kubernetes/exec_kubeconfig_converter.py +22 -31
- sky/utils/kubernetes/generate_kind_config.py +6 -66
- sky/utils/kubernetes/generate_kubeconfig.sh +4 -1
- sky/utils/kubernetes/gpu_labeler.py +18 -8
- sky/utils/kubernetes/k8s_gpu_labeler_job.yaml +2 -1
- sky/utils/kubernetes/k8s_gpu_labeler_setup.yaml +16 -16
- sky/utils/kubernetes/kubernetes_deploy_utils.py +284 -114
- sky/utils/kubernetes/rsync_helper.sh +11 -3
- sky/utils/kubernetes/ssh-tunnel.sh +379 -0
- sky/utils/kubernetes/ssh_utils.py +221 -0
- sky/utils/kubernetes_enums.py +8 -15
- sky/utils/lock_events.py +94 -0
- sky/utils/locks.py +416 -0
- sky/utils/log_utils.py +82 -107
- sky/utils/perf_utils.py +22 -0
- sky/utils/resource_checker.py +298 -0
- sky/utils/resources_utils.py +249 -32
- sky/utils/rich_utils.py +217 -39
- sky/utils/schemas.py +955 -160
- sky/utils/serialize_utils.py +16 -0
- sky/utils/status_lib.py +10 -0
- sky/utils/subprocess_utils.py +29 -15
- sky/utils/tempstore.py +70 -0
- sky/utils/thread_utils.py +91 -0
- sky/utils/timeline.py +26 -53
- sky/utils/ux_utils.py +84 -15
- sky/utils/validator.py +11 -1
- sky/utils/volume.py +165 -0
- sky/utils/yaml_utils.py +111 -0
- sky/volumes/__init__.py +13 -0
- sky/volumes/client/__init__.py +0 -0
- sky/volumes/client/sdk.py +150 -0
- sky/volumes/server/__init__.py +0 -0
- sky/volumes/server/core.py +270 -0
- sky/volumes/server/server.py +124 -0
- sky/volumes/volume.py +215 -0
- sky/workspaces/__init__.py +0 -0
- sky/workspaces/core.py +655 -0
- sky/workspaces/server.py +101 -0
- sky/workspaces/utils.py +56 -0
- sky_templates/README.md +3 -0
- sky_templates/__init__.py +3 -0
- sky_templates/ray/__init__.py +0 -0
- sky_templates/ray/start_cluster +183 -0
- sky_templates/ray/stop_cluster +75 -0
- skypilot_nightly-1.0.0.dev20251203.dist-info/METADATA +676 -0
- skypilot_nightly-1.0.0.dev20251203.dist-info/RECORD +611 -0
- {skypilot_nightly-1.0.0.dev20250502.dist-info → skypilot_nightly-1.0.0.dev20251203.dist-info}/WHEEL +1 -1
- skypilot_nightly-1.0.0.dev20251203.dist-info/top_level.txt +2 -0
- sky/benchmark/benchmark_state.py +0 -256
- sky/benchmark/benchmark_utils.py +0 -641
- sky/clouds/service_catalog/constants.py +0 -7
- sky/dashboard/out/_next/static/GWvVBSCS7FmUiVmjaL1a7/_buildManifest.js +0 -1
- sky/dashboard/out/_next/static/chunks/236-2db3ee3fba33dd9e.js +0 -6
- sky/dashboard/out/_next/static/chunks/312-c3c8845990db8ffc.js +0 -15
- sky/dashboard/out/_next/static/chunks/37-0a572fe0dbb89c4d.js +0 -6
- sky/dashboard/out/_next/static/chunks/678-206dddca808e6d16.js +0 -59
- sky/dashboard/out/_next/static/chunks/845-9e60713e0c441abc.js +0 -1
- sky/dashboard/out/_next/static/chunks/979-7bf73a4c7cea0f5c.js +0 -1
- sky/dashboard/out/_next/static/chunks/fd9d1056-2821b0f0cabcd8bd.js +0 -1
- sky/dashboard/out/_next/static/chunks/framework-87d061ee6ed71b28.js +0 -33
- sky/dashboard/out/_next/static/chunks/main-app-241eb28595532291.js +0 -1
- sky/dashboard/out/_next/static/chunks/main-e0e2335212e72357.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/_app-e6b013bc3f77ad60.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/_error-1be831200e60c5c0.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-6ac338bc2239cb45.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-f383db7389368ea7.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/clusters-a93b93e10b8b074e.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/index-f9f039532ca8cbc4.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-1c519e1afc523dc9.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/jobs-a75029b67aab6a2e.js +0 -1
- sky/dashboard/out/_next/static/chunks/webpack-830f59b8404e96b8.js +0 -1
- sky/dashboard/out/_next/static/css/c6933bbb2ce7f4dd.css +0 -3
- sky/jobs/dashboard/dashboard.py +0 -223
- sky/jobs/dashboard/static/favicon.ico +0 -0
- sky/jobs/dashboard/templates/index.html +0 -831
- sky/jobs/server/dashboard_utils.py +0 -69
- sky/skylet/providers/scp/__init__.py +0 -2
- sky/skylet/providers/scp/config.py +0 -149
- sky/skylet/providers/scp/node_provider.py +0 -578
- sky/templates/kubernetes-ssh-jump.yml.j2 +0 -94
- sky/utils/db_utils.py +0 -100
- sky/utils/kubernetes/deploy_remote_cluster.sh +0 -308
- sky/utils/kubernetes/ssh_jump_lifecycle_manager.py +0 -191
- skypilot_nightly-1.0.0.dev20250502.dist-info/METADATA +0 -361
- skypilot_nightly-1.0.0.dev20250502.dist-info/RECORD +0 -396
- skypilot_nightly-1.0.0.dev20250502.dist-info/top_level.txt +0 -1
- /sky/{clouds/service_catalog → catalog}/config.py +0 -0
- /sky/{benchmark → catalog/data_fetchers}/__init__.py +0 -0
- /sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_azure.py +0 -0
- /sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_fluidstack.py +0 -0
- /sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_ibm.py +0 -0
- /sky/{clouds/service_catalog/data_fetchers → client/cli}/__init__.py +0 -0
- /sky/dashboard/out/_next/static/{GWvVBSCS7FmUiVmjaL1a7 → 96_E2yl3QAiIJGOYCkSpB}/_ssgManifest.js +0 -0
- {skypilot_nightly-1.0.0.dev20250502.dist-info → skypilot_nightly-1.0.0.dev20251203.dist-info}/entry_points.txt +0 -0
- {skypilot_nightly-1.0.0.dev20250502.dist-info → skypilot_nightly-1.0.0.dev20251203.dist-info}/licenses/LICENSE +0 -0
sky/templates/aws-ray.yml.j2
CHANGED
|
@@ -19,7 +19,7 @@ docker:
|
|
|
19
19
|
username: |-
|
|
20
20
|
{{docker_login_config.username}}
|
|
21
21
|
password: |-
|
|
22
|
-
{{docker_login_config.password}}
|
|
22
|
+
{{docker_login_config.password | indent(6) }}
|
|
23
23
|
server: |-
|
|
24
24
|
{{docker_login_config.server}}
|
|
25
25
|
{%- endif %}
|
|
@@ -48,9 +48,10 @@ provider:
|
|
|
48
48
|
# The upper-level SkyPilot code has make sure there will not be resource
|
|
49
49
|
# leakage.
|
|
50
50
|
disable_launch_config_check: true
|
|
51
|
+
max_efa_interfaces: {{max_efa_interfaces}}
|
|
51
52
|
|
|
52
53
|
auth:
|
|
53
|
-
ssh_user:
|
|
54
|
+
ssh_user: {{ssh_user}}
|
|
54
55
|
ssh_private_key: {{ssh_private_key}}
|
|
55
56
|
{% if ssh_proxy_command is not none %}
|
|
56
57
|
ssh_proxy_command: {{ssh_proxy_command}}
|
|
@@ -68,7 +69,7 @@ available_node_types:
|
|
|
68
69
|
ImageId: {{image_id}} # Deep Learning AMI (Ubuntu 18.04); see aws.py.
|
|
69
70
|
# https://boto3.amazonaws.com/v1/documentation/api/latest/reference/services/ec2.html#EC2.ServiceResource.create_instances
|
|
70
71
|
BlockDeviceMappings:
|
|
71
|
-
- DeviceName:
|
|
72
|
+
- DeviceName: {{root_device_name}}
|
|
72
73
|
Ebs:
|
|
73
74
|
VolumeSize: {{disk_size}}
|
|
74
75
|
VolumeType: {{disk_tier}}
|
|
@@ -131,6 +132,12 @@ available_node_types:
|
|
|
131
132
|
- systemctl disable apt-daily.timer apt-daily-upgrade.timer unattended-upgrades.service
|
|
132
133
|
- systemctl mask apt-daily.service apt-daily-upgrade.service unattended-upgrades.service
|
|
133
134
|
- systemctl daemon-reload
|
|
135
|
+
{%- if runcmd %}
|
|
136
|
+
runcmd:
|
|
137
|
+
{%- for cmd in runcmd %}
|
|
138
|
+
- {{cmd}}
|
|
139
|
+
{%- endfor %}
|
|
140
|
+
{%- endif %}
|
|
134
141
|
TagSpecifications:
|
|
135
142
|
- ResourceType: instance
|
|
136
143
|
Tags:
|
|
@@ -183,6 +190,7 @@ setup_commands:
|
|
|
183
190
|
{{ conda_installation_commands }}
|
|
184
191
|
conda config --remove channels "https://aws-ml-conda-ec2.s3.us-west-2.amazonaws.com" || true;
|
|
185
192
|
{{ ray_skypilot_installation_commands }}
|
|
193
|
+
{{ copy_skypilot_templates_commands }}
|
|
186
194
|
sudo bash -c 'rm -rf /etc/security/limits.d; echo "* soft nofile 1048576" >> /etc/security/limits.conf; echo "* hard nofile 1048576" >> /etc/security/limits.conf';
|
|
187
195
|
{%- if docker_image is none %}
|
|
188
196
|
sudo grep -e '^DefaultTasksMax' /etc/systemd/system.conf || (sudo bash -c 'echo "DefaultTasksMax=infinity" >> /etc/systemd/system.conf'); sudo systemctl set-property user-$(id -u $(whoami)).slice TasksMax=infinity; sudo systemctl daemon-reload;
|
sky/templates/azure-ray.yml.j2
CHANGED
|
@@ -19,7 +19,7 @@ docker:
|
|
|
19
19
|
username: |-
|
|
20
20
|
{{docker_login_config.username}}
|
|
21
21
|
password: |-
|
|
22
|
-
{{docker_login_config.password}}
|
|
22
|
+
{{docker_login_config.password | indent(6) }}
|
|
23
23
|
server: |-
|
|
24
24
|
{{docker_login_config.server}}
|
|
25
25
|
{%- endif %}
|
|
@@ -118,6 +118,7 @@ setup_commands:
|
|
|
118
118
|
- mkdir -p ~/.ssh; touch ~/.ssh/config;
|
|
119
119
|
{{ conda_installation_commands }}
|
|
120
120
|
{{ ray_skypilot_installation_commands }}
|
|
121
|
+
{{ copy_skypilot_templates_commands }}
|
|
121
122
|
touch ~/.sudo_as_admin_successful;
|
|
122
123
|
sudo bash -c 'rm -rf /etc/security/limits.d; echo "* soft nofile 1048576" >> /etc/security/limits.conf; echo "* hard nofile 1048576" >> /etc/security/limits.conf';
|
|
123
124
|
{%- if docker_image is none %}
|
sky/templates/cudo-ray.yml.j2
CHANGED
|
@@ -68,6 +68,7 @@ setup_commands:
|
|
|
68
68
|
mkdir -p ~/.ssh; touch ~/.ssh/config;
|
|
69
69
|
{{ conda_installation_commands }}
|
|
70
70
|
{{ ray_skypilot_installation_commands }}
|
|
71
|
+
{{ copy_skypilot_templates_commands }}
|
|
71
72
|
touch ~/.sudo_as_admin_successful;
|
|
72
73
|
sudo bash -c 'rm -rf /etc/security/limits.d; echo "* soft nofile 1048576" >> /etc/security/limits.conf; echo "* hard nofile 1048576" >> /etc/security/limits.conf';
|
|
73
74
|
sudo grep -e '^DefaultTasksMax' /etc/systemd/system.conf || (sudo bash -c 'echo "DefaultTasksMax=infinity" >> /etc/systemd/system.conf'); sudo systemctl set-property user-$(id -u $(whoami)).slice TasksMax=infinity; sudo systemctl daemon-reload;
|
sky/templates/do-ray.yml.j2
CHANGED
|
@@ -19,7 +19,7 @@ docker:
|
|
|
19
19
|
username: |-
|
|
20
20
|
{{docker_login_config.username}}
|
|
21
21
|
password: |-
|
|
22
|
-
{{docker_login_config.password}}
|
|
22
|
+
{{docker_login_config.password | indent(6) }}
|
|
23
23
|
server: |-
|
|
24
24
|
{{docker_login_config.server}}
|
|
25
25
|
{%- endif %}
|
|
@@ -89,6 +89,7 @@ setup_commands:
|
|
|
89
89
|
mkdir -p ~/.ssh; touch ~/.ssh/config;
|
|
90
90
|
{{ conda_installation_commands }}
|
|
91
91
|
{{ ray_skypilot_installation_commands }}
|
|
92
|
+
{{ copy_skypilot_templates_commands }}
|
|
92
93
|
sudo bash -c 'rm -rf /etc/security/limits.d; echo "* soft nofile 1048576" >> /etc/security/limits.conf; echo "* hard nofile 1048576" >> /etc/security/limits.conf';
|
|
93
94
|
sudo grep -e '^DefaultTasksMax' /etc/systemd/system.conf || (sudo bash -c 'echo "DefaultTasksMax=infinity" >> /etc/systemd/system.conf'); sudo systemctl set-property user-$(id -u $(whoami)).slice TasksMax=infinity; sudo systemctl daemon-reload;
|
|
94
95
|
mkdir -p ~/.ssh; (grep -Pzo -q "Host \*\n StrictHostKeyChecking no" ~/.ssh/config) || printf "Host *\n StrictHostKeyChecking no\n" >> ~/.ssh/config;
|
|
@@ -69,6 +69,7 @@ setup_commands:
|
|
|
69
69
|
mkdir -p ~/.ssh; touch ~/.ssh/config;
|
|
70
70
|
{{ conda_installation_commands }}
|
|
71
71
|
{{ ray_skypilot_installation_commands }}
|
|
72
|
+
{{ copy_skypilot_templates_commands }}
|
|
72
73
|
touch ~/.sudo_as_admin_successful;
|
|
73
74
|
sudo bash -c 'rm -rf /etc/security/limits.d; echo "* soft nofile 1048576" >> /etc/security/limits.conf; echo "* hard nofile 1048576" >> /etc/security/limits.conf';
|
|
74
75
|
sudo grep -e '^DefaultTasksMax' /etc/systemd/system.conf || (sudo bash -c 'echo "DefaultTasksMax=infinity" >> /etc/systemd/system.conf'); sudo systemctl set-property user-$(id -u $(whoami)).slice TasksMax=infinity; sudo systemctl daemon-reload;
|
sky/templates/gcp-ray.yml.j2
CHANGED
|
@@ -5,6 +5,9 @@ max_workers: {{num_nodes - 1}}
|
|
|
5
5
|
upscaling_speed: {{num_nodes - 1}}
|
|
6
6
|
idle_timeout_minutes: 60
|
|
7
7
|
|
|
8
|
+
# The docker password could be a GCP service account key, which is a JSON string
|
|
9
|
+
# that may contain newlines. We need indent(6) to make sure the lines
|
|
10
|
+
# after the first line are properly indented.
|
|
8
11
|
{%- if docker_image is not none %}
|
|
9
12
|
docker:
|
|
10
13
|
image: {{docker_image}}
|
|
@@ -19,7 +22,7 @@ docker:
|
|
|
19
22
|
username: |-
|
|
20
23
|
{{docker_login_config.username}}
|
|
21
24
|
password: |-
|
|
22
|
-
{{docker_login_config.password}}
|
|
25
|
+
{{docker_login_config.password | indent(6) }}
|
|
23
26
|
server: |-
|
|
24
27
|
{{docker_login_config.server}}
|
|
25
28
|
{%- endif %}
|
|
@@ -66,6 +69,15 @@ provider:
|
|
|
66
69
|
{%- if enable_gvnic %}
|
|
67
70
|
enable_gvnic: {{ enable_gvnic }}
|
|
68
71
|
{%- endif %}
|
|
72
|
+
{%- if enable_gpu_direct %}
|
|
73
|
+
enable_gpu_direct: {{ enable_gpu_direct }}
|
|
74
|
+
{%- endif %}
|
|
75
|
+
{%- if placement_policy %}
|
|
76
|
+
placement_policy: {{ placement_policy }}
|
|
77
|
+
{%- endif %}
|
|
78
|
+
{%- if network_tier %}
|
|
79
|
+
network_tier: {{ network_tier }}
|
|
80
|
+
{%- endif %}
|
|
69
81
|
|
|
70
82
|
auth:
|
|
71
83
|
ssh_user: gcpuser
|
|
@@ -100,12 +112,27 @@ available_node_types:
|
|
|
100
112
|
{%- if tpu_vm %}
|
|
101
113
|
acceleratorType: {{tpu_type}}
|
|
102
114
|
runtimeVersion: {{runtime_version}}
|
|
115
|
+
{%- if volumes %}
|
|
116
|
+
dataDisks:
|
|
117
|
+
{%- for volume in volumes %}
|
|
118
|
+
{%- if volume.source %}
|
|
119
|
+
- sourceDisk: {{volume.source}}
|
|
120
|
+
{%- endif %}
|
|
121
|
+
{%- if volume.attach_mode %}
|
|
122
|
+
mode: {{volume.attach_mode}}
|
|
123
|
+
{%- endif %}
|
|
124
|
+
{%- endfor %}
|
|
125
|
+
{%- endif %}
|
|
103
126
|
metadata:
|
|
104
127
|
# TPU VM's metadata has different format than normal VMs.
|
|
105
128
|
# After replacing the variables, this will become username:ssh_public_key_content.
|
|
106
129
|
# This is a specific syntax required by GCP https://cloud.google.com/compute/docs/connect/add-ssh-keys
|
|
107
130
|
ssh-keys: |-
|
|
108
131
|
skypilot:ssh_user:skypilot:ssh_public_key_content
|
|
132
|
+
{%- if user_data is not none %}
|
|
133
|
+
startup-script: |-
|
|
134
|
+
{{ user_data | indent(10) }}
|
|
135
|
+
{%- endif %}
|
|
109
136
|
{%- if use_spot %}
|
|
110
137
|
schedulingConfig:
|
|
111
138
|
preemptible: true
|
|
@@ -129,6 +156,34 @@ available_node_types:
|
|
|
129
156
|
{%- if disk_iops %}
|
|
130
157
|
provisionedIops: {{disk_iops}}
|
|
131
158
|
{%- endif %}
|
|
159
|
+
{%- for volume in volumes %}
|
|
160
|
+
- boot: false
|
|
161
|
+
autoDelete: {{volume.auto_delete}}
|
|
162
|
+
type: {{volume.storage_type}}
|
|
163
|
+
deviceName: {{volume.device_name}}
|
|
164
|
+
{%- if volume.source %}
|
|
165
|
+
source: {{volume.source}}
|
|
166
|
+
{%- endif %}
|
|
167
|
+
{%- if volume.attach_mode %}
|
|
168
|
+
mode: {{volume.attach_mode}}
|
|
169
|
+
{%- endif %}
|
|
170
|
+
{%- if volume.interface_type %}
|
|
171
|
+
interface: {{volume.interface_type}}
|
|
172
|
+
{%- endif %}
|
|
173
|
+
{%- if volume.disk_tier %}
|
|
174
|
+
initializeParams:
|
|
175
|
+
diskType: zones/{{zones}}/diskTypes/{{volume.disk_tier}}
|
|
176
|
+
{%- endif %}
|
|
177
|
+
{%- if volume.disk_name %}
|
|
178
|
+
diskName: {{volume.disk_name}}
|
|
179
|
+
{%- endif %}
|
|
180
|
+
{%- if volume.disk_size %}
|
|
181
|
+
diskSizeGb: {{volume.disk_size}}
|
|
182
|
+
{%- endif %}
|
|
183
|
+
{%- if volume.iops %}
|
|
184
|
+
provisionedIops: {{volume.iops}}
|
|
185
|
+
{%- endif %}
|
|
186
|
+
{%- endfor %}
|
|
132
187
|
{%- if gpu is not none %}
|
|
133
188
|
guestAccelerators:
|
|
134
189
|
- acceleratorType: projects/{{gcp_project_id}}/zones/{{zones}}/acceleratorTypes/{{gpu}}
|
|
@@ -145,6 +200,11 @@ available_node_types:
|
|
|
145
200
|
- key: install-nvidia-driver
|
|
146
201
|
value: "True"
|
|
147
202
|
{%- endif %}
|
|
203
|
+
{%- if user_data is not none %}
|
|
204
|
+
- key: user-data
|
|
205
|
+
value: |-
|
|
206
|
+
{{ user_data | indent(14) }}
|
|
207
|
+
{%- endif %}
|
|
148
208
|
{%- if use_spot or gpu is not none %}
|
|
149
209
|
scheduling:
|
|
150
210
|
{%- if use_spot %}
|
|
@@ -216,6 +276,7 @@ setup_commands:
|
|
|
216
276
|
grep "export TPU_NAME=" ~/.bashrc && echo "TPU_NAME already set" || echo "export TPU_NAME={{tpu_node_name}}" >> ~/.bashrc;
|
|
217
277
|
{%- endif %}
|
|
218
278
|
{{ ray_skypilot_installation_commands }}
|
|
279
|
+
{{ copy_skypilot_templates_commands }}
|
|
219
280
|
sudo bash -c 'rm -rf /etc/security/limits.d; echo "* soft nofile 1048576" >> /etc/security/limits.conf; echo "* hard nofile 1048576" >> /etc/security/limits.conf';
|
|
220
281
|
{%- if docker_image is none %}
|
|
221
282
|
sudo grep -e '^DefaultTasksMax' /etc/systemd/system.conf || (sudo bash -c 'echo "DefaultTasksMax=infinity" >> /etc/systemd/system.conf'); sudo systemctl set-property user-$(id -u $(whoami)).slice TasksMax=infinity; sudo systemctl daemon-reload;
|
|
@@ -0,0 +1,68 @@
|
|
|
1
|
+
# Ray cluster config template for Hyperbolic Cloud
|
|
2
|
+
|
|
3
|
+
cluster_name: {{cluster_name_on_cloud}}
|
|
4
|
+
|
|
5
|
+
# Hyperbolic only supports a single node (the head node).
|
|
6
|
+
max_workers: 0
|
|
7
|
+
upscaling_speed: 0
|
|
8
|
+
idle_timeout_minutes: 60
|
|
9
|
+
|
|
10
|
+
provider:
|
|
11
|
+
type: external
|
|
12
|
+
module: sky.provision.hyperbolic
|
|
13
|
+
region: "default"
|
|
14
|
+
|
|
15
|
+
auth:
|
|
16
|
+
ssh_user: ubuntu
|
|
17
|
+
ssh_private_key: {{ssh_private_key}}
|
|
18
|
+
|
|
19
|
+
available_node_types:
|
|
20
|
+
ray_head_default:
|
|
21
|
+
resources: {}
|
|
22
|
+
node_config:
|
|
23
|
+
InstanceType: {{instance_type}}
|
|
24
|
+
|
|
25
|
+
head_node_type: ray_head_default
|
|
26
|
+
|
|
27
|
+
# Format: `REMOTE_PATH : LOCAL_PATH`
|
|
28
|
+
file_mounts: {
|
|
29
|
+
"{{sky_ray_yaml_remote_path}}": "{{sky_ray_yaml_local_path}}",
|
|
30
|
+
"{{sky_remote_path}}/{{sky_wheel_hash}}": "{{sky_local_path}}",
|
|
31
|
+
{%- for remote_path, local_path in credentials.items() %}
|
|
32
|
+
"{{remote_path}}": "{{local_path}}",
|
|
33
|
+
{%- endfor %}
|
|
34
|
+
}
|
|
35
|
+
|
|
36
|
+
rsync_exclude: []
|
|
37
|
+
|
|
38
|
+
initialization_commands: []
|
|
39
|
+
|
|
40
|
+
# List of shell commands to run to set up nodes.
|
|
41
|
+
# NOTE: these are very performance-sensitive. Each new item opens/closes an SSH
|
|
42
|
+
# connection, which is expensive. Try your best to co-locate commands into fewer
|
|
43
|
+
# items!
|
|
44
|
+
#
|
|
45
|
+
# Increment the following for catching performance bugs easier:
|
|
46
|
+
# current num items (num SSH connections): 1
|
|
47
|
+
setup_commands:
|
|
48
|
+
# Disable unattended-upgrades and handle apt-get locks
|
|
49
|
+
# Install patch utility for Ray
|
|
50
|
+
# Install conda and Ray
|
|
51
|
+
# Set system limits for Ray performance (nofile and TasksMax)
|
|
52
|
+
- {%- for initial_setup_command in initial_setup_commands %}
|
|
53
|
+
{{ initial_setup_command }}
|
|
54
|
+
{%- endfor %}
|
|
55
|
+
sudo systemctl stop unattended-upgrades || true;
|
|
56
|
+
sudo systemctl disable unattended-upgrades || true;
|
|
57
|
+
sudo sed -i 's/Unattended-Upgrade "1"/Unattended-Upgrade "0"/g' /etc/apt/apt.conf.d/20auto-upgrades || true;
|
|
58
|
+
sudo kill -9 `sudo lsof /var/lib/dpkg/lock-frontend | awk '{print $2}' | tail -n 1` || true;
|
|
59
|
+
sudo pkill -9 apt-get;
|
|
60
|
+
sudo pkill -9 dpkg;
|
|
61
|
+
sudo dpkg --configure -a;
|
|
62
|
+
which patch > /dev/null || sudo apt install -y patch;
|
|
63
|
+
{{ conda_installation_commands }}
|
|
64
|
+
{{ ray_skypilot_installation_commands }}
|
|
65
|
+
{{ copy_skypilot_templates_commands }}
|
|
66
|
+
sudo bash -c 'rm -rf /etc/security/limits.d; echo "* soft nofile 1048576" >> /etc/security/limits.conf; echo "* hard nofile 1048576" >> /etc/security/limits.conf';
|
|
67
|
+
sudo grep -e '^DefaultTasksMax' /etc/systemd/system.conf || (sudo bash -c 'echo "DefaultTasksMax=infinity" >> /etc/systemd/system.conf'); sudo systemctl set-property user-$(id -u $(whoami)).slice TasksMax=infinity; sudo systemctl daemon-reload;
|
|
68
|
+
{{ ssh_max_sessions_config }}
|
sky/templates/ibm-ray.yml.j2
CHANGED
|
@@ -102,6 +102,7 @@ setup_commands:
|
|
|
102
102
|
mkdir -p ~/.ssh; touch ~/.ssh/config;
|
|
103
103
|
{{ conda_installation_commands }}
|
|
104
104
|
{{ ray_skypilot_installation_commands }}
|
|
105
|
+
{{ copy_skypilot_templates_commands }}
|
|
105
106
|
sudo bash -c 'rm -rf /etc/security/limits.d; echo "* soft nofile 1048576" >> /etc/security/limits.conf; echo "* hard nofile 1048576" >> /etc/security/limits.conf';
|
|
106
107
|
sudo grep -e '^DefaultTasksMax' /etc/systemd/system.conf || (sudo bash -c 'echo "DefaultTasksMax=infinity" >> /etc/systemd/system.conf'); sudo systemctl set-property user-$(id -u $(whoami)).slice TasksMax=infinity; sudo systemctl daemon-reload;
|
|
107
108
|
mkdir -p ~/.ssh; (grep -Pzo -q "Host \*\n StrictHostKeyChecking no\n IdentityFile ~/.ssh/sky-cluster-key\n IdentityFile ~/.ssh/id_rsa" ~/.ssh/config) || printf "Host *\n StrictHostKeyChecking no\n IdentityFile ~/.ssh/sky-cluster-key\n IdentityFile ~/.ssh/id_rsa\n" >> ~/.ssh/config;
|
|
@@ -121,7 +122,7 @@ head_start_ray_commands:
|
|
|
121
122
|
# all the sessions to be reloaded. This is a workaround.
|
|
122
123
|
- {{ sky_activate_python_env }}; {{ sky_ray_cmd }} stop; RAY_SCHEDULER_EVENTS=0 RAY_DEDUP_LOGS=0 {{ sky_ray_cmd }} start --disable-usage-stats --head --port={{ray_port}} --dashboard-port={{ray_dashboard_port}} --object-manager-port=8076 --autoscaling-config=~/ray_bootstrap_config.yaml {{"--resources='%s'" % custom_resources if custom_resources}} --temp-dir {{ray_temp_dir}} || exit 1;
|
|
123
124
|
which prlimit && for id in $(pgrep -f raylet/raylet); do sudo prlimit --nofile=1048576:1048576 --pid=$id || true; done;
|
|
124
|
-
{{dump_port_command}}
|
|
125
|
+
{{dump_port_command}} {{ray_head_wait_initialized_command}}
|
|
125
126
|
|
|
126
127
|
{%- if num_nodes > 1 %}
|
|
127
128
|
worker_start_ray_commands:
|
|
@@ -3,6 +3,7 @@
|
|
|
3
3
|
name: {{dag_name}}
|
|
4
4
|
|
|
5
5
|
file_mounts:
|
|
6
|
+
{{remote_original_user_yaml_path}}: {{original_user_dag_path}}
|
|
6
7
|
{{remote_user_yaml_path}}: {{user_yaml_path}}
|
|
7
8
|
{%- if local_user_config_path is not none %}
|
|
8
9
|
{{remote_user_config_path}}: {{local_user_config_path}}
|
|
@@ -14,6 +15,12 @@ file_mounts:
|
|
|
14
15
|
{{controller_file_mount_path}}: {{local_file_mount_path}}
|
|
15
16
|
{%- endfor %}
|
|
16
17
|
|
|
18
|
+
# NOTE(dev): This needs to be a subset of sky/templates/sky-serve-controller.yaml.j2.
|
|
19
|
+
# It is because we use the --fast flag to submit jobs and no --fast flag to launch pools.
|
|
20
|
+
# So when we launch a new pool, it will install the required dependencies.
|
|
21
|
+
# TODO(tian): Add --fast to launch pools as well, and figure out the dependency installation.
|
|
22
|
+
# Maybe in the --fast implementation, we can store the hash of setup commands that used to be
|
|
23
|
+
# run and don't skip setup phase if the hash is different.
|
|
17
24
|
setup: |
|
|
18
25
|
{{ sky_activate_python_env }}
|
|
19
26
|
# Disable the pip version check to avoid the warning message, which makes the
|
|
@@ -29,31 +36,13 @@ setup: |
|
|
|
29
36
|
grep -q 'alias sky-env=' ~/.bashrc || echo 'alias sky-env="{{ sky_activate_python_env }}"' >> ~/.bashrc
|
|
30
37
|
{% endif %}
|
|
31
38
|
|
|
32
|
-
#
|
|
33
|
-
|
|
34
|
-
|
|
35
|
-
# Create systemd user service file
|
|
36
|
-
cat << EOF > ~/.config/systemd/user/skypilot-dashboard.service
|
|
37
|
-
[Unit]
|
|
38
|
-
Description=SkyPilot Jobs Dashboard
|
|
39
|
-
After=network.target
|
|
40
|
-
|
|
41
|
-
[Service]
|
|
42
|
-
Environment="SKYPILOT_USER_ID={{ dashboard_user_id }}"
|
|
43
|
-
Restart=always
|
|
44
|
-
StandardOutput=append:/home/$USER/.sky/job-dashboard.log
|
|
45
|
-
StandardError=append:/home/$USER/.sky/job-dashboard.log
|
|
46
|
-
ExecStart={{ sky_python_cmd }} -m sky.jobs.dashboard.dashboard
|
|
47
|
-
|
|
48
|
-
[Install]
|
|
49
|
-
WantedBy=default.target
|
|
50
|
-
EOF
|
|
51
|
-
|
|
52
|
-
export SKYPILOT_USER_ID="{{ dashboard_user_id }}"
|
|
53
|
-
{{ dashboard_setup_cmd }}
|
|
39
|
+
# This is used by the skylet events to check if we are a jobs controller.
|
|
40
|
+
touch {{job_controller_indicator_file}}
|
|
54
41
|
|
|
55
42
|
run: |
|
|
43
|
+
{%- if consolidation_mode_job_id is none %}
|
|
56
44
|
{{ sky_activate_python_env }}
|
|
45
|
+
{%- endif %}
|
|
57
46
|
|
|
58
47
|
# Write env vars to a file
|
|
59
48
|
{%- for env_name, env_value in controller_envs.items() %}
|
|
@@ -64,9 +53,23 @@ run: |
|
|
|
64
53
|
# Note: The job is already in the `spot` table, marked as PENDING.
|
|
65
54
|
# CloudVmRayBackend._exec_code_on_head() calls
|
|
66
55
|
# managed_job_codegen.set_pending() before we get here.
|
|
67
|
-
|
|
56
|
+
{%- if consolidation_mode_job_id is not none %}
|
|
57
|
+
{{sky_python_cmd}} \
|
|
58
|
+
{%- else %}
|
|
59
|
+
python \
|
|
60
|
+
{%- endif %}
|
|
61
|
+
-u -m sky.jobs.scheduler {{remote_user_yaml_path}} \
|
|
62
|
+
--user-yaml-path {{remote_original_user_yaml_path}} \
|
|
63
|
+
{%- if consolidation_mode_job_id is not none %}
|
|
64
|
+
--job-id {{consolidation_mode_job_id}} \
|
|
65
|
+
{%- else %}
|
|
68
66
|
--job-id $SKYPILOT_INTERNAL_JOB_ID \
|
|
69
|
-
|
|
67
|
+
{%- endif %}
|
|
68
|
+
--env-file {{remote_env_file_path}} \
|
|
69
|
+
{%- if pool is not none %}
|
|
70
|
+
--pool {{pool}} \
|
|
71
|
+
{%- endif %}
|
|
72
|
+
--priority {{priority}}
|
|
70
73
|
|
|
71
74
|
|
|
72
75
|
envs:
|
|
@@ -12,6 +12,8 @@ service_spec:
|
|
|
12
12
|
{%- for key, value in annotations.items() %}
|
|
13
13
|
{{ key }}: {{ value|tojson }}
|
|
14
14
|
{%- endfor %}
|
|
15
|
+
{# Note: It's ok to add cloud-specific annotations here since they will be ignored by other clouds #}
|
|
16
|
+
service.beta.kubernetes.io/coreweave-load-balancer-type: public
|
|
15
17
|
spec:
|
|
16
18
|
type: LoadBalancer
|
|
17
19
|
selector:
|