skypilot-nightly 1.0.0.dev20250502__py3-none-any.whl → 1.0.0.dev20251203__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sky/__init__.py +22 -6
- sky/adaptors/aws.py +81 -16
- sky/adaptors/common.py +25 -2
- sky/adaptors/coreweave.py +278 -0
- sky/adaptors/do.py +8 -2
- sky/adaptors/gcp.py +11 -0
- sky/adaptors/hyperbolic.py +8 -0
- sky/adaptors/ibm.py +5 -2
- sky/adaptors/kubernetes.py +149 -18
- sky/adaptors/nebius.py +173 -30
- sky/adaptors/primeintellect.py +1 -0
- sky/adaptors/runpod.py +68 -0
- sky/adaptors/seeweb.py +183 -0
- sky/adaptors/shadeform.py +89 -0
- sky/admin_policy.py +187 -4
- sky/authentication.py +179 -225
- sky/backends/__init__.py +4 -2
- sky/backends/backend.py +22 -9
- sky/backends/backend_utils.py +1323 -397
- sky/backends/cloud_vm_ray_backend.py +1749 -1029
- sky/backends/docker_utils.py +1 -1
- sky/backends/local_docker_backend.py +11 -6
- sky/backends/task_codegen.py +633 -0
- sky/backends/wheel_utils.py +55 -9
- sky/{clouds/service_catalog → catalog}/__init__.py +21 -19
- sky/{clouds/service_catalog → catalog}/aws_catalog.py +27 -8
- sky/{clouds/service_catalog → catalog}/azure_catalog.py +10 -7
- sky/{clouds/service_catalog → catalog}/common.py +90 -49
- sky/{clouds/service_catalog → catalog}/cudo_catalog.py +8 -5
- sky/{clouds/service_catalog → catalog}/data_fetchers/analyze.py +1 -1
- sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_aws.py +116 -80
- sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_cudo.py +38 -38
- sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_gcp.py +70 -16
- sky/catalog/data_fetchers/fetch_hyperbolic.py +136 -0
- sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_lambda_cloud.py +1 -0
- sky/catalog/data_fetchers/fetch_nebius.py +338 -0
- sky/catalog/data_fetchers/fetch_runpod.py +698 -0
- sky/catalog/data_fetchers/fetch_seeweb.py +329 -0
- sky/catalog/data_fetchers/fetch_shadeform.py +142 -0
- sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_vast.py +1 -1
- sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_vsphere.py +1 -1
- sky/{clouds/service_catalog → catalog}/do_catalog.py +5 -2
- sky/{clouds/service_catalog → catalog}/fluidstack_catalog.py +6 -3
- sky/{clouds/service_catalog → catalog}/gcp_catalog.py +41 -15
- sky/catalog/hyperbolic_catalog.py +136 -0
- sky/{clouds/service_catalog → catalog}/ibm_catalog.py +9 -6
- sky/{clouds/service_catalog → catalog}/kubernetes_catalog.py +36 -24
- sky/{clouds/service_catalog → catalog}/lambda_catalog.py +9 -6
- sky/{clouds/service_catalog → catalog}/nebius_catalog.py +9 -7
- sky/{clouds/service_catalog → catalog}/oci_catalog.py +9 -6
- sky/{clouds/service_catalog → catalog}/paperspace_catalog.py +5 -2
- sky/catalog/primeintellect_catalog.py +95 -0
- sky/{clouds/service_catalog → catalog}/runpod_catalog.py +11 -4
- sky/{clouds/service_catalog → catalog}/scp_catalog.py +9 -6
- sky/catalog/seeweb_catalog.py +184 -0
- sky/catalog/shadeform_catalog.py +165 -0
- sky/catalog/ssh_catalog.py +167 -0
- sky/{clouds/service_catalog → catalog}/vast_catalog.py +6 -3
- sky/{clouds/service_catalog → catalog}/vsphere_catalog.py +5 -2
- sky/check.py +533 -185
- sky/cli.py +5 -5975
- sky/client/{cli.py → cli/command.py} +2591 -1956
- sky/client/cli/deprecation_utils.py +99 -0
- sky/client/cli/flags.py +359 -0
- sky/client/cli/table_utils.py +322 -0
- sky/client/cli/utils.py +79 -0
- sky/client/common.py +78 -32
- sky/client/oauth.py +82 -0
- sky/client/sdk.py +1219 -319
- sky/client/sdk_async.py +827 -0
- sky/client/service_account_auth.py +47 -0
- sky/cloud_stores.py +82 -3
- sky/clouds/__init__.py +13 -0
- sky/clouds/aws.py +564 -164
- sky/clouds/azure.py +105 -83
- sky/clouds/cloud.py +140 -40
- sky/clouds/cudo.py +68 -50
- sky/clouds/do.py +66 -48
- sky/clouds/fluidstack.py +63 -44
- sky/clouds/gcp.py +339 -110
- sky/clouds/hyperbolic.py +293 -0
- sky/clouds/ibm.py +70 -49
- sky/clouds/kubernetes.py +570 -162
- sky/clouds/lambda_cloud.py +74 -54
- sky/clouds/nebius.py +210 -81
- sky/clouds/oci.py +88 -66
- sky/clouds/paperspace.py +61 -44
- sky/clouds/primeintellect.py +317 -0
- sky/clouds/runpod.py +164 -74
- sky/clouds/scp.py +89 -86
- sky/clouds/seeweb.py +477 -0
- sky/clouds/shadeform.py +400 -0
- sky/clouds/ssh.py +263 -0
- sky/clouds/utils/aws_utils.py +10 -4
- sky/clouds/utils/gcp_utils.py +87 -11
- sky/clouds/utils/oci_utils.py +38 -14
- sky/clouds/utils/scp_utils.py +231 -167
- sky/clouds/vast.py +99 -77
- sky/clouds/vsphere.py +51 -40
- sky/core.py +375 -173
- sky/dag.py +15 -0
- sky/dashboard/out/404.html +1 -1
- sky/dashboard/out/_next/static/96_E2yl3QAiIJGOYCkSpB/_buildManifest.js +1 -0
- sky/dashboard/out/_next/static/chunks/1141-e6aa9ab418717c59.js +11 -0
- sky/dashboard/out/_next/static/chunks/1272-1ef0bf0237faccdb.js +1 -0
- sky/dashboard/out/_next/static/chunks/1871-7e202677c42f43fe.js +6 -0
- sky/dashboard/out/_next/static/chunks/2260-7703229c33c5ebd5.js +1 -0
- sky/dashboard/out/_next/static/chunks/2350.fab69e61bac57b23.js +1 -0
- sky/dashboard/out/_next/static/chunks/2369.fc20f0c2c8ed9fe7.js +15 -0
- sky/dashboard/out/_next/static/chunks/2755.edd818326d489a1d.js +26 -0
- sky/dashboard/out/_next/static/chunks/3294.20a8540fe697d5ee.js +1 -0
- sky/dashboard/out/_next/static/chunks/3785.7e245f318f9d1121.js +1 -0
- sky/dashboard/out/_next/static/chunks/3800-7b45f9fbb6308557.js +1 -0
- sky/dashboard/out/_next/static/chunks/3850-ff4a9a69d978632b.js +1 -0
- sky/dashboard/out/_next/static/chunks/3937.210053269f121201.js +1 -0
- sky/dashboard/out/_next/static/chunks/4725.172ede95d1b21022.js +1 -0
- sky/dashboard/out/_next/static/chunks/4937.a2baa2df5572a276.js +15 -0
- sky/dashboard/out/_next/static/chunks/5739-d67458fcb1386c92.js +8 -0
- sky/dashboard/out/_next/static/chunks/6130-2be46d70a38f1e82.js +1 -0
- sky/dashboard/out/_next/static/chunks/616-3d59f75e2ccf9321.js +39 -0
- sky/dashboard/out/_next/static/chunks/6212-7bd06f60ba693125.js +13 -0
- sky/dashboard/out/_next/static/chunks/6856-8f27d1c10c98def8.js +1 -0
- sky/dashboard/out/_next/static/chunks/6989-01359c57e018caa4.js +1 -0
- sky/dashboard/out/_next/static/chunks/6990-9146207c4567fdfd.js +1 -0
- sky/dashboard/out/_next/static/chunks/7359-c8d04e06886000b3.js +30 -0
- sky/dashboard/out/_next/static/chunks/7411-b15471acd2cba716.js +41 -0
- sky/dashboard/out/_next/static/chunks/7615-019513abc55b3b47.js +1 -0
- sky/dashboard/out/_next/static/chunks/8640.5b9475a2d18c5416.js +16 -0
- sky/dashboard/out/_next/static/chunks/8969-452f9d5cbdd2dc73.js +1 -0
- sky/dashboard/out/_next/static/chunks/9025.fa408f3242e9028d.js +6 -0
- sky/dashboard/out/_next/static/chunks/9353-cff34f7e773b2e2b.js +1 -0
- sky/dashboard/out/_next/static/chunks/9360.a536cf6b1fa42355.js +31 -0
- sky/dashboard/out/_next/static/chunks/9847.3aaca6bb33455140.js +30 -0
- sky/dashboard/out/_next/static/chunks/fd9d1056-86323a29a8f7e46a.js +1 -0
- sky/dashboard/out/_next/static/chunks/framework-cf60a09ccd051a10.js +33 -0
- sky/dashboard/out/_next/static/chunks/main-app-587214043926b3cc.js +1 -0
- sky/dashboard/out/_next/static/chunks/main-f15ccb73239a3bf1.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/_app-bde01e4a2beec258.js +34 -0
- sky/dashboard/out/_next/static/chunks/pages/_error-c66a4e8afc46f17b.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-792db96d918c98c9.js +16 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-abfcac9c137aa543.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters-ee39056f9851a3ff.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/config-dfb9bf07b13045f4.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/index-444f1804401f04ea.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/infra/[context]-c0b5935149902e6f.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/infra-aed0ea19df7cf961.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-d66997e2bfc837cf.js +16 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs/pools/[pool]-9faf940b253e3e06.js +21 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs-2072b48b617989c9.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/users-f42674164aa73423.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/volumes-b84b948ff357c43e.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/workspace/new-3f88a1c7e86a3f86.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/workspaces/[name]-84a40f8c7c627fe4.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/workspaces-531b2f8c4bf89f82.js +1 -0
- sky/dashboard/out/_next/static/chunks/webpack-64e05f17bf2cf8ce.js +1 -0
- sky/dashboard/out/_next/static/css/0748ce22df867032.css +3 -0
- sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
- sky/dashboard/out/clusters/[cluster].html +1 -1
- sky/dashboard/out/clusters.html +1 -1
- sky/dashboard/out/config.html +1 -0
- sky/dashboard/out/index.html +1 -1
- sky/dashboard/out/infra/[context].html +1 -0
- sky/dashboard/out/infra.html +1 -0
- sky/dashboard/out/jobs/[job].html +1 -1
- sky/dashboard/out/jobs/pools/[pool].html +1 -0
- sky/dashboard/out/jobs.html +1 -1
- sky/dashboard/out/users.html +1 -0
- sky/dashboard/out/volumes.html +1 -0
- sky/dashboard/out/workspace/new.html +1 -0
- sky/dashboard/out/workspaces/[name].html +1 -0
- sky/dashboard/out/workspaces.html +1 -0
- sky/data/data_utils.py +137 -1
- sky/data/mounting_utils.py +269 -84
- sky/data/storage.py +1460 -1807
- sky/data/storage_utils.py +43 -57
- sky/exceptions.py +126 -2
- sky/execution.py +216 -63
- sky/global_user_state.py +2390 -586
- sky/jobs/__init__.py +7 -0
- sky/jobs/client/sdk.py +300 -58
- sky/jobs/client/sdk_async.py +161 -0
- sky/jobs/constants.py +15 -8
- sky/jobs/controller.py +848 -275
- sky/jobs/file_content_utils.py +128 -0
- sky/jobs/log_gc.py +193 -0
- sky/jobs/recovery_strategy.py +402 -152
- sky/jobs/scheduler.py +314 -189
- sky/jobs/server/core.py +836 -255
- sky/jobs/server/server.py +156 -115
- sky/jobs/server/utils.py +136 -0
- sky/jobs/state.py +2109 -706
- sky/jobs/utils.py +1306 -215
- sky/logs/__init__.py +21 -0
- sky/logs/agent.py +108 -0
- sky/logs/aws.py +243 -0
- sky/logs/gcp.py +91 -0
- sky/metrics/__init__.py +0 -0
- sky/metrics/utils.py +453 -0
- sky/models.py +78 -1
- sky/optimizer.py +164 -70
- sky/provision/__init__.py +90 -4
- sky/provision/aws/config.py +147 -26
- sky/provision/aws/instance.py +136 -50
- sky/provision/azure/instance.py +11 -6
- sky/provision/common.py +13 -1
- sky/provision/cudo/cudo_machine_type.py +1 -1
- sky/provision/cudo/cudo_utils.py +14 -8
- sky/provision/cudo/cudo_wrapper.py +72 -71
- sky/provision/cudo/instance.py +10 -6
- sky/provision/do/instance.py +10 -6
- sky/provision/do/utils.py +4 -3
- sky/provision/docker_utils.py +140 -33
- sky/provision/fluidstack/instance.py +13 -8
- sky/provision/gcp/__init__.py +1 -0
- sky/provision/gcp/config.py +301 -19
- sky/provision/gcp/constants.py +218 -0
- sky/provision/gcp/instance.py +36 -8
- sky/provision/gcp/instance_utils.py +18 -4
- sky/provision/gcp/volume_utils.py +247 -0
- sky/provision/hyperbolic/__init__.py +12 -0
- sky/provision/hyperbolic/config.py +10 -0
- sky/provision/hyperbolic/instance.py +437 -0
- sky/provision/hyperbolic/utils.py +373 -0
- sky/provision/instance_setup.py +101 -20
- sky/provision/kubernetes/__init__.py +5 -0
- sky/provision/kubernetes/config.py +9 -52
- sky/provision/kubernetes/constants.py +17 -0
- sky/provision/kubernetes/instance.py +919 -280
- sky/provision/kubernetes/manifests/fusermount-server-daemonset.yaml +1 -2
- sky/provision/kubernetes/network.py +27 -17
- sky/provision/kubernetes/network_utils.py +44 -43
- sky/provision/kubernetes/utils.py +1221 -534
- sky/provision/kubernetes/volume.py +343 -0
- sky/provision/lambda_cloud/instance.py +22 -16
- sky/provision/nebius/constants.py +50 -0
- sky/provision/nebius/instance.py +19 -6
- sky/provision/nebius/utils.py +237 -137
- sky/provision/oci/instance.py +10 -5
- sky/provision/paperspace/instance.py +10 -7
- sky/provision/paperspace/utils.py +1 -1
- sky/provision/primeintellect/__init__.py +10 -0
- sky/provision/primeintellect/config.py +11 -0
- sky/provision/primeintellect/instance.py +454 -0
- sky/provision/primeintellect/utils.py +398 -0
- sky/provision/provisioner.py +117 -36
- sky/provision/runpod/__init__.py +5 -0
- sky/provision/runpod/instance.py +27 -6
- sky/provision/runpod/utils.py +51 -18
- sky/provision/runpod/volume.py +214 -0
- sky/provision/scp/__init__.py +15 -0
- sky/provision/scp/config.py +93 -0
- sky/provision/scp/instance.py +707 -0
- sky/provision/seeweb/__init__.py +11 -0
- sky/provision/seeweb/config.py +13 -0
- sky/provision/seeweb/instance.py +812 -0
- sky/provision/shadeform/__init__.py +11 -0
- sky/provision/shadeform/config.py +12 -0
- sky/provision/shadeform/instance.py +351 -0
- sky/provision/shadeform/shadeform_utils.py +83 -0
- sky/provision/ssh/__init__.py +18 -0
- sky/provision/vast/instance.py +13 -8
- sky/provision/vast/utils.py +10 -7
- sky/provision/volume.py +164 -0
- sky/provision/vsphere/common/ssl_helper.py +1 -1
- sky/provision/vsphere/common/vapiconnect.py +2 -1
- sky/provision/vsphere/common/vim_utils.py +4 -4
- sky/provision/vsphere/instance.py +15 -10
- sky/provision/vsphere/vsphere_utils.py +17 -20
- sky/py.typed +0 -0
- sky/resources.py +845 -119
- sky/schemas/__init__.py +0 -0
- sky/schemas/api/__init__.py +0 -0
- sky/schemas/api/responses.py +227 -0
- sky/schemas/db/README +4 -0
- sky/schemas/db/env.py +90 -0
- sky/schemas/db/global_user_state/001_initial_schema.py +124 -0
- sky/schemas/db/global_user_state/002_add_workspace_to_cluster_history.py +35 -0
- sky/schemas/db/global_user_state/003_fix_initial_revision.py +61 -0
- sky/schemas/db/global_user_state/004_is_managed.py +34 -0
- sky/schemas/db/global_user_state/005_cluster_event.py +32 -0
- sky/schemas/db/global_user_state/006_provision_log.py +41 -0
- sky/schemas/db/global_user_state/007_cluster_event_request_id.py +34 -0
- sky/schemas/db/global_user_state/008_skylet_ssh_tunnel_metadata.py +34 -0
- sky/schemas/db/global_user_state/009_last_activity_and_launched_at.py +89 -0
- sky/schemas/db/global_user_state/010_save_ssh_key.py +66 -0
- sky/schemas/db/global_user_state/011_is_ephemeral.py +34 -0
- sky/schemas/db/kv_cache/001_initial_schema.py +29 -0
- sky/schemas/db/script.py.mako +28 -0
- sky/schemas/db/serve_state/001_initial_schema.py +67 -0
- sky/schemas/db/serve_state/002_yaml_content.py +34 -0
- sky/schemas/db/skypilot_config/001_initial_schema.py +30 -0
- sky/schemas/db/spot_jobs/001_initial_schema.py +97 -0
- sky/schemas/db/spot_jobs/002_cluster_pool.py +42 -0
- sky/schemas/db/spot_jobs/003_pool_hash.py +34 -0
- sky/schemas/db/spot_jobs/004_job_file_contents.py +42 -0
- sky/schemas/db/spot_jobs/005_logs_gc.py +38 -0
- sky/schemas/db/spot_jobs/006_controller_pid_started_at.py +34 -0
- sky/schemas/db/spot_jobs/007_config_file_content.py +34 -0
- sky/schemas/generated/__init__.py +0 -0
- sky/schemas/generated/autostopv1_pb2.py +36 -0
- sky/schemas/generated/autostopv1_pb2.pyi +43 -0
- sky/schemas/generated/autostopv1_pb2_grpc.py +146 -0
- sky/schemas/generated/jobsv1_pb2.py +86 -0
- sky/schemas/generated/jobsv1_pb2.pyi +254 -0
- sky/schemas/generated/jobsv1_pb2_grpc.py +542 -0
- sky/schemas/generated/managed_jobsv1_pb2.py +76 -0
- sky/schemas/generated/managed_jobsv1_pb2.pyi +278 -0
- sky/schemas/generated/managed_jobsv1_pb2_grpc.py +278 -0
- sky/schemas/generated/servev1_pb2.py +58 -0
- sky/schemas/generated/servev1_pb2.pyi +115 -0
- sky/schemas/generated/servev1_pb2_grpc.py +322 -0
- sky/serve/autoscalers.py +357 -5
- sky/serve/client/impl.py +310 -0
- sky/serve/client/sdk.py +47 -139
- sky/serve/client/sdk_async.py +130 -0
- sky/serve/constants.py +12 -9
- sky/serve/controller.py +68 -17
- sky/serve/load_balancer.py +106 -60
- sky/serve/load_balancing_policies.py +116 -2
- sky/serve/replica_managers.py +434 -249
- sky/serve/serve_rpc_utils.py +179 -0
- sky/serve/serve_state.py +569 -257
- sky/serve/serve_utils.py +775 -265
- sky/serve/server/core.py +66 -711
- sky/serve/server/impl.py +1093 -0
- sky/serve/server/server.py +21 -18
- sky/serve/service.py +192 -89
- sky/serve/service_spec.py +144 -20
- sky/serve/spot_placer.py +3 -0
- sky/server/auth/__init__.py +0 -0
- sky/server/auth/authn.py +50 -0
- sky/server/auth/loopback.py +38 -0
- sky/server/auth/oauth2_proxy.py +202 -0
- sky/server/common.py +478 -182
- sky/server/config.py +85 -23
- sky/server/constants.py +44 -6
- sky/server/daemons.py +295 -0
- sky/server/html/token_page.html +185 -0
- sky/server/metrics.py +160 -0
- sky/server/middleware_utils.py +166 -0
- sky/server/requests/executor.py +558 -138
- sky/server/requests/payloads.py +364 -24
- sky/server/requests/preconditions.py +21 -17
- sky/server/requests/process.py +112 -29
- sky/server/requests/request_names.py +121 -0
- sky/server/requests/requests.py +822 -226
- sky/server/requests/serializers/decoders.py +82 -31
- sky/server/requests/serializers/encoders.py +140 -22
- sky/server/requests/threads.py +117 -0
- sky/server/rest.py +455 -0
- sky/server/server.py +1309 -285
- sky/server/state.py +20 -0
- sky/server/stream_utils.py +327 -61
- sky/server/uvicorn.py +217 -3
- sky/server/versions.py +270 -0
- sky/setup_files/MANIFEST.in +11 -1
- sky/setup_files/alembic.ini +160 -0
- sky/setup_files/dependencies.py +139 -31
- sky/setup_files/setup.py +44 -42
- sky/sky_logging.py +114 -7
- sky/skylet/attempt_skylet.py +106 -24
- sky/skylet/autostop_lib.py +129 -8
- sky/skylet/configs.py +29 -20
- sky/skylet/constants.py +216 -25
- sky/skylet/events.py +101 -21
- sky/skylet/job_lib.py +345 -164
- sky/skylet/log_lib.py +297 -18
- sky/skylet/log_lib.pyi +44 -1
- sky/skylet/providers/ibm/node_provider.py +12 -8
- sky/skylet/providers/ibm/vpc_provider.py +13 -12
- sky/skylet/ray_patches/__init__.py +17 -3
- sky/skylet/ray_patches/autoscaler.py.diff +18 -0
- sky/skylet/ray_patches/cli.py.diff +19 -0
- sky/skylet/ray_patches/command_runner.py.diff +17 -0
- sky/skylet/ray_patches/log_monitor.py.diff +20 -0
- sky/skylet/ray_patches/resource_demand_scheduler.py.diff +32 -0
- sky/skylet/ray_patches/updater.py.diff +18 -0
- sky/skylet/ray_patches/worker.py.diff +41 -0
- sky/skylet/runtime_utils.py +21 -0
- sky/skylet/services.py +568 -0
- sky/skylet/skylet.py +72 -4
- sky/skylet/subprocess_daemon.py +104 -29
- sky/skypilot_config.py +506 -99
- sky/ssh_node_pools/__init__.py +1 -0
- sky/ssh_node_pools/core.py +135 -0
- sky/ssh_node_pools/server.py +233 -0
- sky/task.py +685 -163
- sky/templates/aws-ray.yml.j2 +11 -3
- sky/templates/azure-ray.yml.j2 +2 -1
- sky/templates/cudo-ray.yml.j2 +1 -0
- sky/templates/do-ray.yml.j2 +2 -1
- sky/templates/fluidstack-ray.yml.j2 +1 -0
- sky/templates/gcp-ray.yml.j2 +62 -1
- sky/templates/hyperbolic-ray.yml.j2 +68 -0
- sky/templates/ibm-ray.yml.j2 +2 -1
- sky/templates/jobs-controller.yaml.j2 +27 -24
- sky/templates/kubernetes-loadbalancer.yml.j2 +2 -0
- sky/templates/kubernetes-ray.yml.j2 +611 -50
- sky/templates/lambda-ray.yml.j2 +2 -1
- sky/templates/nebius-ray.yml.j2 +34 -12
- sky/templates/oci-ray.yml.j2 +1 -0
- sky/templates/paperspace-ray.yml.j2 +2 -1
- sky/templates/primeintellect-ray.yml.j2 +72 -0
- sky/templates/runpod-ray.yml.j2 +10 -1
- sky/templates/scp-ray.yml.j2 +4 -50
- sky/templates/seeweb-ray.yml.j2 +171 -0
- sky/templates/shadeform-ray.yml.j2 +73 -0
- sky/templates/sky-serve-controller.yaml.j2 +22 -2
- sky/templates/vast-ray.yml.j2 +1 -0
- sky/templates/vsphere-ray.yml.j2 +1 -0
- sky/templates/websocket_proxy.py +212 -37
- sky/usage/usage_lib.py +31 -15
- sky/users/__init__.py +0 -0
- sky/users/model.conf +15 -0
- sky/users/permission.py +397 -0
- sky/users/rbac.py +121 -0
- sky/users/server.py +720 -0
- sky/users/token_service.py +218 -0
- sky/utils/accelerator_registry.py +35 -5
- sky/utils/admin_policy_utils.py +84 -38
- sky/utils/annotations.py +38 -5
- sky/utils/asyncio_utils.py +78 -0
- sky/utils/atomic.py +1 -1
- sky/utils/auth_utils.py +153 -0
- sky/utils/benchmark_utils.py +60 -0
- sky/utils/cli_utils/status_utils.py +159 -86
- sky/utils/cluster_utils.py +31 -9
- sky/utils/command_runner.py +354 -68
- sky/utils/command_runner.pyi +93 -3
- sky/utils/common.py +35 -8
- sky/utils/common_utils.py +314 -91
- sky/utils/config_utils.py +74 -5
- sky/utils/context.py +403 -0
- sky/utils/context_utils.py +242 -0
- sky/utils/controller_utils.py +383 -89
- sky/utils/dag_utils.py +31 -12
- sky/utils/db/__init__.py +0 -0
- sky/utils/db/db_utils.py +485 -0
- sky/utils/db/kv_cache.py +149 -0
- sky/utils/db/migration_utils.py +137 -0
- sky/utils/directory_utils.py +12 -0
- sky/utils/env_options.py +13 -0
- sky/utils/git.py +567 -0
- sky/utils/git_clone.sh +460 -0
- sky/utils/infra_utils.py +195 -0
- sky/utils/kubernetes/cleanup-tunnel.sh +62 -0
- sky/utils/kubernetes/config_map_utils.py +133 -0
- sky/utils/kubernetes/create_cluster.sh +15 -29
- sky/utils/kubernetes/delete_cluster.sh +10 -7
- sky/utils/kubernetes/deploy_ssh_node_pools.py +1177 -0
- sky/utils/kubernetes/exec_kubeconfig_converter.py +22 -31
- sky/utils/kubernetes/generate_kind_config.py +6 -66
- sky/utils/kubernetes/generate_kubeconfig.sh +4 -1
- sky/utils/kubernetes/gpu_labeler.py +18 -8
- sky/utils/kubernetes/k8s_gpu_labeler_job.yaml +2 -1
- sky/utils/kubernetes/k8s_gpu_labeler_setup.yaml +16 -16
- sky/utils/kubernetes/kubernetes_deploy_utils.py +284 -114
- sky/utils/kubernetes/rsync_helper.sh +11 -3
- sky/utils/kubernetes/ssh-tunnel.sh +379 -0
- sky/utils/kubernetes/ssh_utils.py +221 -0
- sky/utils/kubernetes_enums.py +8 -15
- sky/utils/lock_events.py +94 -0
- sky/utils/locks.py +416 -0
- sky/utils/log_utils.py +82 -107
- sky/utils/perf_utils.py +22 -0
- sky/utils/resource_checker.py +298 -0
- sky/utils/resources_utils.py +249 -32
- sky/utils/rich_utils.py +217 -39
- sky/utils/schemas.py +955 -160
- sky/utils/serialize_utils.py +16 -0
- sky/utils/status_lib.py +10 -0
- sky/utils/subprocess_utils.py +29 -15
- sky/utils/tempstore.py +70 -0
- sky/utils/thread_utils.py +91 -0
- sky/utils/timeline.py +26 -53
- sky/utils/ux_utils.py +84 -15
- sky/utils/validator.py +11 -1
- sky/utils/volume.py +165 -0
- sky/utils/yaml_utils.py +111 -0
- sky/volumes/__init__.py +13 -0
- sky/volumes/client/__init__.py +0 -0
- sky/volumes/client/sdk.py +150 -0
- sky/volumes/server/__init__.py +0 -0
- sky/volumes/server/core.py +270 -0
- sky/volumes/server/server.py +124 -0
- sky/volumes/volume.py +215 -0
- sky/workspaces/__init__.py +0 -0
- sky/workspaces/core.py +655 -0
- sky/workspaces/server.py +101 -0
- sky/workspaces/utils.py +56 -0
- sky_templates/README.md +3 -0
- sky_templates/__init__.py +3 -0
- sky_templates/ray/__init__.py +0 -0
- sky_templates/ray/start_cluster +183 -0
- sky_templates/ray/stop_cluster +75 -0
- skypilot_nightly-1.0.0.dev20251203.dist-info/METADATA +676 -0
- skypilot_nightly-1.0.0.dev20251203.dist-info/RECORD +611 -0
- {skypilot_nightly-1.0.0.dev20250502.dist-info → skypilot_nightly-1.0.0.dev20251203.dist-info}/WHEEL +1 -1
- skypilot_nightly-1.0.0.dev20251203.dist-info/top_level.txt +2 -0
- sky/benchmark/benchmark_state.py +0 -256
- sky/benchmark/benchmark_utils.py +0 -641
- sky/clouds/service_catalog/constants.py +0 -7
- sky/dashboard/out/_next/static/GWvVBSCS7FmUiVmjaL1a7/_buildManifest.js +0 -1
- sky/dashboard/out/_next/static/chunks/236-2db3ee3fba33dd9e.js +0 -6
- sky/dashboard/out/_next/static/chunks/312-c3c8845990db8ffc.js +0 -15
- sky/dashboard/out/_next/static/chunks/37-0a572fe0dbb89c4d.js +0 -6
- sky/dashboard/out/_next/static/chunks/678-206dddca808e6d16.js +0 -59
- sky/dashboard/out/_next/static/chunks/845-9e60713e0c441abc.js +0 -1
- sky/dashboard/out/_next/static/chunks/979-7bf73a4c7cea0f5c.js +0 -1
- sky/dashboard/out/_next/static/chunks/fd9d1056-2821b0f0cabcd8bd.js +0 -1
- sky/dashboard/out/_next/static/chunks/framework-87d061ee6ed71b28.js +0 -33
- sky/dashboard/out/_next/static/chunks/main-app-241eb28595532291.js +0 -1
- sky/dashboard/out/_next/static/chunks/main-e0e2335212e72357.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/_app-e6b013bc3f77ad60.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/_error-1be831200e60c5c0.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-6ac338bc2239cb45.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-f383db7389368ea7.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/clusters-a93b93e10b8b074e.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/index-f9f039532ca8cbc4.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-1c519e1afc523dc9.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/jobs-a75029b67aab6a2e.js +0 -1
- sky/dashboard/out/_next/static/chunks/webpack-830f59b8404e96b8.js +0 -1
- sky/dashboard/out/_next/static/css/c6933bbb2ce7f4dd.css +0 -3
- sky/jobs/dashboard/dashboard.py +0 -223
- sky/jobs/dashboard/static/favicon.ico +0 -0
- sky/jobs/dashboard/templates/index.html +0 -831
- sky/jobs/server/dashboard_utils.py +0 -69
- sky/skylet/providers/scp/__init__.py +0 -2
- sky/skylet/providers/scp/config.py +0 -149
- sky/skylet/providers/scp/node_provider.py +0 -578
- sky/templates/kubernetes-ssh-jump.yml.j2 +0 -94
- sky/utils/db_utils.py +0 -100
- sky/utils/kubernetes/deploy_remote_cluster.sh +0 -308
- sky/utils/kubernetes/ssh_jump_lifecycle_manager.py +0 -191
- skypilot_nightly-1.0.0.dev20250502.dist-info/METADATA +0 -361
- skypilot_nightly-1.0.0.dev20250502.dist-info/RECORD +0 -396
- skypilot_nightly-1.0.0.dev20250502.dist-info/top_level.txt +0 -1
- /sky/{clouds/service_catalog → catalog}/config.py +0 -0
- /sky/{benchmark → catalog/data_fetchers}/__init__.py +0 -0
- /sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_azure.py +0 -0
- /sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_fluidstack.py +0 -0
- /sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_ibm.py +0 -0
- /sky/{clouds/service_catalog/data_fetchers → client/cli}/__init__.py +0 -0
- /sky/dashboard/out/_next/static/{GWvVBSCS7FmUiVmjaL1a7 → 96_E2yl3QAiIJGOYCkSpB}/_ssgManifest.js +0 -0
- {skypilot_nightly-1.0.0.dev20250502.dist-info → skypilot_nightly-1.0.0.dev20251203.dist-info}/entry_points.txt +0 -0
- {skypilot_nightly-1.0.0.dev20250502.dist-info → skypilot_nightly-1.0.0.dev20251203.dist-info}/licenses/LICENSE +0 -0
sky/utils/controller_utils.py
CHANGED
|
@@ -2,11 +2,10 @@
|
|
|
2
2
|
import copy
|
|
3
3
|
import dataclasses
|
|
4
4
|
import enum
|
|
5
|
-
import getpass
|
|
6
5
|
import os
|
|
7
6
|
import tempfile
|
|
8
7
|
import typing
|
|
9
|
-
from typing import Any, Dict, Iterable, List, Optional, Set
|
|
8
|
+
from typing import Any, Callable, Dict, Iterable, List, Optional, Set
|
|
10
9
|
import uuid
|
|
11
10
|
|
|
12
11
|
import colorama
|
|
@@ -24,10 +23,14 @@ from sky.clouds import gcp
|
|
|
24
23
|
from sky.data import data_utils
|
|
25
24
|
from sky.data import storage as storage_lib
|
|
26
25
|
from sky.jobs import constants as managed_job_constants
|
|
26
|
+
from sky.provision.kubernetes import constants as kubernetes_constants
|
|
27
27
|
from sky.serve import constants as serve_constants
|
|
28
|
+
from sky.serve import serve_state
|
|
29
|
+
from sky.server import config as server_config
|
|
28
30
|
from sky.setup_files import dependencies
|
|
29
31
|
from sky.skylet import constants
|
|
30
32
|
from sky.skylet import log_lib
|
|
33
|
+
from sky.utils import annotations
|
|
31
34
|
from sky.utils import common
|
|
32
35
|
from sky.utils import common_utils
|
|
33
36
|
from sky.utils import config_utils
|
|
@@ -35,10 +38,16 @@ from sky.utils import env_options
|
|
|
35
38
|
from sky.utils import registry
|
|
36
39
|
from sky.utils import rich_utils
|
|
37
40
|
from sky.utils import ux_utils
|
|
41
|
+
from sky.utils import yaml_utils
|
|
38
42
|
|
|
39
43
|
if typing.TYPE_CHECKING:
|
|
44
|
+
import psutil
|
|
45
|
+
|
|
40
46
|
from sky import task as task_lib
|
|
41
47
|
from sky.backends import cloud_vm_ray_backend
|
|
48
|
+
else:
|
|
49
|
+
from sky.adaptors import common as adaptors_common
|
|
50
|
+
psutil = adaptors_common.LazyImport('psutil')
|
|
42
51
|
|
|
43
52
|
logger = sky_logging.init_logger(__name__)
|
|
44
53
|
|
|
@@ -63,8 +72,9 @@ class _ControllerSpec:
|
|
|
63
72
|
"""Spec for skypilot controllers."""
|
|
64
73
|
controller_type: str
|
|
65
74
|
name: str
|
|
66
|
-
|
|
67
|
-
|
|
75
|
+
_cluster_name_func: Callable[[], str]
|
|
76
|
+
_cluster_name_from_server: Optional[str] # For client-side only
|
|
77
|
+
in_progress_hint: Callable[[bool], str]
|
|
68
78
|
decline_cancel_hint: str
|
|
69
79
|
_decline_down_when_failed_to_fetch_status_hint: str
|
|
70
80
|
decline_down_for_dirty_controller_hint: str
|
|
@@ -84,6 +94,24 @@ class _ControllerSpec:
|
|
|
84
94
|
return self._check_cluster_name_hint.format(
|
|
85
95
|
cluster_name=self.cluster_name)
|
|
86
96
|
|
|
97
|
+
@property
|
|
98
|
+
def cluster_name(self) -> str:
|
|
99
|
+
"""The cluster name of the controller.
|
|
100
|
+
|
|
101
|
+
On the server-side, the cluster name is the actual cluster name,
|
|
102
|
+
which is read from common.(JOB|SKY_SERVE)_CONTROLLER_NAME.
|
|
103
|
+
|
|
104
|
+
On the client-side, the cluster name may not be accurate,
|
|
105
|
+
as we may not know the exact name, because we are missing
|
|
106
|
+
the server-side common.SERVER_ID. We have to wait until
|
|
107
|
+
we get the actual cluster name from the server.
|
|
108
|
+
"""
|
|
109
|
+
return (self._cluster_name_from_server if self._cluster_name_from_server
|
|
110
|
+
is not None else self._cluster_name_func())
|
|
111
|
+
|
|
112
|
+
def set_cluster_name_from_server(self, cluster_name: str) -> None:
|
|
113
|
+
self._cluster_name_from_server = cluster_name
|
|
114
|
+
|
|
87
115
|
|
|
88
116
|
# TODO: refactor controller class to not be an enum.
|
|
89
117
|
class Controllers(enum.Enum):
|
|
@@ -93,10 +121,11 @@ class Controllers(enum.Enum):
|
|
|
93
121
|
JOBS_CONTROLLER = _ControllerSpec(
|
|
94
122
|
controller_type='jobs',
|
|
95
123
|
name='managed jobs controller',
|
|
96
|
-
|
|
97
|
-
|
|
98
|
-
|
|
99
|
-
|
|
124
|
+
_cluster_name_func=lambda: common.JOB_CONTROLLER_NAME,
|
|
125
|
+
_cluster_name_from_server=None,
|
|
126
|
+
in_progress_hint=lambda _:
|
|
127
|
+
('* {job_info}To see all managed jobs: '
|
|
128
|
+
f'{colorama.Style.BRIGHT}sky jobs queue{colorama.Style.RESET_ALL}'),
|
|
100
129
|
decline_cancel_hint=(
|
|
101
130
|
'Cancelling the jobs controller\'s jobs is not allowed.\nTo cancel '
|
|
102
131
|
f'managed jobs, use: {colorama.Style.BRIGHT}sky jobs cancel '
|
|
@@ -124,10 +153,14 @@ class Controllers(enum.Enum):
|
|
|
124
153
|
SKY_SERVE_CONTROLLER = _ControllerSpec(
|
|
125
154
|
controller_type='serve',
|
|
126
155
|
name='serve controller',
|
|
127
|
-
|
|
156
|
+
_cluster_name_func=lambda: common.SKY_SERVE_CONTROLLER_NAME,
|
|
157
|
+
_cluster_name_from_server=None,
|
|
128
158
|
in_progress_hint=(
|
|
129
|
-
|
|
130
|
-
f'
|
|
159
|
+
lambda pool:
|
|
160
|
+
(f'* To see detailed pool status: {colorama.Style.BRIGHT}'
|
|
161
|
+
f'sky jobs pool status -v{colorama.Style.RESET_ALL}') if pool else
|
|
162
|
+
(f'* To see detailed service status: {colorama.Style.BRIGHT}'
|
|
163
|
+
f'sky serve status -v{colorama.Style.RESET_ALL}')),
|
|
131
164
|
decline_cancel_hint=(
|
|
132
165
|
'Cancelling the sky serve controller\'s jobs is not allowed.'),
|
|
133
166
|
_decline_down_when_failed_to_fetch_status_hint=(
|
|
@@ -154,7 +187,9 @@ class Controllers(enum.Enum):
|
|
|
154
187
|
default_autostop_config=serve_constants.CONTROLLER_AUTOSTOP)
|
|
155
188
|
|
|
156
189
|
@classmethod
|
|
157
|
-
def from_name(cls,
|
|
190
|
+
def from_name(cls,
|
|
191
|
+
name: Optional[str],
|
|
192
|
+
expect_exact_match: bool = True) -> Optional['Controllers']:
|
|
158
193
|
"""Check if the cluster name is a controller name.
|
|
159
194
|
|
|
160
195
|
Returns:
|
|
@@ -168,15 +203,32 @@ class Controllers(enum.Enum):
|
|
|
168
203
|
# we may not know the exact name, because we are missing the server-side
|
|
169
204
|
# common.SERVER_ID. So, we will assume anything that matches the prefix
|
|
170
205
|
# is a controller.
|
|
206
|
+
prefix = None
|
|
171
207
|
if name.startswith(common.SKY_SERVE_CONTROLLER_PREFIX):
|
|
172
208
|
controller = cls.SKY_SERVE_CONTROLLER
|
|
209
|
+
prefix = common.SKY_SERVE_CONTROLLER_PREFIX
|
|
173
210
|
elif name.startswith(common.JOB_CONTROLLER_PREFIX):
|
|
174
211
|
controller = cls.JOBS_CONTROLLER
|
|
175
|
-
|
|
212
|
+
prefix = common.JOB_CONTROLLER_PREFIX
|
|
213
|
+
|
|
214
|
+
if controller is not None and expect_exact_match:
|
|
215
|
+
assert name == controller.value.cluster_name, (
|
|
216
|
+
name, controller.value.cluster_name)
|
|
217
|
+
elif controller is not None and name != controller.value.cluster_name:
|
|
176
218
|
# The client-side cluster_name is not accurate. Assume that `name`
|
|
177
219
|
# is the actual cluster name, so need to set the controller's
|
|
178
220
|
# cluster name to the input name.
|
|
179
|
-
|
|
221
|
+
|
|
222
|
+
# Assert that the cluster name is well-formed. It should be
|
|
223
|
+
# {prefix}{hash}, where prefix is set above, and hash is a valid
|
|
224
|
+
# user hash.
|
|
225
|
+
assert prefix is not None, prefix
|
|
226
|
+
assert name.startswith(prefix), name
|
|
227
|
+
assert common_utils.is_valid_user_hash(name[len(prefix):]), (name,
|
|
228
|
+
prefix)
|
|
229
|
+
|
|
230
|
+
# Update the cluster name.
|
|
231
|
+
controller.value.set_cluster_name_from_server(name)
|
|
180
232
|
return controller
|
|
181
233
|
|
|
182
234
|
@classmethod
|
|
@@ -193,27 +245,35 @@ class Controllers(enum.Enum):
|
|
|
193
245
|
return None
|
|
194
246
|
|
|
195
247
|
|
|
196
|
-
def
|
|
197
|
-
|
|
248
|
+
def get_controller_for_pool(pool: bool) -> Controllers:
    """Pick the controller enum member based on the `pool` flag.

    Args:
        pool: Whether the caller is operating on a pool (as opposed to a
            service).

    Returns:
        Controllers.JOBS_CONTROLLER when `pool` is truthy, otherwise
        Controllers.SKY_SERVE_CONTROLLER.
    """
    return (Controllers.JOBS_CONTROLLER
            if pool else Controllers.SKY_SERVE_CONTROLLER)
|
|
253
|
+
|
|
254
|
+
|
|
255
|
+
def high_availability_specified(cluster_name: Optional[str]) -> bool:
|
|
198
256
|
"""Check if the controller high availability is specified in user config.
|
|
199
257
|
"""
|
|
200
|
-
controller = Controllers.from_name(cluster_name)
|
|
258
|
+
controller = Controllers.from_name(cluster_name, expect_exact_match=False)
|
|
201
259
|
if controller is None:
|
|
202
260
|
return False
|
|
203
261
|
|
|
262
|
+
if controller.value.controller_type == 'jobs':
|
|
263
|
+
# pylint: disable-next=import-outside-toplevel
|
|
264
|
+
from sky.jobs import utils as managed_job_utils
|
|
265
|
+
if managed_job_utils.is_consolidation_mode():
|
|
266
|
+
return True
|
|
267
|
+
elif controller.value.controller_type == 'serve':
|
|
268
|
+
# pylint: disable-next=import-outside-toplevel
|
|
269
|
+
from sky.serve import serve_utils
|
|
270
|
+
if serve_utils.is_consolidation_mode():
|
|
271
|
+
return True
|
|
272
|
+
|
|
204
273
|
if skypilot_config.loaded():
|
|
205
|
-
|
|
206
|
-
|
|
207
|
-
|
|
208
|
-
if high_availability:
|
|
209
|
-
if controller.value.controller_type != 'serve':
|
|
210
|
-
if not skip_warning:
|
|
211
|
-
print(f'{colorama.Fore.RED}High availability controller is'
|
|
212
|
-
'only supported for SkyServe controller. It cannot'
|
|
213
|
-
f'be enabled for {controller.value.name}.'
|
|
214
|
-
f'Skipping this flag.{colorama.Style.RESET_ALL}')
|
|
215
|
-
else:
|
|
216
|
-
return True
|
|
274
|
+
return skypilot_config.get_nested((controller.value.controller_type,
|
|
275
|
+
'controller', 'high_availability'),
|
|
276
|
+
False)
|
|
217
277
|
return False
|
|
218
278
|
|
|
219
279
|
|
|
@@ -250,6 +310,13 @@ def _get_cloud_dependencies_installation_commands(
|
|
|
250
310
|
sky_check.get_cached_enabled_clouds_or_refresh(
|
|
251
311
|
sky_cloud.CloudCapability.STORAGE))
|
|
252
312
|
enabled_clouds = enabled_compute_clouds.union(enabled_storage_clouds)
|
|
313
|
+
enabled_k8s_and_ssh = [
|
|
314
|
+
repr(cloud)
|
|
315
|
+
for cloud in enabled_clouds
|
|
316
|
+
if isinstance(cloud, clouds.Kubernetes)
|
|
317
|
+
]
|
|
318
|
+
k8s_and_ssh_label = ' and '.join(sorted(enabled_k8s_and_ssh))
|
|
319
|
+
k8s_dependencies_installed = False
|
|
253
320
|
|
|
254
321
|
for cloud in enabled_clouds:
|
|
255
322
|
cloud_python_dependencies: List[str] = copy.deepcopy(
|
|
@@ -269,10 +336,33 @@ def _get_cloud_dependencies_installation_commands(
|
|
|
269
336
|
step_prefix = prefix_str.replace('<step>', str(len(commands) + 1))
|
|
270
337
|
commands.append(f'echo -en "\\r{step_prefix}GCP SDK{empty_str}" &&'
|
|
271
338
|
f'{gcp.GOOGLE_SDK_INSTALLATION_COMMAND}')
|
|
272
|
-
|
|
339
|
+
if clouds.cloud_in_iterable(clouds.Kubernetes(), enabled_clouds):
|
|
340
|
+
# Install gke-gcloud-auth-plugin used for exec-auth with GKE.
|
|
341
|
+
# We install the plugin here instead of the next elif branch
|
|
342
|
+
# because gcloud is required to install the plugin, so the order
|
|
343
|
+
# of command execution is critical.
|
|
344
|
+
|
|
345
|
+
# We install plugin here regardless of whether exec-auth is
|
|
346
|
+
# actually used as exec-auth may be used in the future.
|
|
347
|
+
# TODO (kyuds): how to implement conservative installation?
|
|
348
|
+
commands.append(
|
|
349
|
+
'(command -v gke-gcloud-auth-plugin &>/dev/null || '
|
|
350
|
+
'(gcloud components install gke-gcloud-auth-plugin --quiet &>/dev/null))') # pylint: disable=line-too-long
|
|
351
|
+
elif isinstance(cloud, clouds.Nebius):
|
|
273
352
|
step_prefix = prefix_str.replace('<step>', str(len(commands) + 1))
|
|
274
353
|
commands.append(
|
|
275
|
-
f'echo -en "\\r{step_prefix}
|
|
354
|
+
f'echo -en "\\r{step_prefix}Nebius{empty_str}" && '
|
|
355
|
+
'curl -sSL https://storage.eu-north1.nebius.cloud/cli/install.sh ' # pylint: disable=line-too-long
|
|
356
|
+
'| sudo NEBIUS_INSTALL_FOLDER=/usr/local/bin bash &> /dev/null && '
|
|
357
|
+
'nebius profile create --profile sky '
|
|
358
|
+
'--endpoint api.nebius.cloud '
|
|
359
|
+
'--service-account-file $HOME/.nebius/credentials.json '
|
|
360
|
+
'&> /dev/null || echo "Unable to create Nebius profile."')
|
|
361
|
+
elif (isinstance(cloud, clouds.Kubernetes) and
|
|
362
|
+
not k8s_dependencies_installed):
|
|
363
|
+
step_prefix = prefix_str.replace('<step>', str(len(commands) + 1))
|
|
364
|
+
commands.append(
|
|
365
|
+
f'echo -en "\\r{step_prefix}{k8s_and_ssh_label}{empty_str}" && '
|
|
276
366
|
# Install k8s + skypilot dependencies
|
|
277
367
|
'sudo bash -c "if '
|
|
278
368
|
'! command -v curl &> /dev/null || '
|
|
@@ -292,7 +382,10 @@ def _get_cloud_dependencies_installation_commands(
|
|
|
292
382
|
'(curl -s -LO "https://dl.k8s.io/release/v1.31.6'
|
|
293
383
|
'/bin/linux/$ARCH/kubectl" && '
|
|
294
384
|
'sudo install -o root -g root -m 0755 '
|
|
295
|
-
'kubectl /usr/local/bin/kubectl))'
|
|
385
|
+
'kubectl /usr/local/bin/kubectl)) && '
|
|
386
|
+
f'echo -e \'#!/bin/bash\\nexport PATH="{kubernetes_constants.SKY_K8S_EXEC_AUTH_PATH}"\\nexec "$@"\' | sudo tee /usr/local/bin/{kubernetes_constants.SKY_K8S_EXEC_AUTH_WRAPPER} > /dev/null && ' # pylint: disable=line-too-long
|
|
387
|
+
f'sudo chmod +x /usr/local/bin/{kubernetes_constants.SKY_K8S_EXEC_AUTH_WRAPPER}') # pylint: disable=line-too-long
|
|
388
|
+
k8s_dependencies_installed = True
|
|
296
389
|
elif isinstance(cloud, clouds.Cudo):
|
|
297
390
|
step_prefix = prefix_str.replace('<step>', str(len(commands) + 1))
|
|
298
391
|
commands.append(
|
|
@@ -345,7 +438,7 @@ def check_cluster_name_not_controller(
|
|
|
345
438
|
Returns:
|
|
346
439
|
None, if the cluster name is not a controller name.
|
|
347
440
|
"""
|
|
348
|
-
controller = Controllers.from_name(cluster_name)
|
|
441
|
+
controller = Controllers.from_name(cluster_name, expect_exact_match=False)
|
|
349
442
|
if controller is not None:
|
|
350
443
|
msg = controller.value.check_cluster_name_hint
|
|
351
444
|
if operation_str is not None:
|
|
@@ -355,10 +448,11 @@ def check_cluster_name_not_controller(
|
|
|
355
448
|
|
|
356
449
|
|
|
357
450
|
# Internal only:
|
|
358
|
-
def
|
|
451
|
+
def download_and_stream_job_log(
|
|
359
452
|
backend: 'cloud_vm_ray_backend.CloudVmRayBackend',
|
|
360
453
|
handle: 'cloud_vm_ray_backend.CloudVmRayResourceHandle',
|
|
361
|
-
local_dir: str
|
|
454
|
+
local_dir: str,
|
|
455
|
+
job_ids: Optional[List[str]] = None) -> Optional[str]:
|
|
362
456
|
"""Downloads and streams the latest job log.
|
|
363
457
|
|
|
364
458
|
This function is only used by jobs controller and sky serve controller.
|
|
@@ -376,7 +470,7 @@ def download_and_stream_latest_job_log(
|
|
|
376
470
|
# multi-node cluster is preempted, and we recover the managed job
|
|
377
471
|
# on the existing cluster, which leads to a larger job_id. Those
|
|
378
472
|
# job_ids all represent the same logical managed job.
|
|
379
|
-
job_ids=
|
|
473
|
+
job_ids=job_ids,
|
|
380
474
|
local_dir=local_dir)
|
|
381
475
|
except Exception as e: # pylint: disable=broad-except
|
|
382
476
|
# We want to avoid crashing the controller. sync_down_logs() is pretty
|
|
@@ -394,7 +488,7 @@ def download_and_stream_latest_job_log(
|
|
|
394
488
|
return None
|
|
395
489
|
|
|
396
490
|
log_dir = list(log_dirs.values())[0]
|
|
397
|
-
log_file = os.path.join(log_dir, 'run.log')
|
|
491
|
+
log_file = os.path.expanduser(os.path.join(log_dir, 'run.log'))
|
|
398
492
|
|
|
399
493
|
# Print the logs to the console.
|
|
400
494
|
# TODO(zhwu): refactor this into log_utils, along with the refactoring for
|
|
@@ -439,10 +533,13 @@ def shared_controller_vars_to_fill(
|
|
|
439
533
|
# before popping allowed_contexts. If it is not on Kubernetes,
|
|
440
534
|
# we may be able to use allowed_contexts.
|
|
441
535
|
local_user_config.pop('allowed_contexts', None)
|
|
536
|
+
# Remove api_server config so that the controller does not try to use
|
|
537
|
+
# a remote API server.
|
|
538
|
+
local_user_config.pop('api_server', None)
|
|
442
539
|
with tempfile.NamedTemporaryFile(
|
|
443
540
|
delete=False,
|
|
444
541
|
suffix=_LOCAL_SKYPILOT_CONFIG_PATH_SUFFIX) as temp_file:
|
|
445
|
-
|
|
542
|
+
yaml_utils.dump_yaml(temp_file.name, dict(**local_user_config))
|
|
446
543
|
local_user_config_path = temp_file.name
|
|
447
544
|
|
|
448
545
|
vars_to_fill: Dict[str, Any] = {
|
|
@@ -461,7 +558,7 @@ def shared_controller_vars_to_fill(
|
|
|
461
558
|
env_vars.update({
|
|
462
559
|
# Should not use $USER here, as that env var can be empty when
|
|
463
560
|
# running in a container.
|
|
464
|
-
constants.USER_ENV_VAR:
|
|
561
|
+
constants.USER_ENV_VAR: common_utils.get_current_user_name(),
|
|
465
562
|
constants.USER_ID_ENV_VAR: common_utils.get_user_hash(),
|
|
466
563
|
# Skip cloud identity check to avoid the overhead.
|
|
467
564
|
env_options.Options.SKIP_CLOUD_IDENTITY_CHECK.env_key: '1',
|
|
@@ -472,7 +569,15 @@ def shared_controller_vars_to_fill(
|
|
|
472
569
|
# with a remote API server.
|
|
473
570
|
constants.USING_REMOTE_API_SERVER_ENV_VAR: str(
|
|
474
571
|
common_utils.get_using_remote_api_server()),
|
|
572
|
+
constants.IS_SKYPILOT_SERVE_CONTROLLER:
|
|
573
|
+
('true'
|
|
574
|
+
if controller == Controllers.SKY_SERVE_CONTROLLER else 'false'),
|
|
475
575
|
})
|
|
576
|
+
override_concurrent_launches = os.environ.get(
|
|
577
|
+
constants.SERVE_OVERRIDE_CONCURRENT_LAUNCHES, None)
|
|
578
|
+
if override_concurrent_launches is not None:
|
|
579
|
+
env_vars[constants.SERVE_OVERRIDE_CONCURRENT_LAUNCHES] = str(
|
|
580
|
+
int(override_concurrent_launches))
|
|
476
581
|
if skypilot_config.loaded():
|
|
477
582
|
# Only set the SKYPILOT_CONFIG env var if the user has a config file.
|
|
478
583
|
env_vars[
|
|
@@ -504,6 +609,30 @@ def get_controller_resources(
|
|
|
504
609
|
if custom_controller_resources_config is not None:
|
|
505
610
|
controller_resources_config_copied.update(
|
|
506
611
|
custom_controller_resources_config)
|
|
612
|
+
# Compatibility with the old way of specifying the controller autostop
|
|
613
|
+
# config. TODO(cooperc): Remove this before 0.12.0.
|
|
614
|
+
custom_controller_autostop_config = skypilot_config.get_nested(
|
|
615
|
+
(controller.value.controller_type, 'controller', 'autostop'), None)
|
|
616
|
+
if custom_controller_autostop_config is not None:
|
|
617
|
+
logger.warning(
|
|
618
|
+
f'{colorama.Fore.YELLOW}Warning: Config value '
|
|
619
|
+
f'`{controller.value.controller_type}.controller.autostop` '
|
|
620
|
+
'is deprecated. Please use '
|
|
621
|
+
f'`{controller.value.controller_type}.controller.resources.'
|
|
622
|
+
f'autostop` instead.{colorama.Style.RESET_ALL}')
|
|
623
|
+
# Only set the autostop config if it is not already specified.
|
|
624
|
+
if controller_resources_config_copied.get('autostop') is None:
|
|
625
|
+
controller_resources_config_copied['autostop'] = (
|
|
626
|
+
custom_controller_autostop_config)
|
|
627
|
+
else:
|
|
628
|
+
logger.warning(f'{colorama.Fore.YELLOW}Ignoring the old '
|
|
629
|
+
'config, since it is already specified in '
|
|
630
|
+
f'resources.{colorama.Style.RESET_ALL}')
|
|
631
|
+
# Set the default autostop config for the controller, if not already
|
|
632
|
+
# specified.
|
|
633
|
+
if controller_resources_config_copied.get('autostop') is None:
|
|
634
|
+
controller_resources_config_copied['autostop'] = (
|
|
635
|
+
controller.value.default_autostop_config)
|
|
507
636
|
|
|
508
637
|
try:
|
|
509
638
|
controller_resources = resources.Resources.from_yaml_config(
|
|
@@ -529,12 +658,16 @@ def get_controller_resources(
|
|
|
529
658
|
controller_resources_to_use: resources.Resources = list(
|
|
530
659
|
controller_resources)[0]
|
|
531
660
|
|
|
532
|
-
|
|
661
|
+
controller_handle = global_user_state.get_handle_from_cluster_name(
|
|
533
662
|
controller.value.cluster_name)
|
|
534
|
-
if
|
|
535
|
-
|
|
536
|
-
|
|
537
|
-
|
|
663
|
+
if controller_handle is not None:
|
|
664
|
+
if controller_handle is not None:
|
|
665
|
+
# Use the existing resources, but override the autostop config with
|
|
666
|
+
# the one currently specified in the config.
|
|
667
|
+
controller_resources_to_use = (
|
|
668
|
+
controller_handle.launched_resources.copy(
|
|
669
|
+
autostop=controller_resources_config_copied.get('autostop'))
|
|
670
|
+
)
|
|
538
671
|
|
|
539
672
|
# If the controller and replicas are from the same cloud (and region/zone),
|
|
540
673
|
# it should provide better connectivity. We will let the controller choose
|
|
@@ -595,8 +728,9 @@ def get_controller_resources(
|
|
|
595
728
|
controller_zone = controller_resources_to_use.zone
|
|
596
729
|
|
|
597
730
|
# Filter clouds if controller_resources_to_use.cloud is specified.
|
|
598
|
-
filtered_clouds =
|
|
599
|
-
|
|
731
|
+
filtered_clouds: Set[str] = {controller_cloud
|
|
732
|
+
} if controller_cloud is not None else set(
|
|
733
|
+
requested_clouds_with_region_zone.keys())
|
|
600
734
|
|
|
601
735
|
# Filter regions and zones and construct the result.
|
|
602
736
|
result: Set[resources.Resources] = set()
|
|
@@ -605,15 +739,17 @@ def get_controller_resources(
|
|
|
605
739
|
{None: {None}})
|
|
606
740
|
|
|
607
741
|
# Filter regions if controller_resources_to_use.region is specified.
|
|
608
|
-
filtered_regions = ({
|
|
609
|
-
|
|
742
|
+
filtered_regions: Set[Optional[str]] = ({
|
|
743
|
+
controller_region
|
|
744
|
+
} if controller_region is not None else set(regions.keys()))
|
|
610
745
|
|
|
611
746
|
for region in filtered_regions:
|
|
612
747
|
zones = regions.get(region, {None})
|
|
613
748
|
|
|
614
749
|
# Filter zones if controller_resources_to_use.zone is specified.
|
|
615
|
-
filtered_zones = ({
|
|
616
|
-
|
|
750
|
+
filtered_zones: Set[Optional[str]] = ({
|
|
751
|
+
controller_zone
|
|
752
|
+
} if controller_zone is not None else set(zones))
|
|
617
753
|
|
|
618
754
|
# Create combinations of cloud, region, and zone.
|
|
619
755
|
for zone in filtered_zones:
|
|
@@ -628,38 +764,15 @@ def get_controller_resources(
|
|
|
628
764
|
return result
|
|
629
765
|
|
|
630
766
|
|
|
631
|
-
def
|
|
632
|
-
|
|
633
|
-
|
|
634
|
-
|
|
635
|
-
|
|
636
|
-
|
|
637
|
-
|
|
638
|
-
|
|
639
|
-
|
|
640
|
-
controller.value.default_autostop_config)
|
|
641
|
-
if skypilot_config.loaded():
|
|
642
|
-
custom_controller_autostop_config = skypilot_config.get_nested(
|
|
643
|
-
(controller.value.controller_type, 'controller', 'autostop'), None)
|
|
644
|
-
if custom_controller_autostop_config is False:
|
|
645
|
-
# Disabled with `autostop: false` in config.
|
|
646
|
-
# To indicate autostop is disabled, we return None for
|
|
647
|
-
# idle_minutes_to_autostop.
|
|
648
|
-
return None, False
|
|
649
|
-
elif custom_controller_autostop_config is True:
|
|
650
|
-
# Enabled with default values. There is no change in behavior, but
|
|
651
|
-
# this is included by for completeness, since `False` is valid.
|
|
652
|
-
pass
|
|
653
|
-
elif custom_controller_autostop_config is not None:
|
|
654
|
-
# We have specific config values.
|
|
655
|
-
# Override the controller autostop config with the ones specified in
|
|
656
|
-
# the config.
|
|
657
|
-
assert isinstance(custom_controller_autostop_config, dict)
|
|
658
|
-
controller_autostop_config_copied.update(
|
|
659
|
-
custom_controller_autostop_config)
|
|
660
|
-
|
|
661
|
-
return (controller_autostop_config_copied['idle_minutes'],
|
|
662
|
-
controller_autostop_config_copied['down'])
|
|
767
|
+
def get_controller_mem_size_gb() -> float:
    """Return the controller memory size in GB.

    The value stored in constants.CONTROLLER_K8S_MEMORY_FILE takes precedence
    when that file exists; otherwise the size reported by
    common_utils.get_mem_size_gb() is returned.

    Raises:
        ValueError: If the memory file exists but its content cannot be parsed
            as a float.
    """
    memory_file = os.path.expanduser(constants.CONTROLLER_K8S_MEMORY_FILE)
    try:
        with open(memory_file, 'r', encoding='utf-8') as f:
            raw_value = f.read()
    except FileNotFoundError:
        # No override file: fall back to the detected memory size.
        return common_utils.get_mem_size_gb()
    return float(raw_value)
|
|
663
776
|
|
|
664
777
|
|
|
665
778
|
def _setup_proxy_command_on_controller(
|
|
@@ -690,7 +803,7 @@ def _setup_proxy_command_on_controller(
|
|
|
690
803
|
# NOTE: suppose that we have a controller in old VPC, then user
|
|
691
804
|
# changes 'vpc_name' in the config and does a 'job launch' /
|
|
692
805
|
# 'serve up'. In general, the old controller may not successfully
|
|
693
|
-
# launch the job in the new VPC. This happens if the two VPCs don
|
|
806
|
+
# launch the job in the new VPC. This happens if the two VPCs don't
|
|
694
807
|
# have peering set up. Like other places in the code, we assume
|
|
695
808
|
# properly setting up networking is user's responsibilities.
|
|
696
809
|
# TODO(zongheng): consider adding a basic check that checks
|
|
@@ -701,7 +814,11 @@ def _setup_proxy_command_on_controller(
|
|
|
701
814
|
config = config_utils.Config.from_dict(user_config)
|
|
702
815
|
proxy_command_key = (str(controller_launched_cloud).lower(),
|
|
703
816
|
'ssh_proxy_command')
|
|
704
|
-
ssh_proxy_command =
|
|
817
|
+
ssh_proxy_command = skypilot_config.get_effective_region_config(
|
|
818
|
+
cloud=str(controller_launched_cloud).lower(),
|
|
819
|
+
region=None,
|
|
820
|
+
keys=('ssh_proxy_command',),
|
|
821
|
+
default_value=None)
|
|
705
822
|
if isinstance(ssh_proxy_command, str):
|
|
706
823
|
config.set_nested(proxy_command_key, None)
|
|
707
824
|
elif isinstance(ssh_proxy_command, dict):
|
|
@@ -731,9 +848,9 @@ def replace_skypilot_config_path_in_file_mounts(
|
|
|
731
848
|
continue
|
|
732
849
|
if local_path.endswith(_LOCAL_SKYPILOT_CONFIG_PATH_SUFFIX):
|
|
733
850
|
with tempfile.NamedTemporaryFile('w', delete=False) as f:
|
|
734
|
-
user_config =
|
|
851
|
+
user_config = yaml_utils.read_yaml(local_path)
|
|
735
852
|
config = _setup_proxy_command_on_controller(cloud, user_config)
|
|
736
|
-
|
|
853
|
+
yaml_utils.dump_yaml(f.name, dict(**config))
|
|
737
854
|
file_mounts[remote_path] = f.name
|
|
738
855
|
replaced = True
|
|
739
856
|
if replaced:
|
|
@@ -776,7 +893,7 @@ def translate_local_file_mounts_to_two_hop(
|
|
|
776
893
|
file_mount_id = 0
|
|
777
894
|
|
|
778
895
|
file_mounts_to_translate = task.file_mounts or {}
|
|
779
|
-
if task.workdir is not None:
|
|
896
|
+
if task.workdir is not None and isinstance(task.workdir, str):
|
|
780
897
|
file_mounts_to_translate[constants.SKY_REMOTE_WORKDIR] = task.workdir
|
|
781
898
|
task.workdir = None
|
|
782
899
|
|
|
@@ -844,7 +961,8 @@ def maybe_translate_local_file_mounts_and_sync_up(task: 'task_lib.Task',
|
|
|
844
961
|
copy_mounts = {}
|
|
845
962
|
|
|
846
963
|
has_local_source_paths_file_mounts = bool(copy_mounts)
|
|
847
|
-
has_local_source_paths_workdir = task.workdir is not None
|
|
964
|
+
has_local_source_paths_workdir = (task.workdir is not None and
|
|
965
|
+
isinstance(task.workdir, str))
|
|
848
966
|
|
|
849
967
|
msg = None
|
|
850
968
|
if has_local_source_paths_workdir and has_local_source_paths_file_mounts:
|
|
@@ -892,7 +1010,7 @@ def maybe_translate_local_file_mounts_and_sync_up(task: 'task_lib.Task',
|
|
|
892
1010
|
|
|
893
1011
|
# Step 1: Translate the workdir to SkyPilot storage.
|
|
894
1012
|
new_storage_mounts = {}
|
|
895
|
-
if task.workdir is not None:
|
|
1013
|
+
if task.workdir is not None and isinstance(task.workdir, str):
|
|
896
1014
|
workdir = task.workdir
|
|
897
1015
|
task.workdir = None
|
|
898
1016
|
if (constants.SKY_REMOTE_WORKDIR in original_file_mounts or
|
|
@@ -1113,3 +1231,179 @@ def maybe_translate_local_file_mounts_and_sync_up(task: 'task_lib.Task',
|
|
|
1113
1231
|
task.update_storage_mounts(updated_mount_storages)
|
|
1114
1232
|
if msg:
|
|
1115
1233
|
logger.info(ux_utils.finishing_message('Uploaded local files/folders.'))
|
|
1234
|
+
|
|
1235
|
+
|
|
1236
|
+
# ======================= Resources Management Functions =======================
|
|
1237
|
+
|
|
1238
|
+
# The monitoring process for a service takes 512MB. This is based on an old
|
|
1239
|
+
# estimation but we keep it here for now.
|
|
1240
|
+
# TODO(tian): Remeasure this.
|
|
1241
|
+
SERVE_MONITORING_MEMORY_MB = 512
|
|
1242
|
+
# The resource consumption ratio of service launch to serve down.
|
|
1243
|
+
SERVE_LAUNCH_RATIO = 2.0
|
|
1244
|
+
|
|
1245
|
+
# The _RESOURCES_LOCK should be held whenever we are checking the parallelism
|
|
1246
|
+
# control or updating the schedule_state of any job or service. Any code that
|
|
1247
|
+
# takes this lock must conclude by calling maybe_schedule_next_jobs.
|
|
1248
|
+
_RESOURCES_LOCK = '~/.sky/locks/controller_resources.lock'
|
|
1249
|
+
|
|
1250
|
+
# keep 2GB reserved after the controllers
|
|
1251
|
+
MAXIMUM_CONTROLLER_RESERVED_MEMORY_MB = 2048
|
|
1252
|
+
|
|
1253
|
+
# NOTE: In the current implementation, we only consider the memory.
|
|
1254
|
+
# The ratio of resources consumption for managed jobs and pool/serve.
|
|
1255
|
+
# This measures pool_resources / jobs_resources. If 2 GB memory is allocated to
|
|
1256
|
+
# jobs, then 2 * POOL_JOBS_RESOURCES_RATIO GB memory is allocated to pool/serve.
|
|
1257
|
+
POOL_JOBS_RESOURCES_RATIO = 1
|
|
1258
|
+
# Number of ongoing launches allowed per worker. Can probably be
|
|
1259
|
+
# increased a bit to around 16, but keeping it lower just to be safe.
|
|
1260
|
+
LAUNCHES_PER_WORKER = 8
|
|
1261
|
+
# Number of ongoing launches allowed per service. Can probably be increased
|
|
1262
|
+
# a bit as well.
|
|
1263
|
+
LAUNCHES_PER_SERVICE = 4
|
|
1264
|
+
|
|
1265
|
+
# Based on testing, each worker takes around 200-300MB memory. Keeping it
|
|
1266
|
+
# higher to be safe.
|
|
1267
|
+
JOB_WORKER_MEMORY_MB = 400
|
|
1268
|
+
# This can probably be increased to around 300-400, but keeping it lower just
|
|
1269
|
+
# to be safe
|
|
1270
|
+
MAX_JOBS_PER_WORKER = 200
|
|
1271
|
+
# Maximum number of controllers that can be running. Hard to handle more than
|
|
1272
|
+
# 512 launches at once.
|
|
1273
|
+
MAX_CONTROLLERS = 512 // LAUNCHES_PER_WORKER
|
|
1274
|
+
# Limit the number of jobs that can be running at once on the entire jobs
|
|
1275
|
+
# controller cluster. It's hard to handle cancellation of more than 2000 jobs at
|
|
1276
|
+
# once.
|
|
1277
|
+
# TODO(cooperc): Once we eliminate static bottlenecks (e.g. sqlite), remove this
|
|
1278
|
+
# hardcoded max limit.
|
|
1279
|
+
MAX_TOTAL_RUNNING_JOBS = 2000
|
|
1280
|
+
|
|
1281
|
+
|
|
1282
|
+
def compute_memory_reserved_for_controllers(
        reserve_for_controllers: bool, reserve_extra_for_pool: bool) -> float:
    """Compute the amount of memory (in MB) to keep reserved.

    Args:
        reserve_for_controllers: If False, nothing is reserved and 0.0 is
            returned regardless of `reserve_extra_for_pool`.
        reserve_extra_for_pool: If True, the base reservation is scaled by
            (1 + POOL_JOBS_RESOURCES_RATIO).

    Returns:
        The reserved memory in MB, as a float.
    """
    if not reserve_for_controllers:
        return 0.0
    base_reserved_mb = float(MAXIMUM_CONTROLLER_RESERVED_MEMORY_MB)
    if not reserve_extra_for_pool:
        return base_reserved_mb
    return base_reserved_mb * (1. + POOL_JOBS_RESOURCES_RATIO)
|
|
1290
|
+
|
|
1291
|
+
|
|
1292
|
+
def _get_total_usable_memory_mb(pool: bool, consolidation_mode: bool) -> float:
    """Return the memory (in MB) available after fixed reservations.

    The controller reservation is always subtracted from the machine memory.
    In consolidation mode, the memory of all long and short API server workers
    (guaranteed plus burstable), as computed by
    server_config.compute_server_config, is subtracted as well.

    Args:
        pool: Whether extra memory should be reserved for pool.
        consolidation_mode: Whether consolidation mode is enabled.
    """
    reserved_mb = compute_memory_reserved_for_controllers(
        reserve_for_controllers=True, reserve_extra_for_pool=pool)
    available_mb = common_utils.get_mem_size_gb() * 1024 - reserved_mb
    if consolidation_mode:
        api_server_config = server_config.compute_server_config(
            deploy=True, quiet=True, reserved_memory_mb=reserved_mb)
        long_workers = api_server_config.long_worker_config
        workers_mb = 0.0
        workers_mb += ((long_workers.garanteed_parallelism +
                        long_workers.burstable_parallelism) *
                       server_config.LONG_WORKER_MEM_GB * 1024)
        short_workers = api_server_config.short_worker_config
        workers_mb += ((short_workers.garanteed_parallelism +
                        short_workers.burstable_parallelism) *
                       server_config.SHORT_WORKER_MEM_GB * 1024)
        return available_mb - workers_mb
    return available_mb
|
|
1309
|
+
|
|
1310
|
+
|
|
1311
|
+
def _is_consolidation_mode(pool: bool) -> bool:
    """Read the `consolidation_mode` flag of the relevant controller config.

    The flag is looked up under `jobs.controller` when `pool` is truthy and
    under `serve.controller` otherwise; it defaults to False.
    """
    section = 'jobs' if pool else 'serve'
    config_key = (section, 'controller', 'consolidation_mode')
    return skypilot_config.get_nested(config_key, default_value=False)
|
|
1315
|
+
|
|
1316
|
+
|
|
1317
|
+
@annotations.lru_cache(scope='request')
def _get_parallelism(pool: bool, raw_resource_per_unit: float) -> int:
    """Returns how many jobs controllers / services fit in the usable memory.

    The usable memory is divided by the per-unit memory cost, and the result
    is never lower than 1.

    In consolidation mode, the API server is assumed to run in deployment mode
    and to do its own resource management (i.e. how many requests are
    allowed), so no per-unit worker memory is added.

    Otherwise, a local API server runs on the jobs/serve controller and we do
    the resource management ourselves: each unit is additionally charged the
    memory of its concurrent launches (LAUNCHES_PER_WORKER for pool,
    LAUNCHES_PER_SERVICE otherwise), each costing one long worker.

    Args:
        pool: Whether the units are for pool; if so, the per-unit cost is
            scaled by (1 + POOL_JOBS_RESOURCES_RATIO) to account for the
            resources consumed by the jobs.
        raw_resource_per_unit: Base memory (in MB) of a single unit.
    """
    in_consolidation_mode = _is_consolidation_mode(pool)
    usable_memory_mb = _get_total_usable_memory_mb(pool, in_consolidation_mode)

    if in_consolidation_mode:
        # Launch requests are handled (and limited) by the API server.
        worker_memory_per_unit = 0.
    else:
        concurrent_launches = (LAUNCHES_PER_WORKER
                               if pool else LAUNCHES_PER_SERVICE)
        worker_memory_per_unit = (concurrent_launches *
                                  server_config.LONG_WORKER_MEM_GB * 1024)

    # If running pool on jobs controller, we need to account for the resources
    # consumed by the jobs.
    pool_multiplier = (1. + POOL_JOBS_RESOURCES_RATIO) if pool else 1.
    memory_per_unit = pool_multiplier * (raw_resource_per_unit +
                                         worker_memory_per_unit)

    return max(int(usable_memory_mb / memory_per_unit), 1)
|
|
1356
|
+
|
|
1357
|
+
|
|
1358
|
+
def get_number_of_jobs_controllers() -> int:
    """Return the number of jobs controllers to run, capped at MAX_CONTROLLERS.

    The uncapped value is the pool parallelism computed with
    JOB_WORKER_MEMORY_MB as the per-unit memory.
    """
    memory_based_limit = _get_parallelism(
        pool=True, raw_resource_per_unit=JOB_WORKER_MEMORY_MB)
    return min(MAX_CONTROLLERS, memory_based_limit)
|
|
1362
|
+
|
|
1363
|
+
|
|
1364
|
+
@annotations.lru_cache(scope='global', maxsize=1)
def get_resources_lock_path() -> str:
    """Return the expanded path of the controller resources lock file.

    The parent directory of the lock file is created if it does not exist.
    """
    lock_path = os.path.expanduser(_RESOURCES_LOCK)
    lock_dir = os.path.dirname(lock_path)
    os.makedirs(lock_dir, exist_ok=True)
    return lock_path
|
|
1369
|
+
|
|
1370
|
+
|
|
1371
|
+
def _get_number_of_services(pool: bool) -> int:
    """Return the parallelism for services (or pools, if `pool` is truthy).

    The per-unit memory is SERVE_MONITORING_MEMORY_MB scaled by
    POOL_JOBS_RESOURCES_RATIO.
    """
    memory_per_service_mb = (SERVE_MONITORING_MEMORY_MB *
                             POOL_JOBS_RESOURCES_RATIO)
    return _get_parallelism(pool=pool,
                            raw_resource_per_unit=memory_per_service_mb)
|
|
1375
|
+
|
|
1376
|
+
|
|
1377
|
+
@annotations.lru_cache(scope='request')
def _get_request_parallelism(pool: bool) -> int:
    """Return the number of concurrent launch requests allowed.

    For non-pool callers, the environment variable named by
    constants.SERVE_OVERRIDE_CONCURRENT_LAUNCHES overrides the computed value
    when it is set. Otherwise the result is the per-unit launch limit
    multiplied by POOL_JOBS_RESOURCES_RATIO and the number of services.

    Raises:
        ValueError: If the override is used and is not a valid integer.
    """
    # NOTE(dev): One smoke test depends on this value.
    # tests/smoke_tests/test_sky_serve.py::test_skyserve_new_autoscaler_update
    # assumes 4 concurrent launches.
    env_override = os.environ.get(constants.SERVE_OVERRIDE_CONCURRENT_LAUNCHES,
                                  None)
    if not pool and env_override is not None:
        return int(env_override)
    # Limitation per service x number of services
    launches_per_unit = (LAUNCHES_PER_WORKER if pool else LAUNCHES_PER_SERVICE)
    return (launches_per_unit * POOL_JOBS_RESOURCES_RATIO *
            _get_number_of_services(pool))
|
|
1391
|
+
|
|
1392
|
+
|
|
1393
|
+
def can_provision(pool: bool) -> bool:
    """Return whether a new provision request may be issued.

    Currently this applies exactly the same check as can_terminate().
    """
    # TODO(tian): probe API server to see if there is any pending provision
    # requests.
    provision_allowed = can_terminate(pool)
    return provision_allowed
|
|
1397
|
+
|
|
1398
|
+
|
|
1399
|
+
def can_start_new_process(pool: bool) -> bool:
    """Return whether the current service count is below the allowed number.

    The count comes from serve_state.get_num_services() and the limit from
    _get_number_of_services(pool).
    """
    current_services = serve_state.get_num_services()
    allowed_services = _get_number_of_services(pool)
    return current_services < allowed_services
|
|
1401
|
+
|
|
1402
|
+
|
|
1403
|
+
def can_terminate(pool: bool) -> bool:
    """Return whether the in-flight replica operations are below the limit.

    The load is the number of provisioning replicas plus the number of
    terminating replicas divided by SERVE_LAUNCH_RATIO; it is compared against
    _get_request_parallelism(pool).
    """
    # TODO(tian): probe API server to see if there is any pending terminate
    # requests.
    provisioning = serve_state.total_number_provisioning_replicas()
    terminating = serve_state.total_number_terminating_replicas()
    weighted_in_flight = provisioning + terminating / SERVE_LAUNCH_RATIO
    return weighted_in_flight < _get_request_parallelism(pool)
|