skypilot-nightly 1.0.0.dev20250502__py3-none-any.whl → 1.0.0.dev20251203__py3-none-any.whl
This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
- sky/__init__.py +22 -6
- sky/adaptors/aws.py +81 -16
- sky/adaptors/common.py +25 -2
- sky/adaptors/coreweave.py +278 -0
- sky/adaptors/do.py +8 -2
- sky/adaptors/gcp.py +11 -0
- sky/adaptors/hyperbolic.py +8 -0
- sky/adaptors/ibm.py +5 -2
- sky/adaptors/kubernetes.py +149 -18
- sky/adaptors/nebius.py +173 -30
- sky/adaptors/primeintellect.py +1 -0
- sky/adaptors/runpod.py +68 -0
- sky/adaptors/seeweb.py +183 -0
- sky/adaptors/shadeform.py +89 -0
- sky/admin_policy.py +187 -4
- sky/authentication.py +179 -225
- sky/backends/__init__.py +4 -2
- sky/backends/backend.py +22 -9
- sky/backends/backend_utils.py +1323 -397
- sky/backends/cloud_vm_ray_backend.py +1749 -1029
- sky/backends/docker_utils.py +1 -1
- sky/backends/local_docker_backend.py +11 -6
- sky/backends/task_codegen.py +633 -0
- sky/backends/wheel_utils.py +55 -9
- sky/{clouds/service_catalog → catalog}/__init__.py +21 -19
- sky/{clouds/service_catalog → catalog}/aws_catalog.py +27 -8
- sky/{clouds/service_catalog → catalog}/azure_catalog.py +10 -7
- sky/{clouds/service_catalog → catalog}/common.py +90 -49
- sky/{clouds/service_catalog → catalog}/cudo_catalog.py +8 -5
- sky/{clouds/service_catalog → catalog}/data_fetchers/analyze.py +1 -1
- sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_aws.py +116 -80
- sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_cudo.py +38 -38
- sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_gcp.py +70 -16
- sky/catalog/data_fetchers/fetch_hyperbolic.py +136 -0
- sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_lambda_cloud.py +1 -0
- sky/catalog/data_fetchers/fetch_nebius.py +338 -0
- sky/catalog/data_fetchers/fetch_runpod.py +698 -0
- sky/catalog/data_fetchers/fetch_seeweb.py +329 -0
- sky/catalog/data_fetchers/fetch_shadeform.py +142 -0
- sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_vast.py +1 -1
- sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_vsphere.py +1 -1
- sky/{clouds/service_catalog → catalog}/do_catalog.py +5 -2
- sky/{clouds/service_catalog → catalog}/fluidstack_catalog.py +6 -3
- sky/{clouds/service_catalog → catalog}/gcp_catalog.py +41 -15
- sky/catalog/hyperbolic_catalog.py +136 -0
- sky/{clouds/service_catalog → catalog}/ibm_catalog.py +9 -6
- sky/{clouds/service_catalog → catalog}/kubernetes_catalog.py +36 -24
- sky/{clouds/service_catalog → catalog}/lambda_catalog.py +9 -6
- sky/{clouds/service_catalog → catalog}/nebius_catalog.py +9 -7
- sky/{clouds/service_catalog → catalog}/oci_catalog.py +9 -6
- sky/{clouds/service_catalog → catalog}/paperspace_catalog.py +5 -2
- sky/catalog/primeintellect_catalog.py +95 -0
- sky/{clouds/service_catalog → catalog}/runpod_catalog.py +11 -4
- sky/{clouds/service_catalog → catalog}/scp_catalog.py +9 -6
- sky/catalog/seeweb_catalog.py +184 -0
- sky/catalog/shadeform_catalog.py +165 -0
- sky/catalog/ssh_catalog.py +167 -0
- sky/{clouds/service_catalog → catalog}/vast_catalog.py +6 -3
- sky/{clouds/service_catalog → catalog}/vsphere_catalog.py +5 -2
- sky/check.py +533 -185
- sky/cli.py +5 -5975
- sky/client/{cli.py → cli/command.py} +2591 -1956
- sky/client/cli/deprecation_utils.py +99 -0
- sky/client/cli/flags.py +359 -0
- sky/client/cli/table_utils.py +322 -0
- sky/client/cli/utils.py +79 -0
- sky/client/common.py +78 -32
- sky/client/oauth.py +82 -0
- sky/client/sdk.py +1219 -319
- sky/client/sdk_async.py +827 -0
- sky/client/service_account_auth.py +47 -0
- sky/cloud_stores.py +82 -3
- sky/clouds/__init__.py +13 -0
- sky/clouds/aws.py +564 -164
- sky/clouds/azure.py +105 -83
- sky/clouds/cloud.py +140 -40
- sky/clouds/cudo.py +68 -50
- sky/clouds/do.py +66 -48
- sky/clouds/fluidstack.py +63 -44
- sky/clouds/gcp.py +339 -110
- sky/clouds/hyperbolic.py +293 -0
- sky/clouds/ibm.py +70 -49
- sky/clouds/kubernetes.py +570 -162
- sky/clouds/lambda_cloud.py +74 -54
- sky/clouds/nebius.py +210 -81
- sky/clouds/oci.py +88 -66
- sky/clouds/paperspace.py +61 -44
- sky/clouds/primeintellect.py +317 -0
- sky/clouds/runpod.py +164 -74
- sky/clouds/scp.py +89 -86
- sky/clouds/seeweb.py +477 -0
- sky/clouds/shadeform.py +400 -0
- sky/clouds/ssh.py +263 -0
- sky/clouds/utils/aws_utils.py +10 -4
- sky/clouds/utils/gcp_utils.py +87 -11
- sky/clouds/utils/oci_utils.py +38 -14
- sky/clouds/utils/scp_utils.py +231 -167
- sky/clouds/vast.py +99 -77
- sky/clouds/vsphere.py +51 -40
- sky/core.py +375 -173
- sky/dag.py +15 -0
- sky/dashboard/out/404.html +1 -1
- sky/dashboard/out/_next/static/96_E2yl3QAiIJGOYCkSpB/_buildManifest.js +1 -0
- sky/dashboard/out/_next/static/chunks/1141-e6aa9ab418717c59.js +11 -0
- sky/dashboard/out/_next/static/chunks/1272-1ef0bf0237faccdb.js +1 -0
- sky/dashboard/out/_next/static/chunks/1871-7e202677c42f43fe.js +6 -0
- sky/dashboard/out/_next/static/chunks/2260-7703229c33c5ebd5.js +1 -0
- sky/dashboard/out/_next/static/chunks/2350.fab69e61bac57b23.js +1 -0
- sky/dashboard/out/_next/static/chunks/2369.fc20f0c2c8ed9fe7.js +15 -0
- sky/dashboard/out/_next/static/chunks/2755.edd818326d489a1d.js +26 -0
- sky/dashboard/out/_next/static/chunks/3294.20a8540fe697d5ee.js +1 -0
- sky/dashboard/out/_next/static/chunks/3785.7e245f318f9d1121.js +1 -0
- sky/dashboard/out/_next/static/chunks/3800-7b45f9fbb6308557.js +1 -0
- sky/dashboard/out/_next/static/chunks/3850-ff4a9a69d978632b.js +1 -0
- sky/dashboard/out/_next/static/chunks/3937.210053269f121201.js +1 -0
- sky/dashboard/out/_next/static/chunks/4725.172ede95d1b21022.js +1 -0
- sky/dashboard/out/_next/static/chunks/4937.a2baa2df5572a276.js +15 -0
- sky/dashboard/out/_next/static/chunks/5739-d67458fcb1386c92.js +8 -0
- sky/dashboard/out/_next/static/chunks/6130-2be46d70a38f1e82.js +1 -0
- sky/dashboard/out/_next/static/chunks/616-3d59f75e2ccf9321.js +39 -0
- sky/dashboard/out/_next/static/chunks/6212-7bd06f60ba693125.js +13 -0
- sky/dashboard/out/_next/static/chunks/6856-8f27d1c10c98def8.js +1 -0
- sky/dashboard/out/_next/static/chunks/6989-01359c57e018caa4.js +1 -0
- sky/dashboard/out/_next/static/chunks/6990-9146207c4567fdfd.js +1 -0
- sky/dashboard/out/_next/static/chunks/7359-c8d04e06886000b3.js +30 -0
- sky/dashboard/out/_next/static/chunks/7411-b15471acd2cba716.js +41 -0
- sky/dashboard/out/_next/static/chunks/7615-019513abc55b3b47.js +1 -0
- sky/dashboard/out/_next/static/chunks/8640.5b9475a2d18c5416.js +16 -0
- sky/dashboard/out/_next/static/chunks/8969-452f9d5cbdd2dc73.js +1 -0
- sky/dashboard/out/_next/static/chunks/9025.fa408f3242e9028d.js +6 -0
- sky/dashboard/out/_next/static/chunks/9353-cff34f7e773b2e2b.js +1 -0
- sky/dashboard/out/_next/static/chunks/9360.a536cf6b1fa42355.js +31 -0
- sky/dashboard/out/_next/static/chunks/9847.3aaca6bb33455140.js +30 -0
- sky/dashboard/out/_next/static/chunks/fd9d1056-86323a29a8f7e46a.js +1 -0
- sky/dashboard/out/_next/static/chunks/framework-cf60a09ccd051a10.js +33 -0
- sky/dashboard/out/_next/static/chunks/main-app-587214043926b3cc.js +1 -0
- sky/dashboard/out/_next/static/chunks/main-f15ccb73239a3bf1.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/_app-bde01e4a2beec258.js +34 -0
- sky/dashboard/out/_next/static/chunks/pages/_error-c66a4e8afc46f17b.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-792db96d918c98c9.js +16 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-abfcac9c137aa543.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters-ee39056f9851a3ff.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/config-dfb9bf07b13045f4.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/index-444f1804401f04ea.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/infra/[context]-c0b5935149902e6f.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/infra-aed0ea19df7cf961.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-d66997e2bfc837cf.js +16 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs/pools/[pool]-9faf940b253e3e06.js +21 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs-2072b48b617989c9.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/users-f42674164aa73423.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/volumes-b84b948ff357c43e.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/workspace/new-3f88a1c7e86a3f86.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/workspaces/[name]-84a40f8c7c627fe4.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/workspaces-531b2f8c4bf89f82.js +1 -0
- sky/dashboard/out/_next/static/chunks/webpack-64e05f17bf2cf8ce.js +1 -0
- sky/dashboard/out/_next/static/css/0748ce22df867032.css +3 -0
- sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
- sky/dashboard/out/clusters/[cluster].html +1 -1
- sky/dashboard/out/clusters.html +1 -1
- sky/dashboard/out/config.html +1 -0
- sky/dashboard/out/index.html +1 -1
- sky/dashboard/out/infra/[context].html +1 -0
- sky/dashboard/out/infra.html +1 -0
- sky/dashboard/out/jobs/[job].html +1 -1
- sky/dashboard/out/jobs/pools/[pool].html +1 -0
- sky/dashboard/out/jobs.html +1 -1
- sky/dashboard/out/users.html +1 -0
- sky/dashboard/out/volumes.html +1 -0
- sky/dashboard/out/workspace/new.html +1 -0
- sky/dashboard/out/workspaces/[name].html +1 -0
- sky/dashboard/out/workspaces.html +1 -0
- sky/data/data_utils.py +137 -1
- sky/data/mounting_utils.py +269 -84
- sky/data/storage.py +1460 -1807
- sky/data/storage_utils.py +43 -57
- sky/exceptions.py +126 -2
- sky/execution.py +216 -63
- sky/global_user_state.py +2390 -586
- sky/jobs/__init__.py +7 -0
- sky/jobs/client/sdk.py +300 -58
- sky/jobs/client/sdk_async.py +161 -0
- sky/jobs/constants.py +15 -8
- sky/jobs/controller.py +848 -275
- sky/jobs/file_content_utils.py +128 -0
- sky/jobs/log_gc.py +193 -0
- sky/jobs/recovery_strategy.py +402 -152
- sky/jobs/scheduler.py +314 -189
- sky/jobs/server/core.py +836 -255
- sky/jobs/server/server.py +156 -115
- sky/jobs/server/utils.py +136 -0
- sky/jobs/state.py +2109 -706
- sky/jobs/utils.py +1306 -215
- sky/logs/__init__.py +21 -0
- sky/logs/agent.py +108 -0
- sky/logs/aws.py +243 -0
- sky/logs/gcp.py +91 -0
- sky/metrics/__init__.py +0 -0
- sky/metrics/utils.py +453 -0
- sky/models.py +78 -1
- sky/optimizer.py +164 -70
- sky/provision/__init__.py +90 -4
- sky/provision/aws/config.py +147 -26
- sky/provision/aws/instance.py +136 -50
- sky/provision/azure/instance.py +11 -6
- sky/provision/common.py +13 -1
- sky/provision/cudo/cudo_machine_type.py +1 -1
- sky/provision/cudo/cudo_utils.py +14 -8
- sky/provision/cudo/cudo_wrapper.py +72 -71
- sky/provision/cudo/instance.py +10 -6
- sky/provision/do/instance.py +10 -6
- sky/provision/do/utils.py +4 -3
- sky/provision/docker_utils.py +140 -33
- sky/provision/fluidstack/instance.py +13 -8
- sky/provision/gcp/__init__.py +1 -0
- sky/provision/gcp/config.py +301 -19
- sky/provision/gcp/constants.py +218 -0
- sky/provision/gcp/instance.py +36 -8
- sky/provision/gcp/instance_utils.py +18 -4
- sky/provision/gcp/volume_utils.py +247 -0
- sky/provision/hyperbolic/__init__.py +12 -0
- sky/provision/hyperbolic/config.py +10 -0
- sky/provision/hyperbolic/instance.py +437 -0
- sky/provision/hyperbolic/utils.py +373 -0
- sky/provision/instance_setup.py +101 -20
- sky/provision/kubernetes/__init__.py +5 -0
- sky/provision/kubernetes/config.py +9 -52
- sky/provision/kubernetes/constants.py +17 -0
- sky/provision/kubernetes/instance.py +919 -280
- sky/provision/kubernetes/manifests/fusermount-server-daemonset.yaml +1 -2
- sky/provision/kubernetes/network.py +27 -17
- sky/provision/kubernetes/network_utils.py +44 -43
- sky/provision/kubernetes/utils.py +1221 -534
- sky/provision/kubernetes/volume.py +343 -0
- sky/provision/lambda_cloud/instance.py +22 -16
- sky/provision/nebius/constants.py +50 -0
- sky/provision/nebius/instance.py +19 -6
- sky/provision/nebius/utils.py +237 -137
- sky/provision/oci/instance.py +10 -5
- sky/provision/paperspace/instance.py +10 -7
- sky/provision/paperspace/utils.py +1 -1
- sky/provision/primeintellect/__init__.py +10 -0
- sky/provision/primeintellect/config.py +11 -0
- sky/provision/primeintellect/instance.py +454 -0
- sky/provision/primeintellect/utils.py +398 -0
- sky/provision/provisioner.py +117 -36
- sky/provision/runpod/__init__.py +5 -0
- sky/provision/runpod/instance.py +27 -6
- sky/provision/runpod/utils.py +51 -18
- sky/provision/runpod/volume.py +214 -0
- sky/provision/scp/__init__.py +15 -0
- sky/provision/scp/config.py +93 -0
- sky/provision/scp/instance.py +707 -0
- sky/provision/seeweb/__init__.py +11 -0
- sky/provision/seeweb/config.py +13 -0
- sky/provision/seeweb/instance.py +812 -0
- sky/provision/shadeform/__init__.py +11 -0
- sky/provision/shadeform/config.py +12 -0
- sky/provision/shadeform/instance.py +351 -0
- sky/provision/shadeform/shadeform_utils.py +83 -0
- sky/provision/ssh/__init__.py +18 -0
- sky/provision/vast/instance.py +13 -8
- sky/provision/vast/utils.py +10 -7
- sky/provision/volume.py +164 -0
- sky/provision/vsphere/common/ssl_helper.py +1 -1
- sky/provision/vsphere/common/vapiconnect.py +2 -1
- sky/provision/vsphere/common/vim_utils.py +4 -4
- sky/provision/vsphere/instance.py +15 -10
- sky/provision/vsphere/vsphere_utils.py +17 -20
- sky/py.typed +0 -0
- sky/resources.py +845 -119
- sky/schemas/__init__.py +0 -0
- sky/schemas/api/__init__.py +0 -0
- sky/schemas/api/responses.py +227 -0
- sky/schemas/db/README +4 -0
- sky/schemas/db/env.py +90 -0
- sky/schemas/db/global_user_state/001_initial_schema.py +124 -0
- sky/schemas/db/global_user_state/002_add_workspace_to_cluster_history.py +35 -0
- sky/schemas/db/global_user_state/003_fix_initial_revision.py +61 -0
- sky/schemas/db/global_user_state/004_is_managed.py +34 -0
- sky/schemas/db/global_user_state/005_cluster_event.py +32 -0
- sky/schemas/db/global_user_state/006_provision_log.py +41 -0
- sky/schemas/db/global_user_state/007_cluster_event_request_id.py +34 -0
- sky/schemas/db/global_user_state/008_skylet_ssh_tunnel_metadata.py +34 -0
- sky/schemas/db/global_user_state/009_last_activity_and_launched_at.py +89 -0
- sky/schemas/db/global_user_state/010_save_ssh_key.py +66 -0
- sky/schemas/db/global_user_state/011_is_ephemeral.py +34 -0
- sky/schemas/db/kv_cache/001_initial_schema.py +29 -0
- sky/schemas/db/script.py.mako +28 -0
- sky/schemas/db/serve_state/001_initial_schema.py +67 -0
- sky/schemas/db/serve_state/002_yaml_content.py +34 -0
- sky/schemas/db/skypilot_config/001_initial_schema.py +30 -0
- sky/schemas/db/spot_jobs/001_initial_schema.py +97 -0
- sky/schemas/db/spot_jobs/002_cluster_pool.py +42 -0
- sky/schemas/db/spot_jobs/003_pool_hash.py +34 -0
- sky/schemas/db/spot_jobs/004_job_file_contents.py +42 -0
- sky/schemas/db/spot_jobs/005_logs_gc.py +38 -0
- sky/schemas/db/spot_jobs/006_controller_pid_started_at.py +34 -0
- sky/schemas/db/spot_jobs/007_config_file_content.py +34 -0
- sky/schemas/generated/__init__.py +0 -0
- sky/schemas/generated/autostopv1_pb2.py +36 -0
- sky/schemas/generated/autostopv1_pb2.pyi +43 -0
- sky/schemas/generated/autostopv1_pb2_grpc.py +146 -0
- sky/schemas/generated/jobsv1_pb2.py +86 -0
- sky/schemas/generated/jobsv1_pb2.pyi +254 -0
- sky/schemas/generated/jobsv1_pb2_grpc.py +542 -0
- sky/schemas/generated/managed_jobsv1_pb2.py +76 -0
- sky/schemas/generated/managed_jobsv1_pb2.pyi +278 -0
- sky/schemas/generated/managed_jobsv1_pb2_grpc.py +278 -0
- sky/schemas/generated/servev1_pb2.py +58 -0
- sky/schemas/generated/servev1_pb2.pyi +115 -0
- sky/schemas/generated/servev1_pb2_grpc.py +322 -0
- sky/serve/autoscalers.py +357 -5
- sky/serve/client/impl.py +310 -0
- sky/serve/client/sdk.py +47 -139
- sky/serve/client/sdk_async.py +130 -0
- sky/serve/constants.py +12 -9
- sky/serve/controller.py +68 -17
- sky/serve/load_balancer.py +106 -60
- sky/serve/load_balancing_policies.py +116 -2
- sky/serve/replica_managers.py +434 -249
- sky/serve/serve_rpc_utils.py +179 -0
- sky/serve/serve_state.py +569 -257
- sky/serve/serve_utils.py +775 -265
- sky/serve/server/core.py +66 -711
- sky/serve/server/impl.py +1093 -0
- sky/serve/server/server.py +21 -18
- sky/serve/service.py +192 -89
- sky/serve/service_spec.py +144 -20
- sky/serve/spot_placer.py +3 -0
- sky/server/auth/__init__.py +0 -0
- sky/server/auth/authn.py +50 -0
- sky/server/auth/loopback.py +38 -0
- sky/server/auth/oauth2_proxy.py +202 -0
- sky/server/common.py +478 -182
- sky/server/config.py +85 -23
- sky/server/constants.py +44 -6
- sky/server/daemons.py +295 -0
- sky/server/html/token_page.html +185 -0
- sky/server/metrics.py +160 -0
- sky/server/middleware_utils.py +166 -0
- sky/server/requests/executor.py +558 -138
- sky/server/requests/payloads.py +364 -24
- sky/server/requests/preconditions.py +21 -17
- sky/server/requests/process.py +112 -29
- sky/server/requests/request_names.py +121 -0
- sky/server/requests/requests.py +822 -226
- sky/server/requests/serializers/decoders.py +82 -31
- sky/server/requests/serializers/encoders.py +140 -22
- sky/server/requests/threads.py +117 -0
- sky/server/rest.py +455 -0
- sky/server/server.py +1309 -285
- sky/server/state.py +20 -0
- sky/server/stream_utils.py +327 -61
- sky/server/uvicorn.py +217 -3
- sky/server/versions.py +270 -0
- sky/setup_files/MANIFEST.in +11 -1
- sky/setup_files/alembic.ini +160 -0
- sky/setup_files/dependencies.py +139 -31
- sky/setup_files/setup.py +44 -42
- sky/sky_logging.py +114 -7
- sky/skylet/attempt_skylet.py +106 -24
- sky/skylet/autostop_lib.py +129 -8
- sky/skylet/configs.py +29 -20
- sky/skylet/constants.py +216 -25
- sky/skylet/events.py +101 -21
- sky/skylet/job_lib.py +345 -164
- sky/skylet/log_lib.py +297 -18
- sky/skylet/log_lib.pyi +44 -1
- sky/skylet/providers/ibm/node_provider.py +12 -8
- sky/skylet/providers/ibm/vpc_provider.py +13 -12
- sky/skylet/ray_patches/__init__.py +17 -3
- sky/skylet/ray_patches/autoscaler.py.diff +18 -0
- sky/skylet/ray_patches/cli.py.diff +19 -0
- sky/skylet/ray_patches/command_runner.py.diff +17 -0
- sky/skylet/ray_patches/log_monitor.py.diff +20 -0
- sky/skylet/ray_patches/resource_demand_scheduler.py.diff +32 -0
- sky/skylet/ray_patches/updater.py.diff +18 -0
- sky/skylet/ray_patches/worker.py.diff +41 -0
- sky/skylet/runtime_utils.py +21 -0
- sky/skylet/services.py +568 -0
- sky/skylet/skylet.py +72 -4
- sky/skylet/subprocess_daemon.py +104 -29
- sky/skypilot_config.py +506 -99
- sky/ssh_node_pools/__init__.py +1 -0
- sky/ssh_node_pools/core.py +135 -0
- sky/ssh_node_pools/server.py +233 -0
- sky/task.py +685 -163
- sky/templates/aws-ray.yml.j2 +11 -3
- sky/templates/azure-ray.yml.j2 +2 -1
- sky/templates/cudo-ray.yml.j2 +1 -0
- sky/templates/do-ray.yml.j2 +2 -1
- sky/templates/fluidstack-ray.yml.j2 +1 -0
- sky/templates/gcp-ray.yml.j2 +62 -1
- sky/templates/hyperbolic-ray.yml.j2 +68 -0
- sky/templates/ibm-ray.yml.j2 +2 -1
- sky/templates/jobs-controller.yaml.j2 +27 -24
- sky/templates/kubernetes-loadbalancer.yml.j2 +2 -0
- sky/templates/kubernetes-ray.yml.j2 +611 -50
- sky/templates/lambda-ray.yml.j2 +2 -1
- sky/templates/nebius-ray.yml.j2 +34 -12
- sky/templates/oci-ray.yml.j2 +1 -0
- sky/templates/paperspace-ray.yml.j2 +2 -1
- sky/templates/primeintellect-ray.yml.j2 +72 -0
- sky/templates/runpod-ray.yml.j2 +10 -1
- sky/templates/scp-ray.yml.j2 +4 -50
- sky/templates/seeweb-ray.yml.j2 +171 -0
- sky/templates/shadeform-ray.yml.j2 +73 -0
- sky/templates/sky-serve-controller.yaml.j2 +22 -2
- sky/templates/vast-ray.yml.j2 +1 -0
- sky/templates/vsphere-ray.yml.j2 +1 -0
- sky/templates/websocket_proxy.py +212 -37
- sky/usage/usage_lib.py +31 -15
- sky/users/__init__.py +0 -0
- sky/users/model.conf +15 -0
- sky/users/permission.py +397 -0
- sky/users/rbac.py +121 -0
- sky/users/server.py +720 -0
- sky/users/token_service.py +218 -0
- sky/utils/accelerator_registry.py +35 -5
- sky/utils/admin_policy_utils.py +84 -38
- sky/utils/annotations.py +38 -5
- sky/utils/asyncio_utils.py +78 -0
- sky/utils/atomic.py +1 -1
- sky/utils/auth_utils.py +153 -0
- sky/utils/benchmark_utils.py +60 -0
- sky/utils/cli_utils/status_utils.py +159 -86
- sky/utils/cluster_utils.py +31 -9
- sky/utils/command_runner.py +354 -68
- sky/utils/command_runner.pyi +93 -3
- sky/utils/common.py +35 -8
- sky/utils/common_utils.py +314 -91
- sky/utils/config_utils.py +74 -5
- sky/utils/context.py +403 -0
- sky/utils/context_utils.py +242 -0
- sky/utils/controller_utils.py +383 -89
- sky/utils/dag_utils.py +31 -12
- sky/utils/db/__init__.py +0 -0
- sky/utils/db/db_utils.py +485 -0
- sky/utils/db/kv_cache.py +149 -0
- sky/utils/db/migration_utils.py +137 -0
- sky/utils/directory_utils.py +12 -0
- sky/utils/env_options.py +13 -0
- sky/utils/git.py +567 -0
- sky/utils/git_clone.sh +460 -0
- sky/utils/infra_utils.py +195 -0
- sky/utils/kubernetes/cleanup-tunnel.sh +62 -0
- sky/utils/kubernetes/config_map_utils.py +133 -0
- sky/utils/kubernetes/create_cluster.sh +15 -29
- sky/utils/kubernetes/delete_cluster.sh +10 -7
- sky/utils/kubernetes/deploy_ssh_node_pools.py +1177 -0
- sky/utils/kubernetes/exec_kubeconfig_converter.py +22 -31
- sky/utils/kubernetes/generate_kind_config.py +6 -66
- sky/utils/kubernetes/generate_kubeconfig.sh +4 -1
- sky/utils/kubernetes/gpu_labeler.py +18 -8
- sky/utils/kubernetes/k8s_gpu_labeler_job.yaml +2 -1
- sky/utils/kubernetes/k8s_gpu_labeler_setup.yaml +16 -16
- sky/utils/kubernetes/kubernetes_deploy_utils.py +284 -114
- sky/utils/kubernetes/rsync_helper.sh +11 -3
- sky/utils/kubernetes/ssh-tunnel.sh +379 -0
- sky/utils/kubernetes/ssh_utils.py +221 -0
- sky/utils/kubernetes_enums.py +8 -15
- sky/utils/lock_events.py +94 -0
- sky/utils/locks.py +416 -0
- sky/utils/log_utils.py +82 -107
- sky/utils/perf_utils.py +22 -0
- sky/utils/resource_checker.py +298 -0
- sky/utils/resources_utils.py +249 -32
- sky/utils/rich_utils.py +217 -39
- sky/utils/schemas.py +955 -160
- sky/utils/serialize_utils.py +16 -0
- sky/utils/status_lib.py +10 -0
- sky/utils/subprocess_utils.py +29 -15
- sky/utils/tempstore.py +70 -0
- sky/utils/thread_utils.py +91 -0
- sky/utils/timeline.py +26 -53
- sky/utils/ux_utils.py +84 -15
- sky/utils/validator.py +11 -1
- sky/utils/volume.py +165 -0
- sky/utils/yaml_utils.py +111 -0
- sky/volumes/__init__.py +13 -0
- sky/volumes/client/__init__.py +0 -0
- sky/volumes/client/sdk.py +150 -0
- sky/volumes/server/__init__.py +0 -0
- sky/volumes/server/core.py +270 -0
- sky/volumes/server/server.py +124 -0
- sky/volumes/volume.py +215 -0
- sky/workspaces/__init__.py +0 -0
- sky/workspaces/core.py +655 -0
- sky/workspaces/server.py +101 -0
- sky/workspaces/utils.py +56 -0
- sky_templates/README.md +3 -0
- sky_templates/__init__.py +3 -0
- sky_templates/ray/__init__.py +0 -0
- sky_templates/ray/start_cluster +183 -0
- sky_templates/ray/stop_cluster +75 -0
- skypilot_nightly-1.0.0.dev20251203.dist-info/METADATA +676 -0
- skypilot_nightly-1.0.0.dev20251203.dist-info/RECORD +611 -0
- {skypilot_nightly-1.0.0.dev20250502.dist-info → skypilot_nightly-1.0.0.dev20251203.dist-info}/WHEEL +1 -1
- skypilot_nightly-1.0.0.dev20251203.dist-info/top_level.txt +2 -0
- sky/benchmark/benchmark_state.py +0 -256
- sky/benchmark/benchmark_utils.py +0 -641
- sky/clouds/service_catalog/constants.py +0 -7
- sky/dashboard/out/_next/static/GWvVBSCS7FmUiVmjaL1a7/_buildManifest.js +0 -1
- sky/dashboard/out/_next/static/chunks/236-2db3ee3fba33dd9e.js +0 -6
- sky/dashboard/out/_next/static/chunks/312-c3c8845990db8ffc.js +0 -15
- sky/dashboard/out/_next/static/chunks/37-0a572fe0dbb89c4d.js +0 -6
- sky/dashboard/out/_next/static/chunks/678-206dddca808e6d16.js +0 -59
- sky/dashboard/out/_next/static/chunks/845-9e60713e0c441abc.js +0 -1
- sky/dashboard/out/_next/static/chunks/979-7bf73a4c7cea0f5c.js +0 -1
- sky/dashboard/out/_next/static/chunks/fd9d1056-2821b0f0cabcd8bd.js +0 -1
- sky/dashboard/out/_next/static/chunks/framework-87d061ee6ed71b28.js +0 -33
- sky/dashboard/out/_next/static/chunks/main-app-241eb28595532291.js +0 -1
- sky/dashboard/out/_next/static/chunks/main-e0e2335212e72357.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/_app-e6b013bc3f77ad60.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/_error-1be831200e60c5c0.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-6ac338bc2239cb45.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-f383db7389368ea7.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/clusters-a93b93e10b8b074e.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/index-f9f039532ca8cbc4.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-1c519e1afc523dc9.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/jobs-a75029b67aab6a2e.js +0 -1
- sky/dashboard/out/_next/static/chunks/webpack-830f59b8404e96b8.js +0 -1
- sky/dashboard/out/_next/static/css/c6933bbb2ce7f4dd.css +0 -3
- sky/jobs/dashboard/dashboard.py +0 -223
- sky/jobs/dashboard/static/favicon.ico +0 -0
- sky/jobs/dashboard/templates/index.html +0 -831
- sky/jobs/server/dashboard_utils.py +0 -69
- sky/skylet/providers/scp/__init__.py +0 -2
- sky/skylet/providers/scp/config.py +0 -149
- sky/skylet/providers/scp/node_provider.py +0 -578
- sky/templates/kubernetes-ssh-jump.yml.j2 +0 -94
- sky/utils/db_utils.py +0 -100
- sky/utils/kubernetes/deploy_remote_cluster.sh +0 -308
- sky/utils/kubernetes/ssh_jump_lifecycle_manager.py +0 -191
- skypilot_nightly-1.0.0.dev20250502.dist-info/METADATA +0 -361
- skypilot_nightly-1.0.0.dev20250502.dist-info/RECORD +0 -396
- skypilot_nightly-1.0.0.dev20250502.dist-info/top_level.txt +0 -1
- /sky/{clouds/service_catalog → catalog}/config.py +0 -0
- /sky/{benchmark → catalog/data_fetchers}/__init__.py +0 -0
- /sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_azure.py +0 -0
- /sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_fluidstack.py +0 -0
- /sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_ibm.py +0 -0
- /sky/{clouds/service_catalog/data_fetchers → client/cli}/__init__.py +0 -0
- /sky/dashboard/out/_next/static/{GWvVBSCS7FmUiVmjaL1a7 → 96_E2yl3QAiIJGOYCkSpB}/_ssgManifest.js +0 -0
- {skypilot_nightly-1.0.0.dev20250502.dist-info → skypilot_nightly-1.0.0.dev20251203.dist-info}/entry_points.txt +0 -0
- {skypilot_nightly-1.0.0.dev20250502.dist-info → skypilot_nightly-1.0.0.dev20251203.dist-info}/licenses/LICENSE +0 -0
sky/skylet/constants.py
CHANGED
|
@@ -6,6 +6,23 @@ from packaging import version
|
|
|
6
6
|
import sky
|
|
7
7
|
from sky.setup_files import dependencies
|
|
8
8
|
|
|
9
|
+
# The base directory for all SkyPilot runtime artifacts.
|
|
10
|
+
# Historically, we have always used $HOME, but we couldn't
|
|
11
|
+
# do that for Slurm, because $HOME typically points to a NFS
|
|
12
|
+
# mounted directory, which does not work well with SQLite.
|
|
13
|
+
# https://sqlite.org/faq.html#q5
|
|
14
|
+
# Additionally, having the skypilot-runtime python venv be
|
|
15
|
+
# on an NFS makes things very slow.
|
|
16
|
+
SKY_RUNTIME_DIR = '${SKY_RUNTIME_DIR:-$HOME}'
|
|
17
|
+
# Same as above but for use within python code instead of shell commands.
|
|
18
|
+
# Example usage:
|
|
19
|
+
# os.path.join(
|
|
20
|
+
# os.path.expanduser(os.environ.get(SKY_RUNTIME_DIR_ENV_VAR_KEY, '~')),
|
|
21
|
+
# '.sky/jobs.db')
|
|
22
|
+
SKY_RUNTIME_DIR_ENV_VAR_KEY = 'SKY_RUNTIME_DIR'
|
|
23
|
+
# We keep sky_logs and sky_workdir in $HOME, because
|
|
24
|
+
# these are artifacts that users can access, and having
|
|
25
|
+
# them be in $HOME makes it more convenient.
|
|
9
26
|
SKY_LOGS_DIRECTORY = '~/sky_logs'
|
|
10
27
|
SKY_REMOTE_WORKDIR = '~/sky_workdir'
|
|
11
28
|
SKY_IGNORE_FILE = '.skyignore'
|
|
@@ -24,22 +41,23 @@ SKY_REMOTE_RAY_PORT_DICT_STR = (
|
|
|
24
41
|
f'"ray_dashboard_port":{SKY_REMOTE_RAY_DASHBOARD_PORT}}}')
|
|
25
42
|
# The file contains the ports of the Ray cluster that SkyPilot launched,
|
|
26
43
|
# i.e. the PORT_DICT_STR above.
|
|
27
|
-
SKY_REMOTE_RAY_PORT_FILE = '
|
|
44
|
+
SKY_REMOTE_RAY_PORT_FILE = '.sky/ray_port.json'
|
|
28
45
|
SKY_REMOTE_RAY_TEMPDIR = '/tmp/ray_skypilot'
|
|
29
46
|
SKY_REMOTE_RAY_VERSION = '2.9.3'
|
|
30
47
|
|
|
48
|
+
SKY_UNSET_PYTHONPATH = 'env -u PYTHONPATH'
|
|
31
49
|
# We store the absolute path of the python executable (/opt/conda/bin/python3)
|
|
32
50
|
# in this file, so that any future internal commands that need to use python
|
|
33
51
|
# can use this path. This is useful for the case where the user has a custom
|
|
34
52
|
# conda environment as a default environment, which is not the same as the one
|
|
35
53
|
# used for installing SkyPilot runtime (ray and skypilot).
|
|
36
|
-
SKY_PYTHON_PATH_FILE = '
|
|
37
|
-
SKY_RAY_PATH_FILE = '
|
|
54
|
+
SKY_PYTHON_PATH_FILE = f'{SKY_RUNTIME_DIR}/.sky/python_path'
|
|
55
|
+
SKY_RAY_PATH_FILE = f'{SKY_RUNTIME_DIR}/.sky/ray_path'
|
|
38
56
|
SKY_GET_PYTHON_PATH_CMD = (f'[ -s {SKY_PYTHON_PATH_FILE} ] && '
|
|
39
57
|
f'cat {SKY_PYTHON_PATH_FILE} 2> /dev/null || '
|
|
40
58
|
'which python3')
|
|
41
59
|
# Python executable, e.g., /opt/conda/bin/python3
|
|
42
|
-
SKY_PYTHON_CMD = f'$({SKY_GET_PYTHON_PATH_CMD})'
|
|
60
|
+
SKY_PYTHON_CMD = f'{SKY_UNSET_PYTHONPATH} $({SKY_GET_PYTHON_PATH_CMD})'
|
|
43
61
|
# Prefer SKY_UV_PIP_CMD, which is faster.
|
|
44
62
|
# TODO(cooperc): remove remaining usage (GCP TPU setup).
|
|
45
63
|
SKY_PIP_CMD = f'{SKY_PYTHON_CMD} -m pip'
|
|
@@ -51,24 +69,33 @@ SKY_RAY_CMD = (f'{SKY_PYTHON_CMD} $([ -s {SKY_RAY_PATH_FILE} ] && '
|
|
|
51
69
|
f'cat {SKY_RAY_PATH_FILE} 2> /dev/null || which ray)')
|
|
52
70
|
# Separate env for SkyPilot runtime dependencies.
|
|
53
71
|
SKY_REMOTE_PYTHON_ENV_NAME = 'skypilot-runtime'
|
|
54
|
-
SKY_REMOTE_PYTHON_ENV: str = f'
|
|
72
|
+
SKY_REMOTE_PYTHON_ENV: str = f'{SKY_RUNTIME_DIR}/{SKY_REMOTE_PYTHON_ENV_NAME}'
|
|
55
73
|
ACTIVATE_SKY_REMOTE_PYTHON_ENV = f'source {SKY_REMOTE_PYTHON_ENV}/bin/activate'
|
|
56
74
|
# uv is used for venv and pip, much faster than python implementations.
|
|
57
75
|
SKY_UV_INSTALL_DIR = '"$HOME/.local/bin"'
|
|
58
|
-
|
|
76
|
+
# set UV_SYSTEM_PYTHON to false in case the
|
|
77
|
+
# user provided docker image set it to true.
|
|
78
|
+
# unset PYTHONPATH in case the user provided docker image set it.
|
|
79
|
+
SKY_UV_CMD = ('UV_SYSTEM_PYTHON=false '
|
|
80
|
+
f'{SKY_UNSET_PYTHONPATH} {SKY_UV_INSTALL_DIR}/uv')
|
|
59
81
|
# This won't reinstall uv if it's already installed, so it's safe to re-run.
|
|
60
82
|
SKY_UV_INSTALL_CMD = (f'{SKY_UV_CMD} -V >/dev/null 2>&1 || '
|
|
61
83
|
'curl -LsSf https://astral.sh/uv/install.sh '
|
|
62
84
|
f'| UV_INSTALL_DIR={SKY_UV_INSTALL_DIR} sh')
|
|
63
85
|
SKY_UV_PIP_CMD: str = (f'VIRTUAL_ENV={SKY_REMOTE_PYTHON_ENV} {SKY_UV_CMD} pip')
|
|
64
|
-
|
|
65
|
-
|
|
86
|
+
SKY_UV_RUN_CMD: str = (f'VIRTUAL_ENV={SKY_REMOTE_PYTHON_ENV} {SKY_UV_CMD} run '
|
|
87
|
+
'--no-project --no-config')
|
|
88
|
+
# Deleting the SKY_REMOTE_PYTHON_ENV_NAME from the PATH and unsetting relevant
|
|
89
|
+
# VIRTUAL_ENV envvars to deactivate the environment. `deactivate` command does
|
|
90
|
+
# not work when conda is used.
|
|
66
91
|
DEACTIVATE_SKY_REMOTE_PYTHON_ENV = (
|
|
67
92
|
'export PATH='
|
|
68
|
-
f'$(echo $PATH | sed "s|$(echo
|
|
93
|
+
f'$(echo $PATH | sed "s|$(echo {SKY_REMOTE_PYTHON_ENV})/bin:||") && '
|
|
94
|
+
'unset VIRTUAL_ENV && unset VIRTUAL_ENV_PROMPT')
|
|
69
95
|
|
|
70
96
|
# Prefix for SkyPilot environment variables
|
|
71
97
|
SKYPILOT_ENV_VAR_PREFIX = 'SKYPILOT_'
|
|
98
|
+
SKYPILOT_SERVER_ENV_VAR_PREFIX = 'SKYPILOT_SERVER_'
|
|
72
99
|
|
|
73
100
|
# The name for the environment variable that stores the unique ID of the
|
|
74
101
|
# current task. This will stay the same across multiple recoveries of the
|
|
@@ -89,17 +116,17 @@ TASK_ID_LIST_ENV_VAR = f'{SKYPILOT_ENV_VAR_PREFIX}TASK_IDS'
|
|
|
89
116
|
# cluster yaml is updated.
|
|
90
117
|
#
|
|
91
118
|
# TODO(zongheng,zhanghao): make the upgrading of skylet automatic?
|
|
92
|
-
SKYLET_VERSION = '
|
|
119
|
+
SKYLET_VERSION = '27'
|
|
93
120
|
# The version of the lib files that skylet/jobs use. Whenever there is an API
|
|
94
121
|
# change for the job_lib or log_lib, we need to bump this version, so that the
|
|
95
122
|
# user can be notified to update their SkyPilot version on the remote cluster.
|
|
96
|
-
SKYLET_LIB_VERSION =
|
|
97
|
-
SKYLET_VERSION_FILE = '
|
|
98
|
-
|
|
99
|
-
|
|
100
|
-
|
|
101
|
-
|
|
102
|
-
|
|
123
|
+
SKYLET_LIB_VERSION = 4
|
|
124
|
+
SKYLET_VERSION_FILE = '.sky/skylet_version'
|
|
125
|
+
SKYLET_LOG_FILE = '.sky/skylet.log'
|
|
126
|
+
SKYLET_PID_FILE = '.sky/skylet_pid'
|
|
127
|
+
SKYLET_PORT_FILE = '.sky/skylet_port'
|
|
128
|
+
SKYLET_GRPC_PORT = 46590
|
|
129
|
+
SKYLET_GRPC_TIMEOUT_SECONDS = 10
|
|
103
130
|
|
|
104
131
|
# Docker default options
|
|
105
132
|
DEFAULT_DOCKER_CONTAINER_NAME = 'sky_container'
|
|
@@ -151,7 +178,7 @@ CONDA_INSTALLATION_COMMANDS = (
|
|
|
151
178
|
# because for some images, conda is already installed, but not initialized.
|
|
152
179
|
# In this case, we need to initialize conda and set auto_activate_base to
|
|
153
180
|
# true.
|
|
154
|
-
'{ bash Miniconda3-Linux.sh -b; '
|
|
181
|
+
'{ bash Miniconda3-Linux.sh -b || true; '
|
|
155
182
|
'eval "$(~/miniconda3/bin/conda shell.bash hook)" && conda init && '
|
|
156
183
|
# Caller should replace {conda_auto_activate} with either true or false.
|
|
157
184
|
'conda config --set auto_activate_base {conda_auto_activate} && '
|
|
@@ -173,7 +200,7 @@ CONDA_INSTALLATION_COMMANDS = (
|
|
|
173
200
|
'fi;'
|
|
174
201
|
# Install uv for venv management and pip installation.
|
|
175
202
|
f'{SKY_UV_INSTALL_CMD};'
|
|
176
|
-
# Create a separate
|
|
203
|
+
# Create a separate python environment for SkyPilot dependencies.
|
|
177
204
|
f'[ -d {SKY_REMOTE_PYTHON_ENV} ] || '
|
|
178
205
|
# Do NOT use --system-site-packages here, because if users upgrade any
|
|
179
206
|
# packages in the base env, they interfere with skypilot dependencies.
|
|
@@ -218,7 +245,9 @@ RAY_INSTALLATION_COMMANDS = (
|
|
|
218
245
|
f'{SKY_UV_PIP_CMD} list | grep "ray " | '
|
|
219
246
|
f'grep {SKY_REMOTE_RAY_VERSION} 2>&1 > /dev/null '
|
|
220
247
|
f'|| {RAY_STATUS} || '
|
|
221
|
-
|
|
248
|
+
# The pydantic-core==2.41.3 for arm seems corrupted
|
|
249
|
+
# so we need to avoid that specific version.
|
|
250
|
+
f'{SKY_UV_PIP_CMD} install -U "ray[default]=={SKY_REMOTE_RAY_VERSION}" "pydantic-core==2.41.1"; ' # pylint: disable=line-too-long
|
|
222
251
|
# In some envs, e.g. pip does not have permission to write under /opt/conda
|
|
223
252
|
# ray package will be installed under ~/.local/bin. If the user's PATH does
|
|
224
253
|
# not include ~/.local/bin (the pip install will have the output: `WARNING:
|
|
@@ -230,9 +259,24 @@ RAY_INSTALLATION_COMMANDS = (
|
|
|
230
259
|
'export PATH=$PATH:$HOME/.local/bin; '
|
|
231
260
|
# Writes ray path to file if it does not exist or the file is empty.
|
|
232
261
|
f'[ -s {SKY_RAY_PATH_FILE} ] || '
|
|
233
|
-
f'{{ {
|
|
262
|
+
f'{{ {SKY_UV_RUN_CMD} '
|
|
234
263
|
f'which ray > {SKY_RAY_PATH_FILE} || exit 1; }}; ')
|
|
235
264
|
|
|
265
|
+
# Copy SkyPilot templates from the installed wheel to ~/sky_templates.
|
|
266
|
+
# This must run after the skypilot wheel is installed.
|
|
267
|
+
COPY_SKYPILOT_TEMPLATES_COMMANDS = (
|
|
268
|
+
f'{ACTIVATE_SKY_REMOTE_PYTHON_ENV}; '
|
|
269
|
+
f'{SKY_PYTHON_CMD} -c \''
|
|
270
|
+
'import sky_templates, shutil, os; '
|
|
271
|
+
'src = os.path.dirname(sky_templates.__file__); '
|
|
272
|
+
'dst = os.path.expanduser(\"~/sky_templates\"); '
|
|
273
|
+
'print(f\"Copying templates from {src} to {dst}...\"); '
|
|
274
|
+
'shutil.copytree(src, dst, dirs_exist_ok=True); '
|
|
275
|
+
'print(f\"Templates copied successfully\")\'; '
|
|
276
|
+
# Make scripts executable.
|
|
277
|
+
'find ~/sky_templates -type f ! -name "*.py" ! -name "*.md" '
|
|
278
|
+
'-exec chmod +x {} \\; ')
|
|
279
|
+
|
|
236
280
|
SKYPILOT_WHEEL_INSTALLATION_COMMANDS = (
|
|
237
281
|
f'{SKY_UV_INSTALL_CMD};'
|
|
238
282
|
f'{{ {SKY_UV_PIP_CMD} list | grep "skypilot " && '
|
|
@@ -323,6 +367,14 @@ FILE_MOUNTS_LOCAL_TMP_BASE_PATH = '~/.sky/tmp/'
|
|
|
323
367
|
# controller_utils.translate_local_file_mounts_to_two_hop().
|
|
324
368
|
FILE_MOUNTS_CONTROLLER_TMP_BASE_PATH = '~/.sky/tmp/controller'
|
|
325
369
|
|
|
370
|
+
# For passing in CPU and memory limits to the controller pod when running
|
|
371
|
+
# in k8s. Right now, we only use this for the jobs controller, but we may
|
|
372
|
+
# use this for the serve controller as well in the future.
|
|
373
|
+
# These files are written to disk by the skylet, who reads it from env vars
|
|
374
|
+
# passed by the backend when starting the skylet (start_skylet_on_head_node).
|
|
375
|
+
CONTROLLER_K8S_CPU_FILE = '~/.sky/_internal_k8s_pod_cpu'
|
|
376
|
+
CONTROLLER_K8S_MEMORY_FILE = '~/.sky/_internal_k8s_pod_memory'
|
|
377
|
+
|
|
326
378
|
# Used when an managed jobs are created and
|
|
327
379
|
# files are synced up to the cloud.
|
|
328
380
|
FILE_MOUNTS_WORKDIR_SUBPATH = 'job-{run_id}/workdir'
|
|
@@ -346,9 +398,16 @@ API_SERVER_CREATION_LOCK_PATH = '~/.sky/api_server/.creation.lock'
|
|
|
346
398
|
# API server.
|
|
347
399
|
SKY_API_SERVER_URL_ENV_VAR = f'{SKYPILOT_ENV_VAR_PREFIX}API_SERVER_ENDPOINT'
|
|
348
400
|
|
|
401
|
+
# The name for the environment variable that stores the SkyPilot service
|
|
402
|
+
# account token on client side.
|
|
403
|
+
SERVICE_ACCOUNT_TOKEN_ENV_VAR = (
|
|
404
|
+
f'{SKYPILOT_ENV_VAR_PREFIX}SERVICE_ACCOUNT_TOKEN')
|
|
405
|
+
|
|
349
406
|
# SkyPilot environment variables
|
|
350
407
|
SKYPILOT_NUM_NODES = f'{SKYPILOT_ENV_VAR_PREFIX}NUM_NODES'
|
|
351
408
|
SKYPILOT_NODE_IPS = f'{SKYPILOT_ENV_VAR_PREFIX}NODE_IPS'
|
|
409
|
+
SKYPILOT_SETUP_NUM_GPUS_PER_NODE = (
|
|
410
|
+
f'{SKYPILOT_ENV_VAR_PREFIX}SETUP_NUM_GPUS_PER_NODE')
|
|
352
411
|
SKYPILOT_NUM_GPUS_PER_NODE = f'{SKYPILOT_ENV_VAR_PREFIX}NUM_GPUS_PER_NODE'
|
|
353
412
|
SKYPILOT_NODE_RANK = f'{SKYPILOT_ENV_VAR_PREFIX}NODE_RANK'
|
|
354
413
|
|
|
@@ -358,7 +417,7 @@ SKY_SSH_USER_PLACEHOLDER = 'skypilot:ssh_user'
|
|
|
358
417
|
|
|
359
418
|
RCLONE_CONFIG_DIR = '~/.config/rclone'
|
|
360
419
|
RCLONE_CONFIG_PATH = f'{RCLONE_CONFIG_DIR}/rclone.conf'
|
|
361
|
-
|
|
420
|
+
RCLONE_MOUNT_CACHED_LOG_DIR = '~/.sky/rclone_log'
|
|
362
421
|
RCLONE_CACHE_DIR = '~/.cache/rclone'
|
|
363
422
|
RCLONE_CACHE_REFRESH_INTERVAL = 10
|
|
364
423
|
|
|
@@ -367,15 +426,43 @@ RCLONE_CACHE_REFRESH_INTERVAL = 10
|
|
|
367
426
|
OVERRIDEABLE_CONFIG_KEYS_IN_TASK: List[Tuple[str, ...]] = [
|
|
368
427
|
('docker', 'run_options'),
|
|
369
428
|
('nvidia_gpus', 'disable_ecc'),
|
|
429
|
+
('ssh', 'custom_metadata'),
|
|
430
|
+
('ssh', 'pod_config'),
|
|
431
|
+
('ssh', 'provision_timeout'),
|
|
432
|
+
('kubernetes', 'custom_metadata'),
|
|
370
433
|
('kubernetes', 'pod_config'),
|
|
371
434
|
('kubernetes', 'provision_timeout'),
|
|
435
|
+
('kubernetes', 'dws'),
|
|
436
|
+
('kubernetes', 'kueue'),
|
|
372
437
|
('gcp', 'managed_instance_group'),
|
|
438
|
+
('gcp', 'enable_gvnic'),
|
|
439
|
+
('gcp', 'enable_gpu_direct'),
|
|
440
|
+
('gcp', 'placement_policy'),
|
|
441
|
+
('active_workspace',),
|
|
373
442
|
]
|
|
374
443
|
# When overriding the SkyPilot configs on the API server with the client one,
|
|
375
444
|
# we skip the following keys because they are meant to be client-side configs.
|
|
376
|
-
|
|
377
|
-
|
|
378
|
-
|
|
445
|
+
# Also, we skip the consolidation mode config as those should be only set on
|
|
446
|
+
# the API server side.
|
|
447
|
+
SKIPPED_CLIENT_OVERRIDE_KEYS: List[Tuple[str, ...]] = [
|
|
448
|
+
('api_server',),
|
|
449
|
+
('allowed_clouds',),
|
|
450
|
+
('workspaces',),
|
|
451
|
+
('db',),
|
|
452
|
+
('daemons',),
|
|
453
|
+
# TODO(kevin,tian): Override the whole controller config once our test
|
|
454
|
+
# infrastructure supports setting dynamic server side configs.
|
|
455
|
+
# Tests that are affected:
|
|
456
|
+
# - test_managed_jobs_ha_kill_starting
|
|
457
|
+
# - test_managed_jobs_ha_kill_running
|
|
458
|
+
# - all tests that use LOW_CONTROLLER_RESOURCE_ENV or
|
|
459
|
+
# LOW_CONTROLLER_RESOURCE_OVERRIDE_CONFIG (won't cause test failure,
|
|
460
|
+
# but the configs won't be applied)
|
|
461
|
+
('jobs', 'controller', 'consolidation_mode'),
|
|
462
|
+
('serve', 'controller', 'consolidation_mode'),
|
|
463
|
+
('jobs', 'controller', 'controller_logs_gc_retention_hours'),
|
|
464
|
+
('jobs', 'controller', 'task_logs_gc_retention_hours'),
|
|
465
|
+
]
|
|
379
466
|
|
|
380
467
|
# Constants for Azure blob storage
|
|
381
468
|
WAIT_FOR_STORAGE_ACCOUNT_CREATION = 60
|
|
@@ -392,6 +479,12 @@ ROLE_ASSIGNMENT_FAILURE_ERROR_MSG = (
|
|
|
392
479
|
# persistent through PVC. See kubernetes-ray.yml.j2.
|
|
393
480
|
PERSISTENT_SETUP_SCRIPT_PATH = '~/.sky/.controller_recovery_setup_commands.sh'
|
|
394
481
|
PERSISTENT_RUN_SCRIPT_DIR = '~/.sky/.controller_recovery_task_run'
|
|
482
|
+
# Signal file to indicate that the controller is recovering from a failure.
|
|
483
|
+
# See sky/jobs/utils.py::update_managed_jobs_statuses for more details.
|
|
484
|
+
PERSISTENT_RUN_RESTARTING_SIGNAL_FILE = (
|
|
485
|
+
'~/.sky/.controller_recovery_restarting_signal')
|
|
486
|
+
|
|
487
|
+
HA_PERSISTENT_RECOVERY_LOG_PATH = '/tmp/{}ha_recovery.log'
|
|
395
488
|
|
|
396
489
|
# The placeholder for the local skypilot config path in file mounts for
|
|
397
490
|
# controllers.
|
|
@@ -400,5 +493,103 @@ LOCAL_SKYPILOT_CONFIG_PATH_PLACEHOLDER = 'skypilot:local_skypilot_config_path'
|
|
|
400
493
|
# Path to the generated cluster config yamls and ssh configs.
|
|
401
494
|
SKY_USER_FILE_PATH = '~/.sky/generated'
|
|
402
495
|
|
|
496
|
+
# TODO(cooperc): Update all env vars to begin with SKYPILOT_ or SKYPILOT_SERVER_
|
|
403
497
|
# Environment variable that is set to 'true' if this is a skypilot server.
|
|
404
498
|
ENV_VAR_IS_SKYPILOT_SERVER = 'IS_SKYPILOT_SERVER'
|
|
499
|
+
OVERRIDE_CONSOLIDATION_MODE = 'IS_SKYPILOT_JOB_CONTROLLER'
|
|
500
|
+
IS_SKYPILOT_SERVE_CONTROLLER = 'IS_SKYPILOT_SERVE_CONTROLLER'
|
|
501
|
+
|
|
502
|
+
SERVE_OVERRIDE_CONCURRENT_LAUNCHES = (
|
|
503
|
+
f'{SKYPILOT_ENV_VAR_PREFIX}SERVE_OVERRIDE_CONCURRENT_LAUNCHES')
|
|
504
|
+
|
|
505
|
+
# Environment variable that is set to 'true' if metrics are enabled.
|
|
506
|
+
ENV_VAR_SERVER_METRICS_ENABLED = 'SKY_API_SERVER_METRICS_ENABLED'
|
|
507
|
+
|
|
508
|
+
# If set, overrides the header that we can use to get the user name.
|
|
509
|
+
ENV_VAR_SERVER_AUTH_USER_HEADER = f'{SKYPILOT_ENV_VAR_PREFIX}AUTH_USER_HEADER'
|
|
510
|
+
|
|
511
|
+
# Environment variable that is used as the DB connection string for the
|
|
512
|
+
# skypilot server.
|
|
513
|
+
ENV_VAR_DB_CONNECTION_URI = (f'{SKYPILOT_ENV_VAR_PREFIX}DB_CONNECTION_URI')
|
|
514
|
+
|
|
515
|
+
# Environment variable that is set to 'true' if basic
|
|
516
|
+
# authentication is enabled in the API server.
|
|
517
|
+
ENV_VAR_ENABLE_BASIC_AUTH = 'ENABLE_BASIC_AUTH'
|
|
518
|
+
SKYPILOT_INITIAL_BASIC_AUTH = 'SKYPILOT_INITIAL_BASIC_AUTH'
|
|
519
|
+
SKYPILOT_INGRESS_BASIC_AUTH_ENABLED = 'SKYPILOT_INGRESS_BASIC_AUTH_ENABLED'
|
|
520
|
+
ENV_VAR_ENABLE_SERVICE_ACCOUNTS = 'ENABLE_SERVICE_ACCOUNTS'
|
|
521
|
+
|
|
522
|
+
# Enable debug logging for requests.
|
|
523
|
+
ENV_VAR_ENABLE_REQUEST_DEBUG_LOGGING = (
|
|
524
|
+
f'{SKYPILOT_SERVER_ENV_VAR_PREFIX}ENABLE_REQUEST_DEBUG_LOGGING')
|
|
525
|
+
|
|
526
|
+
SKYPILOT_DEFAULT_WORKSPACE = 'default'
|
|
527
|
+
|
|
528
|
+
# BEGIN constants used for service catalog.
|
|
529
|
+
HOSTED_CATALOG_DIR_URL = 'https://raw.githubusercontent.com/skypilot-org/skypilot-catalog/master/catalogs' # pylint: disable=line-too-long
|
|
530
|
+
HOSTED_CATALOG_DIR_URL_S3_MIRROR = 'https://skypilot-catalog.s3.us-east-1.amazonaws.com/catalogs' # pylint: disable=line-too-long
|
|
531
|
+
CATALOG_SCHEMA_VERSION = 'v8'
|
|
532
|
+
CATALOG_DIR = '~/.sky/catalogs'
|
|
533
|
+
ALL_CLOUDS = ('aws', 'azure', 'gcp', 'ibm', 'lambda', 'scp', 'oci',
|
|
534
|
+
'kubernetes', 'runpod', 'vast', 'vsphere', 'cudo', 'fluidstack',
|
|
535
|
+
'paperspace', 'primeintellect', 'do', 'nebius', 'ssh',
|
|
536
|
+
'hyperbolic', 'seeweb', 'shadeform')
|
|
537
|
+
# END constants used for service catalog.
|
|
538
|
+
|
|
539
|
+
# The user ID of the SkyPilot system.
|
|
540
|
+
SKYPILOT_SYSTEM_USER_ID = 'skypilot-system'
|
|
541
|
+
|
|
542
|
+
# The directory to store the logging configuration.
|
|
543
|
+
LOGGING_CONFIG_DIR = '~/.sky/logging'
|
|
544
|
+
|
|
545
|
+
# Resources constants
|
|
546
|
+
TIME_UNITS = {
|
|
547
|
+
'm': 1,
|
|
548
|
+
'h': 60,
|
|
549
|
+
'd': 24 * 60,
|
|
550
|
+
'w': 7 * 24 * 60,
|
|
551
|
+
}
|
|
552
|
+
|
|
553
|
+
TIME_PATTERN: str = ('^[0-9]+('
|
|
554
|
+
f'{"|".join([unit.lower() for unit in TIME_UNITS])}|'
|
|
555
|
+
f'{"|".join([unit.upper() for unit in TIME_UNITS])}|'
|
|
556
|
+
')?$')
|
|
557
|
+
|
|
558
|
+
MEMORY_SIZE_UNITS = {
|
|
559
|
+
'kb': 2**10,
|
|
560
|
+
'ki': 2**10,
|
|
561
|
+
'mb': 2**20,
|
|
562
|
+
'mi': 2**20,
|
|
563
|
+
'gb': 2**30,
|
|
564
|
+
'gi': 2**30,
|
|
565
|
+
'tb': 2**40,
|
|
566
|
+
'ti': 2**40,
|
|
567
|
+
'pb': 2**50,
|
|
568
|
+
'pi': 2**50,
|
|
569
|
+
}
|
|
570
|
+
|
|
571
|
+
MEMORY_SIZE_PATTERN = (
|
|
572
|
+
'^[0-9]+('
|
|
573
|
+
f'{"|".join([unit.lower() for unit in MEMORY_SIZE_UNITS])}|'
|
|
574
|
+
f'{"|".join([unit.upper() for unit in MEMORY_SIZE_UNITS])}|'
|
|
575
|
+
f'{"|".join([unit[0].upper() + unit[1:] for unit in MEMORY_SIZE_UNITS if len(unit) > 1])}' # pylint: disable=line-too-long
|
|
576
|
+
')?$')
|
|
577
|
+
|
|
578
|
+
LAST_USE_TRUNC_LENGTH = 25
|
|
579
|
+
USED_BY_TRUNC_LENGTH = 25
|
|
580
|
+
|
|
581
|
+
MIN_PRIORITY = -1000
|
|
582
|
+
MAX_PRIORITY = 1000
|
|
583
|
+
DEFAULT_PRIORITY = 0
|
|
584
|
+
|
|
585
|
+
GRACE_PERIOD_SECONDS_ENV_VAR = SKYPILOT_ENV_VAR_PREFIX + 'GRACE_PERIOD_SECONDS'
|
|
586
|
+
COST_REPORT_DEFAULT_DAYS = 30
|
|
587
|
+
|
|
588
|
+
ENV_VAR_LOOP_LAG_THRESHOLD_MS = (SKYPILOT_ENV_VAR_PREFIX +
|
|
589
|
+
'DEBUG_LOOP_LAG_THRESHOLD_MS')
|
|
590
|
+
|
|
591
|
+
ARM64_ARCH = 'arm64'
|
|
592
|
+
X86_64_ARCH = 'x86_64'
|
|
593
|
+
|
|
594
|
+
SSH_DISABLE_LATENCY_MEASUREMENT_ENV_VAR = (
|
|
595
|
+
f'{SKYPILOT_ENV_VAR_PREFIX}SSH_DISABLE_LATENCY_MEASUREMENT')
|
sky/skylet/events.py
CHANGED
|
@@ -7,12 +7,12 @@ import time
|
|
|
7
7
|
import traceback
|
|
8
8
|
|
|
9
9
|
import psutil
|
|
10
|
-
import yaml
|
|
11
10
|
|
|
12
11
|
from sky import clouds
|
|
13
12
|
from sky import sky_logging
|
|
14
13
|
from sky.backends import cloud_vm_ray_backend
|
|
15
|
-
from sky.jobs import
|
|
14
|
+
from sky.jobs import constants as managed_job_constants
|
|
15
|
+
from sky.jobs import scheduler
|
|
16
16
|
from sky.jobs import state as managed_job_state
|
|
17
17
|
from sky.jobs import utils as managed_job_utils
|
|
18
18
|
from sky.serve import serve_utils
|
|
@@ -21,9 +21,10 @@ from sky.skylet import constants
|
|
|
21
21
|
from sky.skylet import job_lib
|
|
22
22
|
from sky.usage import usage_lib
|
|
23
23
|
from sky.utils import cluster_utils
|
|
24
|
-
from sky.utils import common_utils
|
|
25
24
|
from sky.utils import registry
|
|
25
|
+
from sky.utils import subprocess_utils
|
|
26
26
|
from sky.utils import ux_utils
|
|
27
|
+
from sky.utils import yaml_utils
|
|
27
28
|
|
|
28
29
|
# Seconds of sleep between the processing of skylet events.
|
|
29
30
|
EVENT_CHECKING_INTERVAL_SECONDS = 20
|
|
@@ -46,6 +47,9 @@ class SkyletEvent:
|
|
|
46
47
|
EVENT_CHECKING_INTERVAL_SECONDS))
|
|
47
48
|
self._n = 0
|
|
48
49
|
|
|
50
|
+
def start(self):
|
|
51
|
+
pass
|
|
52
|
+
|
|
49
53
|
def run(self):
|
|
50
54
|
self._n = (self._n + 1) % self._event_interval
|
|
51
55
|
if self._n % self._event_interval == 0:
|
|
@@ -74,9 +78,60 @@ class ManagedJobEvent(SkyletEvent):
|
|
|
74
78
|
"""Skylet event for updating and scheduling managed jobs."""
|
|
75
79
|
EVENT_INTERVAL_SECONDS = 300
|
|
76
80
|
|
|
81
|
+
def start(self):
|
|
82
|
+
cpus_env_var = os.environ.get('SKYPILOT_POD_CPU_CORE_LIMIT')
|
|
83
|
+
if cpus_env_var is not None:
|
|
84
|
+
with open(os.path.expanduser(constants.CONTROLLER_K8S_CPU_FILE),
|
|
85
|
+
'w',
|
|
86
|
+
encoding='utf-8') as f:
|
|
87
|
+
f.write(cpus_env_var)
|
|
88
|
+
memory_env_var = os.environ.get('SKYPILOT_POD_MEMORY_GB_LIMIT')
|
|
89
|
+
if memory_env_var is not None:
|
|
90
|
+
with open(os.path.expanduser(constants.CONTROLLER_K8S_MEMORY_FILE),
|
|
91
|
+
'w',
|
|
92
|
+
encoding='utf-8') as f:
|
|
93
|
+
f.write(memory_env_var)
|
|
94
|
+
|
|
77
95
|
def _run(self):
|
|
96
|
+
if not os.path.exists(
|
|
97
|
+
os.path.expanduser(
|
|
98
|
+
managed_job_constants.JOB_CONTROLLER_INDICATOR_FILE)
|
|
99
|
+
) and not managed_job_utils.is_consolidation_mode():
|
|
100
|
+
# Note: since the skylet is started before the user setup (in
|
|
101
|
+
# jobs-controller.yaml.j2) runs, it's possible that we hit this
|
|
102
|
+
# before the indicator file is written. However, since we will wait
|
|
103
|
+
# EVENT_INTERVAL_SECONDS before the first run, this should be very
|
|
104
|
+
# unlikely.
|
|
105
|
+
logger.info('No jobs controller indicator file found.')
|
|
106
|
+
all_job_ids = managed_job_state.get_all_job_ids_by_name(None)
|
|
107
|
+
if not all_job_ids:
|
|
108
|
+
logger.info('No jobs running. Stopping controllers.')
|
|
109
|
+
# TODO(cooperc): Move this to a shared function also called by
|
|
110
|
+
# sdk.api_stop(). (#7229)
|
|
111
|
+
try:
|
|
112
|
+
records = scheduler.get_controller_process_records()
|
|
113
|
+
if records is not None:
|
|
114
|
+
for record in records:
|
|
115
|
+
if managed_job_utils.controller_process_alive(
|
|
116
|
+
record, quiet=False):
|
|
117
|
+
subprocess_utils.kill_children_processes(
|
|
118
|
+
parent_pids=[record.pid], force=True)
|
|
119
|
+
os.remove(
|
|
120
|
+
os.path.expanduser(
|
|
121
|
+
scheduler.JOB_CONTROLLER_PID_PATH))
|
|
122
|
+
except Exception as e: # pylint: disable=broad-except
|
|
123
|
+
# in case we get perm issues or something is messed up, just
|
|
124
|
+
# ignore it and assume the process is dead
|
|
125
|
+
logger.error(
|
|
126
|
+
f'Error looking at job controller pid file: {e}')
|
|
127
|
+
pass
|
|
128
|
+
logger.info(f'{len(all_job_ids)} jobs running. Assuming the '
|
|
129
|
+
'indicator file hasn\'t been written yet.')
|
|
130
|
+
return
|
|
131
|
+
|
|
132
|
+
logger.info('=== Updating managed job status ===')
|
|
78
133
|
managed_job_utils.update_managed_jobs_statuses()
|
|
79
|
-
|
|
134
|
+
scheduler.maybe_start_controllers()
|
|
80
135
|
|
|
81
136
|
|
|
82
137
|
class ServiceUpdateEvent(SkyletEvent):
|
|
@@ -87,8 +142,12 @@ class ServiceUpdateEvent(SkyletEvent):
|
|
|
87
142
|
"""
|
|
88
143
|
EVENT_INTERVAL_SECONDS = 300
|
|
89
144
|
|
|
145
|
+
def __init__(self, pool: bool) -> None:
|
|
146
|
+
super().__init__()
|
|
147
|
+
self._pool = pool
|
|
148
|
+
|
|
90
149
|
def _run(self):
|
|
91
|
-
serve_utils.update_service_status()
|
|
150
|
+
serve_utils.update_service_status(self._pool)
|
|
92
151
|
|
|
93
152
|
|
|
94
153
|
class UsageHeartbeatReportEvent(SkyletEvent):
|
|
@@ -128,23 +187,37 @@ class AutostopEvent(SkyletEvent):
|
|
|
128
187
|
logger.debug('autostop_config not set. Skipped.')
|
|
129
188
|
return
|
|
130
189
|
|
|
131
|
-
|
|
132
|
-
|
|
133
|
-
|
|
134
|
-
|
|
190
|
+
ignore_idle_check = (
|
|
191
|
+
autostop_config.wait_for == autostop_lib.AutostopWaitFor.NONE)
|
|
192
|
+
is_idle = True
|
|
193
|
+
if not ignore_idle_check:
|
|
194
|
+
if not job_lib.is_cluster_idle(
|
|
195
|
+
) or managed_job_state.get_num_alive_jobs() or (
|
|
196
|
+
autostop_config.wait_for
|
|
197
|
+
== autostop_lib.AutostopWaitFor.JOBS_AND_SSH and
|
|
198
|
+
autostop_lib.has_active_ssh_sessions()):
|
|
199
|
+
is_idle = False
|
|
200
|
+
|
|
201
|
+
if ignore_idle_check or is_idle:
|
|
202
|
+
minutes_since_last_active = (
|
|
203
|
+
time.time() - autostop_lib.get_last_active_time()) // 60
|
|
135
204
|
logger.debug(
|
|
136
|
-
f'
|
|
137
|
-
f'AutoStop
|
|
205
|
+
f'Minutes since last active: {minutes_since_last_active}, '
|
|
206
|
+
f'AutoStop idle minutes: '
|
|
207
|
+
f'{autostop_config.autostop_idle_minutes}, '
|
|
208
|
+
f'Wait for: {autostop_config.wait_for.value}')
|
|
138
209
|
else:
|
|
139
210
|
autostop_lib.set_last_active_time_to_now()
|
|
140
|
-
|
|
141
|
-
logger.debug(
|
|
142
|
-
|
|
143
|
-
|
|
144
|
-
|
|
211
|
+
minutes_since_last_active = -1
|
|
212
|
+
logger.debug('Not idle. Reset idle minutes. '
|
|
213
|
+
f'AutoStop idle minutes: '
|
|
214
|
+
f'{autostop_config.autostop_idle_minutes}, '
|
|
215
|
+
f'Wait for: {autostop_config.wait_for.value}')
|
|
216
|
+
if minutes_since_last_active >= autostop_config.autostop_idle_minutes:
|
|
145
217
|
logger.info(
|
|
146
|
-
f'{
|
|
147
|
-
f'{autostop_config.autostop_idle_minutes} minutes.
|
|
218
|
+
f'{minutes_since_last_active} minute(s) since last active; '
|
|
219
|
+
f'threshold: {autostop_config.autostop_idle_minutes} minutes. '
|
|
220
|
+
f'Stopping.')
|
|
148
221
|
self._stop_cluster(autostop_config)
|
|
149
222
|
|
|
150
223
|
def _stop_cluster(self, autostop_config):
|
|
@@ -154,7 +227,7 @@ class AutostopEvent(SkyletEvent):
|
|
|
154
227
|
|
|
155
228
|
config_path = os.path.abspath(
|
|
156
229
|
os.path.expanduser(cluster_utils.SKY_CLUSTER_YAML_REMOTE_PATH))
|
|
157
|
-
config =
|
|
230
|
+
config = yaml_utils.read_yaml(config_path)
|
|
158
231
|
provider_name = cluster_utils.get_provider_name(config)
|
|
159
232
|
cloud = registry.CLOUD_REGISTRY.from_str(provider_name)
|
|
160
233
|
assert cloud is not None, f'Unknown cloud: {provider_name}'
|
|
@@ -249,8 +322,15 @@ class AutostopEvent(SkyletEvent):
|
|
|
249
322
|
cluster_name_on_cloud = cluster_config['cluster_name']
|
|
250
323
|
is_cluster_multinode = cluster_config['max_workers'] > 0
|
|
251
324
|
|
|
325
|
+
# Clear AWS credentials from environment to force boto3 to use IAM
|
|
326
|
+
# role attached to the instance (lowest priority in credential chain).
|
|
327
|
+
# This allows the cluster to stop/terminate itself using its IAM role.
|
|
252
328
|
os.environ.pop('AWS_ACCESS_KEY_ID', None)
|
|
253
329
|
os.environ.pop('AWS_SECRET_ACCESS_KEY', None)
|
|
330
|
+
os.environ.pop('AWS_SESSION_TOKEN', None)
|
|
331
|
+
# Point boto3 to /dev/null to skip reading credentials from files.
|
|
332
|
+
os.environ['AWS_SHARED_CREDENTIALS_FILE'] = '/dev/null'
|
|
333
|
+
os.environ['AWS_CONFIG_FILE'] = '/dev/null'
|
|
254
334
|
|
|
255
335
|
# Stop the ray autoscaler to avoid scaling up, during
|
|
256
336
|
# stopping/terminating of the cluster.
|
|
@@ -282,7 +362,7 @@ class AutostopEvent(SkyletEvent):
|
|
|
282
362
|
else:
|
|
283
363
|
yaml_str = self._CATCH_NODES.sub(r'cache_stopped_nodes: true',
|
|
284
364
|
yaml_str)
|
|
285
|
-
config =
|
|
365
|
+
config = yaml_utils.safe_load(yaml_str)
|
|
286
366
|
# Set the private key with the existed key on the remote instance.
|
|
287
367
|
config['auth']['ssh_private_key'] = '~/ray_bootstrap_key.pem'
|
|
288
368
|
# NOTE: We must do this, otherwise with ssh_proxy_command still under
|
|
@@ -299,5 +379,5 @@ class AutostopEvent(SkyletEvent):
|
|
|
299
379
|
config['auth'].pop('ssh_proxy_command', None)
|
|
300
380
|
# Empty the file_mounts.
|
|
301
381
|
config['file_mounts'] = {}
|
|
302
|
-
|
|
382
|
+
yaml_utils.dump_yaml(yaml_path, config)
|
|
303
383
|
logger.debug('Replaced upscaling speed to 0.')
|