skypilot-nightly 1.0.0.dev20250502__py3-none-any.whl → 1.0.0.dev20251203__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sky/__init__.py +22 -6
- sky/adaptors/aws.py +81 -16
- sky/adaptors/common.py +25 -2
- sky/adaptors/coreweave.py +278 -0
- sky/adaptors/do.py +8 -2
- sky/adaptors/gcp.py +11 -0
- sky/adaptors/hyperbolic.py +8 -0
- sky/adaptors/ibm.py +5 -2
- sky/adaptors/kubernetes.py +149 -18
- sky/adaptors/nebius.py +173 -30
- sky/adaptors/primeintellect.py +1 -0
- sky/adaptors/runpod.py +68 -0
- sky/adaptors/seeweb.py +183 -0
- sky/adaptors/shadeform.py +89 -0
- sky/admin_policy.py +187 -4
- sky/authentication.py +179 -225
- sky/backends/__init__.py +4 -2
- sky/backends/backend.py +22 -9
- sky/backends/backend_utils.py +1323 -397
- sky/backends/cloud_vm_ray_backend.py +1749 -1029
- sky/backends/docker_utils.py +1 -1
- sky/backends/local_docker_backend.py +11 -6
- sky/backends/task_codegen.py +633 -0
- sky/backends/wheel_utils.py +55 -9
- sky/{clouds/service_catalog → catalog}/__init__.py +21 -19
- sky/{clouds/service_catalog → catalog}/aws_catalog.py +27 -8
- sky/{clouds/service_catalog → catalog}/azure_catalog.py +10 -7
- sky/{clouds/service_catalog → catalog}/common.py +90 -49
- sky/{clouds/service_catalog → catalog}/cudo_catalog.py +8 -5
- sky/{clouds/service_catalog → catalog}/data_fetchers/analyze.py +1 -1
- sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_aws.py +116 -80
- sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_cudo.py +38 -38
- sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_gcp.py +70 -16
- sky/catalog/data_fetchers/fetch_hyperbolic.py +136 -0
- sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_lambda_cloud.py +1 -0
- sky/catalog/data_fetchers/fetch_nebius.py +338 -0
- sky/catalog/data_fetchers/fetch_runpod.py +698 -0
- sky/catalog/data_fetchers/fetch_seeweb.py +329 -0
- sky/catalog/data_fetchers/fetch_shadeform.py +142 -0
- sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_vast.py +1 -1
- sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_vsphere.py +1 -1
- sky/{clouds/service_catalog → catalog}/do_catalog.py +5 -2
- sky/{clouds/service_catalog → catalog}/fluidstack_catalog.py +6 -3
- sky/{clouds/service_catalog → catalog}/gcp_catalog.py +41 -15
- sky/catalog/hyperbolic_catalog.py +136 -0
- sky/{clouds/service_catalog → catalog}/ibm_catalog.py +9 -6
- sky/{clouds/service_catalog → catalog}/kubernetes_catalog.py +36 -24
- sky/{clouds/service_catalog → catalog}/lambda_catalog.py +9 -6
- sky/{clouds/service_catalog → catalog}/nebius_catalog.py +9 -7
- sky/{clouds/service_catalog → catalog}/oci_catalog.py +9 -6
- sky/{clouds/service_catalog → catalog}/paperspace_catalog.py +5 -2
- sky/catalog/primeintellect_catalog.py +95 -0
- sky/{clouds/service_catalog → catalog}/runpod_catalog.py +11 -4
- sky/{clouds/service_catalog → catalog}/scp_catalog.py +9 -6
- sky/catalog/seeweb_catalog.py +184 -0
- sky/catalog/shadeform_catalog.py +165 -0
- sky/catalog/ssh_catalog.py +167 -0
- sky/{clouds/service_catalog → catalog}/vast_catalog.py +6 -3
- sky/{clouds/service_catalog → catalog}/vsphere_catalog.py +5 -2
- sky/check.py +533 -185
- sky/cli.py +5 -5975
- sky/client/{cli.py → cli/command.py} +2591 -1956
- sky/client/cli/deprecation_utils.py +99 -0
- sky/client/cli/flags.py +359 -0
- sky/client/cli/table_utils.py +322 -0
- sky/client/cli/utils.py +79 -0
- sky/client/common.py +78 -32
- sky/client/oauth.py +82 -0
- sky/client/sdk.py +1219 -319
- sky/client/sdk_async.py +827 -0
- sky/client/service_account_auth.py +47 -0
- sky/cloud_stores.py +82 -3
- sky/clouds/__init__.py +13 -0
- sky/clouds/aws.py +564 -164
- sky/clouds/azure.py +105 -83
- sky/clouds/cloud.py +140 -40
- sky/clouds/cudo.py +68 -50
- sky/clouds/do.py +66 -48
- sky/clouds/fluidstack.py +63 -44
- sky/clouds/gcp.py +339 -110
- sky/clouds/hyperbolic.py +293 -0
- sky/clouds/ibm.py +70 -49
- sky/clouds/kubernetes.py +570 -162
- sky/clouds/lambda_cloud.py +74 -54
- sky/clouds/nebius.py +210 -81
- sky/clouds/oci.py +88 -66
- sky/clouds/paperspace.py +61 -44
- sky/clouds/primeintellect.py +317 -0
- sky/clouds/runpod.py +164 -74
- sky/clouds/scp.py +89 -86
- sky/clouds/seeweb.py +477 -0
- sky/clouds/shadeform.py +400 -0
- sky/clouds/ssh.py +263 -0
- sky/clouds/utils/aws_utils.py +10 -4
- sky/clouds/utils/gcp_utils.py +87 -11
- sky/clouds/utils/oci_utils.py +38 -14
- sky/clouds/utils/scp_utils.py +231 -167
- sky/clouds/vast.py +99 -77
- sky/clouds/vsphere.py +51 -40
- sky/core.py +375 -173
- sky/dag.py +15 -0
- sky/dashboard/out/404.html +1 -1
- sky/dashboard/out/_next/static/96_E2yl3QAiIJGOYCkSpB/_buildManifest.js +1 -0
- sky/dashboard/out/_next/static/chunks/1141-e6aa9ab418717c59.js +11 -0
- sky/dashboard/out/_next/static/chunks/1272-1ef0bf0237faccdb.js +1 -0
- sky/dashboard/out/_next/static/chunks/1871-7e202677c42f43fe.js +6 -0
- sky/dashboard/out/_next/static/chunks/2260-7703229c33c5ebd5.js +1 -0
- sky/dashboard/out/_next/static/chunks/2350.fab69e61bac57b23.js +1 -0
- sky/dashboard/out/_next/static/chunks/2369.fc20f0c2c8ed9fe7.js +15 -0
- sky/dashboard/out/_next/static/chunks/2755.edd818326d489a1d.js +26 -0
- sky/dashboard/out/_next/static/chunks/3294.20a8540fe697d5ee.js +1 -0
- sky/dashboard/out/_next/static/chunks/3785.7e245f318f9d1121.js +1 -0
- sky/dashboard/out/_next/static/chunks/3800-7b45f9fbb6308557.js +1 -0
- sky/dashboard/out/_next/static/chunks/3850-ff4a9a69d978632b.js +1 -0
- sky/dashboard/out/_next/static/chunks/3937.210053269f121201.js +1 -0
- sky/dashboard/out/_next/static/chunks/4725.172ede95d1b21022.js +1 -0
- sky/dashboard/out/_next/static/chunks/4937.a2baa2df5572a276.js +15 -0
- sky/dashboard/out/_next/static/chunks/5739-d67458fcb1386c92.js +8 -0
- sky/dashboard/out/_next/static/chunks/6130-2be46d70a38f1e82.js +1 -0
- sky/dashboard/out/_next/static/chunks/616-3d59f75e2ccf9321.js +39 -0
- sky/dashboard/out/_next/static/chunks/6212-7bd06f60ba693125.js +13 -0
- sky/dashboard/out/_next/static/chunks/6856-8f27d1c10c98def8.js +1 -0
- sky/dashboard/out/_next/static/chunks/6989-01359c57e018caa4.js +1 -0
- sky/dashboard/out/_next/static/chunks/6990-9146207c4567fdfd.js +1 -0
- sky/dashboard/out/_next/static/chunks/7359-c8d04e06886000b3.js +30 -0
- sky/dashboard/out/_next/static/chunks/7411-b15471acd2cba716.js +41 -0
- sky/dashboard/out/_next/static/chunks/7615-019513abc55b3b47.js +1 -0
- sky/dashboard/out/_next/static/chunks/8640.5b9475a2d18c5416.js +16 -0
- sky/dashboard/out/_next/static/chunks/8969-452f9d5cbdd2dc73.js +1 -0
- sky/dashboard/out/_next/static/chunks/9025.fa408f3242e9028d.js +6 -0
- sky/dashboard/out/_next/static/chunks/9353-cff34f7e773b2e2b.js +1 -0
- sky/dashboard/out/_next/static/chunks/9360.a536cf6b1fa42355.js +31 -0
- sky/dashboard/out/_next/static/chunks/9847.3aaca6bb33455140.js +30 -0
- sky/dashboard/out/_next/static/chunks/fd9d1056-86323a29a8f7e46a.js +1 -0
- sky/dashboard/out/_next/static/chunks/framework-cf60a09ccd051a10.js +33 -0
- sky/dashboard/out/_next/static/chunks/main-app-587214043926b3cc.js +1 -0
- sky/dashboard/out/_next/static/chunks/main-f15ccb73239a3bf1.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/_app-bde01e4a2beec258.js +34 -0
- sky/dashboard/out/_next/static/chunks/pages/_error-c66a4e8afc46f17b.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-792db96d918c98c9.js +16 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-abfcac9c137aa543.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters-ee39056f9851a3ff.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/config-dfb9bf07b13045f4.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/index-444f1804401f04ea.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/infra/[context]-c0b5935149902e6f.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/infra-aed0ea19df7cf961.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-d66997e2bfc837cf.js +16 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs/pools/[pool]-9faf940b253e3e06.js +21 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs-2072b48b617989c9.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/users-f42674164aa73423.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/volumes-b84b948ff357c43e.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/workspace/new-3f88a1c7e86a3f86.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/workspaces/[name]-84a40f8c7c627fe4.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/workspaces-531b2f8c4bf89f82.js +1 -0
- sky/dashboard/out/_next/static/chunks/webpack-64e05f17bf2cf8ce.js +1 -0
- sky/dashboard/out/_next/static/css/0748ce22df867032.css +3 -0
- sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
- sky/dashboard/out/clusters/[cluster].html +1 -1
- sky/dashboard/out/clusters.html +1 -1
- sky/dashboard/out/config.html +1 -0
- sky/dashboard/out/index.html +1 -1
- sky/dashboard/out/infra/[context].html +1 -0
- sky/dashboard/out/infra.html +1 -0
- sky/dashboard/out/jobs/[job].html +1 -1
- sky/dashboard/out/jobs/pools/[pool].html +1 -0
- sky/dashboard/out/jobs.html +1 -1
- sky/dashboard/out/users.html +1 -0
- sky/dashboard/out/volumes.html +1 -0
- sky/dashboard/out/workspace/new.html +1 -0
- sky/dashboard/out/workspaces/[name].html +1 -0
- sky/dashboard/out/workspaces.html +1 -0
- sky/data/data_utils.py +137 -1
- sky/data/mounting_utils.py +269 -84
- sky/data/storage.py +1460 -1807
- sky/data/storage_utils.py +43 -57
- sky/exceptions.py +126 -2
- sky/execution.py +216 -63
- sky/global_user_state.py +2390 -586
- sky/jobs/__init__.py +7 -0
- sky/jobs/client/sdk.py +300 -58
- sky/jobs/client/sdk_async.py +161 -0
- sky/jobs/constants.py +15 -8
- sky/jobs/controller.py +848 -275
- sky/jobs/file_content_utils.py +128 -0
- sky/jobs/log_gc.py +193 -0
- sky/jobs/recovery_strategy.py +402 -152
- sky/jobs/scheduler.py +314 -189
- sky/jobs/server/core.py +836 -255
- sky/jobs/server/server.py +156 -115
- sky/jobs/server/utils.py +136 -0
- sky/jobs/state.py +2109 -706
- sky/jobs/utils.py +1306 -215
- sky/logs/__init__.py +21 -0
- sky/logs/agent.py +108 -0
- sky/logs/aws.py +243 -0
- sky/logs/gcp.py +91 -0
- sky/metrics/__init__.py +0 -0
- sky/metrics/utils.py +453 -0
- sky/models.py +78 -1
- sky/optimizer.py +164 -70
- sky/provision/__init__.py +90 -4
- sky/provision/aws/config.py +147 -26
- sky/provision/aws/instance.py +136 -50
- sky/provision/azure/instance.py +11 -6
- sky/provision/common.py +13 -1
- sky/provision/cudo/cudo_machine_type.py +1 -1
- sky/provision/cudo/cudo_utils.py +14 -8
- sky/provision/cudo/cudo_wrapper.py +72 -71
- sky/provision/cudo/instance.py +10 -6
- sky/provision/do/instance.py +10 -6
- sky/provision/do/utils.py +4 -3
- sky/provision/docker_utils.py +140 -33
- sky/provision/fluidstack/instance.py +13 -8
- sky/provision/gcp/__init__.py +1 -0
- sky/provision/gcp/config.py +301 -19
- sky/provision/gcp/constants.py +218 -0
- sky/provision/gcp/instance.py +36 -8
- sky/provision/gcp/instance_utils.py +18 -4
- sky/provision/gcp/volume_utils.py +247 -0
- sky/provision/hyperbolic/__init__.py +12 -0
- sky/provision/hyperbolic/config.py +10 -0
- sky/provision/hyperbolic/instance.py +437 -0
- sky/provision/hyperbolic/utils.py +373 -0
- sky/provision/instance_setup.py +101 -20
- sky/provision/kubernetes/__init__.py +5 -0
- sky/provision/kubernetes/config.py +9 -52
- sky/provision/kubernetes/constants.py +17 -0
- sky/provision/kubernetes/instance.py +919 -280
- sky/provision/kubernetes/manifests/fusermount-server-daemonset.yaml +1 -2
- sky/provision/kubernetes/network.py +27 -17
- sky/provision/kubernetes/network_utils.py +44 -43
- sky/provision/kubernetes/utils.py +1221 -534
- sky/provision/kubernetes/volume.py +343 -0
- sky/provision/lambda_cloud/instance.py +22 -16
- sky/provision/nebius/constants.py +50 -0
- sky/provision/nebius/instance.py +19 -6
- sky/provision/nebius/utils.py +237 -137
- sky/provision/oci/instance.py +10 -5
- sky/provision/paperspace/instance.py +10 -7
- sky/provision/paperspace/utils.py +1 -1
- sky/provision/primeintellect/__init__.py +10 -0
- sky/provision/primeintellect/config.py +11 -0
- sky/provision/primeintellect/instance.py +454 -0
- sky/provision/primeintellect/utils.py +398 -0
- sky/provision/provisioner.py +117 -36
- sky/provision/runpod/__init__.py +5 -0
- sky/provision/runpod/instance.py +27 -6
- sky/provision/runpod/utils.py +51 -18
- sky/provision/runpod/volume.py +214 -0
- sky/provision/scp/__init__.py +15 -0
- sky/provision/scp/config.py +93 -0
- sky/provision/scp/instance.py +707 -0
- sky/provision/seeweb/__init__.py +11 -0
- sky/provision/seeweb/config.py +13 -0
- sky/provision/seeweb/instance.py +812 -0
- sky/provision/shadeform/__init__.py +11 -0
- sky/provision/shadeform/config.py +12 -0
- sky/provision/shadeform/instance.py +351 -0
- sky/provision/shadeform/shadeform_utils.py +83 -0
- sky/provision/ssh/__init__.py +18 -0
- sky/provision/vast/instance.py +13 -8
- sky/provision/vast/utils.py +10 -7
- sky/provision/volume.py +164 -0
- sky/provision/vsphere/common/ssl_helper.py +1 -1
- sky/provision/vsphere/common/vapiconnect.py +2 -1
- sky/provision/vsphere/common/vim_utils.py +4 -4
- sky/provision/vsphere/instance.py +15 -10
- sky/provision/vsphere/vsphere_utils.py +17 -20
- sky/py.typed +0 -0
- sky/resources.py +845 -119
- sky/schemas/__init__.py +0 -0
- sky/schemas/api/__init__.py +0 -0
- sky/schemas/api/responses.py +227 -0
- sky/schemas/db/README +4 -0
- sky/schemas/db/env.py +90 -0
- sky/schemas/db/global_user_state/001_initial_schema.py +124 -0
- sky/schemas/db/global_user_state/002_add_workspace_to_cluster_history.py +35 -0
- sky/schemas/db/global_user_state/003_fix_initial_revision.py +61 -0
- sky/schemas/db/global_user_state/004_is_managed.py +34 -0
- sky/schemas/db/global_user_state/005_cluster_event.py +32 -0
- sky/schemas/db/global_user_state/006_provision_log.py +41 -0
- sky/schemas/db/global_user_state/007_cluster_event_request_id.py +34 -0
- sky/schemas/db/global_user_state/008_skylet_ssh_tunnel_metadata.py +34 -0
- sky/schemas/db/global_user_state/009_last_activity_and_launched_at.py +89 -0
- sky/schemas/db/global_user_state/010_save_ssh_key.py +66 -0
- sky/schemas/db/global_user_state/011_is_ephemeral.py +34 -0
- sky/schemas/db/kv_cache/001_initial_schema.py +29 -0
- sky/schemas/db/script.py.mako +28 -0
- sky/schemas/db/serve_state/001_initial_schema.py +67 -0
- sky/schemas/db/serve_state/002_yaml_content.py +34 -0
- sky/schemas/db/skypilot_config/001_initial_schema.py +30 -0
- sky/schemas/db/spot_jobs/001_initial_schema.py +97 -0
- sky/schemas/db/spot_jobs/002_cluster_pool.py +42 -0
- sky/schemas/db/spot_jobs/003_pool_hash.py +34 -0
- sky/schemas/db/spot_jobs/004_job_file_contents.py +42 -0
- sky/schemas/db/spot_jobs/005_logs_gc.py +38 -0
- sky/schemas/db/spot_jobs/006_controller_pid_started_at.py +34 -0
- sky/schemas/db/spot_jobs/007_config_file_content.py +34 -0
- sky/schemas/generated/__init__.py +0 -0
- sky/schemas/generated/autostopv1_pb2.py +36 -0
- sky/schemas/generated/autostopv1_pb2.pyi +43 -0
- sky/schemas/generated/autostopv1_pb2_grpc.py +146 -0
- sky/schemas/generated/jobsv1_pb2.py +86 -0
- sky/schemas/generated/jobsv1_pb2.pyi +254 -0
- sky/schemas/generated/jobsv1_pb2_grpc.py +542 -0
- sky/schemas/generated/managed_jobsv1_pb2.py +76 -0
- sky/schemas/generated/managed_jobsv1_pb2.pyi +278 -0
- sky/schemas/generated/managed_jobsv1_pb2_grpc.py +278 -0
- sky/schemas/generated/servev1_pb2.py +58 -0
- sky/schemas/generated/servev1_pb2.pyi +115 -0
- sky/schemas/generated/servev1_pb2_grpc.py +322 -0
- sky/serve/autoscalers.py +357 -5
- sky/serve/client/impl.py +310 -0
- sky/serve/client/sdk.py +47 -139
- sky/serve/client/sdk_async.py +130 -0
- sky/serve/constants.py +12 -9
- sky/serve/controller.py +68 -17
- sky/serve/load_balancer.py +106 -60
- sky/serve/load_balancing_policies.py +116 -2
- sky/serve/replica_managers.py +434 -249
- sky/serve/serve_rpc_utils.py +179 -0
- sky/serve/serve_state.py +569 -257
- sky/serve/serve_utils.py +775 -265
- sky/serve/server/core.py +66 -711
- sky/serve/server/impl.py +1093 -0
- sky/serve/server/server.py +21 -18
- sky/serve/service.py +192 -89
- sky/serve/service_spec.py +144 -20
- sky/serve/spot_placer.py +3 -0
- sky/server/auth/__init__.py +0 -0
- sky/server/auth/authn.py +50 -0
- sky/server/auth/loopback.py +38 -0
- sky/server/auth/oauth2_proxy.py +202 -0
- sky/server/common.py +478 -182
- sky/server/config.py +85 -23
- sky/server/constants.py +44 -6
- sky/server/daemons.py +295 -0
- sky/server/html/token_page.html +185 -0
- sky/server/metrics.py +160 -0
- sky/server/middleware_utils.py +166 -0
- sky/server/requests/executor.py +558 -138
- sky/server/requests/payloads.py +364 -24
- sky/server/requests/preconditions.py +21 -17
- sky/server/requests/process.py +112 -29
- sky/server/requests/request_names.py +121 -0
- sky/server/requests/requests.py +822 -226
- sky/server/requests/serializers/decoders.py +82 -31
- sky/server/requests/serializers/encoders.py +140 -22
- sky/server/requests/threads.py +117 -0
- sky/server/rest.py +455 -0
- sky/server/server.py +1309 -285
- sky/server/state.py +20 -0
- sky/server/stream_utils.py +327 -61
- sky/server/uvicorn.py +217 -3
- sky/server/versions.py +270 -0
- sky/setup_files/MANIFEST.in +11 -1
- sky/setup_files/alembic.ini +160 -0
- sky/setup_files/dependencies.py +139 -31
- sky/setup_files/setup.py +44 -42
- sky/sky_logging.py +114 -7
- sky/skylet/attempt_skylet.py +106 -24
- sky/skylet/autostop_lib.py +129 -8
- sky/skylet/configs.py +29 -20
- sky/skylet/constants.py +216 -25
- sky/skylet/events.py +101 -21
- sky/skylet/job_lib.py +345 -164
- sky/skylet/log_lib.py +297 -18
- sky/skylet/log_lib.pyi +44 -1
- sky/skylet/providers/ibm/node_provider.py +12 -8
- sky/skylet/providers/ibm/vpc_provider.py +13 -12
- sky/skylet/ray_patches/__init__.py +17 -3
- sky/skylet/ray_patches/autoscaler.py.diff +18 -0
- sky/skylet/ray_patches/cli.py.diff +19 -0
- sky/skylet/ray_patches/command_runner.py.diff +17 -0
- sky/skylet/ray_patches/log_monitor.py.diff +20 -0
- sky/skylet/ray_patches/resource_demand_scheduler.py.diff +32 -0
- sky/skylet/ray_patches/updater.py.diff +18 -0
- sky/skylet/ray_patches/worker.py.diff +41 -0
- sky/skylet/runtime_utils.py +21 -0
- sky/skylet/services.py +568 -0
- sky/skylet/skylet.py +72 -4
- sky/skylet/subprocess_daemon.py +104 -29
- sky/skypilot_config.py +506 -99
- sky/ssh_node_pools/__init__.py +1 -0
- sky/ssh_node_pools/core.py +135 -0
- sky/ssh_node_pools/server.py +233 -0
- sky/task.py +685 -163
- sky/templates/aws-ray.yml.j2 +11 -3
- sky/templates/azure-ray.yml.j2 +2 -1
- sky/templates/cudo-ray.yml.j2 +1 -0
- sky/templates/do-ray.yml.j2 +2 -1
- sky/templates/fluidstack-ray.yml.j2 +1 -0
- sky/templates/gcp-ray.yml.j2 +62 -1
- sky/templates/hyperbolic-ray.yml.j2 +68 -0
- sky/templates/ibm-ray.yml.j2 +2 -1
- sky/templates/jobs-controller.yaml.j2 +27 -24
- sky/templates/kubernetes-loadbalancer.yml.j2 +2 -0
- sky/templates/kubernetes-ray.yml.j2 +611 -50
- sky/templates/lambda-ray.yml.j2 +2 -1
- sky/templates/nebius-ray.yml.j2 +34 -12
- sky/templates/oci-ray.yml.j2 +1 -0
- sky/templates/paperspace-ray.yml.j2 +2 -1
- sky/templates/primeintellect-ray.yml.j2 +72 -0
- sky/templates/runpod-ray.yml.j2 +10 -1
- sky/templates/scp-ray.yml.j2 +4 -50
- sky/templates/seeweb-ray.yml.j2 +171 -0
- sky/templates/shadeform-ray.yml.j2 +73 -0
- sky/templates/sky-serve-controller.yaml.j2 +22 -2
- sky/templates/vast-ray.yml.j2 +1 -0
- sky/templates/vsphere-ray.yml.j2 +1 -0
- sky/templates/websocket_proxy.py +212 -37
- sky/usage/usage_lib.py +31 -15
- sky/users/__init__.py +0 -0
- sky/users/model.conf +15 -0
- sky/users/permission.py +397 -0
- sky/users/rbac.py +121 -0
- sky/users/server.py +720 -0
- sky/users/token_service.py +218 -0
- sky/utils/accelerator_registry.py +35 -5
- sky/utils/admin_policy_utils.py +84 -38
- sky/utils/annotations.py +38 -5
- sky/utils/asyncio_utils.py +78 -0
- sky/utils/atomic.py +1 -1
- sky/utils/auth_utils.py +153 -0
- sky/utils/benchmark_utils.py +60 -0
- sky/utils/cli_utils/status_utils.py +159 -86
- sky/utils/cluster_utils.py +31 -9
- sky/utils/command_runner.py +354 -68
- sky/utils/command_runner.pyi +93 -3
- sky/utils/common.py +35 -8
- sky/utils/common_utils.py +314 -91
- sky/utils/config_utils.py +74 -5
- sky/utils/context.py +403 -0
- sky/utils/context_utils.py +242 -0
- sky/utils/controller_utils.py +383 -89
- sky/utils/dag_utils.py +31 -12
- sky/utils/db/__init__.py +0 -0
- sky/utils/db/db_utils.py +485 -0
- sky/utils/db/kv_cache.py +149 -0
- sky/utils/db/migration_utils.py +137 -0
- sky/utils/directory_utils.py +12 -0
- sky/utils/env_options.py +13 -0
- sky/utils/git.py +567 -0
- sky/utils/git_clone.sh +460 -0
- sky/utils/infra_utils.py +195 -0
- sky/utils/kubernetes/cleanup-tunnel.sh +62 -0
- sky/utils/kubernetes/config_map_utils.py +133 -0
- sky/utils/kubernetes/create_cluster.sh +15 -29
- sky/utils/kubernetes/delete_cluster.sh +10 -7
- sky/utils/kubernetes/deploy_ssh_node_pools.py +1177 -0
- sky/utils/kubernetes/exec_kubeconfig_converter.py +22 -31
- sky/utils/kubernetes/generate_kind_config.py +6 -66
- sky/utils/kubernetes/generate_kubeconfig.sh +4 -1
- sky/utils/kubernetes/gpu_labeler.py +18 -8
- sky/utils/kubernetes/k8s_gpu_labeler_job.yaml +2 -1
- sky/utils/kubernetes/k8s_gpu_labeler_setup.yaml +16 -16
- sky/utils/kubernetes/kubernetes_deploy_utils.py +284 -114
- sky/utils/kubernetes/rsync_helper.sh +11 -3
- sky/utils/kubernetes/ssh-tunnel.sh +379 -0
- sky/utils/kubernetes/ssh_utils.py +221 -0
- sky/utils/kubernetes_enums.py +8 -15
- sky/utils/lock_events.py +94 -0
- sky/utils/locks.py +416 -0
- sky/utils/log_utils.py +82 -107
- sky/utils/perf_utils.py +22 -0
- sky/utils/resource_checker.py +298 -0
- sky/utils/resources_utils.py +249 -32
- sky/utils/rich_utils.py +217 -39
- sky/utils/schemas.py +955 -160
- sky/utils/serialize_utils.py +16 -0
- sky/utils/status_lib.py +10 -0
- sky/utils/subprocess_utils.py +29 -15
- sky/utils/tempstore.py +70 -0
- sky/utils/thread_utils.py +91 -0
- sky/utils/timeline.py +26 -53
- sky/utils/ux_utils.py +84 -15
- sky/utils/validator.py +11 -1
- sky/utils/volume.py +165 -0
- sky/utils/yaml_utils.py +111 -0
- sky/volumes/__init__.py +13 -0
- sky/volumes/client/__init__.py +0 -0
- sky/volumes/client/sdk.py +150 -0
- sky/volumes/server/__init__.py +0 -0
- sky/volumes/server/core.py +270 -0
- sky/volumes/server/server.py +124 -0
- sky/volumes/volume.py +215 -0
- sky/workspaces/__init__.py +0 -0
- sky/workspaces/core.py +655 -0
- sky/workspaces/server.py +101 -0
- sky/workspaces/utils.py +56 -0
- sky_templates/README.md +3 -0
- sky_templates/__init__.py +3 -0
- sky_templates/ray/__init__.py +0 -0
- sky_templates/ray/start_cluster +183 -0
- sky_templates/ray/stop_cluster +75 -0
- skypilot_nightly-1.0.0.dev20251203.dist-info/METADATA +676 -0
- skypilot_nightly-1.0.0.dev20251203.dist-info/RECORD +611 -0
- {skypilot_nightly-1.0.0.dev20250502.dist-info → skypilot_nightly-1.0.0.dev20251203.dist-info}/WHEEL +1 -1
- skypilot_nightly-1.0.0.dev20251203.dist-info/top_level.txt +2 -0
- sky/benchmark/benchmark_state.py +0 -256
- sky/benchmark/benchmark_utils.py +0 -641
- sky/clouds/service_catalog/constants.py +0 -7
- sky/dashboard/out/_next/static/GWvVBSCS7FmUiVmjaL1a7/_buildManifest.js +0 -1
- sky/dashboard/out/_next/static/chunks/236-2db3ee3fba33dd9e.js +0 -6
- sky/dashboard/out/_next/static/chunks/312-c3c8845990db8ffc.js +0 -15
- sky/dashboard/out/_next/static/chunks/37-0a572fe0dbb89c4d.js +0 -6
- sky/dashboard/out/_next/static/chunks/678-206dddca808e6d16.js +0 -59
- sky/dashboard/out/_next/static/chunks/845-9e60713e0c441abc.js +0 -1
- sky/dashboard/out/_next/static/chunks/979-7bf73a4c7cea0f5c.js +0 -1
- sky/dashboard/out/_next/static/chunks/fd9d1056-2821b0f0cabcd8bd.js +0 -1
- sky/dashboard/out/_next/static/chunks/framework-87d061ee6ed71b28.js +0 -33
- sky/dashboard/out/_next/static/chunks/main-app-241eb28595532291.js +0 -1
- sky/dashboard/out/_next/static/chunks/main-e0e2335212e72357.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/_app-e6b013bc3f77ad60.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/_error-1be831200e60c5c0.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-6ac338bc2239cb45.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-f383db7389368ea7.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/clusters-a93b93e10b8b074e.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/index-f9f039532ca8cbc4.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-1c519e1afc523dc9.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/jobs-a75029b67aab6a2e.js +0 -1
- sky/dashboard/out/_next/static/chunks/webpack-830f59b8404e96b8.js +0 -1
- sky/dashboard/out/_next/static/css/c6933bbb2ce7f4dd.css +0 -3
- sky/jobs/dashboard/dashboard.py +0 -223
- sky/jobs/dashboard/static/favicon.ico +0 -0
- sky/jobs/dashboard/templates/index.html +0 -831
- sky/jobs/server/dashboard_utils.py +0 -69
- sky/skylet/providers/scp/__init__.py +0 -2
- sky/skylet/providers/scp/config.py +0 -149
- sky/skylet/providers/scp/node_provider.py +0 -578
- sky/templates/kubernetes-ssh-jump.yml.j2 +0 -94
- sky/utils/db_utils.py +0 -100
- sky/utils/kubernetes/deploy_remote_cluster.sh +0 -308
- sky/utils/kubernetes/ssh_jump_lifecycle_manager.py +0 -191
- skypilot_nightly-1.0.0.dev20250502.dist-info/METADATA +0 -361
- skypilot_nightly-1.0.0.dev20250502.dist-info/RECORD +0 -396
- skypilot_nightly-1.0.0.dev20250502.dist-info/top_level.txt +0 -1
- /sky/{clouds/service_catalog → catalog}/config.py +0 -0
- /sky/{benchmark → catalog/data_fetchers}/__init__.py +0 -0
- /sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_azure.py +0 -0
- /sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_fluidstack.py +0 -0
- /sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_ibm.py +0 -0
- /sky/{clouds/service_catalog/data_fetchers → client/cli}/__init__.py +0 -0
- /sky/dashboard/out/_next/static/{GWvVBSCS7FmUiVmjaL1a7 → 96_E2yl3QAiIJGOYCkSpB}/_ssgManifest.js +0 -0
- {skypilot_nightly-1.0.0.dev20250502.dist-info → skypilot_nightly-1.0.0.dev20251203.dist-info}/entry_points.txt +0 -0
- {skypilot_nightly-1.0.0.dev20250502.dist-info → skypilot_nightly-1.0.0.dev20251203.dist-info}/licenses/LICENSE +0 -0
sky/skylet/attempt_skylet.py
CHANGED
|
@@ -1,51 +1,133 @@
|
|
|
1
1
|
"""Restarts skylet if version does not match"""
|
|
2
2
|
|
|
3
3
|
import os
|
|
4
|
+
import signal
|
|
4
5
|
import subprocess
|
|
6
|
+
from typing import List, Optional, Tuple
|
|
7
|
+
|
|
8
|
+
import psutil
|
|
5
9
|
|
|
6
10
|
from sky.skylet import constants
|
|
11
|
+
from sky.skylet import runtime_utils
|
|
12
|
+
|
|
13
|
+
VERSION_FILE = runtime_utils.get_runtime_dir_path(constants.SKYLET_VERSION_FILE)
|
|
14
|
+
SKYLET_LOG_FILE = runtime_utils.get_runtime_dir_path(constants.SKYLET_LOG_FILE)
|
|
15
|
+
PID_FILE = runtime_utils.get_runtime_dir_path(constants.SKYLET_PID_FILE)
|
|
16
|
+
PORT_FILE = runtime_utils.get_runtime_dir_path(constants.SKYLET_PORT_FILE)
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
def _is_running_skylet_process(pid: int) -> bool:
|
|
20
|
+
if pid <= 0:
|
|
21
|
+
return False
|
|
22
|
+
try:
|
|
23
|
+
process = psutil.Process(pid)
|
|
24
|
+
if not process.is_running():
|
|
25
|
+
return False
|
|
26
|
+
# Check if command line contains the skylet module identifier
|
|
27
|
+
cmdline = process.cmdline()
|
|
28
|
+
return any('sky.skylet.skylet' in arg for arg in cmdline)
|
|
29
|
+
except (psutil.NoSuchProcess, psutil.AccessDenied, psutil.ZombieProcess,
|
|
30
|
+
OSError) as e:
|
|
31
|
+
print(f'Error checking if skylet process {pid} is running: {e}')
|
|
32
|
+
return False
|
|
33
|
+
|
|
34
|
+
|
|
35
|
+
def _find_running_skylet_pids() -> List[int]:
|
|
36
|
+
if os.path.exists(PID_FILE):
|
|
37
|
+
try:
|
|
38
|
+
with open(PID_FILE, 'r', encoding='utf-8') as pid_file:
|
|
39
|
+
pid = int(pid_file.read().strip())
|
|
40
|
+
if _is_running_skylet_process(pid):
|
|
41
|
+
return [pid]
|
|
42
|
+
except (OSError, ValueError, IOError) as e:
|
|
43
|
+
# Don't fallback to grep-based detection as the existence of the
|
|
44
|
+
# PID file implies that we are on the new version, and there is
|
|
45
|
+
# possibility of there being multiple skylet processes running,
|
|
46
|
+
# and we don't want to accidentally kill the wrong skylet(s).
|
|
47
|
+
print(f'Error reading PID file {PID_FILE}: {e}')
|
|
48
|
+
return []
|
|
49
|
+
else:
|
|
50
|
+
# Fall back to grep-based detection for backward compatibility.
|
|
51
|
+
pids = []
|
|
52
|
+
# We use -m to grep instead of {constants.SKY_PYTHON_CMD} -m to grep
|
|
53
|
+
# because need to handle the backward compatibility of the old skylet
|
|
54
|
+
# started before #3326, which does not use the full path to python.
|
|
55
|
+
proc = subprocess.run(
|
|
56
|
+
'ps aux | grep -v "grep" | grep "sky.skylet.skylet" | grep " -m"',
|
|
57
|
+
shell=True,
|
|
58
|
+
check=False,
|
|
59
|
+
capture_output=True,
|
|
60
|
+
text=True)
|
|
61
|
+
if proc.returncode == 0:
|
|
62
|
+
# Parse the output to extract PIDs (column 2)
|
|
63
|
+
for line in proc.stdout.strip().split('\n'):
|
|
64
|
+
if line:
|
|
65
|
+
parts = line.split()
|
|
66
|
+
if len(parts) >= 2:
|
|
67
|
+
try:
|
|
68
|
+
pids.append(int(parts[1]))
|
|
69
|
+
except ValueError:
|
|
70
|
+
continue
|
|
71
|
+
return pids
|
|
72
|
+
|
|
73
|
+
|
|
74
|
+
def _check_version_match() -> Tuple[bool, Optional[str]]:
|
|
75
|
+
"""Check if the version file matches the current skylet version.
|
|
7
76
|
|
|
8
|
-
|
|
77
|
+
Returns:
|
|
78
|
+
Tuple of (version_match: bool, version: str or None)
|
|
79
|
+
"""
|
|
80
|
+
version: Optional[str] = None
|
|
81
|
+
if os.path.exists(VERSION_FILE):
|
|
82
|
+
try:
|
|
83
|
+
with open(VERSION_FILE, 'r', encoding='utf-8') as f:
|
|
84
|
+
version = f.read().strip()
|
|
85
|
+
return version == constants.SKYLET_VERSION, version
|
|
86
|
+
except (OSError, IOError):
|
|
87
|
+
pass
|
|
88
|
+
return False, version
|
|
9
89
|
|
|
10
90
|
|
|
11
91
|
def restart_skylet():
|
|
12
92
|
# Kills old skylet if it is running.
|
|
13
93
|
# TODO(zhwu): make the killing graceful, e.g., use a signal to tell
|
|
14
94
|
# skylet to exit, instead of directly killing it.
|
|
15
|
-
|
|
16
|
-
|
|
17
|
-
|
|
18
|
-
|
|
19
|
-
|
|
20
|
-
|
|
21
|
-
|
|
22
|
-
|
|
95
|
+
|
|
96
|
+
# Find and kill running skylet processes
|
|
97
|
+
for pid in _find_running_skylet_pids():
|
|
98
|
+
try:
|
|
99
|
+
os.kill(pid, signal.SIGKILL)
|
|
100
|
+
except (OSError, ProcessLookupError):
|
|
101
|
+
# Process died between detection and kill
|
|
102
|
+
pass
|
|
103
|
+
# Clean up the PID file
|
|
104
|
+
try:
|
|
105
|
+
os.remove(PID_FILE)
|
|
106
|
+
except OSError:
|
|
107
|
+
pass # Best effort cleanup
|
|
108
|
+
|
|
109
|
+
port = constants.SKYLET_GRPC_PORT
|
|
23
110
|
subprocess.run(
|
|
24
111
|
# We have made sure that `attempt_skylet.py` is executed with the
|
|
25
112
|
# skypilot runtime env activated, so that skylet can access the cloud
|
|
26
113
|
# CLI tools.
|
|
27
|
-
f'nohup {constants.SKY_PYTHON_CMD} -m sky.skylet.skylet'
|
|
28
|
-
'
|
|
114
|
+
f'nohup {constants.SKY_PYTHON_CMD} -m sky.skylet.skylet '
|
|
115
|
+
f'--port={port} '
|
|
116
|
+
f'>> {SKYLET_LOG_FILE} 2>&1 & echo $! > {PID_FILE}',
|
|
29
117
|
shell=True,
|
|
30
118
|
check=True)
|
|
119
|
+
|
|
120
|
+
with open(PORT_FILE, 'w', encoding='utf-8') as pf:
|
|
121
|
+
pf.write(str(port))
|
|
122
|
+
|
|
31
123
|
with open(VERSION_FILE, 'w', encoding='utf-8') as v_f:
|
|
32
124
|
v_f.write(constants.SKYLET_VERSION)
|
|
33
125
|
|
|
34
126
|
|
|
35
|
-
|
|
36
|
-
|
|
37
|
-
shell=True,
|
|
38
|
-
check=False)
|
|
39
|
-
|
|
40
|
-
running = (proc.returncode == 0)
|
|
127
|
+
# Check if our skylet is running
|
|
128
|
+
running = bool(_find_running_skylet_pids())
|
|
41
129
|
|
|
42
|
-
version_match =
|
|
43
|
-
found_version = None
|
|
44
|
-
if os.path.exists(VERSION_FILE):
|
|
45
|
-
with open(VERSION_FILE, 'r', encoding='utf-8') as f:
|
|
46
|
-
found_version = f.read().strip()
|
|
47
|
-
if found_version == constants.SKYLET_VERSION:
|
|
48
|
-
version_match = True
|
|
130
|
+
version_match, found_version = _check_version_match()
|
|
49
131
|
|
|
50
132
|
version_string = (f' (found version {found_version}, new version '
|
|
51
133
|
f'{constants.SKYLET_VERSION})')
|
sky/skylet/autostop_lib.py
CHANGED
|
@@ -1,6 +1,8 @@
|
|
|
1
1
|
"""Autostop utilities."""
|
|
2
|
+
import enum
|
|
2
3
|
import pickle
|
|
3
4
|
import shlex
|
|
5
|
+
import subprocess
|
|
4
6
|
import time
|
|
5
7
|
import typing
|
|
6
8
|
from typing import List, Optional
|
|
@@ -10,11 +12,17 @@ from sky.adaptors import common as adaptors_common
|
|
|
10
12
|
from sky.skylet import configs
|
|
11
13
|
from sky.skylet import constants
|
|
12
14
|
from sky.utils import message_utils
|
|
15
|
+
from sky.utils import ux_utils
|
|
13
16
|
|
|
14
17
|
if typing.TYPE_CHECKING:
|
|
15
18
|
import psutil
|
|
19
|
+
|
|
20
|
+
from sky.schemas.generated import autostopv1_pb2
|
|
16
21
|
else:
|
|
17
22
|
psutil = adaptors_common.LazyImport('psutil')
|
|
23
|
+
# To avoid requiring protobuf to be installed on the client side.
|
|
24
|
+
autostopv1_pb2 = adaptors_common.LazyImport(
|
|
25
|
+
'sky.schemas.generated.autostopv1_pb2')
|
|
18
26
|
|
|
19
27
|
logger = sky_logging.init_logger(__name__)
|
|
20
28
|
|
|
@@ -30,6 +38,83 @@ _AUTOSTOP_LAST_ACTIVE_TIME = 'autostop_last_active_time'
|
|
|
30
38
|
_AUTOSTOP_INDICATOR = 'autostop_indicator'
|
|
31
39
|
|
|
32
40
|
|
|
41
|
+
class AutostopWaitFor(enum.Enum):
    """Conditions that must clear before the autostop idle timer may fire.

    JOBS_AND_SSH: Wait for jobs to finish and all SSH sessions to be closed.
    JOBS: Wait for jobs to finish.
    NONE: Unconditionally stop the cluster after the idle time.
    """
    # Definition order matters: supported_modes() reports values in this order.
    JOBS_AND_SSH = 'jobs_and_ssh'
    JOBS = 'jobs'
    NONE = 'none'

    @classmethod
    def supported_modes(cls) -> List[str]:
        """Returns the string value of every mode, in definition order."""
        return [member.value for member in cls]

    @classmethod
    def cli_help_message(cls, pair: str) -> str:
        """Returns the CLI help text for the wait-for option.

        Args:
            pair: Name of the companion option (without leading dashes) that
                this option works together with; interpolated into the text.
        """
        # NOTE: this is deliberately not a raw string, so the '\b' line below
        # becomes a literal backspace character in the returned text.
        help_text = f"""\
Determines the condition for resetting the idleness timer.
This option works in conjunction with ``--{pair}``. Options:

\b
1. ``jobs_and_ssh`` (default): Wait for in-progress jobs and SSH connections to finish.
2. ``jobs``: Only wait for in-progress jobs.
3. ``none``: Wait for nothing; autostop right after ``{pair}``."""
        return help_text

    @classmethod
    def from_str(cls, mode: str) -> 'AutostopWaitFor':
        """Returns the enum member matching ``mode`` (case-insensitive).

        Raises:
            ValueError: if ``mode`` is not one of the supported mode strings.
        """
        normalized = mode.lower()
        for candidate in (cls.JOBS, cls.JOBS_AND_SSH, cls.NONE):
            if normalized == candidate.value:
                return candidate
        with ux_utils.print_exception_no_traceback():
            raise ValueError(f'Unsupported autostop wait mode: '
                             f'{mode}. The mode must be either '
                             f'\'{cls.JOBS_AND_SSH.value}\', '
                             f'\'{cls.JOBS.value}\', or '
                             f'\'{cls.NONE.value}\'. ')

    @classmethod
    def from_protobuf(
        cls, protobuf_value: 'autostopv1_pb2.AutostopWaitFor'
    ) -> Optional['AutostopWaitFor']:
        """Translates a protobuf AutostopWaitFor value into this enum.

        Returns:
            The matching member, or None for AUTOSTOP_WAIT_FOR_UNSPECIFIED.

        Raises:
            ValueError: if the protobuf value is not a known one.
        """
        from_proto = {
            autostopv1_pb2.AUTOSTOP_WAIT_FOR_JOBS_AND_SSH: cls.JOBS_AND_SSH,
            autostopv1_pb2.AUTOSTOP_WAIT_FOR_JOBS: cls.JOBS,
            autostopv1_pb2.AUTOSTOP_WAIT_FOR_NONE: cls.NONE,
            autostopv1_pb2.AUTOSTOP_WAIT_FOR_UNSPECIFIED: None,
        }
        if protobuf_value in from_proto:
            return from_proto[protobuf_value]
        with ux_utils.print_exception_no_traceback():
            raise ValueError(
                f'Unknown protobuf AutostopWaitFor value: {protobuf_value}')

    def to_protobuf(self) -> 'autostopv1_pb2.AutostopWaitFor':
        """Translates this member into the protobuf AutostopWaitFor value.

        Raises:
            ValueError: if this member has no protobuf counterpart.
        """
        to_proto = {
            AutostopWaitFor.JOBS_AND_SSH:
                autostopv1_pb2.AUTOSTOP_WAIT_FOR_JOBS_AND_SSH,
            AutostopWaitFor.JOBS: autostopv1_pb2.AUTOSTOP_WAIT_FOR_JOBS,
            AutostopWaitFor.NONE: autostopv1_pb2.AUTOSTOP_WAIT_FOR_NONE,
        }
        if self in to_proto:
            return to_proto[self]
        with ux_utils.print_exception_no_traceback():
            raise ValueError(f'Unknown AutostopWaitFor value: {self}')
|
|
113
|
+
|
|
114
|
+
|
|
115
|
+
# Wait mode applied when no explicit mode is given: the cluster is kept up
# while jobs are in progress or SSH sessions are open.
DEFAULT_AUTOSTOP_WAIT_FOR: AutostopWaitFor = AutostopWaitFor.JOBS_AND_SSH
|
|
116
|
+
|
|
117
|
+
|
|
33
118
|
class AutostopConfig:
|
|
34
119
|
"""Autostop configuration."""
|
|
35
120
|
|
|
@@ -37,12 +122,14 @@ class AutostopConfig:
|
|
|
37
122
|
autostop_idle_minutes: int,
|
|
38
123
|
boot_time: float,
|
|
39
124
|
backend: Optional[str],
|
|
125
|
+
wait_for: AutostopWaitFor,
|
|
40
126
|
down: bool = False):
|
|
41
127
|
assert autostop_idle_minutes < 0 or backend is not None, (
|
|
42
128
|
autostop_idle_minutes, backend)
|
|
43
129
|
self.autostop_idle_minutes = autostop_idle_minutes
|
|
44
130
|
self.boot_time = boot_time
|
|
45
131
|
self.backend = backend
|
|
132
|
+
self.wait_for = wait_for
|
|
46
133
|
self.down = down
|
|
47
134
|
|
|
48
135
|
def __setstate__(self, state: dict):
|
|
@@ -53,15 +140,18 @@ class AutostopConfig:
|
|
|
53
140
|
def get_autostop_config() -> AutostopConfig:
    """Loads the stored autostop configuration.

    Returns:
        The AutostopConfig unpickled from the value saved under
        _AUTOSTOP_CONFIG_KEY, or a placeholder config (idle minutes -1,
        boot time -1, no backend, default wait mode) if nothing is stored.
    """
    serialized = configs.get_config(_AUTOSTOP_CONFIG_KEY)
    if serialized is not None:
        return pickle.loads(serialized)
    return AutostopConfig(-1, -1, None, DEFAULT_AUTOSTOP_WAIT_FOR)
|
|
58
145
|
|
|
59
146
|
|
|
60
|
-
def set_autostop(idle_minutes: int, backend: Optional[str],
|
|
147
|
+
def set_autostop(idle_minutes: int, backend: Optional[str],
|
|
148
|
+
wait_for: AutostopWaitFor, down: bool) -> None:
|
|
61
149
|
boot_time = psutil.boot_time()
|
|
62
|
-
autostop_config = AutostopConfig(idle_minutes, boot_time, backend,
|
|
150
|
+
autostop_config = AutostopConfig(idle_minutes, boot_time, backend, wait_for,
|
|
151
|
+
down)
|
|
63
152
|
configs.set_config(_AUTOSTOP_CONFIG_KEY, pickle.dumps(autostop_config))
|
|
64
|
-
logger.debug(f'set_autostop(): idle_minutes {idle_minutes}, down {down}
|
|
153
|
+
logger.debug(f'set_autostop(): idle_minutes {idle_minutes}, down {down}, '
|
|
154
|
+
f'wait_for {wait_for.value}.')
|
|
65
155
|
# Reset timer whenever an autostop setting is submitted, i.e. the idle
|
|
66
156
|
# time will be counted from now.
|
|
67
157
|
set_last_active_time_to_now()
|
|
@@ -107,6 +197,28 @@ def set_last_active_time_to_now() -> None:
|
|
|
107
197
|
configs.set_config(_AUTOSTOP_LAST_ACTIVE_TIME, str(time.time()))
|
|
108
198
|
|
|
109
199
|
|
|
200
|
+
def has_active_ssh_sessions() -> bool:
    """Returns True if there are any active SSH sessions on the node.

    Sessions are detected by counting the pseudo-terminal devices listed
    under /dev/pts. Any failure while checking is logged as a warning and
    reported as False.
    """
    # /dev/pts is a virtual filesystem holding the pseudo-terminal devices.
    # ptmx is the multiplexer ("master") device that creates new
    # pseudo-terminals, so it is filtered out before counting.
    count_pts_cmd = 'ls /dev/pts | grep -v ptmx | wc -l'
    try:
        result = subprocess.run(count_pts_cmd,
                                shell=True,
                                check=False,
                                text=True,
                                capture_output=True)
        if result.returncode == 0:
            return int(result.stdout.strip()) > 0
        logger.warning(f'SSH session check command failed with return code '
                       f'{result.returncode}.')
        return False
    except Exception as e:  # pylint: disable=broad-except
        logger.warning(f'Error checking active SSH sessions: {e}.')
        return False
|
|
220
|
+
|
|
221
|
+
|
|
110
222
|
class AutostopCodeGen:
|
|
111
223
|
"""Code generator for autostop utility functions.
|
|
112
224
|
|
|
@@ -114,13 +226,22 @@ class AutostopCodeGen:
|
|
|
114
226
|
|
|
115
227
|
>> codegen = AutostopCodeGen.set_autostop(...)
|
|
116
228
|
"""
|
|
117
|
-
_PREFIX = ['from sky.skylet import autostop_lib']
|
|
229
|
+
_PREFIX = ['from sky.skylet import autostop_lib, constants']
|
|
118
230
|
|
|
119
231
|
@classmethod
def set_autostop(cls,
                 idle_minutes: int,
                 backend: str,
                 wait_for: Optional[AutostopWaitFor],
                 down: bool = False) -> str:
    """Generates code that invokes autostop_lib.set_autostop.

    The generated snippet checks constants.SKYLET_LIB_VERSION where it runs
    (treated as 1 when the attribute is missing): below 4 it uses the
    three-argument call without a wait mode, otherwise it passes the wait
    mode as well.

    Args:
        idle_minutes: Idle minutes to pass through to set_autostop.
        backend: Backend name to pass through (embedded via repr).
        wait_for: Wait mode; None selects DEFAULT_AUTOSTOP_WAIT_FOR.
        down: Value for set_autostop's ``down`` argument.

    Returns:
        The result of cls._build() on the generated snippet.
    """
    effective_wait_for = (DEFAULT_AUTOSTOP_WAIT_FOR
                          if wait_for is None else wait_for)
    legacy_call = (
        f'autostop_lib.set_autostop({idle_minutes}, {backend!r}, {down})')
    # str() of the enum member renders as 'AutostopWaitFor.<NAME>', which
    # the generated code resolves through the autostop_lib module.
    current_call = (
        f'autostop_lib.set_autostop({idle_minutes}, {backend!r}, '
        f'autostop_lib.{effective_wait_for}, {down})')
    snippet = ('\nif getattr(constants, "SKYLET_LIB_VERSION", 1) < 4: '
               f'\n {legacy_call}'
               '\nelse: '
               f'\n {current_call}')
    return cls._build([snippet])
|
|
126
247
|
|
sky/skylet/configs.py
CHANGED
|
@@ -2,17 +2,17 @@
|
|
|
2
2
|
import functools
|
|
3
3
|
import os
|
|
4
4
|
import pathlib
|
|
5
|
+
import threading
|
|
5
6
|
from typing import Callable, Optional, Union
|
|
6
7
|
|
|
7
|
-
from sky.
|
|
8
|
+
from sky.skylet import runtime_utils
|
|
9
|
+
from sky.utils.db import db_utils
|
|
8
10
|
|
|
9
|
-
_DB_PATH =
|
|
10
|
-
|
|
11
|
+
_DB_PATH = None
|
|
12
|
+
_db_init_lock = threading.Lock()
|
|
11
13
|
|
|
12
|
-
_table_created = False
|
|
13
14
|
|
|
14
|
-
|
|
15
|
-
def ensure_table(func: Callable):
|
|
15
|
+
def init_db(func: Callable):
|
|
16
16
|
"""Ensure the table exists before calling the function.
|
|
17
17
|
|
|
18
18
|
Since this module will be imported whenever `sky` is imported (due to
|
|
@@ -24,25 +24,33 @@ def ensure_table(func: Callable):
|
|
|
24
24
|
|
|
25
25
|
@functools.wraps(func)
|
|
26
26
|
def wrapper(*args, **kwargs):
|
|
27
|
-
global
|
|
28
|
-
if not
|
|
29
|
-
|
|
30
|
-
|
|
31
|
-
|
|
32
|
-
|
|
33
|
-
|
|
34
|
-
|
|
35
|
-
|
|
36
|
-
|
|
37
|
-
|
|
38
|
-
|
|
27
|
+
global _DB_PATH
|
|
28
|
+
if _DB_PATH is not None:
|
|
29
|
+
return func(*args, **kwargs)
|
|
30
|
+
|
|
31
|
+
with _db_init_lock:
|
|
32
|
+
if _DB_PATH is None:
|
|
33
|
+
_DB_PATH = runtime_utils.get_runtime_dir_path(
|
|
34
|
+
'.sky/skylet_config.db')
|
|
35
|
+
os.makedirs(pathlib.Path(_DB_PATH).parents[0], exist_ok=True)
|
|
36
|
+
with db_utils.safe_cursor(
|
|
37
|
+
_DB_PATH
|
|
38
|
+
) as c: # Call it 'c' to avoid pylint complaining.
|
|
39
|
+
# Use WAL mode to avoid locking problem in #1507.
|
|
40
|
+
# Reference: https://stackoverflow.com/a/39265148
|
|
41
|
+
c.execute('PRAGMA journal_mode=WAL')
|
|
42
|
+
c.execute("""\
|
|
43
|
+
CREATE TABLE IF NOT EXISTS config (
|
|
44
|
+
key TEXT PRIMARY KEY,
|
|
45
|
+
value TEXT)""")
|
|
39
46
|
return func(*args, **kwargs)
|
|
40
47
|
|
|
41
48
|
return wrapper
|
|
42
49
|
|
|
43
50
|
|
|
44
|
-
@
|
|
51
|
+
@init_db
|
|
45
52
|
def get_config(key: str) -> Optional[bytes]:
|
|
53
|
+
assert _DB_PATH is not None
|
|
46
54
|
with db_utils.safe_cursor(_DB_PATH) as cursor:
|
|
47
55
|
rows = cursor.execute('SELECT value FROM config WHERE key = ?', (key,))
|
|
48
56
|
for (value,) in rows:
|
|
@@ -50,8 +58,9 @@ def get_config(key: str) -> Optional[bytes]:
|
|
|
50
58
|
return None
|
|
51
59
|
|
|
52
60
|
|
|
53
|
-
@
|
|
61
|
+
@init_db
|
|
54
62
|
def set_config(key: str, value: Union[bytes, str]) -> None:
|
|
63
|
+
assert _DB_PATH is not None
|
|
55
64
|
with db_utils.safe_cursor(_DB_PATH) as cursor:
|
|
56
65
|
cursor.execute(
|
|
57
66
|
"""\
|