skypilot-nightly 1.0.0.dev20250502__py3-none-any.whl → 1.0.0.dev20251203__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sky/__init__.py +22 -6
- sky/adaptors/aws.py +81 -16
- sky/adaptors/common.py +25 -2
- sky/adaptors/coreweave.py +278 -0
- sky/adaptors/do.py +8 -2
- sky/adaptors/gcp.py +11 -0
- sky/adaptors/hyperbolic.py +8 -0
- sky/adaptors/ibm.py +5 -2
- sky/adaptors/kubernetes.py +149 -18
- sky/adaptors/nebius.py +173 -30
- sky/adaptors/primeintellect.py +1 -0
- sky/adaptors/runpod.py +68 -0
- sky/adaptors/seeweb.py +183 -0
- sky/adaptors/shadeform.py +89 -0
- sky/admin_policy.py +187 -4
- sky/authentication.py +179 -225
- sky/backends/__init__.py +4 -2
- sky/backends/backend.py +22 -9
- sky/backends/backend_utils.py +1323 -397
- sky/backends/cloud_vm_ray_backend.py +1749 -1029
- sky/backends/docker_utils.py +1 -1
- sky/backends/local_docker_backend.py +11 -6
- sky/backends/task_codegen.py +633 -0
- sky/backends/wheel_utils.py +55 -9
- sky/{clouds/service_catalog → catalog}/__init__.py +21 -19
- sky/{clouds/service_catalog → catalog}/aws_catalog.py +27 -8
- sky/{clouds/service_catalog → catalog}/azure_catalog.py +10 -7
- sky/{clouds/service_catalog → catalog}/common.py +90 -49
- sky/{clouds/service_catalog → catalog}/cudo_catalog.py +8 -5
- sky/{clouds/service_catalog → catalog}/data_fetchers/analyze.py +1 -1
- sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_aws.py +116 -80
- sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_cudo.py +38 -38
- sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_gcp.py +70 -16
- sky/catalog/data_fetchers/fetch_hyperbolic.py +136 -0
- sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_lambda_cloud.py +1 -0
- sky/catalog/data_fetchers/fetch_nebius.py +338 -0
- sky/catalog/data_fetchers/fetch_runpod.py +698 -0
- sky/catalog/data_fetchers/fetch_seeweb.py +329 -0
- sky/catalog/data_fetchers/fetch_shadeform.py +142 -0
- sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_vast.py +1 -1
- sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_vsphere.py +1 -1
- sky/{clouds/service_catalog → catalog}/do_catalog.py +5 -2
- sky/{clouds/service_catalog → catalog}/fluidstack_catalog.py +6 -3
- sky/{clouds/service_catalog → catalog}/gcp_catalog.py +41 -15
- sky/catalog/hyperbolic_catalog.py +136 -0
- sky/{clouds/service_catalog → catalog}/ibm_catalog.py +9 -6
- sky/{clouds/service_catalog → catalog}/kubernetes_catalog.py +36 -24
- sky/{clouds/service_catalog → catalog}/lambda_catalog.py +9 -6
- sky/{clouds/service_catalog → catalog}/nebius_catalog.py +9 -7
- sky/{clouds/service_catalog → catalog}/oci_catalog.py +9 -6
- sky/{clouds/service_catalog → catalog}/paperspace_catalog.py +5 -2
- sky/catalog/primeintellect_catalog.py +95 -0
- sky/{clouds/service_catalog → catalog}/runpod_catalog.py +11 -4
- sky/{clouds/service_catalog → catalog}/scp_catalog.py +9 -6
- sky/catalog/seeweb_catalog.py +184 -0
- sky/catalog/shadeform_catalog.py +165 -0
- sky/catalog/ssh_catalog.py +167 -0
- sky/{clouds/service_catalog → catalog}/vast_catalog.py +6 -3
- sky/{clouds/service_catalog → catalog}/vsphere_catalog.py +5 -2
- sky/check.py +533 -185
- sky/cli.py +5 -5975
- sky/client/{cli.py → cli/command.py} +2591 -1956
- sky/client/cli/deprecation_utils.py +99 -0
- sky/client/cli/flags.py +359 -0
- sky/client/cli/table_utils.py +322 -0
- sky/client/cli/utils.py +79 -0
- sky/client/common.py +78 -32
- sky/client/oauth.py +82 -0
- sky/client/sdk.py +1219 -319
- sky/client/sdk_async.py +827 -0
- sky/client/service_account_auth.py +47 -0
- sky/cloud_stores.py +82 -3
- sky/clouds/__init__.py +13 -0
- sky/clouds/aws.py +564 -164
- sky/clouds/azure.py +105 -83
- sky/clouds/cloud.py +140 -40
- sky/clouds/cudo.py +68 -50
- sky/clouds/do.py +66 -48
- sky/clouds/fluidstack.py +63 -44
- sky/clouds/gcp.py +339 -110
- sky/clouds/hyperbolic.py +293 -0
- sky/clouds/ibm.py +70 -49
- sky/clouds/kubernetes.py +570 -162
- sky/clouds/lambda_cloud.py +74 -54
- sky/clouds/nebius.py +210 -81
- sky/clouds/oci.py +88 -66
- sky/clouds/paperspace.py +61 -44
- sky/clouds/primeintellect.py +317 -0
- sky/clouds/runpod.py +164 -74
- sky/clouds/scp.py +89 -86
- sky/clouds/seeweb.py +477 -0
- sky/clouds/shadeform.py +400 -0
- sky/clouds/ssh.py +263 -0
- sky/clouds/utils/aws_utils.py +10 -4
- sky/clouds/utils/gcp_utils.py +87 -11
- sky/clouds/utils/oci_utils.py +38 -14
- sky/clouds/utils/scp_utils.py +231 -167
- sky/clouds/vast.py +99 -77
- sky/clouds/vsphere.py +51 -40
- sky/core.py +375 -173
- sky/dag.py +15 -0
- sky/dashboard/out/404.html +1 -1
- sky/dashboard/out/_next/static/96_E2yl3QAiIJGOYCkSpB/_buildManifest.js +1 -0
- sky/dashboard/out/_next/static/chunks/1141-e6aa9ab418717c59.js +11 -0
- sky/dashboard/out/_next/static/chunks/1272-1ef0bf0237faccdb.js +1 -0
- sky/dashboard/out/_next/static/chunks/1871-7e202677c42f43fe.js +6 -0
- sky/dashboard/out/_next/static/chunks/2260-7703229c33c5ebd5.js +1 -0
- sky/dashboard/out/_next/static/chunks/2350.fab69e61bac57b23.js +1 -0
- sky/dashboard/out/_next/static/chunks/2369.fc20f0c2c8ed9fe7.js +15 -0
- sky/dashboard/out/_next/static/chunks/2755.edd818326d489a1d.js +26 -0
- sky/dashboard/out/_next/static/chunks/3294.20a8540fe697d5ee.js +1 -0
- sky/dashboard/out/_next/static/chunks/3785.7e245f318f9d1121.js +1 -0
- sky/dashboard/out/_next/static/chunks/3800-7b45f9fbb6308557.js +1 -0
- sky/dashboard/out/_next/static/chunks/3850-ff4a9a69d978632b.js +1 -0
- sky/dashboard/out/_next/static/chunks/3937.210053269f121201.js +1 -0
- sky/dashboard/out/_next/static/chunks/4725.172ede95d1b21022.js +1 -0
- sky/dashboard/out/_next/static/chunks/4937.a2baa2df5572a276.js +15 -0
- sky/dashboard/out/_next/static/chunks/5739-d67458fcb1386c92.js +8 -0
- sky/dashboard/out/_next/static/chunks/6130-2be46d70a38f1e82.js +1 -0
- sky/dashboard/out/_next/static/chunks/616-3d59f75e2ccf9321.js +39 -0
- sky/dashboard/out/_next/static/chunks/6212-7bd06f60ba693125.js +13 -0
- sky/dashboard/out/_next/static/chunks/6856-8f27d1c10c98def8.js +1 -0
- sky/dashboard/out/_next/static/chunks/6989-01359c57e018caa4.js +1 -0
- sky/dashboard/out/_next/static/chunks/6990-9146207c4567fdfd.js +1 -0
- sky/dashboard/out/_next/static/chunks/7359-c8d04e06886000b3.js +30 -0
- sky/dashboard/out/_next/static/chunks/7411-b15471acd2cba716.js +41 -0
- sky/dashboard/out/_next/static/chunks/7615-019513abc55b3b47.js +1 -0
- sky/dashboard/out/_next/static/chunks/8640.5b9475a2d18c5416.js +16 -0
- sky/dashboard/out/_next/static/chunks/8969-452f9d5cbdd2dc73.js +1 -0
- sky/dashboard/out/_next/static/chunks/9025.fa408f3242e9028d.js +6 -0
- sky/dashboard/out/_next/static/chunks/9353-cff34f7e773b2e2b.js +1 -0
- sky/dashboard/out/_next/static/chunks/9360.a536cf6b1fa42355.js +31 -0
- sky/dashboard/out/_next/static/chunks/9847.3aaca6bb33455140.js +30 -0
- sky/dashboard/out/_next/static/chunks/fd9d1056-86323a29a8f7e46a.js +1 -0
- sky/dashboard/out/_next/static/chunks/framework-cf60a09ccd051a10.js +33 -0
- sky/dashboard/out/_next/static/chunks/main-app-587214043926b3cc.js +1 -0
- sky/dashboard/out/_next/static/chunks/main-f15ccb73239a3bf1.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/_app-bde01e4a2beec258.js +34 -0
- sky/dashboard/out/_next/static/chunks/pages/_error-c66a4e8afc46f17b.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-792db96d918c98c9.js +16 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-abfcac9c137aa543.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters-ee39056f9851a3ff.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/config-dfb9bf07b13045f4.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/index-444f1804401f04ea.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/infra/[context]-c0b5935149902e6f.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/infra-aed0ea19df7cf961.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-d66997e2bfc837cf.js +16 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs/pools/[pool]-9faf940b253e3e06.js +21 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs-2072b48b617989c9.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/users-f42674164aa73423.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/volumes-b84b948ff357c43e.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/workspace/new-3f88a1c7e86a3f86.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/workspaces/[name]-84a40f8c7c627fe4.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/workspaces-531b2f8c4bf89f82.js +1 -0
- sky/dashboard/out/_next/static/chunks/webpack-64e05f17bf2cf8ce.js +1 -0
- sky/dashboard/out/_next/static/css/0748ce22df867032.css +3 -0
- sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
- sky/dashboard/out/clusters/[cluster].html +1 -1
- sky/dashboard/out/clusters.html +1 -1
- sky/dashboard/out/config.html +1 -0
- sky/dashboard/out/index.html +1 -1
- sky/dashboard/out/infra/[context].html +1 -0
- sky/dashboard/out/infra.html +1 -0
- sky/dashboard/out/jobs/[job].html +1 -1
- sky/dashboard/out/jobs/pools/[pool].html +1 -0
- sky/dashboard/out/jobs.html +1 -1
- sky/dashboard/out/users.html +1 -0
- sky/dashboard/out/volumes.html +1 -0
- sky/dashboard/out/workspace/new.html +1 -0
- sky/dashboard/out/workspaces/[name].html +1 -0
- sky/dashboard/out/workspaces.html +1 -0
- sky/data/data_utils.py +137 -1
- sky/data/mounting_utils.py +269 -84
- sky/data/storage.py +1460 -1807
- sky/data/storage_utils.py +43 -57
- sky/exceptions.py +126 -2
- sky/execution.py +216 -63
- sky/global_user_state.py +2390 -586
- sky/jobs/__init__.py +7 -0
- sky/jobs/client/sdk.py +300 -58
- sky/jobs/client/sdk_async.py +161 -0
- sky/jobs/constants.py +15 -8
- sky/jobs/controller.py +848 -275
- sky/jobs/file_content_utils.py +128 -0
- sky/jobs/log_gc.py +193 -0
- sky/jobs/recovery_strategy.py +402 -152
- sky/jobs/scheduler.py +314 -189
- sky/jobs/server/core.py +836 -255
- sky/jobs/server/server.py +156 -115
- sky/jobs/server/utils.py +136 -0
- sky/jobs/state.py +2109 -706
- sky/jobs/utils.py +1306 -215
- sky/logs/__init__.py +21 -0
- sky/logs/agent.py +108 -0
- sky/logs/aws.py +243 -0
- sky/logs/gcp.py +91 -0
- sky/metrics/__init__.py +0 -0
- sky/metrics/utils.py +453 -0
- sky/models.py +78 -1
- sky/optimizer.py +164 -70
- sky/provision/__init__.py +90 -4
- sky/provision/aws/config.py +147 -26
- sky/provision/aws/instance.py +136 -50
- sky/provision/azure/instance.py +11 -6
- sky/provision/common.py +13 -1
- sky/provision/cudo/cudo_machine_type.py +1 -1
- sky/provision/cudo/cudo_utils.py +14 -8
- sky/provision/cudo/cudo_wrapper.py +72 -71
- sky/provision/cudo/instance.py +10 -6
- sky/provision/do/instance.py +10 -6
- sky/provision/do/utils.py +4 -3
- sky/provision/docker_utils.py +140 -33
- sky/provision/fluidstack/instance.py +13 -8
- sky/provision/gcp/__init__.py +1 -0
- sky/provision/gcp/config.py +301 -19
- sky/provision/gcp/constants.py +218 -0
- sky/provision/gcp/instance.py +36 -8
- sky/provision/gcp/instance_utils.py +18 -4
- sky/provision/gcp/volume_utils.py +247 -0
- sky/provision/hyperbolic/__init__.py +12 -0
- sky/provision/hyperbolic/config.py +10 -0
- sky/provision/hyperbolic/instance.py +437 -0
- sky/provision/hyperbolic/utils.py +373 -0
- sky/provision/instance_setup.py +101 -20
- sky/provision/kubernetes/__init__.py +5 -0
- sky/provision/kubernetes/config.py +9 -52
- sky/provision/kubernetes/constants.py +17 -0
- sky/provision/kubernetes/instance.py +919 -280
- sky/provision/kubernetes/manifests/fusermount-server-daemonset.yaml +1 -2
- sky/provision/kubernetes/network.py +27 -17
- sky/provision/kubernetes/network_utils.py +44 -43
- sky/provision/kubernetes/utils.py +1221 -534
- sky/provision/kubernetes/volume.py +343 -0
- sky/provision/lambda_cloud/instance.py +22 -16
- sky/provision/nebius/constants.py +50 -0
- sky/provision/nebius/instance.py +19 -6
- sky/provision/nebius/utils.py +237 -137
- sky/provision/oci/instance.py +10 -5
- sky/provision/paperspace/instance.py +10 -7
- sky/provision/paperspace/utils.py +1 -1
- sky/provision/primeintellect/__init__.py +10 -0
- sky/provision/primeintellect/config.py +11 -0
- sky/provision/primeintellect/instance.py +454 -0
- sky/provision/primeintellect/utils.py +398 -0
- sky/provision/provisioner.py +117 -36
- sky/provision/runpod/__init__.py +5 -0
- sky/provision/runpod/instance.py +27 -6
- sky/provision/runpod/utils.py +51 -18
- sky/provision/runpod/volume.py +214 -0
- sky/provision/scp/__init__.py +15 -0
- sky/provision/scp/config.py +93 -0
- sky/provision/scp/instance.py +707 -0
- sky/provision/seeweb/__init__.py +11 -0
- sky/provision/seeweb/config.py +13 -0
- sky/provision/seeweb/instance.py +812 -0
- sky/provision/shadeform/__init__.py +11 -0
- sky/provision/shadeform/config.py +12 -0
- sky/provision/shadeform/instance.py +351 -0
- sky/provision/shadeform/shadeform_utils.py +83 -0
- sky/provision/ssh/__init__.py +18 -0
- sky/provision/vast/instance.py +13 -8
- sky/provision/vast/utils.py +10 -7
- sky/provision/volume.py +164 -0
- sky/provision/vsphere/common/ssl_helper.py +1 -1
- sky/provision/vsphere/common/vapiconnect.py +2 -1
- sky/provision/vsphere/common/vim_utils.py +4 -4
- sky/provision/vsphere/instance.py +15 -10
- sky/provision/vsphere/vsphere_utils.py +17 -20
- sky/py.typed +0 -0
- sky/resources.py +845 -119
- sky/schemas/__init__.py +0 -0
- sky/schemas/api/__init__.py +0 -0
- sky/schemas/api/responses.py +227 -0
- sky/schemas/db/README +4 -0
- sky/schemas/db/env.py +90 -0
- sky/schemas/db/global_user_state/001_initial_schema.py +124 -0
- sky/schemas/db/global_user_state/002_add_workspace_to_cluster_history.py +35 -0
- sky/schemas/db/global_user_state/003_fix_initial_revision.py +61 -0
- sky/schemas/db/global_user_state/004_is_managed.py +34 -0
- sky/schemas/db/global_user_state/005_cluster_event.py +32 -0
- sky/schemas/db/global_user_state/006_provision_log.py +41 -0
- sky/schemas/db/global_user_state/007_cluster_event_request_id.py +34 -0
- sky/schemas/db/global_user_state/008_skylet_ssh_tunnel_metadata.py +34 -0
- sky/schemas/db/global_user_state/009_last_activity_and_launched_at.py +89 -0
- sky/schemas/db/global_user_state/010_save_ssh_key.py +66 -0
- sky/schemas/db/global_user_state/011_is_ephemeral.py +34 -0
- sky/schemas/db/kv_cache/001_initial_schema.py +29 -0
- sky/schemas/db/script.py.mako +28 -0
- sky/schemas/db/serve_state/001_initial_schema.py +67 -0
- sky/schemas/db/serve_state/002_yaml_content.py +34 -0
- sky/schemas/db/skypilot_config/001_initial_schema.py +30 -0
- sky/schemas/db/spot_jobs/001_initial_schema.py +97 -0
- sky/schemas/db/spot_jobs/002_cluster_pool.py +42 -0
- sky/schemas/db/spot_jobs/003_pool_hash.py +34 -0
- sky/schemas/db/spot_jobs/004_job_file_contents.py +42 -0
- sky/schemas/db/spot_jobs/005_logs_gc.py +38 -0
- sky/schemas/db/spot_jobs/006_controller_pid_started_at.py +34 -0
- sky/schemas/db/spot_jobs/007_config_file_content.py +34 -0
- sky/schemas/generated/__init__.py +0 -0
- sky/schemas/generated/autostopv1_pb2.py +36 -0
- sky/schemas/generated/autostopv1_pb2.pyi +43 -0
- sky/schemas/generated/autostopv1_pb2_grpc.py +146 -0
- sky/schemas/generated/jobsv1_pb2.py +86 -0
- sky/schemas/generated/jobsv1_pb2.pyi +254 -0
- sky/schemas/generated/jobsv1_pb2_grpc.py +542 -0
- sky/schemas/generated/managed_jobsv1_pb2.py +76 -0
- sky/schemas/generated/managed_jobsv1_pb2.pyi +278 -0
- sky/schemas/generated/managed_jobsv1_pb2_grpc.py +278 -0
- sky/schemas/generated/servev1_pb2.py +58 -0
- sky/schemas/generated/servev1_pb2.pyi +115 -0
- sky/schemas/generated/servev1_pb2_grpc.py +322 -0
- sky/serve/autoscalers.py +357 -5
- sky/serve/client/impl.py +310 -0
- sky/serve/client/sdk.py +47 -139
- sky/serve/client/sdk_async.py +130 -0
- sky/serve/constants.py +12 -9
- sky/serve/controller.py +68 -17
- sky/serve/load_balancer.py +106 -60
- sky/serve/load_balancing_policies.py +116 -2
- sky/serve/replica_managers.py +434 -249
- sky/serve/serve_rpc_utils.py +179 -0
- sky/serve/serve_state.py +569 -257
- sky/serve/serve_utils.py +775 -265
- sky/serve/server/core.py +66 -711
- sky/serve/server/impl.py +1093 -0
- sky/serve/server/server.py +21 -18
- sky/serve/service.py +192 -89
- sky/serve/service_spec.py +144 -20
- sky/serve/spot_placer.py +3 -0
- sky/server/auth/__init__.py +0 -0
- sky/server/auth/authn.py +50 -0
- sky/server/auth/loopback.py +38 -0
- sky/server/auth/oauth2_proxy.py +202 -0
- sky/server/common.py +478 -182
- sky/server/config.py +85 -23
- sky/server/constants.py +44 -6
- sky/server/daemons.py +295 -0
- sky/server/html/token_page.html +185 -0
- sky/server/metrics.py +160 -0
- sky/server/middleware_utils.py +166 -0
- sky/server/requests/executor.py +558 -138
- sky/server/requests/payloads.py +364 -24
- sky/server/requests/preconditions.py +21 -17
- sky/server/requests/process.py +112 -29
- sky/server/requests/request_names.py +121 -0
- sky/server/requests/requests.py +822 -226
- sky/server/requests/serializers/decoders.py +82 -31
- sky/server/requests/serializers/encoders.py +140 -22
- sky/server/requests/threads.py +117 -0
- sky/server/rest.py +455 -0
- sky/server/server.py +1309 -285
- sky/server/state.py +20 -0
- sky/server/stream_utils.py +327 -61
- sky/server/uvicorn.py +217 -3
- sky/server/versions.py +270 -0
- sky/setup_files/MANIFEST.in +11 -1
- sky/setup_files/alembic.ini +160 -0
- sky/setup_files/dependencies.py +139 -31
- sky/setup_files/setup.py +44 -42
- sky/sky_logging.py +114 -7
- sky/skylet/attempt_skylet.py +106 -24
- sky/skylet/autostop_lib.py +129 -8
- sky/skylet/configs.py +29 -20
- sky/skylet/constants.py +216 -25
- sky/skylet/events.py +101 -21
- sky/skylet/job_lib.py +345 -164
- sky/skylet/log_lib.py +297 -18
- sky/skylet/log_lib.pyi +44 -1
- sky/skylet/providers/ibm/node_provider.py +12 -8
- sky/skylet/providers/ibm/vpc_provider.py +13 -12
- sky/skylet/ray_patches/__init__.py +17 -3
- sky/skylet/ray_patches/autoscaler.py.diff +18 -0
- sky/skylet/ray_patches/cli.py.diff +19 -0
- sky/skylet/ray_patches/command_runner.py.diff +17 -0
- sky/skylet/ray_patches/log_monitor.py.diff +20 -0
- sky/skylet/ray_patches/resource_demand_scheduler.py.diff +32 -0
- sky/skylet/ray_patches/updater.py.diff +18 -0
- sky/skylet/ray_patches/worker.py.diff +41 -0
- sky/skylet/runtime_utils.py +21 -0
- sky/skylet/services.py +568 -0
- sky/skylet/skylet.py +72 -4
- sky/skylet/subprocess_daemon.py +104 -29
- sky/skypilot_config.py +506 -99
- sky/ssh_node_pools/__init__.py +1 -0
- sky/ssh_node_pools/core.py +135 -0
- sky/ssh_node_pools/server.py +233 -0
- sky/task.py +685 -163
- sky/templates/aws-ray.yml.j2 +11 -3
- sky/templates/azure-ray.yml.j2 +2 -1
- sky/templates/cudo-ray.yml.j2 +1 -0
- sky/templates/do-ray.yml.j2 +2 -1
- sky/templates/fluidstack-ray.yml.j2 +1 -0
- sky/templates/gcp-ray.yml.j2 +62 -1
- sky/templates/hyperbolic-ray.yml.j2 +68 -0
- sky/templates/ibm-ray.yml.j2 +2 -1
- sky/templates/jobs-controller.yaml.j2 +27 -24
- sky/templates/kubernetes-loadbalancer.yml.j2 +2 -0
- sky/templates/kubernetes-ray.yml.j2 +611 -50
- sky/templates/lambda-ray.yml.j2 +2 -1
- sky/templates/nebius-ray.yml.j2 +34 -12
- sky/templates/oci-ray.yml.j2 +1 -0
- sky/templates/paperspace-ray.yml.j2 +2 -1
- sky/templates/primeintellect-ray.yml.j2 +72 -0
- sky/templates/runpod-ray.yml.j2 +10 -1
- sky/templates/scp-ray.yml.j2 +4 -50
- sky/templates/seeweb-ray.yml.j2 +171 -0
- sky/templates/shadeform-ray.yml.j2 +73 -0
- sky/templates/sky-serve-controller.yaml.j2 +22 -2
- sky/templates/vast-ray.yml.j2 +1 -0
- sky/templates/vsphere-ray.yml.j2 +1 -0
- sky/templates/websocket_proxy.py +212 -37
- sky/usage/usage_lib.py +31 -15
- sky/users/__init__.py +0 -0
- sky/users/model.conf +15 -0
- sky/users/permission.py +397 -0
- sky/users/rbac.py +121 -0
- sky/users/server.py +720 -0
- sky/users/token_service.py +218 -0
- sky/utils/accelerator_registry.py +35 -5
- sky/utils/admin_policy_utils.py +84 -38
- sky/utils/annotations.py +38 -5
- sky/utils/asyncio_utils.py +78 -0
- sky/utils/atomic.py +1 -1
- sky/utils/auth_utils.py +153 -0
- sky/utils/benchmark_utils.py +60 -0
- sky/utils/cli_utils/status_utils.py +159 -86
- sky/utils/cluster_utils.py +31 -9
- sky/utils/command_runner.py +354 -68
- sky/utils/command_runner.pyi +93 -3
- sky/utils/common.py +35 -8
- sky/utils/common_utils.py +314 -91
- sky/utils/config_utils.py +74 -5
- sky/utils/context.py +403 -0
- sky/utils/context_utils.py +242 -0
- sky/utils/controller_utils.py +383 -89
- sky/utils/dag_utils.py +31 -12
- sky/utils/db/__init__.py +0 -0
- sky/utils/db/db_utils.py +485 -0
- sky/utils/db/kv_cache.py +149 -0
- sky/utils/db/migration_utils.py +137 -0
- sky/utils/directory_utils.py +12 -0
- sky/utils/env_options.py +13 -0
- sky/utils/git.py +567 -0
- sky/utils/git_clone.sh +460 -0
- sky/utils/infra_utils.py +195 -0
- sky/utils/kubernetes/cleanup-tunnel.sh +62 -0
- sky/utils/kubernetes/config_map_utils.py +133 -0
- sky/utils/kubernetes/create_cluster.sh +15 -29
- sky/utils/kubernetes/delete_cluster.sh +10 -7
- sky/utils/kubernetes/deploy_ssh_node_pools.py +1177 -0
- sky/utils/kubernetes/exec_kubeconfig_converter.py +22 -31
- sky/utils/kubernetes/generate_kind_config.py +6 -66
- sky/utils/kubernetes/generate_kubeconfig.sh +4 -1
- sky/utils/kubernetes/gpu_labeler.py +18 -8
- sky/utils/kubernetes/k8s_gpu_labeler_job.yaml +2 -1
- sky/utils/kubernetes/k8s_gpu_labeler_setup.yaml +16 -16
- sky/utils/kubernetes/kubernetes_deploy_utils.py +284 -114
- sky/utils/kubernetes/rsync_helper.sh +11 -3
- sky/utils/kubernetes/ssh-tunnel.sh +379 -0
- sky/utils/kubernetes/ssh_utils.py +221 -0
- sky/utils/kubernetes_enums.py +8 -15
- sky/utils/lock_events.py +94 -0
- sky/utils/locks.py +416 -0
- sky/utils/log_utils.py +82 -107
- sky/utils/perf_utils.py +22 -0
- sky/utils/resource_checker.py +298 -0
- sky/utils/resources_utils.py +249 -32
- sky/utils/rich_utils.py +217 -39
- sky/utils/schemas.py +955 -160
- sky/utils/serialize_utils.py +16 -0
- sky/utils/status_lib.py +10 -0
- sky/utils/subprocess_utils.py +29 -15
- sky/utils/tempstore.py +70 -0
- sky/utils/thread_utils.py +91 -0
- sky/utils/timeline.py +26 -53
- sky/utils/ux_utils.py +84 -15
- sky/utils/validator.py +11 -1
- sky/utils/volume.py +165 -0
- sky/utils/yaml_utils.py +111 -0
- sky/volumes/__init__.py +13 -0
- sky/volumes/client/__init__.py +0 -0
- sky/volumes/client/sdk.py +150 -0
- sky/volumes/server/__init__.py +0 -0
- sky/volumes/server/core.py +270 -0
- sky/volumes/server/server.py +124 -0
- sky/volumes/volume.py +215 -0
- sky/workspaces/__init__.py +0 -0
- sky/workspaces/core.py +655 -0
- sky/workspaces/server.py +101 -0
- sky/workspaces/utils.py +56 -0
- sky_templates/README.md +3 -0
- sky_templates/__init__.py +3 -0
- sky_templates/ray/__init__.py +0 -0
- sky_templates/ray/start_cluster +183 -0
- sky_templates/ray/stop_cluster +75 -0
- skypilot_nightly-1.0.0.dev20251203.dist-info/METADATA +676 -0
- skypilot_nightly-1.0.0.dev20251203.dist-info/RECORD +611 -0
- {skypilot_nightly-1.0.0.dev20250502.dist-info → skypilot_nightly-1.0.0.dev20251203.dist-info}/WHEEL +1 -1
- skypilot_nightly-1.0.0.dev20251203.dist-info/top_level.txt +2 -0
- sky/benchmark/benchmark_state.py +0 -256
- sky/benchmark/benchmark_utils.py +0 -641
- sky/clouds/service_catalog/constants.py +0 -7
- sky/dashboard/out/_next/static/GWvVBSCS7FmUiVmjaL1a7/_buildManifest.js +0 -1
- sky/dashboard/out/_next/static/chunks/236-2db3ee3fba33dd9e.js +0 -6
- sky/dashboard/out/_next/static/chunks/312-c3c8845990db8ffc.js +0 -15
- sky/dashboard/out/_next/static/chunks/37-0a572fe0dbb89c4d.js +0 -6
- sky/dashboard/out/_next/static/chunks/678-206dddca808e6d16.js +0 -59
- sky/dashboard/out/_next/static/chunks/845-9e60713e0c441abc.js +0 -1
- sky/dashboard/out/_next/static/chunks/979-7bf73a4c7cea0f5c.js +0 -1
- sky/dashboard/out/_next/static/chunks/fd9d1056-2821b0f0cabcd8bd.js +0 -1
- sky/dashboard/out/_next/static/chunks/framework-87d061ee6ed71b28.js +0 -33
- sky/dashboard/out/_next/static/chunks/main-app-241eb28595532291.js +0 -1
- sky/dashboard/out/_next/static/chunks/main-e0e2335212e72357.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/_app-e6b013bc3f77ad60.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/_error-1be831200e60c5c0.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-6ac338bc2239cb45.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-f383db7389368ea7.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/clusters-a93b93e10b8b074e.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/index-f9f039532ca8cbc4.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-1c519e1afc523dc9.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/jobs-a75029b67aab6a2e.js +0 -1
- sky/dashboard/out/_next/static/chunks/webpack-830f59b8404e96b8.js +0 -1
- sky/dashboard/out/_next/static/css/c6933bbb2ce7f4dd.css +0 -3
- sky/jobs/dashboard/dashboard.py +0 -223
- sky/jobs/dashboard/static/favicon.ico +0 -0
- sky/jobs/dashboard/templates/index.html +0 -831
- sky/jobs/server/dashboard_utils.py +0 -69
- sky/skylet/providers/scp/__init__.py +0 -2
- sky/skylet/providers/scp/config.py +0 -149
- sky/skylet/providers/scp/node_provider.py +0 -578
- sky/templates/kubernetes-ssh-jump.yml.j2 +0 -94
- sky/utils/db_utils.py +0 -100
- sky/utils/kubernetes/deploy_remote_cluster.sh +0 -308
- sky/utils/kubernetes/ssh_jump_lifecycle_manager.py +0 -191
- skypilot_nightly-1.0.0.dev20250502.dist-info/METADATA +0 -361
- skypilot_nightly-1.0.0.dev20250502.dist-info/RECORD +0 -396
- skypilot_nightly-1.0.0.dev20250502.dist-info/top_level.txt +0 -1
- /sky/{clouds/service_catalog → catalog}/config.py +0 -0
- /sky/{benchmark → catalog/data_fetchers}/__init__.py +0 -0
- /sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_azure.py +0 -0
- /sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_fluidstack.py +0 -0
- /sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_ibm.py +0 -0
- /sky/{clouds/service_catalog/data_fetchers → client/cli}/__init__.py +0 -0
- /sky/dashboard/out/_next/static/{GWvVBSCS7FmUiVmjaL1a7 → 96_E2yl3QAiIJGOYCkSpB}/_ssgManifest.js +0 -0
- {skypilot_nightly-1.0.0.dev20250502.dist-info → skypilot_nightly-1.0.0.dev20251203.dist-info}/entry_points.txt +0 -0
- {skypilot_nightly-1.0.0.dev20250502.dist-info → skypilot_nightly-1.0.0.dev20251203.dist-info}/licenses/LICENSE +0 -0
sky/server/state.py
ADDED
|
@@ -0,0 +1,20 @@
|
|
|
1
|
+
"""State for API server process."""
|
|
2
|
+
|
|
3
|
+
# This state is used to block requests except /api operations, which is useful
|
|
4
|
+
# when a server is shutting down: new requests will be blocked, but existing
|
|
5
|
+
# requests will be allowed to finish and be operated via /api operations, e.g.
|
|
6
|
+
# /api/logs, /api/cancel, etc.
|
|
7
|
+
_block_requests = False
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
# TODO(aylei): refactor, state should be an instance property of API server app
|
|
11
|
+
# instead of a global variable.
|
|
12
|
+
def get_block_requests() -> bool:
|
|
13
|
+
"""Whether block requests except /api operations."""
|
|
14
|
+
return _block_requests
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
def set_block_requests(shutting_down: bool) -> None:
|
|
18
|
+
"""Set the API server to block requests except /api operations."""
|
|
19
|
+
global _block_requests
|
|
20
|
+
_block_requests = shutting_down
|
sky/server/stream_utils.py
CHANGED
|
@@ -3,18 +3,37 @@
|
|
|
3
3
|
import asyncio
|
|
4
4
|
import collections
|
|
5
5
|
import pathlib
|
|
6
|
-
from typing import AsyncGenerator, Deque, Optional
|
|
6
|
+
from typing import AsyncGenerator, Deque, List, Optional
|
|
7
7
|
|
|
8
8
|
import aiofiles
|
|
9
9
|
import fastapi
|
|
10
10
|
|
|
11
|
+
from sky import global_user_state
|
|
11
12
|
from sky import sky_logging
|
|
12
13
|
from sky.server.requests import requests as requests_lib
|
|
14
|
+
from sky.utils import common_utils
|
|
13
15
|
from sky.utils import message_utils
|
|
14
16
|
from sky.utils import rich_utils
|
|
17
|
+
from sky.utils import status_lib
|
|
15
18
|
|
|
16
19
|
logger = sky_logging.init_logger(__name__)
|
|
17
20
|
|
|
21
|
+
# When streaming log lines, buffer the lines in memory and flush them in chunks
|
|
22
|
+
# to improve log tailing throughput. Buffer size is the max size in bytes of each
|
|
23
|
+
# chunk; the buffer timeout is the threshold for flushing the buffer to ensure
|
|
24
|
+
# responsiveness.
|
|
25
|
+
_BUFFER_SIZE = 8 * 1024 # 8KB
|
|
26
|
+
_BUFFER_TIMEOUT = 0.02 # 20ms
|
|
27
|
+
_HEARTBEAT_INTERVAL = 30
|
|
28
|
+
_READ_CHUNK_SIZE = 256 * 1024 # 256KB chunks for file reading
|
|
29
|
+
|
|
30
|
+
# If a SHORT request has been stuck in pending for
|
|
31
|
+
# _SHORT_REQUEST_SPINNER_TIMEOUT seconds, we show the waiting spinner
|
|
32
|
+
_SHORT_REQUEST_SPINNER_TIMEOUT = 2
|
|
33
|
+
|
|
34
|
+
LONG_REQUEST_POLL_INTERVAL = 1
|
|
35
|
+
DEFAULT_POLL_INTERVAL = 0.1
|
|
36
|
+
|
|
18
37
|
|
|
19
38
|
async def _yield_log_file_with_payloads_skipped(
|
|
20
39
|
log_file) -> AsyncGenerator[str, None]:
|
|
@@ -29,25 +48,51 @@ async def _yield_log_file_with_payloads_skipped(
|
|
|
29
48
|
yield line_str
|
|
30
49
|
|
|
31
50
|
|
|
32
|
-
async def log_streamer(
|
|
33
|
-
|
|
34
|
-
|
|
35
|
-
|
|
36
|
-
|
|
37
|
-
|
|
51
|
+
async def log_streamer(
|
|
52
|
+
request_id: Optional[str],
|
|
53
|
+
log_path: Optional[pathlib.Path] = None,
|
|
54
|
+
plain_logs: bool = False,
|
|
55
|
+
tail: Optional[int] = None,
|
|
56
|
+
follow: bool = True,
|
|
57
|
+
cluster_name: Optional[str] = None,
|
|
58
|
+
polling_interval: float = DEFAULT_POLL_INTERVAL
|
|
59
|
+
) -> AsyncGenerator[str, None]:
|
|
60
|
+
"""Streams the logs of a request.
|
|
61
|
+
|
|
62
|
+
Args:
|
|
63
|
+
request_id: The request ID to check whether the log tailing process
|
|
64
|
+
should be stopped.
|
|
65
|
+
log_path: The path to the log file or directory containing the log
|
|
66
|
+
files. If it is a directory, all *.log files in the directory will be
|
|
67
|
+
streamed.
|
|
68
|
+
plain_logs: Whether to show plain logs.
|
|
69
|
+
tail: The number of lines to tail. If None, tail the whole file.
|
|
70
|
+
follow: Whether to follow the log file.
|
|
71
|
+
cluster_name: The cluster name to check status for provision logs.
|
|
72
|
+
If provided and cluster status is UP, streaming will terminate.
|
|
73
|
+
"""
|
|
38
74
|
|
|
39
75
|
if request_id is not None:
|
|
76
|
+
start_time = asyncio.get_event_loop().time()
|
|
40
77
|
status_msg = rich_utils.EncodedStatusMessage(
|
|
41
78
|
f'[dim]Checking request: {request_id}[/dim]')
|
|
42
|
-
request_task = requests_lib.
|
|
79
|
+
request_task = await requests_lib.get_request_async(request_id,
|
|
80
|
+
fields=[
|
|
81
|
+
'request_id',
|
|
82
|
+
'name',
|
|
83
|
+
'schedule_type',
|
|
84
|
+
'status',
|
|
85
|
+
'status_msg'
|
|
86
|
+
])
|
|
43
87
|
|
|
44
88
|
if request_task is None:
|
|
45
89
|
raise fastapi.HTTPException(
|
|
46
90
|
status_code=404, detail=f'Request {request_id} not found')
|
|
47
91
|
request_id = request_task.request_id
|
|
48
92
|
|
|
49
|
-
#
|
|
50
|
-
# request
|
|
93
|
+
# By default, do not show the waiting spinner for SHORT requests.
|
|
94
|
+
# If the request has been stuck in pending for
|
|
95
|
+
# _SHORT_REQUEST_SPINNER_TIMEOUT seconds, we show the waiting spinner
|
|
51
96
|
show_request_waiting_spinner = (not plain_logs and
|
|
52
97
|
request_task.schedule_type
|
|
53
98
|
== requests_lib.ScheduleType.LONG)
|
|
@@ -58,9 +103,25 @@ async def log_streamer(request_id: Optional[str],
|
|
|
58
103
|
last_waiting_msg = ''
|
|
59
104
|
waiting_msg = (f'Waiting for {request_task.name!r} request to be '
|
|
60
105
|
f'scheduled: {request_id}')
|
|
61
|
-
|
|
62
|
-
|
|
63
|
-
|
|
106
|
+
req_status = request_task.status
|
|
107
|
+
req_msg = request_task.status_msg
|
|
108
|
+
del request_task
|
|
109
|
+
# Slowly back off the database polling up to every 1 second, to avoid
|
|
110
|
+
# overloading the CPU and DB.
|
|
111
|
+
backoff = common_utils.Backoff(initial_backoff=polling_interval,
|
|
112
|
+
max_backoff_factor=10,
|
|
113
|
+
multiplier=1.2)
|
|
114
|
+
while req_status < requests_lib.RequestStatus.RUNNING:
|
|
115
|
+
current_time = asyncio.get_event_loop().time()
|
|
116
|
+
# Show the waiting spinner for a SHORT request if it has been stuck
|
|
117
|
+
# in pending for _SHORT_REQUEST_SPINNER_TIMEOUT seconds
|
|
118
|
+
if not show_request_waiting_spinner and (
|
|
119
|
+
current_time - start_time > _SHORT_REQUEST_SPINNER_TIMEOUT):
|
|
120
|
+
show_request_waiting_spinner = True
|
|
121
|
+
yield status_msg.init()
|
|
122
|
+
yield status_msg.start()
|
|
123
|
+
if req_msg is not None:
|
|
124
|
+
waiting_msg = req_msg
|
|
64
125
|
if show_request_waiting_spinner:
|
|
65
126
|
yield status_msg.update(f'[dim]{waiting_msg}[/dim]')
|
|
66
127
|
elif plain_logs and waiting_msg != last_waiting_msg:
|
|
@@ -69,73 +130,278 @@ async def log_streamer(request_id: Optional[str],
|
|
|
69
130
|
# Use smaller padding (1024 bytes) to force browser rendering
|
|
70
131
|
yield f'{waiting_msg}' + ' ' * 4096 + '\n'
|
|
71
132
|
# Sleep shortly to avoid storming the DB and CPU and allow other
|
|
72
|
-
# coroutines to run.
|
|
73
|
-
#
|
|
74
|
-
|
|
75
|
-
|
|
133
|
+
# coroutines to run.
|
|
134
|
+
# TODO(aylei): we should use a better mechanism to avoid busy
|
|
135
|
+
# polling the DB, which can be a bottleneck for high-concurrency
|
|
136
|
+
# requests.
|
|
137
|
+
await asyncio.sleep(backoff.current_backoff())
|
|
138
|
+
status_with_msg = await requests_lib.get_request_status_async(
|
|
139
|
+
request_id, include_msg=True)
|
|
140
|
+
req_status = status_with_msg.status
|
|
141
|
+
req_msg = status_with_msg.status_msg
|
|
76
142
|
if not follow:
|
|
77
143
|
break
|
|
78
144
|
if show_request_waiting_spinner:
|
|
79
145
|
yield status_msg.stop()
|
|
80
146
|
|
|
81
|
-
#
|
|
82
|
-
|
|
83
|
-
|
|
84
|
-
|
|
85
|
-
|
|
86
|
-
|
|
87
|
-
|
|
88
|
-
|
|
89
|
-
|
|
90
|
-
|
|
91
|
-
|
|
92
|
-
|
|
93
|
-
|
|
94
|
-
|
|
95
|
-
|
|
96
|
-
|
|
97
|
-
|
|
98
|
-
|
|
99
|
-
|
|
100
|
-
|
|
101
|
-
|
|
102
|
-
|
|
103
|
-
|
|
104
|
-
|
|
105
|
-
|
|
106
|
-
|
|
107
|
-
|
|
147
|
+
# worker node provision logs
|
|
148
|
+
if log_path is not None and log_path.is_dir():
|
|
149
|
+
# Get all *.log files in the log_path dir
|
|
150
|
+
log_files = sorted(log_path.glob('*.log'))
|
|
151
|
+
|
|
152
|
+
for log_file_path in log_files:
|
|
153
|
+
# Add header before each file (similar to tail -f behavior)
|
|
154
|
+
header = f'\n==> {log_file_path} <==\n\n'
|
|
155
|
+
yield header
|
|
156
|
+
|
|
157
|
+
async with aiofiles.open(log_file_path, 'rb') as f:
|
|
158
|
+
async for chunk in _tail_log_file(f, request_id, plain_logs,
|
|
159
|
+
tail, follow, cluster_name,
|
|
160
|
+
polling_interval):
|
|
161
|
+
yield chunk
|
|
162
|
+
|
|
163
|
+
# api server request logs (if request_id is provided) or
|
|
164
|
+
# head node provision logs (if cluster_name is provided)
|
|
165
|
+
else:
|
|
166
|
+
assert log_path is not None, (request_id, cluster_name)
|
|
167
|
+
async with aiofiles.open(log_path, 'rb') as f:
|
|
168
|
+
async for chunk in _tail_log_file(f, request_id, plain_logs, tail,
|
|
169
|
+
follow, cluster_name,
|
|
170
|
+
polling_interval):
|
|
171
|
+
yield chunk
|
|
172
|
+
|
|
173
|
+
|
|
174
|
+
async def _tail_log_file(
|
|
175
|
+
f: aiofiles.threadpool.binary.AsyncBufferedReader,
|
|
176
|
+
request_id: Optional[str] = None,
|
|
177
|
+
plain_logs: bool = False,
|
|
178
|
+
tail: Optional[int] = None,
|
|
179
|
+
follow: bool = True,
|
|
180
|
+
cluster_name: Optional[str] = None,
|
|
181
|
+
polling_interval: float = DEFAULT_POLL_INTERVAL
|
|
182
|
+
) -> AsyncGenerator[str, None]:
|
|
183
|
+
"""Tail the opened log file, buffer the lines and flush in chunks."""
|
|
184
|
+
|
|
185
|
+
if tail is not None:
|
|
186
|
+
# Find last n lines of the log file. Do not read the whole file into
|
|
187
|
+
# memory.
|
|
188
|
+
# TODO(zhwu): this will include the control lines for rich status,
|
|
189
|
+
# which may not lead to exact tail lines when showing on the client
|
|
190
|
+
# side.
|
|
191
|
+
lines: Deque[str] = collections.deque(maxlen=tail)
|
|
192
|
+
async for line_str in _yield_log_file_with_payloads_skipped(f):
|
|
193
|
+
lines.append(line_str)
|
|
194
|
+
for line_str in lines:
|
|
195
|
+
yield line_str
|
|
196
|
+
|
|
197
|
+
last_heartbeat_time = asyncio.get_event_loop().time()
|
|
198
|
+
last_status_check_time = asyncio.get_event_loop().time()
|
|
199
|
+
|
|
200
|
+
# Buffer the lines in memory and flush them in chunks to improve log
|
|
201
|
+
# tailing throughput.
|
|
202
|
+
buffer: List[str] = []
|
|
203
|
+
buffer_bytes = 0
|
|
204
|
+
last_flush_time = asyncio.get_event_loop().time()
|
|
205
|
+
|
|
206
|
+
# Read file in chunks instead of line-by-line for better performance
|
|
207
|
+
incomplete_line = b'' # Buffer for incomplete lines across chunks
|
|
208
|
+
|
|
209
|
+
async def flush_buffer() -> AsyncGenerator[str, None]:
|
|
210
|
+
nonlocal buffer, buffer_bytes, last_flush_time
|
|
211
|
+
if buffer:
|
|
212
|
+
yield ''.join(buffer)
|
|
213
|
+
buffer.clear()
|
|
214
|
+
buffer_bytes = 0
|
|
215
|
+
last_flush_time = asyncio.get_event_loop().time()
|
|
216
|
+
|
|
217
|
+
while True:
|
|
218
|
+
# Sleep 0 to yield control to allow other coroutines to run,
|
|
219
|
+
# while keeps the loop tight to make log stream responsive.
|
|
220
|
+
await asyncio.sleep(0)
|
|
221
|
+
current_time = asyncio.get_event_loop().time()
|
|
222
|
+
# Flush the buffer when it is not empty and the buffer is full or the
|
|
223
|
+
# flush timeout is reached.
|
|
224
|
+
if buffer and (buffer_bytes >= _BUFFER_SIZE or
|
|
225
|
+
(current_time - last_flush_time) >= _BUFFER_TIMEOUT):
|
|
226
|
+
async for chunk in flush_buffer():
|
|
227
|
+
yield chunk
|
|
228
|
+
|
|
229
|
+
# Read file in chunks for better I/O performance
|
|
230
|
+
file_chunk: bytes = await f.read(_READ_CHUNK_SIZE)
|
|
231
|
+
if not file_chunk:
|
|
232
|
+
# Process any remaining incomplete line
|
|
233
|
+
if incomplete_line:
|
|
234
|
+
line_str = incomplete_line.decode('utf-8')
|
|
235
|
+
if plain_logs:
|
|
236
|
+
is_payload, line_str = message_utils.decode_payload(
|
|
237
|
+
line_str, raise_for_mismatch=False)
|
|
238
|
+
if not is_payload:
|
|
239
|
+
buffer.append(line_str)
|
|
240
|
+
buffer_bytes += len(line_str.encode('utf-8'))
|
|
241
|
+
else:
|
|
242
|
+
buffer.append(line_str)
|
|
243
|
+
buffer_bytes += len(line_str.encode('utf-8'))
|
|
244
|
+
incomplete_line = b''
|
|
245
|
+
|
|
246
|
+
# Avoid checking the status too frequently to avoid overloading the
|
|
247
|
+
# DB.
|
|
248
|
+
should_check_status = (current_time -
|
|
249
|
+
last_status_check_time) >= polling_interval
|
|
250
|
+
if not follow:
|
|
251
|
+
# We will only hit this path once, but we should make sure to
|
|
252
|
+
# check the status so that we display the final request status
|
|
253
|
+
# if the request is complete.
|
|
254
|
+
should_check_status = True
|
|
255
|
+
if request_id is not None and should_check_status:
|
|
256
|
+
last_status_check_time = current_time
|
|
257
|
+
req_status = await requests_lib.get_request_status_async(
|
|
258
|
+
request_id)
|
|
259
|
+
if req_status.status > requests_lib.RequestStatus.RUNNING:
|
|
260
|
+
if (req_status.status ==
|
|
261
|
+
requests_lib.RequestStatus.CANCELLED):
|
|
262
|
+
request_task = await requests_lib.get_request_async(
|
|
263
|
+
request_id, fields=['name', 'should_retry'])
|
|
264
|
+
if request_task.should_retry:
|
|
265
|
+
buffer.append(
|
|
266
|
+
message_utils.encode_payload(
|
|
267
|
+
rich_utils.Control.RETRY.encode('')))
|
|
268
|
+
else:
|
|
269
|
+
buffer.append(
|
|
270
|
+
f'{request_task.name!r} request {request_id}'
|
|
271
|
+
' cancelled\n')
|
|
272
|
+
del request_task
|
|
108
273
|
break
|
|
109
|
-
|
|
110
|
-
#
|
|
111
|
-
# for
|
|
112
|
-
|
|
113
|
-
|
|
114
|
-
|
|
274
|
+
if not follow:
|
|
275
|
+
# The below checks (cluster status, heartbeat) are not needed
|
|
276
|
+
# for non-follow logs.
|
|
277
|
+
break
|
|
278
|
+
# Provision logs pass in cluster_name, check cluster status
|
|
279
|
+
# periodically to see if provisioning is done.
|
|
280
|
+
if cluster_name is not None:
|
|
281
|
+
if should_check_status:
|
|
282
|
+
last_status_check_time = current_time
|
|
283
|
+
cluster_status = await (
|
|
284
|
+
global_user_state.get_status_from_cluster_name_async(
|
|
285
|
+
cluster_name))
|
|
286
|
+
if cluster_status is None:
|
|
287
|
+
logger.debug(
|
|
288
|
+
'Stop tailing provision logs for cluster'
|
|
289
|
+
f' status for cluster {cluster_name} not found')
|
|
290
|
+
break
|
|
291
|
+
# if the cluster is not in INIT state (UP or STOPPED),
|
|
292
|
+
# stop tailing provision logs
|
|
293
|
+
if cluster_status != status_lib.ClusterStatus.INIT:
|
|
294
|
+
logger.debug(
|
|
295
|
+
f'Stop tailing provision logs for cluster'
|
|
296
|
+
f' {cluster_name} has status {cluster_status} '
|
|
297
|
+
'(not in INIT state)')
|
|
298
|
+
break
|
|
299
|
+
req_filter = requests_lib.RequestTaskFilter(
|
|
300
|
+
status=[requests_lib.RequestStatus.RUNNING],
|
|
301
|
+
cluster_names=[cluster_name],
|
|
302
|
+
include_request_names=['sky.launch'],
|
|
303
|
+
fields=['cluster_name'])
|
|
304
|
+
req_tasks = await requests_lib.get_request_tasks_async(
|
|
305
|
+
req_filter)
|
|
306
|
+
# if the cluster is in INIT state and there is no ongoing
|
|
307
|
+
# launch request, stop tailing provision logs
|
|
308
|
+
if len(req_tasks) == 0:
|
|
309
|
+
break
|
|
310
|
+
if current_time - last_heartbeat_time >= _HEARTBEAT_INTERVAL:
|
|
311
|
+
# Currently just used to keep the connection busy, refer to
|
|
312
|
+
# https://github.com/skypilot-org/skypilot/issues/5750 for
|
|
313
|
+
# more details.
|
|
314
|
+
buffer.append(
|
|
315
|
+
message_utils.encode_payload(
|
|
316
|
+
rich_utils.Control.HEARTBEAT.encode('')))
|
|
317
|
+
last_heartbeat_time = current_time
|
|
318
|
+
|
|
319
|
+
# Sleep shortly to avoid storming the DB and CPU, this has
|
|
320
|
+
# little impact on the responsivness here since we are waiting
|
|
321
|
+
# for a new line to come in.
|
|
322
|
+
await asyncio.sleep(0.1)
|
|
323
|
+
continue
|
|
324
|
+
|
|
325
|
+
# Refresh the heartbeat time, this is a trivial optimization for
|
|
326
|
+
# performance but it helps avoid unnecessary heartbeat strings
|
|
327
|
+
# being printed when the client runs in an old version.
|
|
328
|
+
last_heartbeat_time = asyncio.get_event_loop().time()
|
|
329
|
+
|
|
330
|
+
# Combine with any incomplete line from previous chunk
|
|
331
|
+
file_chunk = incomplete_line + file_chunk
|
|
332
|
+
incomplete_line = b''
|
|
333
|
+
|
|
334
|
+
# Split chunk into lines, preserving line structure
|
|
335
|
+
lines_bytes = file_chunk.split(b'\n')
|
|
336
|
+
|
|
337
|
+
# If chunk doesn't end with newline, the last element is incomplete
|
|
338
|
+
if file_chunk and not file_chunk.endswith(b'\n'):
|
|
339
|
+
incomplete_line = lines_bytes[-1]
|
|
340
|
+
lines_bytes = lines_bytes[:-1]
|
|
341
|
+
else:
|
|
342
|
+
# If ends with \n, split creates an empty last element we should
|
|
343
|
+
# ignore
|
|
344
|
+
if lines_bytes and lines_bytes[-1] == b'':
|
|
345
|
+
lines_bytes = lines_bytes[:-1]
|
|
346
|
+
|
|
347
|
+
# Process all complete lines in this chunk
|
|
348
|
+
for line_bytes in lines_bytes:
|
|
349
|
+
# Reconstruct line with newline (since split removed it)
|
|
350
|
+
line_str = line_bytes.decode('utf-8') + '\n'
|
|
351
|
+
|
|
115
352
|
if plain_logs:
|
|
116
353
|
is_payload, line_str = message_utils.decode_payload(
|
|
117
354
|
line_str, raise_for_mismatch=False)
|
|
355
|
+
# TODO(aylei): implement heartbeat mechanism for plain logs,
|
|
356
|
+
# sending invisible characters might be okay.
|
|
118
357
|
if is_payload:
|
|
119
358
|
continue
|
|
120
|
-
|
|
359
|
+
|
|
360
|
+
buffer.append(line_str)
|
|
361
|
+
buffer_bytes += len(line_str.encode('utf-8'))
|
|
362
|
+
|
|
363
|
+
# Flush remaining lines in the buffer.
|
|
364
|
+
async for chunk in flush_buffer():
|
|
365
|
+
yield chunk
|
|
366
|
+
|
|
367
|
+
|
|
368
|
+
def stream_response_for_long_request(
|
|
369
|
+
request_id: str,
|
|
370
|
+
logs_path: pathlib.Path,
|
|
371
|
+
background_tasks: fastapi.BackgroundTasks,
|
|
372
|
+
kill_request_on_disconnect: bool = True,
|
|
373
|
+
) -> fastapi.responses.StreamingResponse:
|
|
374
|
+
"""Stream the logs of a long request."""
|
|
375
|
+
return stream_response(
|
|
376
|
+
request_id,
|
|
377
|
+
logs_path,
|
|
378
|
+
background_tasks,
|
|
379
|
+
polling_interval=LONG_REQUEST_POLL_INTERVAL,
|
|
380
|
+
kill_request_on_disconnect=kill_request_on_disconnect,
|
|
381
|
+
)
|
|
121
382
|
|
|
122
383
|
|
|
123
384
|
def stream_response(
|
|
124
|
-
request_id: str,
|
|
125
|
-
|
|
385
|
+
request_id: str,
|
|
386
|
+
logs_path: pathlib.Path,
|
|
387
|
+
background_tasks: fastapi.BackgroundTasks,
|
|
388
|
+
polling_interval: float = DEFAULT_POLL_INTERVAL,
|
|
389
|
+
kill_request_on_disconnect: bool = True,
|
|
126
390
|
) -> fastapi.responses.StreamingResponse:
|
|
127
391
|
|
|
128
|
-
|
|
129
|
-
|
|
130
|
-
|
|
131
|
-
|
|
392
|
+
if kill_request_on_disconnect:
|
|
393
|
+
|
|
394
|
+
async def on_disconnect():
|
|
395
|
+
logger.info(f'User terminated the connection for request '
|
|
396
|
+
f'{request_id}')
|
|
397
|
+
await requests_lib.kill_request_async(request_id)
|
|
132
398
|
|
|
133
|
-
|
|
134
|
-
|
|
135
|
-
|
|
399
|
+
# The background task will be run after returning a response.
|
|
400
|
+
# https://fastapi.tiangolo.com/tutorial/background-tasks/
|
|
401
|
+
background_tasks.add_task(on_disconnect)
|
|
136
402
|
|
|
137
403
|
return fastapi.responses.StreamingResponse(
|
|
138
|
-
log_streamer(request_id, logs_path),
|
|
404
|
+
log_streamer(request_id, logs_path, polling_interval=polling_interval),
|
|
139
405
|
media_type='text/plain',
|
|
140
406
|
headers={
|
|
141
407
|
'Cache-Control': 'no-cache, no-transform',
|