skypilot-nightly 1.0.0.dev20250502__py3-none-any.whl → 1.0.0.dev20251203__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sky/__init__.py +22 -6
- sky/adaptors/aws.py +81 -16
- sky/adaptors/common.py +25 -2
- sky/adaptors/coreweave.py +278 -0
- sky/adaptors/do.py +8 -2
- sky/adaptors/gcp.py +11 -0
- sky/adaptors/hyperbolic.py +8 -0
- sky/adaptors/ibm.py +5 -2
- sky/adaptors/kubernetes.py +149 -18
- sky/adaptors/nebius.py +173 -30
- sky/adaptors/primeintellect.py +1 -0
- sky/adaptors/runpod.py +68 -0
- sky/adaptors/seeweb.py +183 -0
- sky/adaptors/shadeform.py +89 -0
- sky/admin_policy.py +187 -4
- sky/authentication.py +179 -225
- sky/backends/__init__.py +4 -2
- sky/backends/backend.py +22 -9
- sky/backends/backend_utils.py +1323 -397
- sky/backends/cloud_vm_ray_backend.py +1749 -1029
- sky/backends/docker_utils.py +1 -1
- sky/backends/local_docker_backend.py +11 -6
- sky/backends/task_codegen.py +633 -0
- sky/backends/wheel_utils.py +55 -9
- sky/{clouds/service_catalog → catalog}/__init__.py +21 -19
- sky/{clouds/service_catalog → catalog}/aws_catalog.py +27 -8
- sky/{clouds/service_catalog → catalog}/azure_catalog.py +10 -7
- sky/{clouds/service_catalog → catalog}/common.py +90 -49
- sky/{clouds/service_catalog → catalog}/cudo_catalog.py +8 -5
- sky/{clouds/service_catalog → catalog}/data_fetchers/analyze.py +1 -1
- sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_aws.py +116 -80
- sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_cudo.py +38 -38
- sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_gcp.py +70 -16
- sky/catalog/data_fetchers/fetch_hyperbolic.py +136 -0
- sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_lambda_cloud.py +1 -0
- sky/catalog/data_fetchers/fetch_nebius.py +338 -0
- sky/catalog/data_fetchers/fetch_runpod.py +698 -0
- sky/catalog/data_fetchers/fetch_seeweb.py +329 -0
- sky/catalog/data_fetchers/fetch_shadeform.py +142 -0
- sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_vast.py +1 -1
- sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_vsphere.py +1 -1
- sky/{clouds/service_catalog → catalog}/do_catalog.py +5 -2
- sky/{clouds/service_catalog → catalog}/fluidstack_catalog.py +6 -3
- sky/{clouds/service_catalog → catalog}/gcp_catalog.py +41 -15
- sky/catalog/hyperbolic_catalog.py +136 -0
- sky/{clouds/service_catalog → catalog}/ibm_catalog.py +9 -6
- sky/{clouds/service_catalog → catalog}/kubernetes_catalog.py +36 -24
- sky/{clouds/service_catalog → catalog}/lambda_catalog.py +9 -6
- sky/{clouds/service_catalog → catalog}/nebius_catalog.py +9 -7
- sky/{clouds/service_catalog → catalog}/oci_catalog.py +9 -6
- sky/{clouds/service_catalog → catalog}/paperspace_catalog.py +5 -2
- sky/catalog/primeintellect_catalog.py +95 -0
- sky/{clouds/service_catalog → catalog}/runpod_catalog.py +11 -4
- sky/{clouds/service_catalog → catalog}/scp_catalog.py +9 -6
- sky/catalog/seeweb_catalog.py +184 -0
- sky/catalog/shadeform_catalog.py +165 -0
- sky/catalog/ssh_catalog.py +167 -0
- sky/{clouds/service_catalog → catalog}/vast_catalog.py +6 -3
- sky/{clouds/service_catalog → catalog}/vsphere_catalog.py +5 -2
- sky/check.py +533 -185
- sky/cli.py +5 -5975
- sky/client/{cli.py → cli/command.py} +2591 -1956
- sky/client/cli/deprecation_utils.py +99 -0
- sky/client/cli/flags.py +359 -0
- sky/client/cli/table_utils.py +322 -0
- sky/client/cli/utils.py +79 -0
- sky/client/common.py +78 -32
- sky/client/oauth.py +82 -0
- sky/client/sdk.py +1219 -319
- sky/client/sdk_async.py +827 -0
- sky/client/service_account_auth.py +47 -0
- sky/cloud_stores.py +82 -3
- sky/clouds/__init__.py +13 -0
- sky/clouds/aws.py +564 -164
- sky/clouds/azure.py +105 -83
- sky/clouds/cloud.py +140 -40
- sky/clouds/cudo.py +68 -50
- sky/clouds/do.py +66 -48
- sky/clouds/fluidstack.py +63 -44
- sky/clouds/gcp.py +339 -110
- sky/clouds/hyperbolic.py +293 -0
- sky/clouds/ibm.py +70 -49
- sky/clouds/kubernetes.py +570 -162
- sky/clouds/lambda_cloud.py +74 -54
- sky/clouds/nebius.py +210 -81
- sky/clouds/oci.py +88 -66
- sky/clouds/paperspace.py +61 -44
- sky/clouds/primeintellect.py +317 -0
- sky/clouds/runpod.py +164 -74
- sky/clouds/scp.py +89 -86
- sky/clouds/seeweb.py +477 -0
- sky/clouds/shadeform.py +400 -0
- sky/clouds/ssh.py +263 -0
- sky/clouds/utils/aws_utils.py +10 -4
- sky/clouds/utils/gcp_utils.py +87 -11
- sky/clouds/utils/oci_utils.py +38 -14
- sky/clouds/utils/scp_utils.py +231 -167
- sky/clouds/vast.py +99 -77
- sky/clouds/vsphere.py +51 -40
- sky/core.py +375 -173
- sky/dag.py +15 -0
- sky/dashboard/out/404.html +1 -1
- sky/dashboard/out/_next/static/96_E2yl3QAiIJGOYCkSpB/_buildManifest.js +1 -0
- sky/dashboard/out/_next/static/chunks/1141-e6aa9ab418717c59.js +11 -0
- sky/dashboard/out/_next/static/chunks/1272-1ef0bf0237faccdb.js +1 -0
- sky/dashboard/out/_next/static/chunks/1871-7e202677c42f43fe.js +6 -0
- sky/dashboard/out/_next/static/chunks/2260-7703229c33c5ebd5.js +1 -0
- sky/dashboard/out/_next/static/chunks/2350.fab69e61bac57b23.js +1 -0
- sky/dashboard/out/_next/static/chunks/2369.fc20f0c2c8ed9fe7.js +15 -0
- sky/dashboard/out/_next/static/chunks/2755.edd818326d489a1d.js +26 -0
- sky/dashboard/out/_next/static/chunks/3294.20a8540fe697d5ee.js +1 -0
- sky/dashboard/out/_next/static/chunks/3785.7e245f318f9d1121.js +1 -0
- sky/dashboard/out/_next/static/chunks/3800-7b45f9fbb6308557.js +1 -0
- sky/dashboard/out/_next/static/chunks/3850-ff4a9a69d978632b.js +1 -0
- sky/dashboard/out/_next/static/chunks/3937.210053269f121201.js +1 -0
- sky/dashboard/out/_next/static/chunks/4725.172ede95d1b21022.js +1 -0
- sky/dashboard/out/_next/static/chunks/4937.a2baa2df5572a276.js +15 -0
- sky/dashboard/out/_next/static/chunks/5739-d67458fcb1386c92.js +8 -0
- sky/dashboard/out/_next/static/chunks/6130-2be46d70a38f1e82.js +1 -0
- sky/dashboard/out/_next/static/chunks/616-3d59f75e2ccf9321.js +39 -0
- sky/dashboard/out/_next/static/chunks/6212-7bd06f60ba693125.js +13 -0
- sky/dashboard/out/_next/static/chunks/6856-8f27d1c10c98def8.js +1 -0
- sky/dashboard/out/_next/static/chunks/6989-01359c57e018caa4.js +1 -0
- sky/dashboard/out/_next/static/chunks/6990-9146207c4567fdfd.js +1 -0
- sky/dashboard/out/_next/static/chunks/7359-c8d04e06886000b3.js +30 -0
- sky/dashboard/out/_next/static/chunks/7411-b15471acd2cba716.js +41 -0
- sky/dashboard/out/_next/static/chunks/7615-019513abc55b3b47.js +1 -0
- sky/dashboard/out/_next/static/chunks/8640.5b9475a2d18c5416.js +16 -0
- sky/dashboard/out/_next/static/chunks/8969-452f9d5cbdd2dc73.js +1 -0
- sky/dashboard/out/_next/static/chunks/9025.fa408f3242e9028d.js +6 -0
- sky/dashboard/out/_next/static/chunks/9353-cff34f7e773b2e2b.js +1 -0
- sky/dashboard/out/_next/static/chunks/9360.a536cf6b1fa42355.js +31 -0
- sky/dashboard/out/_next/static/chunks/9847.3aaca6bb33455140.js +30 -0
- sky/dashboard/out/_next/static/chunks/fd9d1056-86323a29a8f7e46a.js +1 -0
- sky/dashboard/out/_next/static/chunks/framework-cf60a09ccd051a10.js +33 -0
- sky/dashboard/out/_next/static/chunks/main-app-587214043926b3cc.js +1 -0
- sky/dashboard/out/_next/static/chunks/main-f15ccb73239a3bf1.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/_app-bde01e4a2beec258.js +34 -0
- sky/dashboard/out/_next/static/chunks/pages/_error-c66a4e8afc46f17b.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-792db96d918c98c9.js +16 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-abfcac9c137aa543.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters-ee39056f9851a3ff.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/config-dfb9bf07b13045f4.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/index-444f1804401f04ea.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/infra/[context]-c0b5935149902e6f.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/infra-aed0ea19df7cf961.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-d66997e2bfc837cf.js +16 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs/pools/[pool]-9faf940b253e3e06.js +21 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs-2072b48b617989c9.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/users-f42674164aa73423.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/volumes-b84b948ff357c43e.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/workspace/new-3f88a1c7e86a3f86.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/workspaces/[name]-84a40f8c7c627fe4.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/workspaces-531b2f8c4bf89f82.js +1 -0
- sky/dashboard/out/_next/static/chunks/webpack-64e05f17bf2cf8ce.js +1 -0
- sky/dashboard/out/_next/static/css/0748ce22df867032.css +3 -0
- sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
- sky/dashboard/out/clusters/[cluster].html +1 -1
- sky/dashboard/out/clusters.html +1 -1
- sky/dashboard/out/config.html +1 -0
- sky/dashboard/out/index.html +1 -1
- sky/dashboard/out/infra/[context].html +1 -0
- sky/dashboard/out/infra.html +1 -0
- sky/dashboard/out/jobs/[job].html +1 -1
- sky/dashboard/out/jobs/pools/[pool].html +1 -0
- sky/dashboard/out/jobs.html +1 -1
- sky/dashboard/out/users.html +1 -0
- sky/dashboard/out/volumes.html +1 -0
- sky/dashboard/out/workspace/new.html +1 -0
- sky/dashboard/out/workspaces/[name].html +1 -0
- sky/dashboard/out/workspaces.html +1 -0
- sky/data/data_utils.py +137 -1
- sky/data/mounting_utils.py +269 -84
- sky/data/storage.py +1460 -1807
- sky/data/storage_utils.py +43 -57
- sky/exceptions.py +126 -2
- sky/execution.py +216 -63
- sky/global_user_state.py +2390 -586
- sky/jobs/__init__.py +7 -0
- sky/jobs/client/sdk.py +300 -58
- sky/jobs/client/sdk_async.py +161 -0
- sky/jobs/constants.py +15 -8
- sky/jobs/controller.py +848 -275
- sky/jobs/file_content_utils.py +128 -0
- sky/jobs/log_gc.py +193 -0
- sky/jobs/recovery_strategy.py +402 -152
- sky/jobs/scheduler.py +314 -189
- sky/jobs/server/core.py +836 -255
- sky/jobs/server/server.py +156 -115
- sky/jobs/server/utils.py +136 -0
- sky/jobs/state.py +2109 -706
- sky/jobs/utils.py +1306 -215
- sky/logs/__init__.py +21 -0
- sky/logs/agent.py +108 -0
- sky/logs/aws.py +243 -0
- sky/logs/gcp.py +91 -0
- sky/metrics/__init__.py +0 -0
- sky/metrics/utils.py +453 -0
- sky/models.py +78 -1
- sky/optimizer.py +164 -70
- sky/provision/__init__.py +90 -4
- sky/provision/aws/config.py +147 -26
- sky/provision/aws/instance.py +136 -50
- sky/provision/azure/instance.py +11 -6
- sky/provision/common.py +13 -1
- sky/provision/cudo/cudo_machine_type.py +1 -1
- sky/provision/cudo/cudo_utils.py +14 -8
- sky/provision/cudo/cudo_wrapper.py +72 -71
- sky/provision/cudo/instance.py +10 -6
- sky/provision/do/instance.py +10 -6
- sky/provision/do/utils.py +4 -3
- sky/provision/docker_utils.py +140 -33
- sky/provision/fluidstack/instance.py +13 -8
- sky/provision/gcp/__init__.py +1 -0
- sky/provision/gcp/config.py +301 -19
- sky/provision/gcp/constants.py +218 -0
- sky/provision/gcp/instance.py +36 -8
- sky/provision/gcp/instance_utils.py +18 -4
- sky/provision/gcp/volume_utils.py +247 -0
- sky/provision/hyperbolic/__init__.py +12 -0
- sky/provision/hyperbolic/config.py +10 -0
- sky/provision/hyperbolic/instance.py +437 -0
- sky/provision/hyperbolic/utils.py +373 -0
- sky/provision/instance_setup.py +101 -20
- sky/provision/kubernetes/__init__.py +5 -0
- sky/provision/kubernetes/config.py +9 -52
- sky/provision/kubernetes/constants.py +17 -0
- sky/provision/kubernetes/instance.py +919 -280
- sky/provision/kubernetes/manifests/fusermount-server-daemonset.yaml +1 -2
- sky/provision/kubernetes/network.py +27 -17
- sky/provision/kubernetes/network_utils.py +44 -43
- sky/provision/kubernetes/utils.py +1221 -534
- sky/provision/kubernetes/volume.py +343 -0
- sky/provision/lambda_cloud/instance.py +22 -16
- sky/provision/nebius/constants.py +50 -0
- sky/provision/nebius/instance.py +19 -6
- sky/provision/nebius/utils.py +237 -137
- sky/provision/oci/instance.py +10 -5
- sky/provision/paperspace/instance.py +10 -7
- sky/provision/paperspace/utils.py +1 -1
- sky/provision/primeintellect/__init__.py +10 -0
- sky/provision/primeintellect/config.py +11 -0
- sky/provision/primeintellect/instance.py +454 -0
- sky/provision/primeintellect/utils.py +398 -0
- sky/provision/provisioner.py +117 -36
- sky/provision/runpod/__init__.py +5 -0
- sky/provision/runpod/instance.py +27 -6
- sky/provision/runpod/utils.py +51 -18
- sky/provision/runpod/volume.py +214 -0
- sky/provision/scp/__init__.py +15 -0
- sky/provision/scp/config.py +93 -0
- sky/provision/scp/instance.py +707 -0
- sky/provision/seeweb/__init__.py +11 -0
- sky/provision/seeweb/config.py +13 -0
- sky/provision/seeweb/instance.py +812 -0
- sky/provision/shadeform/__init__.py +11 -0
- sky/provision/shadeform/config.py +12 -0
- sky/provision/shadeform/instance.py +351 -0
- sky/provision/shadeform/shadeform_utils.py +83 -0
- sky/provision/ssh/__init__.py +18 -0
- sky/provision/vast/instance.py +13 -8
- sky/provision/vast/utils.py +10 -7
- sky/provision/volume.py +164 -0
- sky/provision/vsphere/common/ssl_helper.py +1 -1
- sky/provision/vsphere/common/vapiconnect.py +2 -1
- sky/provision/vsphere/common/vim_utils.py +4 -4
- sky/provision/vsphere/instance.py +15 -10
- sky/provision/vsphere/vsphere_utils.py +17 -20
- sky/py.typed +0 -0
- sky/resources.py +845 -119
- sky/schemas/__init__.py +0 -0
- sky/schemas/api/__init__.py +0 -0
- sky/schemas/api/responses.py +227 -0
- sky/schemas/db/README +4 -0
- sky/schemas/db/env.py +90 -0
- sky/schemas/db/global_user_state/001_initial_schema.py +124 -0
- sky/schemas/db/global_user_state/002_add_workspace_to_cluster_history.py +35 -0
- sky/schemas/db/global_user_state/003_fix_initial_revision.py +61 -0
- sky/schemas/db/global_user_state/004_is_managed.py +34 -0
- sky/schemas/db/global_user_state/005_cluster_event.py +32 -0
- sky/schemas/db/global_user_state/006_provision_log.py +41 -0
- sky/schemas/db/global_user_state/007_cluster_event_request_id.py +34 -0
- sky/schemas/db/global_user_state/008_skylet_ssh_tunnel_metadata.py +34 -0
- sky/schemas/db/global_user_state/009_last_activity_and_launched_at.py +89 -0
- sky/schemas/db/global_user_state/010_save_ssh_key.py +66 -0
- sky/schemas/db/global_user_state/011_is_ephemeral.py +34 -0
- sky/schemas/db/kv_cache/001_initial_schema.py +29 -0
- sky/schemas/db/script.py.mako +28 -0
- sky/schemas/db/serve_state/001_initial_schema.py +67 -0
- sky/schemas/db/serve_state/002_yaml_content.py +34 -0
- sky/schemas/db/skypilot_config/001_initial_schema.py +30 -0
- sky/schemas/db/spot_jobs/001_initial_schema.py +97 -0
- sky/schemas/db/spot_jobs/002_cluster_pool.py +42 -0
- sky/schemas/db/spot_jobs/003_pool_hash.py +34 -0
- sky/schemas/db/spot_jobs/004_job_file_contents.py +42 -0
- sky/schemas/db/spot_jobs/005_logs_gc.py +38 -0
- sky/schemas/db/spot_jobs/006_controller_pid_started_at.py +34 -0
- sky/schemas/db/spot_jobs/007_config_file_content.py +34 -0
- sky/schemas/generated/__init__.py +0 -0
- sky/schemas/generated/autostopv1_pb2.py +36 -0
- sky/schemas/generated/autostopv1_pb2.pyi +43 -0
- sky/schemas/generated/autostopv1_pb2_grpc.py +146 -0
- sky/schemas/generated/jobsv1_pb2.py +86 -0
- sky/schemas/generated/jobsv1_pb2.pyi +254 -0
- sky/schemas/generated/jobsv1_pb2_grpc.py +542 -0
- sky/schemas/generated/managed_jobsv1_pb2.py +76 -0
- sky/schemas/generated/managed_jobsv1_pb2.pyi +278 -0
- sky/schemas/generated/managed_jobsv1_pb2_grpc.py +278 -0
- sky/schemas/generated/servev1_pb2.py +58 -0
- sky/schemas/generated/servev1_pb2.pyi +115 -0
- sky/schemas/generated/servev1_pb2_grpc.py +322 -0
- sky/serve/autoscalers.py +357 -5
- sky/serve/client/impl.py +310 -0
- sky/serve/client/sdk.py +47 -139
- sky/serve/client/sdk_async.py +130 -0
- sky/serve/constants.py +12 -9
- sky/serve/controller.py +68 -17
- sky/serve/load_balancer.py +106 -60
- sky/serve/load_balancing_policies.py +116 -2
- sky/serve/replica_managers.py +434 -249
- sky/serve/serve_rpc_utils.py +179 -0
- sky/serve/serve_state.py +569 -257
- sky/serve/serve_utils.py +775 -265
- sky/serve/server/core.py +66 -711
- sky/serve/server/impl.py +1093 -0
- sky/serve/server/server.py +21 -18
- sky/serve/service.py +192 -89
- sky/serve/service_spec.py +144 -20
- sky/serve/spot_placer.py +3 -0
- sky/server/auth/__init__.py +0 -0
- sky/server/auth/authn.py +50 -0
- sky/server/auth/loopback.py +38 -0
- sky/server/auth/oauth2_proxy.py +202 -0
- sky/server/common.py +478 -182
- sky/server/config.py +85 -23
- sky/server/constants.py +44 -6
- sky/server/daemons.py +295 -0
- sky/server/html/token_page.html +185 -0
- sky/server/metrics.py +160 -0
- sky/server/middleware_utils.py +166 -0
- sky/server/requests/executor.py +558 -138
- sky/server/requests/payloads.py +364 -24
- sky/server/requests/preconditions.py +21 -17
- sky/server/requests/process.py +112 -29
- sky/server/requests/request_names.py +121 -0
- sky/server/requests/requests.py +822 -226
- sky/server/requests/serializers/decoders.py +82 -31
- sky/server/requests/serializers/encoders.py +140 -22
- sky/server/requests/threads.py +117 -0
- sky/server/rest.py +455 -0
- sky/server/server.py +1309 -285
- sky/server/state.py +20 -0
- sky/server/stream_utils.py +327 -61
- sky/server/uvicorn.py +217 -3
- sky/server/versions.py +270 -0
- sky/setup_files/MANIFEST.in +11 -1
- sky/setup_files/alembic.ini +160 -0
- sky/setup_files/dependencies.py +139 -31
- sky/setup_files/setup.py +44 -42
- sky/sky_logging.py +114 -7
- sky/skylet/attempt_skylet.py +106 -24
- sky/skylet/autostop_lib.py +129 -8
- sky/skylet/configs.py +29 -20
- sky/skylet/constants.py +216 -25
- sky/skylet/events.py +101 -21
- sky/skylet/job_lib.py +345 -164
- sky/skylet/log_lib.py +297 -18
- sky/skylet/log_lib.pyi +44 -1
- sky/skylet/providers/ibm/node_provider.py +12 -8
- sky/skylet/providers/ibm/vpc_provider.py +13 -12
- sky/skylet/ray_patches/__init__.py +17 -3
- sky/skylet/ray_patches/autoscaler.py.diff +18 -0
- sky/skylet/ray_patches/cli.py.diff +19 -0
- sky/skylet/ray_patches/command_runner.py.diff +17 -0
- sky/skylet/ray_patches/log_monitor.py.diff +20 -0
- sky/skylet/ray_patches/resource_demand_scheduler.py.diff +32 -0
- sky/skylet/ray_patches/updater.py.diff +18 -0
- sky/skylet/ray_patches/worker.py.diff +41 -0
- sky/skylet/runtime_utils.py +21 -0
- sky/skylet/services.py +568 -0
- sky/skylet/skylet.py +72 -4
- sky/skylet/subprocess_daemon.py +104 -29
- sky/skypilot_config.py +506 -99
- sky/ssh_node_pools/__init__.py +1 -0
- sky/ssh_node_pools/core.py +135 -0
- sky/ssh_node_pools/server.py +233 -0
- sky/task.py +685 -163
- sky/templates/aws-ray.yml.j2 +11 -3
- sky/templates/azure-ray.yml.j2 +2 -1
- sky/templates/cudo-ray.yml.j2 +1 -0
- sky/templates/do-ray.yml.j2 +2 -1
- sky/templates/fluidstack-ray.yml.j2 +1 -0
- sky/templates/gcp-ray.yml.j2 +62 -1
- sky/templates/hyperbolic-ray.yml.j2 +68 -0
- sky/templates/ibm-ray.yml.j2 +2 -1
- sky/templates/jobs-controller.yaml.j2 +27 -24
- sky/templates/kubernetes-loadbalancer.yml.j2 +2 -0
- sky/templates/kubernetes-ray.yml.j2 +611 -50
- sky/templates/lambda-ray.yml.j2 +2 -1
- sky/templates/nebius-ray.yml.j2 +34 -12
- sky/templates/oci-ray.yml.j2 +1 -0
- sky/templates/paperspace-ray.yml.j2 +2 -1
- sky/templates/primeintellect-ray.yml.j2 +72 -0
- sky/templates/runpod-ray.yml.j2 +10 -1
- sky/templates/scp-ray.yml.j2 +4 -50
- sky/templates/seeweb-ray.yml.j2 +171 -0
- sky/templates/shadeform-ray.yml.j2 +73 -0
- sky/templates/sky-serve-controller.yaml.j2 +22 -2
- sky/templates/vast-ray.yml.j2 +1 -0
- sky/templates/vsphere-ray.yml.j2 +1 -0
- sky/templates/websocket_proxy.py +212 -37
- sky/usage/usage_lib.py +31 -15
- sky/users/__init__.py +0 -0
- sky/users/model.conf +15 -0
- sky/users/permission.py +397 -0
- sky/users/rbac.py +121 -0
- sky/users/server.py +720 -0
- sky/users/token_service.py +218 -0
- sky/utils/accelerator_registry.py +35 -5
- sky/utils/admin_policy_utils.py +84 -38
- sky/utils/annotations.py +38 -5
- sky/utils/asyncio_utils.py +78 -0
- sky/utils/atomic.py +1 -1
- sky/utils/auth_utils.py +153 -0
- sky/utils/benchmark_utils.py +60 -0
- sky/utils/cli_utils/status_utils.py +159 -86
- sky/utils/cluster_utils.py +31 -9
- sky/utils/command_runner.py +354 -68
- sky/utils/command_runner.pyi +93 -3
- sky/utils/common.py +35 -8
- sky/utils/common_utils.py +314 -91
- sky/utils/config_utils.py +74 -5
- sky/utils/context.py +403 -0
- sky/utils/context_utils.py +242 -0
- sky/utils/controller_utils.py +383 -89
- sky/utils/dag_utils.py +31 -12
- sky/utils/db/__init__.py +0 -0
- sky/utils/db/db_utils.py +485 -0
- sky/utils/db/kv_cache.py +149 -0
- sky/utils/db/migration_utils.py +137 -0
- sky/utils/directory_utils.py +12 -0
- sky/utils/env_options.py +13 -0
- sky/utils/git.py +567 -0
- sky/utils/git_clone.sh +460 -0
- sky/utils/infra_utils.py +195 -0
- sky/utils/kubernetes/cleanup-tunnel.sh +62 -0
- sky/utils/kubernetes/config_map_utils.py +133 -0
- sky/utils/kubernetes/create_cluster.sh +15 -29
- sky/utils/kubernetes/delete_cluster.sh +10 -7
- sky/utils/kubernetes/deploy_ssh_node_pools.py +1177 -0
- sky/utils/kubernetes/exec_kubeconfig_converter.py +22 -31
- sky/utils/kubernetes/generate_kind_config.py +6 -66
- sky/utils/kubernetes/generate_kubeconfig.sh +4 -1
- sky/utils/kubernetes/gpu_labeler.py +18 -8
- sky/utils/kubernetes/k8s_gpu_labeler_job.yaml +2 -1
- sky/utils/kubernetes/k8s_gpu_labeler_setup.yaml +16 -16
- sky/utils/kubernetes/kubernetes_deploy_utils.py +284 -114
- sky/utils/kubernetes/rsync_helper.sh +11 -3
- sky/utils/kubernetes/ssh-tunnel.sh +379 -0
- sky/utils/kubernetes/ssh_utils.py +221 -0
- sky/utils/kubernetes_enums.py +8 -15
- sky/utils/lock_events.py +94 -0
- sky/utils/locks.py +416 -0
- sky/utils/log_utils.py +82 -107
- sky/utils/perf_utils.py +22 -0
- sky/utils/resource_checker.py +298 -0
- sky/utils/resources_utils.py +249 -32
- sky/utils/rich_utils.py +217 -39
- sky/utils/schemas.py +955 -160
- sky/utils/serialize_utils.py +16 -0
- sky/utils/status_lib.py +10 -0
- sky/utils/subprocess_utils.py +29 -15
- sky/utils/tempstore.py +70 -0
- sky/utils/thread_utils.py +91 -0
- sky/utils/timeline.py +26 -53
- sky/utils/ux_utils.py +84 -15
- sky/utils/validator.py +11 -1
- sky/utils/volume.py +165 -0
- sky/utils/yaml_utils.py +111 -0
- sky/volumes/__init__.py +13 -0
- sky/volumes/client/__init__.py +0 -0
- sky/volumes/client/sdk.py +150 -0
- sky/volumes/server/__init__.py +0 -0
- sky/volumes/server/core.py +270 -0
- sky/volumes/server/server.py +124 -0
- sky/volumes/volume.py +215 -0
- sky/workspaces/__init__.py +0 -0
- sky/workspaces/core.py +655 -0
- sky/workspaces/server.py +101 -0
- sky/workspaces/utils.py +56 -0
- sky_templates/README.md +3 -0
- sky_templates/__init__.py +3 -0
- sky_templates/ray/__init__.py +0 -0
- sky_templates/ray/start_cluster +183 -0
- sky_templates/ray/stop_cluster +75 -0
- skypilot_nightly-1.0.0.dev20251203.dist-info/METADATA +676 -0
- skypilot_nightly-1.0.0.dev20251203.dist-info/RECORD +611 -0
- {skypilot_nightly-1.0.0.dev20250502.dist-info → skypilot_nightly-1.0.0.dev20251203.dist-info}/WHEEL +1 -1
- skypilot_nightly-1.0.0.dev20251203.dist-info/top_level.txt +2 -0
- sky/benchmark/benchmark_state.py +0 -256
- sky/benchmark/benchmark_utils.py +0 -641
- sky/clouds/service_catalog/constants.py +0 -7
- sky/dashboard/out/_next/static/GWvVBSCS7FmUiVmjaL1a7/_buildManifest.js +0 -1
- sky/dashboard/out/_next/static/chunks/236-2db3ee3fba33dd9e.js +0 -6
- sky/dashboard/out/_next/static/chunks/312-c3c8845990db8ffc.js +0 -15
- sky/dashboard/out/_next/static/chunks/37-0a572fe0dbb89c4d.js +0 -6
- sky/dashboard/out/_next/static/chunks/678-206dddca808e6d16.js +0 -59
- sky/dashboard/out/_next/static/chunks/845-9e60713e0c441abc.js +0 -1
- sky/dashboard/out/_next/static/chunks/979-7bf73a4c7cea0f5c.js +0 -1
- sky/dashboard/out/_next/static/chunks/fd9d1056-2821b0f0cabcd8bd.js +0 -1
- sky/dashboard/out/_next/static/chunks/framework-87d061ee6ed71b28.js +0 -33
- sky/dashboard/out/_next/static/chunks/main-app-241eb28595532291.js +0 -1
- sky/dashboard/out/_next/static/chunks/main-e0e2335212e72357.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/_app-e6b013bc3f77ad60.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/_error-1be831200e60c5c0.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-6ac338bc2239cb45.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-f383db7389368ea7.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/clusters-a93b93e10b8b074e.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/index-f9f039532ca8cbc4.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-1c519e1afc523dc9.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/jobs-a75029b67aab6a2e.js +0 -1
- sky/dashboard/out/_next/static/chunks/webpack-830f59b8404e96b8.js +0 -1
- sky/dashboard/out/_next/static/css/c6933bbb2ce7f4dd.css +0 -3
- sky/jobs/dashboard/dashboard.py +0 -223
- sky/jobs/dashboard/static/favicon.ico +0 -0
- sky/jobs/dashboard/templates/index.html +0 -831
- sky/jobs/server/dashboard_utils.py +0 -69
- sky/skylet/providers/scp/__init__.py +0 -2
- sky/skylet/providers/scp/config.py +0 -149
- sky/skylet/providers/scp/node_provider.py +0 -578
- sky/templates/kubernetes-ssh-jump.yml.j2 +0 -94
- sky/utils/db_utils.py +0 -100
- sky/utils/kubernetes/deploy_remote_cluster.sh +0 -308
- sky/utils/kubernetes/ssh_jump_lifecycle_manager.py +0 -191
- skypilot_nightly-1.0.0.dev20250502.dist-info/METADATA +0 -361
- skypilot_nightly-1.0.0.dev20250502.dist-info/RECORD +0 -396
- skypilot_nightly-1.0.0.dev20250502.dist-info/top_level.txt +0 -1
- /sky/{clouds/service_catalog → catalog}/config.py +0 -0
- /sky/{benchmark → catalog/data_fetchers}/__init__.py +0 -0
- /sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_azure.py +0 -0
- /sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_fluidstack.py +0 -0
- /sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_ibm.py +0 -0
- /sky/{clouds/service_catalog/data_fetchers → client/cli}/__init__.py +0 -0
- /sky/dashboard/out/_next/static/{GWvVBSCS7FmUiVmjaL1a7 → 96_E2yl3QAiIJGOYCkSpB}/_ssgManifest.js +0 -0
- {skypilot_nightly-1.0.0.dev20250502.dist-info → skypilot_nightly-1.0.0.dev20251203.dist-info}/entry_points.txt +0 -0
- {skypilot_nightly-1.0.0.dev20250502.dist-info → skypilot_nightly-1.0.0.dev20251203.dist-info}/licenses/LICENSE +0 -0
|
@@ -0,0 +1,185 @@
|
|
|
1
|
+
<!DOCTYPE html>
|
|
2
|
+
<html lang="en">
|
|
3
|
+
<head>
|
|
4
|
+
<meta charset="UTF-8">
|
|
5
|
+
<meta name="viewport" content="width=device-width, initial-scale=1.0">
|
|
6
|
+
<title>SkyPilot API Server Login</title>
|
|
7
|
+
<style>
|
|
8
|
+
body {
|
|
9
|
+
font-family: -apple-system, BlinkMacSystemFont, "Segoe UI", Roboto, Helvetica, Arial, sans-serif;
|
|
10
|
+
display: flex;
|
|
11
|
+
flex-direction: column;
|
|
12
|
+
align-items: center;
|
|
13
|
+
justify-content: center;
|
|
14
|
+
min-height: 100vh;
|
|
15
|
+
margin: 0;
|
|
16
|
+
background-color: #f8f9fa;
|
|
17
|
+
color: #202124;
|
|
18
|
+
padding: 20px;
|
|
19
|
+
box-sizing: border-box;
|
|
20
|
+
}
|
|
21
|
+
.container {
|
|
22
|
+
background-color: #ffffff;
|
|
23
|
+
padding: 48px;
|
|
24
|
+
border-radius: 8px;
|
|
25
|
+
box-shadow: 0 1px 3px rgba(0,0,0,0.12), 0 1px 2px rgba(0,0,0,0.24);
|
|
26
|
+
text-align: center;
|
|
27
|
+
max-width: 600px;
|
|
28
|
+
width: 100%;
|
|
29
|
+
}
|
|
30
|
+
.logo {
|
|
31
|
+
width: 64px;
|
|
32
|
+
height: 64px;
|
|
33
|
+
margin-bottom: 20px;
|
|
34
|
+
display: inline-block;
|
|
35
|
+
}
|
|
36
|
+
.logo svg {
|
|
37
|
+
width: 100%;
|
|
38
|
+
height: 100%;
|
|
39
|
+
}
|
|
40
|
+
h1 {
|
|
41
|
+
font-size: 24px;
|
|
42
|
+
font-weight: 500;
|
|
43
|
+
margin-bottom: 20px;
|
|
44
|
+
color: #202124;
|
|
45
|
+
}
|
|
46
|
+
p {
|
|
47
|
+
font-size: 14px;
|
|
48
|
+
line-height: 1.5;
|
|
49
|
+
margin-bottom: 20px;
|
|
50
|
+
color: #5f6368;
|
|
51
|
+
}
|
|
52
|
+
.user-identifier {
|
|
53
|
+
font-size: 12px; /* Smaller font size */
|
|
54
|
+
color: #80868b; /* Lighter color */
|
|
55
|
+
margin-bottom: 8px; /* Adjusted margin */
|
|
56
|
+
}
|
|
57
|
+
.code-block {
|
|
58
|
+
background-color: #f1f3f4;
|
|
59
|
+
border: 1px solid #dadce0;
|
|
60
|
+
border-radius: 4px;
|
|
61
|
+
padding: 16px;
|
|
62
|
+
margin-top: 24px;
|
|
63
|
+
margin-bottom: 24px;
|
|
64
|
+
margin-left: auto;
|
|
65
|
+
margin-right: auto;
|
|
66
|
+
text-align: left;
|
|
67
|
+
word-break: break-all;
|
|
68
|
+
white-space: pre-wrap;
|
|
69
|
+
font-family: "SFMono-Regular", Consolas, "Liberation Mono", Menlo, Courier, monospace;
|
|
70
|
+
font-size: 13px;
|
|
71
|
+
line-height: 1.4;
|
|
72
|
+
max-width: 480px;
|
|
73
|
+
}
|
|
74
|
+
#token-box { /* Specifically for the token */
|
|
75
|
+
height: auto;
|
|
76
|
+
min-height: 6em; /* Ensure it's a reasonable size */
|
|
77
|
+
max-height: 15em; /* Prevent it from getting too large */
|
|
78
|
+
overflow-y: auto;
|
|
79
|
+
}
|
|
80
|
+
.copy-button {
|
|
81
|
+
background-color: #1a73e8;
|
|
82
|
+
color: white;
|
|
83
|
+
border: none;
|
|
84
|
+
border-radius: 4px;
|
|
85
|
+
padding: 10px 24px;
|
|
86
|
+
font-size: 14px;
|
|
87
|
+
font-weight: 500;
|
|
88
|
+
cursor: pointer;
|
|
89
|
+
transition: background-color 0.3s;
|
|
90
|
+
margin-top: 10px;
|
|
91
|
+
}
|
|
92
|
+
.copy-button:hover {
|
|
93
|
+
background-color: #287ae6;
|
|
94
|
+
}
|
|
95
|
+
.copy-button:active {
|
|
96
|
+
background-color: #1b66c9;
|
|
97
|
+
}
|
|
98
|
+
.footer-text {
|
|
99
|
+
font-size: 12px;
|
|
100
|
+
color: #5f6368;
|
|
101
|
+
margin-top: 30px;
|
|
102
|
+
}
|
|
103
|
+
.local-port-info {
|
|
104
|
+
display: none;
|
|
105
|
+
}
|
|
106
|
+
</style>
|
|
107
|
+
</head>
|
|
108
|
+
<body>
|
|
109
|
+
<div class="container">
|
|
110
|
+
<div class="logo">
|
|
111
|
+
<!-- SkyPilot Logo Icon -->
|
|
112
|
+
<svg viewBox="0 0 50 50" fill="none" xmlns="http://www.w3.org/2000/svg">
|
|
113
|
+
<path d="M25.1258 30.8274L19.2842 31.6783L33.8316 46.2268L31.492 37.1925L25.1258 30.8274Z" fill="#372F8A"/>
|
|
114
|
+
<path d="M46.9433 0.000976562L0.719727 13.1148L15.2661 27.6601L16.633 21.3925L10.3728 15.1323L40.183 6.74118C40.183 6.74118 46.102 0.855027 46.9444 0.00203721L46.9433 0.000976562Z" fill="#372F8A"/>
|
|
115
|
+
<path d="M40.1821 6.74021L31.4922 37.1925L33.8318 46.2257L46.9445 0C46.1022 0.85299 40.1831 6.73915 40.1831 6.73915L40.1821 6.74021Z" fill="#372F8A"/>
|
|
116
|
+
<path d="M21.3356 25.6089L19.2842 31.6783L25.1258 30.8275L30.3741 16.6011L30.3275 16.617L21.3356 25.6089Z" fill="#195D7F"/>
|
|
117
|
+
<path d="M16.632 21.3918L15.2651 27.6605L21.3357 25.6091L30.3276 16.6172L16.632 21.3918Z" fill="#39A4DD"/>
|
|
118
|
+
</svg>
|
|
119
|
+
</div>
|
|
120
|
+
<h1 class="no-local-port">Sign in to SkyPilot CLI</h1>
|
|
121
|
+
<h1 class="local-port-info">Successfully signed into SkyPilot CLI</h1>
|
|
122
|
+
<p class="user-identifier">USER_PLACEHOLDER</p>
|
|
123
|
+
<!-- display token info by default -->
|
|
124
|
+
<p class="no-local-port">You are seeing this page because a SkyPilot command requires authentication.</p>
|
|
125
|
+
<p class="no-local-port">Please copy the following token and paste it into your SkyPilot CLI prompt:</p>
|
|
126
|
+
<div id="token-box" class="code-block no-local-port">SKYPILOT_API_SERVER_USER_TOKEN_PLACEHOLDER</div>
|
|
127
|
+
<button id="copy-btn" class="copy-button no-local-port">Copy Token</button>
|
|
128
|
+
<p class="footer-text no-local-port">You can close this tab after copying the token.</p>
|
|
129
|
+
|
|
130
|
+
<!-- don't display local port info unless successful -->
|
|
131
|
+
<p class="local-port-info">You can now close this tab.</p>
|
|
132
|
+
</div>
|
|
133
|
+
|
|
134
|
+
<script>
|
|
135
|
+
// Page elements used by the manual copy/paste flow: the box that displays
// the token and the button that copies it.
const tokenBox = document.querySelector('#token-box');
const copyBtn = document.querySelector('#copy-btn');
|
|
137
|
+
|
|
138
|
+
// Highlight the whole token so it can be copied in one step.
function selectToken() {
  const selection = window.getSelection();
  const tokenRange = document.createRange();
  // The token lives in a <div>, so its contents are selected through a
  // Range rather than an input-style select() call.
  tokenRange.selectNodeContents(tokenBox);
  // Replace whatever was selected before with just the token.
  selection.removeAllRanges();
  selection.addRange(tokenRange);
}
|
|
146
|
+
|
|
147
|
+
// Convenience: select the token whenever the token box is clicked, and once
// more when the page has finished loading.
tokenBox.addEventListener('click', selectToken);
window.addEventListener('load', selectToken);
|
|
150
|
+
|
|
151
|
+
// Copy the token to the clipboard when the button is clicked and briefly show
// the outcome on the button label.
copyBtn.addEventListener('click', () => {
  selectToken(); // execCommand('copy') copies whatever is currently selected.
  let copied = false;
  try {
    // execCommand signals an unsupported/disabled command by returning false
    // rather than by throwing, so the return value must be checked as well.
    copied = document.execCommand('copy');
    if (!copied) {
      console.error('Failed to copy text: ', 'copy command was rejected');
    }
  } catch (err) {
    console.error('Failed to copy text: ', err);
  }
  copyBtn.textContent = copied ? 'Copied!' : 'Error!';
  // Restore the original label after two seconds.
  setTimeout(() => {
    copyBtn.textContent = 'Copy Token';
  }, 2000);
});
|
|
164
|
+
|
|
165
|
+
/**
 * Switch the page from the "copy this token" view to the "signed in" view:
 * hide every element marked `no-local-port` and strip the `local-port-info`
 * class from the elements that carry it.
 */
function hideTokenInfo() {
  for (const tokenElem of document.querySelectorAll('.no-local-port')) {
    tokenElem.style.display = 'none';
  }
  // querySelectorAll returns a static list, so removing the class while
  // iterating does not affect the iteration.
  for (const successElem of document.querySelectorAll('.local-port-info')) {
    successElem.classList.remove('local-port-info');
  }
}
|
|
175
|
+
|
|
176
|
+
// When the page URL carries a `local_port` query parameter, POST the token to
// http://localhost:<port> and switch to the success view once the request
// resolves. If anything fails, the manual copy-the-token view stays visible.
if (window.location.search.includes('local_port=')) {
  const localPort =
      new URLSearchParams(window.location.search).get('local_port');
  // Only accept a plain decimal port number. Interpolating the raw query text
  // into the URL would let a crafted link such as `?local_port=1@evil.example`
  // redirect the token to another host.
  const isValidPort = localPort !== null && /^[0-9]{1,5}$/.test(localPort) &&
      Number(localPort) >= 1 && Number(localPort) <= 65535;
  if (isValidPort) {
    fetch(`http://localhost:${localPort}`, {
      method: 'POST',
      body: 'SKYPILOT_API_SERVER_USER_TOKEN_PLACEHOLDER'
    }).then(hideTokenInfo).catch((err) => {
      console.error('Failed to send token to local port: ', err);
    });
  } else {
    console.error('Ignoring invalid local_port parameter: ', localPort);
  }
}
|
|
183
|
+
</script>
|
|
184
|
+
</body>
|
|
185
|
+
</html>
|
sky/server/metrics.py
ADDED
|
@@ -0,0 +1,160 @@
|
|
|
1
|
+
"""Instrumentation for the API server."""
|
|
2
|
+
|
|
3
|
+
import asyncio
|
|
4
|
+
import multiprocessing
|
|
5
|
+
import os
|
|
6
|
+
import threading
|
|
7
|
+
import time
|
|
8
|
+
from typing import List
|
|
9
|
+
|
|
10
|
+
import fastapi
|
|
11
|
+
from prometheus_client import generate_latest
|
|
12
|
+
from prometheus_client import multiprocess
|
|
13
|
+
import prometheus_client as prom
|
|
14
|
+
import psutil
|
|
15
|
+
import starlette.middleware.base
|
|
16
|
+
import uvicorn
|
|
17
|
+
|
|
18
|
+
from sky import core
|
|
19
|
+
from sky import sky_logging
|
|
20
|
+
from sky.metrics import utils as metrics_utils
|
|
21
|
+
|
|
22
|
+
logger = sky_logging.init_logger(__name__)
|
|
23
|
+
|
|
24
|
+
metrics_app = fastapi.FastAPI()
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
# Serve /metrics in dedicated thread to avoid blocking the event loop
|
|
28
|
+
# of metrics server.
|
|
29
|
+
@metrics_app.get('/metrics')
def metrics() -> fastapi.Response:
    """Serve the Prometheus metrics of this server in exposition format.

    When the PROMETHEUS_MULTIPROC_DIR environment variable is set, metrics are
    aggregated through a fresh registry backed by a MultiProcessCollector;
    otherwise the default registry is rendered.
    """
    multiproc_dir = os.environ.get('PROMETHEUS_MULTIPROC_DIR')
    if not multiproc_dir:
        payload = generate_latest()
    else:
        # In multiprocess mode, we need to collect metrics from all processes.
        aggregated = prom.CollectorRegistry()
        multiprocess.MultiProcessCollector(aggregated)
        payload = generate_latest(aggregated)
    return fastapi.Response(content=payload,
                            media_type=prom.CONTENT_TYPE_LATEST,
                            headers={'Cache-Control': 'no-cache'})
|
|
42
|
+
|
|
43
|
+
|
|
44
|
+
@metrics_app.get('/gpu-metrics')
async def gpu_metrics() -> fastapi.Response:
    """Gets the GPU metrics from multiple external k8s clusters.

    Metrics are fetched concurrently for every context returned by
    ``core.get_all_contexts()`` except ``'in-cluster'``. A context whose fetch
    raises an ``Exception`` is logged and skipped; the remaining results are
    joined with blank lines.

    Returns:
        A plain-text response containing the combined metrics.
    """
    # Filter first so that `contexts` lines up index-for-index with the
    # gathered results; otherwise a failure would be attributed to the wrong
    # context whenever 'in-cluster' appears earlier in the list.
    contexts = [
        context for context in core.get_all_contexts()
        if context != 'in-cluster'
    ]
    tasks = [
        asyncio.create_task(metrics_utils.get_metrics_for_context(context))
        for context in contexts
    ]

    results = await asyncio.gather(*tasks, return_exceptions=True)

    all_metrics: List[str] = []
    for context, result in zip(contexts, results):
        if isinstance(result, Exception):
            logger.error(
                f'Failed to get metrics for context {context}: {result}')
        elif isinstance(result, BaseException):
            # Avoid changing behavior for non-Exception BaseExceptions
            # like KeyboardInterrupt/SystemExit: re-raise them.
            raise result
        else:
            all_metrics.append(result)

    combined_metrics = '\n\n'.join(all_metrics)

    # Return as plain text for Prometheus compatibility
    return fastapi.Response(
        content=combined_metrics,
        media_type='text/plain; version=0.0.4; charset=utf-8')
|
|
78
|
+
|
|
79
|
+
|
|
80
|
+
def build_metrics_server(host: str, port: int) -> uvicorn.Server:
    """Construct the uvicorn server for the metrics app.

    The server is only built here, not started. It is configured with a
    single worker serving 'sky.server.metrics:metrics_app'.

    Args:
        host: Address to bind to.
        port: Port to listen on.

    Returns:
        The configured uvicorn.Server instance.
    """
    return uvicorn.Server(
        uvicorn.Config(
            'sky.server.metrics:metrics_app',
            host=host,
            port=port,
            workers=1,
        ))
|
|
89
|
+
|
|
90
|
+
|
|
91
|
+
def _get_status_code_group(status_code: int) -> str:
|
|
92
|
+
"""Group status codes into classes (2xx, 5xx) to reduce cardinality."""
|
|
93
|
+
return f'{status_code // 100}xx'
|
|
94
|
+
|
|
95
|
+
|
|
96
|
+
def _is_streaming_api(path: str) -> bool:
|
|
97
|
+
"""Check if the path is a streaming API."""
|
|
98
|
+
path = path.rstrip('/')
|
|
99
|
+
return path.endswith('/logs') or path.endswith('/api/stream')
|
|
100
|
+
|
|
101
|
+
|
|
102
|
+
class PrometheusMiddleware(starlette.middleware.base.BaseHTTPMiddleware):
    """Middleware to collect Prometheus metrics for HTTP requests."""

    async def dispatch(self, request: fastapi.Request, call_next):
        """Count the request and, for non-streaming APIs, time it.

        Every request increments SKY_APISERVER_REQUESTS_TOTAL labelled with
        its path, method and status class. If the downstream handler raises
        an Exception, the status class is recorded as '5xx' and the exception
        is re-raised. Request duration is observed only for non-streaming
        paths.
        """
        path = request.url.path
        logger.debug(f'PROM Middleware Request: {request}, {request.url.path}')
        streaming = _is_streaming_api(path)
        # Use a monotonic high-resolution clock: wall-clock time can jump
        # (e.g. on clock adjustment) and would then yield bogus, possibly
        # negative durations.
        start_time = time.perf_counter()
        method = request.method
        status_code_group = ''

        try:
            response = await call_next(request)
            status_code_group = _get_status_code_group(response.status_code)
        except Exception:  # pylint: disable=broad-except
            status_code_group = '5xx'
            raise
        finally:
            metrics_utils.SKY_APISERVER_REQUESTS_TOTAL.labels(
                path=path, method=method, status=status_code_group).inc()
            if not streaming:
                # Exclude streaming APIs, the duration is not meaningful.
                # TODO(aylei): measure the duration of async execution instead.
                duration = time.perf_counter() - start_time
                metrics_utils.SKY_APISERVER_REQUEST_DURATION_SECONDS.labels(
                    path=path, method=method,
                    status=status_code_group).observe(duration)

        return response
|
|
132
|
+
|
|
133
|
+
|
|
134
|
+
# Peak RSS (bytes) of the current process within the current time bucket, as
# last published by process_monitor().
peak_rss_bytes = 0


def process_monitor(process_type: str, stop: threading.Event):
    """Publish memory and CPU gauges for the current process once a second.

    Runs until `stop` is set. On every iteration it updates:
      - SKY_APISERVER_PROCESS_PEAK_RSS: the highest RSS seen in the current
        30-second bucket (the peak is reset when a new bucket starts);
      - SKY_APISERVER_PROCESS_CPU_TOTAL: user and system CPU times.

    Args:
        process_type: Value used for the `type` label of the gauges.
        stop: Event that terminates the monitoring loop when set.
    """
    global peak_rss_bytes
    pid = multiprocessing.current_process().pid
    proc = psutil.Process(pid)
    last_bucket_end = time.time()
    bucket_peak = 0
    while not stop.is_set():
        if time.time() - last_bucket_end >= 30:
            # Reset peak RSS for the next time bucket.
            last_bucket_end = time.time()
            bucket_peak = 0
        # Carry the running maximum forward inside the bucket; without this
        # the gauge would only ever show the instantaneous RSS.
        bucket_peak = max(bucket_peak, proc.memory_info().rss)
        peak_rss_bytes = bucket_peak
        metrics_utils.SKY_APISERVER_PROCESS_PEAK_RSS.labels(
            pid=pid, type=process_type).set(peak_rss_bytes)
        ctimes = proc.cpu_times()
        metrics_utils.SKY_APISERVER_PROCESS_CPU_TOTAL.labels(pid=pid,
                                                             type=process_type,
                                                             mode='user').set(
                                                                 ctimes.user)
        metrics_utils.SKY_APISERVER_PROCESS_CPU_TOTAL.labels(pid=pid,
                                                             type=process_type,
                                                             mode='system').set(
                                                                 ctimes.system)
        time.sleep(1)
|
|
@@ -0,0 +1,166 @@
|
|
|
1
|
+
"""Utilities for building middlewares."""
|
|
2
|
+
import enum
|
|
3
|
+
import http
|
|
4
|
+
from typing import Type
|
|
5
|
+
|
|
6
|
+
import fastapi
|
|
7
|
+
import starlette.middleware.base
|
|
8
|
+
import starlette.types
|
|
9
|
+
|
|
10
|
+
from sky import sky_logging
|
|
11
|
+
|
|
12
|
+
logger = sky_logging.init_logger(__name__)
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
class WebSocketDecision(enum.Enum):
    """Outcome of running an HTTP middleware against a WebSocket handshake."""
    # The connection is handed over to the wrapped ASGI app.
    ACCEPT = 'accept'
    # The middleware answered 401; the socket is closed with code 4401.
    UNAUTHORIZED = 'unauthorized'
    # The middleware answered 403; the socket is closed with code 4403.
    FORBIDDEN = 'forbidden'
    # Any other rejection or a middleware exception; closed with code 1011.
    ERROR = 'error'
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
def websocket_aware(
        middleware_cls: Type[starlette.middleware.base.BaseHTTPMiddleware]):
    """Decorator to adapt BaseHTTPMiddleware to handle WebSockets.

    It assembles an HTTP-style request like the HTTP upgrade request during
    websocket handshake and then delegates it to the real HTTP middleware.
    The websocket connection is accepted only if the HTTP middleware invoked
    call_next() and returned a 2xx or 3xx status code; otherwise it is closed
    (4401 for a 401 response, 4403 for a 403 response, 1011 for anything else
    including an exception raised by the middleware).

    Note: for websocket connection, the mutation made by the underlying HTTP
    middleware on the request and response will be discarded.

    Args:
        middleware_cls: The BaseHTTPMiddleware subclass to wrap.

    Returns:
        A wrapper class that carries the name, qualname, module and docstring
        of middleware_cls and handles both HTTP and WebSocket scopes.
    """

    class WebSocketAwareMiddleware:
        """WebSocket-aware middleware wrapper."""

        def __init__(self, app: starlette.types.ASGIApp, *args, **kwargs):
            # Keep the raw app so accepted websockets can bypass the HTTP
            # middleware and be served by the app directly.
            self.app = app
            self.middleware = middleware_cls(app, *args, **kwargs)

        async def __call__(self, scope: starlette.types.Scope,
                           receive: starlette.types.Receive,
                           send: starlette.types.Send):
            """ASGI entry point: route websocket scopes to the adapter."""
            scope_type = scope.get('type')
            if scope_type == 'websocket':
                await self._handle_websocket(scope, receive, send)
            else:
                # Delegate other scopes to the underlying HTTP middleware.
                await self.middleware(scope, receive, send)

        async def dispatch(
                self, request: fastapi.Request,
                call_next: starlette.middleware.base.RequestResponseEndpoint):
            """Implement dispatch method to keep compatibility."""
            return await self.middleware.dispatch(request, call_next)

        async def _handle_websocket(self, scope: starlette.types.Scope,
                                    receive: starlette.types.Receive,
                                    send: starlette.types.Send):
            """Handle websocket connection by delegating to HTTP middleware."""
            decision = await self._run_websocket_dispatch(scope)
            if decision == WebSocketDecision.ACCEPT:
                await self.app(scope, receive, send)
            elif decision == WebSocketDecision.UNAUTHORIZED:
                # 4401: private-use close code echoing HTTP 401.
                await send({
                    'type': 'websocket.close',
                    'code': 4401,
                    'reason': 'Unauthorized',
                })
            elif decision == WebSocketDecision.FORBIDDEN:
                # 4403: private-use close code echoing HTTP 403.
                await send({
                    'type': 'websocket.close',
                    'code': 4403,
                    'reason': 'Forbidden',
                })
            else:
                # 1011: the standard WebSocket "internal error" close code.
                await send({
                    'type': 'websocket.close',
                    'code': 1011,
                    'reason': 'Internal Server Error',
                })

        async def _run_websocket_dispatch(
                self, scope: starlette.types.Scope) -> WebSocketDecision:
            """Run the HTTP middleware on a synthetic request and classify it.

            Returns:
                ACCEPT if the middleware called call_next() and the response
                status is in [200, 400); UNAUTHORIZED / FORBIDDEN for 401 /
                403 responses; ERROR otherwise, including when dispatch
                raises.
            """
            http_scope = self._build_http_scope(scope)
            http_receive = self._http_receive_adapter()
            request = fastapi.Request(http_scope, receive=http_receive)
            call_next_called = False
            # Placeholder returned from call_next(); the real websocket app is
            # only invoked later, once the connection has been accepted.
            stub_response = fastapi.Response(status_code=http.HTTPStatus.OK)

            async def call_next(req):
                del req
                # Capture whether call_next() is called in the underlying
                # HTTP middleware to determine if we can proceed with current
                # websocket connection.
                nonlocal call_next_called
                call_next_called = True
                return stub_response

            try:
                response = await self.dispatch(request, call_next)
            except Exception as e:  # pylint: disable=broad-except
                logger.error('Exception occurred in middleware dispatch for '
                             f'WebSocket scope: {e}')
                return WebSocketDecision.ERROR

            # Treat a middleware that returns nothing as having passed through.
            if response is None:
                response = stub_response

            status_code = response.status_code

            # A success status alone is not enough: the middleware must also
            # have forwarded the request via call_next().
            if call_next_called and 200 <= status_code < 400:
                return WebSocketDecision.ACCEPT
            if status_code == http.HTTPStatus.UNAUTHORIZED:
                return WebSocketDecision.UNAUTHORIZED
            if status_code == http.HTTPStatus.FORBIDDEN:
                return WebSocketDecision.FORBIDDEN
            return WebSocketDecision.ERROR

        @staticmethod
        def _build_http_scope(
                scope: starlette.types.Scope) -> starlette.types.Scope:
            """Derive an HTTP GET scope from a websocket scope.

            The result is a shallow copy of `scope` with type, scheme, method
            and http_version rewritten. Note that this inserts a 'state' dict
            into the original scope if it has none, and the copy references
            that same dict object.
            """
            state = scope.setdefault('state', {})
            scheme = scope.get('scheme', 'ws')
            # Map websocket schemes to their HTTP counterparts.
            if scheme == 'ws':
                http_scheme = 'http'
            elif scheme == 'wss':
                http_scheme = 'https'
            else:
                http_scheme = scheme
            http_scope = dict(scope)
            http_scope['type'] = 'http'
            http_scope['scheme'] = http_scheme
            http_scope['method'] = 'GET'
            http_scope['http_version'] = scope.get('http_version', '1.1')
            http_scope['state'] = state
            return http_scope

        @staticmethod
        def _http_receive_adapter() -> starlette.types.Receive:
            """Adapter that mimics the sequence produced by Starlette for an
            HTTP request: a single http.request event (with an empty body)
            followed by http.disconnect events.
            """
            sent = False

            async def receive():
                nonlocal sent
                if not sent:
                    sent = True
                    return {
                        'type': 'http.request',
                        'body': b'',
                        'more_body': False,
                    }
                return {
                    'type': 'http.disconnect',
                }

            return receive

    # Make the wrapper present itself as the wrapped middleware class.
    WebSocketAwareMiddleware.__name__ = middleware_cls.__name__
    WebSocketAwareMiddleware.__qualname__ = middleware_cls.__qualname__
    WebSocketAwareMiddleware.__module__ = middleware_cls.__module__
    WebSocketAwareMiddleware.__doc__ = middleware_cls.__doc__
    return WebSocketAwareMiddleware
|