skypilot-nightly 1.0.0.dev20250905__py3-none-any.whl → 1.0.0.dev20251203__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sky/__init__.py +10 -2
- sky/adaptors/aws.py +81 -16
- sky/adaptors/common.py +25 -2
- sky/adaptors/coreweave.py +278 -0
- sky/adaptors/do.py +8 -2
- sky/adaptors/gcp.py +11 -0
- sky/adaptors/ibm.py +5 -2
- sky/adaptors/kubernetes.py +64 -0
- sky/adaptors/nebius.py +3 -1
- sky/adaptors/primeintellect.py +1 -0
- sky/adaptors/seeweb.py +183 -0
- sky/adaptors/shadeform.py +89 -0
- sky/admin_policy.py +20 -0
- sky/authentication.py +157 -263
- sky/backends/__init__.py +3 -2
- sky/backends/backend.py +11 -3
- sky/backends/backend_utils.py +588 -184
- sky/backends/cloud_vm_ray_backend.py +1088 -904
- sky/backends/local_docker_backend.py +9 -5
- sky/backends/task_codegen.py +633 -0
- sky/backends/wheel_utils.py +18 -0
- sky/catalog/__init__.py +8 -0
- sky/catalog/aws_catalog.py +4 -0
- sky/catalog/common.py +19 -1
- sky/catalog/data_fetchers/fetch_aws.py +102 -80
- sky/catalog/data_fetchers/fetch_gcp.py +30 -3
- sky/catalog/data_fetchers/fetch_nebius.py +9 -6
- sky/catalog/data_fetchers/fetch_runpod.py +698 -0
- sky/catalog/data_fetchers/fetch_seeweb.py +329 -0
- sky/catalog/data_fetchers/fetch_shadeform.py +142 -0
- sky/catalog/kubernetes_catalog.py +24 -28
- sky/catalog/primeintellect_catalog.py +95 -0
- sky/catalog/runpod_catalog.py +5 -1
- sky/catalog/seeweb_catalog.py +184 -0
- sky/catalog/shadeform_catalog.py +165 -0
- sky/check.py +73 -43
- sky/client/cli/command.py +675 -412
- sky/client/cli/flags.py +4 -2
- sky/{volumes/utils.py → client/cli/table_utils.py} +111 -13
- sky/client/cli/utils.py +79 -0
- sky/client/common.py +12 -2
- sky/client/sdk.py +132 -63
- sky/client/sdk_async.py +34 -33
- sky/cloud_stores.py +82 -3
- sky/clouds/__init__.py +6 -0
- sky/clouds/aws.py +337 -129
- sky/clouds/azure.py +24 -18
- sky/clouds/cloud.py +40 -13
- sky/clouds/cudo.py +16 -13
- sky/clouds/do.py +9 -7
- sky/clouds/fluidstack.py +12 -5
- sky/clouds/gcp.py +14 -7
- sky/clouds/hyperbolic.py +12 -5
- sky/clouds/ibm.py +12 -5
- sky/clouds/kubernetes.py +80 -45
- sky/clouds/lambda_cloud.py +12 -5
- sky/clouds/nebius.py +23 -9
- sky/clouds/oci.py +19 -12
- sky/clouds/paperspace.py +4 -1
- sky/clouds/primeintellect.py +317 -0
- sky/clouds/runpod.py +85 -24
- sky/clouds/scp.py +12 -8
- sky/clouds/seeweb.py +477 -0
- sky/clouds/shadeform.py +400 -0
- sky/clouds/ssh.py +4 -2
- sky/clouds/utils/scp_utils.py +61 -50
- sky/clouds/vast.py +33 -27
- sky/clouds/vsphere.py +14 -16
- sky/core.py +174 -165
- sky/dashboard/out/404.html +1 -1
- sky/dashboard/out/_next/static/96_E2yl3QAiIJGOYCkSpB/_buildManifest.js +1 -0
- sky/dashboard/out/_next/static/chunks/1141-e6aa9ab418717c59.js +11 -0
- sky/dashboard/out/_next/static/chunks/1871-7e202677c42f43fe.js +6 -0
- sky/dashboard/out/_next/static/chunks/2260-7703229c33c5ebd5.js +1 -0
- sky/dashboard/out/_next/static/chunks/2369.fc20f0c2c8ed9fe7.js +15 -0
- sky/dashboard/out/_next/static/chunks/2755.edd818326d489a1d.js +26 -0
- sky/dashboard/out/_next/static/chunks/3294.20a8540fe697d5ee.js +1 -0
- sky/dashboard/out/_next/static/chunks/3785.7e245f318f9d1121.js +1 -0
- sky/dashboard/out/_next/static/chunks/{6601-06114c982db410b6.js → 3800-7b45f9fbb6308557.js} +1 -1
- sky/dashboard/out/_next/static/chunks/4725.172ede95d1b21022.js +1 -0
- sky/dashboard/out/_next/static/chunks/4937.a2baa2df5572a276.js +15 -0
- sky/dashboard/out/_next/static/chunks/6212-7bd06f60ba693125.js +13 -0
- sky/dashboard/out/_next/static/chunks/6856-8f27d1c10c98def8.js +1 -0
- sky/dashboard/out/_next/static/chunks/6990-9146207c4567fdfd.js +1 -0
- sky/dashboard/out/_next/static/chunks/7359-c8d04e06886000b3.js +30 -0
- sky/dashboard/out/_next/static/chunks/7615-019513abc55b3b47.js +1 -0
- sky/dashboard/out/_next/static/chunks/8640.5b9475a2d18c5416.js +16 -0
- sky/dashboard/out/_next/static/chunks/8969-452f9d5cbdd2dc73.js +1 -0
- sky/dashboard/out/_next/static/chunks/9025.fa408f3242e9028d.js +6 -0
- sky/dashboard/out/_next/static/chunks/9353-cff34f7e773b2e2b.js +1 -0
- sky/dashboard/out/_next/static/chunks/9360.a536cf6b1fa42355.js +31 -0
- sky/dashboard/out/_next/static/chunks/9847.3aaca6bb33455140.js +30 -0
- sky/dashboard/out/_next/static/chunks/pages/{_app-ce361c6959bc2001.js → _app-bde01e4a2beec258.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-792db96d918c98c9.js +16 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-abfcac9c137aa543.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters-ee39056f9851a3ff.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/infra/{[context]-6563820e094f68ca.js → [context]-c0b5935149902e6f.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/{infra-aabba60d57826e0f.js → infra-aed0ea19df7cf961.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-d66997e2bfc837cf.js +16 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs/pools/[pool]-9faf940b253e3e06.js +21 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs-2072b48b617989c9.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/users-f42674164aa73423.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/volumes-b84b948ff357c43e.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/workspaces/{[name]-af76bb06dbb3954f.js → [name]-84a40f8c7c627fe4.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/{workspaces-7598c33a746cdc91.js → workspaces-531b2f8c4bf89f82.js} +1 -1
- sky/dashboard/out/_next/static/chunks/webpack-64e05f17bf2cf8ce.js +1 -0
- sky/dashboard/out/_next/static/css/0748ce22df867032.css +3 -0
- sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
- sky/dashboard/out/clusters/[cluster].html +1 -1
- sky/dashboard/out/clusters.html +1 -1
- sky/dashboard/out/config.html +1 -1
- sky/dashboard/out/index.html +1 -1
- sky/dashboard/out/infra/[context].html +1 -1
- sky/dashboard/out/infra.html +1 -1
- sky/dashboard/out/jobs/[job].html +1 -1
- sky/dashboard/out/jobs/pools/[pool].html +1 -1
- sky/dashboard/out/jobs.html +1 -1
- sky/dashboard/out/users.html +1 -1
- sky/dashboard/out/volumes.html +1 -1
- sky/dashboard/out/workspace/new.html +1 -1
- sky/dashboard/out/workspaces/[name].html +1 -1
- sky/dashboard/out/workspaces.html +1 -1
- sky/data/data_utils.py +92 -1
- sky/data/mounting_utils.py +162 -29
- sky/data/storage.py +200 -19
- sky/data/storage_utils.py +10 -45
- sky/exceptions.py +18 -7
- sky/execution.py +74 -31
- sky/global_user_state.py +605 -191
- sky/jobs/__init__.py +2 -0
- sky/jobs/client/sdk.py +101 -4
- sky/jobs/client/sdk_async.py +31 -5
- sky/jobs/constants.py +15 -8
- sky/jobs/controller.py +726 -284
- sky/jobs/file_content_utils.py +128 -0
- sky/jobs/log_gc.py +193 -0
- sky/jobs/recovery_strategy.py +250 -100
- sky/jobs/scheduler.py +271 -173
- sky/jobs/server/core.py +367 -114
- sky/jobs/server/server.py +81 -35
- sky/jobs/server/utils.py +89 -35
- sky/jobs/state.py +1498 -620
- sky/jobs/utils.py +771 -306
- sky/logs/agent.py +40 -5
- sky/logs/aws.py +9 -19
- sky/metrics/utils.py +282 -39
- sky/optimizer.py +1 -1
- sky/provision/__init__.py +37 -1
- sky/provision/aws/config.py +34 -13
- sky/provision/aws/instance.py +5 -2
- sky/provision/azure/instance.py +5 -3
- sky/provision/common.py +2 -0
- sky/provision/cudo/instance.py +4 -3
- sky/provision/do/instance.py +4 -3
- sky/provision/docker_utils.py +97 -26
- sky/provision/fluidstack/instance.py +6 -5
- sky/provision/gcp/config.py +6 -1
- sky/provision/gcp/instance.py +4 -2
- sky/provision/hyperbolic/instance.py +4 -2
- sky/provision/instance_setup.py +66 -20
- sky/provision/kubernetes/__init__.py +2 -0
- sky/provision/kubernetes/config.py +7 -44
- sky/provision/kubernetes/constants.py +0 -1
- sky/provision/kubernetes/instance.py +609 -213
- sky/provision/kubernetes/manifests/fusermount-server-daemonset.yaml +1 -2
- sky/provision/kubernetes/network.py +12 -8
- sky/provision/kubernetes/network_utils.py +8 -25
- sky/provision/kubernetes/utils.py +382 -418
- sky/provision/kubernetes/volume.py +150 -18
- sky/provision/lambda_cloud/instance.py +16 -13
- sky/provision/nebius/instance.py +6 -2
- sky/provision/nebius/utils.py +103 -86
- sky/provision/oci/instance.py +4 -2
- sky/provision/paperspace/instance.py +4 -3
- sky/provision/primeintellect/__init__.py +10 -0
- sky/provision/primeintellect/config.py +11 -0
- sky/provision/primeintellect/instance.py +454 -0
- sky/provision/primeintellect/utils.py +398 -0
- sky/provision/provisioner.py +30 -9
- sky/provision/runpod/__init__.py +2 -0
- sky/provision/runpod/instance.py +4 -3
- sky/provision/runpod/volume.py +69 -13
- sky/provision/scp/instance.py +307 -130
- sky/provision/seeweb/__init__.py +11 -0
- sky/provision/seeweb/config.py +13 -0
- sky/provision/seeweb/instance.py +812 -0
- sky/provision/shadeform/__init__.py +11 -0
- sky/provision/shadeform/config.py +12 -0
- sky/provision/shadeform/instance.py +351 -0
- sky/provision/shadeform/shadeform_utils.py +83 -0
- sky/provision/vast/instance.py +5 -3
- sky/provision/volume.py +164 -0
- sky/provision/vsphere/common/ssl_helper.py +1 -1
- sky/provision/vsphere/common/vapiconnect.py +2 -1
- sky/provision/vsphere/common/vim_utils.py +3 -2
- sky/provision/vsphere/instance.py +8 -6
- sky/provision/vsphere/vsphere_utils.py +8 -1
- sky/resources.py +11 -3
- sky/schemas/api/responses.py +107 -6
- sky/schemas/db/global_user_state/008_skylet_ssh_tunnel_metadata.py +34 -0
- sky/schemas/db/global_user_state/009_last_activity_and_launched_at.py +89 -0
- sky/schemas/db/global_user_state/010_save_ssh_key.py +66 -0
- sky/schemas/db/global_user_state/011_is_ephemeral.py +34 -0
- sky/schemas/db/kv_cache/001_initial_schema.py +29 -0
- sky/schemas/db/serve_state/002_yaml_content.py +34 -0
- sky/schemas/db/skypilot_config/001_initial_schema.py +30 -0
- sky/schemas/db/spot_jobs/002_cluster_pool.py +3 -3
- sky/schemas/db/spot_jobs/004_job_file_contents.py +42 -0
- sky/schemas/db/spot_jobs/005_logs_gc.py +38 -0
- sky/schemas/db/spot_jobs/006_controller_pid_started_at.py +34 -0
- sky/schemas/db/spot_jobs/007_config_file_content.py +34 -0
- sky/schemas/generated/jobsv1_pb2.py +86 -0
- sky/schemas/generated/jobsv1_pb2.pyi +254 -0
- sky/schemas/generated/jobsv1_pb2_grpc.py +542 -0
- sky/schemas/generated/managed_jobsv1_pb2.py +76 -0
- sky/schemas/generated/managed_jobsv1_pb2.pyi +278 -0
- sky/schemas/generated/managed_jobsv1_pb2_grpc.py +278 -0
- sky/schemas/generated/servev1_pb2.py +58 -0
- sky/schemas/generated/servev1_pb2.pyi +115 -0
- sky/schemas/generated/servev1_pb2_grpc.py +322 -0
- sky/serve/autoscalers.py +2 -0
- sky/serve/client/impl.py +55 -21
- sky/serve/constants.py +4 -3
- sky/serve/controller.py +17 -11
- sky/serve/load_balancing_policies.py +1 -1
- sky/serve/replica_managers.py +219 -142
- sky/serve/serve_rpc_utils.py +179 -0
- sky/serve/serve_state.py +63 -54
- sky/serve/serve_utils.py +145 -109
- sky/serve/server/core.py +46 -25
- sky/serve/server/impl.py +311 -162
- sky/serve/server/server.py +21 -19
- sky/serve/service.py +84 -68
- sky/serve/service_spec.py +45 -7
- sky/server/auth/loopback.py +38 -0
- sky/server/auth/oauth2_proxy.py +12 -7
- sky/server/common.py +47 -24
- sky/server/config.py +62 -28
- sky/server/constants.py +9 -1
- sky/server/daemons.py +109 -38
- sky/server/metrics.py +76 -96
- sky/server/middleware_utils.py +166 -0
- sky/server/requests/executor.py +381 -145
- sky/server/requests/payloads.py +71 -18
- sky/server/requests/preconditions.py +15 -13
- sky/server/requests/request_names.py +121 -0
- sky/server/requests/requests.py +507 -157
- sky/server/requests/serializers/decoders.py +48 -17
- sky/server/requests/serializers/encoders.py +85 -20
- sky/server/requests/threads.py +117 -0
- sky/server/rest.py +116 -24
- sky/server/server.py +420 -172
- sky/server/stream_utils.py +219 -45
- sky/server/uvicorn.py +30 -19
- sky/setup_files/MANIFEST.in +6 -1
- sky/setup_files/alembic.ini +8 -0
- sky/setup_files/dependencies.py +62 -19
- sky/setup_files/setup.py +44 -44
- sky/sky_logging.py +13 -5
- sky/skylet/attempt_skylet.py +106 -24
- sky/skylet/configs.py +3 -1
- sky/skylet/constants.py +111 -26
- sky/skylet/events.py +64 -10
- sky/skylet/job_lib.py +141 -104
- sky/skylet/log_lib.py +233 -5
- sky/skylet/log_lib.pyi +40 -2
- sky/skylet/providers/ibm/node_provider.py +12 -8
- sky/skylet/providers/ibm/vpc_provider.py +13 -12
- sky/skylet/runtime_utils.py +21 -0
- sky/skylet/services.py +524 -0
- sky/skylet/skylet.py +22 -1
- sky/skylet/subprocess_daemon.py +104 -29
- sky/skypilot_config.py +99 -79
- sky/ssh_node_pools/server.py +9 -8
- sky/task.py +221 -104
- sky/templates/aws-ray.yml.j2 +1 -0
- sky/templates/azure-ray.yml.j2 +1 -0
- sky/templates/cudo-ray.yml.j2 +1 -0
- sky/templates/do-ray.yml.j2 +1 -0
- sky/templates/fluidstack-ray.yml.j2 +1 -0
- sky/templates/gcp-ray.yml.j2 +1 -0
- sky/templates/hyperbolic-ray.yml.j2 +1 -0
- sky/templates/ibm-ray.yml.j2 +2 -1
- sky/templates/jobs-controller.yaml.j2 +3 -0
- sky/templates/kubernetes-ray.yml.j2 +196 -55
- sky/templates/lambda-ray.yml.j2 +1 -0
- sky/templates/nebius-ray.yml.j2 +3 -0
- sky/templates/oci-ray.yml.j2 +1 -0
- sky/templates/paperspace-ray.yml.j2 +1 -0
- sky/templates/primeintellect-ray.yml.j2 +72 -0
- sky/templates/runpod-ray.yml.j2 +1 -0
- sky/templates/scp-ray.yml.j2 +1 -0
- sky/templates/seeweb-ray.yml.j2 +171 -0
- sky/templates/shadeform-ray.yml.j2 +73 -0
- sky/templates/vast-ray.yml.j2 +1 -0
- sky/templates/vsphere-ray.yml.j2 +1 -0
- sky/templates/websocket_proxy.py +188 -43
- sky/usage/usage_lib.py +16 -4
- sky/users/permission.py +60 -43
- sky/utils/accelerator_registry.py +6 -3
- sky/utils/admin_policy_utils.py +18 -5
- sky/utils/annotations.py +22 -0
- sky/utils/asyncio_utils.py +78 -0
- sky/utils/atomic.py +1 -1
- sky/utils/auth_utils.py +153 -0
- sky/utils/cli_utils/status_utils.py +12 -7
- sky/utils/cluster_utils.py +28 -6
- sky/utils/command_runner.py +88 -27
- sky/utils/command_runner.pyi +36 -3
- sky/utils/common.py +3 -1
- sky/utils/common_utils.py +37 -4
- sky/utils/config_utils.py +1 -14
- sky/utils/context.py +127 -40
- sky/utils/context_utils.py +73 -18
- sky/utils/controller_utils.py +229 -70
- sky/utils/db/db_utils.py +95 -18
- sky/utils/db/kv_cache.py +149 -0
- sky/utils/db/migration_utils.py +24 -7
- sky/utils/env_options.py +4 -0
- sky/utils/git.py +559 -1
- sky/utils/kubernetes/create_cluster.sh +15 -30
- sky/utils/kubernetes/delete_cluster.sh +10 -7
- sky/utils/kubernetes/{deploy_remote_cluster.py → deploy_ssh_node_pools.py} +258 -380
- sky/utils/kubernetes/generate_kind_config.py +6 -66
- sky/utils/kubernetes/gpu_labeler.py +13 -3
- sky/utils/kubernetes/k8s_gpu_labeler_job.yaml +2 -1
- sky/utils/kubernetes/k8s_gpu_labeler_setup.yaml +16 -16
- sky/utils/kubernetes/kubernetes_deploy_utils.py +213 -194
- sky/utils/kubernetes/rsync_helper.sh +11 -3
- sky/utils/kubernetes_enums.py +7 -15
- sky/utils/lock_events.py +4 -4
- sky/utils/locks.py +128 -31
- sky/utils/log_utils.py +0 -319
- sky/utils/resource_checker.py +13 -10
- sky/utils/resources_utils.py +53 -29
- sky/utils/rich_utils.py +8 -4
- sky/utils/schemas.py +107 -52
- sky/utils/subprocess_utils.py +17 -4
- sky/utils/thread_utils.py +91 -0
- sky/utils/timeline.py +2 -1
- sky/utils/ux_utils.py +35 -1
- sky/utils/volume.py +88 -4
- sky/utils/yaml_utils.py +9 -0
- sky/volumes/client/sdk.py +48 -10
- sky/volumes/server/core.py +59 -22
- sky/volumes/server/server.py +46 -17
- sky/volumes/volume.py +54 -42
- sky/workspaces/core.py +57 -21
- sky/workspaces/server.py +13 -12
- sky_templates/README.md +3 -0
- sky_templates/__init__.py +3 -0
- sky_templates/ray/__init__.py +0 -0
- sky_templates/ray/start_cluster +183 -0
- sky_templates/ray/stop_cluster +75 -0
- {skypilot_nightly-1.0.0.dev20250905.dist-info → skypilot_nightly-1.0.0.dev20251203.dist-info}/METADATA +331 -65
- skypilot_nightly-1.0.0.dev20251203.dist-info/RECORD +611 -0
- skypilot_nightly-1.0.0.dev20251203.dist-info/top_level.txt +2 -0
- sky/client/cli/git.py +0 -549
- sky/dashboard/out/_next/static/chunks/1121-408ed10b2f9fce17.js +0 -1
- sky/dashboard/out/_next/static/chunks/1141-943efc7aff0f0c06.js +0 -1
- sky/dashboard/out/_next/static/chunks/1836-37fede578e2da5f8.js +0 -40
- sky/dashboard/out/_next/static/chunks/3015-86cabed5d4669ad0.js +0 -1
- sky/dashboard/out/_next/static/chunks/3294.c80326aec9bfed40.js +0 -6
- sky/dashboard/out/_next/static/chunks/3785.4872a2f3aa489880.js +0 -1
- sky/dashboard/out/_next/static/chunks/4045.b30465273dc5e468.js +0 -21
- sky/dashboard/out/_next/static/chunks/4676-9da7fdbde90b5549.js +0 -10
- sky/dashboard/out/_next/static/chunks/4725.10f7a9a5d3ea8208.js +0 -1
- sky/dashboard/out/_next/static/chunks/5339.3fda4a4010ff4e06.js +0 -51
- sky/dashboard/out/_next/static/chunks/6135-4b4d5e824b7f9d3c.js +0 -1
- sky/dashboard/out/_next/static/chunks/649.b9d7f7d10c1b8c53.js +0 -45
- sky/dashboard/out/_next/static/chunks/6856-dca7962af4814e1b.js +0 -1
- sky/dashboard/out/_next/static/chunks/6990-08b2a1cae076a943.js +0 -1
- sky/dashboard/out/_next/static/chunks/7325.b4bc99ce0892dcd5.js +0 -6
- sky/dashboard/out/_next/static/chunks/754-d0da8ab45f9509e9.js +0 -18
- sky/dashboard/out/_next/static/chunks/7669.1f5d9a402bf5cc42.js +0 -36
- sky/dashboard/out/_next/static/chunks/8969-0be3036bf86f8256.js +0 -1
- sky/dashboard/out/_next/static/chunks/9025.c12318fb6a1a9093.js +0 -6
- sky/dashboard/out/_next/static/chunks/9037-fa1737818d0a0969.js +0 -6
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-1cbba24bd1bd35f8.js +0 -16
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-0b4b35dc1dfe046c.js +0 -16
- sky/dashboard/out/_next/static/chunks/pages/clusters-469814d711d63b1b.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-dd64309c3fe67ed2.js +0 -11
- sky/dashboard/out/_next/static/chunks/pages/jobs/pools/[pool]-07349868f7905d37.js +0 -16
- sky/dashboard/out/_next/static/chunks/pages/jobs-1f70d9faa564804f.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/users-018bf31cda52e11b.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/volumes-739726d6b823f532.js +0 -1
- sky/dashboard/out/_next/static/chunks/webpack-4fe903277b57b523.js +0 -1
- sky/dashboard/out/_next/static/css/4614e06482d7309e.css +0 -3
- sky/dashboard/out/_next/static/mS-4qZPSkRuA1u-g2wQhg/_buildManifest.js +0 -1
- sky/templates/kubernetes-ssh-jump.yml.j2 +0 -94
- sky/utils/kubernetes/ssh_jump_lifecycle_manager.py +0 -191
- skypilot_nightly-1.0.0.dev20250905.dist-info/RECORD +0 -547
- skypilot_nightly-1.0.0.dev20250905.dist-info/top_level.txt +0 -1
- /sky/dashboard/out/_next/static/{mS-4qZPSkRuA1u-g2wQhg → 96_E2yl3QAiIJGOYCkSpB}/_ssgManifest.js +0 -0
- {skypilot_nightly-1.0.0.dev20250905.dist-info → skypilot_nightly-1.0.0.dev20251203.dist-info}/WHEEL +0 -0
- {skypilot_nightly-1.0.0.dev20250905.dist-info → skypilot_nightly-1.0.0.dev20251203.dist-info}/entry_points.txt +0 -0
- {skypilot_nightly-1.0.0.dev20250905.dist-info → skypilot_nightly-1.0.0.dev20251203.dist-info}/licenses/LICENSE +0 -0
sky/server/metrics.py
CHANGED
|
@@ -1,74 +1,33 @@
|
|
|
1
1
|
"""Instrumentation for the API server."""
|
|
2
2
|
|
|
3
|
-
import
|
|
4
|
-
import
|
|
3
|
+
import asyncio
|
|
4
|
+
import multiprocessing
|
|
5
5
|
import os
|
|
6
|
+
import threading
|
|
6
7
|
import time
|
|
8
|
+
from typing import List
|
|
7
9
|
|
|
8
10
|
import fastapi
|
|
9
11
|
from prometheus_client import generate_latest
|
|
10
12
|
from prometheus_client import multiprocess
|
|
11
13
|
import prometheus_client as prom
|
|
14
|
+
import psutil
|
|
12
15
|
import starlette.middleware.base
|
|
13
16
|
import uvicorn
|
|
14
17
|
|
|
18
|
+
from sky import core
|
|
15
19
|
from sky import sky_logging
|
|
16
|
-
from sky.
|
|
17
|
-
|
|
18
|
-
# Whether the metrics are enabled, cannot be changed at runtime.
|
|
19
|
-
METRICS_ENABLED = os.environ.get(constants.ENV_VAR_SERVER_METRICS_ENABLED,
|
|
20
|
-
'false').lower() == 'true'
|
|
20
|
+
from sky.metrics import utils as metrics_utils
|
|
21
21
|
|
|
22
22
|
logger = sky_logging.init_logger(__name__)
|
|
23
23
|
|
|
24
|
-
# Total number of API server requests, grouped by path, method, and status.
|
|
25
|
-
SKY_APISERVER_REQUESTS_TOTAL = prom.Counter(
|
|
26
|
-
'sky_apiserver_requests_total',
|
|
27
|
-
'Total number of API server requests',
|
|
28
|
-
['path', 'method', 'status'],
|
|
29
|
-
)
|
|
30
|
-
|
|
31
|
-
# Time spent processing API server requests, grouped by path, method, and
|
|
32
|
-
# status.
|
|
33
|
-
SKY_APISERVER_REQUEST_DURATION_SECONDS = prom.Histogram(
|
|
34
|
-
'sky_apiserver_request_duration_seconds',
|
|
35
|
-
'Time spent processing API server requests',
|
|
36
|
-
['path', 'method', 'status'],
|
|
37
|
-
buckets=(0.01, 0.025, 0.05, 0.1, 0.25, 0.5, 1.0, 2.5, 5.0, 10.0, 20.0, 30.0,
|
|
38
|
-
60.0, 120.0, float('inf')),
|
|
39
|
-
)
|
|
40
|
-
|
|
41
|
-
# Time spent processing requests in executor.
|
|
42
|
-
SKY_APISERVER_REQUEST_EXECUTION_DURATION_SECONDS = prom.Histogram(
|
|
43
|
-
'sky_apiserver_request_execution_duration_seconds',
|
|
44
|
-
'Time spent executing requests in executor',
|
|
45
|
-
['request', 'worker'],
|
|
46
|
-
buckets=(0.5, 1, 2.5, 5.0, 10.0, 15.0, 25.0, 40.0, 60.0, 90.0, 120.0, 180.0,
|
|
47
|
-
float('inf')),
|
|
48
|
-
)
|
|
49
|
-
|
|
50
|
-
# Time spent processing a piece of code, refer to time_it().
|
|
51
|
-
SKY_APISERVER_CODE_DURATION_SECONDS = prom.Histogram(
|
|
52
|
-
'sky_apiserver_code_duration_seconds',
|
|
53
|
-
'Time spent processing code',
|
|
54
|
-
['name', 'group'],
|
|
55
|
-
buckets=(0.01, 0.025, 0.05, 0.1, 0.25, 0.5, 1.0, 2.5, 5.0, 10.0, 20.0, 30.0,
|
|
56
|
-
60.0, 120.0, float('inf')),
|
|
57
|
-
)
|
|
58
|
-
|
|
59
|
-
SKY_APISERVER_EVENT_LOOP_LAG_SECONDS = prom.Histogram(
|
|
60
|
-
'sky_apiserver_event_loop_lag_seconds',
|
|
61
|
-
'Scheduling delay of the server event loop',
|
|
62
|
-
['pid'],
|
|
63
|
-
buckets=(0.001, 0.005, 0.01, 0.025, 0.05, 0.1, 0.25, 0.5, 1, 2, 5, 20.0,
|
|
64
|
-
60.0, float('inf')),
|
|
65
|
-
)
|
|
66
|
-
|
|
67
24
|
metrics_app = fastapi.FastAPI()
|
|
68
25
|
|
|
69
26
|
|
|
27
|
+
# Serve /metrics in dedicated thread to avoid blocking the event loop
|
|
28
|
+
# of metrics server.
|
|
70
29
|
@metrics_app.get('/metrics')
|
|
71
|
-
|
|
30
|
+
def metrics() -> fastapi.Response:
|
|
72
31
|
"""Expose aggregated Prometheus metrics from all worker processes."""
|
|
73
32
|
if os.environ.get('PROMETHEUS_MULTIPROC_DIR'):
|
|
74
33
|
# In multiprocess mode, we need to collect metrics from all processes.
|
|
@@ -82,6 +41,42 @@ async def metrics() -> fastapi.Response:
|
|
|
82
41
|
headers={'Cache-Control': 'no-cache'})
|
|
83
42
|
|
|
84
43
|
|
|
44
|
+
@metrics_app.get('/gpu-metrics')
async def gpu_metrics() -> fastapi.Response:
    """Gets the GPU metrics from multiple external k8s clusters.

    Metrics are requested concurrently for every context returned by
    ``core.get_all_contexts()`` except ``'in-cluster'``. Payloads that were
    fetched successfully are joined with a blank line between them; a context
    whose fetch raised an ``Exception`` is logged and left out, so one
    failing context does not prevent the others from being served.

    Returns:
        A plain-text response (``text/plain; version=0.0.4``) whose body is
        the combined metrics text. The body is empty when no context
        produced metrics.

    Raises:
        BaseException: A non-``Exception`` failure returned by one of the
            fetches is re-raised unchanged.
    """
    # Filter once and keep the filtered list around: asyncio.gather() returns
    # results in the order of the awaitables it was given, so failures must
    # be attributed using this list rather than the unfiltered one.
    target_contexts = [
        context for context in core.get_all_contexts()
        if context != 'in-cluster'
    ]
    tasks = [
        asyncio.create_task(metrics_utils.get_metrics_for_context(context))
        for context in target_contexts
    ]

    results = await asyncio.gather(*tasks, return_exceptions=True)

    all_metrics: List[str] = []
    for context, result in zip(target_contexts, results):
        if isinstance(result, Exception):
            logger.error(
                f'Failed to get metrics for context {context}: {result}')
        elif isinstance(result, BaseException):
            # Avoid changing behavior for non-Exception BaseExceptions
            # like KeyboardInterrupt/SystemExit: re-raise them.
            raise result
        else:
            all_metrics.append(result)

    combined_metrics = '\n\n'.join(all_metrics)

    # Return as plain text for Prometheus compatibility
    return fastapi.Response(
        content=combined_metrics,
        media_type='text/plain; version=0.0.4; charset=utf-8')
|
|
78
|
+
|
|
79
|
+
|
|
85
80
|
def build_metrics_server(host: str, port: int) -> uvicorn.Server:
|
|
86
81
|
metrics_config = uvicorn.Config(
|
|
87
82
|
'sky.server.metrics:metrics_app',
|
|
@@ -125,56 +120,41 @@ class PrometheusMiddleware(starlette.middleware.base.BaseHTTPMiddleware):
|
|
|
125
120
|
status_code_group = '5xx'
|
|
126
121
|
raise
|
|
127
122
|
finally:
|
|
128
|
-
SKY_APISERVER_REQUESTS_TOTAL.labels(
|
|
129
|
-
|
|
130
|
-
status=status_code_group).inc()
|
|
123
|
+
metrics_utils.SKY_APISERVER_REQUESTS_TOTAL.labels(
|
|
124
|
+
path=path, method=method, status=status_code_group).inc()
|
|
131
125
|
if not streaming:
|
|
132
126
|
duration = time.time() - start_time
|
|
133
|
-
SKY_APISERVER_REQUEST_DURATION_SECONDS.labels(
|
|
127
|
+
metrics_utils.SKY_APISERVER_REQUEST_DURATION_SECONDS.labels(
|
|
134
128
|
path=path, method=method,
|
|
135
129
|
status=status_code_group).observe(duration)
|
|
136
130
|
|
|
137
131
|
return response
|
|
138
132
|
|
|
139
133
|
|
|
140
|
-
|
|
141
|
-
|
|
142
|
-
|
|
143
|
-
|
|
144
|
-
|
|
145
|
-
|
|
146
|
-
|
|
147
|
-
|
|
148
|
-
|
|
149
|
-
|
|
150
|
-
|
|
151
|
-
|
|
152
|
-
|
|
153
|
-
|
|
154
|
-
|
|
155
|
-
|
|
156
|
-
|
|
157
|
-
|
|
158
|
-
|
|
159
|
-
|
|
160
|
-
|
|
161
|
-
|
|
162
|
-
|
|
163
|
-
|
|
164
|
-
|
|
165
|
-
|
|
166
|
-
|
|
167
|
-
|
|
168
|
-
|
|
169
|
-
def time_me_async(func):
|
|
170
|
-
"""Measure the duration of decorated async function."""
|
|
171
|
-
|
|
172
|
-
@functools.wraps(func)
|
|
173
|
-
async def async_wrapper(*args, **kwargs):
|
|
174
|
-
if not METRICS_ENABLED:
|
|
175
|
-
return await func(*args, **kwargs)
|
|
176
|
-
name = f'{func.__module__}/{func.__name__}'
|
|
177
|
-
with time_it(name, group='function'):
|
|
178
|
-
return await func(*args, **kwargs)
|
|
179
|
-
|
|
180
|
-
return async_wrapper
|
|
134
|
+
peak_rss_bytes = 0
|
|
135
|
+
|
|
136
|
+
|
|
137
|
+
def process_monitor(process_type: str, stop: threading.Event):
    """Publish RSS and CPU-time metrics for the current process until stopped.

    The loop samples the process roughly once per second and sets:
      * ``SKY_APISERVER_PROCESS_PEAK_RSS`` to the largest RSS sampled in the
        current 30-second bucket (also mirrored to the module-level
        ``peak_rss_bytes``), and
      * ``SKY_APISERVER_PROCESS_CPU_TOTAL`` to the user and system CPU times
        reported by ``psutil``, one series per ``mode`` label.

    Args:
        process_type: Value used for the ``type`` label of every series.
        stop: Event that terminates the loop once it is set.
    """
    pid = multiprocessing.current_process().pid
    proc = psutil.Process(pid)
    last_bucket_end = time.time()
    bucket_peak = 0
    global peak_rss_bytes
    while not stop.is_set():
        if time.time() - last_bucket_end >= 30:
            # Reset peak RSS for the next time bucket.
            last_bucket_end = time.time()
            bucket_peak = 0
        # Fold the new sample into the bucket's running maximum. Without
        # this update the exported value is just the latest sample, not the
        # peak of the bucket.
        bucket_peak = max(bucket_peak, proc.memory_info().rss)
        peak_rss_bytes = bucket_peak
        metrics_utils.SKY_APISERVER_PROCESS_PEAK_RSS.labels(
            pid=pid, type=process_type).set(peak_rss_bytes)
        ctimes = proc.cpu_times()
        cpu_total = metrics_utils.SKY_APISERVER_PROCESS_CPU_TOTAL
        cpu_total.labels(pid=pid, type=process_type,
                         mode='user').set(ctimes.user)
        cpu_total.labels(pid=pid, type=process_type,
                         mode='system').set(ctimes.system)
        # Pause for the sampling interval, but return as soon as `stop` is
        # set so that shutdown does not wait for a pending sleep.
        stop.wait(1)
|
|
@@ -0,0 +1,166 @@
|
|
|
1
|
+
"""Utilities for building middlewares."""
|
|
2
|
+
import enum
|
|
3
|
+
import http
|
|
4
|
+
from typing import Type
|
|
5
|
+
|
|
6
|
+
import fastapi
|
|
7
|
+
import starlette.middleware.base
|
|
8
|
+
import starlette.types
|
|
9
|
+
|
|
10
|
+
from sky import sky_logging
|
|
11
|
+
|
|
12
|
+
logger = sky_logging.init_logger(__name__)
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
@enum.unique
class WebSocketDecision(enum.Enum):
    """Outcome of running an HTTP middleware against a websocket handshake."""
    # The connection is handed over to the wrapped application.
    ACCEPT = 'accept'
    # The middleware answered 401; the socket is closed with code 4401.
    UNAUTHORIZED = 'unauthorized'
    # The middleware answered 403; the socket is closed with code 4403.
    FORBIDDEN = 'forbidden'
    # Any other rejection or a dispatch failure; closed with code 1011.
    ERROR = 'error'
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
def websocket_aware(
        middleware_cls: Type[starlette.middleware.base.BaseHTTPMiddleware]):
    """Decorator to adapt BaseHTTPMiddleware to handle WebSockets.

    For a websocket scope, an HTTP-style GET request resembling the upgrade
    request of the websocket handshake is assembled and handed to the
    ``dispatch`` of the wrapped HTTP middleware. The connection is forwarded
    to the application only if the middleware invoked ``call_next`` and the
    resulting status is 2xx/3xx. Otherwise the socket is closed with code
    4401 (HTTP 401), 4403 (HTTP 403) or 1011 (anything else, including an
    exception raised by ``dispatch``). Scopes of any other type are passed to
    the wrapped middleware untouched.

    Note: for websocket connections, the mutation made by the underlying
    HTTP middleware on the request and response will be discarded. The
    ``state`` mapping is the one object shared between the synthetic HTTP
    scope and the websocket scope.

    Args:
        middleware_cls: The HTTP middleware class to adapt.

    Returns:
        A middleware class carrying the ``__name__``, ``__qualname__``,
        ``__module__`` and ``__doc__`` of ``middleware_cls``.
    """

    # Close frames sent for the rejections that have a dedicated code; every
    # other non-accept decision falls back to 1011.
    close_frames = {
        WebSocketDecision.UNAUTHORIZED: (4401, 'Unauthorized'),
        WebSocketDecision.FORBIDDEN: (4403, 'Forbidden'),
    }
    # Websocket schemes and their HTTP counterparts; unknown schemes are
    # kept as they are.
    http_schemes = {'ws': 'http', 'wss': 'https'}

    class WebSocketAwareMiddleware:
        """WebSocket-aware middleware wrapper."""

        def __init__(self, app: starlette.types.ASGIApp, *args, **kwargs):
            self.app = app
            self.middleware = middleware_cls(app, *args, **kwargs)

        async def __call__(self, scope: starlette.types.Scope,
                           receive: starlette.types.Receive,
                           send: starlette.types.Send):
            if scope.get('type') != 'websocket':
                # Delegate other scopes to the underlying HTTP middleware.
                await self.middleware(scope, receive, send)
                return
            await self._handle_websocket(scope, receive, send)

        async def dispatch(
                self, request: fastapi.Request,
                call_next: starlette.middleware.base.RequestResponseEndpoint):
            """Implement dispatch method to keep compatibility."""
            return await self.middleware.dispatch(request, call_next)

        async def _handle_websocket(self, scope: starlette.types.Scope,
                                    receive: starlette.types.Receive,
                                    send: starlette.types.Send):
            """Handle websocket connection by delegating to HTTP middleware."""
            decision = await self._run_websocket_dispatch(scope)
            if decision == WebSocketDecision.ACCEPT:
                await self.app(scope, receive, send)
                return
            code, reason = close_frames.get(decision,
                                            (1011, 'Internal Server Error'))
            await send({
                'type': 'websocket.close',
                'code': code,
                'reason': reason,
            })

        async def _run_websocket_dispatch(
                self, scope: starlette.types.Scope) -> WebSocketDecision:
            """Run the HTTP middleware on a synthetic request and classify it.

            Returns ACCEPT only when the middleware called ``call_next`` and
            the response status is in [200, 400).
            """
            request = fastapi.Request(self._build_http_scope(scope),
                                      receive=self._http_receive_adapter())
            call_next_called = False
            stub_response = fastapi.Response(status_code=http.HTTPStatus.OK)

            async def call_next(req):
                del req
                # Capture whether call_next() is called in the underlying
                # HTTP middleware to determine if we can proceed with current
                # websocket connection.
                nonlocal call_next_called
                call_next_called = True
                return stub_response

            try:
                response = await self.dispatch(request, call_next)
            except Exception as e:  # pylint: disable=broad-except
                logger.error('Exception occurred in middleware dispatch for '
                             f'WebSocket scope: {e}')
                return WebSocketDecision.ERROR

            # A middleware that returns nothing is treated as a plain 200.
            if response is None:
                response = stub_response
            status_code = response.status_code

            passed_through = call_next_called and 200 <= status_code < 400
            if passed_through:
                return WebSocketDecision.ACCEPT
            if status_code == http.HTTPStatus.UNAUTHORIZED:
                return WebSocketDecision.UNAUTHORIZED
            if status_code == http.HTTPStatus.FORBIDDEN:
                return WebSocketDecision.FORBIDDEN
            return WebSocketDecision.ERROR

        @staticmethod
        def _build_http_scope(
                scope: starlette.types.Scope) -> starlette.types.Scope:
            """Derive an HTTP GET scope from a websocket scope.

            The websocket scope gains a ``state`` dict if it had none, and
            that same dict is referenced by the returned scope.
            """
            state = scope.setdefault('state', {})
            scheme = scope.get('scheme', 'ws')
            return {
                **scope,
                'type': 'http',
                'scheme': http_schemes.get(scheme, scheme),
                'method': 'GET',
                'http_version': scope.get('http_version', '1.1'),
                'state': state,
            }

        @staticmethod
        def _http_receive_adapter() -> starlette.types.Receive:
            """Adapter that mimics the sequence produced by Starlette for an
            HTTP request: a single http.request event followed by
            http.disconnect events.
            """
            # Holds the one-off request event until it has been delivered.
            pending = [{
                'type': 'http.request',
                'body': b'',
                'more_body': False,
            }]

            async def receive():
                if pending:
                    return pending.pop()
                return {
                    'type': 'http.disconnect',
                }

            return receive

    # Present the wrapper under the identity of the wrapped middleware.
    for attr in ('__name__', '__qualname__', '__module__', '__doc__'):
        setattr(WebSocketAwareMiddleware, attr, getattr(middleware_cls, attr))
    return WebSocketAwareMiddleware
|