skypilot-nightly 1.0.0.dev20250502__py3-none-any.whl → 1.0.0.dev20251203__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sky/__init__.py +22 -6
- sky/adaptors/aws.py +81 -16
- sky/adaptors/common.py +25 -2
- sky/adaptors/coreweave.py +278 -0
- sky/adaptors/do.py +8 -2
- sky/adaptors/gcp.py +11 -0
- sky/adaptors/hyperbolic.py +8 -0
- sky/adaptors/ibm.py +5 -2
- sky/adaptors/kubernetes.py +149 -18
- sky/adaptors/nebius.py +173 -30
- sky/adaptors/primeintellect.py +1 -0
- sky/adaptors/runpod.py +68 -0
- sky/adaptors/seeweb.py +183 -0
- sky/adaptors/shadeform.py +89 -0
- sky/admin_policy.py +187 -4
- sky/authentication.py +179 -225
- sky/backends/__init__.py +4 -2
- sky/backends/backend.py +22 -9
- sky/backends/backend_utils.py +1323 -397
- sky/backends/cloud_vm_ray_backend.py +1749 -1029
- sky/backends/docker_utils.py +1 -1
- sky/backends/local_docker_backend.py +11 -6
- sky/backends/task_codegen.py +633 -0
- sky/backends/wheel_utils.py +55 -9
- sky/{clouds/service_catalog → catalog}/__init__.py +21 -19
- sky/{clouds/service_catalog → catalog}/aws_catalog.py +27 -8
- sky/{clouds/service_catalog → catalog}/azure_catalog.py +10 -7
- sky/{clouds/service_catalog → catalog}/common.py +90 -49
- sky/{clouds/service_catalog → catalog}/cudo_catalog.py +8 -5
- sky/{clouds/service_catalog → catalog}/data_fetchers/analyze.py +1 -1
- sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_aws.py +116 -80
- sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_cudo.py +38 -38
- sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_gcp.py +70 -16
- sky/catalog/data_fetchers/fetch_hyperbolic.py +136 -0
- sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_lambda_cloud.py +1 -0
- sky/catalog/data_fetchers/fetch_nebius.py +338 -0
- sky/catalog/data_fetchers/fetch_runpod.py +698 -0
- sky/catalog/data_fetchers/fetch_seeweb.py +329 -0
- sky/catalog/data_fetchers/fetch_shadeform.py +142 -0
- sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_vast.py +1 -1
- sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_vsphere.py +1 -1
- sky/{clouds/service_catalog → catalog}/do_catalog.py +5 -2
- sky/{clouds/service_catalog → catalog}/fluidstack_catalog.py +6 -3
- sky/{clouds/service_catalog → catalog}/gcp_catalog.py +41 -15
- sky/catalog/hyperbolic_catalog.py +136 -0
- sky/{clouds/service_catalog → catalog}/ibm_catalog.py +9 -6
- sky/{clouds/service_catalog → catalog}/kubernetes_catalog.py +36 -24
- sky/{clouds/service_catalog → catalog}/lambda_catalog.py +9 -6
- sky/{clouds/service_catalog → catalog}/nebius_catalog.py +9 -7
- sky/{clouds/service_catalog → catalog}/oci_catalog.py +9 -6
- sky/{clouds/service_catalog → catalog}/paperspace_catalog.py +5 -2
- sky/catalog/primeintellect_catalog.py +95 -0
- sky/{clouds/service_catalog → catalog}/runpod_catalog.py +11 -4
- sky/{clouds/service_catalog → catalog}/scp_catalog.py +9 -6
- sky/catalog/seeweb_catalog.py +184 -0
- sky/catalog/shadeform_catalog.py +165 -0
- sky/catalog/ssh_catalog.py +167 -0
- sky/{clouds/service_catalog → catalog}/vast_catalog.py +6 -3
- sky/{clouds/service_catalog → catalog}/vsphere_catalog.py +5 -2
- sky/check.py +533 -185
- sky/cli.py +5 -5975
- sky/client/{cli.py → cli/command.py} +2591 -1956
- sky/client/cli/deprecation_utils.py +99 -0
- sky/client/cli/flags.py +359 -0
- sky/client/cli/table_utils.py +322 -0
- sky/client/cli/utils.py +79 -0
- sky/client/common.py +78 -32
- sky/client/oauth.py +82 -0
- sky/client/sdk.py +1219 -319
- sky/client/sdk_async.py +827 -0
- sky/client/service_account_auth.py +47 -0
- sky/cloud_stores.py +82 -3
- sky/clouds/__init__.py +13 -0
- sky/clouds/aws.py +564 -164
- sky/clouds/azure.py +105 -83
- sky/clouds/cloud.py +140 -40
- sky/clouds/cudo.py +68 -50
- sky/clouds/do.py +66 -48
- sky/clouds/fluidstack.py +63 -44
- sky/clouds/gcp.py +339 -110
- sky/clouds/hyperbolic.py +293 -0
- sky/clouds/ibm.py +70 -49
- sky/clouds/kubernetes.py +570 -162
- sky/clouds/lambda_cloud.py +74 -54
- sky/clouds/nebius.py +210 -81
- sky/clouds/oci.py +88 -66
- sky/clouds/paperspace.py +61 -44
- sky/clouds/primeintellect.py +317 -0
- sky/clouds/runpod.py +164 -74
- sky/clouds/scp.py +89 -86
- sky/clouds/seeweb.py +477 -0
- sky/clouds/shadeform.py +400 -0
- sky/clouds/ssh.py +263 -0
- sky/clouds/utils/aws_utils.py +10 -4
- sky/clouds/utils/gcp_utils.py +87 -11
- sky/clouds/utils/oci_utils.py +38 -14
- sky/clouds/utils/scp_utils.py +231 -167
- sky/clouds/vast.py +99 -77
- sky/clouds/vsphere.py +51 -40
- sky/core.py +375 -173
- sky/dag.py +15 -0
- sky/dashboard/out/404.html +1 -1
- sky/dashboard/out/_next/static/96_E2yl3QAiIJGOYCkSpB/_buildManifest.js +1 -0
- sky/dashboard/out/_next/static/chunks/1141-e6aa9ab418717c59.js +11 -0
- sky/dashboard/out/_next/static/chunks/1272-1ef0bf0237faccdb.js +1 -0
- sky/dashboard/out/_next/static/chunks/1871-7e202677c42f43fe.js +6 -0
- sky/dashboard/out/_next/static/chunks/2260-7703229c33c5ebd5.js +1 -0
- sky/dashboard/out/_next/static/chunks/2350.fab69e61bac57b23.js +1 -0
- sky/dashboard/out/_next/static/chunks/2369.fc20f0c2c8ed9fe7.js +15 -0
- sky/dashboard/out/_next/static/chunks/2755.edd818326d489a1d.js +26 -0
- sky/dashboard/out/_next/static/chunks/3294.20a8540fe697d5ee.js +1 -0
- sky/dashboard/out/_next/static/chunks/3785.7e245f318f9d1121.js +1 -0
- sky/dashboard/out/_next/static/chunks/3800-7b45f9fbb6308557.js +1 -0
- sky/dashboard/out/_next/static/chunks/3850-ff4a9a69d978632b.js +1 -0
- sky/dashboard/out/_next/static/chunks/3937.210053269f121201.js +1 -0
- sky/dashboard/out/_next/static/chunks/4725.172ede95d1b21022.js +1 -0
- sky/dashboard/out/_next/static/chunks/4937.a2baa2df5572a276.js +15 -0
- sky/dashboard/out/_next/static/chunks/5739-d67458fcb1386c92.js +8 -0
- sky/dashboard/out/_next/static/chunks/6130-2be46d70a38f1e82.js +1 -0
- sky/dashboard/out/_next/static/chunks/616-3d59f75e2ccf9321.js +39 -0
- sky/dashboard/out/_next/static/chunks/6212-7bd06f60ba693125.js +13 -0
- sky/dashboard/out/_next/static/chunks/6856-8f27d1c10c98def8.js +1 -0
- sky/dashboard/out/_next/static/chunks/6989-01359c57e018caa4.js +1 -0
- sky/dashboard/out/_next/static/chunks/6990-9146207c4567fdfd.js +1 -0
- sky/dashboard/out/_next/static/chunks/7359-c8d04e06886000b3.js +30 -0
- sky/dashboard/out/_next/static/chunks/7411-b15471acd2cba716.js +41 -0
- sky/dashboard/out/_next/static/chunks/7615-019513abc55b3b47.js +1 -0
- sky/dashboard/out/_next/static/chunks/8640.5b9475a2d18c5416.js +16 -0
- sky/dashboard/out/_next/static/chunks/8969-452f9d5cbdd2dc73.js +1 -0
- sky/dashboard/out/_next/static/chunks/9025.fa408f3242e9028d.js +6 -0
- sky/dashboard/out/_next/static/chunks/9353-cff34f7e773b2e2b.js +1 -0
- sky/dashboard/out/_next/static/chunks/9360.a536cf6b1fa42355.js +31 -0
- sky/dashboard/out/_next/static/chunks/9847.3aaca6bb33455140.js +30 -0
- sky/dashboard/out/_next/static/chunks/fd9d1056-86323a29a8f7e46a.js +1 -0
- sky/dashboard/out/_next/static/chunks/framework-cf60a09ccd051a10.js +33 -0
- sky/dashboard/out/_next/static/chunks/main-app-587214043926b3cc.js +1 -0
- sky/dashboard/out/_next/static/chunks/main-f15ccb73239a3bf1.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/_app-bde01e4a2beec258.js +34 -0
- sky/dashboard/out/_next/static/chunks/pages/_error-c66a4e8afc46f17b.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-792db96d918c98c9.js +16 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-abfcac9c137aa543.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters-ee39056f9851a3ff.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/config-dfb9bf07b13045f4.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/index-444f1804401f04ea.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/infra/[context]-c0b5935149902e6f.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/infra-aed0ea19df7cf961.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-d66997e2bfc837cf.js +16 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs/pools/[pool]-9faf940b253e3e06.js +21 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs-2072b48b617989c9.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/users-f42674164aa73423.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/volumes-b84b948ff357c43e.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/workspace/new-3f88a1c7e86a3f86.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/workspaces/[name]-84a40f8c7c627fe4.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/workspaces-531b2f8c4bf89f82.js +1 -0
- sky/dashboard/out/_next/static/chunks/webpack-64e05f17bf2cf8ce.js +1 -0
- sky/dashboard/out/_next/static/css/0748ce22df867032.css +3 -0
- sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
- sky/dashboard/out/clusters/[cluster].html +1 -1
- sky/dashboard/out/clusters.html +1 -1
- sky/dashboard/out/config.html +1 -0
- sky/dashboard/out/index.html +1 -1
- sky/dashboard/out/infra/[context].html +1 -0
- sky/dashboard/out/infra.html +1 -0
- sky/dashboard/out/jobs/[job].html +1 -1
- sky/dashboard/out/jobs/pools/[pool].html +1 -0
- sky/dashboard/out/jobs.html +1 -1
- sky/dashboard/out/users.html +1 -0
- sky/dashboard/out/volumes.html +1 -0
- sky/dashboard/out/workspace/new.html +1 -0
- sky/dashboard/out/workspaces/[name].html +1 -0
- sky/dashboard/out/workspaces.html +1 -0
- sky/data/data_utils.py +137 -1
- sky/data/mounting_utils.py +269 -84
- sky/data/storage.py +1460 -1807
- sky/data/storage_utils.py +43 -57
- sky/exceptions.py +126 -2
- sky/execution.py +216 -63
- sky/global_user_state.py +2390 -586
- sky/jobs/__init__.py +7 -0
- sky/jobs/client/sdk.py +300 -58
- sky/jobs/client/sdk_async.py +161 -0
- sky/jobs/constants.py +15 -8
- sky/jobs/controller.py +848 -275
- sky/jobs/file_content_utils.py +128 -0
- sky/jobs/log_gc.py +193 -0
- sky/jobs/recovery_strategy.py +402 -152
- sky/jobs/scheduler.py +314 -189
- sky/jobs/server/core.py +836 -255
- sky/jobs/server/server.py +156 -115
- sky/jobs/server/utils.py +136 -0
- sky/jobs/state.py +2109 -706
- sky/jobs/utils.py +1306 -215
- sky/logs/__init__.py +21 -0
- sky/logs/agent.py +108 -0
- sky/logs/aws.py +243 -0
- sky/logs/gcp.py +91 -0
- sky/metrics/__init__.py +0 -0
- sky/metrics/utils.py +453 -0
- sky/models.py +78 -1
- sky/optimizer.py +164 -70
- sky/provision/__init__.py +90 -4
- sky/provision/aws/config.py +147 -26
- sky/provision/aws/instance.py +136 -50
- sky/provision/azure/instance.py +11 -6
- sky/provision/common.py +13 -1
- sky/provision/cudo/cudo_machine_type.py +1 -1
- sky/provision/cudo/cudo_utils.py +14 -8
- sky/provision/cudo/cudo_wrapper.py +72 -71
- sky/provision/cudo/instance.py +10 -6
- sky/provision/do/instance.py +10 -6
- sky/provision/do/utils.py +4 -3
- sky/provision/docker_utils.py +140 -33
- sky/provision/fluidstack/instance.py +13 -8
- sky/provision/gcp/__init__.py +1 -0
- sky/provision/gcp/config.py +301 -19
- sky/provision/gcp/constants.py +218 -0
- sky/provision/gcp/instance.py +36 -8
- sky/provision/gcp/instance_utils.py +18 -4
- sky/provision/gcp/volume_utils.py +247 -0
- sky/provision/hyperbolic/__init__.py +12 -0
- sky/provision/hyperbolic/config.py +10 -0
- sky/provision/hyperbolic/instance.py +437 -0
- sky/provision/hyperbolic/utils.py +373 -0
- sky/provision/instance_setup.py +101 -20
- sky/provision/kubernetes/__init__.py +5 -0
- sky/provision/kubernetes/config.py +9 -52
- sky/provision/kubernetes/constants.py +17 -0
- sky/provision/kubernetes/instance.py +919 -280
- sky/provision/kubernetes/manifests/fusermount-server-daemonset.yaml +1 -2
- sky/provision/kubernetes/network.py +27 -17
- sky/provision/kubernetes/network_utils.py +44 -43
- sky/provision/kubernetes/utils.py +1221 -534
- sky/provision/kubernetes/volume.py +343 -0
- sky/provision/lambda_cloud/instance.py +22 -16
- sky/provision/nebius/constants.py +50 -0
- sky/provision/nebius/instance.py +19 -6
- sky/provision/nebius/utils.py +237 -137
- sky/provision/oci/instance.py +10 -5
- sky/provision/paperspace/instance.py +10 -7
- sky/provision/paperspace/utils.py +1 -1
- sky/provision/primeintellect/__init__.py +10 -0
- sky/provision/primeintellect/config.py +11 -0
- sky/provision/primeintellect/instance.py +454 -0
- sky/provision/primeintellect/utils.py +398 -0
- sky/provision/provisioner.py +117 -36
- sky/provision/runpod/__init__.py +5 -0
- sky/provision/runpod/instance.py +27 -6
- sky/provision/runpod/utils.py +51 -18
- sky/provision/runpod/volume.py +214 -0
- sky/provision/scp/__init__.py +15 -0
- sky/provision/scp/config.py +93 -0
- sky/provision/scp/instance.py +707 -0
- sky/provision/seeweb/__init__.py +11 -0
- sky/provision/seeweb/config.py +13 -0
- sky/provision/seeweb/instance.py +812 -0
- sky/provision/shadeform/__init__.py +11 -0
- sky/provision/shadeform/config.py +12 -0
- sky/provision/shadeform/instance.py +351 -0
- sky/provision/shadeform/shadeform_utils.py +83 -0
- sky/provision/ssh/__init__.py +18 -0
- sky/provision/vast/instance.py +13 -8
- sky/provision/vast/utils.py +10 -7
- sky/provision/volume.py +164 -0
- sky/provision/vsphere/common/ssl_helper.py +1 -1
- sky/provision/vsphere/common/vapiconnect.py +2 -1
- sky/provision/vsphere/common/vim_utils.py +4 -4
- sky/provision/vsphere/instance.py +15 -10
- sky/provision/vsphere/vsphere_utils.py +17 -20
- sky/py.typed +0 -0
- sky/resources.py +845 -119
- sky/schemas/__init__.py +0 -0
- sky/schemas/api/__init__.py +0 -0
- sky/schemas/api/responses.py +227 -0
- sky/schemas/db/README +4 -0
- sky/schemas/db/env.py +90 -0
- sky/schemas/db/global_user_state/001_initial_schema.py +124 -0
- sky/schemas/db/global_user_state/002_add_workspace_to_cluster_history.py +35 -0
- sky/schemas/db/global_user_state/003_fix_initial_revision.py +61 -0
- sky/schemas/db/global_user_state/004_is_managed.py +34 -0
- sky/schemas/db/global_user_state/005_cluster_event.py +32 -0
- sky/schemas/db/global_user_state/006_provision_log.py +41 -0
- sky/schemas/db/global_user_state/007_cluster_event_request_id.py +34 -0
- sky/schemas/db/global_user_state/008_skylet_ssh_tunnel_metadata.py +34 -0
- sky/schemas/db/global_user_state/009_last_activity_and_launched_at.py +89 -0
- sky/schemas/db/global_user_state/010_save_ssh_key.py +66 -0
- sky/schemas/db/global_user_state/011_is_ephemeral.py +34 -0
- sky/schemas/db/kv_cache/001_initial_schema.py +29 -0
- sky/schemas/db/script.py.mako +28 -0
- sky/schemas/db/serve_state/001_initial_schema.py +67 -0
- sky/schemas/db/serve_state/002_yaml_content.py +34 -0
- sky/schemas/db/skypilot_config/001_initial_schema.py +30 -0
- sky/schemas/db/spot_jobs/001_initial_schema.py +97 -0
- sky/schemas/db/spot_jobs/002_cluster_pool.py +42 -0
- sky/schemas/db/spot_jobs/003_pool_hash.py +34 -0
- sky/schemas/db/spot_jobs/004_job_file_contents.py +42 -0
- sky/schemas/db/spot_jobs/005_logs_gc.py +38 -0
- sky/schemas/db/spot_jobs/006_controller_pid_started_at.py +34 -0
- sky/schemas/db/spot_jobs/007_config_file_content.py +34 -0
- sky/schemas/generated/__init__.py +0 -0
- sky/schemas/generated/autostopv1_pb2.py +36 -0
- sky/schemas/generated/autostopv1_pb2.pyi +43 -0
- sky/schemas/generated/autostopv1_pb2_grpc.py +146 -0
- sky/schemas/generated/jobsv1_pb2.py +86 -0
- sky/schemas/generated/jobsv1_pb2.pyi +254 -0
- sky/schemas/generated/jobsv1_pb2_grpc.py +542 -0
- sky/schemas/generated/managed_jobsv1_pb2.py +76 -0
- sky/schemas/generated/managed_jobsv1_pb2.pyi +278 -0
- sky/schemas/generated/managed_jobsv1_pb2_grpc.py +278 -0
- sky/schemas/generated/servev1_pb2.py +58 -0
- sky/schemas/generated/servev1_pb2.pyi +115 -0
- sky/schemas/generated/servev1_pb2_grpc.py +322 -0
- sky/serve/autoscalers.py +357 -5
- sky/serve/client/impl.py +310 -0
- sky/serve/client/sdk.py +47 -139
- sky/serve/client/sdk_async.py +130 -0
- sky/serve/constants.py +12 -9
- sky/serve/controller.py +68 -17
- sky/serve/load_balancer.py +106 -60
- sky/serve/load_balancing_policies.py +116 -2
- sky/serve/replica_managers.py +434 -249
- sky/serve/serve_rpc_utils.py +179 -0
- sky/serve/serve_state.py +569 -257
- sky/serve/serve_utils.py +775 -265
- sky/serve/server/core.py +66 -711
- sky/serve/server/impl.py +1093 -0
- sky/serve/server/server.py +21 -18
- sky/serve/service.py +192 -89
- sky/serve/service_spec.py +144 -20
- sky/serve/spot_placer.py +3 -0
- sky/server/auth/__init__.py +0 -0
- sky/server/auth/authn.py +50 -0
- sky/server/auth/loopback.py +38 -0
- sky/server/auth/oauth2_proxy.py +202 -0
- sky/server/common.py +478 -182
- sky/server/config.py +85 -23
- sky/server/constants.py +44 -6
- sky/server/daemons.py +295 -0
- sky/server/html/token_page.html +185 -0
- sky/server/metrics.py +160 -0
- sky/server/middleware_utils.py +166 -0
- sky/server/requests/executor.py +558 -138
- sky/server/requests/payloads.py +364 -24
- sky/server/requests/preconditions.py +21 -17
- sky/server/requests/process.py +112 -29
- sky/server/requests/request_names.py +121 -0
- sky/server/requests/requests.py +822 -226
- sky/server/requests/serializers/decoders.py +82 -31
- sky/server/requests/serializers/encoders.py +140 -22
- sky/server/requests/threads.py +117 -0
- sky/server/rest.py +455 -0
- sky/server/server.py +1309 -285
- sky/server/state.py +20 -0
- sky/server/stream_utils.py +327 -61
- sky/server/uvicorn.py +217 -3
- sky/server/versions.py +270 -0
- sky/setup_files/MANIFEST.in +11 -1
- sky/setup_files/alembic.ini +160 -0
- sky/setup_files/dependencies.py +139 -31
- sky/setup_files/setup.py +44 -42
- sky/sky_logging.py +114 -7
- sky/skylet/attempt_skylet.py +106 -24
- sky/skylet/autostop_lib.py +129 -8
- sky/skylet/configs.py +29 -20
- sky/skylet/constants.py +216 -25
- sky/skylet/events.py +101 -21
- sky/skylet/job_lib.py +345 -164
- sky/skylet/log_lib.py +297 -18
- sky/skylet/log_lib.pyi +44 -1
- sky/skylet/providers/ibm/node_provider.py +12 -8
- sky/skylet/providers/ibm/vpc_provider.py +13 -12
- sky/skylet/ray_patches/__init__.py +17 -3
- sky/skylet/ray_patches/autoscaler.py.diff +18 -0
- sky/skylet/ray_patches/cli.py.diff +19 -0
- sky/skylet/ray_patches/command_runner.py.diff +17 -0
- sky/skylet/ray_patches/log_monitor.py.diff +20 -0
- sky/skylet/ray_patches/resource_demand_scheduler.py.diff +32 -0
- sky/skylet/ray_patches/updater.py.diff +18 -0
- sky/skylet/ray_patches/worker.py.diff +41 -0
- sky/skylet/runtime_utils.py +21 -0
- sky/skylet/services.py +568 -0
- sky/skylet/skylet.py +72 -4
- sky/skylet/subprocess_daemon.py +104 -29
- sky/skypilot_config.py +506 -99
- sky/ssh_node_pools/__init__.py +1 -0
- sky/ssh_node_pools/core.py +135 -0
- sky/ssh_node_pools/server.py +233 -0
- sky/task.py +685 -163
- sky/templates/aws-ray.yml.j2 +11 -3
- sky/templates/azure-ray.yml.j2 +2 -1
- sky/templates/cudo-ray.yml.j2 +1 -0
- sky/templates/do-ray.yml.j2 +2 -1
- sky/templates/fluidstack-ray.yml.j2 +1 -0
- sky/templates/gcp-ray.yml.j2 +62 -1
- sky/templates/hyperbolic-ray.yml.j2 +68 -0
- sky/templates/ibm-ray.yml.j2 +2 -1
- sky/templates/jobs-controller.yaml.j2 +27 -24
- sky/templates/kubernetes-loadbalancer.yml.j2 +2 -0
- sky/templates/kubernetes-ray.yml.j2 +611 -50
- sky/templates/lambda-ray.yml.j2 +2 -1
- sky/templates/nebius-ray.yml.j2 +34 -12
- sky/templates/oci-ray.yml.j2 +1 -0
- sky/templates/paperspace-ray.yml.j2 +2 -1
- sky/templates/primeintellect-ray.yml.j2 +72 -0
- sky/templates/runpod-ray.yml.j2 +10 -1
- sky/templates/scp-ray.yml.j2 +4 -50
- sky/templates/seeweb-ray.yml.j2 +171 -0
- sky/templates/shadeform-ray.yml.j2 +73 -0
- sky/templates/sky-serve-controller.yaml.j2 +22 -2
- sky/templates/vast-ray.yml.j2 +1 -0
- sky/templates/vsphere-ray.yml.j2 +1 -0
- sky/templates/websocket_proxy.py +212 -37
- sky/usage/usage_lib.py +31 -15
- sky/users/__init__.py +0 -0
- sky/users/model.conf +15 -0
- sky/users/permission.py +397 -0
- sky/users/rbac.py +121 -0
- sky/users/server.py +720 -0
- sky/users/token_service.py +218 -0
- sky/utils/accelerator_registry.py +35 -5
- sky/utils/admin_policy_utils.py +84 -38
- sky/utils/annotations.py +38 -5
- sky/utils/asyncio_utils.py +78 -0
- sky/utils/atomic.py +1 -1
- sky/utils/auth_utils.py +153 -0
- sky/utils/benchmark_utils.py +60 -0
- sky/utils/cli_utils/status_utils.py +159 -86
- sky/utils/cluster_utils.py +31 -9
- sky/utils/command_runner.py +354 -68
- sky/utils/command_runner.pyi +93 -3
- sky/utils/common.py +35 -8
- sky/utils/common_utils.py +314 -91
- sky/utils/config_utils.py +74 -5
- sky/utils/context.py +403 -0
- sky/utils/context_utils.py +242 -0
- sky/utils/controller_utils.py +383 -89
- sky/utils/dag_utils.py +31 -12
- sky/utils/db/__init__.py +0 -0
- sky/utils/db/db_utils.py +485 -0
- sky/utils/db/kv_cache.py +149 -0
- sky/utils/db/migration_utils.py +137 -0
- sky/utils/directory_utils.py +12 -0
- sky/utils/env_options.py +13 -0
- sky/utils/git.py +567 -0
- sky/utils/git_clone.sh +460 -0
- sky/utils/infra_utils.py +195 -0
- sky/utils/kubernetes/cleanup-tunnel.sh +62 -0
- sky/utils/kubernetes/config_map_utils.py +133 -0
- sky/utils/kubernetes/create_cluster.sh +15 -29
- sky/utils/kubernetes/delete_cluster.sh +10 -7
- sky/utils/kubernetes/deploy_ssh_node_pools.py +1177 -0
- sky/utils/kubernetes/exec_kubeconfig_converter.py +22 -31
- sky/utils/kubernetes/generate_kind_config.py +6 -66
- sky/utils/kubernetes/generate_kubeconfig.sh +4 -1
- sky/utils/kubernetes/gpu_labeler.py +18 -8
- sky/utils/kubernetes/k8s_gpu_labeler_job.yaml +2 -1
- sky/utils/kubernetes/k8s_gpu_labeler_setup.yaml +16 -16
- sky/utils/kubernetes/kubernetes_deploy_utils.py +284 -114
- sky/utils/kubernetes/rsync_helper.sh +11 -3
- sky/utils/kubernetes/ssh-tunnel.sh +379 -0
- sky/utils/kubernetes/ssh_utils.py +221 -0
- sky/utils/kubernetes_enums.py +8 -15
- sky/utils/lock_events.py +94 -0
- sky/utils/locks.py +416 -0
- sky/utils/log_utils.py +82 -107
- sky/utils/perf_utils.py +22 -0
- sky/utils/resource_checker.py +298 -0
- sky/utils/resources_utils.py +249 -32
- sky/utils/rich_utils.py +217 -39
- sky/utils/schemas.py +955 -160
- sky/utils/serialize_utils.py +16 -0
- sky/utils/status_lib.py +10 -0
- sky/utils/subprocess_utils.py +29 -15
- sky/utils/tempstore.py +70 -0
- sky/utils/thread_utils.py +91 -0
- sky/utils/timeline.py +26 -53
- sky/utils/ux_utils.py +84 -15
- sky/utils/validator.py +11 -1
- sky/utils/volume.py +165 -0
- sky/utils/yaml_utils.py +111 -0
- sky/volumes/__init__.py +13 -0
- sky/volumes/client/__init__.py +0 -0
- sky/volumes/client/sdk.py +150 -0
- sky/volumes/server/__init__.py +0 -0
- sky/volumes/server/core.py +270 -0
- sky/volumes/server/server.py +124 -0
- sky/volumes/volume.py +215 -0
- sky/workspaces/__init__.py +0 -0
- sky/workspaces/core.py +655 -0
- sky/workspaces/server.py +101 -0
- sky/workspaces/utils.py +56 -0
- sky_templates/README.md +3 -0
- sky_templates/__init__.py +3 -0
- sky_templates/ray/__init__.py +0 -0
- sky_templates/ray/start_cluster +183 -0
- sky_templates/ray/stop_cluster +75 -0
- skypilot_nightly-1.0.0.dev20251203.dist-info/METADATA +676 -0
- skypilot_nightly-1.0.0.dev20251203.dist-info/RECORD +611 -0
- {skypilot_nightly-1.0.0.dev20250502.dist-info → skypilot_nightly-1.0.0.dev20251203.dist-info}/WHEEL +1 -1
- skypilot_nightly-1.0.0.dev20251203.dist-info/top_level.txt +2 -0
- sky/benchmark/benchmark_state.py +0 -256
- sky/benchmark/benchmark_utils.py +0 -641
- sky/clouds/service_catalog/constants.py +0 -7
- sky/dashboard/out/_next/static/GWvVBSCS7FmUiVmjaL1a7/_buildManifest.js +0 -1
- sky/dashboard/out/_next/static/chunks/236-2db3ee3fba33dd9e.js +0 -6
- sky/dashboard/out/_next/static/chunks/312-c3c8845990db8ffc.js +0 -15
- sky/dashboard/out/_next/static/chunks/37-0a572fe0dbb89c4d.js +0 -6
- sky/dashboard/out/_next/static/chunks/678-206dddca808e6d16.js +0 -59
- sky/dashboard/out/_next/static/chunks/845-9e60713e0c441abc.js +0 -1
- sky/dashboard/out/_next/static/chunks/979-7bf73a4c7cea0f5c.js +0 -1
- sky/dashboard/out/_next/static/chunks/fd9d1056-2821b0f0cabcd8bd.js +0 -1
- sky/dashboard/out/_next/static/chunks/framework-87d061ee6ed71b28.js +0 -33
- sky/dashboard/out/_next/static/chunks/main-app-241eb28595532291.js +0 -1
- sky/dashboard/out/_next/static/chunks/main-e0e2335212e72357.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/_app-e6b013bc3f77ad60.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/_error-1be831200e60c5c0.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-6ac338bc2239cb45.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-f383db7389368ea7.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/clusters-a93b93e10b8b074e.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/index-f9f039532ca8cbc4.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-1c519e1afc523dc9.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/jobs-a75029b67aab6a2e.js +0 -1
- sky/dashboard/out/_next/static/chunks/webpack-830f59b8404e96b8.js +0 -1
- sky/dashboard/out/_next/static/css/c6933bbb2ce7f4dd.css +0 -3
- sky/jobs/dashboard/dashboard.py +0 -223
- sky/jobs/dashboard/static/favicon.ico +0 -0
- sky/jobs/dashboard/templates/index.html +0 -831
- sky/jobs/server/dashboard_utils.py +0 -69
- sky/skylet/providers/scp/__init__.py +0 -2
- sky/skylet/providers/scp/config.py +0 -149
- sky/skylet/providers/scp/node_provider.py +0 -578
- sky/templates/kubernetes-ssh-jump.yml.j2 +0 -94
- sky/utils/db_utils.py +0 -100
- sky/utils/kubernetes/deploy_remote_cluster.sh +0 -308
- sky/utils/kubernetes/ssh_jump_lifecycle_manager.py +0 -191
- skypilot_nightly-1.0.0.dev20250502.dist-info/METADATA +0 -361
- skypilot_nightly-1.0.0.dev20250502.dist-info/RECORD +0 -396
- skypilot_nightly-1.0.0.dev20250502.dist-info/top_level.txt +0 -1
- /sky/{clouds/service_catalog → catalog}/config.py +0 -0
- /sky/{benchmark → catalog/data_fetchers}/__init__.py +0 -0
- /sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_azure.py +0 -0
- /sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_fluidstack.py +0 -0
- /sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_ibm.py +0 -0
- /sky/{clouds/service_catalog/data_fetchers → client/cli}/__init__.py +0 -0
- /sky/dashboard/out/_next/static/{GWvVBSCS7FmUiVmjaL1a7 → 96_E2yl3QAiIJGOYCkSpB}/_ssgManifest.js +0 -0
- {skypilot_nightly-1.0.0.dev20250502.dist-info → skypilot_nightly-1.0.0.dev20251203.dist-info}/entry_points.txt +0 -0
- {skypilot_nightly-1.0.0.dev20250502.dist-info → skypilot_nightly-1.0.0.dev20251203.dist-info}/licenses/LICENSE +0 -0
sky/metrics/utils.py
ADDED
|
@@ -0,0 +1,453 @@
|
|
|
1
|
+
"""Utilities for processing GPU metrics from Kubernetes clusters."""
|
|
2
|
+
import contextlib
|
|
3
|
+
import functools
|
|
4
|
+
import os
|
|
5
|
+
import re
|
|
6
|
+
import select
|
|
7
|
+
import subprocess
|
|
8
|
+
import time
|
|
9
|
+
from typing import List, Optional, Tuple
|
|
10
|
+
|
|
11
|
+
import httpx
|
|
12
|
+
import prometheus_client as prom
|
|
13
|
+
|
|
14
|
+
from sky import sky_logging
|
|
15
|
+
from sky.skylet import constants
|
|
16
|
+
from sky.utils import common_utils
|
|
17
|
+
from sky.utils import context_utils
|
|
18
|
+
|
|
19
|
+
# Timeout and read-buffer size for select-based reads; the consumers of these
# two values live further down in the module.
_SELECT_TIMEOUT = 1
_SELECT_BUFFER_SIZE = 4096

# Byte-size units used to express the memory histogram buckets.
_KB = 1 << 10
_MB = 1 << 20

# Histogram bucket upper bounds (in bytes) for the memory-related metrics:
# three sub-MiB steps, then every power of two from 1 MiB up to 256 MiB,
# and finally a catch-all +Inf bucket.
_MEM_BUCKETS = (
    [_KB, 256 * _KB, 512 * _KB] +
    [(1 << exponent) * _MB for exponent in range(9)] +
    [float('inf')]
)
|
|
39
|
+
|
|
40
|
+
logger = sky_logging.init_logger(__name__)

# Whether the metrics are enabled, cannot be changed at runtime: the
# environment variable is read once, when this module is imported, and only a
# value that case-insensitively equals 'true' enables metrics.
METRICS_ENABLED = os.environ.get(constants.ENV_VAR_SERVER_METRICS_ENABLED,
                                 'false').lower() == 'true'

# Bucket upper bounds (in seconds) shared by every duration/latency histogram
# defined below. Kept in a single place so the histograms cannot drift apart.
_DURATION_BUCKETS = (
    0.001, 0.005, 0.01, 0.025, 0.05, 0.075, 0.1, 0.125, 0.15, 0.25,
    0.35, 0.5, 0.75, 1, 1.25, 1.5, 1.75, 2, 2.5, 2.75, 3, 3.5, 4, 4.5,
    5, 7.5, 10.0, 12.5, 15.0, 17.5, 20.0, 25.0, 30.0, 35.0, 40.0, 45.0,
    50.0, 55.0, 60.0, 80.0, 120.0, 140.0, 160.0, 180.0, 200.0, 220.0,
    240.0, 260.0, 280.0, 300.0, 320.0, 340.0, 360.0, 380.0, 400.0,
    420.0, 440.0, 460.0, 480.0, 500.0, 520.0, 540.0, 560.0, 580.0,
    600.0, 620.0, 640.0, 660.0, 680.0, 700.0, 720.0, 740.0, 760.0,
    780.0, 800.0, 820.0, 840.0, 860.0, 880.0, 900.0, 920.0, 940.0,
    960.0, 980.0, 1000.0, float('inf'),
)

# Time spent processing a piece of code, refer to time_it().
SKY_APISERVER_CODE_DURATION_SECONDS = prom.Histogram(
    'sky_apiserver_code_duration_seconds',
    'Time spent processing code',
    ['name', 'group'],
    buckets=_DURATION_BUCKETS,
)

# Total number of API server requests, grouped by path, method, and status.
SKY_APISERVER_REQUESTS_TOTAL = prom.Counter(
    'sky_apiserver_requests_total',
    'Total number of API server requests',
    ['path', 'method', 'status'],
)

# Time spent processing API server requests, grouped by path, method, and
# status.
SKY_APISERVER_REQUEST_DURATION_SECONDS = prom.Histogram(
    'sky_apiserver_request_duration_seconds',
    'Time spent processing API server requests',
    ['path', 'method', 'status'],
    buckets=_DURATION_BUCKETS,
)

# Scheduling delay of the server event loop, labelled by process id.
SKY_APISERVER_EVENT_LOOP_LAG_SECONDS = prom.Histogram(
    'sky_apiserver_event_loop_lag_seconds',
    'Scheduling delay of the server event loop',
    ['pid'],
    buckets=_DURATION_BUCKETS,
)

# Number of websocket connections, labelled by process id.
SKY_APISERVER_WEBSOCKET_CONNECTIONS = prom.Gauge(
    'sky_apiserver_websocket_connections',
    'Number of websocket connections',
    ['pid'],
    multiprocess_mode='livesum',
)

# Number of closed websockets, labelled by process id and close reason.
SKY_APISERVER_WEBSOCKET_CLOSED_TOTAL = prom.Counter(
    'sky_apiserver_websocket_closed_total',
    'Number of websocket closed',
    ['pid', 'reason'],
)

# The number of execution starts in each worker process, we do not record
# histogram here as the duration has been measured in
# SKY_APISERVER_CODE_DURATION_SECONDS without the worker label (process id).
# Recording histogram WITH worker label will cause high cardinality.
SKY_APISERVER_PROCESS_EXECUTION_START_TOTAL = prom.Counter(
    'sky_apiserver_process_execution_start_total',
    'Total number of execution starts in each worker process',
    ['request', 'pid'],
)

# Peak RSS per process, labelled by process id and process type.
SKY_APISERVER_PROCESS_PEAK_RSS = prom.Gauge(
    'sky_apiserver_process_peak_rss',
    'Peak RSS we saw in each process in last 30 seconds',
    ['pid', 'type'],
)

# CPU time per worker process, labelled by process id, process type and mode.
SKY_APISERVER_PROCESS_CPU_TOTAL = prom.Gauge(
    'sky_apiserver_process_cpu_total',
    'Total CPU times a worker process has been running',
    ['pid', 'type', 'mode'],
)

# Peak memory usage of requests, labelled by request name.
SKY_APISERVER_REQUEST_MEMORY_USAGE_BYTES = prom.Histogram(
    'sky_apiserver_request_memory_usage_bytes',
    'Peak memory usage of requests', ['name'],
    buckets=_MEM_BUCKETS)

# RSS increment after requests, labelled by request name.
SKY_APISERVER_REQUEST_RSS_INCR_BYTES = prom.Histogram(
    'sky_apiserver_request_rss_incr_bytes',
    'RSS increment after requests', ['name'],
    buckets=_MEM_BUCKETS)

# Round-trip latency of ssh messages relayed over the websocket proxy.
# NOTE: the adjacent string literals are concatenated by the parser, so each
# fragment must carry its own trailing space.
SKY_APISERVER_WEBSOCKET_SSH_LATENCY_SECONDS = prom.Histogram(
    'sky_apiserver_websocket_ssh_latency_seconds',
    ('Time taken for ssh message to go from client to API server and back '
     'to the client. This does not include: latency to reach the pod, '
     'overhead from sending through the k8s port-forward tunnel, or '
     'ssh server lag on the destination pod.'),
    ['pid'],
    buckets=_DURATION_BUCKETS,
)

# Number of long-running request executors in the API server.
SKY_APISERVER_LONG_EXECUTORS = prom.Gauge(
    'sky_apiserver_long_executors',
    'Total number of long-running request executors in the API server',
)

# Number of short-running request executors in the API server.
SKY_APISERVER_SHORT_EXECUTORS = prom.Gauge(
    'sky_apiserver_short_executors',
    'Total number of short-running request executors in the API server',
)
|
|
173
|
+
|
|
174
|
+
|
|
175
|
+
@contextlib.contextmanager
def time_it(name: str, group: str = 'default'):
    """Context manager to measure and record code execution duration.

    When METRICS_ENABLED is false the body runs untimed. Otherwise the
    elapsed time of the body is observed on
    SKY_APISERVER_CODE_DURATION_SECONDS, even if the body raises.

    Args:
        name: Value for the 'name' label of the duration metric.
        group: Value for the 'group' label of the duration metric.
    """
    if not METRICS_ENABLED:
        yield
        return

    # perf_counter() is monotonic, so the measured interval cannot be
    # skewed (or go negative) when the system wall clock is adjusted.
    start_time = time.perf_counter()
    try:
        yield
    finally:
        duration = time.perf_counter() - start_time
        SKY_APISERVER_CODE_DURATION_SECONDS.labels(
            name=name, group=group).observe(duration)
|
|
188
|
+
|
|
189
|
+
|
|
190
|
+
def time_me(func):
    """Decorator that records how long each call to `func` takes.

    Calls are timed through time_it() under the 'function' group, using
    '<module>/<function name>' as the metric name. When METRICS_ENABLED is
    false the wrapped function is invoked directly.
    """

    @functools.wraps(func)
    def wrapper(*args, **kwargs):
        if METRICS_ENABLED:
            label = f'{func.__module__}/{func.__name__}'
            with time_it(label, group='function'):
                return func(*args, **kwargs)
        return func(*args, **kwargs)

    return wrapper
|
|
202
|
+
|
|
203
|
+
|
|
204
|
+
def time_me_async(func):
    """Decorator that records how long each await of async `func` takes.

    Calls are timed through time_it() under the 'function' group, using
    '<module>/<function name>' as the metric name. When METRICS_ENABLED is
    false the wrapped coroutine function is awaited directly.
    """

    @functools.wraps(func)
    async def async_wrapper(*args, **kwargs):
        if METRICS_ENABLED:
            label = f'{func.__module__}/{func.__name__}'
            with time_it(label, group='function'):
                return await func(*args, **kwargs)
        return await func(*args, **kwargs)

    return async_wrapper
|
|
216
|
+
|
|
217
|
+
|
|
218
|
+
def start_svc_port_forward(context: str, namespace: str, service: str,
                           service_port: int) -> Tuple[subprocess.Popen, int]:
    """Launches `kubectl port-forward` to a service and finds the local port.

    kubectl is asked to choose the local port itself; the chosen port is
    parsed from the 'Forwarding from 127.0.0.1:<port>' line in its output.

    Args:
        context: Kubernetes context name
        namespace: Namespace where the service is located
        service: Service name to port forward to
        service_port: Port on the service to forward to

    Returns:
        Tuple of (subprocess.Popen process, local_port assigned)

    Raises:
        RuntimeError: If kubectl exits with a non-zero code during startup,
            or no local port is reported before the startup timeout.
    """
    startup_timeout_secs = 10
    shutdown_timeout_secs = 5

    # An empty local port (':<service_port>') lets kubectl choose one.
    kubectl_cmd = [
        'kubectl', '--context', context, '-n', namespace, 'port-forward',
        f'service/{service}', f':{service_port}'
    ]

    # Inherit the environment, defaulting KUBECONFIG when it is not set.
    proc_env = os.environ.copy()
    proc_env.setdefault('KUBECONFIG', os.path.expanduser('~/.kube/config'))

    proc = None
    exited_with_error = False
    local_port = None
    poller = None
    fd = None

    try:
        proc = subprocess.Popen(kubectl_cmd,
                                stdout=subprocess.PIPE,
                                stderr=subprocess.STDOUT,
                                text=True,
                                env=proc_env)

        # poll() is used rather than select() to avoid the FD_SETSIZE limit.
        poller = select.poll()
        assert proc.stdout is not None
        fd = proc.stdout.fileno()
        poller.register(fd, select.POLLIN)

        started_at = time.time()
        output = ''
        # Accumulate kubectl output until the forwarding line shows up.
        while time.time() - started_at < startup_timeout_secs:
            if proc.poll() is not None and proc.returncode != 0:
                # kubectl terminated with a failure code.
                exited_with_error = True
                break

            # poll() takes its timeout in milliseconds.
            if poller.poll(_SELECT_TIMEOUT * 1000):
                # Read straight from the FD so the call does not block on
                # the buffered text wrapper.
                output += os.read(fd,
                                  _SELECT_BUFFER_SIZE).decode(errors='ignore')
                found = re.search(r'Forwarding from 127\.0\.0\.1:(\d+)',
                                  output)
                if found:
                    local_port = int(found.group(1))
                    break

            # Short pause to avoid busy-waiting.
            time.sleep(0.1)
    except BaseException:  # pylint: disable=broad-exception-caught
        # Do not leave kubectl running if startup was interrupted.
        if proc:
            stop_svc_port_forward(proc, timeout=shutdown_timeout_secs)
        raise
    finally:
        if poller is not None and fd is not None:
            try:
                poller.unregister(fd)
            except (OSError, ValueError):
                # FD may already be unregistered or invalid
                pass

    if exited_with_error:
        raise RuntimeError(f'Port forward failed for service {service} in '
                           f'namespace {namespace} on context {context}')

    if local_port is None:
        # Timed out without seeing the port: stop kubectl, then always
        # report the failure, whatever the stop attempt did.
        try:
            if proc:
                stop_svc_port_forward(proc, timeout=shutdown_timeout_secs)
        finally:
            raise RuntimeError(
                f'Failed to extract local port for service {service} in '
                f'namespace {namespace} on context {context}')

    return proc, local_port
|
|
316
|
+
|
|
317
|
+
|
|
318
|
+
def stop_svc_port_forward(port_forward_process: subprocess.Popen,
                          timeout: int = 5) -> None:
    """Stops a port forward process, killing it if it does not exit in time.

    Args:
        port_forward_process: The subprocess.Popen process to terminate
        timeout: Seconds to wait after terminate() before falling back to
            kill()
    """
    port_forward_process.terminate()
    try:
        port_forward_process.wait(timeout=timeout)
    except subprocess.TimeoutExpired:
        # The process ignored terminate(); kill it and reap it.
        port_forward_process.kill()
        port_forward_process.wait()
|
|
330
|
+
|
|
331
|
+
|
|
332
|
+
async def send_metrics_request_with_port_forward(
        context: str,
        namespace: str,
        service: str,
        service_port: int,
        endpoint_path: str = '/federate',
        match_patterns: Optional[List[str]] = None,
        timeout: float = 30.0) -> str:
    """Fetches metrics text from a service through a temporary port forward.

    A port forward is started in a worker thread, an HTTP GET is issued
    against the forwarded local port, and the port forward is stopped again
    whether or not the request succeeded.

    Args:
        context: Kubernetes context name
        namespace: Namespace where the service is located
        service: Service name to port forward to
        service_port: Port on the service to forward to
        endpoint_path: Path to append to the localhost endpoint (e.g.,
            '/federate')
        match_patterns: List of metric patterns to match (for federate
            endpoint); each is sent as a 'match[]' query parameter
        timeout: Request timeout in seconds

    Returns:
        Response text containing the metrics

    Raises:
        RuntimeError: If the port forward fails to start
        Exception: Any error from the HTTP request (including a non-success
            status) is logged and re-raised unchanged
    """
    forward_proc = None
    try:
        forward_proc, local_port = await context_utils.to_thread(
            start_svc_port_forward, context, namespace, service, service_port)

        url = f'http://localhost:{local_port}{endpoint_path}'

        # One 'match[]' query parameter per pattern; no query string at all
        # when no patterns were given.
        query = None
        if match_patterns:
            query = [('match[]', pattern) for pattern in match_patterns]

        async with httpx.AsyncClient(timeout=timeout) as client:
            response = await client.get(url, params=query)

        response.raise_for_status()
        return response.text

    except Exception as e:  # pylint: disable=broad-exception-caught
        logger.error(f'Failed to send metrics request with port forward: '
                     f'{common_utils.format_exception(e)}')
        raise
    finally:
        # Always clean up port forward
        if forward_proc:
            await context_utils.to_thread(stop_svc_port_forward, forward_proc)
|
|
386
|
+
|
|
387
|
+
|
|
388
|
+
async def add_cluster_name_label(metrics_text: str, context: str) -> str:
    """Adds a `cluster` label to each metric line that carries labels.

    Comment lines (starting with '#'), blank lines, and lines without a
    '{...}' label section are passed through unchanged.

    Args:
        metrics_text: The text containing the metrics
        context: The cluster name, used as the value of the `cluster` label

    Returns:
        The metrics text with `cluster="<context>"` inserted as the first
        label of every labelled metric line.
    """
    # Escape backslash, double quote and newline so that an unusual context
    # name cannot produce a malformed label value. Backslash must be
    # escaped first so the escapes added afterwards are not doubled.
    escaped_context = (context.replace('\\', '\\\\').replace(
        '"', '\\"').replace('\n', '\\n'))
    cluster_label = f'cluster="{escaped_context}"'

    modified_lines = []
    for line in metrics_text.strip().split('\n'):
        # keep comment lines and empty lines as-is
        if line.startswith('#') or not line.strip():
            modified_lines.append(line)
            continue

        brace_start = line.find('{')
        brace_end = line.find('}')
        if brace_start == -1 or brace_end == -1:
            # not a labelled metric line; keep as-is
            modified_lines.append(line)
            continue

        metric_name = line[:brace_start]
        existing_labels = line[brace_start + 1:brace_end]
        rest_of_line = line[brace_end + 1:]

        # Only add a separating comma when labels are already present.
        if existing_labels:
            new_labels = f'{cluster_label},{existing_labels}'
        else:
            new_labels = cluster_label

        modified_lines.append(f'{metric_name}{{{new_labels}}}{rest_of_line}')

    return '\n'.join(modified_lines)
|
|
422
|
+
|
|
423
|
+
|
|
424
|
+
async def get_metrics_for_context(context: str) -> str:
    """Fetches federated metrics for one Kubernetes context.

    The metrics are requested from the '/federate' endpoint of the
    'skypilot-prometheus-server' service in the 'skypilot' namespace, and
    each labelled metric line is then tagged with the context name.

    Args:
        context: Kubernetes context name

    Returns:
        metrics_text: String containing the metrics

    Raises:
        Exception: If metrics collection fails for any reason
    """
    # Selectors sent to the federate endpoint: node memory and DCGM series,
    # kube_pod_labels (so the dashboard can perform joins to filter by
    # skypilot cluster), and idle CPU seconds.
    match_patterns = [
        '{__name__=~"node_memory_MemAvailable_bytes|node_memory_MemTotal_bytes|DCGM_.*"}',  # pylint: disable=line-too-long
        'kube_pod_labels',
        'node_cpu_seconds_total{mode="idle"}'
    ]

    # TODO(rohan): don't hardcode the namespace and service name
    raw_metrics = await send_metrics_request_with_port_forward(
        context=context,
        namespace='skypilot',
        service='skypilot-prometheus-server',
        service_port=80,
        endpoint_path='/federate',
        match_patterns=match_patterns)

    # add cluster name as a label to each metric line
    return await add_cluster_name_label(raw_metrics, context)
|
sky/models.py
CHANGED
|
@@ -2,19 +2,57 @@
|
|
|
2
2
|
|
|
3
3
|
import collections
|
|
4
4
|
import dataclasses
|
|
5
|
-
|
|
5
|
+
import getpass
|
|
6
|
+
import os
|
|
7
|
+
from typing import Any, ClassVar, Dict, Optional
|
|
8
|
+
|
|
9
|
+
import pydantic
|
|
10
|
+
|
|
11
|
+
from sky.skylet import constants
|
|
12
|
+
from sky.utils import common_utils
|
|
6
13
|
|
|
7
14
|
|
|
8
15
|
@dataclasses.dataclass
class User:
    """Dataclass to store user information."""
    # User hash
    id: str
    # Display name of the user
    name: Optional[str] = None
    password: Optional[str] = None
    created_at: Optional[int] = None

    def __init__(
            self,
            id: str,  # pylint: disable=redefined-builtin
            name: Optional[str] = None,
            password: Optional[str] = None,
            created_at: Optional[int] = None):
        # IDs are normalized (whitespace stripped, lower-cased) on
        # construction.
        self.id = id.strip().lower()
        self.name = name
        self.password = password
        self.created_at = created_at

    def to_dict(self) -> Dict[str, Any]:
        """Returns the id and name only; other fields are not included."""
        return {'id': self.id, 'name': self.name}

    def to_env_vars(self) -> Dict[str, Any]:
        """Returns the user id and name keyed by their env var names."""
        return {
            constants.USER_ID_ENV_VAR: self.id,
            constants.USER_ENV_VAR: self.name,
        }

    @classmethod
    def get_current_user(cls) -> 'User':
        """Returns the current user.

        The name comes from the constants.USER_ENV_VAR environment variable
        when it is set, and from getpass.getuser() otherwise.
        """
        user_name = os.getenv(constants.USER_ENV_VAR)
        if user_name is None:
            # Only consult the OS when the env var is absent:
            # getpass.getuser() can raise when the login name cannot be
            # determined, which must not break callers that set the env var.
            user_name = getpass.getuser()
        user_hash = common_utils.get_user_hash()
        return User(id=user_hash, name=user_name)

    def is_service_account(self) -> bool:
        """Check if the user is a service account (id starts with 'sa-')."""
        return self.id.lower().startswith('sa-')
|
|
55
|
+
|
|
18
56
|
|
|
19
57
|
RealtimeGpuAvailability = collections.namedtuple(
|
|
20
58
|
'RealtimeGpuAvailability', ['gpu', 'counts', 'capacity', 'available'])
|
|
@@ -28,6 +66,8 @@ class KubernetesNodeInfo:
|
|
|
28
66
|
# Resources available on the node. E.g., {'nvidia.com/gpu': '2'}
|
|
29
67
|
total: Dict[str, int]
|
|
30
68
|
free: Dict[str, int]
|
|
69
|
+
# IP address of the node (external IP preferred, fallback to internal IP)
|
|
70
|
+
ip_address: Optional[str] = None
|
|
31
71
|
|
|
32
72
|
|
|
33
73
|
@dataclasses.dataclass
|
|
@@ -56,3 +96,40 @@ class KubernetesNodesInfo:
|
|
|
56
96
|
},
|
|
57
97
|
hint=data['hint'],
|
|
58
98
|
)
|
|
99
|
+
|
|
100
|
+
|
|
101
|
+
class VolumeConfig(pydantic.BaseModel):
    """Configuration for creating a volume."""
    # Bump this whenever the set of fields changes, and teach __setstate__
    # how to upgrade state pickled by the previous version.
    _VERSION: ClassVar[int] = 1

    _version: int
    name: str
    type: str
    cloud: str
    region: Optional[str]
    zone: Optional[str]
    name_on_cloud: str
    size: Optional[str]
    config: Dict[str, Any] = {}
    labels: Optional[Dict[str, str]] = None
    id_on_cloud: Optional[str] = None

    def __getstate__(self) -> Dict[str, Any]:
        """Returns the pickled state, stamped with the current version."""
        pickled = super().__getstate__()
        pickled['_version'] = self._VERSION
        return pickled

    def __setstate__(self, state: Dict[str, Any]) -> None:
        """Set state from pickled state, for backward compatibility."""
        super().__setstate__(state)

        # State without a version stamp is treated as version -1.
        version = state.pop('_version', None)
        if version is None or version < 0:
            # Such state gets id_on_cloud set to None.
            state['id_on_cloud'] = None

        state['_version'] = self._VERSION
        self.__dict__.update(state)
|