skypilot-nightly 1.0.0.dev20250905__py3-none-any.whl → 1.0.0.dev20251210__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sky/__init__.py +12 -2
- sky/adaptors/aws.py +27 -22
- sky/adaptors/common.py +25 -2
- sky/adaptors/coreweave.py +278 -0
- sky/adaptors/do.py +8 -2
- sky/adaptors/gcp.py +11 -0
- sky/adaptors/ibm.py +5 -2
- sky/adaptors/kubernetes.py +64 -0
- sky/adaptors/nebius.py +3 -1
- sky/adaptors/primeintellect.py +1 -0
- sky/adaptors/seeweb.py +183 -0
- sky/adaptors/shadeform.py +89 -0
- sky/adaptors/slurm.py +478 -0
- sky/admin_policy.py +20 -0
- sky/authentication.py +157 -263
- sky/backends/__init__.py +3 -2
- sky/backends/backend.py +11 -3
- sky/backends/backend_utils.py +630 -185
- sky/backends/cloud_vm_ray_backend.py +1111 -928
- sky/backends/local_docker_backend.py +9 -5
- sky/backends/task_codegen.py +971 -0
- sky/backends/wheel_utils.py +18 -0
- sky/catalog/__init__.py +8 -3
- sky/catalog/aws_catalog.py +4 -0
- sky/catalog/common.py +19 -1
- sky/catalog/data_fetchers/fetch_aws.py +102 -80
- sky/catalog/data_fetchers/fetch_gcp.py +30 -3
- sky/catalog/data_fetchers/fetch_nebius.py +9 -6
- sky/catalog/data_fetchers/fetch_runpod.py +698 -0
- sky/catalog/data_fetchers/fetch_seeweb.py +329 -0
- sky/catalog/data_fetchers/fetch_shadeform.py +142 -0
- sky/catalog/kubernetes_catalog.py +36 -32
- sky/catalog/primeintellect_catalog.py +95 -0
- sky/catalog/runpod_catalog.py +5 -1
- sky/catalog/seeweb_catalog.py +184 -0
- sky/catalog/shadeform_catalog.py +165 -0
- sky/catalog/slurm_catalog.py +243 -0
- sky/check.py +87 -46
- sky/client/cli/command.py +1004 -434
- sky/client/cli/flags.py +4 -2
- sky/{volumes/utils.py → client/cli/table_utils.py} +111 -13
- sky/client/cli/utils.py +79 -0
- sky/client/common.py +12 -2
- sky/client/sdk.py +188 -65
- sky/client/sdk_async.py +34 -33
- sky/cloud_stores.py +82 -3
- sky/clouds/__init__.py +8 -0
- sky/clouds/aws.py +337 -129
- sky/clouds/azure.py +24 -18
- sky/clouds/cloud.py +47 -13
- sky/clouds/cudo.py +16 -13
- sky/clouds/do.py +9 -7
- sky/clouds/fluidstack.py +12 -5
- sky/clouds/gcp.py +14 -7
- sky/clouds/hyperbolic.py +12 -5
- sky/clouds/ibm.py +12 -5
- sky/clouds/kubernetes.py +80 -45
- sky/clouds/lambda_cloud.py +12 -5
- sky/clouds/nebius.py +23 -9
- sky/clouds/oci.py +19 -12
- sky/clouds/paperspace.py +4 -1
- sky/clouds/primeintellect.py +317 -0
- sky/clouds/runpod.py +85 -24
- sky/clouds/scp.py +12 -8
- sky/clouds/seeweb.py +477 -0
- sky/clouds/shadeform.py +400 -0
- sky/clouds/slurm.py +578 -0
- sky/clouds/ssh.py +6 -3
- sky/clouds/utils/scp_utils.py +61 -50
- sky/clouds/vast.py +43 -27
- sky/clouds/vsphere.py +14 -16
- sky/core.py +296 -195
- sky/dashboard/out/404.html +1 -1
- sky/dashboard/out/_next/static/KYAhEFa3FTfq4JyKVgo-s/_buildManifest.js +1 -0
- sky/dashboard/out/_next/static/chunks/1141-9c810f01ff4f398a.js +11 -0
- sky/dashboard/out/_next/static/chunks/1871-7e202677c42f43fe.js +6 -0
- sky/dashboard/out/_next/static/chunks/2260-7703229c33c5ebd5.js +1 -0
- sky/dashboard/out/_next/static/chunks/2369.fc20f0c2c8ed9fe7.js +15 -0
- sky/dashboard/out/_next/static/chunks/2755.edd818326d489a1d.js +26 -0
- sky/dashboard/out/_next/static/chunks/3294.ddda8c6c6f9f24dc.js +1 -0
- sky/dashboard/out/_next/static/chunks/3785.7e245f318f9d1121.js +1 -0
- sky/dashboard/out/_next/static/chunks/{6601-06114c982db410b6.js → 3800-b589397dc09c5b4e.js} +1 -1
- sky/dashboard/out/_next/static/chunks/3850-fd5696f3bbbaddae.js +1 -0
- sky/dashboard/out/_next/static/chunks/4725.172ede95d1b21022.js +1 -0
- sky/dashboard/out/_next/static/chunks/4937.a2baa2df5572a276.js +15 -0
- sky/dashboard/out/_next/static/chunks/6212-7bd06f60ba693125.js +13 -0
- sky/dashboard/out/_next/static/chunks/6856-da20c5fd999f319c.js +1 -0
- sky/dashboard/out/_next/static/chunks/6990-09cbf02d3cd518c3.js +1 -0
- sky/dashboard/out/_next/static/chunks/7359-c8d04e06886000b3.js +30 -0
- sky/dashboard/out/_next/static/chunks/7615-019513abc55b3b47.js +1 -0
- sky/dashboard/out/_next/static/chunks/8640.5b9475a2d18c5416.js +16 -0
- sky/dashboard/out/_next/static/chunks/8969-452f9d5cbdd2dc73.js +1 -0
- sky/dashboard/out/_next/static/chunks/9025.fa408f3242e9028d.js +6 -0
- sky/dashboard/out/_next/static/chunks/9353-8369df1cf105221c.js +1 -0
- sky/dashboard/out/_next/static/chunks/9360.a536cf6b1fa42355.js +31 -0
- sky/dashboard/out/_next/static/chunks/9847.3aaca6bb33455140.js +30 -0
- sky/dashboard/out/_next/static/chunks/pages/_app-68b647e26f9d2793.js +34 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-33f525539665fdfd.js +16 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-a7565f586ef86467.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters-9e5d47818b9bdadd.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/{config-dfb9bf07b13045f4.js → config-718cdc365de82689.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/infra/{[context]-6563820e094f68ca.js → [context]-12c559ec4d81fdbd.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/{infra-aabba60d57826e0f.js → infra-d187cd0413d72475.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-895847b6cf200b04.js +16 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs/pools/[pool]-8d0f4655400b4eb9.js +21 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs-e5a98f17f8513a96.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/plugins/[...slug]-4f46050ca065d8f8.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/users-2f7646eb77785a2c.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/volumes-ef19d49c6d0e8500.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/workspaces/{[name]-af76bb06dbb3954f.js → [name]-96e0f298308da7e2.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/{workspaces-7598c33a746cdc91.js → workspaces-cb4da3abe08ebf19.js} +1 -1
- sky/dashboard/out/_next/static/chunks/webpack-fba3de387ff6bb08.js +1 -0
- sky/dashboard/out/_next/static/css/c5a4cfd2600fc715.css +3 -0
- sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
- sky/dashboard/out/clusters/[cluster].html +1 -1
- sky/dashboard/out/clusters.html +1 -1
- sky/dashboard/out/config.html +1 -1
- sky/dashboard/out/index.html +1 -1
- sky/dashboard/out/infra/[context].html +1 -1
- sky/dashboard/out/infra.html +1 -1
- sky/dashboard/out/jobs/[job].html +1 -1
- sky/dashboard/out/jobs/pools/[pool].html +1 -1
- sky/dashboard/out/jobs.html +1 -1
- sky/dashboard/out/plugins/[...slug].html +1 -0
- sky/dashboard/out/users.html +1 -1
- sky/dashboard/out/volumes.html +1 -1
- sky/dashboard/out/workspace/new.html +1 -1
- sky/dashboard/out/workspaces/[name].html +1 -1
- sky/dashboard/out/workspaces.html +1 -1
- sky/data/data_utils.py +92 -1
- sky/data/mounting_utils.py +177 -30
- sky/data/storage.py +200 -19
- sky/data/storage_utils.py +10 -45
- sky/exceptions.py +18 -7
- sky/execution.py +74 -31
- sky/global_user_state.py +605 -191
- sky/jobs/__init__.py +2 -0
- sky/jobs/client/sdk.py +101 -4
- sky/jobs/client/sdk_async.py +31 -5
- sky/jobs/constants.py +15 -8
- sky/jobs/controller.py +726 -284
- sky/jobs/file_content_utils.py +128 -0
- sky/jobs/log_gc.py +193 -0
- sky/jobs/recovery_strategy.py +250 -100
- sky/jobs/scheduler.py +271 -173
- sky/jobs/server/core.py +367 -114
- sky/jobs/server/server.py +81 -35
- sky/jobs/server/utils.py +89 -35
- sky/jobs/state.py +1498 -620
- sky/jobs/utils.py +771 -306
- sky/logs/agent.py +40 -5
- sky/logs/aws.py +9 -19
- sky/metrics/utils.py +282 -39
- sky/models.py +2 -0
- sky/optimizer.py +7 -6
- sky/provision/__init__.py +38 -1
- sky/provision/aws/config.py +34 -13
- sky/provision/aws/instance.py +5 -2
- sky/provision/azure/instance.py +5 -3
- sky/provision/common.py +22 -0
- sky/provision/cudo/instance.py +4 -3
- sky/provision/do/instance.py +4 -3
- sky/provision/docker_utils.py +112 -28
- sky/provision/fluidstack/instance.py +6 -5
- sky/provision/gcp/config.py +6 -1
- sky/provision/gcp/instance.py +4 -2
- sky/provision/hyperbolic/instance.py +4 -2
- sky/provision/instance_setup.py +66 -20
- sky/provision/kubernetes/__init__.py +2 -0
- sky/provision/kubernetes/config.py +7 -44
- sky/provision/kubernetes/constants.py +0 -1
- sky/provision/kubernetes/instance.py +609 -213
- sky/provision/kubernetes/manifests/fusermount-server-daemonset.yaml +1 -2
- sky/provision/kubernetes/network.py +12 -8
- sky/provision/kubernetes/network_utils.py +8 -25
- sky/provision/kubernetes/utils.py +422 -422
- sky/provision/kubernetes/volume.py +150 -18
- sky/provision/lambda_cloud/instance.py +16 -13
- sky/provision/nebius/instance.py +6 -2
- sky/provision/nebius/utils.py +103 -86
- sky/provision/oci/instance.py +4 -2
- sky/provision/paperspace/instance.py +4 -3
- sky/provision/primeintellect/__init__.py +10 -0
- sky/provision/primeintellect/config.py +11 -0
- sky/provision/primeintellect/instance.py +454 -0
- sky/provision/primeintellect/utils.py +398 -0
- sky/provision/provisioner.py +45 -15
- sky/provision/runpod/__init__.py +2 -0
- sky/provision/runpod/instance.py +4 -3
- sky/provision/runpod/volume.py +69 -13
- sky/provision/scp/instance.py +307 -130
- sky/provision/seeweb/__init__.py +11 -0
- sky/provision/seeweb/config.py +13 -0
- sky/provision/seeweb/instance.py +812 -0
- sky/provision/shadeform/__init__.py +11 -0
- sky/provision/shadeform/config.py +12 -0
- sky/provision/shadeform/instance.py +351 -0
- sky/provision/shadeform/shadeform_utils.py +83 -0
- sky/provision/slurm/__init__.py +12 -0
- sky/provision/slurm/config.py +13 -0
- sky/provision/slurm/instance.py +572 -0
- sky/provision/slurm/utils.py +583 -0
- sky/provision/vast/instance.py +9 -4
- sky/provision/vast/utils.py +10 -6
- sky/provision/volume.py +164 -0
- sky/provision/vsphere/common/ssl_helper.py +1 -1
- sky/provision/vsphere/common/vapiconnect.py +2 -1
- sky/provision/vsphere/common/vim_utils.py +3 -2
- sky/provision/vsphere/instance.py +8 -6
- sky/provision/vsphere/vsphere_utils.py +8 -1
- sky/resources.py +11 -3
- sky/schemas/api/responses.py +107 -6
- sky/schemas/db/global_user_state/008_skylet_ssh_tunnel_metadata.py +34 -0
- sky/schemas/db/global_user_state/009_last_activity_and_launched_at.py +89 -0
- sky/schemas/db/global_user_state/010_save_ssh_key.py +66 -0
- sky/schemas/db/global_user_state/011_is_ephemeral.py +34 -0
- sky/schemas/db/kv_cache/001_initial_schema.py +29 -0
- sky/schemas/db/serve_state/002_yaml_content.py +34 -0
- sky/schemas/db/skypilot_config/001_initial_schema.py +30 -0
- sky/schemas/db/spot_jobs/002_cluster_pool.py +3 -3
- sky/schemas/db/spot_jobs/004_job_file_contents.py +42 -0
- sky/schemas/db/spot_jobs/005_logs_gc.py +38 -0
- sky/schemas/db/spot_jobs/006_controller_pid_started_at.py +34 -0
- sky/schemas/db/spot_jobs/007_config_file_content.py +34 -0
- sky/schemas/generated/jobsv1_pb2.py +86 -0
- sky/schemas/generated/jobsv1_pb2.pyi +254 -0
- sky/schemas/generated/jobsv1_pb2_grpc.py +542 -0
- sky/schemas/generated/managed_jobsv1_pb2.py +76 -0
- sky/schemas/generated/managed_jobsv1_pb2.pyi +278 -0
- sky/schemas/generated/managed_jobsv1_pb2_grpc.py +278 -0
- sky/schemas/generated/servev1_pb2.py +58 -0
- sky/schemas/generated/servev1_pb2.pyi +115 -0
- sky/schemas/generated/servev1_pb2_grpc.py +322 -0
- sky/serve/autoscalers.py +2 -0
- sky/serve/client/impl.py +55 -21
- sky/serve/constants.py +4 -3
- sky/serve/controller.py +17 -11
- sky/serve/load_balancing_policies.py +1 -1
- sky/serve/replica_managers.py +219 -142
- sky/serve/serve_rpc_utils.py +179 -0
- sky/serve/serve_state.py +63 -54
- sky/serve/serve_utils.py +145 -109
- sky/serve/server/core.py +46 -25
- sky/serve/server/impl.py +311 -162
- sky/serve/server/server.py +21 -19
- sky/serve/service.py +84 -68
- sky/serve/service_spec.py +45 -7
- sky/server/auth/loopback.py +38 -0
- sky/server/auth/oauth2_proxy.py +12 -7
- sky/server/common.py +47 -24
- sky/server/config.py +62 -28
- sky/server/constants.py +9 -1
- sky/server/daemons.py +109 -38
- sky/server/metrics.py +76 -96
- sky/server/middleware_utils.py +166 -0
- sky/server/plugins.py +222 -0
- sky/server/requests/executor.py +384 -145
- sky/server/requests/payloads.py +83 -19
- sky/server/requests/preconditions.py +15 -13
- sky/server/requests/request_names.py +123 -0
- sky/server/requests/requests.py +511 -157
- sky/server/requests/serializers/decoders.py +48 -17
- sky/server/requests/serializers/encoders.py +102 -20
- sky/server/requests/serializers/return_value_serializers.py +60 -0
- sky/server/requests/threads.py +117 -0
- sky/server/rest.py +116 -24
- sky/server/server.py +497 -179
- sky/server/server_utils.py +30 -0
- sky/server/stream_utils.py +219 -45
- sky/server/uvicorn.py +30 -19
- sky/setup_files/MANIFEST.in +6 -1
- sky/setup_files/alembic.ini +8 -0
- sky/setup_files/dependencies.py +64 -19
- sky/setup_files/setup.py +44 -44
- sky/sky_logging.py +13 -5
- sky/skylet/attempt_skylet.py +116 -24
- sky/skylet/configs.py +3 -1
- sky/skylet/constants.py +139 -29
- sky/skylet/events.py +74 -14
- sky/skylet/executor/__init__.py +1 -0
- sky/skylet/executor/slurm.py +189 -0
- sky/skylet/job_lib.py +143 -105
- sky/skylet/log_lib.py +252 -8
- sky/skylet/log_lib.pyi +47 -7
- sky/skylet/providers/ibm/node_provider.py +12 -8
- sky/skylet/providers/ibm/vpc_provider.py +13 -12
- sky/skylet/runtime_utils.py +21 -0
- sky/skylet/services.py +524 -0
- sky/skylet/skylet.py +27 -2
- sky/skylet/subprocess_daemon.py +104 -28
- sky/skypilot_config.py +99 -79
- sky/ssh_node_pools/constants.py +12 -0
- sky/ssh_node_pools/core.py +40 -3
- sky/ssh_node_pools/deploy/__init__.py +4 -0
- sky/ssh_node_pools/deploy/deploy.py +952 -0
- sky/ssh_node_pools/deploy/tunnel_utils.py +199 -0
- sky/ssh_node_pools/deploy/utils.py +173 -0
- sky/ssh_node_pools/server.py +20 -21
- sky/{utils/kubernetes/ssh_utils.py → ssh_node_pools/utils.py} +9 -6
- sky/task.py +221 -104
- sky/templates/aws-ray.yml.j2 +1 -0
- sky/templates/azure-ray.yml.j2 +1 -0
- sky/templates/cudo-ray.yml.j2 +1 -0
- sky/templates/do-ray.yml.j2 +1 -0
- sky/templates/fluidstack-ray.yml.j2 +1 -0
- sky/templates/gcp-ray.yml.j2 +1 -0
- sky/templates/hyperbolic-ray.yml.j2 +1 -0
- sky/templates/ibm-ray.yml.j2 +2 -1
- sky/templates/jobs-controller.yaml.j2 +3 -0
- sky/templates/kubernetes-ray.yml.j2 +204 -55
- sky/templates/lambda-ray.yml.j2 +1 -0
- sky/templates/nebius-ray.yml.j2 +3 -0
- sky/templates/oci-ray.yml.j2 +1 -0
- sky/templates/paperspace-ray.yml.j2 +1 -0
- sky/templates/primeintellect-ray.yml.j2 +72 -0
- sky/templates/runpod-ray.yml.j2 +1 -0
- sky/templates/scp-ray.yml.j2 +1 -0
- sky/templates/seeweb-ray.yml.j2 +171 -0
- sky/templates/shadeform-ray.yml.j2 +73 -0
- sky/templates/slurm-ray.yml.j2 +85 -0
- sky/templates/vast-ray.yml.j2 +2 -0
- sky/templates/vsphere-ray.yml.j2 +1 -0
- sky/templates/websocket_proxy.py +188 -43
- sky/usage/usage_lib.py +16 -4
- sky/users/model.conf +1 -1
- sky/users/permission.py +84 -44
- sky/users/rbac.py +31 -3
- sky/utils/accelerator_registry.py +6 -3
- sky/utils/admin_policy_utils.py +18 -5
- sky/utils/annotations.py +128 -6
- sky/utils/asyncio_utils.py +78 -0
- sky/utils/atomic.py +1 -1
- sky/utils/auth_utils.py +153 -0
- sky/utils/cli_utils/status_utils.py +12 -7
- sky/utils/cluster_utils.py +28 -6
- sky/utils/command_runner.py +283 -30
- sky/utils/command_runner.pyi +63 -7
- sky/utils/common.py +3 -1
- sky/utils/common_utils.py +55 -7
- sky/utils/config_utils.py +1 -14
- sky/utils/context.py +127 -40
- sky/utils/context_utils.py +73 -18
- sky/utils/controller_utils.py +229 -70
- sky/utils/db/db_utils.py +95 -18
- sky/utils/db/kv_cache.py +149 -0
- sky/utils/db/migration_utils.py +24 -7
- sky/utils/env_options.py +4 -0
- sky/utils/git.py +559 -1
- sky/utils/kubernetes/create_cluster.sh +15 -30
- sky/utils/kubernetes/delete_cluster.sh +10 -7
- sky/utils/kubernetes/generate_kind_config.py +6 -66
- sky/utils/kubernetes/gpu_labeler.py +13 -3
- sky/utils/kubernetes/k8s_gpu_labeler_job.yaml +2 -1
- sky/utils/kubernetes/k8s_gpu_labeler_setup.yaml +16 -16
- sky/utils/kubernetes/kubernetes_deploy_utils.py +187 -260
- sky/utils/kubernetes/rsync_helper.sh +11 -3
- sky/utils/kubernetes/ssh-tunnel.sh +7 -376
- sky/utils/kubernetes_enums.py +7 -15
- sky/utils/lock_events.py +4 -4
- sky/utils/locks.py +128 -31
- sky/utils/log_utils.py +0 -319
- sky/utils/resource_checker.py +13 -10
- sky/utils/resources_utils.py +53 -29
- sky/utils/rich_utils.py +8 -4
- sky/utils/schemas.py +138 -52
- sky/utils/subprocess_utils.py +17 -4
- sky/utils/thread_utils.py +91 -0
- sky/utils/timeline.py +2 -1
- sky/utils/ux_utils.py +35 -1
- sky/utils/volume.py +88 -4
- sky/utils/yaml_utils.py +9 -0
- sky/volumes/client/sdk.py +48 -10
- sky/volumes/server/core.py +59 -22
- sky/volumes/server/server.py +46 -17
- sky/volumes/volume.py +54 -42
- sky/workspaces/core.py +57 -21
- sky/workspaces/server.py +13 -12
- sky_templates/README.md +3 -0
- sky_templates/__init__.py +3 -0
- sky_templates/ray/__init__.py +0 -0
- sky_templates/ray/start_cluster +183 -0
- sky_templates/ray/stop_cluster +75 -0
- {skypilot_nightly-1.0.0.dev20250905.dist-info → skypilot_nightly-1.0.0.dev20251210.dist-info}/METADATA +343 -65
- skypilot_nightly-1.0.0.dev20251210.dist-info/RECORD +629 -0
- skypilot_nightly-1.0.0.dev20251210.dist-info/top_level.txt +2 -0
- sky/client/cli/git.py +0 -549
- sky/dashboard/out/_next/static/chunks/1121-408ed10b2f9fce17.js +0 -1
- sky/dashboard/out/_next/static/chunks/1141-943efc7aff0f0c06.js +0 -1
- sky/dashboard/out/_next/static/chunks/1836-37fede578e2da5f8.js +0 -40
- sky/dashboard/out/_next/static/chunks/3015-86cabed5d4669ad0.js +0 -1
- sky/dashboard/out/_next/static/chunks/3294.c80326aec9bfed40.js +0 -6
- sky/dashboard/out/_next/static/chunks/3785.4872a2f3aa489880.js +0 -1
- sky/dashboard/out/_next/static/chunks/3850-ff4a9a69d978632b.js +0 -1
- sky/dashboard/out/_next/static/chunks/4045.b30465273dc5e468.js +0 -21
- sky/dashboard/out/_next/static/chunks/4676-9da7fdbde90b5549.js +0 -10
- sky/dashboard/out/_next/static/chunks/4725.10f7a9a5d3ea8208.js +0 -1
- sky/dashboard/out/_next/static/chunks/5339.3fda4a4010ff4e06.js +0 -51
- sky/dashboard/out/_next/static/chunks/6135-4b4d5e824b7f9d3c.js +0 -1
- sky/dashboard/out/_next/static/chunks/649.b9d7f7d10c1b8c53.js +0 -45
- sky/dashboard/out/_next/static/chunks/6856-dca7962af4814e1b.js +0 -1
- sky/dashboard/out/_next/static/chunks/6990-08b2a1cae076a943.js +0 -1
- sky/dashboard/out/_next/static/chunks/7325.b4bc99ce0892dcd5.js +0 -6
- sky/dashboard/out/_next/static/chunks/754-d0da8ab45f9509e9.js +0 -18
- sky/dashboard/out/_next/static/chunks/7669.1f5d9a402bf5cc42.js +0 -36
- sky/dashboard/out/_next/static/chunks/8969-0be3036bf86f8256.js +0 -1
- sky/dashboard/out/_next/static/chunks/9025.c12318fb6a1a9093.js +0 -6
- sky/dashboard/out/_next/static/chunks/9037-fa1737818d0a0969.js +0 -6
- sky/dashboard/out/_next/static/chunks/pages/_app-ce361c6959bc2001.js +0 -34
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-1cbba24bd1bd35f8.js +0 -16
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-0b4b35dc1dfe046c.js +0 -16
- sky/dashboard/out/_next/static/chunks/pages/clusters-469814d711d63b1b.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-dd64309c3fe67ed2.js +0 -11
- sky/dashboard/out/_next/static/chunks/pages/jobs/pools/[pool]-07349868f7905d37.js +0 -16
- sky/dashboard/out/_next/static/chunks/pages/jobs-1f70d9faa564804f.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/users-018bf31cda52e11b.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/volumes-739726d6b823f532.js +0 -1
- sky/dashboard/out/_next/static/chunks/webpack-4fe903277b57b523.js +0 -1
- sky/dashboard/out/_next/static/css/4614e06482d7309e.css +0 -3
- sky/dashboard/out/_next/static/mS-4qZPSkRuA1u-g2wQhg/_buildManifest.js +0 -1
- sky/templates/kubernetes-ssh-jump.yml.j2 +0 -94
- sky/utils/kubernetes/cleanup-tunnel.sh +0 -62
- sky/utils/kubernetes/deploy_remote_cluster.py +0 -1299
- sky/utils/kubernetes/ssh_jump_lifecycle_manager.py +0 -191
- skypilot_nightly-1.0.0.dev20250905.dist-info/RECORD +0 -547
- skypilot_nightly-1.0.0.dev20250905.dist-info/top_level.txt +0 -1
- /sky/dashboard/out/_next/static/{mS-4qZPSkRuA1u-g2wQhg → KYAhEFa3FTfq4JyKVgo-s}/_ssgManifest.js +0 -0
- {skypilot_nightly-1.0.0.dev20250905.dist-info → skypilot_nightly-1.0.0.dev20251210.dist-info}/WHEEL +0 -0
- {skypilot_nightly-1.0.0.dev20250905.dist-info → skypilot_nightly-1.0.0.dev20251210.dist-info}/entry_points.txt +0 -0
- {skypilot_nightly-1.0.0.dev20250905.dist-info → skypilot_nightly-1.0.0.dev20251210.dist-info}/licenses/LICENSE +0 -0
sky/server/metrics.py
CHANGED
|
@@ -1,74 +1,33 @@
|
|
|
1
1
|
"""Instrumentation for the API server."""
|
|
2
2
|
|
|
3
|
-
import
|
|
4
|
-
import
|
|
3
|
+
import asyncio
|
|
4
|
+
import multiprocessing
|
|
5
5
|
import os
|
|
6
|
+
import threading
|
|
6
7
|
import time
|
|
8
|
+
from typing import List
|
|
7
9
|
|
|
8
10
|
import fastapi
|
|
9
11
|
from prometheus_client import generate_latest
|
|
10
12
|
from prometheus_client import multiprocess
|
|
11
13
|
import prometheus_client as prom
|
|
14
|
+
import psutil
|
|
12
15
|
import starlette.middleware.base
|
|
13
16
|
import uvicorn
|
|
14
17
|
|
|
18
|
+
from sky import core
|
|
15
19
|
from sky import sky_logging
|
|
16
|
-
from sky.
|
|
17
|
-
|
|
18
|
-
# Whether the metrics are enabled, cannot be changed at runtime.
|
|
19
|
-
METRICS_ENABLED = os.environ.get(constants.ENV_VAR_SERVER_METRICS_ENABLED,
|
|
20
|
-
'false').lower() == 'true'
|
|
20
|
+
from sky.metrics import utils as metrics_utils
|
|
21
21
|
|
|
22
22
|
logger = sky_logging.init_logger(__name__)
|
|
23
23
|
|
|
24
|
-
# Total number of API server requests, grouped by path, method, and status.
|
|
25
|
-
SKY_APISERVER_REQUESTS_TOTAL = prom.Counter(
|
|
26
|
-
'sky_apiserver_requests_total',
|
|
27
|
-
'Total number of API server requests',
|
|
28
|
-
['path', 'method', 'status'],
|
|
29
|
-
)
|
|
30
|
-
|
|
31
|
-
# Time spent processing API server requests, grouped by path, method, and
|
|
32
|
-
# status.
|
|
33
|
-
SKY_APISERVER_REQUEST_DURATION_SECONDS = prom.Histogram(
|
|
34
|
-
'sky_apiserver_request_duration_seconds',
|
|
35
|
-
'Time spent processing API server requests',
|
|
36
|
-
['path', 'method', 'status'],
|
|
37
|
-
buckets=(0.01, 0.025, 0.05, 0.1, 0.25, 0.5, 1.0, 2.5, 5.0, 10.0, 20.0, 30.0,
|
|
38
|
-
60.0, 120.0, float('inf')),
|
|
39
|
-
)
|
|
40
|
-
|
|
41
|
-
# Time spent processing requests in executor.
|
|
42
|
-
SKY_APISERVER_REQUEST_EXECUTION_DURATION_SECONDS = prom.Histogram(
|
|
43
|
-
'sky_apiserver_request_execution_duration_seconds',
|
|
44
|
-
'Time spent executing requests in executor',
|
|
45
|
-
['request', 'worker'],
|
|
46
|
-
buckets=(0.5, 1, 2.5, 5.0, 10.0, 15.0, 25.0, 40.0, 60.0, 90.0, 120.0, 180.0,
|
|
47
|
-
float('inf')),
|
|
48
|
-
)
|
|
49
|
-
|
|
50
|
-
# Time spent processing a piece of code, refer to time_it().
|
|
51
|
-
SKY_APISERVER_CODE_DURATION_SECONDS = prom.Histogram(
|
|
52
|
-
'sky_apiserver_code_duration_seconds',
|
|
53
|
-
'Time spent processing code',
|
|
54
|
-
['name', 'group'],
|
|
55
|
-
buckets=(0.01, 0.025, 0.05, 0.1, 0.25, 0.5, 1.0, 2.5, 5.0, 10.0, 20.0, 30.0,
|
|
56
|
-
60.0, 120.0, float('inf')),
|
|
57
|
-
)
|
|
58
|
-
|
|
59
|
-
SKY_APISERVER_EVENT_LOOP_LAG_SECONDS = prom.Histogram(
|
|
60
|
-
'sky_apiserver_event_loop_lag_seconds',
|
|
61
|
-
'Scheduling delay of the server event loop',
|
|
62
|
-
['pid'],
|
|
63
|
-
buckets=(0.001, 0.005, 0.01, 0.025, 0.05, 0.1, 0.25, 0.5, 1, 2, 5, 20.0,
|
|
64
|
-
60.0, float('inf')),
|
|
65
|
-
)
|
|
66
|
-
|
|
67
24
|
metrics_app = fastapi.FastAPI()
|
|
68
25
|
|
|
69
26
|
|
|
27
|
+
# Serve /metrics in dedicated thread to avoid blocking the event loop
|
|
28
|
+
# of metrics server.
|
|
70
29
|
@metrics_app.get('/metrics')
|
|
71
|
-
|
|
30
|
+
def metrics() -> fastapi.Response:
|
|
72
31
|
"""Expose aggregated Prometheus metrics from all worker processes."""
|
|
73
32
|
if os.environ.get('PROMETHEUS_MULTIPROC_DIR'):
|
|
74
33
|
# In multiprocess mode, we need to collect metrics from all processes.
|
|
@@ -82,6 +41,42 @@ async def metrics() -> fastapi.Response:
|
|
|
82
41
|
headers={'Cache-Control': 'no-cache'})
|
|
83
42
|
|
|
84
43
|
|
|
44
|
+
@metrics_app.get('/gpu-metrics')
|
|
45
|
+
async def gpu_metrics() -> fastapi.Response:
|
|
46
|
+
"""Gets the GPU metrics from multiple external k8s clusters"""
|
|
47
|
+
contexts = core.get_all_contexts()
|
|
48
|
+
all_metrics: List[str] = []
|
|
49
|
+
successful_contexts = 0
|
|
50
|
+
|
|
51
|
+
tasks = [
|
|
52
|
+
asyncio.create_task(metrics_utils.get_metrics_for_context(context))
|
|
53
|
+
for context in contexts
|
|
54
|
+
if context != 'in-cluster'
|
|
55
|
+
]
|
|
56
|
+
|
|
57
|
+
results = await asyncio.gather(*tasks, return_exceptions=True)
|
|
58
|
+
|
|
59
|
+
for i, result in enumerate(results):
|
|
60
|
+
if isinstance(result, Exception):
|
|
61
|
+
logger.error(
|
|
62
|
+
f'Failed to get metrics for context {contexts[i]}: {result}')
|
|
63
|
+
elif isinstance(result, BaseException):
|
|
64
|
+
# Avoid changing behavior for non-Exception BaseExceptions
|
|
65
|
+
# like KeyboardInterrupt/SystemExit: re-raise them.
|
|
66
|
+
raise result
|
|
67
|
+
else:
|
|
68
|
+
metrics_text = result
|
|
69
|
+
all_metrics.append(metrics_text)
|
|
70
|
+
successful_contexts += 1
|
|
71
|
+
|
|
72
|
+
combined_metrics = '\n\n'.join(all_metrics)
|
|
73
|
+
|
|
74
|
+
# Return as plain text for Prometheus compatibility
|
|
75
|
+
return fastapi.Response(
|
|
76
|
+
content=combined_metrics,
|
|
77
|
+
media_type='text/plain; version=0.0.4; charset=utf-8')
|
|
78
|
+
|
|
79
|
+
|
|
85
80
|
def build_metrics_server(host: str, port: int) -> uvicorn.Server:
|
|
86
81
|
metrics_config = uvicorn.Config(
|
|
87
82
|
'sky.server.metrics:metrics_app',
|
|
@@ -125,56 +120,41 @@ class PrometheusMiddleware(starlette.middleware.base.BaseHTTPMiddleware):
|
|
|
125
120
|
status_code_group = '5xx'
|
|
126
121
|
raise
|
|
127
122
|
finally:
|
|
128
|
-
SKY_APISERVER_REQUESTS_TOTAL.labels(
|
|
129
|
-
|
|
130
|
-
status=status_code_group).inc()
|
|
123
|
+
metrics_utils.SKY_APISERVER_REQUESTS_TOTAL.labels(
|
|
124
|
+
path=path, method=method, status=status_code_group).inc()
|
|
131
125
|
if not streaming:
|
|
132
126
|
duration = time.time() - start_time
|
|
133
|
-
SKY_APISERVER_REQUEST_DURATION_SECONDS.labels(
|
|
127
|
+
metrics_utils.SKY_APISERVER_REQUEST_DURATION_SECONDS.labels(
|
|
134
128
|
path=path, method=method,
|
|
135
129
|
status=status_code_group).observe(duration)
|
|
136
130
|
|
|
137
131
|
return response
|
|
138
132
|
|
|
139
133
|
|
|
140
|
-
|
|
141
|
-
|
|
142
|
-
|
|
143
|
-
|
|
144
|
-
|
|
145
|
-
|
|
146
|
-
|
|
147
|
-
|
|
148
|
-
|
|
149
|
-
|
|
150
|
-
|
|
151
|
-
|
|
152
|
-
|
|
153
|
-
|
|
154
|
-
|
|
155
|
-
|
|
156
|
-
|
|
157
|
-
|
|
158
|
-
|
|
159
|
-
|
|
160
|
-
|
|
161
|
-
|
|
162
|
-
|
|
163
|
-
|
|
164
|
-
|
|
165
|
-
|
|
166
|
-
|
|
167
|
-
|
|
168
|
-
|
|
169
|
-
def time_me_async(func):
|
|
170
|
-
"""Measure the duration of decorated async function."""
|
|
171
|
-
|
|
172
|
-
@functools.wraps(func)
|
|
173
|
-
async def async_wrapper(*args, **kwargs):
|
|
174
|
-
if not METRICS_ENABLED:
|
|
175
|
-
return await func(*args, **kwargs)
|
|
176
|
-
name = f'{func.__module__}/{func.__name__}'
|
|
177
|
-
with time_it(name, group='function'):
|
|
178
|
-
return await func(*args, **kwargs)
|
|
179
|
-
|
|
180
|
-
return async_wrapper
|
|
134
|
+
peak_rss_bytes = 0
|
|
135
|
+
|
|
136
|
+
|
|
137
|
+
def process_monitor(process_type: str, stop: threading.Event):
|
|
138
|
+
pid = multiprocessing.current_process().pid
|
|
139
|
+
proc = psutil.Process(pid)
|
|
140
|
+
last_bucket_end = time.time()
|
|
141
|
+
bucket_peak = 0
|
|
142
|
+
global peak_rss_bytes
|
|
143
|
+
while not stop.is_set():
|
|
144
|
+
if time.time() - last_bucket_end >= 30:
|
|
145
|
+
# Reset peak RSS for the next time bucket.
|
|
146
|
+
last_bucket_end = time.time()
|
|
147
|
+
bucket_peak = 0
|
|
148
|
+
peak_rss_bytes = max(bucket_peak, proc.memory_info().rss)
|
|
149
|
+
metrics_utils.SKY_APISERVER_PROCESS_PEAK_RSS.labels(
|
|
150
|
+
pid=pid, type=process_type).set(peak_rss_bytes)
|
|
151
|
+
ctimes = proc.cpu_times()
|
|
152
|
+
metrics_utils.SKY_APISERVER_PROCESS_CPU_TOTAL.labels(pid=pid,
|
|
153
|
+
type=process_type,
|
|
154
|
+
mode='user').set(
|
|
155
|
+
ctimes.user)
|
|
156
|
+
metrics_utils.SKY_APISERVER_PROCESS_CPU_TOTAL.labels(pid=pid,
|
|
157
|
+
type=process_type,
|
|
158
|
+
mode='system').set(
|
|
159
|
+
ctimes.system)
|
|
160
|
+
time.sleep(1)
|
|
@@ -0,0 +1,166 @@
|
|
|
1
|
+
"""Utilities for building middlewares."""
|
|
2
|
+
import enum
|
|
3
|
+
import http
|
|
4
|
+
from typing import Type
|
|
5
|
+
|
|
6
|
+
import fastapi
|
|
7
|
+
import starlette.middleware.base
|
|
8
|
+
import starlette.types
|
|
9
|
+
|
|
10
|
+
from sky import sky_logging
|
|
11
|
+
|
|
12
|
+
logger = sky_logging.init_logger(__name__)
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
class WebSocketDecision(enum.Enum):
|
|
16
|
+
ACCEPT = 'accept'
|
|
17
|
+
UNAUTHORIZED = 'unauthorized'
|
|
18
|
+
FORBIDDEN = 'forbidden'
|
|
19
|
+
ERROR = 'error'
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
def websocket_aware(
|
|
23
|
+
middleware_cls: Type[starlette.middleware.base.BaseHTTPMiddleware]):
|
|
24
|
+
"""Decorator to adapt BaseHTTPMiddleware to handle WebSockets.
|
|
25
|
+
|
|
26
|
+
It assembles an HTTP-style request like the HTTP upgrade request during
|
|
27
|
+
websocket handshake and then delegates it to the real HTTP middleware.
|
|
28
|
+
The websocket connection will be rejected if the HTTP middleware returns
|
|
29
|
+
a 4xx or 5xx status code.
|
|
30
|
+
|
|
31
|
+
Note: for websocket connection, the mutation made by the underlying HTTP
|
|
32
|
+
middleware on the request and response will be discarded.
|
|
33
|
+
"""
|
|
34
|
+
|
|
35
|
+
class WebSocketAwareMiddleware:
    """Wrap an HTTP middleware so it is also consulted for websockets.

    Non-websocket scopes are passed straight to the wrapped middleware
    instance. For websocket scopes, the wrapped middleware's ``dispatch()``
    is run against a synthetic HTTP GET request; the connection is handed
    to the inner app only if that dispatch calls ``call_next`` and yields a
    2xx/3xx status.
    """

    def __init__(self, app: starlette.types.ASGIApp, *args, **kwargs):
        # Keep the raw app for accepted websockets and build the wrapped
        # HTTP middleware around the same app for everything else.
        self.app = app
        self.middleware = middleware_cls(app, *args, **kwargs)

    async def __call__(self, scope: starlette.types.Scope,
                       receive: starlette.types.Receive,
                       send: starlette.types.Send):
        if scope.get('type') == 'websocket':
            await self._handle_websocket(scope, receive, send)
            return
        # Delegate other scopes to the underlying HTTP middleware.
        await self.middleware(scope, receive, send)

    async def dispatch(
            self, request: fastapi.Request,
            call_next: starlette.middleware.base.RequestResponseEndpoint):
        """Forward to the wrapped middleware's dispatch for compatibility."""
        return await self.middleware.dispatch(request, call_next)

    async def _handle_websocket(self, scope: starlette.types.Scope,
                                receive: starlette.types.Receive,
                                send: starlette.types.Send):
        """Accept or close a websocket based on the HTTP middleware verdict.

        On ACCEPT the scope is forwarded to the inner app. Any other
        decision sends a single ``websocket.close`` event whose code and
        reason depend on the decision.
        """
        decision = await self._run_websocket_dispatch(scope)
        if decision == WebSocketDecision.ACCEPT:
            await self.app(scope, receive, send)
            return

        if decision == WebSocketDecision.UNAUTHORIZED:
            close_code, close_reason = 4401, 'Unauthorized'
        elif decision == WebSocketDecision.FORBIDDEN:
            close_code, close_reason = 4403, 'Forbidden'
        else:
            close_code, close_reason = 1011, 'Internal Server Error'
        await send({
            'type': 'websocket.close',
            'code': close_code,
            'reason': close_reason,
        })

    async def _run_websocket_dispatch(
            self, scope: starlette.types.Scope) -> WebSocketDecision:
        """Run the HTTP dispatch on a synthetic request and classify it.

        Returns:
            ACCEPT if ``call_next`` was invoked and the status is in
            [200, 400); UNAUTHORIZED / FORBIDDEN for 401 / 403 responses;
            ERROR if dispatch raised or for any other outcome.
        """
        fake_request = fastapi.Request(self._build_http_scope(scope),
                                       receive=self._http_receive_adapter())
        placeholder = fastapi.Response(status_code=http.HTTPStatus.OK)
        # Becomes non-empty once the wrapped middleware calls call_next(),
        # i.e. once it decided to let the request through.
        passed_through = []

        async def call_next(_unused_request):
            passed_through.append(True)
            return placeholder

        try:
            result = await self.dispatch(fake_request, call_next)
        except Exception as e:  # pylint: disable=broad-except
            logger.error('Exception occurred in middleware dispatch for '
                         f'WebSocket scope: {e}')
            return WebSocketDecision.ERROR

        # A dispatch that returns nothing is treated like the stub response.
        status_code = (placeholder if result is None else result).status_code

        if passed_through and 200 <= status_code < 400:
            return WebSocketDecision.ACCEPT
        if status_code == http.HTTPStatus.UNAUTHORIZED:
            return WebSocketDecision.UNAUTHORIZED
        if status_code == http.HTTPStatus.FORBIDDEN:
            return WebSocketDecision.FORBIDDEN
        return WebSocketDecision.ERROR

    @staticmethod
    def _build_http_scope(
            scope: starlette.types.Scope) -> starlette.types.Scope:
        """Derive an HTTP GET scope from a websocket scope.

        The ``state`` dict is created on the incoming scope if missing and
        the same dict object is placed in the returned copy.
        """
        shared_state = scope.setdefault('state', {})
        ws_scheme = scope.get('scheme', 'ws')
        if ws_scheme == 'ws':
            mapped_scheme = 'http'
        else:
            mapped_scheme = 'https' if ws_scheme == 'wss' else ws_scheme
        return {
            **scope,
            'type': 'http',
            'scheme': mapped_scheme,
            'method': 'GET',
            'http_version': scope.get('http_version', '1.1'),
            'state': shared_state,
        }

    @staticmethod
    def _http_receive_adapter() -> starlette.types.Receive:
        """Build a receive callable that mimics a body-less HTTP request.

        The first call yields a single empty ``http.request`` event; every
        later call yields ``http.disconnect``.
        """
        pending = [{
            'type': 'http.request',
            'body': b'',
            'more_body': False,
        }]

        async def receive():
            if pending:
                return pending.pop()
            return {
                'type': 'http.disconnect',
            }

        return receive
|
|
161
|
+
|
|
162
|
+
WebSocketAwareMiddleware.__name__ = middleware_cls.__name__
|
|
163
|
+
WebSocketAwareMiddleware.__qualname__ = middleware_cls.__qualname__
|
|
164
|
+
WebSocketAwareMiddleware.__module__ = middleware_cls.__module__
|
|
165
|
+
WebSocketAwareMiddleware.__doc__ = middleware_cls.__doc__
|
|
166
|
+
return WebSocketAwareMiddleware
|
sky/server/plugins.py
ADDED
|
@@ -0,0 +1,222 @@
|
|
|
1
|
+
"""Load plugins for the SkyPilot API server."""
|
|
2
|
+
import abc
|
|
3
|
+
import dataclasses
|
|
4
|
+
import importlib
|
|
5
|
+
import os
|
|
6
|
+
from typing import Dict, List, Optional, Tuple
|
|
7
|
+
|
|
8
|
+
from fastapi import FastAPI
|
|
9
|
+
|
|
10
|
+
from sky import sky_logging
|
|
11
|
+
from sky.skylet import constants as skylet_constants
|
|
12
|
+
from sky.utils import common_utils
|
|
13
|
+
from sky.utils import config_utils
|
|
14
|
+
from sky.utils import yaml_utils
|
|
15
|
+
|
|
16
|
+
# Module-level logger obtained from SkyPilot's logging helper.
logger = sky_logging.init_logger(__name__)

# Plugins config file used when the environment variable below is not set.
_DEFAULT_PLUGINS_CONFIG_PATH = '~/.sky/plugins.yaml'
# Name of the environment variable that overrides the plugins config path.
_PLUGINS_CONFIG_ENV_VAR = (
    f'{skylet_constants.SKYPILOT_SERVER_ENV_VAR_PREFIX}PLUGINS_CONFIG')
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
class ExtensionContext:
    """Object handed to each plugin's ``install()`` hook.

    Attributes:
        app: The FastAPI application instance, or None if not supplied.
        rbac_rules: ``(role, RBACRule)`` pairs accumulated through
            ``register_rbac_rule()``, in registration order. Example:
            [
                ('user', RBACRule(path='/plugins/api/xx/*', method='POST')),
                ('user', RBACRule(path='/plugins/api/xx/*', method='DELETE'))
            ]
    """

    def __init__(self, app: Optional[FastAPI] = None):
        self.rbac_rules: List[Tuple[str, RBACRule]] = []
        self.app = app

    def register_rbac_rule(self,
                           path: str,
                           method: str,
                           description: Optional[str] = None,
                           role: str = 'user') -> None:
        """Record an RBAC rule on behalf of a plugin.

        The rule is only stored here (and logged at debug level); this
        method does not enforce anything itself.
        NOTE(review): rules under the 'user' role are presumably applied as
        a blocklist for regular users by the RBAC layer - confirm there.

        Args:
            path: Path pattern of the rule (the pattern is described as
                keyMatch2-style with wildcards).
                Example: '/plugins/api/credentials/*'
            method: HTTP method of the rule. Example: 'POST', 'DELETE'
            description: Optional human-readable note; it is stored on the
                rule and included in the debug log line.
            role: Role the rule is attached to (default: 'user').

        Example:
            def install(self, ctx: ExtensionContext):
                ctx.register_rbac_rule(
                    path='/plugins/api/credentials/*',
                    method='POST',
                    description='Only admin can upload credentials'
                )
        """
        self.rbac_rules.append(
            (role, RBACRule(path=path, method=method,
                            description=description)))
        # Only mention the description in the log when one was given.
        suffix = f' - {description}' if description else ''
        logger.debug(f'Registered RBAC rule for {role}: {method} {path}'
                     f'{suffix}')
|
|
73
|
+
|
|
74
|
+
|
|
75
|
+
@dataclasses.dataclass
class RBACRule:
    """A single path/method pair that a plugin registers for RBAC.

    Attributes:
        path: Path pattern of the rule (described as keyMatch2-style with
            wildcards). Example: '/plugins/api/credentials/*'
        method: HTTP method of the rule. Example: 'POST', 'DELETE'
        description: Optional free-text note about the rule; defaults to
            None.
    """
    path: str
    method: str
    description: Optional[str] = dataclasses.field(default=None)
|
|
88
|
+
|
|
89
|
+
|
|
90
|
+
class BasePlugin(abc.ABC):
    """Abstract base that every SkyPilot API server plugin derives from.

    Subclasses must implement ``install()``; ``js_extension_path`` and
    ``shutdown()`` come with no-op defaults.
    """

    @abc.abstractmethod
    def install(self, extension_context: ExtensionContext):
        """Install the plugin; invoked by the API server when loading it.

        Args:
            extension_context: Context the plugin can use while installing,
                e.g. to register RBAC rules.
        """
        raise NotImplementedError

    @property
    def js_extension_path(self) -> Optional[str]:
        """API route of a JavaScript extension to load, if any.

        The base implementation returns None.
        """
        return None

    def shutdown(self):
        """Shutdown hook invoked by the API server; does nothing here."""
|
|
106
|
+
|
|
107
|
+
|
|
108
|
+
def _config_schema():
    """Return the JSON schema describing the plugins config file.

    The top level is an object whose only allowed key is ``plugins``: a
    list of objects, each with a required string ``class`` and an optional
    free-form ``parameters`` object. A fresh dict is built on every call.
    """
    # Arbitrary keyword arguments passed to the plugin constructor.
    parameters_schema = {
        'type': 'object',
        'required': [],
        'additionalProperties': True,
    }
    # One entry of the `plugins` list.
    plugin_entry_schema = {
        'type': 'object',
        'required': ['class'],
        'additionalProperties': False,
        'properties': {
            'class': {
                'type': 'string',
            },
            'parameters': parameters_schema,
        },
    }
    plugin_list_schema = {
        'type': 'array',
        'items': plugin_entry_schema,
        'default': [],
    }
    return {
        'type': 'object',
        'required': [],
        'additionalProperties': False,
        'properties': {
            'plugins': plugin_list_schema,
        },
    }
|
|
136
|
+
|
|
137
|
+
|
|
138
|
+
def _load_plugin_config() -> Optional[config_utils.Config]:
    """Read and validate the plugins config file.

    The path is taken from the environment variable named by
    ``_PLUGINS_CONFIG_ENV_VAR``, falling back to
    ``_DEFAULT_PLUGINS_CONFIG_PATH``; ``~`` is expanded.

    Returns:
        The validated config wrapped in ``config_utils.Config``, or None
        when the file does not exist. A file that parses to a falsy value
        is treated as an empty config.
    """
    resolved_path = os.path.expanduser(
        os.getenv(_PLUGINS_CONFIG_ENV_VAR, _DEFAULT_PLUGINS_CONFIG_PATH))
    if os.path.exists(resolved_path):
        raw_config = yaml_utils.read_yaml(resolved_path) or {}
        common_utils.validate_schema(
            raw_config,
            _config_schema(),
            err_msg_prefix='Invalid plugins config: ')
        return config_utils.Config.from_dict(raw_config)
    return None
|
|
150
|
+
|
|
151
|
+
|
|
152
|
+
# Installed plugin instances, keyed by the class path given in the config;
# filled in by load_plugins().
_PLUGINS: Dict[str, BasePlugin] = {}
# Context passed to the most recent load_plugins() call; None before that.
_EXTENSION_CONTEXT: Optional[ExtensionContext] = None
|
|
154
|
+
|
|
155
|
+
|
|
156
|
+
def load_plugins(extension_context: ExtensionContext):
    """Load and initialize plugins from the config.

    Stores ``extension_context`` in the module-level ``_EXTENSION_CONTEXT``
    and then, for every entry under ``plugins`` in the plugins config,
    imports the class named by ``class``, instantiates it with the entry's
    ``parameters`` as keyword arguments, calls its ``install()`` hook and
    records the instance in ``_PLUGINS`` under its class path. Nothing is
    loaded when the config is missing or empty.

    Args:
        extension_context: Context passed to every plugin's ``install()``.

    Raises:
        ValueError: If a ``class`` value is not of the form
            ``<module path>.<class name>``.
        ImportError: If the plugin module cannot be imported.
        AttributeError: If the module does not define the named class.
        TypeError: If the resolved object is not a ``BasePlugin`` subclass.
    """
    global _EXTENSION_CONTEXT
    _EXTENSION_CONTEXT = extension_context

    config = _load_plugin_config()
    if not config:
        return

    for plugin_config in config.get('plugins', []):
        class_path = plugin_config['class']
        # rpartition (unlike unpacking rsplit) lets us report a dot-less or
        # module-less class path with a clear message instead of an opaque
        # unpacking/"Empty module name" ValueError.
        module_path, separator, class_name = class_path.rpartition('.')
        if not separator or not module_path:
            raise ValueError(
                f'Invalid plugin class path: {class_path!r}. Expected the '
                'form "<module path>.<class name>".')
        try:
            module = importlib.import_module(module_path)
        except ImportError as e:
            raise ImportError(
                f'Failed to import plugin module: {module_path}. '
                'Please check if the module is installed in your Python '
                'environment.') from e
        try:
            plugin_cls = getattr(module, class_name)
        except AttributeError as e:
            raise AttributeError(
                f'Could not find plugin {class_name} class in module '
                f'{module_path}. ') from e
        # issubclass() itself raises a cryptic TypeError for non-class
        # objects (e.g. a function), so rule those out first.
        if not (isinstance(plugin_cls, type) and
                issubclass(plugin_cls, BasePlugin)):
            raise TypeError(
                f'Plugin {class_path} must inherit from BasePlugin.')
        parameters = plugin_config.get('parameters') or {}
        plugin = plugin_cls(**parameters)
        plugin.install(extension_context)
        _PLUGINS[class_path] = plugin
|
|
188
|
+
|
|
189
|
+
|
|
190
|
+
def get_plugins() -> List[BasePlugin]:
    """Return the registered plugin instances in a new list.

    The list is a fresh object on each call, but its elements are the same
    plugin instances held in ``_PLUGINS`` (they are not copied).
    """
    return [*_PLUGINS.values()]
|
|
193
|
+
|
|
194
|
+
|
|
195
|
+
def get_plugin_rbac_rules() -> Dict[str, List[Dict[str, str]]]:
    """Group the RBAC rules registered on the extension context by role.

    Only each rule's ``path`` and ``method`` are emitted; its description
    is not included.

    Returns:
        Dictionary mapping role names to lists of rule dicts, in
        registration order; empty when no context has been set.
        Example:
            {
                'user': [
                    {'path': '/plugins/api/credentials/*', 'method': 'POST'},
                    {'path': '/plugins/api/credentials/*', 'method': 'DELETE'}
                ]
            }
    """
    grouped: Dict[str, List[Dict[str, str]]] = {}

    # Nothing to collect until load_plugins() has stored a context.
    if not _EXTENSION_CONTEXT:
        return grouped

    for role, rule in _EXTENSION_CONTEXT.rbac_rules:
        entry = {
            'path': rule.path,
            'method': rule.method,
        }
        grouped.setdefault(role, []).append(entry)

    return grouped
|