skypilot-nightly 1.0.0.dev20250509__py3-none-any.whl → 1.0.0.dev20251107__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of skypilot-nightly might be problematic. Click here for more details.
- sky/__init__.py +22 -6
- sky/adaptors/aws.py +25 -7
- sky/adaptors/common.py +24 -1
- sky/adaptors/coreweave.py +278 -0
- sky/adaptors/do.py +8 -2
- sky/adaptors/hyperbolic.py +8 -0
- sky/adaptors/kubernetes.py +149 -18
- sky/adaptors/nebius.py +170 -17
- sky/adaptors/primeintellect.py +1 -0
- sky/adaptors/runpod.py +68 -0
- sky/adaptors/seeweb.py +167 -0
- sky/adaptors/shadeform.py +89 -0
- sky/admin_policy.py +187 -4
- sky/authentication.py +179 -225
- sky/backends/__init__.py +4 -2
- sky/backends/backend.py +22 -9
- sky/backends/backend_utils.py +1299 -380
- sky/backends/cloud_vm_ray_backend.py +1715 -518
- sky/backends/docker_utils.py +1 -1
- sky/backends/local_docker_backend.py +11 -6
- sky/backends/wheel_utils.py +37 -9
- sky/{clouds/service_catalog → catalog}/__init__.py +21 -19
- sky/{clouds/service_catalog → catalog}/aws_catalog.py +27 -8
- sky/{clouds/service_catalog → catalog}/azure_catalog.py +10 -7
- sky/{clouds/service_catalog → catalog}/common.py +89 -48
- sky/{clouds/service_catalog → catalog}/cudo_catalog.py +8 -5
- sky/{clouds/service_catalog → catalog}/data_fetchers/analyze.py +1 -1
- sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_aws.py +30 -40
- sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_cudo.py +38 -38
- sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_gcp.py +42 -15
- sky/catalog/data_fetchers/fetch_hyperbolic.py +136 -0
- sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_lambda_cloud.py +1 -0
- sky/catalog/data_fetchers/fetch_nebius.py +335 -0
- sky/catalog/data_fetchers/fetch_runpod.py +698 -0
- sky/catalog/data_fetchers/fetch_seeweb.py +329 -0
- sky/catalog/data_fetchers/fetch_shadeform.py +142 -0
- sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_vast.py +1 -1
- sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_vsphere.py +1 -1
- sky/{clouds/service_catalog → catalog}/do_catalog.py +5 -2
- sky/{clouds/service_catalog → catalog}/fluidstack_catalog.py +6 -3
- sky/{clouds/service_catalog → catalog}/gcp_catalog.py +41 -15
- sky/catalog/hyperbolic_catalog.py +136 -0
- sky/{clouds/service_catalog → catalog}/ibm_catalog.py +9 -6
- sky/{clouds/service_catalog → catalog}/kubernetes_catalog.py +36 -24
- sky/{clouds/service_catalog → catalog}/lambda_catalog.py +9 -6
- sky/{clouds/service_catalog → catalog}/nebius_catalog.py +9 -7
- sky/{clouds/service_catalog → catalog}/oci_catalog.py +9 -6
- sky/{clouds/service_catalog → catalog}/paperspace_catalog.py +5 -2
- sky/catalog/primeintellect_catalog.py +95 -0
- sky/{clouds/service_catalog → catalog}/runpod_catalog.py +11 -4
- sky/{clouds/service_catalog → catalog}/scp_catalog.py +9 -6
- sky/catalog/seeweb_catalog.py +184 -0
- sky/catalog/shadeform_catalog.py +165 -0
- sky/catalog/ssh_catalog.py +167 -0
- sky/{clouds/service_catalog → catalog}/vast_catalog.py +6 -3
- sky/{clouds/service_catalog → catalog}/vsphere_catalog.py +5 -2
- sky/check.py +491 -203
- sky/cli.py +5 -6005
- sky/client/{cli.py → cli/command.py} +2477 -1885
- sky/client/cli/deprecation_utils.py +99 -0
- sky/client/cli/flags.py +359 -0
- sky/client/cli/table_utils.py +320 -0
- sky/client/common.py +70 -32
- sky/client/oauth.py +82 -0
- sky/client/sdk.py +1203 -297
- sky/client/sdk_async.py +833 -0
- sky/client/service_account_auth.py +47 -0
- sky/cloud_stores.py +73 -0
- sky/clouds/__init__.py +13 -0
- sky/clouds/aws.py +358 -93
- sky/clouds/azure.py +105 -83
- sky/clouds/cloud.py +127 -36
- sky/clouds/cudo.py +68 -50
- sky/clouds/do.py +66 -48
- sky/clouds/fluidstack.py +63 -44
- sky/clouds/gcp.py +339 -110
- sky/clouds/hyperbolic.py +293 -0
- sky/clouds/ibm.py +70 -49
- sky/clouds/kubernetes.py +563 -162
- sky/clouds/lambda_cloud.py +74 -54
- sky/clouds/nebius.py +206 -80
- sky/clouds/oci.py +88 -66
- sky/clouds/paperspace.py +61 -44
- sky/clouds/primeintellect.py +317 -0
- sky/clouds/runpod.py +164 -74
- sky/clouds/scp.py +89 -83
- sky/clouds/seeweb.py +466 -0
- sky/clouds/shadeform.py +400 -0
- sky/clouds/ssh.py +263 -0
- sky/clouds/utils/aws_utils.py +10 -4
- sky/clouds/utils/gcp_utils.py +87 -11
- sky/clouds/utils/oci_utils.py +38 -14
- sky/clouds/utils/scp_utils.py +177 -124
- sky/clouds/vast.py +99 -77
- sky/clouds/vsphere.py +51 -40
- sky/core.py +349 -139
- sky/dag.py +15 -0
- sky/dashboard/out/404.html +1 -1
- sky/dashboard/out/_next/static/chunks/1141-e6aa9ab418717c59.js +11 -0
- sky/dashboard/out/_next/static/chunks/1272-1ef0bf0237faccdb.js +1 -0
- sky/dashboard/out/_next/static/chunks/1871-74503c8e80fd253b.js +6 -0
- sky/dashboard/out/_next/static/chunks/2260-7703229c33c5ebd5.js +1 -0
- sky/dashboard/out/_next/static/chunks/2350.fab69e61bac57b23.js +1 -0
- sky/dashboard/out/_next/static/chunks/2369.fc20f0c2c8ed9fe7.js +15 -0
- sky/dashboard/out/_next/static/chunks/2755.fff53c4a3fcae910.js +26 -0
- sky/dashboard/out/_next/static/chunks/3294.72362fa129305b19.js +1 -0
- sky/dashboard/out/_next/static/chunks/3785.ad6adaa2a0fa9768.js +1 -0
- sky/dashboard/out/_next/static/chunks/3850-ff4a9a69d978632b.js +1 -0
- sky/dashboard/out/_next/static/chunks/3937.210053269f121201.js +1 -0
- sky/dashboard/out/_next/static/chunks/4725.a830b5c9e7867c92.js +1 -0
- sky/dashboard/out/_next/static/chunks/4937.a2baa2df5572a276.js +15 -0
- sky/dashboard/out/_next/static/chunks/5739-d67458fcb1386c92.js +8 -0
- sky/dashboard/out/_next/static/chunks/6130-2be46d70a38f1e82.js +1 -0
- sky/dashboard/out/_next/static/chunks/616-3d59f75e2ccf9321.js +39 -0
- sky/dashboard/out/_next/static/chunks/6212-7bd06f60ba693125.js +13 -0
- sky/dashboard/out/_next/static/chunks/6601-06114c982db410b6.js +1 -0
- sky/dashboard/out/_next/static/chunks/6856-ef8ba11f96d8c4a3.js +1 -0
- sky/dashboard/out/_next/static/chunks/6989-01359c57e018caa4.js +1 -0
- sky/dashboard/out/_next/static/chunks/6990-32b6e2d3822301fa.js +1 -0
- sky/dashboard/out/_next/static/chunks/7359-c8d04e06886000b3.js +30 -0
- sky/dashboard/out/_next/static/chunks/7411-b15471acd2cba716.js +41 -0
- sky/dashboard/out/_next/static/chunks/7615-3301e838e5f25772.js +1 -0
- sky/dashboard/out/_next/static/chunks/8640.5b9475a2d18c5416.js +16 -0
- sky/dashboard/out/_next/static/chunks/8969-1e4613c651bf4051.js +1 -0
- sky/dashboard/out/_next/static/chunks/9025.fa408f3242e9028d.js +6 -0
- sky/dashboard/out/_next/static/chunks/9353-cff34f7e773b2e2b.js +1 -0
- sky/dashboard/out/_next/static/chunks/9360.7310982cf5a0dc79.js +31 -0
- sky/dashboard/out/_next/static/chunks/9847.3aaca6bb33455140.js +30 -0
- sky/dashboard/out/_next/static/chunks/fd9d1056-86323a29a8f7e46a.js +1 -0
- sky/dashboard/out/_next/static/chunks/framework-cf60a09ccd051a10.js +33 -0
- sky/dashboard/out/_next/static/chunks/main-app-587214043926b3cc.js +1 -0
- sky/dashboard/out/_next/static/chunks/main-f15ccb73239a3bf1.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/_app-bde01e4a2beec258.js +34 -0
- sky/dashboard/out/_next/static/chunks/pages/_error-c66a4e8afc46f17b.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-c736ead69c2d86ec.js +16 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-a37d2063af475a1c.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters-d44859594e6f8064.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/config-dfb9bf07b13045f4.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/index-444f1804401f04ea.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/infra/[context]-c0b5935149902e6f.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/infra-aed0ea19df7cf961.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-5796e8d6aea291a0.js +16 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs/pools/[pool]-6edeb7d06032adfc.js +21 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs-479dde13399cf270.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/users-5ab3b907622cf0fe.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/volumes-b84b948ff357c43e.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/workspace/new-3f88a1c7e86a3f86.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/workspaces/[name]-c5a3eeee1c218af1.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/workspaces-22b23febb3e89ce1.js +1 -0
- sky/dashboard/out/_next/static/chunks/webpack-2679be77fc08a2f8.js +1 -0
- sky/dashboard/out/_next/static/css/0748ce22df867032.css +3 -0
- sky/dashboard/out/_next/static/zB0ed6ge_W1MDszVHhijS/_buildManifest.js +1 -0
- sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
- sky/dashboard/out/clusters/[cluster].html +1 -1
- sky/dashboard/out/clusters.html +1 -1
- sky/dashboard/out/config.html +1 -0
- sky/dashboard/out/index.html +1 -1
- sky/dashboard/out/infra/[context].html +1 -0
- sky/dashboard/out/infra.html +1 -0
- sky/dashboard/out/jobs/[job].html +1 -1
- sky/dashboard/out/jobs/pools/[pool].html +1 -0
- sky/dashboard/out/jobs.html +1 -1
- sky/dashboard/out/users.html +1 -0
- sky/dashboard/out/volumes.html +1 -0
- sky/dashboard/out/workspace/new.html +1 -0
- sky/dashboard/out/workspaces/[name].html +1 -0
- sky/dashboard/out/workspaces.html +1 -0
- sky/data/data_utils.py +137 -1
- sky/data/mounting_utils.py +269 -84
- sky/data/storage.py +1451 -1807
- sky/data/storage_utils.py +43 -57
- sky/exceptions.py +132 -2
- sky/execution.py +206 -63
- sky/global_user_state.py +2374 -586
- sky/jobs/__init__.py +5 -0
- sky/jobs/client/sdk.py +242 -65
- sky/jobs/client/sdk_async.py +143 -0
- sky/jobs/constants.py +9 -8
- sky/jobs/controller.py +839 -277
- sky/jobs/file_content_utils.py +80 -0
- sky/jobs/log_gc.py +201 -0
- sky/jobs/recovery_strategy.py +398 -152
- sky/jobs/scheduler.py +315 -189
- sky/jobs/server/core.py +829 -255
- sky/jobs/server/server.py +156 -115
- sky/jobs/server/utils.py +136 -0
- sky/jobs/state.py +2092 -701
- sky/jobs/utils.py +1242 -160
- sky/logs/__init__.py +21 -0
- sky/logs/agent.py +108 -0
- sky/logs/aws.py +243 -0
- sky/logs/gcp.py +91 -0
- sky/metrics/__init__.py +0 -0
- sky/metrics/utils.py +443 -0
- sky/models.py +78 -1
- sky/optimizer.py +164 -70
- sky/provision/__init__.py +90 -4
- sky/provision/aws/config.py +147 -26
- sky/provision/aws/instance.py +135 -50
- sky/provision/azure/instance.py +10 -5
- sky/provision/common.py +13 -1
- sky/provision/cudo/cudo_machine_type.py +1 -1
- sky/provision/cudo/cudo_utils.py +14 -8
- sky/provision/cudo/cudo_wrapper.py +72 -71
- sky/provision/cudo/instance.py +10 -6
- sky/provision/do/instance.py +10 -6
- sky/provision/do/utils.py +4 -3
- sky/provision/docker_utils.py +114 -23
- sky/provision/fluidstack/instance.py +13 -8
- sky/provision/gcp/__init__.py +1 -0
- sky/provision/gcp/config.py +301 -19
- sky/provision/gcp/constants.py +218 -0
- sky/provision/gcp/instance.py +36 -8
- sky/provision/gcp/instance_utils.py +18 -4
- sky/provision/gcp/volume_utils.py +247 -0
- sky/provision/hyperbolic/__init__.py +12 -0
- sky/provision/hyperbolic/config.py +10 -0
- sky/provision/hyperbolic/instance.py +437 -0
- sky/provision/hyperbolic/utils.py +373 -0
- sky/provision/instance_setup.py +93 -14
- sky/provision/kubernetes/__init__.py +5 -0
- sky/provision/kubernetes/config.py +9 -52
- sky/provision/kubernetes/constants.py +17 -0
- sky/provision/kubernetes/instance.py +789 -247
- sky/provision/kubernetes/manifests/fusermount-server-daemonset.yaml +1 -2
- sky/provision/kubernetes/network.py +27 -17
- sky/provision/kubernetes/network_utils.py +40 -43
- sky/provision/kubernetes/utils.py +1192 -531
- sky/provision/kubernetes/volume.py +282 -0
- sky/provision/lambda_cloud/instance.py +22 -16
- sky/provision/nebius/constants.py +50 -0
- sky/provision/nebius/instance.py +19 -6
- sky/provision/nebius/utils.py +196 -91
- sky/provision/oci/instance.py +10 -5
- sky/provision/paperspace/instance.py +10 -7
- sky/provision/paperspace/utils.py +1 -1
- sky/provision/primeintellect/__init__.py +10 -0
- sky/provision/primeintellect/config.py +11 -0
- sky/provision/primeintellect/instance.py +454 -0
- sky/provision/primeintellect/utils.py +398 -0
- sky/provision/provisioner.py +110 -36
- sky/provision/runpod/__init__.py +5 -0
- sky/provision/runpod/instance.py +27 -6
- sky/provision/runpod/utils.py +51 -18
- sky/provision/runpod/volume.py +180 -0
- sky/provision/scp/__init__.py +15 -0
- sky/provision/scp/config.py +93 -0
- sky/provision/scp/instance.py +531 -0
- sky/provision/seeweb/__init__.py +11 -0
- sky/provision/seeweb/config.py +13 -0
- sky/provision/seeweb/instance.py +807 -0
- sky/provision/shadeform/__init__.py +11 -0
- sky/provision/shadeform/config.py +12 -0
- sky/provision/shadeform/instance.py +351 -0
- sky/provision/shadeform/shadeform_utils.py +83 -0
- sky/provision/ssh/__init__.py +18 -0
- sky/provision/vast/instance.py +13 -8
- sky/provision/vast/utils.py +10 -7
- sky/provision/vsphere/common/vim_utils.py +1 -2
- sky/provision/vsphere/instance.py +15 -10
- sky/provision/vsphere/vsphere_utils.py +9 -19
- sky/py.typed +0 -0
- sky/resources.py +844 -118
- sky/schemas/__init__.py +0 -0
- sky/schemas/api/__init__.py +0 -0
- sky/schemas/api/responses.py +225 -0
- sky/schemas/db/README +4 -0
- sky/schemas/db/env.py +90 -0
- sky/schemas/db/global_user_state/001_initial_schema.py +124 -0
- sky/schemas/db/global_user_state/002_add_workspace_to_cluster_history.py +35 -0
- sky/schemas/db/global_user_state/003_fix_initial_revision.py +61 -0
- sky/schemas/db/global_user_state/004_is_managed.py +34 -0
- sky/schemas/db/global_user_state/005_cluster_event.py +32 -0
- sky/schemas/db/global_user_state/006_provision_log.py +41 -0
- sky/schemas/db/global_user_state/007_cluster_event_request_id.py +34 -0
- sky/schemas/db/global_user_state/008_skylet_ssh_tunnel_metadata.py +34 -0
- sky/schemas/db/global_user_state/009_last_activity_and_launched_at.py +89 -0
- sky/schemas/db/global_user_state/010_save_ssh_key.py +66 -0
- sky/schemas/db/script.py.mako +28 -0
- sky/schemas/db/serve_state/001_initial_schema.py +67 -0
- sky/schemas/db/skypilot_config/001_initial_schema.py +30 -0
- sky/schemas/db/spot_jobs/001_initial_schema.py +97 -0
- sky/schemas/db/spot_jobs/002_cluster_pool.py +42 -0
- sky/schemas/db/spot_jobs/003_pool_hash.py +34 -0
- sky/schemas/db/spot_jobs/004_job_file_contents.py +42 -0
- sky/schemas/db/spot_jobs/005_logs_gc.py +38 -0
- sky/schemas/generated/__init__.py +0 -0
- sky/schemas/generated/autostopv1_pb2.py +36 -0
- sky/schemas/generated/autostopv1_pb2.pyi +43 -0
- sky/schemas/generated/autostopv1_pb2_grpc.py +146 -0
- sky/schemas/generated/jobsv1_pb2.py +86 -0
- sky/schemas/generated/jobsv1_pb2.pyi +254 -0
- sky/schemas/generated/jobsv1_pb2_grpc.py +542 -0
- sky/schemas/generated/managed_jobsv1_pb2.py +74 -0
- sky/schemas/generated/managed_jobsv1_pb2.pyi +278 -0
- sky/schemas/generated/managed_jobsv1_pb2_grpc.py +278 -0
- sky/schemas/generated/servev1_pb2.py +58 -0
- sky/schemas/generated/servev1_pb2.pyi +115 -0
- sky/schemas/generated/servev1_pb2_grpc.py +322 -0
- sky/serve/autoscalers.py +357 -5
- sky/serve/client/impl.py +310 -0
- sky/serve/client/sdk.py +47 -139
- sky/serve/client/sdk_async.py +130 -0
- sky/serve/constants.py +10 -8
- sky/serve/controller.py +64 -19
- sky/serve/load_balancer.py +106 -60
- sky/serve/load_balancing_policies.py +115 -1
- sky/serve/replica_managers.py +273 -162
- sky/serve/serve_rpc_utils.py +179 -0
- sky/serve/serve_state.py +554 -251
- sky/serve/serve_utils.py +733 -220
- sky/serve/server/core.py +66 -711
- sky/serve/server/impl.py +1093 -0
- sky/serve/server/server.py +21 -18
- sky/serve/service.py +133 -48
- sky/serve/service_spec.py +135 -16
- sky/serve/spot_placer.py +3 -0
- sky/server/auth/__init__.py +0 -0
- sky/server/auth/authn.py +50 -0
- sky/server/auth/loopback.py +38 -0
- sky/server/auth/oauth2_proxy.py +200 -0
- sky/server/common.py +475 -181
- sky/server/config.py +81 -23
- sky/server/constants.py +44 -6
- sky/server/daemons.py +229 -0
- sky/server/html/token_page.html +185 -0
- sky/server/metrics.py +160 -0
- sky/server/requests/executor.py +528 -138
- sky/server/requests/payloads.py +351 -17
- sky/server/requests/preconditions.py +21 -17
- sky/server/requests/process.py +112 -29
- sky/server/requests/request_names.py +120 -0
- sky/server/requests/requests.py +817 -224
- sky/server/requests/serializers/decoders.py +82 -31
- sky/server/requests/serializers/encoders.py +140 -22
- sky/server/requests/threads.py +106 -0
- sky/server/rest.py +417 -0
- sky/server/server.py +1290 -284
- sky/server/state.py +20 -0
- sky/server/stream_utils.py +345 -57
- sky/server/uvicorn.py +217 -3
- sky/server/versions.py +270 -0
- sky/setup_files/MANIFEST.in +5 -0
- sky/setup_files/alembic.ini +156 -0
- sky/setup_files/dependencies.py +136 -31
- sky/setup_files/setup.py +44 -42
- sky/sky_logging.py +102 -5
- sky/skylet/attempt_skylet.py +1 -0
- sky/skylet/autostop_lib.py +129 -8
- sky/skylet/configs.py +27 -20
- sky/skylet/constants.py +171 -19
- sky/skylet/events.py +105 -21
- sky/skylet/job_lib.py +335 -104
- sky/skylet/log_lib.py +297 -18
- sky/skylet/log_lib.pyi +44 -1
- sky/skylet/ray_patches/__init__.py +17 -3
- sky/skylet/ray_patches/autoscaler.py.diff +18 -0
- sky/skylet/ray_patches/cli.py.diff +19 -0
- sky/skylet/ray_patches/command_runner.py.diff +17 -0
- sky/skylet/ray_patches/log_monitor.py.diff +20 -0
- sky/skylet/ray_patches/resource_demand_scheduler.py.diff +32 -0
- sky/skylet/ray_patches/updater.py.diff +18 -0
- sky/skylet/ray_patches/worker.py.diff +41 -0
- sky/skylet/services.py +564 -0
- sky/skylet/skylet.py +63 -4
- sky/skylet/subprocess_daemon.py +103 -29
- sky/skypilot_config.py +506 -99
- sky/ssh_node_pools/__init__.py +1 -0
- sky/ssh_node_pools/core.py +135 -0
- sky/ssh_node_pools/server.py +233 -0
- sky/task.py +621 -137
- sky/templates/aws-ray.yml.j2 +10 -3
- sky/templates/azure-ray.yml.j2 +1 -1
- sky/templates/do-ray.yml.j2 +1 -1
- sky/templates/gcp-ray.yml.j2 +57 -0
- sky/templates/hyperbolic-ray.yml.j2 +67 -0
- sky/templates/jobs-controller.yaml.j2 +27 -24
- sky/templates/kubernetes-loadbalancer.yml.j2 +2 -0
- sky/templates/kubernetes-ray.yml.j2 +607 -51
- sky/templates/lambda-ray.yml.j2 +1 -1
- sky/templates/nebius-ray.yml.j2 +33 -12
- sky/templates/paperspace-ray.yml.j2 +1 -1
- sky/templates/primeintellect-ray.yml.j2 +71 -0
- sky/templates/runpod-ray.yml.j2 +9 -1
- sky/templates/scp-ray.yml.j2 +3 -50
- sky/templates/seeweb-ray.yml.j2 +108 -0
- sky/templates/shadeform-ray.yml.j2 +72 -0
- sky/templates/sky-serve-controller.yaml.j2 +22 -2
- sky/templates/websocket_proxy.py +178 -18
- sky/usage/usage_lib.py +18 -11
- sky/users/__init__.py +0 -0
- sky/users/model.conf +15 -0
- sky/users/permission.py +387 -0
- sky/users/rbac.py +121 -0
- sky/users/server.py +720 -0
- sky/users/token_service.py +218 -0
- sky/utils/accelerator_registry.py +34 -5
- sky/utils/admin_policy_utils.py +84 -38
- sky/utils/annotations.py +16 -5
- sky/utils/asyncio_utils.py +78 -0
- sky/utils/auth_utils.py +153 -0
- sky/utils/benchmark_utils.py +60 -0
- sky/utils/cli_utils/status_utils.py +159 -86
- sky/utils/cluster_utils.py +31 -9
- sky/utils/command_runner.py +354 -68
- sky/utils/command_runner.pyi +93 -3
- sky/utils/common.py +35 -8
- sky/utils/common_utils.py +310 -87
- sky/utils/config_utils.py +87 -5
- sky/utils/context.py +402 -0
- sky/utils/context_utils.py +222 -0
- sky/utils/controller_utils.py +264 -89
- sky/utils/dag_utils.py +31 -12
- sky/utils/db/__init__.py +0 -0
- sky/utils/db/db_utils.py +470 -0
- sky/utils/db/migration_utils.py +133 -0
- sky/utils/directory_utils.py +12 -0
- sky/utils/env_options.py +13 -0
- sky/utils/git.py +567 -0
- sky/utils/git_clone.sh +460 -0
- sky/utils/infra_utils.py +195 -0
- sky/utils/kubernetes/cleanup-tunnel.sh +62 -0
- sky/utils/kubernetes/config_map_utils.py +133 -0
- sky/utils/kubernetes/create_cluster.sh +13 -27
- sky/utils/kubernetes/delete_cluster.sh +10 -7
- sky/utils/kubernetes/deploy_remote_cluster.py +1299 -0
- sky/utils/kubernetes/exec_kubeconfig_converter.py +22 -31
- sky/utils/kubernetes/generate_kind_config.py +6 -66
- sky/utils/kubernetes/generate_kubeconfig.sh +4 -1
- sky/utils/kubernetes/gpu_labeler.py +5 -5
- sky/utils/kubernetes/kubernetes_deploy_utils.py +354 -47
- sky/utils/kubernetes/ssh-tunnel.sh +379 -0
- sky/utils/kubernetes/ssh_utils.py +221 -0
- sky/utils/kubernetes_enums.py +8 -15
- sky/utils/lock_events.py +94 -0
- sky/utils/locks.py +368 -0
- sky/utils/log_utils.py +300 -6
- sky/utils/perf_utils.py +22 -0
- sky/utils/resource_checker.py +298 -0
- sky/utils/resources_utils.py +249 -32
- sky/utils/rich_utils.py +213 -37
- sky/utils/schemas.py +905 -147
- sky/utils/serialize_utils.py +16 -0
- sky/utils/status_lib.py +10 -0
- sky/utils/subprocess_utils.py +38 -15
- sky/utils/tempstore.py +70 -0
- sky/utils/timeline.py +24 -52
- sky/utils/ux_utils.py +84 -15
- sky/utils/validator.py +11 -1
- sky/utils/volume.py +86 -0
- sky/utils/yaml_utils.py +111 -0
- sky/volumes/__init__.py +13 -0
- sky/volumes/client/__init__.py +0 -0
- sky/volumes/client/sdk.py +149 -0
- sky/volumes/server/__init__.py +0 -0
- sky/volumes/server/core.py +258 -0
- sky/volumes/server/server.py +122 -0
- sky/volumes/volume.py +212 -0
- sky/workspaces/__init__.py +0 -0
- sky/workspaces/core.py +655 -0
- sky/workspaces/server.py +101 -0
- sky/workspaces/utils.py +56 -0
- skypilot_nightly-1.0.0.dev20251107.dist-info/METADATA +675 -0
- skypilot_nightly-1.0.0.dev20251107.dist-info/RECORD +594 -0
- {skypilot_nightly-1.0.0.dev20250509.dist-info → skypilot_nightly-1.0.0.dev20251107.dist-info}/WHEEL +1 -1
- sky/benchmark/benchmark_state.py +0 -256
- sky/benchmark/benchmark_utils.py +0 -641
- sky/clouds/service_catalog/constants.py +0 -7
- sky/dashboard/out/_next/static/LksQgChY5izXjokL3LcEu/_buildManifest.js +0 -1
- sky/dashboard/out/_next/static/chunks/236-f49500b82ad5392d.js +0 -6
- sky/dashboard/out/_next/static/chunks/312-c3c8845990db8ffc.js +0 -15
- sky/dashboard/out/_next/static/chunks/37-0a572fe0dbb89c4d.js +0 -6
- sky/dashboard/out/_next/static/chunks/678-206dddca808e6d16.js +0 -59
- sky/dashboard/out/_next/static/chunks/845-0f8017370869e269.js +0 -1
- sky/dashboard/out/_next/static/chunks/979-7bf73a4c7cea0f5c.js +0 -1
- sky/dashboard/out/_next/static/chunks/fd9d1056-2821b0f0cabcd8bd.js +0 -1
- sky/dashboard/out/_next/static/chunks/framework-87d061ee6ed71b28.js +0 -33
- sky/dashboard/out/_next/static/chunks/main-app-241eb28595532291.js +0 -1
- sky/dashboard/out/_next/static/chunks/main-e0e2335212e72357.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/_app-e6b013bc3f77ad60.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/_error-1be831200e60c5c0.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-e15db85d0ea1fbe1.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-f383db7389368ea7.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/clusters-a93b93e10b8b074e.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/index-f9f039532ca8cbc4.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-03f279c6741fb48b.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/jobs-a75029b67aab6a2e.js +0 -1
- sky/dashboard/out/_next/static/chunks/webpack-830f59b8404e96b8.js +0 -1
- sky/dashboard/out/_next/static/css/c6933bbb2ce7f4dd.css +0 -3
- sky/jobs/dashboard/dashboard.py +0 -223
- sky/jobs/dashboard/static/favicon.ico +0 -0
- sky/jobs/dashboard/templates/index.html +0 -831
- sky/jobs/server/dashboard_utils.py +0 -69
- sky/skylet/providers/scp/__init__.py +0 -2
- sky/skylet/providers/scp/config.py +0 -149
- sky/skylet/providers/scp/node_provider.py +0 -578
- sky/templates/kubernetes-ssh-jump.yml.j2 +0 -94
- sky/utils/db_utils.py +0 -100
- sky/utils/kubernetes/deploy_remote_cluster.sh +0 -308
- sky/utils/kubernetes/ssh_jump_lifecycle_manager.py +0 -191
- skypilot_nightly-1.0.0.dev20250509.dist-info/METADATA +0 -361
- skypilot_nightly-1.0.0.dev20250509.dist-info/RECORD +0 -396
- /sky/{clouds/service_catalog → catalog}/config.py +0 -0
- /sky/{benchmark → catalog/data_fetchers}/__init__.py +0 -0
- /sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_azure.py +0 -0
- /sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_fluidstack.py +0 -0
- /sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_ibm.py +0 -0
- /sky/{clouds/service_catalog/data_fetchers → client/cli}/__init__.py +0 -0
- /sky/dashboard/out/_next/static/{LksQgChY5izXjokL3LcEu → zB0ed6ge_W1MDszVHhijS}/_ssgManifest.js +0 -0
- {skypilot_nightly-1.0.0.dev20250509.dist-info → skypilot_nightly-1.0.0.dev20251107.dist-info}/entry_points.txt +0 -0
- {skypilot_nightly-1.0.0.dev20250509.dist-info → skypilot_nightly-1.0.0.dev20251107.dist-info}/licenses/LICENSE +0 -0
- {skypilot_nightly-1.0.0.dev20250509.dist-info → skypilot_nightly-1.0.0.dev20251107.dist-info}/top_level.txt +0 -0
|
@@ -0,0 +1,218 @@
|
|
|
1
|
+
"""JWT-based service account token management for SkyPilot."""
|
|
2
|
+
|
|
3
|
+
import contextlib
|
|
4
|
+
import datetime
|
|
5
|
+
import hashlib
|
|
6
|
+
import os
|
|
7
|
+
import secrets
|
|
8
|
+
import threading
|
|
9
|
+
from typing import Any, Dict, Generator, Optional
|
|
10
|
+
|
|
11
|
+
import filelock
|
|
12
|
+
import jwt
|
|
13
|
+
|
|
14
|
+
from sky import global_user_state
|
|
15
|
+
from sky import sky_logging
|
|
16
|
+
|
|
17
|
+
logger = sky_logging.init_logger(__name__)
|
|
18
|
+
|
|
19
|
+
# JWT configuration shared by TokenService.create_token() and
# TokenService.verify_token().
|
|
20
|
+
# Signing algorithm handed to jwt.encode() / jwt.decode().
JWT_ALGORITHM = 'HS256'
|
|
21
|
+
JWT_ISSUER = 'sky' # Stored under payload key 'i'; kept short for compact tokens
|
|
22
|
+
# Key passed to global_user_state.get_system_config() /
# set_system_config() when loading or persisting the signing secret.
JWT_SECRET_DB_KEY = 'jwt_secret'
|
|
23
|
+
|
|
24
|
+
# File lock for JWT secret initialization (used by _jwt_secret_lock()).
|
|
25
|
+
JWT_SECRET_LOCK_PATH = os.path.expanduser('~/.sky/.jwt_secret_init.lock')
|
|
26
|
+
# Timeout value handed to filelock.FileLock together with the path above.
JWT_SECRET_LOCK_TIMEOUT_SECONDS = 20
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
@contextlib.contextmanager
def _jwt_secret_lock() -> Generator[None, None, None]:
    """Hold the file lock that serializes JWT secret initialization.

    The lock file lives at ``JWT_SECRET_LOCK_PATH`` and is acquired with a
    timeout of ``JWT_SECRET_LOCK_TIMEOUT_SECONDS``.

    Yields:
        None, while the lock is held.

    Raises:
        RuntimeError: If ``filelock.Timeout`` is raised while acquiring the
            lock (or escapes from the guarded body); the original
            exception is chained as the cause.
    """
    secret_lock = filelock.FileLock(JWT_SECRET_LOCK_PATH,
                                    JWT_SECRET_LOCK_TIMEOUT_SECONDS)
    try:
        with secret_lock:
            yield
    except filelock.Timeout as timeout_err:
        message = ('Failed to initialize JWT secret due to a timeout '
                   'when trying to acquire the lock at '
                   f'{JWT_SECRET_LOCK_PATH}. '
                   'Please try again or manually remove the lock '
                   'file if you believe it is stale.')
        raise RuntimeError(message) from timeout_err
|
|
42
|
+
|
|
43
|
+
|
|
44
|
+
class TokenService:
|
|
45
|
+
"""Service for managing JWT-based service account tokens."""
|
|
46
|
+
|
|
47
|
+
def __init__(self):
|
|
48
|
+
self.secret_key = None
|
|
49
|
+
self.init_lock = threading.Lock()
|
|
50
|
+
|
|
51
|
+
def _lazy_initialize(self):
|
|
52
|
+
if self.secret_key is not None:
|
|
53
|
+
return
|
|
54
|
+
with self.init_lock:
|
|
55
|
+
if self.secret_key is not None:
|
|
56
|
+
return
|
|
57
|
+
self.secret_key = self._get_or_generate_secret()
|
|
58
|
+
|
|
59
|
+
def _get_or_generate_secret(self) -> str:
    """Return the JWT signing secret, creating one when none is stored.

    The secret is looked up through ``global_user_state`` under
    ``JWT_SECRET_DB_KEY``. If nothing is found, the lookup is repeated
    while holding ``_jwt_secret_lock()`` and, if still missing, a fresh
    secret is generated and written back.

    Returns:
        The stored secret, or the newly generated one. A newly generated
        secret is returned even when persisting it fails (a warning is
        logged in that case).
    """

    def _read_stored_secret():
        # Best-effort read: any exception is logged at debug level and
        # reported to the caller as "nothing stored".
        try:
            stored = global_user_state.get_system_config(JWT_SECRET_DB_KEY)
            if not stored:
                return None
            logger.debug('Retrieved existing JWT secret from database')
            return stored
        except Exception as e:  # pylint: disable=broad-except
            logger.debug(f'Failed to get JWT secret from database: {e}')
            return None

    # Fast path: a secret is already persisted (survives redeployments).
    secret = _read_stored_secret()
    if secret:
        return secret

    with _jwt_secret_lock():
        # Look again now that the lock is held, in case the secret was
        # stored between the first read and lock acquisition.
        secret = _read_stored_secret()
        if not secret:
            # Nothing stored: mint a new secret and try to persist it.
            secret = secrets.token_urlsafe(64)
            try:
                global_user_state.set_system_config(JWT_SECRET_DB_KEY,
                                                    secret)
                logger.info(
                    'Generated new JWT secret and stored in database. '
                    'This secret will persist across API server restarts.')
            except Exception as e:  # pylint: disable=broad-except
                logger.warning(
                    f'Failed to store new JWT secret in database: {e}. '
                    f'Using in-memory secret (tokens will not persist '
                    f'across restarts).')

    return secret
|
|
97
|
+
|
|
98
|
+
def create_token(self,
                 creator_user_id: str,
                 service_account_user_id: str,
                 token_name: str,
                 expires_in_days: Optional[int] = None) -> Dict[str, Any]:
    """Create a new JWT service account token.

    Args:
        creator_user_id: The creator's user hash. Only echoed back in the
            returned dict; it is not part of the JWT payload.
        service_account_user_id: The service account's own user ID,
            stored in the payload under ``'u'``.
        token_name: Descriptive name for the token. Only echoed back in
            the returned dict.
        expires_in_days: Optional expiration in days. A falsy value
            (``None`` or ``0``) produces a token without an expiration.

    Returns:
        Dict with keys ``token_id``, ``token`` (the JWT prefixed with
        ``sky_``), ``token_hash`` (SHA-256 hex digest of the full token),
        ``creator_user_id``, ``service_account_user_id``, ``token_name``,
        ``created_at`` and ``expires_at`` (Unix seconds, or ``None``).
    """
    self._lazy_initialize()
    now = datetime.datetime.now(datetime.timezone.utc)
    issued_at = int(now.timestamp())
    token_id = secrets.token_urlsafe(12)  # Shorter ID for JWT

    # Build minimal JWT payload with single-character field names for
    # compactness.
    payload = {
        'i': JWT_ISSUER,  # Issuer (use constant)
        't': issued_at,  # Issued at (shortened from 'iat')
        # Service account user ID (shortened from 'sub')
        'u': service_account_user_id,
        'k': token_id,  # Token ID (shortened from 'token_id')
        'y': 'sa',  # Type: service account (shortened from 'type')
    }

    # Add expiration if specified.
    # NOTE(review): the expiry is stored under 'e', not the registered
    # 'exp' claim, so jwt.decode() will not enforce it by itself --
    # confirm that token consumers check the expiry explicitly.
    expires_at = None
    if expires_in_days:
        exp_time = now + datetime.timedelta(days=expires_in_days)
        expires_at = int(exp_time.timestamp())
        payload['e'] = expires_at  # Expiration (shortened from 'exp')

    # Generate JWT
    jwt_token = jwt.encode(payload, self.secret_key, algorithm=JWT_ALGORITHM)
    # PyJWT < 2.0 returns bytes from encode(); without this normalization
    # the f-string below would embed the b'...' repr in the token.
    if isinstance(jwt_token, bytes):
        jwt_token = jwt_token.decode('utf-8')

    # Create token with SkyPilot prefix
    full_token = f'sky_{jwt_token}'

    # Generate hash for database storage (we still hash the full token)
    token_hash = hashlib.sha256(full_token.encode()).hexdigest()

    return {
        'token_id': token_id,
        'token': full_token,
        'token_hash': token_hash,
        'creator_user_id': creator_user_id,
        'service_account_user_id': service_account_user_id,
        'token_name': token_name,
        'created_at': issued_at,
        'expires_at': expires_at,
    }
|
|
158
|
+
|
|
159
|
+
def verify_token(self, token: str) -> Optional[Dict[str, Any]]:
    """Verify and decode a JWT token.

    The payload uses single-character claim names ('i', 't', 'u', 'k',
    'y', 'e') for compactness. Because these are not the registered JWT
    claim names, the JWT library only verifies the signature here; the
    issuer, token type and expiration are checked manually below.

    Args:
        token: The full token (with sky_ prefix)

    Returns:
        Decoded token payload with standard field names ('iss', 'iat',
        'sub', 'token_id', 'type' and, when present, 'exp'), or None if
        the token lacks the prefix, fails signature/format validation,
        has an unexpected issuer or type, or has expired.
    """
    self._lazy_initialize()
    if not token.startswith('sky_'):
        return None

    # Remove the sky_ prefix
    jwt_token = token[len('sky_'):]

    # Keep the try body limited to the call that can raise JWT errors.
    try:
        # Decode and verify JWT (without issuer verification)
        payload = jwt.decode(jwt_token,
                             self.secret_key,
                             algorithms=[JWT_ALGORITHM])
    except jwt.ExpiredSignatureError:
        logger.warning('Token has expired')
        return None
    except jwt.InvalidTokenError as e:
        logger.warning(f'Invalid token: {e}')
        return None

    # Manually verify issuer using our shortened field name
    token_issuer = payload.get('i')
    if token_issuer != JWT_ISSUER:
        logger.warning(f'Invalid token issuer: {token_issuer}')
        return None

    # Verify token type
    token_type = payload.get('y')
    if token_type != 'sa':
        logger.warning(f'Invalid token type: {token_type}')
        return None

    # Manually verify expiration: it is stored under 'e' rather than the
    # registered 'exp' claim, so jwt.decode() above does not enforce it.
    expires_at = payload.get('e')
    if expires_at is not None:
        if not isinstance(expires_at, (int, float)):
            logger.warning(f'Invalid token: malformed expiration '
                           f'{expires_at!r}')
            return None
        now_ts = datetime.datetime.now(datetime.timezone.utc).timestamp()
        if expires_at <= now_ts:
            logger.warning('Token has expired')
            return None

    # Convert shortened field names back to standard names for
    # compatibility
    normalized_payload = {
        'iss': payload.get('i'),  # issuer
        'iat': payload.get('t'),  # issued at
        'sub': payload.get('u'),  # subject (service account user ID)
        'token_id': payload.get('k'),  # token ID
        'type': 'service_account',  # expand shortened type
    }

    # Add expiration if present
    if 'e' in payload:
        normalized_payload['exp'] = payload['e']

    return normalized_payload
|
|
215
|
+
|
|
216
|
+
|
|
217
|
+
# Singleton instance
|
|
218
|
+
token_service = TokenService()
|
|
@@ -1,8 +1,8 @@
|
|
|
1
1
|
"""Accelerator registry."""
|
|
2
2
|
import typing
|
|
3
|
-
from typing import Optional
|
|
3
|
+
from typing import List, Optional
|
|
4
4
|
|
|
5
|
-
from sky
|
|
5
|
+
from sky import catalog
|
|
6
6
|
from sky.utils import rich_utils
|
|
7
7
|
from sky.utils import ux_utils
|
|
8
8
|
|
|
@@ -34,7 +34,8 @@ if typing.TYPE_CHECKING:
|
|
|
34
34
|
|
|
35
35
|
# Use a cached version of accelerators to cloud mapping, so that we don't have
|
|
36
36
|
# to download and read the catalog file for every cloud locally.
|
|
37
|
-
_accelerator_df =
|
|
37
|
+
_accelerator_df = catalog.common.read_catalog('common/accelerators.csv')
|
|
38
|
+
_memory_df = catalog.common.read_catalog('common/metadata.csv')
|
|
38
39
|
|
|
39
40
|
# List of non-GPU accelerators that are supported by our backend for job queue
|
|
40
41
|
# scheduling.
|
|
@@ -45,6 +46,32 @@ _SCHEDULABLE_NON_GPU_ACCELERATORS = [
|
|
|
45
46
|
]
|
|
46
47
|
|
|
47
48
|
|
|
49
|
+
def get_devices_by_memory(memory: float,
                          plus: bool = False,
                          manufacturer: Optional[str] = None) -> List[str]:
    """Look up device names by memory size and, optionally, manufacturer.

    Args:
        memory: The memory size in GB to match against the 'MemoryGB'
            column of the cached metadata table.
        plus: If True, match devices with memory >= memory; otherwise match
            only devices with memory == memory.
        manufacturer: If given, keep only devices whose 'Manufacturer'
            equals this value, compared case-insensitively.

    Returns:
        The 'GPU' column values of the matching rows, as a list.
    """
    memory_column = _memory_df['MemoryGB']
    memory_mask = memory_column >= memory if plus else memory_column == memory
    matched = _memory_df[memory_mask]

    # No manufacturer constraint: the memory filter alone decides.
    if manufacturer is None:
        return matched['GPU'].tolist()

    wanted = manufacturer.lower()
    vendor_mask = matched['Manufacturer'].str.lower() == wanted
    return matched[vendor_mask]['GPU'].tolist()
|
|
73
|
+
|
|
74
|
+
|
|
48
75
|
def is_schedulable_non_gpu_accelerator(accelerator_name: str) -> bool:
|
|
49
76
|
"""Returns if this accelerator is a 'schedulable' non-GPU accelerator."""
|
|
50
77
|
for name in _SCHEDULABLE_NON_GPU_ACCELERATORS:
|
|
@@ -80,10 +107,12 @@ def canonicalize_accelerator_name(accelerator: str,
|
|
|
80
107
|
if not names and cloud_str in ['Kubernetes', None]:
|
|
81
108
|
with rich_utils.safe_status(
|
|
82
109
|
ux_utils.spinner_message('Listing accelerators on Kubernetes')):
|
|
83
|
-
|
|
110
|
+
# Only search for Kubernetes to reduce the lookup cost.
|
|
111
|
+
# For other clouds, the catalog has been searched in previous steps.
|
|
112
|
+
searched = catalog.list_accelerators(
|
|
84
113
|
name_filter=accelerator,
|
|
85
114
|
case_sensitive=False,
|
|
86
|
-
clouds=
|
|
115
|
+
clouds='Kubernetes',
|
|
87
116
|
)
|
|
88
117
|
names = list(searched.keys())
|
|
89
118
|
if accelerator in names:
|
sky/utils/admin_policy_utils.py
CHANGED
|
@@ -1,9 +1,10 @@
|
|
|
1
1
|
"""Admin policy utils."""
|
|
2
|
+
import contextlib
|
|
2
3
|
import copy
|
|
3
4
|
import importlib
|
|
4
|
-
import
|
|
5
|
-
import
|
|
6
|
-
|
|
5
|
+
import typing
|
|
6
|
+
from typing import Iterator, Optional, Tuple, Union
|
|
7
|
+
import urllib.parse
|
|
7
8
|
|
|
8
9
|
import colorama
|
|
9
10
|
|
|
@@ -13,25 +14,45 @@ from sky import exceptions
|
|
|
13
14
|
from sky import sky_logging
|
|
14
15
|
from sky import skypilot_config
|
|
15
16
|
from sky import task as task_lib
|
|
17
|
+
from sky.server.requests import request_names
|
|
16
18
|
from sky.utils import common_utils
|
|
17
19
|
from sky.utils import config_utils
|
|
18
20
|
from sky.utils import ux_utils
|
|
19
21
|
|
|
20
22
|
logger = sky_logging.init_logger(__name__)
|
|
21
23
|
|
|
24
|
+
if typing.TYPE_CHECKING:
|
|
25
|
+
from sky import models
|
|
22
26
|
|
|
23
|
-
|
|
24
|
-
|
|
27
|
+
|
|
28
|
+
def _is_url(policy_string: str) -> bool:
|
|
29
|
+
"""Check if the policy string is a URL."""
|
|
30
|
+
try:
|
|
31
|
+
parsed = urllib.parse.urlparse(policy_string)
|
|
32
|
+
return parsed.scheme in ('http', 'https')
|
|
33
|
+
except Exception: # pylint: disable=broad-except
|
|
34
|
+
return False
|
|
35
|
+
|
|
36
|
+
|
|
37
|
+
def _get_policy_impl(
|
|
38
|
+
policy_location: Optional[str]
|
|
39
|
+
) -> Optional[admin_policy.PolicyInterface]:
|
|
25
40
|
"""Gets admin-defined policy."""
|
|
26
|
-
if
|
|
41
|
+
if policy_location is None:
|
|
27
42
|
return None
|
|
43
|
+
|
|
44
|
+
if _is_url(policy_location):
|
|
45
|
+
# Use the built-in URL policy class when an URL is specified.
|
|
46
|
+
return admin_policy.RestfulAdminPolicy(policy_location)
|
|
47
|
+
|
|
48
|
+
# Handle module path format
|
|
28
49
|
try:
|
|
29
|
-
module_path, class_name =
|
|
50
|
+
module_path, class_name = policy_location.rsplit('.', 1)
|
|
30
51
|
module = importlib.import_module(module_path)
|
|
31
52
|
except ImportError as e:
|
|
32
53
|
with ux_utils.print_exception_no_traceback():
|
|
33
54
|
raise ImportError(
|
|
34
|
-
f'Failed to import policy module: {
|
|
55
|
+
f'Failed to import policy module: {policy_location}. '
|
|
35
56
|
'Please check if the module is installed in your Python '
|
|
36
57
|
'environment.') from e
|
|
37
58
|
|
|
@@ -43,19 +64,48 @@ def _get_policy_cls(
|
|
|
43
64
|
f'Could not find {class_name} class in module {module_path}. '
|
|
44
65
|
'Please check with your policy admin for details.') from e
|
|
45
66
|
|
|
46
|
-
#
|
|
67
|
+
# Currently we only allow users to define subclass of AdminPolicy
|
|
68
|
+
# instead of inheriting from PolicyInterface or PolicyTemplate.
|
|
47
69
|
if not issubclass(policy_cls, admin_policy.AdminPolicy):
|
|
48
70
|
with ux_utils.print_exception_no_traceback():
|
|
49
71
|
raise ValueError(
|
|
50
|
-
f'Policy class {
|
|
51
|
-
'interface. Please check with your policy admin
|
|
52
|
-
|
|
72
|
+
f'Policy class {policy_cls!r} does not implement the '
|
|
73
|
+
'AdminPolicy interface. Please check with your policy admin '
|
|
74
|
+
'for details.')
|
|
75
|
+
return policy_cls()
|
|
76
|
+
|
|
77
|
+
|
|
78
|
+
@contextlib.contextmanager
def apply_and_use_config_in_current_request(
    entrypoint: Union['dag_lib.Dag', 'task_lib.Task'],
    request_name: request_names.AdminPolicyRequestName,
    request_options: Optional[admin_policy.RequestOptions] = None,
    at_client_side: bool = False,
) -> Iterator['dag_lib.Dag']:
    """Applies an admin policy and uses the resulting config while active.

    Context-manager helper around `apply()`: the policy is applied to the
    entrypoint and, if the config returned by `apply()` differs from the
    config captured beforehand, the body of the `with` statement runs inside
    `skypilot_config.replace_skypilot_config(mutated_config)`. If the config
    is unchanged, no override is installed.

    Refer to `apply()` for the meaning of the arguments.

    Yields:
        The DAG returned by `apply()`.
    """
    # Capture the config before the policy runs so a mutation can be detected.
    original_config = skypilot_config.to_dict()
    dag, mutated_config = apply(entrypoint, request_name, request_options,
                                at_client_side)

    config_changed = mutated_config != original_config
    if config_changed:
        config_scope = skypilot_config.replace_skypilot_config(mutated_config)
    else:
        # Nothing to override: enter a no-op context instead.
        config_scope = contextlib.nullcontext()

    with config_scope:
        yield dag
|
|
53
102
|
|
|
54
103
|
|
|
55
104
|
def apply(
|
|
56
105
|
entrypoint: Union['dag_lib.Dag', 'task_lib.Task'],
|
|
57
|
-
|
|
106
|
+
request_name: request_names.AdminPolicyRequestName,
|
|
58
107
|
request_options: Optional[admin_policy.RequestOptions] = None,
|
|
108
|
+
at_client_side: bool = False,
|
|
59
109
|
) -> Tuple['dag_lib.Dag', config_utils.Config]:
|
|
60
110
|
"""Applies an admin policy (if registered) to a DAG or a task.
|
|
61
111
|
|
|
@@ -79,29 +129,41 @@ def apply(
|
|
|
79
129
|
else:
|
|
80
130
|
dag = entrypoint
|
|
81
131
|
|
|
82
|
-
|
|
83
|
-
|
|
84
|
-
if
|
|
132
|
+
policy_location = skypilot_config.get_nested(('admin_policy',), None)
|
|
133
|
+
policy = _get_policy_impl(policy_location)
|
|
134
|
+
if policy is None:
|
|
85
135
|
return dag, skypilot_config.to_dict()
|
|
86
136
|
|
|
87
|
-
|
|
88
|
-
|
|
89
|
-
|
|
137
|
+
user = None
|
|
138
|
+
if at_client_side:
|
|
139
|
+
logger.info(f'Applying client admin policy: {policy}')
|
|
140
|
+
else:
|
|
141
|
+
# When being called by the server, the middleware has set the
|
|
142
|
+
# current user and this information is available at this point.
|
|
143
|
+
user = common_utils.get_current_user()
|
|
144
|
+
logger.info(f'Applying server admin policy: {policy}')
|
|
145
|
+
config = copy.deepcopy(skypilot_config.to_dict())
|
|
90
146
|
mutated_dag = dag_lib.Dag()
|
|
91
147
|
mutated_dag.name = dag.name
|
|
92
148
|
|
|
93
149
|
mutated_config = None
|
|
94
150
|
for task in dag.tasks:
|
|
95
|
-
user_request = admin_policy.UserRequest(task, config,
|
|
151
|
+
user_request = admin_policy.UserRequest(task, config, request_name,
|
|
152
|
+
request_options, at_client_side,
|
|
153
|
+
user)
|
|
96
154
|
try:
|
|
97
|
-
mutated_user_request =
|
|
155
|
+
mutated_user_request = policy.apply(user_request)
|
|
156
|
+
# Avoid duplicate exception wrapping.
|
|
157
|
+
except exceptions.UserRequestRejectedByPolicy as e:
|
|
158
|
+
with ux_utils.print_exception_no_traceback():
|
|
159
|
+
raise e
|
|
98
160
|
except Exception as e: # pylint: disable=broad-except
|
|
99
161
|
with ux_utils.print_exception_no_traceback():
|
|
100
162
|
raise exceptions.UserRequestRejectedByPolicy(
|
|
101
163
|
f'{colorama.Fore.RED}User request rejected by policy '
|
|
102
164
|
f'{policy!r}{colorama.Fore.RESET}: '
|
|
103
165
|
f'{common_utils.format_exception(e, use_bracket=True)}'
|
|
104
|
-
) from
|
|
166
|
+
) from None
|
|
105
167
|
if mutated_config is None:
|
|
106
168
|
mutated_config = mutated_user_request.skypilot_config
|
|
107
169
|
else:
|
|
@@ -126,22 +188,6 @@ def apply(
|
|
|
126
188
|
mutated_dag.graph.add_edge(mutated_dag.tasks[u_idx],
|
|
127
189
|
mutated_dag.tasks[v_idx])
|
|
128
190
|
|
|
129
|
-
if (use_mutated_config_in_current_request and
|
|
130
|
-
original_config != mutated_config):
|
|
131
|
-
with tempfile.NamedTemporaryFile(
|
|
132
|
-
delete=False,
|
|
133
|
-
mode='w',
|
|
134
|
-
prefix='policy-mutated-skypilot-config-',
|
|
135
|
-
suffix='.yaml') as temp_file:
|
|
136
|
-
|
|
137
|
-
common_utils.dump_yaml(temp_file.name, dict(**mutated_config))
|
|
138
|
-
os.environ[skypilot_config.ENV_VAR_SKYPILOT_CONFIG] = temp_file.name
|
|
139
|
-
logger.debug(f'Updated SkyPilot config: {temp_file.name}')
|
|
140
|
-
# TODO(zhwu): This is not a clean way to update the SkyPilot config,
|
|
141
|
-
# because we are resetting the global context for a single DAG,
|
|
142
|
-
# which is conceptually weird.
|
|
143
|
-
importlib.reload(skypilot_config)
|
|
144
|
-
|
|
145
191
|
logger.debug(f'Mutated user request: {mutated_user_request}')
|
|
146
192
|
mutated_dag.policy_applied = True
|
|
147
193
|
return mutated_dag, mutated_config
|
sky/utils/annotations.py
CHANGED
|
@@ -1,14 +1,19 @@
|
|
|
1
1
|
"""Annotations for public APIs."""
|
|
2
2
|
|
|
3
3
|
import functools
|
|
4
|
-
from typing import Callable, Literal
|
|
4
|
+
from typing import Callable, Literal, TypeVar
|
|
5
|
+
|
|
6
|
+
from typing_extensions import ParamSpec
|
|
5
7
|
|
|
6
8
|
# Whether the current process is a SkyPilot API server process.
|
|
7
9
|
is_on_api_server = True
|
|
8
|
-
|
|
10
|
+
_FUNCTIONS_NEED_RELOAD_CACHE = []
|
|
11
|
+
|
|
12
|
+
T = TypeVar('T')
|
|
13
|
+
P = ParamSpec('P')
|
|
9
14
|
|
|
10
15
|
|
|
11
|
-
def client_api(func):
|
|
16
|
+
def client_api(func: Callable[P, T]) -> Callable[P, T]:
|
|
12
17
|
"""Mark a function as a client-side API.
|
|
13
18
|
|
|
14
19
|
Code invoked by server-side functions will find annotations.is_on_api_server
|
|
@@ -38,14 +43,20 @@ def lru_cache(scope: Literal['global', 'request'], *lru_cache_args,
|
|
|
38
43
|
lru_cache_kwargs: Keyword arguments for functools.lru_cache.
|
|
39
44
|
"""
|
|
40
45
|
|
|
41
|
-
def decorator(func: Callable) -> Callable:
|
|
46
|
+
def decorator(func: Callable[P, T]) -> Callable[P, T]:
|
|
42
47
|
if scope == 'global':
|
|
43
48
|
return functools.lru_cache(*lru_cache_args,
|
|
44
49
|
**lru_cache_kwargs)(func)
|
|
45
50
|
else:
|
|
46
51
|
cached_func = functools.lru_cache(*lru_cache_args,
|
|
47
52
|
**lru_cache_kwargs)(func)
|
|
48
|
-
|
|
53
|
+
_FUNCTIONS_NEED_RELOAD_CACHE.append(cached_func)
|
|
49
54
|
return cached_func
|
|
50
55
|
|
|
51
56
|
return decorator
|
|
57
|
+
|
|
58
|
+
|
|
59
|
+
def clear_request_level_cache():
    """Clear the cache of every function registered for request scope.

    Calls `cache_clear()` on each entry of `_FUNCTIONS_NEED_RELOAD_CACHE`.
    """
    for cached_func in _FUNCTIONS_NEED_RELOAD_CACHE:
        cached_func.cache_clear()
|
|
@@ -0,0 +1,78 @@
|
|
|
1
|
+
"""Asyncio utilities."""
|
|
2
|
+
|
|
3
|
+
import asyncio
|
|
4
|
+
import functools
|
|
5
|
+
from typing import Set
|
|
6
|
+
|
|
7
|
+
_background_tasks: Set[asyncio.Task] = set()
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
def shield(func):
    """Shield the decorated async function from cancellation.

    The decorated function runs in its own task, which is awaited through
    asyncio.shield(). If the outer coroutine is cancelled, the inner task is
    not cancelled with it; a reference to the inner task is kept until it is
    done, so that it is not garbage collected while still running. The
    cancellation is then re-raised to the caller.

    For example, filelock.AsyncFileLock is not cancellation safe. The
    following code:

        async def fn_with_lock():
            async with filelock.AsyncFileLock('lock'):
                await asyncio.sleep(1)

    is equivalent to:

        # The lock may leak if the cancellation happens in
        # lock.acquire() or lock.release()
        async def fn_with_lock():
            lock = filelock.AsyncFileLock('lock')
            await lock.acquire()
            try:
                await asyncio.sleep(1)
            finally:
                await lock.release()

    Shielding the function ensures that no cancellation happens inside the
    function, thus the lock will be released properly:

        @shield
        async def fn_with_lock():
            ...

    Note that the resource acquisition and release should usually be
    protected in one @shield block but not separately, e.g.:

        lock = filelock.AsyncFileLock('lock')

        @shield
        async def acquire():
            await lock.acquire()

        @shield
        async def release():
            await lock.release()

        async def fn_with_lock():
            await acquire()
            try:
                do_something()
            finally:
                await release()

    The above code is not safe because if `fn_with_lock` is cancelled,
    `acquire()` and `release()` will be executed in the background
    concurrently and cause race conditions.
    """

    @functools.wraps(func)
    async def async_wrapper(*args, **kwargs):
        inner_task = asyncio.create_task(func(*args, **kwargs))
        try:
            result = await asyncio.shield(inner_task)
        except asyncio.CancelledError:
            # Keep a reference to the inner task until it is done; the done
            # callback receives the task itself and drops it from the set.
            _background_tasks.add(inner_task)
            inner_task.add_done_callback(_background_tasks.discard)
            raise
        return result

    return async_wrapper
|