skypilot-nightly 1.0.0.dev20250905__py3-none-any.whl → 1.0.0.dev20251210__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sky/__init__.py +12 -2
- sky/adaptors/aws.py +27 -22
- sky/adaptors/common.py +25 -2
- sky/adaptors/coreweave.py +278 -0
- sky/adaptors/do.py +8 -2
- sky/adaptors/gcp.py +11 -0
- sky/adaptors/ibm.py +5 -2
- sky/adaptors/kubernetes.py +64 -0
- sky/adaptors/nebius.py +3 -1
- sky/adaptors/primeintellect.py +1 -0
- sky/adaptors/seeweb.py +183 -0
- sky/adaptors/shadeform.py +89 -0
- sky/adaptors/slurm.py +478 -0
- sky/admin_policy.py +20 -0
- sky/authentication.py +157 -263
- sky/backends/__init__.py +3 -2
- sky/backends/backend.py +11 -3
- sky/backends/backend_utils.py +630 -185
- sky/backends/cloud_vm_ray_backend.py +1111 -928
- sky/backends/local_docker_backend.py +9 -5
- sky/backends/task_codegen.py +971 -0
- sky/backends/wheel_utils.py +18 -0
- sky/catalog/__init__.py +8 -3
- sky/catalog/aws_catalog.py +4 -0
- sky/catalog/common.py +19 -1
- sky/catalog/data_fetchers/fetch_aws.py +102 -80
- sky/catalog/data_fetchers/fetch_gcp.py +30 -3
- sky/catalog/data_fetchers/fetch_nebius.py +9 -6
- sky/catalog/data_fetchers/fetch_runpod.py +698 -0
- sky/catalog/data_fetchers/fetch_seeweb.py +329 -0
- sky/catalog/data_fetchers/fetch_shadeform.py +142 -0
- sky/catalog/kubernetes_catalog.py +36 -32
- sky/catalog/primeintellect_catalog.py +95 -0
- sky/catalog/runpod_catalog.py +5 -1
- sky/catalog/seeweb_catalog.py +184 -0
- sky/catalog/shadeform_catalog.py +165 -0
- sky/catalog/slurm_catalog.py +243 -0
- sky/check.py +87 -46
- sky/client/cli/command.py +1004 -434
- sky/client/cli/flags.py +4 -2
- sky/{volumes/utils.py → client/cli/table_utils.py} +111 -13
- sky/client/cli/utils.py +79 -0
- sky/client/common.py +12 -2
- sky/client/sdk.py +188 -65
- sky/client/sdk_async.py +34 -33
- sky/cloud_stores.py +82 -3
- sky/clouds/__init__.py +8 -0
- sky/clouds/aws.py +337 -129
- sky/clouds/azure.py +24 -18
- sky/clouds/cloud.py +47 -13
- sky/clouds/cudo.py +16 -13
- sky/clouds/do.py +9 -7
- sky/clouds/fluidstack.py +12 -5
- sky/clouds/gcp.py +14 -7
- sky/clouds/hyperbolic.py +12 -5
- sky/clouds/ibm.py +12 -5
- sky/clouds/kubernetes.py +80 -45
- sky/clouds/lambda_cloud.py +12 -5
- sky/clouds/nebius.py +23 -9
- sky/clouds/oci.py +19 -12
- sky/clouds/paperspace.py +4 -1
- sky/clouds/primeintellect.py +317 -0
- sky/clouds/runpod.py +85 -24
- sky/clouds/scp.py +12 -8
- sky/clouds/seeweb.py +477 -0
- sky/clouds/shadeform.py +400 -0
- sky/clouds/slurm.py +578 -0
- sky/clouds/ssh.py +6 -3
- sky/clouds/utils/scp_utils.py +61 -50
- sky/clouds/vast.py +43 -27
- sky/clouds/vsphere.py +14 -16
- sky/core.py +296 -195
- sky/dashboard/out/404.html +1 -1
- sky/dashboard/out/_next/static/KYAhEFa3FTfq4JyKVgo-s/_buildManifest.js +1 -0
- sky/dashboard/out/_next/static/chunks/1141-9c810f01ff4f398a.js +11 -0
- sky/dashboard/out/_next/static/chunks/1871-7e202677c42f43fe.js +6 -0
- sky/dashboard/out/_next/static/chunks/2260-7703229c33c5ebd5.js +1 -0
- sky/dashboard/out/_next/static/chunks/2369.fc20f0c2c8ed9fe7.js +15 -0
- sky/dashboard/out/_next/static/chunks/2755.edd818326d489a1d.js +26 -0
- sky/dashboard/out/_next/static/chunks/3294.ddda8c6c6f9f24dc.js +1 -0
- sky/dashboard/out/_next/static/chunks/3785.7e245f318f9d1121.js +1 -0
- sky/dashboard/out/_next/static/chunks/{6601-06114c982db410b6.js → 3800-b589397dc09c5b4e.js} +1 -1
- sky/dashboard/out/_next/static/chunks/3850-fd5696f3bbbaddae.js +1 -0
- sky/dashboard/out/_next/static/chunks/4725.172ede95d1b21022.js +1 -0
- sky/dashboard/out/_next/static/chunks/4937.a2baa2df5572a276.js +15 -0
- sky/dashboard/out/_next/static/chunks/6212-7bd06f60ba693125.js +13 -0
- sky/dashboard/out/_next/static/chunks/6856-da20c5fd999f319c.js +1 -0
- sky/dashboard/out/_next/static/chunks/6990-09cbf02d3cd518c3.js +1 -0
- sky/dashboard/out/_next/static/chunks/7359-c8d04e06886000b3.js +30 -0
- sky/dashboard/out/_next/static/chunks/7615-019513abc55b3b47.js +1 -0
- sky/dashboard/out/_next/static/chunks/8640.5b9475a2d18c5416.js +16 -0
- sky/dashboard/out/_next/static/chunks/8969-452f9d5cbdd2dc73.js +1 -0
- sky/dashboard/out/_next/static/chunks/9025.fa408f3242e9028d.js +6 -0
- sky/dashboard/out/_next/static/chunks/9353-8369df1cf105221c.js +1 -0
- sky/dashboard/out/_next/static/chunks/9360.a536cf6b1fa42355.js +31 -0
- sky/dashboard/out/_next/static/chunks/9847.3aaca6bb33455140.js +30 -0
- sky/dashboard/out/_next/static/chunks/pages/_app-68b647e26f9d2793.js +34 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-33f525539665fdfd.js +16 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-a7565f586ef86467.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters-9e5d47818b9bdadd.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/{config-dfb9bf07b13045f4.js → config-718cdc365de82689.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/infra/{[context]-6563820e094f68ca.js → [context]-12c559ec4d81fdbd.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/{infra-aabba60d57826e0f.js → infra-d187cd0413d72475.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-895847b6cf200b04.js +16 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs/pools/[pool]-8d0f4655400b4eb9.js +21 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs-e5a98f17f8513a96.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/plugins/[...slug]-4f46050ca065d8f8.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/users-2f7646eb77785a2c.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/volumes-ef19d49c6d0e8500.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/workspaces/{[name]-af76bb06dbb3954f.js → [name]-96e0f298308da7e2.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/{workspaces-7598c33a746cdc91.js → workspaces-cb4da3abe08ebf19.js} +1 -1
- sky/dashboard/out/_next/static/chunks/webpack-fba3de387ff6bb08.js +1 -0
- sky/dashboard/out/_next/static/css/c5a4cfd2600fc715.css +3 -0
- sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
- sky/dashboard/out/clusters/[cluster].html +1 -1
- sky/dashboard/out/clusters.html +1 -1
- sky/dashboard/out/config.html +1 -1
- sky/dashboard/out/index.html +1 -1
- sky/dashboard/out/infra/[context].html +1 -1
- sky/dashboard/out/infra.html +1 -1
- sky/dashboard/out/jobs/[job].html +1 -1
- sky/dashboard/out/jobs/pools/[pool].html +1 -1
- sky/dashboard/out/jobs.html +1 -1
- sky/dashboard/out/plugins/[...slug].html +1 -0
- sky/dashboard/out/users.html +1 -1
- sky/dashboard/out/volumes.html +1 -1
- sky/dashboard/out/workspace/new.html +1 -1
- sky/dashboard/out/workspaces/[name].html +1 -1
- sky/dashboard/out/workspaces.html +1 -1
- sky/data/data_utils.py +92 -1
- sky/data/mounting_utils.py +177 -30
- sky/data/storage.py +200 -19
- sky/data/storage_utils.py +10 -45
- sky/exceptions.py +18 -7
- sky/execution.py +74 -31
- sky/global_user_state.py +605 -191
- sky/jobs/__init__.py +2 -0
- sky/jobs/client/sdk.py +101 -4
- sky/jobs/client/sdk_async.py +31 -5
- sky/jobs/constants.py +15 -8
- sky/jobs/controller.py +726 -284
- sky/jobs/file_content_utils.py +128 -0
- sky/jobs/log_gc.py +193 -0
- sky/jobs/recovery_strategy.py +250 -100
- sky/jobs/scheduler.py +271 -173
- sky/jobs/server/core.py +367 -114
- sky/jobs/server/server.py +81 -35
- sky/jobs/server/utils.py +89 -35
- sky/jobs/state.py +1498 -620
- sky/jobs/utils.py +771 -306
- sky/logs/agent.py +40 -5
- sky/logs/aws.py +9 -19
- sky/metrics/utils.py +282 -39
- sky/models.py +2 -0
- sky/optimizer.py +7 -6
- sky/provision/__init__.py +38 -1
- sky/provision/aws/config.py +34 -13
- sky/provision/aws/instance.py +5 -2
- sky/provision/azure/instance.py +5 -3
- sky/provision/common.py +22 -0
- sky/provision/cudo/instance.py +4 -3
- sky/provision/do/instance.py +4 -3
- sky/provision/docker_utils.py +112 -28
- sky/provision/fluidstack/instance.py +6 -5
- sky/provision/gcp/config.py +6 -1
- sky/provision/gcp/instance.py +4 -2
- sky/provision/hyperbolic/instance.py +4 -2
- sky/provision/instance_setup.py +66 -20
- sky/provision/kubernetes/__init__.py +2 -0
- sky/provision/kubernetes/config.py +7 -44
- sky/provision/kubernetes/constants.py +0 -1
- sky/provision/kubernetes/instance.py +609 -213
- sky/provision/kubernetes/manifests/fusermount-server-daemonset.yaml +1 -2
- sky/provision/kubernetes/network.py +12 -8
- sky/provision/kubernetes/network_utils.py +8 -25
- sky/provision/kubernetes/utils.py +422 -422
- sky/provision/kubernetes/volume.py +150 -18
- sky/provision/lambda_cloud/instance.py +16 -13
- sky/provision/nebius/instance.py +6 -2
- sky/provision/nebius/utils.py +103 -86
- sky/provision/oci/instance.py +4 -2
- sky/provision/paperspace/instance.py +4 -3
- sky/provision/primeintellect/__init__.py +10 -0
- sky/provision/primeintellect/config.py +11 -0
- sky/provision/primeintellect/instance.py +454 -0
- sky/provision/primeintellect/utils.py +398 -0
- sky/provision/provisioner.py +45 -15
- sky/provision/runpod/__init__.py +2 -0
- sky/provision/runpod/instance.py +4 -3
- sky/provision/runpod/volume.py +69 -13
- sky/provision/scp/instance.py +307 -130
- sky/provision/seeweb/__init__.py +11 -0
- sky/provision/seeweb/config.py +13 -0
- sky/provision/seeweb/instance.py +812 -0
- sky/provision/shadeform/__init__.py +11 -0
- sky/provision/shadeform/config.py +12 -0
- sky/provision/shadeform/instance.py +351 -0
- sky/provision/shadeform/shadeform_utils.py +83 -0
- sky/provision/slurm/__init__.py +12 -0
- sky/provision/slurm/config.py +13 -0
- sky/provision/slurm/instance.py +572 -0
- sky/provision/slurm/utils.py +583 -0
- sky/provision/vast/instance.py +9 -4
- sky/provision/vast/utils.py +10 -6
- sky/provision/volume.py +164 -0
- sky/provision/vsphere/common/ssl_helper.py +1 -1
- sky/provision/vsphere/common/vapiconnect.py +2 -1
- sky/provision/vsphere/common/vim_utils.py +3 -2
- sky/provision/vsphere/instance.py +8 -6
- sky/provision/vsphere/vsphere_utils.py +8 -1
- sky/resources.py +11 -3
- sky/schemas/api/responses.py +107 -6
- sky/schemas/db/global_user_state/008_skylet_ssh_tunnel_metadata.py +34 -0
- sky/schemas/db/global_user_state/009_last_activity_and_launched_at.py +89 -0
- sky/schemas/db/global_user_state/010_save_ssh_key.py +66 -0
- sky/schemas/db/global_user_state/011_is_ephemeral.py +34 -0
- sky/schemas/db/kv_cache/001_initial_schema.py +29 -0
- sky/schemas/db/serve_state/002_yaml_content.py +34 -0
- sky/schemas/db/skypilot_config/001_initial_schema.py +30 -0
- sky/schemas/db/spot_jobs/002_cluster_pool.py +3 -3
- sky/schemas/db/spot_jobs/004_job_file_contents.py +42 -0
- sky/schemas/db/spot_jobs/005_logs_gc.py +38 -0
- sky/schemas/db/spot_jobs/006_controller_pid_started_at.py +34 -0
- sky/schemas/db/spot_jobs/007_config_file_content.py +34 -0
- sky/schemas/generated/jobsv1_pb2.py +86 -0
- sky/schemas/generated/jobsv1_pb2.pyi +254 -0
- sky/schemas/generated/jobsv1_pb2_grpc.py +542 -0
- sky/schemas/generated/managed_jobsv1_pb2.py +76 -0
- sky/schemas/generated/managed_jobsv1_pb2.pyi +278 -0
- sky/schemas/generated/managed_jobsv1_pb2_grpc.py +278 -0
- sky/schemas/generated/servev1_pb2.py +58 -0
- sky/schemas/generated/servev1_pb2.pyi +115 -0
- sky/schemas/generated/servev1_pb2_grpc.py +322 -0
- sky/serve/autoscalers.py +2 -0
- sky/serve/client/impl.py +55 -21
- sky/serve/constants.py +4 -3
- sky/serve/controller.py +17 -11
- sky/serve/load_balancing_policies.py +1 -1
- sky/serve/replica_managers.py +219 -142
- sky/serve/serve_rpc_utils.py +179 -0
- sky/serve/serve_state.py +63 -54
- sky/serve/serve_utils.py +145 -109
- sky/serve/server/core.py +46 -25
- sky/serve/server/impl.py +311 -162
- sky/serve/server/server.py +21 -19
- sky/serve/service.py +84 -68
- sky/serve/service_spec.py +45 -7
- sky/server/auth/loopback.py +38 -0
- sky/server/auth/oauth2_proxy.py +12 -7
- sky/server/common.py +47 -24
- sky/server/config.py +62 -28
- sky/server/constants.py +9 -1
- sky/server/daemons.py +109 -38
- sky/server/metrics.py +76 -96
- sky/server/middleware_utils.py +166 -0
- sky/server/plugins.py +222 -0
- sky/server/requests/executor.py +384 -145
- sky/server/requests/payloads.py +83 -19
- sky/server/requests/preconditions.py +15 -13
- sky/server/requests/request_names.py +123 -0
- sky/server/requests/requests.py +511 -157
- sky/server/requests/serializers/decoders.py +48 -17
- sky/server/requests/serializers/encoders.py +102 -20
- sky/server/requests/serializers/return_value_serializers.py +60 -0
- sky/server/requests/threads.py +117 -0
- sky/server/rest.py +116 -24
- sky/server/server.py +497 -179
- sky/server/server_utils.py +30 -0
- sky/server/stream_utils.py +219 -45
- sky/server/uvicorn.py +30 -19
- sky/setup_files/MANIFEST.in +6 -1
- sky/setup_files/alembic.ini +8 -0
- sky/setup_files/dependencies.py +64 -19
- sky/setup_files/setup.py +44 -44
- sky/sky_logging.py +13 -5
- sky/skylet/attempt_skylet.py +116 -24
- sky/skylet/configs.py +3 -1
- sky/skylet/constants.py +139 -29
- sky/skylet/events.py +74 -14
- sky/skylet/executor/__init__.py +1 -0
- sky/skylet/executor/slurm.py +189 -0
- sky/skylet/job_lib.py +143 -105
- sky/skylet/log_lib.py +252 -8
- sky/skylet/log_lib.pyi +47 -7
- sky/skylet/providers/ibm/node_provider.py +12 -8
- sky/skylet/providers/ibm/vpc_provider.py +13 -12
- sky/skylet/runtime_utils.py +21 -0
- sky/skylet/services.py +524 -0
- sky/skylet/skylet.py +27 -2
- sky/skylet/subprocess_daemon.py +104 -28
- sky/skypilot_config.py +99 -79
- sky/ssh_node_pools/constants.py +12 -0
- sky/ssh_node_pools/core.py +40 -3
- sky/ssh_node_pools/deploy/__init__.py +4 -0
- sky/ssh_node_pools/deploy/deploy.py +952 -0
- sky/ssh_node_pools/deploy/tunnel_utils.py +199 -0
- sky/ssh_node_pools/deploy/utils.py +173 -0
- sky/ssh_node_pools/server.py +20 -21
- sky/{utils/kubernetes/ssh_utils.py → ssh_node_pools/utils.py} +9 -6
- sky/task.py +221 -104
- sky/templates/aws-ray.yml.j2 +1 -0
- sky/templates/azure-ray.yml.j2 +1 -0
- sky/templates/cudo-ray.yml.j2 +1 -0
- sky/templates/do-ray.yml.j2 +1 -0
- sky/templates/fluidstack-ray.yml.j2 +1 -0
- sky/templates/gcp-ray.yml.j2 +1 -0
- sky/templates/hyperbolic-ray.yml.j2 +1 -0
- sky/templates/ibm-ray.yml.j2 +2 -1
- sky/templates/jobs-controller.yaml.j2 +3 -0
- sky/templates/kubernetes-ray.yml.j2 +204 -55
- sky/templates/lambda-ray.yml.j2 +1 -0
- sky/templates/nebius-ray.yml.j2 +3 -0
- sky/templates/oci-ray.yml.j2 +1 -0
- sky/templates/paperspace-ray.yml.j2 +1 -0
- sky/templates/primeintellect-ray.yml.j2 +72 -0
- sky/templates/runpod-ray.yml.j2 +1 -0
- sky/templates/scp-ray.yml.j2 +1 -0
- sky/templates/seeweb-ray.yml.j2 +171 -0
- sky/templates/shadeform-ray.yml.j2 +73 -0
- sky/templates/slurm-ray.yml.j2 +85 -0
- sky/templates/vast-ray.yml.j2 +2 -0
- sky/templates/vsphere-ray.yml.j2 +1 -0
- sky/templates/websocket_proxy.py +188 -43
- sky/usage/usage_lib.py +16 -4
- sky/users/model.conf +1 -1
- sky/users/permission.py +84 -44
- sky/users/rbac.py +31 -3
- sky/utils/accelerator_registry.py +6 -3
- sky/utils/admin_policy_utils.py +18 -5
- sky/utils/annotations.py +128 -6
- sky/utils/asyncio_utils.py +78 -0
- sky/utils/atomic.py +1 -1
- sky/utils/auth_utils.py +153 -0
- sky/utils/cli_utils/status_utils.py +12 -7
- sky/utils/cluster_utils.py +28 -6
- sky/utils/command_runner.py +283 -30
- sky/utils/command_runner.pyi +63 -7
- sky/utils/common.py +3 -1
- sky/utils/common_utils.py +55 -7
- sky/utils/config_utils.py +1 -14
- sky/utils/context.py +127 -40
- sky/utils/context_utils.py +73 -18
- sky/utils/controller_utils.py +229 -70
- sky/utils/db/db_utils.py +95 -18
- sky/utils/db/kv_cache.py +149 -0
- sky/utils/db/migration_utils.py +24 -7
- sky/utils/env_options.py +4 -0
- sky/utils/git.py +559 -1
- sky/utils/kubernetes/create_cluster.sh +15 -30
- sky/utils/kubernetes/delete_cluster.sh +10 -7
- sky/utils/kubernetes/generate_kind_config.py +6 -66
- sky/utils/kubernetes/gpu_labeler.py +13 -3
- sky/utils/kubernetes/k8s_gpu_labeler_job.yaml +2 -1
- sky/utils/kubernetes/k8s_gpu_labeler_setup.yaml +16 -16
- sky/utils/kubernetes/kubernetes_deploy_utils.py +187 -260
- sky/utils/kubernetes/rsync_helper.sh +11 -3
- sky/utils/kubernetes/ssh-tunnel.sh +7 -376
- sky/utils/kubernetes_enums.py +7 -15
- sky/utils/lock_events.py +4 -4
- sky/utils/locks.py +128 -31
- sky/utils/log_utils.py +0 -319
- sky/utils/resource_checker.py +13 -10
- sky/utils/resources_utils.py +53 -29
- sky/utils/rich_utils.py +8 -4
- sky/utils/schemas.py +138 -52
- sky/utils/subprocess_utils.py +17 -4
- sky/utils/thread_utils.py +91 -0
- sky/utils/timeline.py +2 -1
- sky/utils/ux_utils.py +35 -1
- sky/utils/volume.py +88 -4
- sky/utils/yaml_utils.py +9 -0
- sky/volumes/client/sdk.py +48 -10
- sky/volumes/server/core.py +59 -22
- sky/volumes/server/server.py +46 -17
- sky/volumes/volume.py +54 -42
- sky/workspaces/core.py +57 -21
- sky/workspaces/server.py +13 -12
- sky_templates/README.md +3 -0
- sky_templates/__init__.py +3 -0
- sky_templates/ray/__init__.py +0 -0
- sky_templates/ray/start_cluster +183 -0
- sky_templates/ray/stop_cluster +75 -0
- {skypilot_nightly-1.0.0.dev20250905.dist-info → skypilot_nightly-1.0.0.dev20251210.dist-info}/METADATA +343 -65
- skypilot_nightly-1.0.0.dev20251210.dist-info/RECORD +629 -0
- skypilot_nightly-1.0.0.dev20251210.dist-info/top_level.txt +2 -0
- sky/client/cli/git.py +0 -549
- sky/dashboard/out/_next/static/chunks/1121-408ed10b2f9fce17.js +0 -1
- sky/dashboard/out/_next/static/chunks/1141-943efc7aff0f0c06.js +0 -1
- sky/dashboard/out/_next/static/chunks/1836-37fede578e2da5f8.js +0 -40
- sky/dashboard/out/_next/static/chunks/3015-86cabed5d4669ad0.js +0 -1
- sky/dashboard/out/_next/static/chunks/3294.c80326aec9bfed40.js +0 -6
- sky/dashboard/out/_next/static/chunks/3785.4872a2f3aa489880.js +0 -1
- sky/dashboard/out/_next/static/chunks/3850-ff4a9a69d978632b.js +0 -1
- sky/dashboard/out/_next/static/chunks/4045.b30465273dc5e468.js +0 -21
- sky/dashboard/out/_next/static/chunks/4676-9da7fdbde90b5549.js +0 -10
- sky/dashboard/out/_next/static/chunks/4725.10f7a9a5d3ea8208.js +0 -1
- sky/dashboard/out/_next/static/chunks/5339.3fda4a4010ff4e06.js +0 -51
- sky/dashboard/out/_next/static/chunks/6135-4b4d5e824b7f9d3c.js +0 -1
- sky/dashboard/out/_next/static/chunks/649.b9d7f7d10c1b8c53.js +0 -45
- sky/dashboard/out/_next/static/chunks/6856-dca7962af4814e1b.js +0 -1
- sky/dashboard/out/_next/static/chunks/6990-08b2a1cae076a943.js +0 -1
- sky/dashboard/out/_next/static/chunks/7325.b4bc99ce0892dcd5.js +0 -6
- sky/dashboard/out/_next/static/chunks/754-d0da8ab45f9509e9.js +0 -18
- sky/dashboard/out/_next/static/chunks/7669.1f5d9a402bf5cc42.js +0 -36
- sky/dashboard/out/_next/static/chunks/8969-0be3036bf86f8256.js +0 -1
- sky/dashboard/out/_next/static/chunks/9025.c12318fb6a1a9093.js +0 -6
- sky/dashboard/out/_next/static/chunks/9037-fa1737818d0a0969.js +0 -6
- sky/dashboard/out/_next/static/chunks/pages/_app-ce361c6959bc2001.js +0 -34
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-1cbba24bd1bd35f8.js +0 -16
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-0b4b35dc1dfe046c.js +0 -16
- sky/dashboard/out/_next/static/chunks/pages/clusters-469814d711d63b1b.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-dd64309c3fe67ed2.js +0 -11
- sky/dashboard/out/_next/static/chunks/pages/jobs/pools/[pool]-07349868f7905d37.js +0 -16
- sky/dashboard/out/_next/static/chunks/pages/jobs-1f70d9faa564804f.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/users-018bf31cda52e11b.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/volumes-739726d6b823f532.js +0 -1
- sky/dashboard/out/_next/static/chunks/webpack-4fe903277b57b523.js +0 -1
- sky/dashboard/out/_next/static/css/4614e06482d7309e.css +0 -3
- sky/dashboard/out/_next/static/mS-4qZPSkRuA1u-g2wQhg/_buildManifest.js +0 -1
- sky/templates/kubernetes-ssh-jump.yml.j2 +0 -94
- sky/utils/kubernetes/cleanup-tunnel.sh +0 -62
- sky/utils/kubernetes/deploy_remote_cluster.py +0 -1299
- sky/utils/kubernetes/ssh_jump_lifecycle_manager.py +0 -191
- skypilot_nightly-1.0.0.dev20250905.dist-info/RECORD +0 -547
- skypilot_nightly-1.0.0.dev20250905.dist-info/top_level.txt +0 -1
- /sky/dashboard/out/_next/static/{mS-4qZPSkRuA1u-g2wQhg → KYAhEFa3FTfq4JyKVgo-s}/_ssgManifest.js +0 -0
- {skypilot_nightly-1.0.0.dev20250905.dist-info → skypilot_nightly-1.0.0.dev20251210.dist-info}/WHEEL +0 -0
- {skypilot_nightly-1.0.0.dev20250905.dist-info → skypilot_nightly-1.0.0.dev20251210.dist-info}/entry_points.txt +0 -0
- {skypilot_nightly-1.0.0.dev20250905.dist-info → skypilot_nightly-1.0.0.dev20251210.dist-info}/licenses/LICENSE +0 -0
|
@@ -2,7 +2,7 @@
|
|
|
2
2
|
import base64
|
|
3
3
|
import pickle
|
|
4
4
|
import typing
|
|
5
|
-
from typing import Any, Dict, List, Optional, Tuple
|
|
5
|
+
from typing import Any, Dict, List, Optional, Tuple, Union
|
|
6
6
|
|
|
7
7
|
from sky import jobs as managed_jobs
|
|
8
8
|
from sky import models
|
|
@@ -56,10 +56,10 @@ def decode_status(
|
|
|
56
56
|
clusters = return_value
|
|
57
57
|
response = []
|
|
58
58
|
for cluster in clusters:
|
|
59
|
-
|
|
59
|
+
# handle may not always be present in the response.
|
|
60
|
+
if 'handle' in cluster and cluster['handle'] is not None:
|
|
61
|
+
cluster['handle'] = decode_and_unpickle(cluster['handle'])
|
|
60
62
|
cluster['status'] = status_lib.ClusterStatus(cluster['status'])
|
|
61
|
-
cluster['storage_mounts_metadata'] = decode_and_unpickle(
|
|
62
|
-
cluster['storage_mounts_metadata'])
|
|
63
63
|
if 'is_managed' not in cluster:
|
|
64
64
|
cluster['is_managed'] = False
|
|
65
65
|
response.append(responses.StatusResponse.model_validate(cluster))
|
|
@@ -72,7 +72,7 @@ def decode_status_kubernetes(
|
|
|
72
72
|
List[Dict[str, Any]], Optional[str]]
|
|
73
73
|
) -> Tuple[List[kubernetes_utils.KubernetesSkyPilotClusterInfoPayload],
|
|
74
74
|
List[kubernetes_utils.KubernetesSkyPilotClusterInfoPayload],
|
|
75
|
-
List[
|
|
75
|
+
List[responses.ManagedJobRecord], Optional[str]]:
|
|
76
76
|
(encoded_all_clusters, encoded_unmanaged_clusters, all_jobs,
|
|
77
77
|
context) = return_value
|
|
78
78
|
all_clusters = []
|
|
@@ -85,6 +85,7 @@ def decode_status_kubernetes(
|
|
|
85
85
|
cluster['status'] = status_lib.ClusterStatus(cluster['status'])
|
|
86
86
|
unmanaged_clusters.append(
|
|
87
87
|
kubernetes_utils.KubernetesSkyPilotClusterInfoPayload(**cluster))
|
|
88
|
+
all_jobs = [responses.ManagedJobRecord(**job) for job in all_jobs]
|
|
88
89
|
return all_clusters, unmanaged_clusters, all_jobs, context
|
|
89
90
|
|
|
90
91
|
|
|
@@ -101,29 +102,49 @@ def decode_start(return_value: str) -> 'backends.CloudVmRayResourceHandle':
|
|
|
101
102
|
|
|
102
103
|
|
|
103
104
|
@register_decoders('queue')
|
|
104
|
-
def decode_queue(return_value: List[dict],) -> List[
|
|
105
|
+
def decode_queue(return_value: List[dict],) -> List[responses.ClusterJobRecord]:
|
|
105
106
|
jobs = return_value
|
|
106
107
|
for job in jobs:
|
|
107
108
|
job['status'] = job_lib.JobStatus(job['status'])
|
|
108
|
-
return jobs
|
|
109
|
+
return [responses.ClusterJobRecord.model_validate(job) for job in jobs]
|
|
109
110
|
|
|
110
111
|
|
|
111
112
|
@register_decoders('jobs.queue')
|
|
112
|
-
def decode_jobs_queue(return_value):
|
|
113
|
+
def decode_jobs_queue(return_value: List[dict],) -> List[Dict[str, Any]]:
|
|
114
|
+
# To keep backward compatibility with v0.10.2
|
|
115
|
+
return decode_jobs_queue_v2(return_value)
|
|
116
|
+
|
|
117
|
+
|
|
118
|
+
@register_decoders('jobs.queue_v2')
|
|
119
|
+
def decode_jobs_queue_v2(
|
|
120
|
+
return_value
|
|
121
|
+
) -> Union[Tuple[List[responses.ManagedJobRecord], int, Dict[str, int], int],
|
|
122
|
+
List[responses.ManagedJobRecord]]:
|
|
113
123
|
"""Decode jobs queue response.
|
|
114
124
|
|
|
115
|
-
Supports legacy list, or a dict {jobs, total
|
|
116
|
-
|
|
125
|
+
Supports legacy list, or a dict {jobs, total, total_no_filter,
|
|
126
|
+
status_counts}.
|
|
127
|
+
|
|
128
|
+
- Returns either list[job] or tuple(list[job], total, status_counts,
|
|
129
|
+
total_no_filter)
|
|
117
130
|
"""
|
|
118
|
-
# Case 1: dict shape {jobs, total}
|
|
119
|
-
if isinstance(return_value, dict)
|
|
131
|
+
# Case 1: dict shape {jobs, total, total_no_filter, status_counts}
|
|
132
|
+
if isinstance(return_value, dict):
|
|
120
133
|
jobs = return_value.get('jobs', [])
|
|
134
|
+
total = return_value.get('total', len(jobs))
|
|
135
|
+
total_no_filter = return_value.get('total_no_filter', total)
|
|
136
|
+
status_counts = return_value.get('status_counts', {})
|
|
137
|
+
for job in jobs:
|
|
138
|
+
job['status'] = managed_jobs.ManagedJobStatus(job['status'])
|
|
139
|
+
jobs = [responses.ManagedJobRecord(**job) for job in jobs]
|
|
140
|
+
return jobs, total, status_counts, total_no_filter
|
|
121
141
|
else:
|
|
122
142
|
# Case 2: legacy list
|
|
123
143
|
jobs = return_value
|
|
124
|
-
|
|
125
|
-
|
|
126
|
-
|
|
144
|
+
for job in jobs:
|
|
145
|
+
job['status'] = managed_jobs.ManagedJobStatus(job['status'])
|
|
146
|
+
jobs = [responses.ManagedJobRecord(**job) for job in jobs]
|
|
147
|
+
return jobs
|
|
127
148
|
|
|
128
149
|
|
|
129
150
|
def _decode_serve_status(
|
|
@@ -175,14 +196,24 @@ def decode_list_accelerators(
|
|
|
175
196
|
|
|
176
197
|
@register_decoders('storage_ls')
|
|
177
198
|
def decode_storage_ls(
|
|
178
|
-
return_value: List[Dict[str, Any]]) -> List[
|
|
199
|
+
return_value: List[Dict[str, Any]]) -> List[responses.StorageRecord]:
|
|
179
200
|
for storage_info in return_value:
|
|
180
201
|
storage_info['status'] = status_lib.StorageStatus(
|
|
181
202
|
storage_info['status'])
|
|
182
203
|
storage_info['store'] = [
|
|
183
204
|
storage.StoreType(store) for store in storage_info['store']
|
|
184
205
|
]
|
|
185
|
-
return
|
|
206
|
+
return [
|
|
207
|
+
responses.StorageRecord(**storage_info) for storage_info in return_value
|
|
208
|
+
]
|
|
209
|
+
|
|
210
|
+
|
|
211
|
+
@register_decoders('volume_list')
|
|
212
|
+
def decode_volume_list(
|
|
213
|
+
return_value: List[Dict[str, Any]]) -> List[responses.VolumeRecord]:
|
|
214
|
+
return [
|
|
215
|
+
responses.VolumeRecord(**volume_info) for volume_info in return_value
|
|
216
|
+
]
|
|
186
217
|
|
|
187
218
|
|
|
188
219
|
@register_decoders('job_status')
|
|
@@ -6,8 +6,10 @@ import base64
|
|
|
6
6
|
import dataclasses
|
|
7
7
|
import pickle
|
|
8
8
|
import typing
|
|
9
|
-
from typing import Any, Dict, List, Optional, Tuple
|
|
9
|
+
from typing import Any, Dict, List, Optional, Tuple, Union
|
|
10
10
|
|
|
11
|
+
from sky import models
|
|
12
|
+
from sky.catalog import common
|
|
11
13
|
from sky.schemas.api import responses
|
|
12
14
|
from sky.server import constants as server_constants
|
|
13
15
|
from sky.utils import serialize_utils
|
|
@@ -15,7 +17,6 @@ from sky.utils import serialize_utils
|
|
|
15
17
|
if typing.TYPE_CHECKING:
|
|
16
18
|
from sky import backends
|
|
17
19
|
from sky import clouds
|
|
18
|
-
from sky import models
|
|
19
20
|
from sky.provision.kubernetes import utils as kubernetes_utils
|
|
20
21
|
|
|
21
22
|
handlers: Dict[str, Any] = {}
|
|
@@ -60,13 +61,23 @@ def encode_status(
|
|
|
60
61
|
clusters: List[responses.StatusResponse]) -> List[Dict[str, Any]]:
|
|
61
62
|
response = []
|
|
62
63
|
for cluster in clusters:
|
|
63
|
-
response_cluster = cluster.model_dump()
|
|
64
|
+
response_cluster = cluster.model_dump(exclude_none=True)
|
|
65
|
+
# These default setting is needed because last_use and status_updated_at
|
|
66
|
+
# used to be not optional.
|
|
67
|
+
# TODO(syang): remove this after v0.12.0
|
|
68
|
+
if 'last_use' not in response_cluster:
|
|
69
|
+
response_cluster['last_use'] = ''
|
|
70
|
+
if 'status_updated_at' not in response_cluster:
|
|
71
|
+
response_cluster['status_updated_at'] = 0
|
|
64
72
|
response_cluster['status'] = cluster['status'].value
|
|
65
73
|
handle = serialize_utils.prepare_handle_for_backwards_compatibility(
|
|
66
74
|
cluster['handle'])
|
|
67
75
|
response_cluster['handle'] = pickle_and_encode(handle)
|
|
76
|
+
# TODO (syang) We still need to return this field for backwards
|
|
77
|
+
# compatibility.
|
|
78
|
+
# Remove this field at or after v0.12.0
|
|
68
79
|
response_cluster['storage_mounts_metadata'] = pickle_and_encode(
|
|
69
|
-
|
|
80
|
+
None) # Always returns None.
|
|
70
81
|
response.append(response_cluster)
|
|
71
82
|
return response
|
|
72
83
|
|
|
@@ -92,10 +103,14 @@ def encode_start(resource_handle: 'backends.CloudVmRayResourceHandle') -> str:
|
|
|
92
103
|
|
|
93
104
|
|
|
94
105
|
@register_encoder('queue')
|
|
95
|
-
def encode_queue(
|
|
106
|
+
def encode_queue(
|
|
107
|
+
jobs: List[responses.ClusterJobRecord],) -> List[Dict[str, Any]]:
|
|
108
|
+
response = []
|
|
96
109
|
for job in jobs:
|
|
97
|
-
|
|
98
|
-
|
|
110
|
+
response_job = job.model_dump()
|
|
111
|
+
response_job['status'] = job['status'].value
|
|
112
|
+
response.append(response_job)
|
|
113
|
+
return response
|
|
99
114
|
|
|
100
115
|
|
|
101
116
|
@register_encoder('status_kubernetes')
|
|
@@ -103,7 +118,7 @@ def encode_status_kubernetes(
|
|
|
103
118
|
return_value: Tuple[
|
|
104
119
|
List['kubernetes_utils.KubernetesSkyPilotClusterInfoPayload'],
|
|
105
120
|
List['kubernetes_utils.KubernetesSkyPilotClusterInfoPayload'],
|
|
106
|
-
List[
|
|
121
|
+
List[responses.ManagedJobRecord], Optional[str]]
|
|
107
122
|
) -> Tuple[List[Dict[str, Any]], List[Dict[str, Any]], List[Dict[str, Any]],
|
|
108
123
|
Optional[str]]:
|
|
109
124
|
all_clusters, unmanaged_clusters, all_jobs, context = return_value
|
|
@@ -117,13 +132,22 @@ def encode_status_kubernetes(
|
|
|
117
132
|
encoded_cluster = dataclasses.asdict(cluster)
|
|
118
133
|
encoded_cluster['status'] = encoded_cluster['status'].value
|
|
119
134
|
encoded_unmanaged_clusters.append(encoded_cluster)
|
|
135
|
+
all_jobs = [job.model_dump(by_alias=True) for job in all_jobs]
|
|
120
136
|
return encoded_all_clusters, encoded_unmanaged_clusters, all_jobs, context
|
|
121
137
|
|
|
122
138
|
|
|
123
139
|
@register_encoder('jobs.queue')
|
|
124
|
-
def encode_jobs_queue(
|
|
140
|
+
def encode_jobs_queue(jobs: List[dict],) -> List[Dict[str, Any]]:
|
|
141
|
+
for job in jobs:
|
|
142
|
+
job['status'] = job['status'].value
|
|
143
|
+
return jobs
|
|
144
|
+
|
|
145
|
+
|
|
146
|
+
@register_encoder('jobs.queue_v2')
|
|
147
|
+
def encode_jobs_queue_v2(
|
|
148
|
+
jobs_or_tuple) -> Union[List[Dict[str, Any]], Dict[str, Any]]:
|
|
125
149
|
# Support returning either a plain jobs list or a (jobs, total) tuple
|
|
126
|
-
status_counts = {}
|
|
150
|
+
status_counts: Dict[str, int] = {}
|
|
127
151
|
if isinstance(jobs_or_tuple, tuple):
|
|
128
152
|
if len(jobs_or_tuple) == 2:
|
|
129
153
|
jobs, total = jobs_or_tuple
|
|
@@ -135,12 +159,13 @@ def encode_jobs_queue(jobs_or_tuple):
|
|
|
135
159
|
else:
|
|
136
160
|
jobs = jobs_or_tuple
|
|
137
161
|
total = None
|
|
138
|
-
for job in jobs
|
|
162
|
+
jobs_dict = [job.model_dump(by_alias=True) for job in jobs]
|
|
163
|
+
for job in jobs_dict:
|
|
139
164
|
job['status'] = job['status'].value
|
|
140
165
|
if total is None:
|
|
141
|
-
return
|
|
166
|
+
return jobs_dict
|
|
142
167
|
return {
|
|
143
|
-
'jobs':
|
|
168
|
+
'jobs': jobs_dict,
|
|
144
169
|
'total': total,
|
|
145
170
|
'total_no_filter': total_no_filter,
|
|
146
171
|
'status_counts': status_counts
|
|
@@ -177,8 +202,9 @@ def encode_cost_report(
|
|
|
177
202
|
for cluster_report in cost_report:
|
|
178
203
|
if cluster_report['status'] is not None:
|
|
179
204
|
cluster_report['status'] = cluster_report['status'].value
|
|
180
|
-
|
|
181
|
-
cluster_report['resources']
|
|
205
|
+
if 'resources' in cluster_report:
|
|
206
|
+
cluster_report['resources'] = pickle_and_encode(
|
|
207
|
+
cluster_report['resources'])
|
|
182
208
|
return cost_report
|
|
183
209
|
|
|
184
210
|
|
|
@@ -190,19 +216,26 @@ def encode_enabled_clouds(clouds: List['clouds.Cloud']) -> List[str]:
|
|
|
190
216
|
|
|
191
217
|
@register_encoder('storage_ls')
def encode_storage_ls(
        return_value: List[responses.StorageRecord]) -> List[Dict[str, Any]]:
    """Encode the return value of the `storage_ls` request.

    Each record is dumped to a dict with `model_dump()`; its `status` is then
    replaced by `status.value` and every element of `store` by its `.value`.

    Args:
        return_value: Storage records to encode.

    Returns:
        One dict per input record, in the same order.
    """
    encoded_records = []
    for record in return_value:
        payload = record.model_dump()
        payload['status'] = payload['status'].value
        payload['store'] = [kind.value for kind in payload['store']]
        encoded_records.append(payload)
    return encoded_records
|
|
225
|
+
|
|
226
|
+
|
|
227
|
+
@register_encoder('volume_list')
def encode_volume_list(
        return_value: List[responses.VolumeRecord]) -> List[Dict[str, Any]]:
    """Encode the return value of the `volume_list` request.

    Args:
        return_value: Volume records to encode.

    Returns:
        The `model_dump()` of each record, in input order.
    """
    dumped_volumes = []
    for record in return_value:
        dumped_volumes.append(record.model_dump())
    return dumped_volumes
|
|
198
231
|
|
|
199
232
|
|
|
200
233
|
@register_encoder('job_status')
def encode_job_status(return_value: Dict[int, Any]) -> Dict[str, str]:
    """Encode the return value of the `job_status` request.

    Non-None statuses are replaced in place by their `.value`; None entries
    are left as None. The result is a new dict keyed by the stringified job id.

    Args:
        return_value: Mapping of job id to a status object (or None). It is
            mutated by this call.

    Returns:
        A new dict mapping `str(job_id)` to the converted status.
    """
    # Only values are reassigned, so iterating items() while updating is safe.
    for job_id, status in return_value.items():
        if status is not None:
            return_value[job_id] = status.value
    stringified = {}
    for job_id, status in return_value.items():
        stringified[str(job_id)] = status
    return stringified
|
|
206
239
|
|
|
207
240
|
|
|
208
241
|
@register_encoder('kubernetes_node_info')
|
|
@@ -214,3 +247,52 @@ def encode_kubernetes_node_info(
|
|
|
214
247
|
@register_encoder('endpoints')
def encode_endpoints(return_value: Dict[int, str]) -> Dict[str, str]:
    """Encode the return value of the `endpoints` request.

    Args:
        return_value: Mapping with integer keys and string values.

    Returns:
        A new dict with every key converted to `str`; values are unchanged.
    """
    encoded: Dict[str, str] = {}
    for key, endpoint in return_value.items():
        encoded[str(key)] = endpoint
    return encoded
|
|
250
|
+
|
|
251
|
+
|
|
252
|
+
@register_encoder('realtime_kubernetes_gpu_availability')
def encode_realtime_gpu_availability(
    return_value: List[Tuple[str,
                             List[Any]]]) -> List[Tuple[str, List[List[Any]]]]:
    """Encode the `realtime_kubernetes_gpu_availability` return value.

    RealtimeGpuAvailability namedtuples are turned into plain lists for JSON
    serialization.

    Args:
        return_value: Pairs of (context, list of RealtimeGpuAvailability).

    Returns:
        Pairs of (context, list of lists), in the same order as the input.
    """
    result = []
    for ctx, availabilities in return_value:
        rows = []
        for gpu in availabilities:
            assert isinstance(gpu, models.RealtimeGpuAvailability), (
                f'Expected RealtimeGpuAvailability, got {type(gpu)}')
            rows.append([*gpu])
        result.append((ctx, rows))
    return result
|
|
267
|
+
|
|
268
|
+
|
|
269
|
+
@register_encoder('realtime_slurm_gpu_availability')
def encode_realtime_slurm_gpu_availability(
    return_value: List[Tuple[str,
                             List[Any]]]) -> List[Tuple[str, List[List[Any]]]]:
    """Encode the `realtime_slurm_gpu_availability` return value.

    RealtimeGpuAvailability namedtuples are turned into plain lists for JSON
    serialization.

    Args:
        return_value: Pairs of (context, list of RealtimeGpuAvailability).

    Returns:
        Pairs of (context, list of lists), in the same order as the input.
    """
    result = []
    for ctx, availabilities in return_value:
        rows = []
        for gpu in availabilities:
            assert isinstance(gpu, models.RealtimeGpuAvailability), (
                f'Expected RealtimeGpuAvailability, got {type(gpu)}')
            rows.append([*gpu])
        result.append((ctx, rows))
    return result
|
|
284
|
+
|
|
285
|
+
|
|
286
|
+
@register_encoder('list_accelerators')
def encode_list_accelerators(
        return_value: Dict[str, List[Any]]) -> Dict[str, Any]:
    """Encode the return value of the `list_accelerators` request.

    InstanceTypeInfo namedtuples are turned into plain lists for JSON
    serialization.

    Args:
        return_value: Mapping of accelerator name to InstanceTypeInfo entries.

    Returns:
        A new dict mapping each accelerator name to a list of lists.
    """
    result: Dict[str, Any] = {}
    for name, infos in return_value.items():
        rows: List[Any] = []
        for instance in infos:
            assert isinstance(instance, common.InstanceTypeInfo), (
                f'Expected InstanceTypeInfo, got {type(instance)}')
            rows.append([*instance])
        result[name] = rows
    return result
|
|
@@ -0,0 +1,60 @@
|
|
|
1
|
+
"""Version-aware serializers for request return values.
|
|
2
|
+
|
|
3
|
+
These serializers run at encode() time when remote_api_version is available,
|
|
4
|
+
to handle backward compatibility for old clients.
|
|
5
|
+
|
|
6
|
+
The existing encoders.py handles object -> dict conversion at set_return_value()
|
|
7
|
+
time. This module handles dict -> JSON string serialization at encode() time,
|
|
8
|
+
with version-aware field filtering for backward compatibility.
|
|
9
|
+
"""
|
|
10
|
+
from typing import Any, Callable, Dict
|
|
11
|
+
|
|
12
|
+
import orjson
|
|
13
|
+
|
|
14
|
+
from sky.server import constants as server_constants
|
|
15
|
+
from sky.server import versions
|
|
16
|
+
|
|
17
|
+
handlers: Dict[str, Callable[[Any], str]] = {}
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
def register_serializer(*names: str):
    """Decorator to register a version-aware serializer.

    The decorated function is stored in `handlers` once per given name. Every
    name other than `server_constants.DEFAULT_HANDLER_NAME` is prefixed with
    `server_constants.REQUEST_NAME_PREFIX` before being used as the key.

    Args:
        *names: Names to register the serializer under.

    Raises:
        ValueError: If a key is already present in `handlers`. This is raised
            when the decorator is applied.
    """

    def decorator(func):
        for raw_name in names:
            if raw_name == server_constants.DEFAULT_HANDLER_NAME:
                key = raw_name
            else:
                key = server_constants.REQUEST_NAME_PREFIX + raw_name
            if key in handlers:
                raise ValueError(f'Serializer {key} already registered: '
                                 f'{handlers[key]}')
            handlers[key] = func
        # Return the function unchanged so it stays directly callable.
        return func

    return decorator
|
|
34
|
+
|
|
35
|
+
|
|
36
|
+
def get_serializer(name: str) -> Callable[[Any], str]:
    """Get the serializer for a request name.

    Args:
        name: Key to look up in `handlers`.

    Returns:
        The serializer registered under `name`, or the one registered under
        `server_constants.DEFAULT_HANDLER_NAME` when there is none.

    Raises:
        KeyError: If no default serializer is registered. The default is
            always looked up, even when `name` itself is registered.
    """
    fallback = handlers[server_constants.DEFAULT_HANDLER_NAME]
    return handlers.get(name, fallback)
|
|
39
|
+
|
|
40
|
+
|
|
41
|
+
@register_serializer(server_constants.DEFAULT_HANDLER_NAME)
def default_serializer(return_value: Any) -> str:
    """The default serializer.

    Args:
        return_value: Value to serialize with `orjson.dumps`.

    Returns:
        The orjson output decoded as a UTF-8 string.
    """
    encoded = orjson.dumps(return_value)
    return encoded.decode('utf-8')
|
|
45
|
+
|
|
46
|
+
|
|
47
|
+
@register_serializer('kubernetes_node_info')
def serialize_kubernetes_node_info(return_value: Dict[str, Any]) -> str:
    """Serialize kubernetes node info with version compatibility.

    The is_ready field was added in API version 25. Remove it for old clients
    that don't recognize it.

    Args:
        return_value: Encoded node info. When the remote API version is known
            and below 25, `is_ready` is popped in place from every entry of
            `return_value['node_info_dict']`.

    Returns:
        The orjson output for `return_value`, decoded as a UTF-8 string.
    """
    remote_api_version = versions.get_remote_api_version()
    is_old_client = (remote_api_version is not None and
                     remote_api_version < 25)
    if return_value and is_old_client:
        # Remove is_ready field for old clients that don't recognize it
        node_infos = return_value.get('node_info_dict', {})
        for node_info in node_infos.values():
            node_info.pop('is_ready', None)
    serialized = orjson.dumps(return_value)
    return serialized.decode('utf-8')
|
|
@@ -0,0 +1,117 @@
|
|
|
1
|
+
"""Request execution threads management."""
|
|
2
|
+
|
|
3
|
+
import concurrent.futures
|
|
4
|
+
import sys
|
|
5
|
+
import threading
|
|
6
|
+
from typing import Callable, Set, TypeVar
|
|
7
|
+
|
|
8
|
+
from sky import exceptions
|
|
9
|
+
from sky import sky_logging
|
|
10
|
+
from sky.utils import atomic
|
|
11
|
+
|
|
12
|
+
# pylint: disable=ungrouped-imports
|
|
13
|
+
if sys.version_info >= (3, 10):
|
|
14
|
+
from typing import ParamSpec
|
|
15
|
+
else:
|
|
16
|
+
from typing_extensions import ParamSpec
|
|
17
|
+
|
|
18
|
+
_P = ParamSpec('_P')
|
|
19
|
+
_T = TypeVar('_T')
|
|
20
|
+
|
|
21
|
+
logger = sky_logging.init_logger(__name__)
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
class OnDemandThreadExecutor(concurrent.futures.Executor):
    """An executor that creates a new thread for each task and destroys it
    after the task is completed.

    Note(dev):
    We raise an error instead of queuing the request if the limit is reached, so
    that:
    1. the request might be handled by other processes that have idle workers
       upon retry;
    2. if not, then users can be clearly hinted that they need to scale the API
       server to support higher concurrency.
    So this executor is only suitable for carefully selected cases where the
    error can be properly handled by caller. To make this executor general, we
    need to support configuring the queuing behavior (exception or queueing).
    """

    def __init__(self, name: str, max_workers: int):
        """Create an executor.

        Args:
            name: Label used in thread names, log messages and errors.
            max_workers: Maximum number of worker slots that may be taken at
                the same time.
        """
        self.name: str = name
        self.max_workers: int = max_workers
        # Number of worker slots currently taken (running tasks plus slots
        # borrowed through check_available(borrow=True)).
        self.running: atomic.AtomicInt = atomic.AtomicInt(0)
        self._shutdown: bool = False
        self._shutdown_lock: threading.Lock = threading.Lock()
        # Threads created by submit(); each is removed once its task ends.
        self._threads: Set[threading.Thread] = set()
        self._threads_lock: threading.Lock = threading.Lock()

    def _cleanup_thread(self, thread: threading.Thread):
        """Forget `thread`; a no-op if it is not tracked."""
        with self._threads_lock:
            self._threads.discard(thread)

    def _task_wrapper(self, fn: Callable, fut: concurrent.futures.Future, /,
                      *args, **kwargs):
        """Run `fn(*args, **kwargs)` in the worker thread and resolve `fut`.

        The worker slot is released and the thread is untracked in all cases,
        including when the future was cancelled before the task started.
        """
        try:
            # Mark the future as running so it can no longer be cancelled
            # underneath us. If it was already cancelled, skip the task
            # instead of hitting InvalidStateError when setting the result.
            if not fut.set_running_or_notify_cancel():
                return
            try:
                result = fn(*args, **kwargs)
            # BaseException is caught (as the stdlib thread pool does) so the
            # future is always resolved and waiters never hang.
            except BaseException as e:  # pylint: disable=broad-except
                logger.debug(
                    f'Executor [{self.name}] error executing {fn}: {e}')
                fut.set_exception(e)
            else:
                fut.set_result(result)
        finally:
            self.running.decrement()
            self._cleanup_thread(threading.current_thread())

    def check_available(self, borrow: bool = False) -> int:
        """Check if there are available workers.

        Args:
            borrow: If True, the caller borrow a worker from the executor.
                The caller is responsible for returning the worker to the
                executor after the task is completed.

        Returns:
            The value returned by incrementing the running counter for this
            check (presumably the new count, including this caller).

        Raises:
            exceptions.ConcurrentWorkerExhaustedError: If taking one more
                worker would exceed `max_workers`.
        """
        count = self.running.increment()
        if count > self.max_workers:
            # Give the slot back before reporting exhaustion.
            self.running.decrement()
            raise exceptions.ConcurrentWorkerExhaustedError(
                f'Maximum concurrent workers {self.max_workers} of threads '
                f'executor [{self.name}] reached')
        if not borrow:
            # Pure availability probe: do not keep the slot.
            self.running.decrement()
        return count

    def submit(self, fn: Callable[_P, _T], *args: _P.args,
               **kwargs: _P.kwargs) -> 'concurrent.futures.Future[_T]':
        """Start `fn(*args, **kwargs)` on a new daemon thread.

        Returns:
            A future resolved with the task's result or exception.

        Raises:
            RuntimeError: If the executor has been shut down.
            exceptions.ConcurrentWorkerExhaustedError: If no worker slot is
                available.
        """
        with self._shutdown_lock:
            if self._shutdown:
                raise RuntimeError(
                    'Cannot submit task after executor is shutdown')
        # Takes a worker slot; released by _task_wrapper (or below on failure).
        count = self.check_available(borrow=True)
        fut: concurrent.futures.Future = concurrent.futures.Future()
        # Name is assigned for debugging purpose, duplication is fine
        thread = threading.Thread(target=self._task_wrapper,
                                  name=f'{self.name}-{count}',
                                  args=(fn, fut, *args),
                                  kwargs=kwargs,
                                  daemon=True)
        with self._threads_lock:
            self._threads.add(thread)
        try:
            thread.start()
        except Exception as e:
            # The wrapper never ran, so undo the bookkeeping here.
            self.running.decrement()
            self._cleanup_thread(thread)
            fut.set_exception(e)
            raise
        assert thread.ident is not None, 'Thread should be started'
        return fut

    def shutdown(self, wait=True, *, cancel_futures=False):
        """Stop accepting new tasks and optionally wait for running ones.

        Args:
            wait: If True, join every thread tracked at the time of the call.
            cancel_futures: Accepted for compatibility with
                `concurrent.futures.Executor.shutdown`. It has no effect:
                submit() starts a thread for each task immediately, so there
                is no queue of pending tasks to cancel.
        """
        del cancel_futures  # Unused.
        with self._shutdown_lock:
            self._shutdown = True
        if not wait:
            return
        # Snapshot under the lock; finishing workers mutate the set.
        with self._threads_lock:
            threads = list(self._threads)
        for t in threads:
            t.join()
|