skypilot-nightly 1.0.0.dev20250502__py3-none-any.whl → 1.0.0.dev20251203__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sky/__init__.py +22 -6
- sky/adaptors/aws.py +81 -16
- sky/adaptors/common.py +25 -2
- sky/adaptors/coreweave.py +278 -0
- sky/adaptors/do.py +8 -2
- sky/adaptors/gcp.py +11 -0
- sky/adaptors/hyperbolic.py +8 -0
- sky/adaptors/ibm.py +5 -2
- sky/adaptors/kubernetes.py +149 -18
- sky/adaptors/nebius.py +173 -30
- sky/adaptors/primeintellect.py +1 -0
- sky/adaptors/runpod.py +68 -0
- sky/adaptors/seeweb.py +183 -0
- sky/adaptors/shadeform.py +89 -0
- sky/admin_policy.py +187 -4
- sky/authentication.py +179 -225
- sky/backends/__init__.py +4 -2
- sky/backends/backend.py +22 -9
- sky/backends/backend_utils.py +1323 -397
- sky/backends/cloud_vm_ray_backend.py +1749 -1029
- sky/backends/docker_utils.py +1 -1
- sky/backends/local_docker_backend.py +11 -6
- sky/backends/task_codegen.py +633 -0
- sky/backends/wheel_utils.py +55 -9
- sky/{clouds/service_catalog → catalog}/__init__.py +21 -19
- sky/{clouds/service_catalog → catalog}/aws_catalog.py +27 -8
- sky/{clouds/service_catalog → catalog}/azure_catalog.py +10 -7
- sky/{clouds/service_catalog → catalog}/common.py +90 -49
- sky/{clouds/service_catalog → catalog}/cudo_catalog.py +8 -5
- sky/{clouds/service_catalog → catalog}/data_fetchers/analyze.py +1 -1
- sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_aws.py +116 -80
- sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_cudo.py +38 -38
- sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_gcp.py +70 -16
- sky/catalog/data_fetchers/fetch_hyperbolic.py +136 -0
- sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_lambda_cloud.py +1 -0
- sky/catalog/data_fetchers/fetch_nebius.py +338 -0
- sky/catalog/data_fetchers/fetch_runpod.py +698 -0
- sky/catalog/data_fetchers/fetch_seeweb.py +329 -0
- sky/catalog/data_fetchers/fetch_shadeform.py +142 -0
- sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_vast.py +1 -1
- sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_vsphere.py +1 -1
- sky/{clouds/service_catalog → catalog}/do_catalog.py +5 -2
- sky/{clouds/service_catalog → catalog}/fluidstack_catalog.py +6 -3
- sky/{clouds/service_catalog → catalog}/gcp_catalog.py +41 -15
- sky/catalog/hyperbolic_catalog.py +136 -0
- sky/{clouds/service_catalog → catalog}/ibm_catalog.py +9 -6
- sky/{clouds/service_catalog → catalog}/kubernetes_catalog.py +36 -24
- sky/{clouds/service_catalog → catalog}/lambda_catalog.py +9 -6
- sky/{clouds/service_catalog → catalog}/nebius_catalog.py +9 -7
- sky/{clouds/service_catalog → catalog}/oci_catalog.py +9 -6
- sky/{clouds/service_catalog → catalog}/paperspace_catalog.py +5 -2
- sky/catalog/primeintellect_catalog.py +95 -0
- sky/{clouds/service_catalog → catalog}/runpod_catalog.py +11 -4
- sky/{clouds/service_catalog → catalog}/scp_catalog.py +9 -6
- sky/catalog/seeweb_catalog.py +184 -0
- sky/catalog/shadeform_catalog.py +165 -0
- sky/catalog/ssh_catalog.py +167 -0
- sky/{clouds/service_catalog → catalog}/vast_catalog.py +6 -3
- sky/{clouds/service_catalog → catalog}/vsphere_catalog.py +5 -2
- sky/check.py +533 -185
- sky/cli.py +5 -5975
- sky/client/{cli.py → cli/command.py} +2591 -1956
- sky/client/cli/deprecation_utils.py +99 -0
- sky/client/cli/flags.py +359 -0
- sky/client/cli/table_utils.py +322 -0
- sky/client/cli/utils.py +79 -0
- sky/client/common.py +78 -32
- sky/client/oauth.py +82 -0
- sky/client/sdk.py +1219 -319
- sky/client/sdk_async.py +827 -0
- sky/client/service_account_auth.py +47 -0
- sky/cloud_stores.py +82 -3
- sky/clouds/__init__.py +13 -0
- sky/clouds/aws.py +564 -164
- sky/clouds/azure.py +105 -83
- sky/clouds/cloud.py +140 -40
- sky/clouds/cudo.py +68 -50
- sky/clouds/do.py +66 -48
- sky/clouds/fluidstack.py +63 -44
- sky/clouds/gcp.py +339 -110
- sky/clouds/hyperbolic.py +293 -0
- sky/clouds/ibm.py +70 -49
- sky/clouds/kubernetes.py +570 -162
- sky/clouds/lambda_cloud.py +74 -54
- sky/clouds/nebius.py +210 -81
- sky/clouds/oci.py +88 -66
- sky/clouds/paperspace.py +61 -44
- sky/clouds/primeintellect.py +317 -0
- sky/clouds/runpod.py +164 -74
- sky/clouds/scp.py +89 -86
- sky/clouds/seeweb.py +477 -0
- sky/clouds/shadeform.py +400 -0
- sky/clouds/ssh.py +263 -0
- sky/clouds/utils/aws_utils.py +10 -4
- sky/clouds/utils/gcp_utils.py +87 -11
- sky/clouds/utils/oci_utils.py +38 -14
- sky/clouds/utils/scp_utils.py +231 -167
- sky/clouds/vast.py +99 -77
- sky/clouds/vsphere.py +51 -40
- sky/core.py +375 -173
- sky/dag.py +15 -0
- sky/dashboard/out/404.html +1 -1
- sky/dashboard/out/_next/static/96_E2yl3QAiIJGOYCkSpB/_buildManifest.js +1 -0
- sky/dashboard/out/_next/static/chunks/1141-e6aa9ab418717c59.js +11 -0
- sky/dashboard/out/_next/static/chunks/1272-1ef0bf0237faccdb.js +1 -0
- sky/dashboard/out/_next/static/chunks/1871-7e202677c42f43fe.js +6 -0
- sky/dashboard/out/_next/static/chunks/2260-7703229c33c5ebd5.js +1 -0
- sky/dashboard/out/_next/static/chunks/2350.fab69e61bac57b23.js +1 -0
- sky/dashboard/out/_next/static/chunks/2369.fc20f0c2c8ed9fe7.js +15 -0
- sky/dashboard/out/_next/static/chunks/2755.edd818326d489a1d.js +26 -0
- sky/dashboard/out/_next/static/chunks/3294.20a8540fe697d5ee.js +1 -0
- sky/dashboard/out/_next/static/chunks/3785.7e245f318f9d1121.js +1 -0
- sky/dashboard/out/_next/static/chunks/3800-7b45f9fbb6308557.js +1 -0
- sky/dashboard/out/_next/static/chunks/3850-ff4a9a69d978632b.js +1 -0
- sky/dashboard/out/_next/static/chunks/3937.210053269f121201.js +1 -0
- sky/dashboard/out/_next/static/chunks/4725.172ede95d1b21022.js +1 -0
- sky/dashboard/out/_next/static/chunks/4937.a2baa2df5572a276.js +15 -0
- sky/dashboard/out/_next/static/chunks/5739-d67458fcb1386c92.js +8 -0
- sky/dashboard/out/_next/static/chunks/6130-2be46d70a38f1e82.js +1 -0
- sky/dashboard/out/_next/static/chunks/616-3d59f75e2ccf9321.js +39 -0
- sky/dashboard/out/_next/static/chunks/6212-7bd06f60ba693125.js +13 -0
- sky/dashboard/out/_next/static/chunks/6856-8f27d1c10c98def8.js +1 -0
- sky/dashboard/out/_next/static/chunks/6989-01359c57e018caa4.js +1 -0
- sky/dashboard/out/_next/static/chunks/6990-9146207c4567fdfd.js +1 -0
- sky/dashboard/out/_next/static/chunks/7359-c8d04e06886000b3.js +30 -0
- sky/dashboard/out/_next/static/chunks/7411-b15471acd2cba716.js +41 -0
- sky/dashboard/out/_next/static/chunks/7615-019513abc55b3b47.js +1 -0
- sky/dashboard/out/_next/static/chunks/8640.5b9475a2d18c5416.js +16 -0
- sky/dashboard/out/_next/static/chunks/8969-452f9d5cbdd2dc73.js +1 -0
- sky/dashboard/out/_next/static/chunks/9025.fa408f3242e9028d.js +6 -0
- sky/dashboard/out/_next/static/chunks/9353-cff34f7e773b2e2b.js +1 -0
- sky/dashboard/out/_next/static/chunks/9360.a536cf6b1fa42355.js +31 -0
- sky/dashboard/out/_next/static/chunks/9847.3aaca6bb33455140.js +30 -0
- sky/dashboard/out/_next/static/chunks/fd9d1056-86323a29a8f7e46a.js +1 -0
- sky/dashboard/out/_next/static/chunks/framework-cf60a09ccd051a10.js +33 -0
- sky/dashboard/out/_next/static/chunks/main-app-587214043926b3cc.js +1 -0
- sky/dashboard/out/_next/static/chunks/main-f15ccb73239a3bf1.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/_app-bde01e4a2beec258.js +34 -0
- sky/dashboard/out/_next/static/chunks/pages/_error-c66a4e8afc46f17b.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-792db96d918c98c9.js +16 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-abfcac9c137aa543.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters-ee39056f9851a3ff.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/config-dfb9bf07b13045f4.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/index-444f1804401f04ea.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/infra/[context]-c0b5935149902e6f.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/infra-aed0ea19df7cf961.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-d66997e2bfc837cf.js +16 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs/pools/[pool]-9faf940b253e3e06.js +21 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs-2072b48b617989c9.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/users-f42674164aa73423.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/volumes-b84b948ff357c43e.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/workspace/new-3f88a1c7e86a3f86.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/workspaces/[name]-84a40f8c7c627fe4.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/workspaces-531b2f8c4bf89f82.js +1 -0
- sky/dashboard/out/_next/static/chunks/webpack-64e05f17bf2cf8ce.js +1 -0
- sky/dashboard/out/_next/static/css/0748ce22df867032.css +3 -0
- sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
- sky/dashboard/out/clusters/[cluster].html +1 -1
- sky/dashboard/out/clusters.html +1 -1
- sky/dashboard/out/config.html +1 -0
- sky/dashboard/out/index.html +1 -1
- sky/dashboard/out/infra/[context].html +1 -0
- sky/dashboard/out/infra.html +1 -0
- sky/dashboard/out/jobs/[job].html +1 -1
- sky/dashboard/out/jobs/pools/[pool].html +1 -0
- sky/dashboard/out/jobs.html +1 -1
- sky/dashboard/out/users.html +1 -0
- sky/dashboard/out/volumes.html +1 -0
- sky/dashboard/out/workspace/new.html +1 -0
- sky/dashboard/out/workspaces/[name].html +1 -0
- sky/dashboard/out/workspaces.html +1 -0
- sky/data/data_utils.py +137 -1
- sky/data/mounting_utils.py +269 -84
- sky/data/storage.py +1460 -1807
- sky/data/storage_utils.py +43 -57
- sky/exceptions.py +126 -2
- sky/execution.py +216 -63
- sky/global_user_state.py +2390 -586
- sky/jobs/__init__.py +7 -0
- sky/jobs/client/sdk.py +300 -58
- sky/jobs/client/sdk_async.py +161 -0
- sky/jobs/constants.py +15 -8
- sky/jobs/controller.py +848 -275
- sky/jobs/file_content_utils.py +128 -0
- sky/jobs/log_gc.py +193 -0
- sky/jobs/recovery_strategy.py +402 -152
- sky/jobs/scheduler.py +314 -189
- sky/jobs/server/core.py +836 -255
- sky/jobs/server/server.py +156 -115
- sky/jobs/server/utils.py +136 -0
- sky/jobs/state.py +2109 -706
- sky/jobs/utils.py +1306 -215
- sky/logs/__init__.py +21 -0
- sky/logs/agent.py +108 -0
- sky/logs/aws.py +243 -0
- sky/logs/gcp.py +91 -0
- sky/metrics/__init__.py +0 -0
- sky/metrics/utils.py +453 -0
- sky/models.py +78 -1
- sky/optimizer.py +164 -70
- sky/provision/__init__.py +90 -4
- sky/provision/aws/config.py +147 -26
- sky/provision/aws/instance.py +136 -50
- sky/provision/azure/instance.py +11 -6
- sky/provision/common.py +13 -1
- sky/provision/cudo/cudo_machine_type.py +1 -1
- sky/provision/cudo/cudo_utils.py +14 -8
- sky/provision/cudo/cudo_wrapper.py +72 -71
- sky/provision/cudo/instance.py +10 -6
- sky/provision/do/instance.py +10 -6
- sky/provision/do/utils.py +4 -3
- sky/provision/docker_utils.py +140 -33
- sky/provision/fluidstack/instance.py +13 -8
- sky/provision/gcp/__init__.py +1 -0
- sky/provision/gcp/config.py +301 -19
- sky/provision/gcp/constants.py +218 -0
- sky/provision/gcp/instance.py +36 -8
- sky/provision/gcp/instance_utils.py +18 -4
- sky/provision/gcp/volume_utils.py +247 -0
- sky/provision/hyperbolic/__init__.py +12 -0
- sky/provision/hyperbolic/config.py +10 -0
- sky/provision/hyperbolic/instance.py +437 -0
- sky/provision/hyperbolic/utils.py +373 -0
- sky/provision/instance_setup.py +101 -20
- sky/provision/kubernetes/__init__.py +5 -0
- sky/provision/kubernetes/config.py +9 -52
- sky/provision/kubernetes/constants.py +17 -0
- sky/provision/kubernetes/instance.py +919 -280
- sky/provision/kubernetes/manifests/fusermount-server-daemonset.yaml +1 -2
- sky/provision/kubernetes/network.py +27 -17
- sky/provision/kubernetes/network_utils.py +44 -43
- sky/provision/kubernetes/utils.py +1221 -534
- sky/provision/kubernetes/volume.py +343 -0
- sky/provision/lambda_cloud/instance.py +22 -16
- sky/provision/nebius/constants.py +50 -0
- sky/provision/nebius/instance.py +19 -6
- sky/provision/nebius/utils.py +237 -137
- sky/provision/oci/instance.py +10 -5
- sky/provision/paperspace/instance.py +10 -7
- sky/provision/paperspace/utils.py +1 -1
- sky/provision/primeintellect/__init__.py +10 -0
- sky/provision/primeintellect/config.py +11 -0
- sky/provision/primeintellect/instance.py +454 -0
- sky/provision/primeintellect/utils.py +398 -0
- sky/provision/provisioner.py +117 -36
- sky/provision/runpod/__init__.py +5 -0
- sky/provision/runpod/instance.py +27 -6
- sky/provision/runpod/utils.py +51 -18
- sky/provision/runpod/volume.py +214 -0
- sky/provision/scp/__init__.py +15 -0
- sky/provision/scp/config.py +93 -0
- sky/provision/scp/instance.py +707 -0
- sky/provision/seeweb/__init__.py +11 -0
- sky/provision/seeweb/config.py +13 -0
- sky/provision/seeweb/instance.py +812 -0
- sky/provision/shadeform/__init__.py +11 -0
- sky/provision/shadeform/config.py +12 -0
- sky/provision/shadeform/instance.py +351 -0
- sky/provision/shadeform/shadeform_utils.py +83 -0
- sky/provision/ssh/__init__.py +18 -0
- sky/provision/vast/instance.py +13 -8
- sky/provision/vast/utils.py +10 -7
- sky/provision/volume.py +164 -0
- sky/provision/vsphere/common/ssl_helper.py +1 -1
- sky/provision/vsphere/common/vapiconnect.py +2 -1
- sky/provision/vsphere/common/vim_utils.py +4 -4
- sky/provision/vsphere/instance.py +15 -10
- sky/provision/vsphere/vsphere_utils.py +17 -20
- sky/py.typed +0 -0
- sky/resources.py +845 -119
- sky/schemas/__init__.py +0 -0
- sky/schemas/api/__init__.py +0 -0
- sky/schemas/api/responses.py +227 -0
- sky/schemas/db/README +4 -0
- sky/schemas/db/env.py +90 -0
- sky/schemas/db/global_user_state/001_initial_schema.py +124 -0
- sky/schemas/db/global_user_state/002_add_workspace_to_cluster_history.py +35 -0
- sky/schemas/db/global_user_state/003_fix_initial_revision.py +61 -0
- sky/schemas/db/global_user_state/004_is_managed.py +34 -0
- sky/schemas/db/global_user_state/005_cluster_event.py +32 -0
- sky/schemas/db/global_user_state/006_provision_log.py +41 -0
- sky/schemas/db/global_user_state/007_cluster_event_request_id.py +34 -0
- sky/schemas/db/global_user_state/008_skylet_ssh_tunnel_metadata.py +34 -0
- sky/schemas/db/global_user_state/009_last_activity_and_launched_at.py +89 -0
- sky/schemas/db/global_user_state/010_save_ssh_key.py +66 -0
- sky/schemas/db/global_user_state/011_is_ephemeral.py +34 -0
- sky/schemas/db/kv_cache/001_initial_schema.py +29 -0
- sky/schemas/db/script.py.mako +28 -0
- sky/schemas/db/serve_state/001_initial_schema.py +67 -0
- sky/schemas/db/serve_state/002_yaml_content.py +34 -0
- sky/schemas/db/skypilot_config/001_initial_schema.py +30 -0
- sky/schemas/db/spot_jobs/001_initial_schema.py +97 -0
- sky/schemas/db/spot_jobs/002_cluster_pool.py +42 -0
- sky/schemas/db/spot_jobs/003_pool_hash.py +34 -0
- sky/schemas/db/spot_jobs/004_job_file_contents.py +42 -0
- sky/schemas/db/spot_jobs/005_logs_gc.py +38 -0
- sky/schemas/db/spot_jobs/006_controller_pid_started_at.py +34 -0
- sky/schemas/db/spot_jobs/007_config_file_content.py +34 -0
- sky/schemas/generated/__init__.py +0 -0
- sky/schemas/generated/autostopv1_pb2.py +36 -0
- sky/schemas/generated/autostopv1_pb2.pyi +43 -0
- sky/schemas/generated/autostopv1_pb2_grpc.py +146 -0
- sky/schemas/generated/jobsv1_pb2.py +86 -0
- sky/schemas/generated/jobsv1_pb2.pyi +254 -0
- sky/schemas/generated/jobsv1_pb2_grpc.py +542 -0
- sky/schemas/generated/managed_jobsv1_pb2.py +76 -0
- sky/schemas/generated/managed_jobsv1_pb2.pyi +278 -0
- sky/schemas/generated/managed_jobsv1_pb2_grpc.py +278 -0
- sky/schemas/generated/servev1_pb2.py +58 -0
- sky/schemas/generated/servev1_pb2.pyi +115 -0
- sky/schemas/generated/servev1_pb2_grpc.py +322 -0
- sky/serve/autoscalers.py +357 -5
- sky/serve/client/impl.py +310 -0
- sky/serve/client/sdk.py +47 -139
- sky/serve/client/sdk_async.py +130 -0
- sky/serve/constants.py +12 -9
- sky/serve/controller.py +68 -17
- sky/serve/load_balancer.py +106 -60
- sky/serve/load_balancing_policies.py +116 -2
- sky/serve/replica_managers.py +434 -249
- sky/serve/serve_rpc_utils.py +179 -0
- sky/serve/serve_state.py +569 -257
- sky/serve/serve_utils.py +775 -265
- sky/serve/server/core.py +66 -711
- sky/serve/server/impl.py +1093 -0
- sky/serve/server/server.py +21 -18
- sky/serve/service.py +192 -89
- sky/serve/service_spec.py +144 -20
- sky/serve/spot_placer.py +3 -0
- sky/server/auth/__init__.py +0 -0
- sky/server/auth/authn.py +50 -0
- sky/server/auth/loopback.py +38 -0
- sky/server/auth/oauth2_proxy.py +202 -0
- sky/server/common.py +478 -182
- sky/server/config.py +85 -23
- sky/server/constants.py +44 -6
- sky/server/daemons.py +295 -0
- sky/server/html/token_page.html +185 -0
- sky/server/metrics.py +160 -0
- sky/server/middleware_utils.py +166 -0
- sky/server/requests/executor.py +558 -138
- sky/server/requests/payloads.py +364 -24
- sky/server/requests/preconditions.py +21 -17
- sky/server/requests/process.py +112 -29
- sky/server/requests/request_names.py +121 -0
- sky/server/requests/requests.py +822 -226
- sky/server/requests/serializers/decoders.py +82 -31
- sky/server/requests/serializers/encoders.py +140 -22
- sky/server/requests/threads.py +117 -0
- sky/server/rest.py +455 -0
- sky/server/server.py +1309 -285
- sky/server/state.py +20 -0
- sky/server/stream_utils.py +327 -61
- sky/server/uvicorn.py +217 -3
- sky/server/versions.py +270 -0
- sky/setup_files/MANIFEST.in +11 -1
- sky/setup_files/alembic.ini +160 -0
- sky/setup_files/dependencies.py +139 -31
- sky/setup_files/setup.py +44 -42
- sky/sky_logging.py +114 -7
- sky/skylet/attempt_skylet.py +106 -24
- sky/skylet/autostop_lib.py +129 -8
- sky/skylet/configs.py +29 -20
- sky/skylet/constants.py +216 -25
- sky/skylet/events.py +101 -21
- sky/skylet/job_lib.py +345 -164
- sky/skylet/log_lib.py +297 -18
- sky/skylet/log_lib.pyi +44 -1
- sky/skylet/providers/ibm/node_provider.py +12 -8
- sky/skylet/providers/ibm/vpc_provider.py +13 -12
- sky/skylet/ray_patches/__init__.py +17 -3
- sky/skylet/ray_patches/autoscaler.py.diff +18 -0
- sky/skylet/ray_patches/cli.py.diff +19 -0
- sky/skylet/ray_patches/command_runner.py.diff +17 -0
- sky/skylet/ray_patches/log_monitor.py.diff +20 -0
- sky/skylet/ray_patches/resource_demand_scheduler.py.diff +32 -0
- sky/skylet/ray_patches/updater.py.diff +18 -0
- sky/skylet/ray_patches/worker.py.diff +41 -0
- sky/skylet/runtime_utils.py +21 -0
- sky/skylet/services.py +568 -0
- sky/skylet/skylet.py +72 -4
- sky/skylet/subprocess_daemon.py +104 -29
- sky/skypilot_config.py +506 -99
- sky/ssh_node_pools/__init__.py +1 -0
- sky/ssh_node_pools/core.py +135 -0
- sky/ssh_node_pools/server.py +233 -0
- sky/task.py +685 -163
- sky/templates/aws-ray.yml.j2 +11 -3
- sky/templates/azure-ray.yml.j2 +2 -1
- sky/templates/cudo-ray.yml.j2 +1 -0
- sky/templates/do-ray.yml.j2 +2 -1
- sky/templates/fluidstack-ray.yml.j2 +1 -0
- sky/templates/gcp-ray.yml.j2 +62 -1
- sky/templates/hyperbolic-ray.yml.j2 +68 -0
- sky/templates/ibm-ray.yml.j2 +2 -1
- sky/templates/jobs-controller.yaml.j2 +27 -24
- sky/templates/kubernetes-loadbalancer.yml.j2 +2 -0
- sky/templates/kubernetes-ray.yml.j2 +611 -50
- sky/templates/lambda-ray.yml.j2 +2 -1
- sky/templates/nebius-ray.yml.j2 +34 -12
- sky/templates/oci-ray.yml.j2 +1 -0
- sky/templates/paperspace-ray.yml.j2 +2 -1
- sky/templates/primeintellect-ray.yml.j2 +72 -0
- sky/templates/runpod-ray.yml.j2 +10 -1
- sky/templates/scp-ray.yml.j2 +4 -50
- sky/templates/seeweb-ray.yml.j2 +171 -0
- sky/templates/shadeform-ray.yml.j2 +73 -0
- sky/templates/sky-serve-controller.yaml.j2 +22 -2
- sky/templates/vast-ray.yml.j2 +1 -0
- sky/templates/vsphere-ray.yml.j2 +1 -0
- sky/templates/websocket_proxy.py +212 -37
- sky/usage/usage_lib.py +31 -15
- sky/users/__init__.py +0 -0
- sky/users/model.conf +15 -0
- sky/users/permission.py +397 -0
- sky/users/rbac.py +121 -0
- sky/users/server.py +720 -0
- sky/users/token_service.py +218 -0
- sky/utils/accelerator_registry.py +35 -5
- sky/utils/admin_policy_utils.py +84 -38
- sky/utils/annotations.py +38 -5
- sky/utils/asyncio_utils.py +78 -0
- sky/utils/atomic.py +1 -1
- sky/utils/auth_utils.py +153 -0
- sky/utils/benchmark_utils.py +60 -0
- sky/utils/cli_utils/status_utils.py +159 -86
- sky/utils/cluster_utils.py +31 -9
- sky/utils/command_runner.py +354 -68
- sky/utils/command_runner.pyi +93 -3
- sky/utils/common.py +35 -8
- sky/utils/common_utils.py +314 -91
- sky/utils/config_utils.py +74 -5
- sky/utils/context.py +403 -0
- sky/utils/context_utils.py +242 -0
- sky/utils/controller_utils.py +383 -89
- sky/utils/dag_utils.py +31 -12
- sky/utils/db/__init__.py +0 -0
- sky/utils/db/db_utils.py +485 -0
- sky/utils/db/kv_cache.py +149 -0
- sky/utils/db/migration_utils.py +137 -0
- sky/utils/directory_utils.py +12 -0
- sky/utils/env_options.py +13 -0
- sky/utils/git.py +567 -0
- sky/utils/git_clone.sh +460 -0
- sky/utils/infra_utils.py +195 -0
- sky/utils/kubernetes/cleanup-tunnel.sh +62 -0
- sky/utils/kubernetes/config_map_utils.py +133 -0
- sky/utils/kubernetes/create_cluster.sh +15 -29
- sky/utils/kubernetes/delete_cluster.sh +10 -7
- sky/utils/kubernetes/deploy_ssh_node_pools.py +1177 -0
- sky/utils/kubernetes/exec_kubeconfig_converter.py +22 -31
- sky/utils/kubernetes/generate_kind_config.py +6 -66
- sky/utils/kubernetes/generate_kubeconfig.sh +4 -1
- sky/utils/kubernetes/gpu_labeler.py +18 -8
- sky/utils/kubernetes/k8s_gpu_labeler_job.yaml +2 -1
- sky/utils/kubernetes/k8s_gpu_labeler_setup.yaml +16 -16
- sky/utils/kubernetes/kubernetes_deploy_utils.py +284 -114
- sky/utils/kubernetes/rsync_helper.sh +11 -3
- sky/utils/kubernetes/ssh-tunnel.sh +379 -0
- sky/utils/kubernetes/ssh_utils.py +221 -0
- sky/utils/kubernetes_enums.py +8 -15
- sky/utils/lock_events.py +94 -0
- sky/utils/locks.py +416 -0
- sky/utils/log_utils.py +82 -107
- sky/utils/perf_utils.py +22 -0
- sky/utils/resource_checker.py +298 -0
- sky/utils/resources_utils.py +249 -32
- sky/utils/rich_utils.py +217 -39
- sky/utils/schemas.py +955 -160
- sky/utils/serialize_utils.py +16 -0
- sky/utils/status_lib.py +10 -0
- sky/utils/subprocess_utils.py +29 -15
- sky/utils/tempstore.py +70 -0
- sky/utils/thread_utils.py +91 -0
- sky/utils/timeline.py +26 -53
- sky/utils/ux_utils.py +84 -15
- sky/utils/validator.py +11 -1
- sky/utils/volume.py +165 -0
- sky/utils/yaml_utils.py +111 -0
- sky/volumes/__init__.py +13 -0
- sky/volumes/client/__init__.py +0 -0
- sky/volumes/client/sdk.py +150 -0
- sky/volumes/server/__init__.py +0 -0
- sky/volumes/server/core.py +270 -0
- sky/volumes/server/server.py +124 -0
- sky/volumes/volume.py +215 -0
- sky/workspaces/__init__.py +0 -0
- sky/workspaces/core.py +655 -0
- sky/workspaces/server.py +101 -0
- sky/workspaces/utils.py +56 -0
- sky_templates/README.md +3 -0
- sky_templates/__init__.py +3 -0
- sky_templates/ray/__init__.py +0 -0
- sky_templates/ray/start_cluster +183 -0
- sky_templates/ray/stop_cluster +75 -0
- skypilot_nightly-1.0.0.dev20251203.dist-info/METADATA +676 -0
- skypilot_nightly-1.0.0.dev20251203.dist-info/RECORD +611 -0
- {skypilot_nightly-1.0.0.dev20250502.dist-info → skypilot_nightly-1.0.0.dev20251203.dist-info}/WHEEL +1 -1
- skypilot_nightly-1.0.0.dev20251203.dist-info/top_level.txt +2 -0
- sky/benchmark/benchmark_state.py +0 -256
- sky/benchmark/benchmark_utils.py +0 -641
- sky/clouds/service_catalog/constants.py +0 -7
- sky/dashboard/out/_next/static/GWvVBSCS7FmUiVmjaL1a7/_buildManifest.js +0 -1
- sky/dashboard/out/_next/static/chunks/236-2db3ee3fba33dd9e.js +0 -6
- sky/dashboard/out/_next/static/chunks/312-c3c8845990db8ffc.js +0 -15
- sky/dashboard/out/_next/static/chunks/37-0a572fe0dbb89c4d.js +0 -6
- sky/dashboard/out/_next/static/chunks/678-206dddca808e6d16.js +0 -59
- sky/dashboard/out/_next/static/chunks/845-9e60713e0c441abc.js +0 -1
- sky/dashboard/out/_next/static/chunks/979-7bf73a4c7cea0f5c.js +0 -1
- sky/dashboard/out/_next/static/chunks/fd9d1056-2821b0f0cabcd8bd.js +0 -1
- sky/dashboard/out/_next/static/chunks/framework-87d061ee6ed71b28.js +0 -33
- sky/dashboard/out/_next/static/chunks/main-app-241eb28595532291.js +0 -1
- sky/dashboard/out/_next/static/chunks/main-e0e2335212e72357.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/_app-e6b013bc3f77ad60.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/_error-1be831200e60c5c0.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-6ac338bc2239cb45.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-f383db7389368ea7.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/clusters-a93b93e10b8b074e.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/index-f9f039532ca8cbc4.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-1c519e1afc523dc9.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/jobs-a75029b67aab6a2e.js +0 -1
- sky/dashboard/out/_next/static/chunks/webpack-830f59b8404e96b8.js +0 -1
- sky/dashboard/out/_next/static/css/c6933bbb2ce7f4dd.css +0 -3
- sky/jobs/dashboard/dashboard.py +0 -223
- sky/jobs/dashboard/static/favicon.ico +0 -0
- sky/jobs/dashboard/templates/index.html +0 -831
- sky/jobs/server/dashboard_utils.py +0 -69
- sky/skylet/providers/scp/__init__.py +0 -2
- sky/skylet/providers/scp/config.py +0 -149
- sky/skylet/providers/scp/node_provider.py +0 -578
- sky/templates/kubernetes-ssh-jump.yml.j2 +0 -94
- sky/utils/db_utils.py +0 -100
- sky/utils/kubernetes/deploy_remote_cluster.sh +0 -308
- sky/utils/kubernetes/ssh_jump_lifecycle_manager.py +0 -191
- skypilot_nightly-1.0.0.dev20250502.dist-info/METADATA +0 -361
- skypilot_nightly-1.0.0.dev20250502.dist-info/RECORD +0 -396
- skypilot_nightly-1.0.0.dev20250502.dist-info/top_level.txt +0 -1
- /sky/{clouds/service_catalog → catalog}/config.py +0 -0
- /sky/{benchmark → catalog/data_fetchers}/__init__.py +0 -0
- /sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_azure.py +0 -0
- /sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_fluidstack.py +0 -0
- /sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_ibm.py +0 -0
- /sky/{clouds/service_catalog/data_fetchers → client/cli}/__init__.py +0 -0
- /sky/dashboard/out/_next/static/{GWvVBSCS7FmUiVmjaL1a7 → 96_E2yl3QAiIJGOYCkSpB}/_ssgManifest.js +0 -0
- {skypilot_nightly-1.0.0.dev20250502.dist-info → skypilot_nightly-1.0.0.dev20251203.dist-info}/entry_points.txt +0 -0
- {skypilot_nightly-1.0.0.dev20250502.dist-info → skypilot_nightly-1.0.0.dev20251203.dist-info}/licenses/LICENSE +0 -0
|
@@ -2,17 +2,17 @@
|
|
|
2
2
|
import base64
|
|
3
3
|
import pickle
|
|
4
4
|
import typing
|
|
5
|
-
from typing import Any, Dict, List, Optional, Tuple
|
|
5
|
+
from typing import Any, Dict, List, Optional, Tuple, Union
|
|
6
6
|
|
|
7
7
|
from sky import jobs as managed_jobs
|
|
8
8
|
from sky import models
|
|
9
|
-
from sky.
|
|
9
|
+
from sky.catalog import common
|
|
10
10
|
from sky.data import storage
|
|
11
11
|
from sky.provision.kubernetes import utils as kubernetes_utils
|
|
12
|
+
from sky.schemas.api import responses
|
|
12
13
|
from sky.serve import serve_state
|
|
13
14
|
from sky.server import constants as server_constants
|
|
14
15
|
from sky.skylet import job_lib
|
|
15
|
-
from sky.utils import registry
|
|
16
16
|
from sky.utils import status_lib
|
|
17
17
|
|
|
18
18
|
if typing.TYPE_CHECKING:
|
|
@@ -51,13 +51,19 @@ def default_decode_handler(return_value: Any) -> Any:
|
|
|
51
51
|
|
|
52
52
|
|
|
53
53
|
@register_decoders('status')
|
|
54
|
-
def decode_status(
|
|
54
|
+
def decode_status(
|
|
55
|
+
return_value: List[Dict[str, Any]]) -> List[responses.StatusResponse]:
|
|
55
56
|
clusters = return_value
|
|
57
|
+
response = []
|
|
56
58
|
for cluster in clusters:
|
|
57
|
-
|
|
59
|
+
# handle may not always be present in the response.
|
|
60
|
+
if 'handle' in cluster and cluster['handle'] is not None:
|
|
61
|
+
cluster['handle'] = decode_and_unpickle(cluster['handle'])
|
|
58
62
|
cluster['status'] = status_lib.ClusterStatus(cluster['status'])
|
|
59
|
-
|
|
60
|
-
|
|
63
|
+
if 'is_managed' not in cluster:
|
|
64
|
+
cluster['is_managed'] = False
|
|
65
|
+
response.append(responses.StatusResponse.model_validate(cluster))
|
|
66
|
+
return response
|
|
61
67
|
|
|
62
68
|
|
|
63
69
|
@register_decoders('status_kubernetes')
|
|
@@ -66,7 +72,7 @@ def decode_status_kubernetes(
|
|
|
66
72
|
List[Dict[str, Any]], Optional[str]]
|
|
67
73
|
) -> Tuple[List[kubernetes_utils.KubernetesSkyPilotClusterInfoPayload],
|
|
68
74
|
List[kubernetes_utils.KubernetesSkyPilotClusterInfoPayload],
|
|
69
|
-
List[
|
|
75
|
+
List[responses.ManagedJobRecord], Optional[str]]:
|
|
70
76
|
(encoded_all_clusters, encoded_unmanaged_clusters, all_jobs,
|
|
71
77
|
context) = return_value
|
|
72
78
|
all_clusters = []
|
|
@@ -79,6 +85,7 @@ def decode_status_kubernetes(
|
|
|
79
85
|
cluster['status'] = status_lib.ClusterStatus(cluster['status'])
|
|
80
86
|
unmanaged_clusters.append(
|
|
81
87
|
kubernetes_utils.KubernetesSkyPilotClusterInfoPayload(**cluster))
|
|
88
|
+
all_jobs = [responses.ManagedJobRecord(**job) for job in all_jobs]
|
|
82
89
|
return all_clusters, unmanaged_clusters, all_jobs, context
|
|
83
90
|
|
|
84
91
|
|
|
@@ -95,24 +102,53 @@ def decode_start(return_value: str) -> 'backends.CloudVmRayResourceHandle':
|
|
|
95
102
|
|
|
96
103
|
|
|
97
104
|
@register_decoders('queue')
|
|
98
|
-
def decode_queue(return_value: List[dict],) -> List[
|
|
105
|
+
def decode_queue(return_value: List[dict],) -> List[responses.ClusterJobRecord]:
|
|
99
106
|
jobs = return_value
|
|
100
107
|
for job in jobs:
|
|
101
108
|
job['status'] = job_lib.JobStatus(job['status'])
|
|
102
|
-
return jobs
|
|
109
|
+
return [responses.ClusterJobRecord.model_validate(job) for job in jobs]
|
|
103
110
|
|
|
104
111
|
|
|
105
112
|
@register_decoders('jobs.queue')
|
|
106
113
|
def decode_jobs_queue(return_value: List[dict],) -> List[Dict[str, Any]]:
|
|
107
|
-
|
|
108
|
-
|
|
109
|
-
|
|
110
|
-
|
|
111
|
-
|
|
112
|
-
|
|
113
|
-
|
|
114
|
-
|
|
115
|
-
|
|
114
|
+
# To keep backward compatibility with v0.10.2
|
|
115
|
+
return decode_jobs_queue_v2(return_value)
|
|
116
|
+
|
|
117
|
+
|
|
118
|
+
@register_decoders('jobs.queue_v2')
|
|
119
|
+
def decode_jobs_queue_v2(
|
|
120
|
+
return_value
|
|
121
|
+
) -> Union[Tuple[List[responses.ManagedJobRecord], int, Dict[str, int], int],
|
|
122
|
+
List[responses.ManagedJobRecord]]:
|
|
123
|
+
"""Decode jobs queue response.
|
|
124
|
+
|
|
125
|
+
Supports legacy list, or a dict {jobs, total, total_no_filter,
|
|
126
|
+
status_counts}.
|
|
127
|
+
|
|
128
|
+
- Returns either list[job] or tuple(list[job], total, status_counts,
|
|
129
|
+
total_no_filter)
|
|
130
|
+
"""
|
|
131
|
+
# Case 1: dict shape {jobs, total, total_no_filter, status_counts}
|
|
132
|
+
if isinstance(return_value, dict):
|
|
133
|
+
jobs = return_value.get('jobs', [])
|
|
134
|
+
total = return_value.get('total', len(jobs))
|
|
135
|
+
total_no_filter = return_value.get('total_no_filter', total)
|
|
136
|
+
status_counts = return_value.get('status_counts', {})
|
|
137
|
+
for job in jobs:
|
|
138
|
+
job['status'] = managed_jobs.ManagedJobStatus(job['status'])
|
|
139
|
+
jobs = [responses.ManagedJobRecord(**job) for job in jobs]
|
|
140
|
+
return jobs, total, status_counts, total_no_filter
|
|
141
|
+
else:
|
|
142
|
+
# Case 2: legacy list
|
|
143
|
+
jobs = return_value
|
|
144
|
+
for job in jobs:
|
|
145
|
+
job['status'] = managed_jobs.ManagedJobStatus(job['status'])
|
|
146
|
+
jobs = [responses.ManagedJobRecord(**job) for job in jobs]
|
|
147
|
+
return jobs
|
|
148
|
+
|
|
149
|
+
|
|
150
|
+
def _decode_serve_status(
|
|
151
|
+
service_statuses: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
|
|
116
152
|
for service_status in service_statuses:
|
|
117
153
|
service_status['status'] = serve_state.ServiceStatus(
|
|
118
154
|
service_status['status'])
|
|
@@ -123,6 +159,16 @@ def decode_serve_status(return_value: List[dict]) -> List[Dict[str, Any]]:
|
|
|
123
159
|
return service_statuses
|
|
124
160
|
|
|
125
161
|
|
|
162
|
+
@register_decoders('serve.status')
def decode_serve_status(return_value: List[dict]) -> List[Dict[str, Any]]:
    """Decode a ``serve.status`` payload via ``_decode_serve_status``."""
    decoded_statuses = _decode_serve_status(return_value)
    return decoded_statuses
|
|
165
|
+
|
|
166
|
+
|
|
167
|
+
@register_decoders('jobs.pool_status')
def decode_jobs_pool_status(return_value: List[dict]) -> List[Dict[str, Any]]:
    """Decode a ``jobs.pool_status`` payload.

    The payload is handed to the same ``_decode_serve_status`` helper that
    ``serve.status`` uses.
    """
    decoded_statuses = _decode_serve_status(return_value)
    return decoded_statuses
|
|
170
|
+
|
|
171
|
+
|
|
126
172
|
@register_decoders('cost_report')
|
|
127
173
|
def decode_cost_report(
|
|
128
174
|
return_value: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
|
|
@@ -135,16 +181,6 @@ def decode_cost_report(
|
|
|
135
181
|
return return_value
|
|
136
182
|
|
|
137
183
|
|
|
138
|
-
@register_decoders('enabled_clouds')
|
|
139
|
-
def decode_enabled_clouds(return_value: List[str]) -> List['clouds.Cloud']:
|
|
140
|
-
clouds = []
|
|
141
|
-
for cloud_name in return_value:
|
|
142
|
-
cloud = registry.CLOUD_REGISTRY.from_str(cloud_name)
|
|
143
|
-
assert cloud is not None, return_value
|
|
144
|
-
clouds.append(cloud)
|
|
145
|
-
return clouds
|
|
146
|
-
|
|
147
|
-
|
|
148
184
|
@register_decoders('list_accelerators')
|
|
149
185
|
def decode_list_accelerators(
|
|
150
186
|
return_value: Dict[str, List[List[Any]]]
|
|
@@ -160,14 +196,24 @@ def decode_list_accelerators(
|
|
|
160
196
|
|
|
161
197
|
@register_decoders('storage_ls')
def decode_storage_ls(
        return_value: List[Dict[str, Any]]) -> List[responses.StorageRecord]:
    """Decode a ``storage_ls`` payload into ``responses.StorageRecord`` items.

    Each dict in ``return_value`` is updated in place: ``status`` becomes a
    ``status_lib.StorageStatus`` and every element of ``store`` becomes a
    ``storage.StoreType``. The records are built from the updated dicts.
    """
    # First pass: restore the enum-typed fields on every entry.
    for entry in return_value:
        entry['status'] = status_lib.StorageStatus(entry['status'])
        entry['store'] = list(map(storage.StoreType, entry['store']))

    # Second pass: wrap the restored dicts into response records.
    records = []
    for entry in return_value:
        records.append(responses.StorageRecord(**entry))
    return records
|
|
209
|
+
|
|
210
|
+
|
|
211
|
+
@register_decoders('volume_list')
def decode_volume_list(
        return_value: List[Dict[str, Any]]) -> List[responses.VolumeRecord]:
    """Build one ``responses.VolumeRecord`` per dict in ``return_value``.

    Each dict is expanded into keyword arguments of the record constructor.
    """
    records = []
    for fields in return_value:
        records.append(responses.VolumeRecord(**fields))
    return records
|
|
171
217
|
|
|
172
218
|
|
|
173
219
|
@register_decoders('job_status')
|
|
@@ -190,3 +236,8 @@ def decode_job_status(
|
|
|
190
236
|
def decode_kubernetes_node_info(
|
|
191
237
|
return_value: Dict[str, Any]) -> models.KubernetesNodesInfo:
|
|
192
238
|
return models.KubernetesNodesInfo.from_dict(return_value)
|
|
239
|
+
|
|
240
|
+
|
|
241
|
+
@register_decoders('endpoints')
def decode_endpoints(return_value: Dict[int, str]) -> Dict[int, str]:
    """Return a copy of ``return_value`` whose keys are passed through ``int``.

    Keys that are already integers are kept; keys given as numeric strings
    are converted to integers.
    """
    decoded: Dict[int, str] = {}
    for port, endpoint in return_value.items():
        decoded[int(port)] = endpoint
    return decoded
|
|
@@ -6,14 +6,17 @@ import base64
|
|
|
6
6
|
import dataclasses
|
|
7
7
|
import pickle
|
|
8
8
|
import typing
|
|
9
|
-
from typing import Any, Dict, List, Optional, Tuple
|
|
9
|
+
from typing import Any, Dict, List, Optional, Tuple, Union
|
|
10
10
|
|
|
11
|
+
from sky import models
|
|
12
|
+
from sky.catalog import common
|
|
13
|
+
from sky.schemas.api import responses
|
|
11
14
|
from sky.server import constants as server_constants
|
|
15
|
+
from sky.utils import serialize_utils
|
|
12
16
|
|
|
13
17
|
if typing.TYPE_CHECKING:
|
|
14
18
|
from sky import backends
|
|
15
19
|
from sky import clouds
|
|
16
|
-
from sky import models
|
|
17
20
|
from sky.provision.kubernetes import utils as kubernetes_utils
|
|
18
21
|
|
|
19
22
|
handlers: Dict[str, Any] = {}
|
|
@@ -21,6 +24,9 @@ handlers: Dict[str, Any] = {}
|
|
|
21
24
|
|
|
22
25
|
def pickle_and_encode(obj: Any) -> str:
|
|
23
26
|
try:
|
|
27
|
+
# Apply backwards compatibility processing at the lowest level
|
|
28
|
+
# to catch any handles that might have bypassed the encoders
|
|
29
|
+
obj = serialize_utils.prepare_handle_for_backwards_compatibility(obj)
|
|
24
30
|
return base64.b64encode(pickle.dumps(obj)).decode('utf-8')
|
|
25
31
|
except TypeError as e:
|
|
26
32
|
raise ValueError(f'Failed to pickle object: {obj}') from e
|
|
@@ -51,13 +57,29 @@ def default_encoder(return_value: Any) -> Any:
|
|
|
51
57
|
|
|
52
58
|
|
|
53
59
|
@register_encoder('status')
def encode_status(
        clusters: List[responses.StatusResponse]) -> List[Dict[str, Any]]:
    """Encode the result of a ``status`` request.

    Each cluster is dumped to a dict (``None`` fields excluded), then:

    - ``last_use`` and ``status_updated_at`` get defaults when absent,
    - ``status`` is replaced by the ``.value`` of the cluster's status,
    - ``handle`` is prepared for backwards compatibility and pickled,
    - ``storage_mounts_metadata`` is set to a pickled ``None``.

    Returns:
        One encoded dict per cluster, in input order.
    """
    encoded_clusters = []
    for cluster in clusters:
        payload = cluster.model_dump(exclude_none=True)
        # These defaults are needed because last_use and status_updated_at
        # used to be non-optional.
        # TODO(syang): remove this after v0.12.0
        payload.setdefault('last_use', '')
        payload.setdefault('status_updated_at', 0)
        payload['status'] = cluster['status'].value
        payload['handle'] = pickle_and_encode(
            serialize_utils.prepare_handle_for_backwards_compatibility(
                cluster['handle']))
        # TODO (syang) We still need to return this field for backwards
        # compatibility.
        # Remove this field at or after v0.12.0
        payload['storage_mounts_metadata'] = pickle_and_encode(
            None)  # Always returns None.
        encoded_clusters.append(payload)
    return encoded_clusters
|
|
61
83
|
|
|
62
84
|
|
|
63
85
|
@register_encoder('launch', 'exec', 'jobs.launch')
|
|
@@ -65,6 +87,7 @@ def encode_launch(
|
|
|
65
87
|
job_id_handle: Tuple[Optional[int], Optional['backends.ResourceHandle']]
|
|
66
88
|
) -> Dict[str, Any]:
|
|
67
89
|
job_id, handle = job_id_handle
|
|
90
|
+
handle = serialize_utils.prepare_handle_for_backwards_compatibility(handle)
|
|
68
91
|
return {
|
|
69
92
|
'job_id': job_id,
|
|
70
93
|
'handle': pickle_and_encode(handle),
|
|
@@ -73,14 +96,21 @@ def encode_launch(
|
|
|
73
96
|
|
|
74
97
|
@register_encoder('start')
def encode_start(resource_handle: 'backends.CloudVmRayResourceHandle') -> str:
    """Encode the resource handle returned by a ``start`` request.

    The handle is first passed through
    ``serialize_utils.prepare_handle_for_backwards_compatibility`` and the
    result is then pickled and encoded with ``pickle_and_encode``.
    """
    compatible_handle = (
        serialize_utils.prepare_handle_for_backwards_compatibility(
            resource_handle))
    return pickle_and_encode(compatible_handle)
|
|
77
103
|
|
|
78
104
|
|
|
79
105
|
@register_encoder('queue')
def encode_queue(
    jobs: List[responses.ClusterJobRecord],) -> List[Dict[str, Any]]:
    """Encode the result of a ``queue`` request.

    Each job record is dumped to a dict and its ``status`` entry is replaced
    by the ``.value`` of the record's status.
    """
    return [{
        **job.model_dump(),
        'status': job['status'].value,
    } for job in jobs]
|
|
84
114
|
|
|
85
115
|
|
|
86
116
|
@register_encoder('status_kubernetes')
|
|
@@ -88,7 +118,7 @@ def encode_status_kubernetes(
|
|
|
88
118
|
return_value: Tuple[
|
|
89
119
|
List['kubernetes_utils.KubernetesSkyPilotClusterInfoPayload'],
|
|
90
120
|
List['kubernetes_utils.KubernetesSkyPilotClusterInfoPayload'],
|
|
91
|
-
List[
|
|
121
|
+
List[responses.ManagedJobRecord], Optional[str]]
|
|
92
122
|
) -> Tuple[List[Dict[str, Any]], List[Dict[str, Any]], List[Dict[str, Any]],
|
|
93
123
|
Optional[str]]:
|
|
94
124
|
all_clusters, unmanaged_clusters, all_jobs, context = return_value
|
|
@@ -102,6 +132,7 @@ def encode_status_kubernetes(
|
|
|
102
132
|
encoded_cluster = dataclasses.asdict(cluster)
|
|
103
133
|
encoded_cluster['status'] = encoded_cluster['status'].value
|
|
104
134
|
encoded_unmanaged_clusters.append(encoded_cluster)
|
|
135
|
+
all_jobs = [job.model_dump(by_alias=True) for job in all_jobs]
|
|
105
136
|
return encoded_all_clusters, encoded_unmanaged_clusters, all_jobs, context
|
|
106
137
|
|
|
107
138
|
|
|
@@ -112,25 +143,68 @@ def encode_jobs_queue(jobs: List[dict],) -> List[Dict[str, Any]]:
|
|
|
112
143
|
return jobs
|
|
113
144
|
|
|
114
145
|
|
|
115
|
-
@register_encoder('
|
|
116
|
-
def
|
|
146
|
+
@register_encoder('jobs.queue_v2')
def encode_jobs_queue_v2(
        jobs_or_tuple) -> Union[List[Dict[str, Any]], Dict[str, Any]]:
    """Encode the result of a ``jobs.queue_v2`` request.

    Accepted inputs:

    - a plain list of job records,
    - a ``(jobs, total)`` tuple, or
    - a ``(jobs, total, status_counts, total_no_filter)`` tuple.

    Each job is dumped with ``by_alias=True`` and its ``status`` is replaced
    by the status' ``.value``.

    Returns:
        The list of encoded jobs when no total is available (plain list
        input, or a tuple whose total is ``None``); otherwise a dict with the
        keys ``jobs``, ``total``, ``total_no_filter`` and ``status_counts``.

    Raises:
        ValueError: if a tuple of any other length is passed.
    """
    total = None
    total_no_filter = None
    status_counts: Dict[str, int] = {}

    if not isinstance(jobs_or_tuple, tuple):
        jobs = jobs_or_tuple
    elif len(jobs_or_tuple) == 2:
        jobs, total = jobs_or_tuple
        # Without a separate figure, the unfiltered total equals the total.
        total_no_filter = total
    elif len(jobs_or_tuple) == 4:
        jobs, total, status_counts, total_no_filter = jobs_or_tuple
    else:
        raise ValueError(f'Invalid jobs tuple: {jobs_or_tuple}')

    encoded_jobs = [record.model_dump(by_alias=True) for record in jobs]
    for encoded in encoded_jobs:
        encoded['status'] = encoded['status'].value

    if total is None:
        return encoded_jobs
    return {
        'jobs': encoded_jobs,
        'total': total,
        'total_no_filter': total_no_filter,
        'status_counts': status_counts
    }
|
|
173
|
+
|
|
174
|
+
|
|
175
|
+
def _encode_serve_status(
        service_statuses: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
    """Encode service status dicts in place and return the same list.

    For every service, ``status`` is replaced by its ``.value``. For every
    entry under ``replica_info`` (if present), ``status`` is replaced by its
    ``.value`` and ``handle`` is prepared for backwards compatibility and
    then pickled and encoded.
    """
    for service in service_statuses:
        service['status'] = service['status'].value
        replicas = service.get('replica_info', [])
        for replica in replicas:
            replica['status'] = replica['status'].value
            replica['handle'] = pickle_and_encode(
                serialize_utils.prepare_handle_for_backwards_compatibility(
                    replica['handle']))
    return service_statuses
|
|
124
185
|
|
|
125
186
|
|
|
187
|
+
@register_encoder('serve.status')
def encode_serve_status(
        service_statuses: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
    """Encode a ``serve.status`` result via ``_encode_serve_status``."""
    encoded_statuses = _encode_serve_status(service_statuses)
    return encoded_statuses
|
|
191
|
+
|
|
192
|
+
|
|
193
|
+
@register_encoder('jobs.pool_status')
def encode_jobs_pool_status(
        pool_statuses: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
    """Encode a ``jobs.pool_status`` result.

    The statuses are handed to the same ``_encode_serve_status`` helper that
    ``serve.status`` uses.
    """
    encoded_statuses = _encode_serve_status(pool_statuses)
    return encoded_statuses
|
|
197
|
+
|
|
198
|
+
|
|
126
199
|
@register_encoder('cost_report')
def encode_cost_report(
        cost_report: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
    """Encode cost report entries in place and return the same list.

    A non-``None`` ``status`` is replaced by its ``.value``. When an entry
    has a ``resources`` key, its value is pickled and encoded.
    """
    for entry in cost_report:
        status = entry['status']
        if status is not None:
            entry['status'] = status.value
        # 'resources' is optional; only encode it when it is present.
        if 'resources' in entry:
            entry['resources'] = pickle_and_encode(entry['resources'])
    return cost_report
|
|
135
209
|
|
|
136
210
|
|
|
@@ -142,22 +216,66 @@ def encode_enabled_clouds(clouds: List['clouds.Cloud']) -> List[str]:
|
|
|
142
216
|
|
|
143
217
|
@register_encoder('storage_ls')
def encode_storage_ls(
        return_value: List[responses.StorageRecord]) -> List[Dict[str, Any]]:
    """Encode the result of a ``storage_ls`` request.

    Every record is dumped to a dict; in each dict ``status`` is replaced by
    its ``.value`` and ``store`` by the list of its elements' ``.value``.
    """
    # Dump all records first, then rewrite the enum-typed fields.
    encoded = []
    for record in return_value:
        encoded.append(record.model_dump())
    for entry in encoded:
        entry['status'] = entry['status'].value
        entry['store'] = [store_type.value for store_type in entry['store']]
    return encoded
|
|
225
|
+
|
|
226
|
+
|
|
227
|
+
@register_encoder('volume_list')
def encode_volume_list(
        return_value: List[responses.VolumeRecord]) -> List[Dict[str, Any]]:
    """Dump every volume record in ``return_value`` to a dict."""
    encoded = []
    for record in return_value:
        encoded.append(record.model_dump())
    return encoded
|
|
150
231
|
|
|
151
232
|
|
|
152
233
|
@register_encoder('job_status')
def encode_job_status(
        return_value: Dict[int, Any]) -> Dict[str, Optional[str]]:
    """Encode the result of a ``job_status`` request.

    Args:
        return_value: Mapping from job id to a status object exposing
            ``.value``, or to ``None``.

    Returns:
        A new dict keyed by ``str(job_id)``. Each value is the status'
        ``.value``; ``None`` entries are kept as ``None``.
    """
    # Build the encoded mapping in one pass and leave the caller's dict
    # untouched instead of rewriting it in place before copying it.
    return {
        str(job_id): (status.value if status is not None else None)
        for job_id, status in return_value.items()
    }
|
|
158
239
|
|
|
159
240
|
|
|
160
241
|
@register_encoder('kubernetes_node_info')
def encode_kubernetes_node_info(
        return_value: 'models.KubernetesNodesInfo') -> Dict[str, Any]:
    """Encode Kubernetes node info by calling its ``to_dict`` method."""
    encoded: Dict[str, Any] = return_value.to_dict()
    return encoded
|
|
245
|
+
|
|
246
|
+
|
|
247
|
+
@register_encoder('endpoints')
def encode_endpoints(return_value: Dict[int, str]) -> Dict[str, str]:
    """Return a copy of ``return_value`` whose keys are converted to ``str``."""
    encoded: Dict[str, str] = {}
    for port, endpoint in return_value.items():
        encoded[str(port)] = endpoint
    return encoded
|
|
250
|
+
|
|
251
|
+
|
|
252
|
+
@register_encoder('realtime_kubernetes_gpu_availability')
def encode_realtime_gpu_availability(
    return_value: List[Tuple[str,
                             List[Any]]]) -> List[Tuple[str, List[List[Any]]]]:
    """Encode real-time Kubernetes GPU availability for JSON serialization.

    ``return_value`` is a list of ``(context, gpu_list)`` pairs. Every item
    of ``gpu_list`` must be a ``models.RealtimeGpuAvailability`` (checked
    with an assertion) and is converted to a plain list.

    Returns:
        The ``(context, converted_gpu_list)`` pairs, in input order.
    """

    def _as_list(gpu: Any) -> List[Any]:
        # RealtimeGpuAvailability namedtuples are flattened to lists for
        # JSON serialization.
        assert isinstance(gpu, models.RealtimeGpuAvailability), (
            f'Expected RealtimeGpuAvailability, got {type(gpu)}')
        return list(gpu)

    result = []
    for context, gpu_list in return_value:
        result.append((context, [_as_list(gpu) for gpu in gpu_list]))
    return result
|
|
267
|
+
|
|
268
|
+
|
|
269
|
+
@register_encoder('list_accelerators')
def encode_list_accelerators(
        return_value: Dict[str, List[Any]]) -> Dict[str, Any]:
    """Encode the result of a ``list_accelerators`` request.

    ``return_value`` maps an accelerator name to a list of instances. Every
    instance must be a ``common.InstanceTypeInfo`` (checked with an
    assertion) and is converted to a plain list.

    Returns:
        A new dict with the same keys and the converted instance lists.
    """

    def _as_list(instance: Any) -> List[Any]:
        # InstanceTypeInfo namedtuples are flattened to lists for JSON
        # serialization.
        assert isinstance(instance, common.InstanceTypeInfo), (
            f'Expected InstanceTypeInfo, got {type(instance)}')
        return list(instance)

    result: Dict[str, Any] = {}
    for name, instance_infos in return_value.items():
        result[name] = [_as_list(info) for info in instance_infos]
    return result
|
|
@@ -0,0 +1,117 @@
|
|
|
1
|
+
"""Request execution threads management."""
|
|
2
|
+
|
|
3
|
+
import concurrent.futures
|
|
4
|
+
import sys
|
|
5
|
+
import threading
|
|
6
|
+
from typing import Callable, Set, TypeVar
|
|
7
|
+
|
|
8
|
+
from sky import exceptions
|
|
9
|
+
from sky import sky_logging
|
|
10
|
+
from sky.utils import atomic
|
|
11
|
+
|
|
12
|
+
# pylint: disable=ungrouped-imports
|
|
13
|
+
if sys.version_info >= (3, 10):
|
|
14
|
+
from typing import ParamSpec
|
|
15
|
+
else:
|
|
16
|
+
from typing_extensions import ParamSpec
|
|
17
|
+
|
|
18
|
+
_P = ParamSpec('_P')
|
|
19
|
+
_T = TypeVar('_T')
|
|
20
|
+
|
|
21
|
+
logger = sky_logging.init_logger(__name__)
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
class OnDemandThreadExecutor(concurrent.futures.Executor):
    """An executor that creates a new thread for each task and destroys it
    after the task is completed.

    Note(dev):
    We raise an error instead of queuing the request if the limit is reached, so
    that:
    1. the request might be handled by other processes that have idle workers
       upon retry;
    2. if not, then users can be clearly hinted that they need to scale the API
       server to support higher concurrency.
    So this executor is only suitable for carefully selected cases where the
    error can be properly handled by caller. To make this executor general, we
    need to support configuring the queuing behavior (exception or queueing).
    """

    def __init__(self, name: str, max_workers: int):
        """Create an executor.

        Args:
            name: Used in thread names, log messages and error messages.
            max_workers: Upper bound on concurrently borrowed workers.
        """
        self.name: str = name
        self.max_workers: int = max_workers
        # Number of workers currently borrowed (running tasks included).
        self.running: atomic.AtomicInt = atomic.AtomicInt(0)
        self._shutdown: bool = False
        self._shutdown_lock: threading.Lock = threading.Lock()
        # Threads created by submit() that have not finished yet.
        self._threads: Set[threading.Thread] = set()
        self._threads_lock: threading.Lock = threading.Lock()

    def _cleanup_thread(self, thread: threading.Thread):
        """Stop tracking `thread`; a no-op if it is not tracked."""
        with self._threads_lock:
            self._threads.discard(thread)

    def _task_wrapper(self, fn: Callable, fut: concurrent.futures.Future, /,
                      *args, **kwargs):
        """Run `fn` in the current thread and publish the outcome on `fut`.

        The borrowed worker is always returned and the current thread is
        always untracked when this method exits.
        """
        try:
            # Executor implementations must mark the future as running
            # before doing the work. If the future was cancelled in the
            # meantime, skip the task: setting a result or an exception on
            # a cancelled future raises InvalidStateError.
            if not fut.set_running_or_notify_cancel():
                return
            try:
                result = fn(*args, **kwargs)
            except Exception as e:  # pylint: disable=broad-except
                logger.debug(f'Executor [{self.name}] error executing {fn}: {e}')
                fut.set_exception(e)
            else:
                fut.set_result(result)
        finally:
            self.running.decrement()
            self._cleanup_thread(threading.current_thread())

    def check_available(self, borrow: bool = False) -> int:
        """Check if there are available workers.

        Args:
            borrow: If True, the caller borrows a worker from the executor.
                The caller is responsible for returning the worker to the
                executor after the task is completed.

        Returns:
            The value of the running counter right after it was incremented
            for this check.

        Raises:
            exceptions.ConcurrentWorkerExhaustedError: if that value exceeds
                `max_workers`. The increment is undone before raising.
        """
        count = self.running.increment()
        if count > self.max_workers:
            self.running.decrement()
            raise exceptions.ConcurrentWorkerExhaustedError(
                f'Maximum concurrent workers {self.max_workers} of threads '
                f'executor [{self.name}] reached')
        if not borrow:
            # Pure probe: give the slot back immediately.
            self.running.decrement()
        return count

    def submit(self, fn: 'Callable[_P, _T]', *args: '_P.args',
               **kwargs: '_P.kwargs') -> 'concurrent.futures.Future[_T]':
        """Start `fn(*args, **kwargs)` in a new daemon thread.

        Returns:
            A future that receives the result or exception of `fn`.

        Raises:
            RuntimeError: if the executor has been shut down.
            exceptions.ConcurrentWorkerExhaustedError: if no worker is
                available (raised by `check_available`).
        """
        with self._shutdown_lock:
            if self._shutdown:
                raise RuntimeError(
                    'Cannot submit task after executor is shutdown')
        count = self.check_available(borrow=True)
        fut: concurrent.futures.Future = concurrent.futures.Future()
        # Name is assigned for debugging purpose, duplication is fine
        thread = threading.Thread(target=self._task_wrapper,
                                  name=f'{self.name}-{count}',
                                  args=(fn, fut, *args),
                                  kwargs=kwargs,
                                  daemon=True)
        with self._threads_lock:
            self._threads.add(thread)
        try:
            thread.start()
        except Exception as e:
            # The wrapper will never run, so return the borrowed worker and
            # untrack the thread here before propagating the error.
            self.running.decrement()
            self._cleanup_thread(thread)
            fut.set_exception(e)
            raise
        assert thread.ident is not None, 'Thread should be started'
        return fut

    def shutdown(self, wait=True, *, cancel_futures: bool = False):
        """Reject further submissions and optionally wait for running tasks.

        Args:
            wait: If True, join every thread that is still tracked.
            cancel_futures: Accepted for compatibility with
                `concurrent.futures.Executor.shutdown`. It has no effect:
                submit() starts each task's thread right away, so this
                executor holds no queued futures to cancel.
        """
        del cancel_futures  # Nothing is ever queued, see docstring.
        with self._shutdown_lock:
            self._shutdown = True
        if not wait:
            return
        # Join a snapshot: finishing threads remove themselves from the set.
        with self._threads_lock:
            threads = list(self._threads)
        for t in threads:
            t.join()
|