skypilot-nightly 1.0.0.dev20250502__py3-none-any.whl → 1.0.0.dev20251203__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sky/__init__.py +22 -6
- sky/adaptors/aws.py +81 -16
- sky/adaptors/common.py +25 -2
- sky/adaptors/coreweave.py +278 -0
- sky/adaptors/do.py +8 -2
- sky/adaptors/gcp.py +11 -0
- sky/adaptors/hyperbolic.py +8 -0
- sky/adaptors/ibm.py +5 -2
- sky/adaptors/kubernetes.py +149 -18
- sky/adaptors/nebius.py +173 -30
- sky/adaptors/primeintellect.py +1 -0
- sky/adaptors/runpod.py +68 -0
- sky/adaptors/seeweb.py +183 -0
- sky/adaptors/shadeform.py +89 -0
- sky/admin_policy.py +187 -4
- sky/authentication.py +179 -225
- sky/backends/__init__.py +4 -2
- sky/backends/backend.py +22 -9
- sky/backends/backend_utils.py +1323 -397
- sky/backends/cloud_vm_ray_backend.py +1749 -1029
- sky/backends/docker_utils.py +1 -1
- sky/backends/local_docker_backend.py +11 -6
- sky/backends/task_codegen.py +633 -0
- sky/backends/wheel_utils.py +55 -9
- sky/{clouds/service_catalog → catalog}/__init__.py +21 -19
- sky/{clouds/service_catalog → catalog}/aws_catalog.py +27 -8
- sky/{clouds/service_catalog → catalog}/azure_catalog.py +10 -7
- sky/{clouds/service_catalog → catalog}/common.py +90 -49
- sky/{clouds/service_catalog → catalog}/cudo_catalog.py +8 -5
- sky/{clouds/service_catalog → catalog}/data_fetchers/analyze.py +1 -1
- sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_aws.py +116 -80
- sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_cudo.py +38 -38
- sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_gcp.py +70 -16
- sky/catalog/data_fetchers/fetch_hyperbolic.py +136 -0
- sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_lambda_cloud.py +1 -0
- sky/catalog/data_fetchers/fetch_nebius.py +338 -0
- sky/catalog/data_fetchers/fetch_runpod.py +698 -0
- sky/catalog/data_fetchers/fetch_seeweb.py +329 -0
- sky/catalog/data_fetchers/fetch_shadeform.py +142 -0
- sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_vast.py +1 -1
- sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_vsphere.py +1 -1
- sky/{clouds/service_catalog → catalog}/do_catalog.py +5 -2
- sky/{clouds/service_catalog → catalog}/fluidstack_catalog.py +6 -3
- sky/{clouds/service_catalog → catalog}/gcp_catalog.py +41 -15
- sky/catalog/hyperbolic_catalog.py +136 -0
- sky/{clouds/service_catalog → catalog}/ibm_catalog.py +9 -6
- sky/{clouds/service_catalog → catalog}/kubernetes_catalog.py +36 -24
- sky/{clouds/service_catalog → catalog}/lambda_catalog.py +9 -6
- sky/{clouds/service_catalog → catalog}/nebius_catalog.py +9 -7
- sky/{clouds/service_catalog → catalog}/oci_catalog.py +9 -6
- sky/{clouds/service_catalog → catalog}/paperspace_catalog.py +5 -2
- sky/catalog/primeintellect_catalog.py +95 -0
- sky/{clouds/service_catalog → catalog}/runpod_catalog.py +11 -4
- sky/{clouds/service_catalog → catalog}/scp_catalog.py +9 -6
- sky/catalog/seeweb_catalog.py +184 -0
- sky/catalog/shadeform_catalog.py +165 -0
- sky/catalog/ssh_catalog.py +167 -0
- sky/{clouds/service_catalog → catalog}/vast_catalog.py +6 -3
- sky/{clouds/service_catalog → catalog}/vsphere_catalog.py +5 -2
- sky/check.py +533 -185
- sky/cli.py +5 -5975
- sky/client/{cli.py → cli/command.py} +2591 -1956
- sky/client/cli/deprecation_utils.py +99 -0
- sky/client/cli/flags.py +359 -0
- sky/client/cli/table_utils.py +322 -0
- sky/client/cli/utils.py +79 -0
- sky/client/common.py +78 -32
- sky/client/oauth.py +82 -0
- sky/client/sdk.py +1219 -319
- sky/client/sdk_async.py +827 -0
- sky/client/service_account_auth.py +47 -0
- sky/cloud_stores.py +82 -3
- sky/clouds/__init__.py +13 -0
- sky/clouds/aws.py +564 -164
- sky/clouds/azure.py +105 -83
- sky/clouds/cloud.py +140 -40
- sky/clouds/cudo.py +68 -50
- sky/clouds/do.py +66 -48
- sky/clouds/fluidstack.py +63 -44
- sky/clouds/gcp.py +339 -110
- sky/clouds/hyperbolic.py +293 -0
- sky/clouds/ibm.py +70 -49
- sky/clouds/kubernetes.py +570 -162
- sky/clouds/lambda_cloud.py +74 -54
- sky/clouds/nebius.py +210 -81
- sky/clouds/oci.py +88 -66
- sky/clouds/paperspace.py +61 -44
- sky/clouds/primeintellect.py +317 -0
- sky/clouds/runpod.py +164 -74
- sky/clouds/scp.py +89 -86
- sky/clouds/seeweb.py +477 -0
- sky/clouds/shadeform.py +400 -0
- sky/clouds/ssh.py +263 -0
- sky/clouds/utils/aws_utils.py +10 -4
- sky/clouds/utils/gcp_utils.py +87 -11
- sky/clouds/utils/oci_utils.py +38 -14
- sky/clouds/utils/scp_utils.py +231 -167
- sky/clouds/vast.py +99 -77
- sky/clouds/vsphere.py +51 -40
- sky/core.py +375 -173
- sky/dag.py +15 -0
- sky/dashboard/out/404.html +1 -1
- sky/dashboard/out/_next/static/96_E2yl3QAiIJGOYCkSpB/_buildManifest.js +1 -0
- sky/dashboard/out/_next/static/chunks/1141-e6aa9ab418717c59.js +11 -0
- sky/dashboard/out/_next/static/chunks/1272-1ef0bf0237faccdb.js +1 -0
- sky/dashboard/out/_next/static/chunks/1871-7e202677c42f43fe.js +6 -0
- sky/dashboard/out/_next/static/chunks/2260-7703229c33c5ebd5.js +1 -0
- sky/dashboard/out/_next/static/chunks/2350.fab69e61bac57b23.js +1 -0
- sky/dashboard/out/_next/static/chunks/2369.fc20f0c2c8ed9fe7.js +15 -0
- sky/dashboard/out/_next/static/chunks/2755.edd818326d489a1d.js +26 -0
- sky/dashboard/out/_next/static/chunks/3294.20a8540fe697d5ee.js +1 -0
- sky/dashboard/out/_next/static/chunks/3785.7e245f318f9d1121.js +1 -0
- sky/dashboard/out/_next/static/chunks/3800-7b45f9fbb6308557.js +1 -0
- sky/dashboard/out/_next/static/chunks/3850-ff4a9a69d978632b.js +1 -0
- sky/dashboard/out/_next/static/chunks/3937.210053269f121201.js +1 -0
- sky/dashboard/out/_next/static/chunks/4725.172ede95d1b21022.js +1 -0
- sky/dashboard/out/_next/static/chunks/4937.a2baa2df5572a276.js +15 -0
- sky/dashboard/out/_next/static/chunks/5739-d67458fcb1386c92.js +8 -0
- sky/dashboard/out/_next/static/chunks/6130-2be46d70a38f1e82.js +1 -0
- sky/dashboard/out/_next/static/chunks/616-3d59f75e2ccf9321.js +39 -0
- sky/dashboard/out/_next/static/chunks/6212-7bd06f60ba693125.js +13 -0
- sky/dashboard/out/_next/static/chunks/6856-8f27d1c10c98def8.js +1 -0
- sky/dashboard/out/_next/static/chunks/6989-01359c57e018caa4.js +1 -0
- sky/dashboard/out/_next/static/chunks/6990-9146207c4567fdfd.js +1 -0
- sky/dashboard/out/_next/static/chunks/7359-c8d04e06886000b3.js +30 -0
- sky/dashboard/out/_next/static/chunks/7411-b15471acd2cba716.js +41 -0
- sky/dashboard/out/_next/static/chunks/7615-019513abc55b3b47.js +1 -0
- sky/dashboard/out/_next/static/chunks/8640.5b9475a2d18c5416.js +16 -0
- sky/dashboard/out/_next/static/chunks/8969-452f9d5cbdd2dc73.js +1 -0
- sky/dashboard/out/_next/static/chunks/9025.fa408f3242e9028d.js +6 -0
- sky/dashboard/out/_next/static/chunks/9353-cff34f7e773b2e2b.js +1 -0
- sky/dashboard/out/_next/static/chunks/9360.a536cf6b1fa42355.js +31 -0
- sky/dashboard/out/_next/static/chunks/9847.3aaca6bb33455140.js +30 -0
- sky/dashboard/out/_next/static/chunks/fd9d1056-86323a29a8f7e46a.js +1 -0
- sky/dashboard/out/_next/static/chunks/framework-cf60a09ccd051a10.js +33 -0
- sky/dashboard/out/_next/static/chunks/main-app-587214043926b3cc.js +1 -0
- sky/dashboard/out/_next/static/chunks/main-f15ccb73239a3bf1.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/_app-bde01e4a2beec258.js +34 -0
- sky/dashboard/out/_next/static/chunks/pages/_error-c66a4e8afc46f17b.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-792db96d918c98c9.js +16 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-abfcac9c137aa543.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters-ee39056f9851a3ff.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/config-dfb9bf07b13045f4.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/index-444f1804401f04ea.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/infra/[context]-c0b5935149902e6f.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/infra-aed0ea19df7cf961.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-d66997e2bfc837cf.js +16 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs/pools/[pool]-9faf940b253e3e06.js +21 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs-2072b48b617989c9.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/users-f42674164aa73423.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/volumes-b84b948ff357c43e.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/workspace/new-3f88a1c7e86a3f86.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/workspaces/[name]-84a40f8c7c627fe4.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/workspaces-531b2f8c4bf89f82.js +1 -0
- sky/dashboard/out/_next/static/chunks/webpack-64e05f17bf2cf8ce.js +1 -0
- sky/dashboard/out/_next/static/css/0748ce22df867032.css +3 -0
- sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
- sky/dashboard/out/clusters/[cluster].html +1 -1
- sky/dashboard/out/clusters.html +1 -1
- sky/dashboard/out/config.html +1 -0
- sky/dashboard/out/index.html +1 -1
- sky/dashboard/out/infra/[context].html +1 -0
- sky/dashboard/out/infra.html +1 -0
- sky/dashboard/out/jobs/[job].html +1 -1
- sky/dashboard/out/jobs/pools/[pool].html +1 -0
- sky/dashboard/out/jobs.html +1 -1
- sky/dashboard/out/users.html +1 -0
- sky/dashboard/out/volumes.html +1 -0
- sky/dashboard/out/workspace/new.html +1 -0
- sky/dashboard/out/workspaces/[name].html +1 -0
- sky/dashboard/out/workspaces.html +1 -0
- sky/data/data_utils.py +137 -1
- sky/data/mounting_utils.py +269 -84
- sky/data/storage.py +1460 -1807
- sky/data/storage_utils.py +43 -57
- sky/exceptions.py +126 -2
- sky/execution.py +216 -63
- sky/global_user_state.py +2390 -586
- sky/jobs/__init__.py +7 -0
- sky/jobs/client/sdk.py +300 -58
- sky/jobs/client/sdk_async.py +161 -0
- sky/jobs/constants.py +15 -8
- sky/jobs/controller.py +848 -275
- sky/jobs/file_content_utils.py +128 -0
- sky/jobs/log_gc.py +193 -0
- sky/jobs/recovery_strategy.py +402 -152
- sky/jobs/scheduler.py +314 -189
- sky/jobs/server/core.py +836 -255
- sky/jobs/server/server.py +156 -115
- sky/jobs/server/utils.py +136 -0
- sky/jobs/state.py +2109 -706
- sky/jobs/utils.py +1306 -215
- sky/logs/__init__.py +21 -0
- sky/logs/agent.py +108 -0
- sky/logs/aws.py +243 -0
- sky/logs/gcp.py +91 -0
- sky/metrics/__init__.py +0 -0
- sky/metrics/utils.py +453 -0
- sky/models.py +78 -1
- sky/optimizer.py +164 -70
- sky/provision/__init__.py +90 -4
- sky/provision/aws/config.py +147 -26
- sky/provision/aws/instance.py +136 -50
- sky/provision/azure/instance.py +11 -6
- sky/provision/common.py +13 -1
- sky/provision/cudo/cudo_machine_type.py +1 -1
- sky/provision/cudo/cudo_utils.py +14 -8
- sky/provision/cudo/cudo_wrapper.py +72 -71
- sky/provision/cudo/instance.py +10 -6
- sky/provision/do/instance.py +10 -6
- sky/provision/do/utils.py +4 -3
- sky/provision/docker_utils.py +140 -33
- sky/provision/fluidstack/instance.py +13 -8
- sky/provision/gcp/__init__.py +1 -0
- sky/provision/gcp/config.py +301 -19
- sky/provision/gcp/constants.py +218 -0
- sky/provision/gcp/instance.py +36 -8
- sky/provision/gcp/instance_utils.py +18 -4
- sky/provision/gcp/volume_utils.py +247 -0
- sky/provision/hyperbolic/__init__.py +12 -0
- sky/provision/hyperbolic/config.py +10 -0
- sky/provision/hyperbolic/instance.py +437 -0
- sky/provision/hyperbolic/utils.py +373 -0
- sky/provision/instance_setup.py +101 -20
- sky/provision/kubernetes/__init__.py +5 -0
- sky/provision/kubernetes/config.py +9 -52
- sky/provision/kubernetes/constants.py +17 -0
- sky/provision/kubernetes/instance.py +919 -280
- sky/provision/kubernetes/manifests/fusermount-server-daemonset.yaml +1 -2
- sky/provision/kubernetes/network.py +27 -17
- sky/provision/kubernetes/network_utils.py +44 -43
- sky/provision/kubernetes/utils.py +1221 -534
- sky/provision/kubernetes/volume.py +343 -0
- sky/provision/lambda_cloud/instance.py +22 -16
- sky/provision/nebius/constants.py +50 -0
- sky/provision/nebius/instance.py +19 -6
- sky/provision/nebius/utils.py +237 -137
- sky/provision/oci/instance.py +10 -5
- sky/provision/paperspace/instance.py +10 -7
- sky/provision/paperspace/utils.py +1 -1
- sky/provision/primeintellect/__init__.py +10 -0
- sky/provision/primeintellect/config.py +11 -0
- sky/provision/primeintellect/instance.py +454 -0
- sky/provision/primeintellect/utils.py +398 -0
- sky/provision/provisioner.py +117 -36
- sky/provision/runpod/__init__.py +5 -0
- sky/provision/runpod/instance.py +27 -6
- sky/provision/runpod/utils.py +51 -18
- sky/provision/runpod/volume.py +214 -0
- sky/provision/scp/__init__.py +15 -0
- sky/provision/scp/config.py +93 -0
- sky/provision/scp/instance.py +707 -0
- sky/provision/seeweb/__init__.py +11 -0
- sky/provision/seeweb/config.py +13 -0
- sky/provision/seeweb/instance.py +812 -0
- sky/provision/shadeform/__init__.py +11 -0
- sky/provision/shadeform/config.py +12 -0
- sky/provision/shadeform/instance.py +351 -0
- sky/provision/shadeform/shadeform_utils.py +83 -0
- sky/provision/ssh/__init__.py +18 -0
- sky/provision/vast/instance.py +13 -8
- sky/provision/vast/utils.py +10 -7
- sky/provision/volume.py +164 -0
- sky/provision/vsphere/common/ssl_helper.py +1 -1
- sky/provision/vsphere/common/vapiconnect.py +2 -1
- sky/provision/vsphere/common/vim_utils.py +4 -4
- sky/provision/vsphere/instance.py +15 -10
- sky/provision/vsphere/vsphere_utils.py +17 -20
- sky/py.typed +0 -0
- sky/resources.py +845 -119
- sky/schemas/__init__.py +0 -0
- sky/schemas/api/__init__.py +0 -0
- sky/schemas/api/responses.py +227 -0
- sky/schemas/db/README +4 -0
- sky/schemas/db/env.py +90 -0
- sky/schemas/db/global_user_state/001_initial_schema.py +124 -0
- sky/schemas/db/global_user_state/002_add_workspace_to_cluster_history.py +35 -0
- sky/schemas/db/global_user_state/003_fix_initial_revision.py +61 -0
- sky/schemas/db/global_user_state/004_is_managed.py +34 -0
- sky/schemas/db/global_user_state/005_cluster_event.py +32 -0
- sky/schemas/db/global_user_state/006_provision_log.py +41 -0
- sky/schemas/db/global_user_state/007_cluster_event_request_id.py +34 -0
- sky/schemas/db/global_user_state/008_skylet_ssh_tunnel_metadata.py +34 -0
- sky/schemas/db/global_user_state/009_last_activity_and_launched_at.py +89 -0
- sky/schemas/db/global_user_state/010_save_ssh_key.py +66 -0
- sky/schemas/db/global_user_state/011_is_ephemeral.py +34 -0
- sky/schemas/db/kv_cache/001_initial_schema.py +29 -0
- sky/schemas/db/script.py.mako +28 -0
- sky/schemas/db/serve_state/001_initial_schema.py +67 -0
- sky/schemas/db/serve_state/002_yaml_content.py +34 -0
- sky/schemas/db/skypilot_config/001_initial_schema.py +30 -0
- sky/schemas/db/spot_jobs/001_initial_schema.py +97 -0
- sky/schemas/db/spot_jobs/002_cluster_pool.py +42 -0
- sky/schemas/db/spot_jobs/003_pool_hash.py +34 -0
- sky/schemas/db/spot_jobs/004_job_file_contents.py +42 -0
- sky/schemas/db/spot_jobs/005_logs_gc.py +38 -0
- sky/schemas/db/spot_jobs/006_controller_pid_started_at.py +34 -0
- sky/schemas/db/spot_jobs/007_config_file_content.py +34 -0
- sky/schemas/generated/__init__.py +0 -0
- sky/schemas/generated/autostopv1_pb2.py +36 -0
- sky/schemas/generated/autostopv1_pb2.pyi +43 -0
- sky/schemas/generated/autostopv1_pb2_grpc.py +146 -0
- sky/schemas/generated/jobsv1_pb2.py +86 -0
- sky/schemas/generated/jobsv1_pb2.pyi +254 -0
- sky/schemas/generated/jobsv1_pb2_grpc.py +542 -0
- sky/schemas/generated/managed_jobsv1_pb2.py +76 -0
- sky/schemas/generated/managed_jobsv1_pb2.pyi +278 -0
- sky/schemas/generated/managed_jobsv1_pb2_grpc.py +278 -0
- sky/schemas/generated/servev1_pb2.py +58 -0
- sky/schemas/generated/servev1_pb2.pyi +115 -0
- sky/schemas/generated/servev1_pb2_grpc.py +322 -0
- sky/serve/autoscalers.py +357 -5
- sky/serve/client/impl.py +310 -0
- sky/serve/client/sdk.py +47 -139
- sky/serve/client/sdk_async.py +130 -0
- sky/serve/constants.py +12 -9
- sky/serve/controller.py +68 -17
- sky/serve/load_balancer.py +106 -60
- sky/serve/load_balancing_policies.py +116 -2
- sky/serve/replica_managers.py +434 -249
- sky/serve/serve_rpc_utils.py +179 -0
- sky/serve/serve_state.py +569 -257
- sky/serve/serve_utils.py +775 -265
- sky/serve/server/core.py +66 -711
- sky/serve/server/impl.py +1093 -0
- sky/serve/server/server.py +21 -18
- sky/serve/service.py +192 -89
- sky/serve/service_spec.py +144 -20
- sky/serve/spot_placer.py +3 -0
- sky/server/auth/__init__.py +0 -0
- sky/server/auth/authn.py +50 -0
- sky/server/auth/loopback.py +38 -0
- sky/server/auth/oauth2_proxy.py +202 -0
- sky/server/common.py +478 -182
- sky/server/config.py +85 -23
- sky/server/constants.py +44 -6
- sky/server/daemons.py +295 -0
- sky/server/html/token_page.html +185 -0
- sky/server/metrics.py +160 -0
- sky/server/middleware_utils.py +166 -0
- sky/server/requests/executor.py +558 -138
- sky/server/requests/payloads.py +364 -24
- sky/server/requests/preconditions.py +21 -17
- sky/server/requests/process.py +112 -29
- sky/server/requests/request_names.py +121 -0
- sky/server/requests/requests.py +822 -226
- sky/server/requests/serializers/decoders.py +82 -31
- sky/server/requests/serializers/encoders.py +140 -22
- sky/server/requests/threads.py +117 -0
- sky/server/rest.py +455 -0
- sky/server/server.py +1309 -285
- sky/server/state.py +20 -0
- sky/server/stream_utils.py +327 -61
- sky/server/uvicorn.py +217 -3
- sky/server/versions.py +270 -0
- sky/setup_files/MANIFEST.in +11 -1
- sky/setup_files/alembic.ini +160 -0
- sky/setup_files/dependencies.py +139 -31
- sky/setup_files/setup.py +44 -42
- sky/sky_logging.py +114 -7
- sky/skylet/attempt_skylet.py +106 -24
- sky/skylet/autostop_lib.py +129 -8
- sky/skylet/configs.py +29 -20
- sky/skylet/constants.py +216 -25
- sky/skylet/events.py +101 -21
- sky/skylet/job_lib.py +345 -164
- sky/skylet/log_lib.py +297 -18
- sky/skylet/log_lib.pyi +44 -1
- sky/skylet/providers/ibm/node_provider.py +12 -8
- sky/skylet/providers/ibm/vpc_provider.py +13 -12
- sky/skylet/ray_patches/__init__.py +17 -3
- sky/skylet/ray_patches/autoscaler.py.diff +18 -0
- sky/skylet/ray_patches/cli.py.diff +19 -0
- sky/skylet/ray_patches/command_runner.py.diff +17 -0
- sky/skylet/ray_patches/log_monitor.py.diff +20 -0
- sky/skylet/ray_patches/resource_demand_scheduler.py.diff +32 -0
- sky/skylet/ray_patches/updater.py.diff +18 -0
- sky/skylet/ray_patches/worker.py.diff +41 -0
- sky/skylet/runtime_utils.py +21 -0
- sky/skylet/services.py +568 -0
- sky/skylet/skylet.py +72 -4
- sky/skylet/subprocess_daemon.py +104 -29
- sky/skypilot_config.py +506 -99
- sky/ssh_node_pools/__init__.py +1 -0
- sky/ssh_node_pools/core.py +135 -0
- sky/ssh_node_pools/server.py +233 -0
- sky/task.py +685 -163
- sky/templates/aws-ray.yml.j2 +11 -3
- sky/templates/azure-ray.yml.j2 +2 -1
- sky/templates/cudo-ray.yml.j2 +1 -0
- sky/templates/do-ray.yml.j2 +2 -1
- sky/templates/fluidstack-ray.yml.j2 +1 -0
- sky/templates/gcp-ray.yml.j2 +62 -1
- sky/templates/hyperbolic-ray.yml.j2 +68 -0
- sky/templates/ibm-ray.yml.j2 +2 -1
- sky/templates/jobs-controller.yaml.j2 +27 -24
- sky/templates/kubernetes-loadbalancer.yml.j2 +2 -0
- sky/templates/kubernetes-ray.yml.j2 +611 -50
- sky/templates/lambda-ray.yml.j2 +2 -1
- sky/templates/nebius-ray.yml.j2 +34 -12
- sky/templates/oci-ray.yml.j2 +1 -0
- sky/templates/paperspace-ray.yml.j2 +2 -1
- sky/templates/primeintellect-ray.yml.j2 +72 -0
- sky/templates/runpod-ray.yml.j2 +10 -1
- sky/templates/scp-ray.yml.j2 +4 -50
- sky/templates/seeweb-ray.yml.j2 +171 -0
- sky/templates/shadeform-ray.yml.j2 +73 -0
- sky/templates/sky-serve-controller.yaml.j2 +22 -2
- sky/templates/vast-ray.yml.j2 +1 -0
- sky/templates/vsphere-ray.yml.j2 +1 -0
- sky/templates/websocket_proxy.py +212 -37
- sky/usage/usage_lib.py +31 -15
- sky/users/__init__.py +0 -0
- sky/users/model.conf +15 -0
- sky/users/permission.py +397 -0
- sky/users/rbac.py +121 -0
- sky/users/server.py +720 -0
- sky/users/token_service.py +218 -0
- sky/utils/accelerator_registry.py +35 -5
- sky/utils/admin_policy_utils.py +84 -38
- sky/utils/annotations.py +38 -5
- sky/utils/asyncio_utils.py +78 -0
- sky/utils/atomic.py +1 -1
- sky/utils/auth_utils.py +153 -0
- sky/utils/benchmark_utils.py +60 -0
- sky/utils/cli_utils/status_utils.py +159 -86
- sky/utils/cluster_utils.py +31 -9
- sky/utils/command_runner.py +354 -68
- sky/utils/command_runner.pyi +93 -3
- sky/utils/common.py +35 -8
- sky/utils/common_utils.py +314 -91
- sky/utils/config_utils.py +74 -5
- sky/utils/context.py +403 -0
- sky/utils/context_utils.py +242 -0
- sky/utils/controller_utils.py +383 -89
- sky/utils/dag_utils.py +31 -12
- sky/utils/db/__init__.py +0 -0
- sky/utils/db/db_utils.py +485 -0
- sky/utils/db/kv_cache.py +149 -0
- sky/utils/db/migration_utils.py +137 -0
- sky/utils/directory_utils.py +12 -0
- sky/utils/env_options.py +13 -0
- sky/utils/git.py +567 -0
- sky/utils/git_clone.sh +460 -0
- sky/utils/infra_utils.py +195 -0
- sky/utils/kubernetes/cleanup-tunnel.sh +62 -0
- sky/utils/kubernetes/config_map_utils.py +133 -0
- sky/utils/kubernetes/create_cluster.sh +15 -29
- sky/utils/kubernetes/delete_cluster.sh +10 -7
- sky/utils/kubernetes/deploy_ssh_node_pools.py +1177 -0
- sky/utils/kubernetes/exec_kubeconfig_converter.py +22 -31
- sky/utils/kubernetes/generate_kind_config.py +6 -66
- sky/utils/kubernetes/generate_kubeconfig.sh +4 -1
- sky/utils/kubernetes/gpu_labeler.py +18 -8
- sky/utils/kubernetes/k8s_gpu_labeler_job.yaml +2 -1
- sky/utils/kubernetes/k8s_gpu_labeler_setup.yaml +16 -16
- sky/utils/kubernetes/kubernetes_deploy_utils.py +284 -114
- sky/utils/kubernetes/rsync_helper.sh +11 -3
- sky/utils/kubernetes/ssh-tunnel.sh +379 -0
- sky/utils/kubernetes/ssh_utils.py +221 -0
- sky/utils/kubernetes_enums.py +8 -15
- sky/utils/lock_events.py +94 -0
- sky/utils/locks.py +416 -0
- sky/utils/log_utils.py +82 -107
- sky/utils/perf_utils.py +22 -0
- sky/utils/resource_checker.py +298 -0
- sky/utils/resources_utils.py +249 -32
- sky/utils/rich_utils.py +217 -39
- sky/utils/schemas.py +955 -160
- sky/utils/serialize_utils.py +16 -0
- sky/utils/status_lib.py +10 -0
- sky/utils/subprocess_utils.py +29 -15
- sky/utils/tempstore.py +70 -0
- sky/utils/thread_utils.py +91 -0
- sky/utils/timeline.py +26 -53
- sky/utils/ux_utils.py +84 -15
- sky/utils/validator.py +11 -1
- sky/utils/volume.py +165 -0
- sky/utils/yaml_utils.py +111 -0
- sky/volumes/__init__.py +13 -0
- sky/volumes/client/__init__.py +0 -0
- sky/volumes/client/sdk.py +150 -0
- sky/volumes/server/__init__.py +0 -0
- sky/volumes/server/core.py +270 -0
- sky/volumes/server/server.py +124 -0
- sky/volumes/volume.py +215 -0
- sky/workspaces/__init__.py +0 -0
- sky/workspaces/core.py +655 -0
- sky/workspaces/server.py +101 -0
- sky/workspaces/utils.py +56 -0
- sky_templates/README.md +3 -0
- sky_templates/__init__.py +3 -0
- sky_templates/ray/__init__.py +0 -0
- sky_templates/ray/start_cluster +183 -0
- sky_templates/ray/stop_cluster +75 -0
- skypilot_nightly-1.0.0.dev20251203.dist-info/METADATA +676 -0
- skypilot_nightly-1.0.0.dev20251203.dist-info/RECORD +611 -0
- {skypilot_nightly-1.0.0.dev20250502.dist-info → skypilot_nightly-1.0.0.dev20251203.dist-info}/WHEEL +1 -1
- skypilot_nightly-1.0.0.dev20251203.dist-info/top_level.txt +2 -0
- sky/benchmark/benchmark_state.py +0 -256
- sky/benchmark/benchmark_utils.py +0 -641
- sky/clouds/service_catalog/constants.py +0 -7
- sky/dashboard/out/_next/static/GWvVBSCS7FmUiVmjaL1a7/_buildManifest.js +0 -1
- sky/dashboard/out/_next/static/chunks/236-2db3ee3fba33dd9e.js +0 -6
- sky/dashboard/out/_next/static/chunks/312-c3c8845990db8ffc.js +0 -15
- sky/dashboard/out/_next/static/chunks/37-0a572fe0dbb89c4d.js +0 -6
- sky/dashboard/out/_next/static/chunks/678-206dddca808e6d16.js +0 -59
- sky/dashboard/out/_next/static/chunks/845-9e60713e0c441abc.js +0 -1
- sky/dashboard/out/_next/static/chunks/979-7bf73a4c7cea0f5c.js +0 -1
- sky/dashboard/out/_next/static/chunks/fd9d1056-2821b0f0cabcd8bd.js +0 -1
- sky/dashboard/out/_next/static/chunks/framework-87d061ee6ed71b28.js +0 -33
- sky/dashboard/out/_next/static/chunks/main-app-241eb28595532291.js +0 -1
- sky/dashboard/out/_next/static/chunks/main-e0e2335212e72357.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/_app-e6b013bc3f77ad60.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/_error-1be831200e60c5c0.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-6ac338bc2239cb45.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-f383db7389368ea7.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/clusters-a93b93e10b8b074e.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/index-f9f039532ca8cbc4.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-1c519e1afc523dc9.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/jobs-a75029b67aab6a2e.js +0 -1
- sky/dashboard/out/_next/static/chunks/webpack-830f59b8404e96b8.js +0 -1
- sky/dashboard/out/_next/static/css/c6933bbb2ce7f4dd.css +0 -3
- sky/jobs/dashboard/dashboard.py +0 -223
- sky/jobs/dashboard/static/favicon.ico +0 -0
- sky/jobs/dashboard/templates/index.html +0 -831
- sky/jobs/server/dashboard_utils.py +0 -69
- sky/skylet/providers/scp/__init__.py +0 -2
- sky/skylet/providers/scp/config.py +0 -149
- sky/skylet/providers/scp/node_provider.py +0 -578
- sky/templates/kubernetes-ssh-jump.yml.j2 +0 -94
- sky/utils/db_utils.py +0 -100
- sky/utils/kubernetes/deploy_remote_cluster.sh +0 -308
- sky/utils/kubernetes/ssh_jump_lifecycle_manager.py +0 -191
- skypilot_nightly-1.0.0.dev20250502.dist-info/METADATA +0 -361
- skypilot_nightly-1.0.0.dev20250502.dist-info/RECORD +0 -396
- skypilot_nightly-1.0.0.dev20250502.dist-info/top_level.txt +0 -1
- /sky/{clouds/service_catalog → catalog}/config.py +0 -0
- /sky/{benchmark → catalog/data_fetchers}/__init__.py +0 -0
- /sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_azure.py +0 -0
- /sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_fluidstack.py +0 -0
- /sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_ibm.py +0 -0
- /sky/{clouds/service_catalog/data_fetchers → client/cli}/__init__.py +0 -0
- /sky/dashboard/out/_next/static/{GWvVBSCS7FmUiVmjaL1a7 → 96_E2yl3QAiIJGOYCkSpB}/_ssgManifest.js +0 -0
- {skypilot_nightly-1.0.0.dev20250502.dist-info → skypilot_nightly-1.0.0.dev20251203.dist-info}/entry_points.txt +0 -0
- {skypilot_nightly-1.0.0.dev20250502.dist-info → skypilot_nightly-1.0.0.dev20251203.dist-info}/licenses/LICENSE +0 -0
sky/serve/server/server.py
CHANGED
|
@@ -10,6 +10,7 @@ from sky.server import common as server_common
|
|
|
10
10
|
from sky.server import stream_utils
|
|
11
11
|
from sky.server.requests import executor
|
|
12
12
|
from sky.server.requests import payloads
|
|
13
|
+
from sky.server.requests import request_names
|
|
13
14
|
from sky.server.requests import requests as api_requests
|
|
14
15
|
from sky.skylet import constants
|
|
15
16
|
from sky.utils import common
|
|
@@ -23,9 +24,9 @@ async def up(
|
|
|
23
24
|
request: fastapi.Request,
|
|
24
25
|
up_body: payloads.ServeUpBody,
|
|
25
26
|
) -> None:
|
|
26
|
-
executor.
|
|
27
|
+
await executor.schedule_request_async(
|
|
27
28
|
request_id=request.state.request_id,
|
|
28
|
-
request_name=
|
|
29
|
+
request_name=request_names.RequestName.SERVE_UP,
|
|
29
30
|
request_body=up_body,
|
|
30
31
|
func=core.up,
|
|
31
32
|
schedule_type=api_requests.ScheduleType.LONG,
|
|
@@ -38,9 +39,9 @@ async def update(
|
|
|
38
39
|
request: fastapi.Request,
|
|
39
40
|
update_body: payloads.ServeUpdateBody,
|
|
40
41
|
) -> None:
|
|
41
|
-
executor.
|
|
42
|
+
await executor.schedule_request_async(
|
|
42
43
|
request_id=request.state.request_id,
|
|
43
|
-
request_name=
|
|
44
|
+
request_name=request_names.RequestName.SERVE_UPDATE,
|
|
44
45
|
request_body=update_body,
|
|
45
46
|
func=core.update,
|
|
46
47
|
schedule_type=api_requests.ScheduleType.SHORT,
|
|
@@ -53,9 +54,9 @@ async def down(
|
|
|
53
54
|
request: fastapi.Request,
|
|
54
55
|
down_body: payloads.ServeDownBody,
|
|
55
56
|
) -> None:
|
|
56
|
-
executor.
|
|
57
|
+
await executor.schedule_request_async(
|
|
57
58
|
request_id=request.state.request_id,
|
|
58
|
-
request_name=
|
|
59
|
+
request_name=request_names.RequestName.SERVE_DOWN,
|
|
59
60
|
request_body=down_body,
|
|
60
61
|
func=core.down,
|
|
61
62
|
schedule_type=api_requests.ScheduleType.SHORT,
|
|
@@ -68,9 +69,9 @@ async def terminate_replica(
|
|
|
68
69
|
request: fastapi.Request,
|
|
69
70
|
terminate_replica_body: payloads.ServeTerminateReplicaBody,
|
|
70
71
|
) -> None:
|
|
71
|
-
executor.
|
|
72
|
+
await executor.schedule_request_async(
|
|
72
73
|
request_id=request.state.request_id,
|
|
73
|
-
request_name=
|
|
74
|
+
request_name=request_names.RequestName.SERVE_TERMINATE_REPLICA,
|
|
74
75
|
request_body=terminate_replica_body,
|
|
75
76
|
func=core.terminate_replica,
|
|
76
77
|
schedule_type=api_requests.ScheduleType.SHORT,
|
|
@@ -83,9 +84,9 @@ async def status(
|
|
|
83
84
|
request: fastapi.Request,
|
|
84
85
|
status_body: payloads.ServeStatusBody,
|
|
85
86
|
) -> None:
|
|
86
|
-
executor.
|
|
87
|
+
await executor.schedule_request_async(
|
|
87
88
|
request_id=request.state.request_id,
|
|
88
|
-
request_name=
|
|
89
|
+
request_name=request_names.RequestName.SERVE_STATUS,
|
|
89
90
|
request_body=status_body,
|
|
90
91
|
func=core.status,
|
|
91
92
|
schedule_type=api_requests.ScheduleType.SHORT,
|
|
@@ -98,21 +99,23 @@ async def tail_logs(
|
|
|
98
99
|
request: fastapi.Request, log_body: payloads.ServeLogsBody,
|
|
99
100
|
background_tasks: fastapi.BackgroundTasks
|
|
100
101
|
) -> fastapi.responses.StreamingResponse:
|
|
101
|
-
executor.
|
|
102
|
+
executor.check_request_thread_executor_available()
|
|
103
|
+
request_task = await executor.prepare_request_async(
|
|
102
104
|
request_id=request.state.request_id,
|
|
103
|
-
request_name=
|
|
105
|
+
request_name=request_names.RequestName.SERVE_LOGS,
|
|
104
106
|
request_body=log_body,
|
|
105
107
|
func=core.tail_logs,
|
|
106
108
|
schedule_type=api_requests.ScheduleType.SHORT,
|
|
107
109
|
request_cluster_name=common.SKY_SERVE_CONTROLLER_NAME,
|
|
108
110
|
)
|
|
109
|
-
|
|
110
|
-
|
|
111
|
-
|
|
112
|
-
return stream_utils.
|
|
111
|
+
task = executor.execute_request_in_coroutine(request_task)
|
|
112
|
+
# Cancel the coroutine after the request is done or client disconnects
|
|
113
|
+
background_tasks.add_task(task.cancel)
|
|
114
|
+
return stream_utils.stream_response_for_long_request(
|
|
113
115
|
request_id=request_task.request_id,
|
|
114
116
|
logs_path=request_task.log_path,
|
|
115
117
|
background_tasks=background_tasks,
|
|
118
|
+
kill_request_on_disconnect=False,
|
|
116
119
|
)
|
|
117
120
|
|
|
118
121
|
|
|
@@ -130,9 +133,9 @@ async def download_logs(
|
|
|
130
133
|
# We should reuse the original request body, so that the env vars, such as
|
|
131
134
|
# user hash, are kept the same.
|
|
132
135
|
download_logs_body.local_dir = str(logs_dir_on_api_server)
|
|
133
|
-
executor.
|
|
136
|
+
await executor.schedule_request_async(
|
|
134
137
|
request_id=request.state.request_id,
|
|
135
|
-
request_name=
|
|
138
|
+
request_name=request_names.RequestName.SERVE_SYNC_DOWN_LOGS,
|
|
136
139
|
request_body=download_logs_body,
|
|
137
140
|
func=core.sync_down_logs,
|
|
138
141
|
schedule_type=api_requests.ScheduleType.SHORT,
|
sky/serve/service.py
CHANGED
|
@@ -13,12 +13,13 @@ from typing import Dict
|
|
|
13
13
|
|
|
14
14
|
import filelock
|
|
15
15
|
|
|
16
|
-
from sky import authentication
|
|
17
16
|
from sky import exceptions
|
|
17
|
+
from sky import global_user_state
|
|
18
18
|
from sky import sky_logging
|
|
19
19
|
from sky import task as task_lib
|
|
20
20
|
from sky.backends import backend_utils
|
|
21
21
|
from sky.backends import cloud_vm_ray_backend
|
|
22
|
+
from sky.data import data_utils
|
|
22
23
|
from sky.serve import constants
|
|
23
24
|
from sky.serve import controller
|
|
24
25
|
from sky.serve import load_balancer
|
|
@@ -26,8 +27,11 @@ from sky.serve import replica_managers
|
|
|
26
27
|
from sky.serve import serve_state
|
|
27
28
|
from sky.serve import serve_utils
|
|
28
29
|
from sky.skylet import constants as skylet_constants
|
|
30
|
+
from sky.utils import auth_utils
|
|
29
31
|
from sky.utils import common_utils
|
|
32
|
+
from sky.utils import controller_utils
|
|
30
33
|
from sky.utils import subprocess_utils
|
|
34
|
+
from sky.utils import thread_utils
|
|
31
35
|
from sky.utils import ux_utils
|
|
32
36
|
|
|
33
37
|
# Use the explicit logger name so that the logger is under the
|
|
@@ -62,17 +66,19 @@ def _handle_signal(service_name: str) -> None:
|
|
|
62
66
|
raise error_type(f'User signal received: {user_signal.value}')
|
|
63
67
|
|
|
64
68
|
|
|
65
|
-
def cleanup_storage(
|
|
69
|
+
def cleanup_storage(yaml_content: str) -> bool:
|
|
66
70
|
"""Clean up the storage for the service.
|
|
67
71
|
|
|
68
72
|
Args:
|
|
69
|
-
|
|
73
|
+
yaml_content: The yaml content of the service.
|
|
70
74
|
|
|
71
75
|
Returns:
|
|
72
76
|
True if the storage is cleaned up successfully, False otherwise.
|
|
73
77
|
"""
|
|
78
|
+
failed = False
|
|
79
|
+
|
|
74
80
|
try:
|
|
75
|
-
task = task_lib.Task.
|
|
81
|
+
task = task_lib.Task.from_yaml_str(yaml_content)
|
|
76
82
|
backend = cloud_vm_ray_backend.CloudVmRayBackend()
|
|
77
83
|
# Need to re-construct storage object in the controller process
|
|
78
84
|
# because when SkyPilot API server machine sends the yaml config to the
|
|
@@ -86,54 +92,125 @@ def cleanup_storage(task_yaml: str) -> bool:
|
|
|
86
92
|
f'{common_utils.format_exception(e)}')
|
|
87
93
|
with ux_utils.enable_traceback():
|
|
88
94
|
logger.error(f' Traceback: {traceback.format_exc()}')
|
|
89
|
-
|
|
90
|
-
|
|
95
|
+
failed = True
|
|
96
|
+
|
|
97
|
+
# Clean up any files mounted from the local disk, such as two-hop file
|
|
98
|
+
# mounts.
|
|
99
|
+
for file_mount in (task.file_mounts or {}).values():
|
|
100
|
+
try:
|
|
101
|
+
if not data_utils.is_cloud_store_url(file_mount):
|
|
102
|
+
path = os.path.expanduser(file_mount)
|
|
103
|
+
if os.path.isdir(path):
|
|
104
|
+
shutil.rmtree(path)
|
|
105
|
+
else:
|
|
106
|
+
os.remove(path)
|
|
107
|
+
except Exception as e: # pylint: disable=broad-except
|
|
108
|
+
logger.error(f'Failed to clean up file mount {file_mount}: {e}')
|
|
109
|
+
with ux_utils.enable_traceback():
|
|
110
|
+
logger.error(f' Traceback: {traceback.format_exc()}')
|
|
111
|
+
failed = True
|
|
112
|
+
|
|
113
|
+
return not failed
|
|
91
114
|
|
|
92
115
|
|
|
93
|
-
|
|
116
|
+
# NOTE(dev): We don't need to acquire the `with_lock` in replica manager here
|
|
117
|
+
# because we killed all the processes (controller & replica manager) before
|
|
118
|
+
# calling this function.
|
|
119
|
+
def _cleanup(service_name: str, pool: bool) -> bool:
|
|
94
120
|
"""Clean up all service related resources, i.e. replicas and storage."""
|
|
121
|
+
# Cleanup the HA recovery script first as it is possible that some error
|
|
122
|
+
# was raised when we construct the task object (e.g.,
|
|
123
|
+
# sky.exceptions.ResourcesUnavailableError).
|
|
124
|
+
serve_state.remove_ha_recovery_script(service_name)
|
|
95
125
|
failed = False
|
|
96
126
|
replica_infos = serve_state.get_replica_infos(service_name)
|
|
97
|
-
|
|
98
|
-
|
|
127
|
+
info2thr: Dict[replica_managers.ReplicaInfo,
|
|
128
|
+
thread_utils.SafeThread] = dict()
|
|
129
|
+
# NOTE(dev): This relies on `sky/serve/serve_utils.py::
|
|
130
|
+
# generate_replica_cluster_name`. Change it if you change the function.
|
|
131
|
+
existing_cluster_names = global_user_state.get_cluster_names_start_with(
|
|
132
|
+
service_name)
|
|
99
133
|
for info in replica_infos:
|
|
100
|
-
|
|
101
|
-
|
|
102
|
-
|
|
103
|
-
|
|
134
|
+
if info.cluster_name not in existing_cluster_names:
|
|
135
|
+
logger.info(f'Cluster {info.cluster_name} for replica '
|
|
136
|
+
f'{info.replica_id} not found. Might be a failed '
|
|
137
|
+
'cluster. Skipping.')
|
|
138
|
+
continue
|
|
139
|
+
|
|
140
|
+
log_file_name = serve_utils.generate_replica_log_file_name(
|
|
141
|
+
service_name, info.replica_id)
|
|
142
|
+
t = thread_utils.SafeThread(target=replica_managers.terminate_cluster,
|
|
143
|
+
args=(info.cluster_name, log_file_name))
|
|
144
|
+
info2thr[info] = t
|
|
104
145
|
# Set replica status to `SHUTTING_DOWN`
|
|
105
146
|
info.status_property.sky_launch_status = (
|
|
106
|
-
replica_managers.ProcessStatus.SUCCEEDED)
|
|
147
|
+
replica_managers.common_utils.ProcessStatus.SUCCEEDED)
|
|
107
148
|
info.status_property.sky_down_status = (
|
|
108
|
-
replica_managers.ProcessStatus.
|
|
149
|
+
replica_managers.common_utils.ProcessStatus.SCHEDULED)
|
|
109
150
|
serve_state.add_or_update_replica(service_name, info.replica_id, info)
|
|
110
|
-
logger.info(f'
|
|
111
|
-
|
|
112
|
-
|
|
113
|
-
|
|
114
|
-
|
|
115
|
-
|
|
116
|
-
|
|
117
|
-
|
|
118
|
-
|
|
119
|
-
|
|
120
|
-
|
|
121
|
-
|
|
122
|
-
|
|
123
|
-
|
|
124
|
-
|
|
125
|
-
|
|
151
|
+
logger.info(f'Scheduling to terminate replica {info.replica_id} ...')
|
|
152
|
+
|
|
153
|
+
def _set_to_failed_cleanup(info: replica_managers.ReplicaInfo) -> None:
|
|
154
|
+
nonlocal failed
|
|
155
|
+
# Set replica status to `FAILED_CLEANUP`
|
|
156
|
+
info.status_property.sky_down_status = (
|
|
157
|
+
replica_managers.common_utils.ProcessStatus.FAILED)
|
|
158
|
+
serve_state.add_or_update_replica(service_name, info.replica_id, info)
|
|
159
|
+
failed = True
|
|
160
|
+
logger.error(f'Replica {info.replica_id} failed to terminate.')
|
|
161
|
+
|
|
162
|
+
# Please reference to sky/serve/replica_managers.py::_refresh_process_pool.
|
|
163
|
+
# TODO(tian): Refactor to use the same logic and code.
|
|
164
|
+
while info2thr:
|
|
165
|
+
snapshot = list(info2thr.items())
|
|
166
|
+
for info, t in snapshot:
|
|
167
|
+
if t.is_alive():
|
|
168
|
+
continue
|
|
169
|
+
if (info.status_property.sky_down_status ==
|
|
170
|
+
replica_managers.common_utils.ProcessStatus.SCHEDULED):
|
|
171
|
+
if controller_utils.can_terminate(pool):
|
|
172
|
+
try:
|
|
173
|
+
t.start()
|
|
174
|
+
except Exception as e: # pylint: disable=broad-except
|
|
175
|
+
_set_to_failed_cleanup(info)
|
|
176
|
+
logger.error(f'Failed to start thread for replica '
|
|
177
|
+
f'{info.replica_id}: {e}')
|
|
178
|
+
del info2thr[info]
|
|
179
|
+
else:
|
|
180
|
+
info.status_property.sky_down_status = (
|
|
181
|
+
common_utils.ProcessStatus.RUNNING)
|
|
182
|
+
serve_state.add_or_update_replica(
|
|
183
|
+
service_name, info.replica_id, info)
|
|
184
|
+
else:
|
|
185
|
+
logger.info('Terminate thread for replica '
|
|
186
|
+
f'{info.replica_id} finished.')
|
|
187
|
+
t.join()
|
|
188
|
+
del info2thr[info]
|
|
189
|
+
if t.format_exc is None:
|
|
190
|
+
serve_state.remove_replica(service_name, info.replica_id)
|
|
191
|
+
logger.info(
|
|
192
|
+
f'Replica {info.replica_id} terminated successfully.')
|
|
193
|
+
else:
|
|
194
|
+
_set_to_failed_cleanup(info)
|
|
195
|
+
time.sleep(3)
|
|
126
196
|
|
|
127
197
|
def cleanup_version_storage(version: int) -> bool:
|
|
128
|
-
|
|
129
|
-
|
|
198
|
+
yaml_content = serve_state.get_yaml_content(service_name, version)
|
|
199
|
+
if yaml_content is None:
|
|
200
|
+
logger.warning(f'No yaml content found for version {version}')
|
|
201
|
+
return True
|
|
130
202
|
logger.info(f'Cleaning up storage for version {version}, '
|
|
131
|
-
f'
|
|
132
|
-
return cleanup_storage(
|
|
203
|
+
f'yaml_content: {yaml_content}')
|
|
204
|
+
return cleanup_storage(yaml_content)
|
|
133
205
|
|
|
206
|
+
versions = serve_state.get_service_versions(service_name)
|
|
134
207
|
if not all(map(cleanup_version_storage, versions)):
|
|
135
208
|
failed = True
|
|
136
209
|
|
|
210
|
+
# Cleanup version metadata after all storages are cleaned up, otherwise
|
|
211
|
+
# the get_yaml_content will return None as all versions are deleted.
|
|
212
|
+
serve_state.delete_all_versions(service_name)
|
|
213
|
+
|
|
137
214
|
return failed
|
|
138
215
|
|
|
139
216
|
|
|
@@ -152,73 +229,79 @@ def _cleanup_task_run_script(job_id: int) -> None:
|
|
|
152
229
|
logger.warning(f'Task run script {this_task_run_script} not found')
|
|
153
230
|
|
|
154
231
|
|
|
155
|
-
def _start(service_name: str, tmp_task_yaml: str, job_id: int):
|
|
232
|
+
def _start(service_name: str, tmp_task_yaml: str, job_id: int, entrypoint: str):
|
|
156
233
|
"""Starts the service.
|
|
157
234
|
This including the controller and load balancer.
|
|
158
235
|
"""
|
|
159
236
|
# Generate ssh key pair to avoid race condition when multiple sky.launch
|
|
160
237
|
# are executed at the same time.
|
|
161
|
-
|
|
238
|
+
auth_utils.get_or_generate_keys()
|
|
162
239
|
|
|
163
|
-
|
|
164
|
-
|
|
165
|
-
# Already checked before submit to controller.
|
|
166
|
-
assert task.service is not None, task
|
|
167
|
-
service_spec = task.service
|
|
168
|
-
|
|
169
|
-
def is_recovery_mode(service_name: str) -> bool:
|
|
170
|
-
"""Check if service exists in database to determine recovery mode.
|
|
171
|
-
"""
|
|
172
|
-
service = serve_state.get_service_from_name(service_name)
|
|
173
|
-
return service is not None
|
|
174
|
-
|
|
175
|
-
is_recovery = is_recovery_mode(service_name)
|
|
240
|
+
service = serve_state.get_service_from_name(service_name)
|
|
241
|
+
is_recovery = service is not None
|
|
176
242
|
logger.info(f'It is a {"first" if not is_recovery else "recovery"} run')
|
|
177
243
|
|
|
244
|
+
def _read_yaml_content(yaml_path: str) -> str:
|
|
245
|
+
with open(os.path.expanduser(yaml_path), 'r', encoding='utf-8') as f:
|
|
246
|
+
return f.read()
|
|
247
|
+
|
|
178
248
|
if is_recovery:
|
|
179
|
-
|
|
180
|
-
|
|
181
|
-
|
|
249
|
+
yaml_content = service['yaml_content']
|
|
250
|
+
# Backward compatibility for old service records that
|
|
251
|
+
# does not dump the yaml content to version database.
|
|
252
|
+
# TODO(tian): Remove this after 2 minor releases, i.e. 0.13.0.
|
|
253
|
+
if yaml_content is None:
|
|
254
|
+
yaml_content = _read_yaml_content(tmp_task_yaml)
|
|
182
255
|
else:
|
|
183
|
-
|
|
184
|
-
|
|
185
|
-
|
|
256
|
+
yaml_content = _read_yaml_content(tmp_task_yaml)
|
|
257
|
+
|
|
258
|
+
# Initialize database record for the service.
|
|
259
|
+
task = task_lib.Task.from_yaml_str(yaml_content)
|
|
260
|
+
# Already checked before submit to controller.
|
|
261
|
+
assert task.service is not None, task
|
|
262
|
+
service_spec = task.service
|
|
186
263
|
|
|
187
264
|
service_dir = os.path.expanduser(
|
|
188
265
|
serve_utils.generate_remote_service_dir_name(service_name))
|
|
189
|
-
task_yaml = serve_utils.generate_task_yaml_file_name(service_name, version)
|
|
190
266
|
|
|
191
267
|
if not is_recovery:
|
|
192
|
-
|
|
193
|
-
|
|
194
|
-
|
|
195
|
-
|
|
196
|
-
|
|
197
|
-
|
|
198
|
-
|
|
199
|
-
|
|
200
|
-
|
|
201
|
-
|
|
202
|
-
|
|
203
|
-
|
|
204
|
-
|
|
268
|
+
with filelock.FileLock(controller_utils.get_resources_lock_path()):
|
|
269
|
+
if not controller_utils.can_start_new_process(task.service.pool):
|
|
270
|
+
cleanup_storage(yaml_content)
|
|
271
|
+
with ux_utils.print_exception_no_traceback():
|
|
272
|
+
raise RuntimeError(
|
|
273
|
+
constants.MAX_NUMBER_OF_SERVICES_REACHED_ERROR)
|
|
274
|
+
success = serve_state.add_service(
|
|
275
|
+
service_name,
|
|
276
|
+
controller_job_id=job_id,
|
|
277
|
+
policy=service_spec.autoscaling_policy_str(),
|
|
278
|
+
requested_resources_str=backend_utils.get_task_resources_str(
|
|
279
|
+
task),
|
|
280
|
+
load_balancing_policy=service_spec.load_balancing_policy,
|
|
281
|
+
status=serve_state.ServiceStatus.CONTROLLER_INIT,
|
|
282
|
+
tls_encrypted=service_spec.tls_credential is not None,
|
|
283
|
+
pool=service_spec.pool,
|
|
284
|
+
controller_pid=os.getpid(),
|
|
285
|
+
entrypoint=entrypoint)
|
|
205
286
|
# Directly throw an error here. See sky/serve/api.py::up
|
|
206
287
|
# for more details.
|
|
207
288
|
if not success:
|
|
208
|
-
cleanup_storage(
|
|
289
|
+
cleanup_storage(yaml_content)
|
|
209
290
|
with ux_utils.print_exception_no_traceback():
|
|
210
291
|
raise ValueError(f'Service {service_name} already exists.')
|
|
211
292
|
|
|
212
293
|
# Create the service working directory.
|
|
213
294
|
os.makedirs(service_dir, exist_ok=True)
|
|
214
295
|
|
|
215
|
-
|
|
216
|
-
#
|
|
217
|
-
|
|
218
|
-
|
|
219
|
-
|
|
220
|
-
|
|
221
|
-
|
|
296
|
+
version = constants.INITIAL_VERSION
|
|
297
|
+
# Add initial version information to the service state.
|
|
298
|
+
serve_state.add_or_update_version(service_name, version, service_spec,
|
|
299
|
+
yaml_content)
|
|
300
|
+
else:
|
|
301
|
+
version = serve_state.get_latest_version(service_name)
|
|
302
|
+
if version is None:
|
|
303
|
+
raise ValueError(f'No version found for service {service_name}')
|
|
304
|
+
serve_state.update_service_controller_pid(service_name, os.getpid())
|
|
222
305
|
|
|
223
306
|
controller_process = None
|
|
224
307
|
load_balancer_process = None
|
|
@@ -249,7 +332,7 @@ def _start(service_name: str, tmp_task_yaml: str, job_id: int):
|
|
|
249
332
|
controller_host = _get_controller_host()
|
|
250
333
|
controller_process = multiprocessing.Process(
|
|
251
334
|
target=controller.run_controller,
|
|
252
|
-
args=(service_name, service_spec,
|
|
335
|
+
args=(service_name, service_spec, version, controller_host,
|
|
253
336
|
controller_port))
|
|
254
337
|
controller_process.start()
|
|
255
338
|
|
|
@@ -271,14 +354,18 @@ def _start(service_name: str, tmp_task_yaml: str, job_id: int):
|
|
|
271
354
|
# TODO(tian): Probably we could enable multiple ports specified in
|
|
272
355
|
# service spec and we could start multiple load balancers.
|
|
273
356
|
# After that, we will have a mapping from replica port to endpoint.
|
|
274
|
-
|
|
275
|
-
|
|
276
|
-
|
|
277
|
-
|
|
278
|
-
|
|
279
|
-
|
|
280
|
-
|
|
281
|
-
|
|
357
|
+
# NOTE(tian): We don't need the load balancer for pool.
|
|
358
|
+
# Skip the load balancer process for pool.
|
|
359
|
+
if not service_spec.pool:
|
|
360
|
+
load_balancer_process = multiprocessing.Process(
|
|
361
|
+
target=ux_utils.RedirectOutputForProcess(
|
|
362
|
+
load_balancer.run_load_balancer,
|
|
363
|
+
load_balancer_log_file).run,
|
|
364
|
+
args=(controller_addr, load_balancer_port,
|
|
365
|
+
service_spec.load_balancing_policy,
|
|
366
|
+
service_spec.tls_credential,
|
|
367
|
+
service_spec.target_qps_per_replica))
|
|
368
|
+
load_balancer_process.start()
|
|
282
369
|
|
|
283
370
|
if not is_recovery:
|
|
284
371
|
serve_state.set_service_load_balancer_port(
|
|
@@ -303,7 +390,19 @@ def _start(service_name: str, tmp_task_yaml: str, job_id: int):
|
|
|
303
390
|
for process in process_to_kill:
|
|
304
391
|
process.join()
|
|
305
392
|
|
|
306
|
-
|
|
393
|
+
# Catch any exception here to avoid it kill the service monitoring
|
|
394
|
+
# process. In which case, the service will not only fail to clean
|
|
395
|
+
# up, but also cannot be terminated in the future as no process
|
|
396
|
+
# will handle the user signal anymore. Instead, we catch any error
|
|
397
|
+
# and set it to FAILED_CLEANUP instead.
|
|
398
|
+
try:
|
|
399
|
+
failed = _cleanup(service_name, service_spec.pool)
|
|
400
|
+
except Exception as e: # pylint: disable=broad-except
|
|
401
|
+
logger.error(f'Failed to clean up service {service_name}: {e}')
|
|
402
|
+
with ux_utils.enable_traceback():
|
|
403
|
+
logger.error(f' Traceback: {traceback.format_exc()}')
|
|
404
|
+
failed = True
|
|
405
|
+
|
|
307
406
|
if failed:
|
|
308
407
|
serve_state.set_service_status_and_active_versions(
|
|
309
408
|
service_name, serve_state.ServiceStatus.FAILED_CLEANUP)
|
|
@@ -333,8 +432,12 @@ if __name__ == '__main__':
|
|
|
333
432
|
required=True,
|
|
334
433
|
type=int,
|
|
335
434
|
help='Job id for the service job.')
|
|
435
|
+
parser.add_argument('--entrypoint',
|
|
436
|
+
type=str,
|
|
437
|
+
help='Entrypoint to launch the service',
|
|
438
|
+
required=True)
|
|
336
439
|
args = parser.parse_args()
|
|
337
440
|
# We start process with 'spawn', because 'fork' could result in weird
|
|
338
441
|
# behaviors; 'spawn' is also cross-platform.
|
|
339
442
|
multiprocessing.set_start_method('spawn', force=True)
|
|
340
|
-
_start(args.service_name, args.task_yaml, args.job_id)
|
|
443
|
+
_start(args.service_name, args.task_yaml, args.job_id, args.entrypoint)
|