skypilot-nightly 1.0.0.dev20250509__py3-none-any.whl → 1.0.0.dev20251107__py3-none-any.whl
This diff shows the changes between two publicly released versions of the package, as they appear in their respective public registries. It is provided for informational purposes only.
Potentially problematic release: this version of skypilot-nightly might be problematic.
- sky/__init__.py +22 -6
- sky/adaptors/aws.py +25 -7
- sky/adaptors/common.py +24 -1
- sky/adaptors/coreweave.py +278 -0
- sky/adaptors/do.py +8 -2
- sky/adaptors/hyperbolic.py +8 -0
- sky/adaptors/kubernetes.py +149 -18
- sky/adaptors/nebius.py +170 -17
- sky/adaptors/primeintellect.py +1 -0
- sky/adaptors/runpod.py +68 -0
- sky/adaptors/seeweb.py +167 -0
- sky/adaptors/shadeform.py +89 -0
- sky/admin_policy.py +187 -4
- sky/authentication.py +179 -225
- sky/backends/__init__.py +4 -2
- sky/backends/backend.py +22 -9
- sky/backends/backend_utils.py +1299 -380
- sky/backends/cloud_vm_ray_backend.py +1715 -518
- sky/backends/docker_utils.py +1 -1
- sky/backends/local_docker_backend.py +11 -6
- sky/backends/wheel_utils.py +37 -9
- sky/{clouds/service_catalog → catalog}/__init__.py +21 -19
- sky/{clouds/service_catalog → catalog}/aws_catalog.py +27 -8
- sky/{clouds/service_catalog → catalog}/azure_catalog.py +10 -7
- sky/{clouds/service_catalog → catalog}/common.py +89 -48
- sky/{clouds/service_catalog → catalog}/cudo_catalog.py +8 -5
- sky/{clouds/service_catalog → catalog}/data_fetchers/analyze.py +1 -1
- sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_aws.py +30 -40
- sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_cudo.py +38 -38
- sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_gcp.py +42 -15
- sky/catalog/data_fetchers/fetch_hyperbolic.py +136 -0
- sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_lambda_cloud.py +1 -0
- sky/catalog/data_fetchers/fetch_nebius.py +335 -0
- sky/catalog/data_fetchers/fetch_runpod.py +698 -0
- sky/catalog/data_fetchers/fetch_seeweb.py +329 -0
- sky/catalog/data_fetchers/fetch_shadeform.py +142 -0
- sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_vast.py +1 -1
- sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_vsphere.py +1 -1
- sky/{clouds/service_catalog → catalog}/do_catalog.py +5 -2
- sky/{clouds/service_catalog → catalog}/fluidstack_catalog.py +6 -3
- sky/{clouds/service_catalog → catalog}/gcp_catalog.py +41 -15
- sky/catalog/hyperbolic_catalog.py +136 -0
- sky/{clouds/service_catalog → catalog}/ibm_catalog.py +9 -6
- sky/{clouds/service_catalog → catalog}/kubernetes_catalog.py +36 -24
- sky/{clouds/service_catalog → catalog}/lambda_catalog.py +9 -6
- sky/{clouds/service_catalog → catalog}/nebius_catalog.py +9 -7
- sky/{clouds/service_catalog → catalog}/oci_catalog.py +9 -6
- sky/{clouds/service_catalog → catalog}/paperspace_catalog.py +5 -2
- sky/catalog/primeintellect_catalog.py +95 -0
- sky/{clouds/service_catalog → catalog}/runpod_catalog.py +11 -4
- sky/{clouds/service_catalog → catalog}/scp_catalog.py +9 -6
- sky/catalog/seeweb_catalog.py +184 -0
- sky/catalog/shadeform_catalog.py +165 -0
- sky/catalog/ssh_catalog.py +167 -0
- sky/{clouds/service_catalog → catalog}/vast_catalog.py +6 -3
- sky/{clouds/service_catalog → catalog}/vsphere_catalog.py +5 -2
- sky/check.py +491 -203
- sky/cli.py +5 -6005
- sky/client/{cli.py → cli/command.py} +2477 -1885
- sky/client/cli/deprecation_utils.py +99 -0
- sky/client/cli/flags.py +359 -0
- sky/client/cli/table_utils.py +320 -0
- sky/client/common.py +70 -32
- sky/client/oauth.py +82 -0
- sky/client/sdk.py +1203 -297
- sky/client/sdk_async.py +833 -0
- sky/client/service_account_auth.py +47 -0
- sky/cloud_stores.py +73 -0
- sky/clouds/__init__.py +13 -0
- sky/clouds/aws.py +358 -93
- sky/clouds/azure.py +105 -83
- sky/clouds/cloud.py +127 -36
- sky/clouds/cudo.py +68 -50
- sky/clouds/do.py +66 -48
- sky/clouds/fluidstack.py +63 -44
- sky/clouds/gcp.py +339 -110
- sky/clouds/hyperbolic.py +293 -0
- sky/clouds/ibm.py +70 -49
- sky/clouds/kubernetes.py +563 -162
- sky/clouds/lambda_cloud.py +74 -54
- sky/clouds/nebius.py +206 -80
- sky/clouds/oci.py +88 -66
- sky/clouds/paperspace.py +61 -44
- sky/clouds/primeintellect.py +317 -0
- sky/clouds/runpod.py +164 -74
- sky/clouds/scp.py +89 -83
- sky/clouds/seeweb.py +466 -0
- sky/clouds/shadeform.py +400 -0
- sky/clouds/ssh.py +263 -0
- sky/clouds/utils/aws_utils.py +10 -4
- sky/clouds/utils/gcp_utils.py +87 -11
- sky/clouds/utils/oci_utils.py +38 -14
- sky/clouds/utils/scp_utils.py +177 -124
- sky/clouds/vast.py +99 -77
- sky/clouds/vsphere.py +51 -40
- sky/core.py +349 -139
- sky/dag.py +15 -0
- sky/dashboard/out/404.html +1 -1
- sky/dashboard/out/_next/static/chunks/1141-e6aa9ab418717c59.js +11 -0
- sky/dashboard/out/_next/static/chunks/1272-1ef0bf0237faccdb.js +1 -0
- sky/dashboard/out/_next/static/chunks/1871-74503c8e80fd253b.js +6 -0
- sky/dashboard/out/_next/static/chunks/2260-7703229c33c5ebd5.js +1 -0
- sky/dashboard/out/_next/static/chunks/2350.fab69e61bac57b23.js +1 -0
- sky/dashboard/out/_next/static/chunks/2369.fc20f0c2c8ed9fe7.js +15 -0
- sky/dashboard/out/_next/static/chunks/2755.fff53c4a3fcae910.js +26 -0
- sky/dashboard/out/_next/static/chunks/3294.72362fa129305b19.js +1 -0
- sky/dashboard/out/_next/static/chunks/3785.ad6adaa2a0fa9768.js +1 -0
- sky/dashboard/out/_next/static/chunks/3850-ff4a9a69d978632b.js +1 -0
- sky/dashboard/out/_next/static/chunks/3937.210053269f121201.js +1 -0
- sky/dashboard/out/_next/static/chunks/4725.a830b5c9e7867c92.js +1 -0
- sky/dashboard/out/_next/static/chunks/4937.a2baa2df5572a276.js +15 -0
- sky/dashboard/out/_next/static/chunks/5739-d67458fcb1386c92.js +8 -0
- sky/dashboard/out/_next/static/chunks/6130-2be46d70a38f1e82.js +1 -0
- sky/dashboard/out/_next/static/chunks/616-3d59f75e2ccf9321.js +39 -0
- sky/dashboard/out/_next/static/chunks/6212-7bd06f60ba693125.js +13 -0
- sky/dashboard/out/_next/static/chunks/6601-06114c982db410b6.js +1 -0
- sky/dashboard/out/_next/static/chunks/6856-ef8ba11f96d8c4a3.js +1 -0
- sky/dashboard/out/_next/static/chunks/6989-01359c57e018caa4.js +1 -0
- sky/dashboard/out/_next/static/chunks/6990-32b6e2d3822301fa.js +1 -0
- sky/dashboard/out/_next/static/chunks/7359-c8d04e06886000b3.js +30 -0
- sky/dashboard/out/_next/static/chunks/7411-b15471acd2cba716.js +41 -0
- sky/dashboard/out/_next/static/chunks/7615-3301e838e5f25772.js +1 -0
- sky/dashboard/out/_next/static/chunks/8640.5b9475a2d18c5416.js +16 -0
- sky/dashboard/out/_next/static/chunks/8969-1e4613c651bf4051.js +1 -0
- sky/dashboard/out/_next/static/chunks/9025.fa408f3242e9028d.js +6 -0
- sky/dashboard/out/_next/static/chunks/9353-cff34f7e773b2e2b.js +1 -0
- sky/dashboard/out/_next/static/chunks/9360.7310982cf5a0dc79.js +31 -0
- sky/dashboard/out/_next/static/chunks/9847.3aaca6bb33455140.js +30 -0
- sky/dashboard/out/_next/static/chunks/fd9d1056-86323a29a8f7e46a.js +1 -0
- sky/dashboard/out/_next/static/chunks/framework-cf60a09ccd051a10.js +33 -0
- sky/dashboard/out/_next/static/chunks/main-app-587214043926b3cc.js +1 -0
- sky/dashboard/out/_next/static/chunks/main-f15ccb73239a3bf1.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/_app-bde01e4a2beec258.js +34 -0
- sky/dashboard/out/_next/static/chunks/pages/_error-c66a4e8afc46f17b.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-c736ead69c2d86ec.js +16 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-a37d2063af475a1c.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters-d44859594e6f8064.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/config-dfb9bf07b13045f4.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/index-444f1804401f04ea.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/infra/[context]-c0b5935149902e6f.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/infra-aed0ea19df7cf961.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-5796e8d6aea291a0.js +16 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs/pools/[pool]-6edeb7d06032adfc.js +21 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs-479dde13399cf270.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/users-5ab3b907622cf0fe.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/volumes-b84b948ff357c43e.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/workspace/new-3f88a1c7e86a3f86.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/workspaces/[name]-c5a3eeee1c218af1.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/workspaces-22b23febb3e89ce1.js +1 -0
- sky/dashboard/out/_next/static/chunks/webpack-2679be77fc08a2f8.js +1 -0
- sky/dashboard/out/_next/static/css/0748ce22df867032.css +3 -0
- sky/dashboard/out/_next/static/zB0ed6ge_W1MDszVHhijS/_buildManifest.js +1 -0
- sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
- sky/dashboard/out/clusters/[cluster].html +1 -1
- sky/dashboard/out/clusters.html +1 -1
- sky/dashboard/out/config.html +1 -0
- sky/dashboard/out/index.html +1 -1
- sky/dashboard/out/infra/[context].html +1 -0
- sky/dashboard/out/infra.html +1 -0
- sky/dashboard/out/jobs/[job].html +1 -1
- sky/dashboard/out/jobs/pools/[pool].html +1 -0
- sky/dashboard/out/jobs.html +1 -1
- sky/dashboard/out/users.html +1 -0
- sky/dashboard/out/volumes.html +1 -0
- sky/dashboard/out/workspace/new.html +1 -0
- sky/dashboard/out/workspaces/[name].html +1 -0
- sky/dashboard/out/workspaces.html +1 -0
- sky/data/data_utils.py +137 -1
- sky/data/mounting_utils.py +269 -84
- sky/data/storage.py +1451 -1807
- sky/data/storage_utils.py +43 -57
- sky/exceptions.py +132 -2
- sky/execution.py +206 -63
- sky/global_user_state.py +2374 -586
- sky/jobs/__init__.py +5 -0
- sky/jobs/client/sdk.py +242 -65
- sky/jobs/client/sdk_async.py +143 -0
- sky/jobs/constants.py +9 -8
- sky/jobs/controller.py +839 -277
- sky/jobs/file_content_utils.py +80 -0
- sky/jobs/log_gc.py +201 -0
- sky/jobs/recovery_strategy.py +398 -152
- sky/jobs/scheduler.py +315 -189
- sky/jobs/server/core.py +829 -255
- sky/jobs/server/server.py +156 -115
- sky/jobs/server/utils.py +136 -0
- sky/jobs/state.py +2092 -701
- sky/jobs/utils.py +1242 -160
- sky/logs/__init__.py +21 -0
- sky/logs/agent.py +108 -0
- sky/logs/aws.py +243 -0
- sky/logs/gcp.py +91 -0
- sky/metrics/__init__.py +0 -0
- sky/metrics/utils.py +443 -0
- sky/models.py +78 -1
- sky/optimizer.py +164 -70
- sky/provision/__init__.py +90 -4
- sky/provision/aws/config.py +147 -26
- sky/provision/aws/instance.py +135 -50
- sky/provision/azure/instance.py +10 -5
- sky/provision/common.py +13 -1
- sky/provision/cudo/cudo_machine_type.py +1 -1
- sky/provision/cudo/cudo_utils.py +14 -8
- sky/provision/cudo/cudo_wrapper.py +72 -71
- sky/provision/cudo/instance.py +10 -6
- sky/provision/do/instance.py +10 -6
- sky/provision/do/utils.py +4 -3
- sky/provision/docker_utils.py +114 -23
- sky/provision/fluidstack/instance.py +13 -8
- sky/provision/gcp/__init__.py +1 -0
- sky/provision/gcp/config.py +301 -19
- sky/provision/gcp/constants.py +218 -0
- sky/provision/gcp/instance.py +36 -8
- sky/provision/gcp/instance_utils.py +18 -4
- sky/provision/gcp/volume_utils.py +247 -0
- sky/provision/hyperbolic/__init__.py +12 -0
- sky/provision/hyperbolic/config.py +10 -0
- sky/provision/hyperbolic/instance.py +437 -0
- sky/provision/hyperbolic/utils.py +373 -0
- sky/provision/instance_setup.py +93 -14
- sky/provision/kubernetes/__init__.py +5 -0
- sky/provision/kubernetes/config.py +9 -52
- sky/provision/kubernetes/constants.py +17 -0
- sky/provision/kubernetes/instance.py +789 -247
- sky/provision/kubernetes/manifests/fusermount-server-daemonset.yaml +1 -2
- sky/provision/kubernetes/network.py +27 -17
- sky/provision/kubernetes/network_utils.py +40 -43
- sky/provision/kubernetes/utils.py +1192 -531
- sky/provision/kubernetes/volume.py +282 -0
- sky/provision/lambda_cloud/instance.py +22 -16
- sky/provision/nebius/constants.py +50 -0
- sky/provision/nebius/instance.py +19 -6
- sky/provision/nebius/utils.py +196 -91
- sky/provision/oci/instance.py +10 -5
- sky/provision/paperspace/instance.py +10 -7
- sky/provision/paperspace/utils.py +1 -1
- sky/provision/primeintellect/__init__.py +10 -0
- sky/provision/primeintellect/config.py +11 -0
- sky/provision/primeintellect/instance.py +454 -0
- sky/provision/primeintellect/utils.py +398 -0
- sky/provision/provisioner.py +110 -36
- sky/provision/runpod/__init__.py +5 -0
- sky/provision/runpod/instance.py +27 -6
- sky/provision/runpod/utils.py +51 -18
- sky/provision/runpod/volume.py +180 -0
- sky/provision/scp/__init__.py +15 -0
- sky/provision/scp/config.py +93 -0
- sky/provision/scp/instance.py +531 -0
- sky/provision/seeweb/__init__.py +11 -0
- sky/provision/seeweb/config.py +13 -0
- sky/provision/seeweb/instance.py +807 -0
- sky/provision/shadeform/__init__.py +11 -0
- sky/provision/shadeform/config.py +12 -0
- sky/provision/shadeform/instance.py +351 -0
- sky/provision/shadeform/shadeform_utils.py +83 -0
- sky/provision/ssh/__init__.py +18 -0
- sky/provision/vast/instance.py +13 -8
- sky/provision/vast/utils.py +10 -7
- sky/provision/vsphere/common/vim_utils.py +1 -2
- sky/provision/vsphere/instance.py +15 -10
- sky/provision/vsphere/vsphere_utils.py +9 -19
- sky/py.typed +0 -0
- sky/resources.py +844 -118
- sky/schemas/__init__.py +0 -0
- sky/schemas/api/__init__.py +0 -0
- sky/schemas/api/responses.py +225 -0
- sky/schemas/db/README +4 -0
- sky/schemas/db/env.py +90 -0
- sky/schemas/db/global_user_state/001_initial_schema.py +124 -0
- sky/schemas/db/global_user_state/002_add_workspace_to_cluster_history.py +35 -0
- sky/schemas/db/global_user_state/003_fix_initial_revision.py +61 -0
- sky/schemas/db/global_user_state/004_is_managed.py +34 -0
- sky/schemas/db/global_user_state/005_cluster_event.py +32 -0
- sky/schemas/db/global_user_state/006_provision_log.py +41 -0
- sky/schemas/db/global_user_state/007_cluster_event_request_id.py +34 -0
- sky/schemas/db/global_user_state/008_skylet_ssh_tunnel_metadata.py +34 -0
- sky/schemas/db/global_user_state/009_last_activity_and_launched_at.py +89 -0
- sky/schemas/db/global_user_state/010_save_ssh_key.py +66 -0
- sky/schemas/db/script.py.mako +28 -0
- sky/schemas/db/serve_state/001_initial_schema.py +67 -0
- sky/schemas/db/skypilot_config/001_initial_schema.py +30 -0
- sky/schemas/db/spot_jobs/001_initial_schema.py +97 -0
- sky/schemas/db/spot_jobs/002_cluster_pool.py +42 -0
- sky/schemas/db/spot_jobs/003_pool_hash.py +34 -0
- sky/schemas/db/spot_jobs/004_job_file_contents.py +42 -0
- sky/schemas/db/spot_jobs/005_logs_gc.py +38 -0
- sky/schemas/generated/__init__.py +0 -0
- sky/schemas/generated/autostopv1_pb2.py +36 -0
- sky/schemas/generated/autostopv1_pb2.pyi +43 -0
- sky/schemas/generated/autostopv1_pb2_grpc.py +146 -0
- sky/schemas/generated/jobsv1_pb2.py +86 -0
- sky/schemas/generated/jobsv1_pb2.pyi +254 -0
- sky/schemas/generated/jobsv1_pb2_grpc.py +542 -0
- sky/schemas/generated/managed_jobsv1_pb2.py +74 -0
- sky/schemas/generated/managed_jobsv1_pb2.pyi +278 -0
- sky/schemas/generated/managed_jobsv1_pb2_grpc.py +278 -0
- sky/schemas/generated/servev1_pb2.py +58 -0
- sky/schemas/generated/servev1_pb2.pyi +115 -0
- sky/schemas/generated/servev1_pb2_grpc.py +322 -0
- sky/serve/autoscalers.py +357 -5
- sky/serve/client/impl.py +310 -0
- sky/serve/client/sdk.py +47 -139
- sky/serve/client/sdk_async.py +130 -0
- sky/serve/constants.py +10 -8
- sky/serve/controller.py +64 -19
- sky/serve/load_balancer.py +106 -60
- sky/serve/load_balancing_policies.py +115 -1
- sky/serve/replica_managers.py +273 -162
- sky/serve/serve_rpc_utils.py +179 -0
- sky/serve/serve_state.py +554 -251
- sky/serve/serve_utils.py +733 -220
- sky/serve/server/core.py +66 -711
- sky/serve/server/impl.py +1093 -0
- sky/serve/server/server.py +21 -18
- sky/serve/service.py +133 -48
- sky/serve/service_spec.py +135 -16
- sky/serve/spot_placer.py +3 -0
- sky/server/auth/__init__.py +0 -0
- sky/server/auth/authn.py +50 -0
- sky/server/auth/loopback.py +38 -0
- sky/server/auth/oauth2_proxy.py +200 -0
- sky/server/common.py +475 -181
- sky/server/config.py +81 -23
- sky/server/constants.py +44 -6
- sky/server/daemons.py +229 -0
- sky/server/html/token_page.html +185 -0
- sky/server/metrics.py +160 -0
- sky/server/requests/executor.py +528 -138
- sky/server/requests/payloads.py +351 -17
- sky/server/requests/preconditions.py +21 -17
- sky/server/requests/process.py +112 -29
- sky/server/requests/request_names.py +120 -0
- sky/server/requests/requests.py +817 -224
- sky/server/requests/serializers/decoders.py +82 -31
- sky/server/requests/serializers/encoders.py +140 -22
- sky/server/requests/threads.py +106 -0
- sky/server/rest.py +417 -0
- sky/server/server.py +1290 -284
- sky/server/state.py +20 -0
- sky/server/stream_utils.py +345 -57
- sky/server/uvicorn.py +217 -3
- sky/server/versions.py +270 -0
- sky/setup_files/MANIFEST.in +5 -0
- sky/setup_files/alembic.ini +156 -0
- sky/setup_files/dependencies.py +136 -31
- sky/setup_files/setup.py +44 -42
- sky/sky_logging.py +102 -5
- sky/skylet/attempt_skylet.py +1 -0
- sky/skylet/autostop_lib.py +129 -8
- sky/skylet/configs.py +27 -20
- sky/skylet/constants.py +171 -19
- sky/skylet/events.py +105 -21
- sky/skylet/job_lib.py +335 -104
- sky/skylet/log_lib.py +297 -18
- sky/skylet/log_lib.pyi +44 -1
- sky/skylet/ray_patches/__init__.py +17 -3
- sky/skylet/ray_patches/autoscaler.py.diff +18 -0
- sky/skylet/ray_patches/cli.py.diff +19 -0
- sky/skylet/ray_patches/command_runner.py.diff +17 -0
- sky/skylet/ray_patches/log_monitor.py.diff +20 -0
- sky/skylet/ray_patches/resource_demand_scheduler.py.diff +32 -0
- sky/skylet/ray_patches/updater.py.diff +18 -0
- sky/skylet/ray_patches/worker.py.diff +41 -0
- sky/skylet/services.py +564 -0
- sky/skylet/skylet.py +63 -4
- sky/skylet/subprocess_daemon.py +103 -29
- sky/skypilot_config.py +506 -99
- sky/ssh_node_pools/__init__.py +1 -0
- sky/ssh_node_pools/core.py +135 -0
- sky/ssh_node_pools/server.py +233 -0
- sky/task.py +621 -137
- sky/templates/aws-ray.yml.j2 +10 -3
- sky/templates/azure-ray.yml.j2 +1 -1
- sky/templates/do-ray.yml.j2 +1 -1
- sky/templates/gcp-ray.yml.j2 +57 -0
- sky/templates/hyperbolic-ray.yml.j2 +67 -0
- sky/templates/jobs-controller.yaml.j2 +27 -24
- sky/templates/kubernetes-loadbalancer.yml.j2 +2 -0
- sky/templates/kubernetes-ray.yml.j2 +607 -51
- sky/templates/lambda-ray.yml.j2 +1 -1
- sky/templates/nebius-ray.yml.j2 +33 -12
- sky/templates/paperspace-ray.yml.j2 +1 -1
- sky/templates/primeintellect-ray.yml.j2 +71 -0
- sky/templates/runpod-ray.yml.j2 +9 -1
- sky/templates/scp-ray.yml.j2 +3 -50
- sky/templates/seeweb-ray.yml.j2 +108 -0
- sky/templates/shadeform-ray.yml.j2 +72 -0
- sky/templates/sky-serve-controller.yaml.j2 +22 -2
- sky/templates/websocket_proxy.py +178 -18
- sky/usage/usage_lib.py +18 -11
- sky/users/__init__.py +0 -0
- sky/users/model.conf +15 -0
- sky/users/permission.py +387 -0
- sky/users/rbac.py +121 -0
- sky/users/server.py +720 -0
- sky/users/token_service.py +218 -0
- sky/utils/accelerator_registry.py +34 -5
- sky/utils/admin_policy_utils.py +84 -38
- sky/utils/annotations.py +16 -5
- sky/utils/asyncio_utils.py +78 -0
- sky/utils/auth_utils.py +153 -0
- sky/utils/benchmark_utils.py +60 -0
- sky/utils/cli_utils/status_utils.py +159 -86
- sky/utils/cluster_utils.py +31 -9
- sky/utils/command_runner.py +354 -68
- sky/utils/command_runner.pyi +93 -3
- sky/utils/common.py +35 -8
- sky/utils/common_utils.py +310 -87
- sky/utils/config_utils.py +87 -5
- sky/utils/context.py +402 -0
- sky/utils/context_utils.py +222 -0
- sky/utils/controller_utils.py +264 -89
- sky/utils/dag_utils.py +31 -12
- sky/utils/db/__init__.py +0 -0
- sky/utils/db/db_utils.py +470 -0
- sky/utils/db/migration_utils.py +133 -0
- sky/utils/directory_utils.py +12 -0
- sky/utils/env_options.py +13 -0
- sky/utils/git.py +567 -0
- sky/utils/git_clone.sh +460 -0
- sky/utils/infra_utils.py +195 -0
- sky/utils/kubernetes/cleanup-tunnel.sh +62 -0
- sky/utils/kubernetes/config_map_utils.py +133 -0
- sky/utils/kubernetes/create_cluster.sh +13 -27
- sky/utils/kubernetes/delete_cluster.sh +10 -7
- sky/utils/kubernetes/deploy_remote_cluster.py +1299 -0
- sky/utils/kubernetes/exec_kubeconfig_converter.py +22 -31
- sky/utils/kubernetes/generate_kind_config.py +6 -66
- sky/utils/kubernetes/generate_kubeconfig.sh +4 -1
- sky/utils/kubernetes/gpu_labeler.py +5 -5
- sky/utils/kubernetes/kubernetes_deploy_utils.py +354 -47
- sky/utils/kubernetes/ssh-tunnel.sh +379 -0
- sky/utils/kubernetes/ssh_utils.py +221 -0
- sky/utils/kubernetes_enums.py +8 -15
- sky/utils/lock_events.py +94 -0
- sky/utils/locks.py +368 -0
- sky/utils/log_utils.py +300 -6
- sky/utils/perf_utils.py +22 -0
- sky/utils/resource_checker.py +298 -0
- sky/utils/resources_utils.py +249 -32
- sky/utils/rich_utils.py +213 -37
- sky/utils/schemas.py +905 -147
- sky/utils/serialize_utils.py +16 -0
- sky/utils/status_lib.py +10 -0
- sky/utils/subprocess_utils.py +38 -15
- sky/utils/tempstore.py +70 -0
- sky/utils/timeline.py +24 -52
- sky/utils/ux_utils.py +84 -15
- sky/utils/validator.py +11 -1
- sky/utils/volume.py +86 -0
- sky/utils/yaml_utils.py +111 -0
- sky/volumes/__init__.py +13 -0
- sky/volumes/client/__init__.py +0 -0
- sky/volumes/client/sdk.py +149 -0
- sky/volumes/server/__init__.py +0 -0
- sky/volumes/server/core.py +258 -0
- sky/volumes/server/server.py +122 -0
- sky/volumes/volume.py +212 -0
- sky/workspaces/__init__.py +0 -0
- sky/workspaces/core.py +655 -0
- sky/workspaces/server.py +101 -0
- sky/workspaces/utils.py +56 -0
- skypilot_nightly-1.0.0.dev20251107.dist-info/METADATA +675 -0
- skypilot_nightly-1.0.0.dev20251107.dist-info/RECORD +594 -0
- {skypilot_nightly-1.0.0.dev20250509.dist-info → skypilot_nightly-1.0.0.dev20251107.dist-info}/WHEEL +1 -1
- sky/benchmark/benchmark_state.py +0 -256
- sky/benchmark/benchmark_utils.py +0 -641
- sky/clouds/service_catalog/constants.py +0 -7
- sky/dashboard/out/_next/static/LksQgChY5izXjokL3LcEu/_buildManifest.js +0 -1
- sky/dashboard/out/_next/static/chunks/236-f49500b82ad5392d.js +0 -6
- sky/dashboard/out/_next/static/chunks/312-c3c8845990db8ffc.js +0 -15
- sky/dashboard/out/_next/static/chunks/37-0a572fe0dbb89c4d.js +0 -6
- sky/dashboard/out/_next/static/chunks/678-206dddca808e6d16.js +0 -59
- sky/dashboard/out/_next/static/chunks/845-0f8017370869e269.js +0 -1
- sky/dashboard/out/_next/static/chunks/979-7bf73a4c7cea0f5c.js +0 -1
- sky/dashboard/out/_next/static/chunks/fd9d1056-2821b0f0cabcd8bd.js +0 -1
- sky/dashboard/out/_next/static/chunks/framework-87d061ee6ed71b28.js +0 -33
- sky/dashboard/out/_next/static/chunks/main-app-241eb28595532291.js +0 -1
- sky/dashboard/out/_next/static/chunks/main-e0e2335212e72357.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/_app-e6b013bc3f77ad60.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/_error-1be831200e60c5c0.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-e15db85d0ea1fbe1.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-f383db7389368ea7.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/clusters-a93b93e10b8b074e.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/index-f9f039532ca8cbc4.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-03f279c6741fb48b.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/jobs-a75029b67aab6a2e.js +0 -1
- sky/dashboard/out/_next/static/chunks/webpack-830f59b8404e96b8.js +0 -1
- sky/dashboard/out/_next/static/css/c6933bbb2ce7f4dd.css +0 -3
- sky/jobs/dashboard/dashboard.py +0 -223
- sky/jobs/dashboard/static/favicon.ico +0 -0
- sky/jobs/dashboard/templates/index.html +0 -831
- sky/jobs/server/dashboard_utils.py +0 -69
- sky/skylet/providers/scp/__init__.py +0 -2
- sky/skylet/providers/scp/config.py +0 -149
- sky/skylet/providers/scp/node_provider.py +0 -578
- sky/templates/kubernetes-ssh-jump.yml.j2 +0 -94
- sky/utils/db_utils.py +0 -100
- sky/utils/kubernetes/deploy_remote_cluster.sh +0 -308
- sky/utils/kubernetes/ssh_jump_lifecycle_manager.py +0 -191
- skypilot_nightly-1.0.0.dev20250509.dist-info/METADATA +0 -361
- skypilot_nightly-1.0.0.dev20250509.dist-info/RECORD +0 -396
- /sky/{clouds/service_catalog → catalog}/config.py +0 -0
- /sky/{benchmark → catalog/data_fetchers}/__init__.py +0 -0
- /sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_azure.py +0 -0
- /sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_fluidstack.py +0 -0
- /sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_ibm.py +0 -0
- /sky/{clouds/service_catalog/data_fetchers → client/cli}/__init__.py +0 -0
- /sky/dashboard/out/_next/static/{LksQgChY5izXjokL3LcEu → zB0ed6ge_W1MDszVHhijS}/_ssgManifest.js +0 -0
- {skypilot_nightly-1.0.0.dev20250509.dist-info → skypilot_nightly-1.0.0.dev20251107.dist-info}/entry_points.txt +0 -0
- {skypilot_nightly-1.0.0.dev20250509.dist-info → skypilot_nightly-1.0.0.dev20251107.dist-info}/licenses/LICENSE +0 -0
- {skypilot_nightly-1.0.0.dev20250509.dist-info → skypilot_nightly-1.0.0.dev20251107.dist-info}/top_level.txt +0 -0
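One structural change dominates the file list above: the catalog package moved from `sky/clouds/service_catalog` to `sky/catalog` (all the `sky/{clouds/service_catalog → catalog}` rename entries). A minimal sketch of the import-path migration, assuming the moved modules kept their names (which the rename entries support); whether every symbol re-exports identically is an assumption, not verified here:

# Old import path (1.0.0.dev20250509), kept only for comparison:
# from sky.clouds.service_catalog import common as catalog_common

# New import path (1.0.0.dev20251107), per the renames listed above.
from sky.catalog import common as catalog_common

Downstream code that imported `sky.clouds.service_catalog.*` would need the corresponding one-line path update; the `sky/clouds/service_catalog/constants.py +0 -7` deletion suggests no compatibility shim was left behind.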
sky/serve/serve_utils.py
CHANGED
|
@@ -2,6 +2,7 @@
|
|
|
2
2
|
import base64
|
|
3
3
|
import collections
|
|
4
4
|
import dataclasses
|
|
5
|
+
import datetime
|
|
5
6
|
import enum
|
|
6
7
|
import os
|
|
7
8
|
import pathlib
|
|
@@ -11,9 +12,10 @@ import shlex
|
|
|
11
12
|
import shutil
|
|
12
13
|
import threading
|
|
13
14
|
import time
|
|
15
|
+
import traceback
|
|
14
16
|
import typing
|
|
15
|
-
from typing import (Any, Callable, DefaultDict, Dict, Generic, Iterator,
|
|
16
|
-
Optional, TextIO, Type, TypeVar, Union)
|
|
17
|
+
from typing import (Any, Callable, DefaultDict, Deque, Dict, Generic, Iterator,
|
|
18
|
+
List, Optional, TextIO, Type, TypeVar, Union)
|
|
17
19
|
import uuid
|
|
18
20
|
|
|
19
21
|
import colorama
|
|
@@ -22,19 +24,25 @@ import filelock
|
|
|
22
24
|
from sky import backends
|
|
23
25
|
from sky import exceptions
|
|
24
26
|
from sky import global_user_state
|
|
27
|
+
from sky import sky_logging
|
|
28
|
+
from sky import skypilot_config
|
|
25
29
|
from sky.adaptors import common as adaptors_common
|
|
30
|
+
from sky.jobs import state as managed_job_state
|
|
26
31
|
from sky.serve import constants
|
|
27
32
|
from sky.serve import serve_state
|
|
28
33
|
from sky.serve import spot_placer
|
|
29
34
|
from sky.skylet import constants as skylet_constants
|
|
30
35
|
from sky.skylet import job_lib
|
|
31
36
|
from sky.utils import annotations
|
|
37
|
+
from sky.utils import command_runner
|
|
32
38
|
from sky.utils import common_utils
|
|
39
|
+
from sky.utils import controller_utils
|
|
33
40
|
from sky.utils import log_utils
|
|
34
41
|
from sky.utils import message_utils
|
|
35
42
|
from sky.utils import resources_utils
|
|
36
43
|
from sky.utils import status_lib
|
|
37
44
|
from sky.utils import ux_utils
|
|
45
|
+
from sky.utils import yaml_utils
|
|
38
46
|
|
|
39
47
|
if typing.TYPE_CHECKING:
|
|
40
48
|
import fastapi
|
|
@@ -47,23 +55,19 @@ else:
|
|
|
47
55
|
psutil = adaptors_common.LazyImport('psutil')
|
|
48
56
|
requests = adaptors_common.LazyImport('requests')
|
|
49
57
|
|
|
50
|
-
|
|
51
|
-
@annotations.lru_cache(scope='request')
|
|
52
|
-
def get_num_service_threshold():
|
|
53
|
-
"""Get number of services threshold, calculating it only when needed."""
|
|
54
|
-
system_memory_gb = psutil.virtual_memory().total // (1024**3)
|
|
55
|
-
return system_memory_gb // constants.CONTROLLER_MEMORY_USAGE_GB
|
|
56
|
-
|
|
58
|
+
logger = sky_logging.init_logger(__name__)
|
|
57
59
|
|
|
58
60
|
_CONTROLLER_URL = 'http://localhost:{CONTROLLER_PORT}'
|
|
59
61
|
|
|
60
|
-
# NOTE(dev): We assume log
|
|
61
|
-
#
|
|
62
|
-
#
|
|
63
|
-
|
|
64
|
-
|
|
65
|
-
fr'
|
|
66
|
-
|
|
62
|
+
# NOTE(dev): We assume log are print with the hint 'sky api logs -l'. Be careful
|
|
63
|
+
# when changing UX as this assumption is used to expand some log files while
|
|
64
|
+
# ignoring others.
|
|
65
|
+
_SKYPILOT_LOG_HINT = r'.*sky api logs -l'
|
|
66
|
+
_SKYPILOT_PROVISION_API_LOG_PATTERN = (
|
|
67
|
+
fr'{_SKYPILOT_LOG_HINT} (.*/provision\.log)')
|
|
68
|
+
# New hint pattern for provision logs
|
|
69
|
+
_SKYPILOT_PROVISION_LOG_CMD_PATTERN = r'.*sky logs --provision\s+(\S+)'
|
|
70
|
+
_SKYPILOT_LOG_PATTERN = fr'{_SKYPILOT_LOG_HINT} (.*\.log)'
|
|
67
71
|
|
|
68
72
|
# TODO(tian): Find all existing replica id and print here.
|
|
69
73
|
_FAILED_TO_FIND_REPLICA_MSG = (
|
|
@@ -244,7 +248,123 @@ class RequestTimestamp(RequestsAggregator):
|
|
|
244
248
|
return f'RequestTimestamp(timestamps={self.timestamps})'
|
|
245
249
|
|
|
246
250
|
|
|
247
|
-
def
|
|
251
|
+
def get_service_filelock_path(pool: str) -> str:
|
|
252
|
+
path = (pathlib.Path(constants.SKYSERVE_METADATA_DIR) / pool /
|
|
253
|
+
'pool.lock').expanduser().absolute()
|
|
254
|
+
path.parents[0].mkdir(parents=True, exist_ok=True)
|
|
255
|
+
return str(path)
|
|
256
|
+
|
|
257
|
+
|
|
258
|
+
def _validate_consolidation_mode_config(current_is_consolidation_mode: bool,
|
|
259
|
+
pool: bool) -> None:
|
|
260
|
+
"""Validate the consolidation mode config."""
|
|
261
|
+
# Check whether the consolidation mode config is changed.
|
|
262
|
+
controller = controller_utils.get_controller_for_pool(pool).value
|
|
263
|
+
if current_is_consolidation_mode:
|
|
264
|
+
controller_cn = controller.cluster_name
|
|
265
|
+
if global_user_state.cluster_with_name_exists(controller_cn):
|
|
266
|
+
with ux_utils.print_exception_no_traceback():
|
|
267
|
+
raise exceptions.InconsistentConsolidationModeError(
|
|
268
|
+
f'{colorama.Fore.RED}Consolidation mode for '
|
|
269
|
+
f'{controller.controller_type} is enabled, but the '
|
|
270
|
+
f'controller cluster {controller_cn} is still running. '
|
|
271
|
+
'Please terminate the controller cluster first.'
|
|
272
|
+
f'{colorama.Style.RESET_ALL}')
|
|
273
|
+
else:
|
|
274
|
+
noun = 'pool' if pool else 'service'
|
|
275
|
+
all_services = [
|
|
276
|
+
svc for svc in serve_state.get_services() if svc['pool'] == pool
|
|
277
|
+
]
|
|
278
|
+
if all_services:
|
|
279
|
+
with ux_utils.print_exception_no_traceback():
|
|
280
|
+
raise exceptions.InconsistentConsolidationModeError(
|
|
281
|
+
f'{colorama.Fore.RED}Consolidation mode for '
|
|
282
|
+
f'{controller.controller_type} is disabled, but there are '
|
|
283
|
+
f'still {len(all_services)} {noun}s running. Please '
|
|
284
|
+
f'terminate those {noun}s first.{colorama.Style.RESET_ALL}')
|
|
285
|
+
|
|
286
|
+
|
|
287
|
+
@annotations.lru_cache(scope='request', maxsize=1)
|
|
288
|
+
def is_consolidation_mode(pool: bool = False) -> bool:
|
|
289
|
+
# Use jobs config for pool consolidation mode.
|
|
290
|
+
controller = controller_utils.get_controller_for_pool(pool).value
|
|
291
|
+
consolidation_mode = skypilot_config.get_nested(
|
|
292
|
+
(controller.controller_type, 'controller', 'consolidation_mode'),
|
|
293
|
+
default_value=False)
|
|
294
|
+
# We should only do this check on API server, as the controller will not
|
|
295
|
+
# have related config and will always seemingly disabled for consolidation
|
|
296
|
+
# mode. Check #6611 for more details.
|
|
297
|
+
if (os.environ.get(skylet_constants.OVERRIDE_CONSOLIDATION_MODE) is not None
|
|
298
|
+
and controller.controller_type == 'jobs'):
|
|
299
|
+
# if we are in the job controller, we must always be in consolidation
|
|
300
|
+
# mode.
|
|
301
|
+
return True
|
|
302
|
+
if os.environ.get(skylet_constants.ENV_VAR_IS_SKYPILOT_SERVER) is not None:
|
|
303
|
+
_validate_consolidation_mode_config(consolidation_mode, pool)
|
|
304
|
+
return consolidation_mode
|
|
305
|
+
|
|
306
|
+
|
|
307
|
+
def ha_recovery_for_consolidation_mode(pool: bool):
|
|
308
|
+
"""Recovery logic for HA mode."""
|
|
309
|
+
# No setup recovery is needed in consolidation mode, as the API server
|
|
310
|
+
# already has all runtime installed. Directly start jobs recovery here.
|
|
311
|
+
# Refers to sky/templates/kubernetes-ray.yml.j2 for more details.
|
|
312
|
+
runner = command_runner.LocalProcessCommandRunner()
|
|
313
|
+
noun = 'pool' if pool else 'serve'
|
|
314
|
+
capnoun = noun.capitalize()
|
|
315
|
+
prefix = f'{noun}_'
|
|
316
|
+
with open(skylet_constants.HA_PERSISTENT_RECOVERY_LOG_PATH.format(prefix),
|
|
317
|
+
'w',
|
|
318
|
+
encoding='utf-8') as f:
|
|
319
|
+
start = time.time()
|
|
320
|
+
f.write(f'Starting HA recovery at {datetime.datetime.now()}\n')
|
|
321
|
+
for service_name in serve_state.get_glob_service_names(None):
|
|
322
|
+
svc = _get_service_status(service_name,
|
|
323
|
+
pool=pool,
|
|
324
|
+
with_replica_info=False)
|
|
325
|
+
if svc is None:
|
|
326
|
+
continue
|
|
327
|
+
controller_pid = svc['controller_pid']
|
|
328
|
+
if controller_pid is not None:
|
|
329
|
+
try:
|
|
330
|
+
if _controller_process_alive(controller_pid, service_name):
|
|
331
|
+
f.write(f'Controller pid {controller_pid} for '
|
|
332
|
+
f'{noun} {service_name} is still running. '
|
|
333
|
+
'Skipping recovery.\n')
|
|
334
|
+
continue
|
|
335
|
+
except Exception: # pylint: disable=broad-except
|
|
336
|
+
# _controller_process_alive may raise if psutil fails; we
|
|
337
|
+
# should not crash the recovery logic because of this.
|
|
338
|
+
f.write('Error checking controller pid '
|
|
339
|
+
f'{controller_pid} for {noun} {service_name}\n')
|
|
340
|
+
|
|
341
|
+
script = serve_state.get_ha_recovery_script(service_name)
|
|
342
|
+
if script is None:
|
|
343
|
+
f.write(f'{capnoun} {service_name}\'s recovery script does '
|
|
344
|
+
'not exist. Skipping recovery.\n')
|
|
345
|
+
continue
|
|
346
|
+
rc, out, err = runner.run(script, require_outputs=True)
|
|
347
|
+
if rc:
|
|
348
|
+
f.write(f'Recovery script returned {rc}. '
|
|
349
|
+
f'Output: {out}\nError: {err}\n')
|
|
350
|
+
f.write(f'{capnoun} {service_name} completed recovery at '
|
|
351
|
+
f'{datetime.datetime.now()}\n')
|
|
352
|
+
f.write(f'HA recovery completed at {datetime.datetime.now()}\n')
|
|
353
|
+
f.write(f'Total recovery time: {time.time() - start} seconds\n')
|
|
354
|
+
|
|
355
|
+
|
|
356
|
+
def _controller_process_alive(pid: int, service_name: str) -> bool:
|
|
357
|
+
"""Check if the controller process is alive."""
|
|
358
|
+
try:
|
|
359
|
+
process = psutil.Process(pid)
|
|
360
|
+
cmd_str = ' '.join(process.cmdline())
|
|
361
|
+
return process.is_running(
|
|
362
|
+
) and f'--service-name {service_name}' in cmd_str
|
|
363
|
+
except psutil.NoSuchProcess:
|
|
364
|
+
return False
|
|
365
|
+
|
|
366
|
+
|
|
367
|
+
def validate_service_task(task: 'sky.Task', pool: bool) -> None:
|
|
248
368
|
"""Validate the task for Sky Serve.
|
|
249
369
|
|
|
250
370
|
Args:
|
|
@@ -267,19 +387,43 @@ def validate_service_task(task: 'sky.Task') -> None:
|
|
|
267
387
|
'use `dynamic_ondemand_fallback` or set '
|
|
268
388
|
'base_ondemand_fallback_replicas.')
|
|
269
389
|
|
|
390
|
+
field_name = 'service' if not pool else 'pool'
|
|
270
391
|
if task.service is None:
|
|
271
392
|
with ux_utils.print_exception_no_traceback():
|
|
272
|
-
raise RuntimeError('
|
|
393
|
+
raise RuntimeError(f'{field_name.capitalize()} section not found.')
|
|
394
|
+
|
|
395
|
+
if pool != task.service.pool:
|
|
396
|
+
with ux_utils.print_exception_no_traceback():
|
|
397
|
+
raise ValueError(f'{field_name.capitalize()} section in the YAML '
|
|
398
|
+
f'file does not match the pool argument. '
|
|
399
|
+
f'To fix, add a valid `{field_name}` field.')
|
|
273
400
|
|
|
274
401
|
policy_description = ('on-demand'
|
|
275
402
|
if task.service.dynamic_ondemand_fallback else 'spot')
|
|
276
403
|
for resource in list(task.resources):
|
|
277
404
|
if resource.job_recovery is not None:
|
|
405
|
+
sys_name = 'SkyServe' if not pool else 'Cluster Pool'
|
|
278
406
|
with ux_utils.print_exception_no_traceback():
|
|
279
|
-
raise ValueError('job_recovery is disabled for
|
|
280
|
-
'
|
|
407
|
+
raise ValueError(f'job_recovery is disabled for {sys_name}. '
|
|
408
|
+
f'{sys_name} will replenish preempted spot '
|
|
281
409
|
f'with {policy_description} instances.')
|
|
282
410
|
|
|
411
|
+
if pool:
|
|
412
|
+
accelerators = set()
|
|
413
|
+
for resource in task.resources:
|
|
414
|
+
if resource.accelerators is not None:
|
|
415
|
+
if isinstance(resource.accelerators, str):
|
|
416
|
+
accelerators.add(resource.accelerators)
|
|
417
|
+
elif isinstance(resource.accelerators, dict):
|
|
418
|
+
accelerators.update(resource.accelerators.keys())
|
|
419
|
+
elif isinstance(resource.accelerators, list):
|
|
420
|
+
accelerators.update(resource.accelerators)
|
|
421
|
+
if len(accelerators) > 1:
|
|
422
|
+
with ux_utils.print_exception_no_traceback():
|
|
423
|
+
raise ValueError('Heterogeneous clusters are not supported for '
|
|
424
|
+
'cluster pools please specify one accelerator '
|
|
425
|
+
'for all workers.')
|
|
426
|
+
|
|
283
427
|
# Try to create a spot placer from the task yaml. Check if the task yaml
|
|
284
428
|
# is valid for spot placer.
|
|
285
429
|
spot_placer.SpotPlacer.from_task(task.service, task)
|
|
@@ -300,7 +444,7 @@ def validate_service_task(task: 'sky.Task') -> None:
|
|
|
300
444
|
raise ValueError(
|
|
301
445
|
'`spot_placer` is only supported for spot resources. '
|
|
302
446
|
'Please explicitly specify `use_spot: true` in resources.')
|
|
303
|
-
if task.service.ports is None:
|
|
447
|
+
if not pool and task.service.ports is None:
|
|
304
448
|
requested_ports = list(
|
|
305
449
|
resources_utils.port_ranges_to_set(requested_resources.ports))
|
|
306
450
|
if len(requested_ports) != 1:
|
|
@@ -320,10 +464,16 @@ def validate_service_task(task: 'sky.Task') -> None:
|
|
|
320
464
|
f'Got multiple ports: {service_port} and '
|
|
321
465
|
f'{replica_ingress_port} in different resources. '
|
|
322
466
|
'Please specify the same port instead.')
|
|
467
|
+
if pool:
|
|
468
|
+
if (task.service.ports is not None or
|
|
469
|
+
requested_resources.ports is not None):
|
|
470
|
+
with ux_utils.print_exception_no_traceback():
|
|
471
|
+
raise ValueError('Cannot specify ports in a cluster pool.')
|
|
323
472
|
|
|
324
473
|
|
|
325
|
-
def generate_service_name():
|
|
326
|
-
|
|
474
|
+
def generate_service_name(pool: bool = False):
|
|
475
|
+
noun = 'pool' if pool else 'service'
|
|
476
|
+
return f'sky-{noun}-{uuid.uuid4().hex[:4]}'
|
|
327
477
|
|
|
328
478
|
|
|
329
479
|
def generate_remote_service_dir_name(service_name: str) -> str:
|
|
@@ -390,6 +540,8 @@ def generate_remote_tls_certfile_name(service_name: str) -> str:
|
|
|
390
540
|
|
|
391
541
|
|
|
392
542
|
def generate_replica_cluster_name(service_name: str, replica_id: int) -> str:
|
|
543
|
+
# NOTE(dev): This format is used in sky/serve/service.py::_cleanup, for
|
|
544
|
+
# checking replica cluster existence. Be careful when changing it.
|
|
393
545
|
return f'{service_name}-{replica_id}'
|
|
394
546
|
|
|
395
547
|
|
|
@@ -425,26 +577,63 @@ def set_service_status_and_active_versions_from_replica(
|
|
|
425
577
|
active_versions=active_versions)
|
|
426
578
|
|
|
427
579
|
|
|
428
|
-
def update_service_status() -> None:
|
|
429
|
-
|
|
430
|
-
|
|
431
|
-
|
|
580
|
+
def update_service_status(pool: bool) -> None:
|
|
581
|
+
noun = 'pool' if pool else 'serve'
|
|
582
|
+
capnoun = noun.capitalize()
|
|
583
|
+
service_names = serve_state.get_glob_service_names(None)
|
|
584
|
+
for service_name in service_names:
|
|
585
|
+
record = _get_service_status(service_name,
|
|
586
|
+
pool=pool,
|
|
587
|
+
with_replica_info=False)
|
|
588
|
+
if record is None:
|
|
589
|
+
continue
|
|
590
|
+
service_status = record['status']
|
|
591
|
+
if service_status == serve_state.ServiceStatus.SHUTTING_DOWN:
|
|
432
592
|
# Skip services that is shutting down.
|
|
433
593
|
continue
|
|
434
|
-
|
|
435
|
-
|
|
436
|
-
|
|
437
|
-
|
|
438
|
-
|
|
439
|
-
|
|
440
|
-
|
|
594
|
+
|
|
595
|
+
logger.info(f'Update {noun} status for {service_name!r} '
|
|
596
|
+
f'with status {service_status}')
|
|
597
|
+
|
|
598
|
+
controller_pid = record['controller_pid']
|
|
599
|
+
if controller_pid is None:
|
|
600
|
+
logger.info(f'{capnoun} {service_name!r} controller pid is None. '
|
|
601
|
+
f'Unexpected status {service_status}. Set to failure.')
|
|
602
|
+
elif controller_pid < 0:
|
|
603
|
+
# Backwards compatibility: this service was submitted when ray was
|
|
604
|
+
# still used for controller process management. We set the
|
|
605
|
+
# value_to_replace_existing_entries to -1 to indicate historical
|
|
606
|
+
# services.
|
|
607
|
+
# TODO(tian): Remove before 0.13.0.
|
|
608
|
+
controller_job_id = record['controller_job_id']
|
|
609
|
+
assert controller_job_id is not None
|
|
610
|
+
controller_status = job_lib.get_status(controller_job_id)
|
|
611
|
+
if (controller_status is not None and
|
|
612
|
+
not controller_status.is_terminal()):
|
|
613
|
+
continue
|
|
614
|
+
logger.info(f'Updating {noun} {service_name!r} in old version. '
|
|
615
|
+
f'SkyPilot job status: {controller_status}. '
|
|
616
|
+
'Set to failure.')
|
|
617
|
+
else:
|
|
618
|
+
if _controller_process_alive(controller_pid, service_name):
|
|
619
|
+
# The controller is still running.
|
|
620
|
+
continue
|
|
621
|
+
logger.info(f'{capnoun} {service_name!r} controller pid '
|
|
622
|
+
f'{controller_pid} is not alive. Set to failure.')
|
|
623
|
+
|
|
624
|
+
# If controller job is not running, set it as controller failed.
|
|
625
|
+
serve_state.set_service_status_and_active_versions(
|
|
626
|
+
service_name, serve_state.ServiceStatus.CONTROLLER_FAILED)
|
|
441
627
|
|
|
442
628
|
|
|
443
|
-
def update_service_encoded(service_name: str, version: int, mode: str
|
|
444
|
-
|
|
629
|
+
def update_service_encoded(service_name: str, version: int, mode: str,
|
|
630
|
+
pool: bool) -> str:
|
|
631
|
+
noun = 'pool' if pool else 'service'
|
|
632
|
+
capnoun = noun.capitalize()
|
|
633
|
+
service_status = _get_service_status(service_name, pool=pool)
|
|
445
634
|
if service_status is None:
|
|
446
635
|
with ux_utils.print_exception_no_traceback():
|
|
447
|
-
raise ValueError(f'
|
|
636
|
+
raise ValueError(f'{capnoun} {service_name!r} does not exist.')
|
|
448
637
|
controller_port = service_status['controller_port']
|
|
449
638
|
resp = requests.post(
|
|
450
639
|
_CONTROLLER_URL.format(CONTROLLER_PORT=controller_port) +
|
|
@@ -455,27 +644,30 @@ def update_service_encoded(service_name: str, version: int, mode: str) -> str:
|
|
|
455
644
|
})
|
|
456
645
|
if resp.status_code == 404:
|
|
457
646
|
with ux_utils.print_exception_no_traceback():
|
|
647
|
+
# This only happens for services since pool is added after the
|
|
648
|
+
# update feature is introduced.
|
|
458
649
|
raise ValueError(
|
|
459
650
|
'The service is up-ed in an old version and does not '
|
|
460
651
|
'support update. Please `sky serve down` '
|
|
461
652
|
'it first and relaunch the service. ')
|
|
462
653
|
elif resp.status_code == 400:
|
|
463
654
|
with ux_utils.print_exception_no_traceback():
|
|
464
|
-
raise ValueError(f'Client error during
|
|
655
|
+
raise ValueError(f'Client error during {noun} update: {resp.text}')
|
|
465
656
|
elif resp.status_code == 500:
|
|
466
657
|
with ux_utils.print_exception_no_traceback():
|
|
467
658
|
raise RuntimeError(
|
|
468
|
-
f'Server error during
|
|
659
|
+
f'Server error during {noun} update: {resp.text}')
|
|
469
660
|
elif resp.status_code != 200:
|
|
470
661
|
with ux_utils.print_exception_no_traceback():
|
|
471
|
-
raise ValueError(f'Failed to update
|
|
662
|
+
raise ValueError(f'Failed to update {noun}: {resp.text}')
|
|
472
663
|
|
|
473
664
|
service_msg = resp.json()['message']
|
|
474
665
|
return message_utils.encode_payload(service_msg)
|
|
475
666
|
|
|
476
667
|
|
|
477
668
|
def terminate_replica(service_name: str, replica_id: int, purge: bool) -> str:
|
|
478
|
-
|
|
669
|
+
# TODO(tian): Currently pool does not support terminating replica.
|
|
670
|
+
service_status = _get_service_status(service_name, pool=False)
|
|
479
671
|
if service_status is None:
|
|
480
672
|
with ux_utils.print_exception_no_traceback():
|
|
481
673
|
raise ValueError(f'Service {service_name!r} does not exist.')
|
|
@@ -506,6 +698,7 @@ def terminate_replica(service_name: str, replica_id: int, purge: bool) -> str:
|
|
|
506
698
|
|
|
507
699
|
def _get_service_status(
|
|
508
700
|
service_name: str,
|
|
701
|
+
pool: bool,
|
|
509
702
|
with_replica_info: bool = True) -> Optional[Dict[str, Any]]:
|
|
510
703
|
"""Get the status dict of the service.
|
|
511
704
|
|
|
@@ -520,34 +713,96 @@ def _get_service_status(
|
|
|
520
713
|
record = serve_state.get_service_from_name(service_name)
|
|
521
714
|
if record is None:
|
|
522
715
|
return None
|
|
716
|
+
if record['pool'] != pool:
|
|
717
|
+
return None
|
|
718
|
+
|
|
719
|
+
record['pool_yaml'] = ''
|
|
720
|
+
if record['pool']:
|
|
721
|
+
latest_yaml_path = generate_task_yaml_file_name(service_name,
|
|
722
|
+
record['version'])
|
|
723
|
+
raw_yaml_config = yaml_utils.read_yaml(latest_yaml_path)
|
|
724
|
+
original_config = raw_yaml_config.get('_user_specified_yaml')
|
|
725
|
+
if original_config is None:
|
|
726
|
+
# Fall back to old display format.
|
|
727
|
+
original_config = raw_yaml_config
|
|
728
|
+
original_config.pop('run', None)
|
|
729
|
+
svc: Dict[str, Any] = original_config.pop('service')
|
|
730
|
+
if svc is not None:
|
|
731
|
+
svc.pop('pool', None) # Remove pool from service config
|
|
732
|
+
original_config['pool'] = svc # Add pool to root config
|
|
733
|
+
else:
|
|
734
|
+
original_config = yaml_utils.safe_load(original_config)
|
|
735
|
+
record['pool_yaml'] = yaml_utils.dump_yaml_str(original_config)
|
|
736
|
+
|
|
737
|
+
record['target_num_replicas'] = 0
|
|
738
|
+
try:
|
|
739
|
+
controller_port = record['controller_port']
|
|
740
|
+
resp = requests.get(
|
|
741
|
+
_CONTROLLER_URL.format(CONTROLLER_PORT=controller_port) +
|
|
742
|
+
'/autoscaler/info')
|
|
743
|
+
record['target_num_replicas'] = resp.json()['target_num_replicas']
|
|
744
|
+
except requests.exceptions.RequestException:
|
|
745
|
+
record['target_num_replicas'] = None
|
|
746
|
+
except Exception as e: # pylint: disable=broad-except
|
|
747
|
+
logger.error(f'Failed to get autoscaler info for {service_name}: '
|
|
748
|
+
f'{common_utils.format_exception(e)}\n'
|
|
749
|
+
f'Traceback: {traceback.format_exc()}')
|
|
750
|
+
|
|
523
751
|
if with_replica_info:
|
|
524
752
|
record['replica_info'] = [
|
|
525
|
-
info.to_info_dict(with_handle=True)
|
|
753
|
+
info.to_info_dict(with_handle=True, with_url=not pool)
|
|
526
754
|
for info in serve_state.get_replica_infos(service_name)
|
|
527
755
|
]
|
|
756
|
+
if pool:
|
|
757
|
+
for replica_info in record['replica_info']:
|
|
758
|
+
job_ids = managed_job_state.get_nonterminal_job_ids_by_pool(
|
|
759
|
+
service_name, replica_info['name'])
|
|
760
|
+
replica_info['used_by'] = job_ids[0] if job_ids else None
|
|
528
761
|
return record
|
|
529
762
|
|
|
530
763
|
|
|
531
|
-
def
|
|
764
|
+
def get_service_status_pickled(service_names: Optional[List[str]],
|
|
765
|
+
pool: bool) -> List[Dict[str, str]]:
|
|
532
766
|
service_statuses: List[Dict[str, str]] = []
|
|
533
767
|
if service_names is None:
|
|
534
768
|
# Get all service names
|
|
535
769
|
service_names = serve_state.get_glob_service_names(None)
|
|
536
770
|
for service_name in service_names:
|
|
537
|
-
service_status = _get_service_status(service_name)
|
|
771
|
+
service_status = _get_service_status(service_name, pool=pool)
|
|
538
772
|
if service_status is None:
|
|
539
773
|
continue
|
|
540
774
|
service_statuses.append({
|
|
541
775
|
k: base64.b64encode(pickle.dumps(v)).decode('utf-8')
|
|
542
776
|
for k, v in service_status.items()
|
|
543
777
|
})
|
|
778
|
+
return sorted(service_statuses, key=lambda x: x['name'])
|
|
779
|
+
|
|
780
|
+
|
|
781
|
+
# TODO (kyuds): remove when serve codegen is removed
|
|
782
|
+
def get_service_status_encoded(service_names: Optional[List[str]],
|
|
783
|
+
pool: bool) -> str:
|
|
544
784
|
# We have to use payload_type here to avoid the issue of
|
|
545
785
|
# message_utils.decode_payload() not being able to correctly decode the
|
|
546
786
|
# message with <sky-payload> tags.
|
|
787
|
+
service_statuses = get_service_status_pickled(service_names, pool)
|
|
547
788
|
return message_utils.encode_payload(service_statuses,
|
|
548
789
|
payload_type='service_status')
|
|
549
790
|
|
|
550
791
|
|
|
792
|
+
def unpickle_service_status(
|
|
793
|
+
payload: List[Dict[str, str]]) -> List[Dict[str, Any]]:
|
|
794
|
+
service_statuses: List[Dict[str, Any]] = []
|
|
795
|
+
for service_status in payload:
|
|
796
|
+
if not isinstance(service_status, dict):
|
|
797
|
+
raise ValueError(f'Invalid service status: {service_status}')
|
|
798
|
+
service_statuses.append({
|
|
799
|
+
k: pickle.loads(base64.b64decode(v))
|
|
800
|
+
for k, v in service_status.items()
|
|
801
|
+
})
|
|
802
|
+
return service_statuses
|
|
803
|
+
|
|
804
|
+
|
|
805
|
+
# TODO (kyuds): remove when serve codegen is removed
|
|
551
806
|
def load_service_status(payload: str) -> List[Dict[str, Any]]:
|
|
552
807
|
try:
|
|
553
808
|
service_statuses_encoded = message_utils.decode_payload(
|
|
@@ -559,26 +814,85 @@ def load_service_status(payload: str) -> List[Dict[str, Any]]:
|
|
|
559
814
|
service_statuses_encoded = message_utils.decode_payload(payload)
|
|
560
815
|
else:
|
|
561
816
|
raise
|
|
562
|
-
|
|
563
|
-
for service_status in service_statuses_encoded:
|
|
564
|
-
if not isinstance(service_status, dict):
|
|
565
|
-
raise ValueError(f'Invalid service status: {service_status}')
|
|
566
|
-
service_statuses.append({
|
|
567
|
-
k: pickle.loads(base64.b64decode(v))
|
|
568
|
-
for k, v in service_status.items()
|
|
569
|
-
})
|
|
570
|
-
return service_statuses
|
|
817
|
+
return unpickle_service_status(service_statuses_encoded)
|
|
571
818
|
|
|
572
819
|
|
|
820
|
+
# TODO (kyuds): remove when serve codegen is removed
|
|
573
821
|
def add_version_encoded(service_name: str) -> str:
|
|
574
822
|
new_version = serve_state.add_version(service_name)
|
|
575
823
|
return message_utils.encode_payload(new_version)
|
|
576
824
|
|
|
577
825
|
|
|
826
|
+
# TODO (kyuds): remove when serve codegen is removed
|
|
578
827
|
def load_version_string(payload: str) -> str:
|
|
579
828
|
return message_utils.decode_payload(payload)
|
|
580
829
|
|
|
581
830
|
|
|
831
|
+
def get_ready_replicas(
|
|
832
|
+
service_name: str) -> List['replica_managers.ReplicaInfo']:
|
|
833
|
+
logger.info(f'Get number of replicas for pool {service_name!r}')
|
|
834
|
+
return [
|
|
835
|
+
info for info in serve_state.get_replica_infos(service_name)
|
|
836
|
+
if info.status == serve_state.ReplicaStatus.READY
|
|
837
|
+
]
|
|
838
|
+
|
|
839
|
+
|
|
840
|
+
def get_next_cluster_name(service_name: str, job_id: int) -> Optional[str]:
|
|
841
|
+
"""Get the next available cluster name from idle replicas.
|
|
842
|
+
|
|
843
|
+
Args:
|
|
844
|
+
service_name: The name of the service.
|
|
845
|
+
job_id: Optional job ID to associate with the acquired cluster.
|
|
846
|
+
If None, a placeholder will be used.
|
|
847
|
+
|
|
848
|
+
Returns:
|
|
849
|
+
The cluster name if an idle replica is found, None otherwise.
|
|
850
|
+
"""
|
|
851
|
+
# Check if service exists
|
|
852
|
+
service_status = _get_service_status(service_name,
|
|
853
|
+
pool=True,
|
|
854
|
+
with_replica_info=False)
|
|
855
|
+
if service_status is None:
|
|
856
|
+
logger.error(f'Service {service_name!r} does not exist.')
|
|
857
|
+
return None
|
|
858
|
+
if not service_status['pool']:
|
|
859
|
+
logger.error(f'Service {service_name!r} is not a cluster pool.')
|
|
860
|
+
return None
|
|
861
|
+
with filelock.FileLock(get_service_filelock_path(service_name)):
|
|
862
|
+
logger.debug(f'Get next cluster name for pool {service_name!r}')
|
|
863
|
+
ready_replicas = get_ready_replicas(service_name)
|
|
864
|
+
idle_replicas: List['replica_managers.ReplicaInfo'] = []
|
|
865
|
+
for replica_info in ready_replicas:
|
|
866
|
+
jobs_on_replica = managed_job_state.get_nonterminal_job_ids_by_pool(
|
|
867
|
+
service_name, replica_info.cluster_name)
|
|
868
|
+
# TODO(tian): Make it resources aware. Currently we allow and only
|
|
869
|
+
# allow one job per replica. In the following PR, we should:
|
|
870
|
+
# i) When the replica is launched with `any_of` resources (
|
|
871
|
+
# replicas can have different resources), we should check if
|
|
872
|
+
# the resources that jobs require are available on the replica.
|
|
873
|
+
# e.g., if a job requires A100:1 on a {L4:1, A100:1} pool, it
|
|
874
|
+
# should only goes to replica with A100.
|
|
875
|
+
# ii) When a job only requires a subset of the resources on the
|
|
876
|
+
# replica, each replica should be able to handle multiple jobs
|
|
877
|
+
# at the same time. e.g., if a job requires A100:1 on a A100:8
|
|
878
|
+
# pool, it should be able to run 4 jobs at the same time.
|
|
879
|
+
if not jobs_on_replica:
|
|
880
|
+
idle_replicas.append(replica_info)
|
|
881
|
+
if not idle_replicas:
|
|
882
|
+
logger.info(f'No idle replicas found for pool {service_name!r}')
|
|
883
|
+
return None
|
|
884
|
+
|
|
885
|
+
# Select the first idle replica.
|
|
886
|
+
# TODO(tian): "Load balancing" policy.
|
|
887
|
+
replica_info = idle_replicas[0]
|
|
888
|
+        logger.info(f'Selected replica {replica_info.replica_id} with cluster '
+                    f'{replica_info.cluster_name!r} for job {job_id!r} in pool '
+                    f'{service_name!r}')
+        managed_job_state.set_current_cluster_name(job_id,
+                                                   replica_info.cluster_name)
+        return replica_info.cluster_name
+
+
 def _terminate_failed_services(
         service_name: str,
         service_status: Optional[serve_state.ServiceStatus]) -> Optional[str]:
@@ -598,8 +912,8 @@ def _terminate_failed_services(
     # replicas, so we don't need to try again here.
     for replica_info in serve_state.get_replica_infos(service_name):
         # TODO(tian): Refresh latest status of the cluster.
-        if global_user_state.
-                replica_info.cluster_name)
+        if global_user_state.cluster_with_name_exists(
+                replica_info.cluster_name):
             remaining_replica_clusters.append(f'{replica_info.cluster_name!r}')
         serve_state.remove_replica(service_name, replica_info.replica_id)
 
@@ -608,9 +922,11 @@ def _terminate_failed_services(
     shutil.rmtree(service_dir)
     serve_state.remove_service(service_name)
     serve_state.delete_all_versions(service_name)
+    serve_state.remove_ha_recovery_script(service_name)
 
     if not remaining_replica_clusters:
         return None
+    # TODO(tian): Try to terminate those replica clusters.
     remaining_identity = ', '.join(remaining_replica_clusters)
     return (f'{colorama.Fore.YELLOW}terminate service {service_name!r} with '
             f'failed status ({service_status}). This may indicate a resource '
@@ -618,17 +934,38 @@ def _terminate_failed_services(
             f'controller: {remaining_identity}{colorama.Style.RESET_ALL}')
 
 
-def terminate_services(service_names: Optional[List[str]], purge: bool
+def terminate_services(service_names: Optional[List[str]], purge: bool,
+                       pool: bool) -> str:
+    noun = 'pool' if pool else 'service'
+    capnoun = noun.capitalize()
     service_names = serve_state.get_glob_service_names(service_names)
     terminated_service_names: List[str] = []
     messages: List[str] = []
     for service_name in service_names:
         service_status = _get_service_status(service_name,
+                                             pool=pool,
                                              with_replica_info=False)
+        if service_status is None:
+            continue
         if (service_status is not None and service_status['status']
                 == serve_state.ServiceStatus.SHUTTING_DOWN):
             # Already scheduled to be terminated.
             continue
+        if pool:
+            nonterminal_job_ids = (
+                managed_job_state.get_nonterminal_job_ids_by_pool(service_name))
+            if nonterminal_job_ids:
+                nonterminal_job_ids_str = ','.join(
+                    str(job_id) for job_id in nonterminal_job_ids)
+                num_nonterminal_jobs = len(nonterminal_job_ids)
+                messages.append(
+                    f'{colorama.Fore.YELLOW}{capnoun} {service_name!r} has '
+                    f'{num_nonterminal_jobs} nonterminal jobs: '
+                    f'{nonterminal_job_ids_str}. To terminate the {noun}, '
+                    f'please run `sky jobs cancel --pool {service_name}` to '
+                    'cancel all jobs in the pool first.'
+                    f'{colorama.Style.RESET_ALL}')
+                continue
         # If the `services` and `version_specs` table are not aligned, it might
         # result in a None service status. In this case, the controller process
         # is not functioning as well and we should also use the
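
Note on the hunk above: terminating a pool is now refused while the pool still has nonterminal jobs. A minimal standalone sketch of that guard, with an illustrative stand-in mapping in place of SkyPilot's actual managed-job state API:

from typing import Dict, List

def guard_pool_termination(pool_name: str,
                           nonterminal_jobs: Dict[str, List[int]]) -> str:
    """Refuse to tear down a pool that still has running jobs."""
    job_ids = nonterminal_jobs.get(pool_name, [])  # hypothetical lookup
    if job_ids:
        ids = ','.join(str(j) for j in job_ids)
        return (f'Pool {pool_name!r} has {len(job_ids)} nonterminal jobs: '
                f'{ids}. Run `sky jobs cancel --pool {pool_name}` first.')
    return f'Pool {pool_name!r} is scheduled to be terminated.'
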
@@ -636,10 +973,11 @@ def terminate_services(service_names: Optional[List[str]], purge: bool) -> str:
         # This is a safeguard for a rare case, that is accidentally abort
         # between `serve_state.add_service` and
         # `serve_state.add_or_update_version` in service.py.
-
+        purge_cmd = (f'sky jobs pool down {service_name} --purge'
+                     if pool else f'sky serve down {service_name} --purge')
+        if (service_status['status']
                 in serve_state.ServiceStatus.failed_statuses()):
-            failed_status =
-            if service_status is not None else None)
+            failed_status = service_status['status']
             if purge:
                 message = _terminate_failed_services(service_name,
                                                      failed_status)
@@ -647,11 +985,10 @@ def terminate_services(service_names: Optional[List[str]], purge: bool) -> str:
                 messages.append(message)
             else:
                 messages.append(
-                    f'{colorama.Fore.YELLOW}
+                    f'{colorama.Fore.YELLOW}{capnoun} {service_name!r} is in '
                     f'failed status ({failed_status}). Skipping '
                     'its termination as it could lead to a resource leak. '
-                    f'(Use `
-                    'forcefully terminate the service.)'
+                    f'(Use `{purge_cmd}` to forcefully terminate the {noun}.)'
                     f'{colorama.Style.RESET_ALL}')
                 # Don't add to terminated_service_names since it's not
                 # actually terminated.
@@ -668,17 +1005,18 @@ def terminate_services(service_names: Optional[List[str]], purge: bool) -> str:
             f.flush()
         terminated_service_names.append(f'{service_name!r}')
     if not terminated_service_names:
-        messages.append('No
+        messages.append(f'No {noun} to terminate.')
     else:
-        identity_str = f'
+        identity_str = f'{capnoun} {terminated_service_names[0]} is'
         if len(terminated_service_names) > 1:
             terminated_service_names_str = ', '.join(terminated_service_names)
-            identity_str = f'
+            identity_str = f'{capnoun}s {terminated_service_names_str} are'
         messages.append(f'{identity_str} scheduled to be terminated.')
     return '\n'.join(messages)
 
 
-def wait_service_registration(service_name: str, job_id: int
+def wait_service_registration(service_name: str, job_id: int,
+                              pool: bool) -> str:
     """Util function to call at the end of `sky.serve.up()`.
 
     This function will:
@@ -691,49 +1029,67 @@ def wait_service_registration(service_name: str, job_id: int) -> str:
     Returns:
         Encoded load balancer port assigned to the service.
     """
+    # TODO (kyuds): when codegen is fully deprecated, return the lb port
+    # as an int directly instead of encoding it.
     start_time = time.time()
     setup_completed = False
+    noun = 'pool' if pool else 'service'
     while True:
-
-
-
-
-        if
-
-
-
-
-
-
-
-
-
+        # Only do this check for non-consolidation mode as consolidation mode
+        # has no setup process.
+        if not is_consolidation_mode(pool):
+            job_status = job_lib.get_status(job_id)
+            if job_status is None or job_status < job_lib.JobStatus.RUNNING:
+                # Wait for the controller process to finish setting up. It
+                # can be slow if a lot cloud dependencies are being installed.
+                if (time.time() - start_time >
+                        constants.CONTROLLER_SETUP_TIMEOUT_SECONDS):
+                    with ux_utils.print_exception_no_traceback():
+                        raise RuntimeError(
+                            f'Failed to start the controller process for '
+                            f'the {noun} {service_name!r} within '
+                            f'{constants.CONTROLLER_SETUP_TIMEOUT_SECONDS}'
+                            f' seconds.')
+                # No need to check the service status as the controller process
+                # is still setting up.
+                time.sleep(1)
+                continue
 
         if not setup_completed:
             setup_completed = True
             # Reset the start time to wait for the service to be registered.
             start_time = time.time()
 
-        record =
+        record = _get_service_status(service_name,
+                                     pool=pool,
+                                     with_replica_info=False)
         if record is not None:
             if job_id != record['controller_job_id']:
+                if pool:
+                    command_to_run = 'sky jobs pool apply --pool'
+                else:
+                    command_to_run = 'sky serve update'
                 with ux_utils.print_exception_no_traceback():
                     raise ValueError(
-                        f'The
-                        'Please specify a different name for your
-                        'To update an existing
-                        f'{service_name} <new-
+                        f'The {noun} {service_name!r} is already running. '
+                        f'Please specify a different name for your {noun}. '
+                        f'To update an existing {noun}, run: {command_to_run}'
+                        f' {service_name} <new-{noun}-yaml>')
             lb_port = record['load_balancer_port']
             if lb_port is not None:
                 return message_utils.encode_payload(lb_port)
-
-
-
-
-
+        else:
+            controller_log_path = os.path.expanduser(
+                generate_remote_controller_log_file_name(service_name))
+            if os.path.exists(controller_log_path):
+                with open(controller_log_path, 'r', encoding='utf-8') as f:
+                    log_content = f.read()
+                if (constants.MAX_NUMBER_OF_SERVICES_REACHED_ERROR
+                        in log_content):
+                    with ux_utils.print_exception_no_traceback():
+                        raise RuntimeError('Max number of services reached. '
+                                           'To spin up more services, please '
+                                           'tear down some existing services.')
         elapsed = time.time() - start_time
         if elapsed > constants.SERVICE_REGISTER_TIMEOUT_SECONDS:
             # Print the controller log to help user debug.
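
The registration wait above follows a two-phase polling pattern: first wait out controller setup under one timeout, then restart the clock and wait for the service record under another. A self-contained sketch of that pattern, with illustrative callables standing in for SkyPilot's job and state lookups:

import time
from typing import Callable, Optional

def wait_two_phase(setup_done: Callable[[], bool],
                   get_record: Callable[[], Optional[dict]],
                   setup_timeout: float = 300.0,
                   register_timeout: float = 60.0) -> dict:
    start = time.time()
    setup_completed = False
    while True:
        if not setup_completed:
            if not setup_done():
                if time.time() - start > setup_timeout:
                    raise RuntimeError('controller setup timed out')
                time.sleep(1)
                continue
            setup_completed = True
            start = time.time()  # restart the clock for registration
        record = get_record()
        if record is not None:
            return record
        if time.time() - start > register_timeout:
            raise RuntimeError('service registration timed out')
        time.sleep(1)
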
@@ -754,12 +1110,16 @@ def load_service_initialization_result(payload: str) -> int:
     return message_utils.decode_payload(payload)
 
 
-def
-
+def _check_service_status_healthy(service_name: str,
+                                  pool: bool) -> Optional[str]:
+    service_record = _get_service_status(service_name,
+                                         pool,
+                                         with_replica_info=False)
+    capnoun = 'Service' if not pool else 'Pool'
     if service_record is None:
-        return f'
+        return f'{capnoun} {service_name!r} does not exist.'
     if service_record['status'] == serve_state.ServiceStatus.CONTROLLER_INIT:
-        return (f'
+        return (f'{capnoun} {service_name!r} is still initializing its '
                 'controller. Please try again later.')
     return None
 
@@ -782,6 +1142,73 @@ def get_latest_version_with_min_replicas(
     return active_versions[-1] if active_versions else None
 
 
+def _process_line(line: str,
+                  cluster_name: str,
+                  stop_on_eof: bool = False) -> Iterator[str]:
+    # The line might be directing users to view logs, like
+    # `✓ Cluster launched: new-http. View logs at: *.log`
+    # We should tail the detailed logs for user.
+    def cluster_is_up() -> bool:
+        status = global_user_state.get_status_from_cluster_name(cluster_name)
+        return status == status_lib.ClusterStatus.UP
+
+    provision_api_log_prompt = re.match(_SKYPILOT_PROVISION_API_LOG_PATTERN,
+                                        line)
+    provision_log_cmd_prompt = re.match(_SKYPILOT_PROVISION_LOG_CMD_PATTERN,
+                                        line)
+    log_prompt = re.match(_SKYPILOT_LOG_PATTERN, line)
+
+    def _stream_provision_path(p: pathlib.Path) -> Iterator[str]:
+        try:
+            with open(p, 'r', newline='', encoding='utf-8') as f:
+                # Exit if >10s without new content to avoid hanging when INIT
+                yield from log_utils.follow_logs(f,
+                                                 should_stop=cluster_is_up,
+                                                 stop_on_eof=stop_on_eof,
+                                                 idle_timeout_seconds=10)
+        except FileNotFoundError:
+            # Fall back cleanly if the hinted path doesn't exist
+            yield line
+            yield (f'{colorama.Fore.YELLOW}{colorama.Style.BRIGHT}'
+                   f'Try to expand log file {p} but not found. Skipping...'
+                   f'{colorama.Style.RESET_ALL}')
+            return
+
+    if provision_api_log_prompt is not None:
+        rel_path = provision_api_log_prompt.group(1)
+        nested_log_path = pathlib.Path(
+            skylet_constants.SKY_LOGS_DIRECTORY).expanduser().joinpath(
+                rel_path).resolve()
+        yield from _stream_provision_path(nested_log_path)
+        return
+
+    if provision_log_cmd_prompt is not None:
+        # Resolve provision log via cluster table first, then history.
+        log_path_str = global_user_state.get_cluster_provision_log_path(
+            cluster_name)
+        if not log_path_str:
+            log_path_str = (
+                global_user_state.get_cluster_history_provision_log_path(
+                    cluster_name))
+        if not log_path_str:
+            yield line
+            return
+        yield from _stream_provision_path(
+            pathlib.Path(log_path_str).expanduser().resolve())
+        return
+
+    if log_prompt is not None:
+        # Now we skip other logs (file sync logs) since we lack
+        # utility to determine when these log files are finished
+        # writing.
+        # TODO(tian): We should not skip these logs since there are
+        # small chance that error will happen in file sync. Need to
+        # find a better way to do this.
+        return
+
+    yield line
+
+
 def _follow_logs_with_provision_expanding(
     file: TextIO,
     cluster_name: str,
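
The new _process_line is essentially a regex dispatch: a launcher log line that merely points at another log file gets expanded by splicing that file's contents into the stream. A minimal sketch of the idea (the hint pattern below is illustrative, not one of the _SKYPILOT_* patterns above):

import re
from pathlib import Path
from typing import Iterator

HINT = re.compile(r'View logs at: (\S+\.log)')

def expand_line(line: str) -> Iterator[str]:
    match = HINT.search(line)
    if match is None:
        yield line  # ordinary line: pass through unchanged
        return
    path = Path(match.group(1)).expanduser()
    try:
        with open(path, 'r', encoding='utf-8') as f:
            yield from f  # splice the referenced log inline
    except FileNotFoundError:
        yield line  # fall back to the original hint line
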
@@ -804,51 +1231,8 @@ def _follow_logs_with_provision_expanding(
         Log lines, including expanded content from referenced provision logs.
     """
 
-    def cluster_is_up() -> bool:
-        cluster_record = global_user_state.get_cluster_from_name(cluster_name)
-        if cluster_record is None:
-            return False
-        return cluster_record['status'] == status_lib.ClusterStatus.UP
-
     def process_line(line: str) -> Iterator[str]:
-
-        # `✓ Cluster launched: new-http. View logs at: *.log`
-        # We should tail the detailed logs for user.
-        provision_log_prompt = re.match(_SKYPILOT_PROVISION_LOG_PATTERN, line)
-        log_prompt = re.match(_SKYPILOT_LOG_PATTERN, line)
-
-        if provision_log_prompt is not None:
-            nested_log_path = os.path.expanduser(provision_log_prompt.group(1))
-
-            try:
-                with open(nested_log_path, 'r', newline='',
-                          encoding='utf-8') as f:
-                    # We still exit if more than 10 seconds without new content
-                    # to avoid any internal bug that causes the launch to fail
-                    # while cluster status remains INIT.
-                    yield from log_utils.follow_logs(f,
-                                                     should_stop=cluster_is_up,
-                                                     stop_on_eof=stop_on_eof,
-                                                     idle_timeout_seconds=10)
-            except FileNotFoundError:
-                yield line
-
-                yield (f'{colorama.Fore.YELLOW}{colorama.Style.BRIGHT}'
-                       f'Try to expand log file {nested_log_path} but not '
-                       f'found. Skipping...{colorama.Style.RESET_ALL}')
-                pass
-            return
-
-        if log_prompt is not None:
-            # Now we skip other logs (file sync logs) since we lack
-            # utility to determine when these log files are finished
-            # writing.
-            # TODO(tian): We should not skip these logs since there are
-            # small chance that error will happen in file sync. Need to
-            # find a better way to do this.
-            return
-
-        yield line
+        yield from _process_line(line, cluster_name, stop_on_eof=stop_on_eof)
 
     return log_utils.follow_logs(file,
                                  should_stop=should_stop,
@@ -857,24 +1241,59 @@ def _follow_logs_with_provision_expanding(
                                  idle_timeout_seconds=idle_timeout_seconds)
 
 
-def
-
-
+def _capped_follow_logs_with_provision_expanding(
+    log_list: List[str],
+    cluster_name: str,
+    *,
+    line_cap: int = 100,
+) -> Iterator[str]:
+    """Follows logs and expands any provision.log references found.
+
+    Args:
+        log_list: List of Log Lines to read from.
+        cluster_name: Name of the cluster being launched.
+        line_cap: Number of last lines to return
+
+    Yields:
+        Log lines, including expanded content from referenced provision logs.
+    """
+    all_lines: Deque[str] = collections.deque(maxlen=line_cap)
+
+    for line in log_list:
+        for processed in _process_line(line=line,
+                                       cluster_name=cluster_name,
+                                       stop_on_eof=False):
+            all_lines.append(processed)
+
+    yield from all_lines
+
+
+def stream_replica_logs(service_name: str, replica_id: int, follow: bool,
+                        tail: Optional[int], pool: bool) -> str:
+    msg = _check_service_status_healthy(service_name, pool=pool)
     if msg is not None:
         return msg
+    repnoun = 'worker' if pool else 'replica'
+    caprepnoun = repnoun.capitalize()
     print(f'{colorama.Fore.YELLOW}Start streaming logs for launching process '
-          f'of
-
+          f'of {repnoun} {replica_id}.{colorama.Style.RESET_ALL}')
     log_file_name = generate_replica_log_file_name(service_name, replica_id)
     if os.path.exists(log_file_name):
-
-
+        if tail is not None:
+            lines = common_utils.read_last_n_lines(log_file_name, tail)
+            for line in lines:
+                if not line.endswith('\n'):
+                    line += '\n'
+                print(line, end='', flush=True)
+        else:
+            with open(log_file_name, 'r', encoding='utf-8') as f:
+                print(f.read(), flush=True)
         return ''
 
     launch_log_file_name = generate_replica_launch_log_file_name(
         service_name, replica_id)
     if not os.path.exists(launch_log_file_name):
-        return (f'{colorama.Fore.RED}
+        return (f'{colorama.Fore.RED}{caprepnoun} {replica_id} doesn\'t exist.'
                 f'{colorama.Style.RESET_ALL}')
 
     replica_cluster_name = generate_replica_cluster_name(
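
The capped variant above leans on collections.deque(maxlen=...) so that expanded output never holds more than line_cap lines in memory; older lines are evicted automatically. A tiny demonstration of that primitive:

import collections

def last_n(lines, n):
    buf = collections.deque(maxlen=n)  # older entries are evicted on append
    for line in lines:
        buf.append(line)
    return list(buf)

assert last_n(['a', 'b', 'c', 'd'], 2) == ['c', 'd']
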
@@ -891,42 +1310,89 @@ def stream_replica_logs(service_name: str, replica_id: int,
 
     replica_provisioned = (
         lambda: _get_replica_status() != serve_state.ReplicaStatus.PROVISIONING)
-
-
-
-
-
-
-
-
+
+    # Handle launch logs based on number parameter
+    final_lines_to_print = []
+    if tail is not None:
+        static_lines = common_utils.read_last_n_lines(launch_log_file_name,
+                                                      tail)
+        lines = list(
+            _capped_follow_logs_with_provision_expanding(
+                log_list=static_lines,
+                cluster_name=replica_cluster_name,
+                line_cap=tail,
+            ))
+        final_lines_to_print += lines
+    else:
+        with open(launch_log_file_name, 'r', newline='', encoding='utf-8') as f:
+            for line in _follow_logs_with_provision_expanding(
+                    f,
+                    replica_cluster_name,
+                    should_stop=replica_provisioned,
+                    stop_on_eof=not follow,
+            ):
+                print(line, end='', flush=True)
 
     if (not follow and
             _get_replica_status() == serve_state.ReplicaStatus.PROVISIONING):
         # Early exit if not following the logs.
+        if tail is not None:
+            for line in final_lines_to_print:
+                if not line.endswith('\n'):
+                    line += '\n'
+                print(line, end='', flush=True)
         return ''
 
     backend = backends.CloudVmRayBackend()
     handle = global_user_state.get_handle_from_cluster_name(
         replica_cluster_name)
     if handle is None:
+        if tail is not None:
+            for line in final_lines_to_print:
+                if not line.endswith('\n'):
+                    line += '\n'
+                print(line, end='', flush=True)
         return _FAILED_TO_FIND_REPLICA_MSG.format(replica_id=replica_id)
     assert isinstance(handle, backends.CloudVmRayResourceHandle), handle
 
     # Notify user here to make sure user won't think the log is finished.
     print(f'{colorama.Fore.YELLOW}Start streaming logs for task job '
-          f'of
+          f'of {repnoun} {replica_id}...{colorama.Style.RESET_ALL}')
 
     # Always tail the latest logs, which represent user setup & run.
-
-
-
-
+    if tail is None:
+        returncode = backend.tail_logs(handle, job_id=None, follow=follow)
+        if returncode != 0:
+            return (f'{colorama.Fore.RED}Failed to stream logs for {repnoun} '
+                    f'{replica_id}.{colorama.Style.RESET_ALL}')
+    elif not follow and tail > 0:
+        final = backend.tail_logs(handle,
+                                  job_id=None,
+                                  follow=follow,
+                                  tail=tail,
+                                  stream_logs=False,
+                                  require_outputs=True,
+                                  process_stream=True)
+        if isinstance(final, int) or (final[0] != 0 and final[0] != 101):
+            if tail is not None:
+                for line in final_lines_to_print:
+                    if not line.endswith('\n'):
+                        line += '\n'
+                    print(line, end='', flush=True)
+            return (f'{colorama.Fore.RED}Failed to stream logs for replica '
+                    f'{replica_id}.{colorama.Style.RESET_ALL}')
+        final_lines_to_print += final[1].splitlines()
+        for line in final_lines_to_print[-tail:]:
+            if not line.endswith('\n'):
+                line += '\n'
+            print(line, end='', flush=True)
     return ''
 
 
 def stream_serve_process_logs(service_name: str, stream_controller: bool,
-                              follow: bool
-
+                              follow: bool, tail: Optional[int],
+                              pool: bool) -> str:
+    msg = _check_service_status_healthy(service_name, pool)
     if msg is not None:
         return msg
     if stream_controller:
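
Several of the tail branches above repeat the same normalize-and-print loop, since read_last_n_lines may return lines without a trailing newline. A small helper capturing that pattern (illustrative only; the diff keeps the loop inline):

from typing import Iterable

def print_tail(lines: Iterable[str]) -> None:
    for line in lines:
        if not line.endswith('\n'):
            line += '\n'  # keep one line per printed row
        print(line, end='', flush=True)
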
@@ -935,19 +1401,31 @@ def stream_serve_process_logs(service_name: str, stream_controller: bool,
     log_file = generate_remote_load_balancer_log_file_name(service_name)
 
     def _service_is_terminal() -> bool:
-        record =
+        record = _get_service_status(service_name,
+                                     pool,
+                                     with_replica_info=False)
         if record is None:
             return True
         return record['status'] in serve_state.ServiceStatus.failed_statuses()
 
-
-
-
-
-
-
-        ):
+    if tail is not None:
+        lines = common_utils.read_last_n_lines(os.path.expanduser(log_file),
+                                               tail)
+        for line in lines:
+            if not line.endswith('\n'):
+                line += '\n'
             print(line, end='', flush=True)
+    else:
+        with open(os.path.expanduser(log_file),
+                  'r',
+                  newline='',
+                  encoding='utf-8') as f:
+            for line in log_utils.follow_logs(
+                    f,
+                    should_stop=_service_is_terminal,
+                    stop_on_eof=not follow,
+            ):
+                print(line, end='', flush=True)
     return ''
 
 
@@ -965,18 +1443,25 @@ def _get_replicas(service_record: Dict[str, Any]) -> str:
     return f'{ready_replica_num}/{total_replica_num}'
 
 
-def format_service_table(service_records: List[Dict[str, Any]],
-
+def format_service_table(service_records: List[Dict[str, Any]], show_all: bool,
+                         pool: bool) -> str:
+    noun = 'pool' if pool else 'service'
     if not service_records:
-        return 'No existing
+        return f'No existing {noun}s.'
 
     service_columns = [
-        'NAME', 'VERSION', 'UPTIME', 'STATUS',
+        'NAME', 'VERSION', 'UPTIME', 'STATUS',
+        'REPLICAS' if not pool else 'WORKERS'
     ]
+    if not pool:
+        service_columns.append('ENDPOINT')
     if show_all:
         service_columns.extend([
             'AUTOSCALING_POLICY', 'LOAD_BALANCING_POLICY', 'REQUESTED_RESOURCES'
         ])
+        if pool:
+            # Remove the load balancing policy column for pools.
+            service_columns.pop(-2)
     service_table = log_utils.create_table(service_columns)
 
     replica_infos: List[Dict[str, Any]] = []
@@ -1007,37 +1492,44 @@ def format_service_table(service_records: List[Dict[str, Any]],
             uptime,
             status_str,
             replicas,
-            endpoint,
         ]
+        if not pool:
+            service_values.append(endpoint)
         if show_all:
             service_values.extend(
                 [policy, load_balancing_policy, requested_resources_str])
+            if pool:
+                service_values.pop(-2)
         service_table.add_row(service_values)
 
-    replica_table = _format_replica_table(replica_infos, show_all)
+    replica_table = _format_replica_table(replica_infos, show_all, pool)
+    replica_noun = 'Pool Workers' if pool else 'Service Replicas'
     return (f'{service_table}\n'
             f'\n{colorama.Fore.CYAN}{colorama.Style.BRIGHT}'
-            f'
+            f'{replica_noun}{colorama.Style.RESET_ALL}\n'
             f'{replica_table}')
 
 
-def _format_replica_table(replica_records: List[Dict[str, Any]],
-
+def _format_replica_table(replica_records: List[Dict[str, Any]], show_all: bool,
+                          pool: bool) -> str:
+    noun = 'worker' if pool else 'replica'
     if not replica_records:
-        return 'No existing
+        return f'No existing {noun}s.'
 
     replica_columns = [
-        'SERVICE_NAME', 'ID', 'VERSION', 'ENDPOINT',
-        '
+        'POOL_NAME' if pool else 'SERVICE_NAME', 'ID', 'VERSION', 'ENDPOINT',
+        'LAUNCHED', 'INFRA', 'RESOURCES', 'STATUS'
     ]
-    if
-        replica_columns.append('
+    if pool:
+        replica_columns.append('USED_BY')
+        # Remove the endpoint column for pool workers.
+        replica_columns.pop(3)
     replica_table = log_utils.create_table(replica_columns)
 
     truncate_hint = ''
     if not show_all:
         if len(replica_records) > _REPLICA_TRUNC_NUM:
-            truncate_hint = '\n... (use --all to show all
+            truncate_hint = f'\n... (use --all to show all {noun}s)'
         replica_records = replica_records[:_REPLICA_TRUNC_NUM]
 
     for record in replica_records:
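
The column juggling above is positional: pools append a USED_BY column and drop ENDPOINT at index 3, and the same append/pop is mirrored on each row so values stay aligned with headers. A standalone sketch:

def build_columns(pool: bool):
    cols = ['POOL_NAME' if pool else 'SERVICE_NAME', 'ID', 'VERSION',
            'ENDPOINT', 'LAUNCHED', 'INFRA', 'RESOURCES', 'STATUS']
    if pool:
        cols.append('USED_BY')
        cols.pop(3)  # pool workers expose no endpoint
    return cols

assert 'ENDPOINT' not in build_columns(pool=True)
assert build_columns(pool=True)[-1] == 'USED_BY'
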
@@ -1047,21 +1539,26 @@ def _format_replica_table(replica_records: List[Dict[str, Any]],
         version = (record['version'] if 'version' in record else '-')
         replica_endpoint = endpoint if endpoint else '-'
         launched_at = log_utils.readable_time_duration(record['launched_at'])
+        infra = '-'
         resources_str = '-'
         replica_status = record['status']
         status_str = replica_status.colored_str()
-
-
+        used_by = record.get('used_by', None)
+        used_by_str = str(used_by) if used_by is not None else '-'
 
         replica_handle: Optional['backends.CloudVmRayResourceHandle'] = record[
             'handle']
         if replica_handle is not None:
-
-
-
-
-
-
+            infra = replica_handle.launched_resources.infra.formatted_str()
+            simplified = not show_all
+            resources_str_simple, resources_str_full = (
+                resources_utils.get_readable_resources_repr(
+                    replica_handle, simplified_only=simplified))
+            if simplified:
+                resources_str = resources_str_simple
+            else:
+                assert resources_str_full is not None
+                resources_str = resources_str_full
 
         replica_values = [
             service_name,
@@ -1069,18 +1566,20 @@ def _format_replica_table(replica_records: List[Dict[str, Any]],
             version,
             replica_endpoint,
             launched_at,
+            infra,
             resources_str,
             status_str,
-            region,
         ]
-        if
-            replica_values.append(
+        if pool:
+            replica_values.append(used_by_str)
+            replica_values.pop(3)
         replica_table.add_row(replica_values)
 
     return f'{replica_table}{truncate_hint}'
 
 
 # =========================== CodeGen for Sky Serve ===========================
+# TODO (kyuds): deprecate and remove serve codegen entirely.
 
 
 # TODO(tian): Use REST API instead of SSH in the future. This codegen pattern
@@ -1099,13 +1598,16 @@ class ServeCodeGen:
         'from sky.serve import serve_state',
         'from sky.serve import serve_utils',
         'from sky.serve import constants',
+        'serve_version = constants.SERVE_VERSION',
     ]
 
     @classmethod
-    def get_service_status(cls, service_names: Optional[List[str]]
+    def get_service_status(cls, service_names: Optional[List[str]],
+                           pool: bool) -> str:
         code = [
-            f'
-            '
+            f'kwargs={{}} if serve_version < 3 else {{"pool": {pool}}}',
+            f'msg = serve_utils.get_service_status_encoded({service_names!r}, '
+            '**kwargs)', 'print(msg, end="", flush=True)'
         ]
         return cls._build(code)
 
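
All of the codegen changes in this class use one backward-compatibility idiom: the generated snippet first reads the remote SERVE_VERSION and only forwards the new `pool` keyword when the controller is new enough to accept it. A sketch of the idiom in isolation (hypothetical helper, not part of the diff):

def versioned_kwargs(peer_version: int, min_version: int, **new_args):
    """Forward newly added kwargs only when the peer supports them."""
    return {} if peer_version < min_version else new_args

# e.g. an old controller (serve_version < 3) never sees `pool`:
# serve_utils.terminate_services(names, purge=True,
#                                **versioned_kwargs(serve_version, 3,
#                                                   pool=False))
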
@@ -1118,11 +1620,12 @@ class ServeCodeGen:
         return cls._build(code)
 
     @classmethod
-    def terminate_services(cls, service_names: Optional[List[str]],
-
+    def terminate_services(cls, service_names: Optional[List[str]], purge: bool,
+                           pool: bool) -> str:
         code = [
+            f'kwargs={{}} if serve_version < 3 else {{"pool": {pool}}}',
             f'msg = serve_utils.terminate_services({service_names!r}, '
-            f'purge={purge})', 'print(msg, end="", flush=True)'
+            f'purge={purge}, **kwargs)', 'print(msg, end="", flush=True)'
         ]
         return cls._build(code)
 
@@ -1139,29 +1642,48 @@ class ServeCodeGen:
         return cls._build(code)
 
     @classmethod
-    def wait_service_registration(cls, service_name: str, job_id: int
+    def wait_service_registration(cls, service_name: str, job_id: int,
+                                  pool: bool) -> str:
         code = [
+            f'kwargs={{}} if serve_version < 4 else {{"pool": {pool}}}',
             'msg = serve_utils.wait_service_registration('
-            f'{service_name!r}, {job_id}
+            f'{service_name!r}, {job_id}, **kwargs)',
+            'print(msg, end="", flush=True)'
         ]
         return cls._build(code)
 
     @classmethod
     def stream_replica_logs(cls, service_name: str, replica_id: int,
-                            follow: bool
+                            follow: bool, tail: Optional[int],
+                            pool: bool) -> str:
         code = [
+            f'kwargs={{}} if serve_version < 5 else {{"pool": {pool}}}',
             'msg = serve_utils.stream_replica_logs('
-            f'{service_name!r}, {replica_id!r}, follow={follow}
-            'print(msg, flush=True)'
+            f'{service_name!r}, {replica_id!r}, follow={follow}, tail={tail}, '
+            '**kwargs)', 'print(msg, flush=True)'
         ]
         return cls._build(code)
 
     @classmethod
     def stream_serve_process_logs(cls, service_name: str,
-                                  stream_controller: bool, follow: bool
+                                  stream_controller: bool, follow: bool,
+                                  tail: Optional[int], pool: bool) -> str:
         code = [
+            f'kwargs={{}} if serve_version < 5 else {{"pool": {pool}}}',
             f'msg = serve_utils.stream_serve_process_logs({service_name!r}, '
-            f'{stream_controller}, follow={follow}
+            f'{stream_controller}, follow={follow}, tail={tail}, **kwargs)',
+            'print(msg, flush=True)'
+        ]
+        return cls._build(code)
+
+    @classmethod
+    def update_service(cls, service_name: str, version: int, mode: str,
+                       pool: bool) -> str:
+        code = [
+            f'kwargs={{}} if serve_version < 3 else {{"pool": {pool}}}',
+            f'msg = serve_utils.update_service_encoded({service_name!r}, '
+            f'{version}, mode={mode!r}, **kwargs)',
+            'print(msg, end="", flush=True)',
         ]
         return cls._build(code)
 
@@ -1175,12 +1697,3 @@ class ServeCodeGen:
             f'"{common_utils.get_user_hash()}"; '
             f'{skylet_constants.SKY_PYTHON_CMD} '
             f'-u -c {shlex.quote(generated_code)}')
-
-    @classmethod
-    def update_service(cls, service_name: str, version: int, mode: str) -> str:
-        code = [
-            f'msg = serve_utils.update_service_encoded({service_name!r}, '
-            f'{version}, mode={mode!r})',
-            'print(msg, end="", flush=True)',
-        ]
-        return cls._build(code)