skypilot-nightly 1.0.0.dev20250502__py3-none-any.whl → 1.0.0.dev20251203__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sky/__init__.py +22 -6
- sky/adaptors/aws.py +81 -16
- sky/adaptors/common.py +25 -2
- sky/adaptors/coreweave.py +278 -0
- sky/adaptors/do.py +8 -2
- sky/adaptors/gcp.py +11 -0
- sky/adaptors/hyperbolic.py +8 -0
- sky/adaptors/ibm.py +5 -2
- sky/adaptors/kubernetes.py +149 -18
- sky/adaptors/nebius.py +173 -30
- sky/adaptors/primeintellect.py +1 -0
- sky/adaptors/runpod.py +68 -0
- sky/adaptors/seeweb.py +183 -0
- sky/adaptors/shadeform.py +89 -0
- sky/admin_policy.py +187 -4
- sky/authentication.py +179 -225
- sky/backends/__init__.py +4 -2
- sky/backends/backend.py +22 -9
- sky/backends/backend_utils.py +1323 -397
- sky/backends/cloud_vm_ray_backend.py +1749 -1029
- sky/backends/docker_utils.py +1 -1
- sky/backends/local_docker_backend.py +11 -6
- sky/backends/task_codegen.py +633 -0
- sky/backends/wheel_utils.py +55 -9
- sky/{clouds/service_catalog → catalog}/__init__.py +21 -19
- sky/{clouds/service_catalog → catalog}/aws_catalog.py +27 -8
- sky/{clouds/service_catalog → catalog}/azure_catalog.py +10 -7
- sky/{clouds/service_catalog → catalog}/common.py +90 -49
- sky/{clouds/service_catalog → catalog}/cudo_catalog.py +8 -5
- sky/{clouds/service_catalog → catalog}/data_fetchers/analyze.py +1 -1
- sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_aws.py +116 -80
- sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_cudo.py +38 -38
- sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_gcp.py +70 -16
- sky/catalog/data_fetchers/fetch_hyperbolic.py +136 -0
- sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_lambda_cloud.py +1 -0
- sky/catalog/data_fetchers/fetch_nebius.py +338 -0
- sky/catalog/data_fetchers/fetch_runpod.py +698 -0
- sky/catalog/data_fetchers/fetch_seeweb.py +329 -0
- sky/catalog/data_fetchers/fetch_shadeform.py +142 -0
- sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_vast.py +1 -1
- sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_vsphere.py +1 -1
- sky/{clouds/service_catalog → catalog}/do_catalog.py +5 -2
- sky/{clouds/service_catalog → catalog}/fluidstack_catalog.py +6 -3
- sky/{clouds/service_catalog → catalog}/gcp_catalog.py +41 -15
- sky/catalog/hyperbolic_catalog.py +136 -0
- sky/{clouds/service_catalog → catalog}/ibm_catalog.py +9 -6
- sky/{clouds/service_catalog → catalog}/kubernetes_catalog.py +36 -24
- sky/{clouds/service_catalog → catalog}/lambda_catalog.py +9 -6
- sky/{clouds/service_catalog → catalog}/nebius_catalog.py +9 -7
- sky/{clouds/service_catalog → catalog}/oci_catalog.py +9 -6
- sky/{clouds/service_catalog → catalog}/paperspace_catalog.py +5 -2
- sky/catalog/primeintellect_catalog.py +95 -0
- sky/{clouds/service_catalog → catalog}/runpod_catalog.py +11 -4
- sky/{clouds/service_catalog → catalog}/scp_catalog.py +9 -6
- sky/catalog/seeweb_catalog.py +184 -0
- sky/catalog/shadeform_catalog.py +165 -0
- sky/catalog/ssh_catalog.py +167 -0
- sky/{clouds/service_catalog → catalog}/vast_catalog.py +6 -3
- sky/{clouds/service_catalog → catalog}/vsphere_catalog.py +5 -2
- sky/check.py +533 -185
- sky/cli.py +5 -5975
- sky/client/{cli.py → cli/command.py} +2591 -1956
- sky/client/cli/deprecation_utils.py +99 -0
- sky/client/cli/flags.py +359 -0
- sky/client/cli/table_utils.py +322 -0
- sky/client/cli/utils.py +79 -0
- sky/client/common.py +78 -32
- sky/client/oauth.py +82 -0
- sky/client/sdk.py +1219 -319
- sky/client/sdk_async.py +827 -0
- sky/client/service_account_auth.py +47 -0
- sky/cloud_stores.py +82 -3
- sky/clouds/__init__.py +13 -0
- sky/clouds/aws.py +564 -164
- sky/clouds/azure.py +105 -83
- sky/clouds/cloud.py +140 -40
- sky/clouds/cudo.py +68 -50
- sky/clouds/do.py +66 -48
- sky/clouds/fluidstack.py +63 -44
- sky/clouds/gcp.py +339 -110
- sky/clouds/hyperbolic.py +293 -0
- sky/clouds/ibm.py +70 -49
- sky/clouds/kubernetes.py +570 -162
- sky/clouds/lambda_cloud.py +74 -54
- sky/clouds/nebius.py +210 -81
- sky/clouds/oci.py +88 -66
- sky/clouds/paperspace.py +61 -44
- sky/clouds/primeintellect.py +317 -0
- sky/clouds/runpod.py +164 -74
- sky/clouds/scp.py +89 -86
- sky/clouds/seeweb.py +477 -0
- sky/clouds/shadeform.py +400 -0
- sky/clouds/ssh.py +263 -0
- sky/clouds/utils/aws_utils.py +10 -4
- sky/clouds/utils/gcp_utils.py +87 -11
- sky/clouds/utils/oci_utils.py +38 -14
- sky/clouds/utils/scp_utils.py +231 -167
- sky/clouds/vast.py +99 -77
- sky/clouds/vsphere.py +51 -40
- sky/core.py +375 -173
- sky/dag.py +15 -0
- sky/dashboard/out/404.html +1 -1
- sky/dashboard/out/_next/static/96_E2yl3QAiIJGOYCkSpB/_buildManifest.js +1 -0
- sky/dashboard/out/_next/static/chunks/1141-e6aa9ab418717c59.js +11 -0
- sky/dashboard/out/_next/static/chunks/1272-1ef0bf0237faccdb.js +1 -0
- sky/dashboard/out/_next/static/chunks/1871-7e202677c42f43fe.js +6 -0
- sky/dashboard/out/_next/static/chunks/2260-7703229c33c5ebd5.js +1 -0
- sky/dashboard/out/_next/static/chunks/2350.fab69e61bac57b23.js +1 -0
- sky/dashboard/out/_next/static/chunks/2369.fc20f0c2c8ed9fe7.js +15 -0
- sky/dashboard/out/_next/static/chunks/2755.edd818326d489a1d.js +26 -0
- sky/dashboard/out/_next/static/chunks/3294.20a8540fe697d5ee.js +1 -0
- sky/dashboard/out/_next/static/chunks/3785.7e245f318f9d1121.js +1 -0
- sky/dashboard/out/_next/static/chunks/3800-7b45f9fbb6308557.js +1 -0
- sky/dashboard/out/_next/static/chunks/3850-ff4a9a69d978632b.js +1 -0
- sky/dashboard/out/_next/static/chunks/3937.210053269f121201.js +1 -0
- sky/dashboard/out/_next/static/chunks/4725.172ede95d1b21022.js +1 -0
- sky/dashboard/out/_next/static/chunks/4937.a2baa2df5572a276.js +15 -0
- sky/dashboard/out/_next/static/chunks/5739-d67458fcb1386c92.js +8 -0
- sky/dashboard/out/_next/static/chunks/6130-2be46d70a38f1e82.js +1 -0
- sky/dashboard/out/_next/static/chunks/616-3d59f75e2ccf9321.js +39 -0
- sky/dashboard/out/_next/static/chunks/6212-7bd06f60ba693125.js +13 -0
- sky/dashboard/out/_next/static/chunks/6856-8f27d1c10c98def8.js +1 -0
- sky/dashboard/out/_next/static/chunks/6989-01359c57e018caa4.js +1 -0
- sky/dashboard/out/_next/static/chunks/6990-9146207c4567fdfd.js +1 -0
- sky/dashboard/out/_next/static/chunks/7359-c8d04e06886000b3.js +30 -0
- sky/dashboard/out/_next/static/chunks/7411-b15471acd2cba716.js +41 -0
- sky/dashboard/out/_next/static/chunks/7615-019513abc55b3b47.js +1 -0
- sky/dashboard/out/_next/static/chunks/8640.5b9475a2d18c5416.js +16 -0
- sky/dashboard/out/_next/static/chunks/8969-452f9d5cbdd2dc73.js +1 -0
- sky/dashboard/out/_next/static/chunks/9025.fa408f3242e9028d.js +6 -0
- sky/dashboard/out/_next/static/chunks/9353-cff34f7e773b2e2b.js +1 -0
- sky/dashboard/out/_next/static/chunks/9360.a536cf6b1fa42355.js +31 -0
- sky/dashboard/out/_next/static/chunks/9847.3aaca6bb33455140.js +30 -0
- sky/dashboard/out/_next/static/chunks/fd9d1056-86323a29a8f7e46a.js +1 -0
- sky/dashboard/out/_next/static/chunks/framework-cf60a09ccd051a10.js +33 -0
- sky/dashboard/out/_next/static/chunks/main-app-587214043926b3cc.js +1 -0
- sky/dashboard/out/_next/static/chunks/main-f15ccb73239a3bf1.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/_app-bde01e4a2beec258.js +34 -0
- sky/dashboard/out/_next/static/chunks/pages/_error-c66a4e8afc46f17b.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-792db96d918c98c9.js +16 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-abfcac9c137aa543.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters-ee39056f9851a3ff.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/config-dfb9bf07b13045f4.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/index-444f1804401f04ea.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/infra/[context]-c0b5935149902e6f.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/infra-aed0ea19df7cf961.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-d66997e2bfc837cf.js +16 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs/pools/[pool]-9faf940b253e3e06.js +21 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs-2072b48b617989c9.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/users-f42674164aa73423.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/volumes-b84b948ff357c43e.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/workspace/new-3f88a1c7e86a3f86.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/workspaces/[name]-84a40f8c7c627fe4.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/workspaces-531b2f8c4bf89f82.js +1 -0
- sky/dashboard/out/_next/static/chunks/webpack-64e05f17bf2cf8ce.js +1 -0
- sky/dashboard/out/_next/static/css/0748ce22df867032.css +3 -0
- sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
- sky/dashboard/out/clusters/[cluster].html +1 -1
- sky/dashboard/out/clusters.html +1 -1
- sky/dashboard/out/config.html +1 -0
- sky/dashboard/out/index.html +1 -1
- sky/dashboard/out/infra/[context].html +1 -0
- sky/dashboard/out/infra.html +1 -0
- sky/dashboard/out/jobs/[job].html +1 -1
- sky/dashboard/out/jobs/pools/[pool].html +1 -0
- sky/dashboard/out/jobs.html +1 -1
- sky/dashboard/out/users.html +1 -0
- sky/dashboard/out/volumes.html +1 -0
- sky/dashboard/out/workspace/new.html +1 -0
- sky/dashboard/out/workspaces/[name].html +1 -0
- sky/dashboard/out/workspaces.html +1 -0
- sky/data/data_utils.py +137 -1
- sky/data/mounting_utils.py +269 -84
- sky/data/storage.py +1460 -1807
- sky/data/storage_utils.py +43 -57
- sky/exceptions.py +126 -2
- sky/execution.py +216 -63
- sky/global_user_state.py +2390 -586
- sky/jobs/__init__.py +7 -0
- sky/jobs/client/sdk.py +300 -58
- sky/jobs/client/sdk_async.py +161 -0
- sky/jobs/constants.py +15 -8
- sky/jobs/controller.py +848 -275
- sky/jobs/file_content_utils.py +128 -0
- sky/jobs/log_gc.py +193 -0
- sky/jobs/recovery_strategy.py +402 -152
- sky/jobs/scheduler.py +314 -189
- sky/jobs/server/core.py +836 -255
- sky/jobs/server/server.py +156 -115
- sky/jobs/server/utils.py +136 -0
- sky/jobs/state.py +2109 -706
- sky/jobs/utils.py +1306 -215
- sky/logs/__init__.py +21 -0
- sky/logs/agent.py +108 -0
- sky/logs/aws.py +243 -0
- sky/logs/gcp.py +91 -0
- sky/metrics/__init__.py +0 -0
- sky/metrics/utils.py +453 -0
- sky/models.py +78 -1
- sky/optimizer.py +164 -70
- sky/provision/__init__.py +90 -4
- sky/provision/aws/config.py +147 -26
- sky/provision/aws/instance.py +136 -50
- sky/provision/azure/instance.py +11 -6
- sky/provision/common.py +13 -1
- sky/provision/cudo/cudo_machine_type.py +1 -1
- sky/provision/cudo/cudo_utils.py +14 -8
- sky/provision/cudo/cudo_wrapper.py +72 -71
- sky/provision/cudo/instance.py +10 -6
- sky/provision/do/instance.py +10 -6
- sky/provision/do/utils.py +4 -3
- sky/provision/docker_utils.py +140 -33
- sky/provision/fluidstack/instance.py +13 -8
- sky/provision/gcp/__init__.py +1 -0
- sky/provision/gcp/config.py +301 -19
- sky/provision/gcp/constants.py +218 -0
- sky/provision/gcp/instance.py +36 -8
- sky/provision/gcp/instance_utils.py +18 -4
- sky/provision/gcp/volume_utils.py +247 -0
- sky/provision/hyperbolic/__init__.py +12 -0
- sky/provision/hyperbolic/config.py +10 -0
- sky/provision/hyperbolic/instance.py +437 -0
- sky/provision/hyperbolic/utils.py +373 -0
- sky/provision/instance_setup.py +101 -20
- sky/provision/kubernetes/__init__.py +5 -0
- sky/provision/kubernetes/config.py +9 -52
- sky/provision/kubernetes/constants.py +17 -0
- sky/provision/kubernetes/instance.py +919 -280
- sky/provision/kubernetes/manifests/fusermount-server-daemonset.yaml +1 -2
- sky/provision/kubernetes/network.py +27 -17
- sky/provision/kubernetes/network_utils.py +44 -43
- sky/provision/kubernetes/utils.py +1221 -534
- sky/provision/kubernetes/volume.py +343 -0
- sky/provision/lambda_cloud/instance.py +22 -16
- sky/provision/nebius/constants.py +50 -0
- sky/provision/nebius/instance.py +19 -6
- sky/provision/nebius/utils.py +237 -137
- sky/provision/oci/instance.py +10 -5
- sky/provision/paperspace/instance.py +10 -7
- sky/provision/paperspace/utils.py +1 -1
- sky/provision/primeintellect/__init__.py +10 -0
- sky/provision/primeintellect/config.py +11 -0
- sky/provision/primeintellect/instance.py +454 -0
- sky/provision/primeintellect/utils.py +398 -0
- sky/provision/provisioner.py +117 -36
- sky/provision/runpod/__init__.py +5 -0
- sky/provision/runpod/instance.py +27 -6
- sky/provision/runpod/utils.py +51 -18
- sky/provision/runpod/volume.py +214 -0
- sky/provision/scp/__init__.py +15 -0
- sky/provision/scp/config.py +93 -0
- sky/provision/scp/instance.py +707 -0
- sky/provision/seeweb/__init__.py +11 -0
- sky/provision/seeweb/config.py +13 -0
- sky/provision/seeweb/instance.py +812 -0
- sky/provision/shadeform/__init__.py +11 -0
- sky/provision/shadeform/config.py +12 -0
- sky/provision/shadeform/instance.py +351 -0
- sky/provision/shadeform/shadeform_utils.py +83 -0
- sky/provision/ssh/__init__.py +18 -0
- sky/provision/vast/instance.py +13 -8
- sky/provision/vast/utils.py +10 -7
- sky/provision/volume.py +164 -0
- sky/provision/vsphere/common/ssl_helper.py +1 -1
- sky/provision/vsphere/common/vapiconnect.py +2 -1
- sky/provision/vsphere/common/vim_utils.py +4 -4
- sky/provision/vsphere/instance.py +15 -10
- sky/provision/vsphere/vsphere_utils.py +17 -20
- sky/py.typed +0 -0
- sky/resources.py +845 -119
- sky/schemas/__init__.py +0 -0
- sky/schemas/api/__init__.py +0 -0
- sky/schemas/api/responses.py +227 -0
- sky/schemas/db/README +4 -0
- sky/schemas/db/env.py +90 -0
- sky/schemas/db/global_user_state/001_initial_schema.py +124 -0
- sky/schemas/db/global_user_state/002_add_workspace_to_cluster_history.py +35 -0
- sky/schemas/db/global_user_state/003_fix_initial_revision.py +61 -0
- sky/schemas/db/global_user_state/004_is_managed.py +34 -0
- sky/schemas/db/global_user_state/005_cluster_event.py +32 -0
- sky/schemas/db/global_user_state/006_provision_log.py +41 -0
- sky/schemas/db/global_user_state/007_cluster_event_request_id.py +34 -0
- sky/schemas/db/global_user_state/008_skylet_ssh_tunnel_metadata.py +34 -0
- sky/schemas/db/global_user_state/009_last_activity_and_launched_at.py +89 -0
- sky/schemas/db/global_user_state/010_save_ssh_key.py +66 -0
- sky/schemas/db/global_user_state/011_is_ephemeral.py +34 -0
- sky/schemas/db/kv_cache/001_initial_schema.py +29 -0
- sky/schemas/db/script.py.mako +28 -0
- sky/schemas/db/serve_state/001_initial_schema.py +67 -0
- sky/schemas/db/serve_state/002_yaml_content.py +34 -0
- sky/schemas/db/skypilot_config/001_initial_schema.py +30 -0
- sky/schemas/db/spot_jobs/001_initial_schema.py +97 -0
- sky/schemas/db/spot_jobs/002_cluster_pool.py +42 -0
- sky/schemas/db/spot_jobs/003_pool_hash.py +34 -0
- sky/schemas/db/spot_jobs/004_job_file_contents.py +42 -0
- sky/schemas/db/spot_jobs/005_logs_gc.py +38 -0
- sky/schemas/db/spot_jobs/006_controller_pid_started_at.py +34 -0
- sky/schemas/db/spot_jobs/007_config_file_content.py +34 -0
- sky/schemas/generated/__init__.py +0 -0
- sky/schemas/generated/autostopv1_pb2.py +36 -0
- sky/schemas/generated/autostopv1_pb2.pyi +43 -0
- sky/schemas/generated/autostopv1_pb2_grpc.py +146 -0
- sky/schemas/generated/jobsv1_pb2.py +86 -0
- sky/schemas/generated/jobsv1_pb2.pyi +254 -0
- sky/schemas/generated/jobsv1_pb2_grpc.py +542 -0
- sky/schemas/generated/managed_jobsv1_pb2.py +76 -0
- sky/schemas/generated/managed_jobsv1_pb2.pyi +278 -0
- sky/schemas/generated/managed_jobsv1_pb2_grpc.py +278 -0
- sky/schemas/generated/servev1_pb2.py +58 -0
- sky/schemas/generated/servev1_pb2.pyi +115 -0
- sky/schemas/generated/servev1_pb2_grpc.py +322 -0
- sky/serve/autoscalers.py +357 -5
- sky/serve/client/impl.py +310 -0
- sky/serve/client/sdk.py +47 -139
- sky/serve/client/sdk_async.py +130 -0
- sky/serve/constants.py +12 -9
- sky/serve/controller.py +68 -17
- sky/serve/load_balancer.py +106 -60
- sky/serve/load_balancing_policies.py +116 -2
- sky/serve/replica_managers.py +434 -249
- sky/serve/serve_rpc_utils.py +179 -0
- sky/serve/serve_state.py +569 -257
- sky/serve/serve_utils.py +775 -265
- sky/serve/server/core.py +66 -711
- sky/serve/server/impl.py +1093 -0
- sky/serve/server/server.py +21 -18
- sky/serve/service.py +192 -89
- sky/serve/service_spec.py +144 -20
- sky/serve/spot_placer.py +3 -0
- sky/server/auth/__init__.py +0 -0
- sky/server/auth/authn.py +50 -0
- sky/server/auth/loopback.py +38 -0
- sky/server/auth/oauth2_proxy.py +202 -0
- sky/server/common.py +478 -182
- sky/server/config.py +85 -23
- sky/server/constants.py +44 -6
- sky/server/daemons.py +295 -0
- sky/server/html/token_page.html +185 -0
- sky/server/metrics.py +160 -0
- sky/server/middleware_utils.py +166 -0
- sky/server/requests/executor.py +558 -138
- sky/server/requests/payloads.py +364 -24
- sky/server/requests/preconditions.py +21 -17
- sky/server/requests/process.py +112 -29
- sky/server/requests/request_names.py +121 -0
- sky/server/requests/requests.py +822 -226
- sky/server/requests/serializers/decoders.py +82 -31
- sky/server/requests/serializers/encoders.py +140 -22
- sky/server/requests/threads.py +117 -0
- sky/server/rest.py +455 -0
- sky/server/server.py +1309 -285
- sky/server/state.py +20 -0
- sky/server/stream_utils.py +327 -61
- sky/server/uvicorn.py +217 -3
- sky/server/versions.py +270 -0
- sky/setup_files/MANIFEST.in +11 -1
- sky/setup_files/alembic.ini +160 -0
- sky/setup_files/dependencies.py +139 -31
- sky/setup_files/setup.py +44 -42
- sky/sky_logging.py +114 -7
- sky/skylet/attempt_skylet.py +106 -24
- sky/skylet/autostop_lib.py +129 -8
- sky/skylet/configs.py +29 -20
- sky/skylet/constants.py +216 -25
- sky/skylet/events.py +101 -21
- sky/skylet/job_lib.py +345 -164
- sky/skylet/log_lib.py +297 -18
- sky/skylet/log_lib.pyi +44 -1
- sky/skylet/providers/ibm/node_provider.py +12 -8
- sky/skylet/providers/ibm/vpc_provider.py +13 -12
- sky/skylet/ray_patches/__init__.py +17 -3
- sky/skylet/ray_patches/autoscaler.py.diff +18 -0
- sky/skylet/ray_patches/cli.py.diff +19 -0
- sky/skylet/ray_patches/command_runner.py.diff +17 -0
- sky/skylet/ray_patches/log_monitor.py.diff +20 -0
- sky/skylet/ray_patches/resource_demand_scheduler.py.diff +32 -0
- sky/skylet/ray_patches/updater.py.diff +18 -0
- sky/skylet/ray_patches/worker.py.diff +41 -0
- sky/skylet/runtime_utils.py +21 -0
- sky/skylet/services.py +568 -0
- sky/skylet/skylet.py +72 -4
- sky/skylet/subprocess_daemon.py +104 -29
- sky/skypilot_config.py +506 -99
- sky/ssh_node_pools/__init__.py +1 -0
- sky/ssh_node_pools/core.py +135 -0
- sky/ssh_node_pools/server.py +233 -0
- sky/task.py +685 -163
- sky/templates/aws-ray.yml.j2 +11 -3
- sky/templates/azure-ray.yml.j2 +2 -1
- sky/templates/cudo-ray.yml.j2 +1 -0
- sky/templates/do-ray.yml.j2 +2 -1
- sky/templates/fluidstack-ray.yml.j2 +1 -0
- sky/templates/gcp-ray.yml.j2 +62 -1
- sky/templates/hyperbolic-ray.yml.j2 +68 -0
- sky/templates/ibm-ray.yml.j2 +2 -1
- sky/templates/jobs-controller.yaml.j2 +27 -24
- sky/templates/kubernetes-loadbalancer.yml.j2 +2 -0
- sky/templates/kubernetes-ray.yml.j2 +611 -50
- sky/templates/lambda-ray.yml.j2 +2 -1
- sky/templates/nebius-ray.yml.j2 +34 -12
- sky/templates/oci-ray.yml.j2 +1 -0
- sky/templates/paperspace-ray.yml.j2 +2 -1
- sky/templates/primeintellect-ray.yml.j2 +72 -0
- sky/templates/runpod-ray.yml.j2 +10 -1
- sky/templates/scp-ray.yml.j2 +4 -50
- sky/templates/seeweb-ray.yml.j2 +171 -0
- sky/templates/shadeform-ray.yml.j2 +73 -0
- sky/templates/sky-serve-controller.yaml.j2 +22 -2
- sky/templates/vast-ray.yml.j2 +1 -0
- sky/templates/vsphere-ray.yml.j2 +1 -0
- sky/templates/websocket_proxy.py +212 -37
- sky/usage/usage_lib.py +31 -15
- sky/users/__init__.py +0 -0
- sky/users/model.conf +15 -0
- sky/users/permission.py +397 -0
- sky/users/rbac.py +121 -0
- sky/users/server.py +720 -0
- sky/users/token_service.py +218 -0
- sky/utils/accelerator_registry.py +35 -5
- sky/utils/admin_policy_utils.py +84 -38
- sky/utils/annotations.py +38 -5
- sky/utils/asyncio_utils.py +78 -0
- sky/utils/atomic.py +1 -1
- sky/utils/auth_utils.py +153 -0
- sky/utils/benchmark_utils.py +60 -0
- sky/utils/cli_utils/status_utils.py +159 -86
- sky/utils/cluster_utils.py +31 -9
- sky/utils/command_runner.py +354 -68
- sky/utils/command_runner.pyi +93 -3
- sky/utils/common.py +35 -8
- sky/utils/common_utils.py +314 -91
- sky/utils/config_utils.py +74 -5
- sky/utils/context.py +403 -0
- sky/utils/context_utils.py +242 -0
- sky/utils/controller_utils.py +383 -89
- sky/utils/dag_utils.py +31 -12
- sky/utils/db/__init__.py +0 -0
- sky/utils/db/db_utils.py +485 -0
- sky/utils/db/kv_cache.py +149 -0
- sky/utils/db/migration_utils.py +137 -0
- sky/utils/directory_utils.py +12 -0
- sky/utils/env_options.py +13 -0
- sky/utils/git.py +567 -0
- sky/utils/git_clone.sh +460 -0
- sky/utils/infra_utils.py +195 -0
- sky/utils/kubernetes/cleanup-tunnel.sh +62 -0
- sky/utils/kubernetes/config_map_utils.py +133 -0
- sky/utils/kubernetes/create_cluster.sh +15 -29
- sky/utils/kubernetes/delete_cluster.sh +10 -7
- sky/utils/kubernetes/deploy_ssh_node_pools.py +1177 -0
- sky/utils/kubernetes/exec_kubeconfig_converter.py +22 -31
- sky/utils/kubernetes/generate_kind_config.py +6 -66
- sky/utils/kubernetes/generate_kubeconfig.sh +4 -1
- sky/utils/kubernetes/gpu_labeler.py +18 -8
- sky/utils/kubernetes/k8s_gpu_labeler_job.yaml +2 -1
- sky/utils/kubernetes/k8s_gpu_labeler_setup.yaml +16 -16
- sky/utils/kubernetes/kubernetes_deploy_utils.py +284 -114
- sky/utils/kubernetes/rsync_helper.sh +11 -3
- sky/utils/kubernetes/ssh-tunnel.sh +379 -0
- sky/utils/kubernetes/ssh_utils.py +221 -0
- sky/utils/kubernetes_enums.py +8 -15
- sky/utils/lock_events.py +94 -0
- sky/utils/locks.py +416 -0
- sky/utils/log_utils.py +82 -107
- sky/utils/perf_utils.py +22 -0
- sky/utils/resource_checker.py +298 -0
- sky/utils/resources_utils.py +249 -32
- sky/utils/rich_utils.py +217 -39
- sky/utils/schemas.py +955 -160
- sky/utils/serialize_utils.py +16 -0
- sky/utils/status_lib.py +10 -0
- sky/utils/subprocess_utils.py +29 -15
- sky/utils/tempstore.py +70 -0
- sky/utils/thread_utils.py +91 -0
- sky/utils/timeline.py +26 -53
- sky/utils/ux_utils.py +84 -15
- sky/utils/validator.py +11 -1
- sky/utils/volume.py +165 -0
- sky/utils/yaml_utils.py +111 -0
- sky/volumes/__init__.py +13 -0
- sky/volumes/client/__init__.py +0 -0
- sky/volumes/client/sdk.py +150 -0
- sky/volumes/server/__init__.py +0 -0
- sky/volumes/server/core.py +270 -0
- sky/volumes/server/server.py +124 -0
- sky/volumes/volume.py +215 -0
- sky/workspaces/__init__.py +0 -0
- sky/workspaces/core.py +655 -0
- sky/workspaces/server.py +101 -0
- sky/workspaces/utils.py +56 -0
- sky_templates/README.md +3 -0
- sky_templates/__init__.py +3 -0
- sky_templates/ray/__init__.py +0 -0
- sky_templates/ray/start_cluster +183 -0
- sky_templates/ray/stop_cluster +75 -0
- skypilot_nightly-1.0.0.dev20251203.dist-info/METADATA +676 -0
- skypilot_nightly-1.0.0.dev20251203.dist-info/RECORD +611 -0
- {skypilot_nightly-1.0.0.dev20250502.dist-info → skypilot_nightly-1.0.0.dev20251203.dist-info}/WHEEL +1 -1
- skypilot_nightly-1.0.0.dev20251203.dist-info/top_level.txt +2 -0
- sky/benchmark/benchmark_state.py +0 -256
- sky/benchmark/benchmark_utils.py +0 -641
- sky/clouds/service_catalog/constants.py +0 -7
- sky/dashboard/out/_next/static/GWvVBSCS7FmUiVmjaL1a7/_buildManifest.js +0 -1
- sky/dashboard/out/_next/static/chunks/236-2db3ee3fba33dd9e.js +0 -6
- sky/dashboard/out/_next/static/chunks/312-c3c8845990db8ffc.js +0 -15
- sky/dashboard/out/_next/static/chunks/37-0a572fe0dbb89c4d.js +0 -6
- sky/dashboard/out/_next/static/chunks/678-206dddca808e6d16.js +0 -59
- sky/dashboard/out/_next/static/chunks/845-9e60713e0c441abc.js +0 -1
- sky/dashboard/out/_next/static/chunks/979-7bf73a4c7cea0f5c.js +0 -1
- sky/dashboard/out/_next/static/chunks/fd9d1056-2821b0f0cabcd8bd.js +0 -1
- sky/dashboard/out/_next/static/chunks/framework-87d061ee6ed71b28.js +0 -33
- sky/dashboard/out/_next/static/chunks/main-app-241eb28595532291.js +0 -1
- sky/dashboard/out/_next/static/chunks/main-e0e2335212e72357.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/_app-e6b013bc3f77ad60.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/_error-1be831200e60c5c0.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-6ac338bc2239cb45.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-f383db7389368ea7.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/clusters-a93b93e10b8b074e.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/index-f9f039532ca8cbc4.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-1c519e1afc523dc9.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/jobs-a75029b67aab6a2e.js +0 -1
- sky/dashboard/out/_next/static/chunks/webpack-830f59b8404e96b8.js +0 -1
- sky/dashboard/out/_next/static/css/c6933bbb2ce7f4dd.css +0 -3
- sky/jobs/dashboard/dashboard.py +0 -223
- sky/jobs/dashboard/static/favicon.ico +0 -0
- sky/jobs/dashboard/templates/index.html +0 -831
- sky/jobs/server/dashboard_utils.py +0 -69
- sky/skylet/providers/scp/__init__.py +0 -2
- sky/skylet/providers/scp/config.py +0 -149
- sky/skylet/providers/scp/node_provider.py +0 -578
- sky/templates/kubernetes-ssh-jump.yml.j2 +0 -94
- sky/utils/db_utils.py +0 -100
- sky/utils/kubernetes/deploy_remote_cluster.sh +0 -308
- sky/utils/kubernetes/ssh_jump_lifecycle_manager.py +0 -191
- skypilot_nightly-1.0.0.dev20250502.dist-info/METADATA +0 -361
- skypilot_nightly-1.0.0.dev20250502.dist-info/RECORD +0 -396
- skypilot_nightly-1.0.0.dev20250502.dist-info/top_level.txt +0 -1
- /sky/{clouds/service_catalog → catalog}/config.py +0 -0
- /sky/{benchmark → catalog/data_fetchers}/__init__.py +0 -0
- /sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_azure.py +0 -0
- /sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_fluidstack.py +0 -0
- /sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_ibm.py +0 -0
- /sky/{clouds/service_catalog/data_fetchers → client/cli}/__init__.py +0 -0
- /sky/dashboard/out/_next/static/{GWvVBSCS7FmUiVmjaL1a7 → 96_E2yl3QAiIJGOYCkSpB}/_ssgManifest.js +0 -0
- {skypilot_nightly-1.0.0.dev20250502.dist-info → skypilot_nightly-1.0.0.dev20251203.dist-info}/entry_points.txt +0 -0
- {skypilot_nightly-1.0.0.dev20250502.dist-info → skypilot_nightly-1.0.0.dev20251203.dist-info}/licenses/LICENSE +0 -0
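Note on the largest structural change above: the catalog package moved from `sky/clouds/service_catalog/` to `sky/catalog/`. For tooling that imports catalog modules from the wheel, a version-agnostic import can bridge the two layouts (a sketch; the `common` module path is taken from the rename entries above, but whether downstream code should depend on it is an assumption):

```python
# Try the dev20251203 layout first, then fall back to the dev20250502 one.
try:
    from sky.catalog import common as catalog_common
except ImportError:
    from sky.clouds.service_catalog import common as catalog_common
```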
sky/serve/serve_utils.py
CHANGED
```diff
@@ -2,6 +2,7 @@
 import base64
 import collections
 import dataclasses
+import datetime
 import enum
 import os
 import pathlib
@@ -9,11 +10,11 @@ import pickle
 import re
 import shlex
 import shutil
-import threading
 import time
+import traceback
 import typing
-from typing import (Any, Callable, DefaultDict,
-                    Optional, TextIO, Type,
+from typing import (Any, Callable, DefaultDict, Deque, Dict, Iterator, List,
+                    Optional, TextIO, Type, Union)
 import uuid

 import colorama
@@ -22,19 +23,25 @@ import filelock
 from sky import backends
 from sky import exceptions
 from sky import global_user_state
+from sky import sky_logging
+from sky import skypilot_config
 from sky.adaptors import common as adaptors_common
+from sky.jobs import state as managed_job_state
 from sky.serve import constants
 from sky.serve import serve_state
 from sky.serve import spot_placer
 from sky.skylet import constants as skylet_constants
 from sky.skylet import job_lib
 from sky.utils import annotations
+from sky.utils import command_runner
 from sky.utils import common_utils
+from sky.utils import controller_utils
 from sky.utils import log_utils
 from sky.utils import message_utils
 from sky.utils import resources_utils
 from sky.utils import status_lib
 from sky.utils import ux_utils
+from sky.utils import yaml_utils

 if typing.TYPE_CHECKING:
     import fastapi
@@ -47,23 +54,19 @@ else:
     psutil = adaptors_common.LazyImport('psutil')
     requests = adaptors_common.LazyImport('requests')

-
-@annotations.lru_cache(scope='request')
-def get_num_service_threshold():
-    """Get number of services threshold, calculating it only when needed."""
-    system_memory_gb = psutil.virtual_memory().total // (1024**3)
-    return system_memory_gb // constants.CONTROLLER_MEMORY_USAGE_GB
-
+logger = sky_logging.init_logger(__name__)

 _CONTROLLER_URL = 'http://localhost:{CONTROLLER_PORT}'

-# NOTE(dev): We assume log
-#
-#
-
-
-fr'
-
+# NOTE(dev): We assume log are print with the hint 'sky api logs -l'. Be careful
+# when changing UX as this assumption is used to expand some log files while
+# ignoring others.
+_SKYPILOT_LOG_HINT = r'.*sky api logs -l'
+_SKYPILOT_PROVISION_API_LOG_PATTERN = (
+    fr'{_SKYPILOT_LOG_HINT} (.*/provision\.log)')
+# New hint pattern for provision logs
+_SKYPILOT_PROVISION_LOG_CMD_PATTERN = r'.*sky logs --provision\s+(\S+)'
+_SKYPILOT_LOG_PATTERN = fr'{_SKYPILOT_LOG_HINT} (.*\.log)'

 # TODO(tian): Find all existing replica id and print here.
 _FAILED_TO_FIND_REPLICA_MSG = (
```
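The log-hint constants added above define regexes that recognize the `sky api logs -l <path>` hint a controller prints and capture the log path from it. A minimal sketch of how such a pattern is applied (only the pattern comes from the diff; the sample hint line is invented):

```python
import re

_SKYPILOT_LOG_HINT = r'.*sky api logs -l'
_SKYPILOT_PROVISION_API_LOG_PATTERN = (
    fr'{_SKYPILOT_LOG_HINT} (.*/provision\.log)')

# Invented example of a hint line the controller might emit.
line = 'To stream logs: sky api logs -l ~/sky_logs/sky-2025/provision.log'
match = re.match(_SKYPILOT_PROVISION_API_LOG_PATTERN, line)
if match is not None:
    print(match.group(1))  # ~/sky_logs/sky-2025/provision.log
```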
```diff
@@ -154,50 +157,6 @@ _SIGNAL_TO_ERROR = {
     UserSignal.TERMINATE: exceptions.ServeUserTerminatedError,
 }

-# pylint: disable=invalid-name
-KeyType = TypeVar('KeyType')
-ValueType = TypeVar('ValueType')
-
-
-# Google style guide: Do not rely on the atomicity of built-in types.
-# Our launch and down process pool will be used by multiple threads,
-# therefore we need to use a thread-safe dict.
-# see https://google.github.io/styleguide/pyguide.html#218-threading
-class ThreadSafeDict(Generic[KeyType, ValueType]):
-    """A thread-safe dict."""
-
-    def __init__(self, *args: Any, **kwargs: Any) -> None:
-        self._dict: Dict[KeyType, ValueType] = dict(*args, **kwargs)
-        self._lock = threading.Lock()
-
-    def __getitem__(self, key: KeyType) -> ValueType:
-        with self._lock:
-            return self._dict.__getitem__(key)
-
-    def __setitem__(self, key: KeyType, value: ValueType) -> None:
-        with self._lock:
-            return self._dict.__setitem__(key, value)
-
-    def __delitem__(self, key: KeyType) -> None:
-        with self._lock:
-            return self._dict.__delitem__(key)
-
-    def __len__(self) -> int:
-        with self._lock:
-            return self._dict.__len__()
-
-    def __contains__(self, key: KeyType) -> bool:
-        with self._lock:
-            return self._dict.__contains__(key)
-
-    def items(self):
-        with self._lock:
-            return self._dict.items()
-
-    def values(self):
-        with self._lock:
-            return self._dict.values()
-

 class RequestsAggregator:
     """Base class for request aggregator."""
@@ -244,7 +203,120 @@ class RequestTimestamp(RequestsAggregator):
         return f'RequestTimestamp(timestamps={self.timestamps})'


-def validate_service_task(task: 'sky.Task') -> None:
+def get_service_filelock_path(pool: str) -> str:
+    path = (pathlib.Path(constants.SKYSERVE_METADATA_DIR) / pool /
+            'pool.lock').expanduser().absolute()
+    path.parents[0].mkdir(parents=True, exist_ok=True)
+    return str(path)
+
+
+def _validate_consolidation_mode_config(current_is_consolidation_mode: bool,
+                                        pool: bool) -> None:
+    """Validate the consolidation mode config."""
+    # Check whether the consolidation mode config is changed.
+    controller = controller_utils.get_controller_for_pool(pool).value
+    if current_is_consolidation_mode:
+        controller_cn = controller.cluster_name
+        if global_user_state.cluster_with_name_exists(controller_cn):
+            logger.warning(
+                f'{colorama.Fore.RED}Consolidation mode for '
+                f'{controller.controller_type} is enabled, but the controller '
+                f'cluster {controller_cn} is still running. Please terminate '
+                'the controller cluster first.'
+                f'{colorama.Style.RESET_ALL}')
+    else:
+        noun = 'pool' if pool else 'service'
+        all_services = [
+            svc for svc in serve_state.get_services() if svc['pool'] == pool
+        ]
+        if all_services:
+            logger.warning(
+                f'{colorama.Fore.RED}Consolidation mode for '
+                f'{controller.controller_type} is disabled, but there are '
+                f'still {len(all_services)} {noun}s running. Please terminate '
+                f'those {noun}s first.{colorama.Style.RESET_ALL}')
+
+
+@annotations.lru_cache(scope='request', maxsize=1)
+def is_consolidation_mode(pool: bool = False) -> bool:
+    # Use jobs config for pool consolidation mode.
+    controller = controller_utils.get_controller_for_pool(pool).value
+    consolidation_mode = skypilot_config.get_nested(
+        (controller.controller_type, 'controller', 'consolidation_mode'),
+        default_value=False)
+    if os.environ.get(skylet_constants.OVERRIDE_CONSOLIDATION_MODE) is not None:
+        # if we are in the job controller, we must always be in consolidation
+        # mode.
+        return True
+    # We should only do this check on API server, as the controller will not
+    # have related config and will always seemingly disabled for consolidation
+    # mode. Check #6611 for more details.
+    if os.environ.get(skylet_constants.ENV_VAR_IS_SKYPILOT_SERVER) is not None:
+        _validate_consolidation_mode_config(consolidation_mode, pool)
+    return consolidation_mode
+
+
+def ha_recovery_for_consolidation_mode(pool: bool):
+    """Recovery logic for HA mode."""
+    # No setup recovery is needed in consolidation mode, as the API server
+    # already has all runtime installed. Directly start jobs recovery here.
+    # Refers to sky/templates/kubernetes-ray.yml.j2 for more details.
+    runner = command_runner.LocalProcessCommandRunner()
+    noun = 'pool' if pool else 'serve'
+    capnoun = noun.capitalize()
+    prefix = f'{noun}_'
+    with open(skylet_constants.HA_PERSISTENT_RECOVERY_LOG_PATH.format(prefix),
+              'w',
+              encoding='utf-8') as f:
+        start = time.time()
+        f.write(f'Starting HA recovery at {datetime.datetime.now()}\n')
+        for service_name in serve_state.get_glob_service_names(None):
+            svc = _get_service_status(service_name,
+                                      pool=pool,
+                                      with_replica_info=False)
+            if svc is None:
+                continue
+            controller_pid = svc['controller_pid']
+            if controller_pid is not None:
+                try:
+                    if _controller_process_alive(controller_pid, service_name):
+                        f.write(f'Controller pid {controller_pid} for '
+                                f'{noun} {service_name} is still running. '
+                                'Skipping recovery.\n')
+                        continue
+                except Exception:  # pylint: disable=broad-except
+                    # _controller_process_alive may raise if psutil fails; we
+                    # should not crash the recovery logic because of this.
+                    f.write('Error checking controller pid '
+                            f'{controller_pid} for {noun} {service_name}\n')
+
+            script = serve_state.get_ha_recovery_script(service_name)
+            if script is None:
+                f.write(f'{capnoun} {service_name}\'s recovery script does '
+                        'not exist. Skipping recovery.\n')
+                continue
+            rc, out, err = runner.run(script, require_outputs=True)
+            if rc:
+                f.write(f'Recovery script returned {rc}. '
+                        f'Output: {out}\nError: {err}\n')
+            f.write(f'{capnoun} {service_name} completed recovery at '
+                    f'{datetime.datetime.now()}\n')
+        f.write(f'HA recovery completed at {datetime.datetime.now()}\n')
+        f.write(f'Total recovery time: {time.time() - start} seconds\n')
+
+
+def _controller_process_alive(pid: int, service_name: str) -> bool:
+    """Check if the controller process is alive."""
+    try:
+        process = psutil.Process(pid)
+        cmd_str = ' '.join(process.cmdline())
+        return process.is_running(
+        ) and f'--service-name {service_name}' in cmd_str
+    except psutil.NoSuchProcess:
+        return False
+
+
+def validate_service_task(task: 'sky.Task', pool: bool) -> None:
     """Validate the task for Sky Serve.

     Args:
```
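The new `_controller_process_alive` helper above avoids trusting a bare PID (PIDs get recycled) by also requiring `--service-name <name>` to appear in the process command line. The same pattern in isolation (the demo checks this script's own process, using its own argv entry as the token):

```python
import os
import sys

import psutil  # third-party: pip install psutil


def process_alive_with_token(pid: int, token: str) -> bool:
    """True iff `pid` is running and its command line contains `token`.

    Matching on the command line guards against PID reuse: a recycled PID
    is very unlikely to carry the same '--service-name <name>' argument.
    """
    try:
        process = psutil.Process(pid)
        return process.is_running() and token in ' '.join(process.cmdline())
    except psutil.NoSuchProcess:
        return False


# Demo: our own script path appears verbatim in our own cmdline.
print(process_alive_with_token(os.getpid(), sys.argv[0]))
```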
```diff
@@ -267,19 +339,43 @@ def validate_service_task(task: 'sky.Task') -> None:
                 'use `dynamic_ondemand_fallback` or set '
                 'base_ondemand_fallback_replicas.')

+    field_name = 'service' if not pool else 'pool'
     if task.service is None:
         with ux_utils.print_exception_no_traceback():
-            raise RuntimeError('
+            raise RuntimeError(f'{field_name.capitalize()} section not found.')
+
+    if pool != task.service.pool:
+        with ux_utils.print_exception_no_traceback():
+            raise ValueError(f'{field_name.capitalize()} section in the YAML '
+                             f'file does not match the pool argument. '
+                             f'To fix, add a valid `{field_name}` field.')

     policy_description = ('on-demand'
                           if task.service.dynamic_ondemand_fallback else 'spot')
     for resource in list(task.resources):
         if resource.job_recovery is not None:
+            sys_name = 'SkyServe' if not pool else 'Pool'
             with ux_utils.print_exception_no_traceback():
-                raise ValueError('job_recovery is disabled for
-                                 '
+                raise ValueError(f'job_recovery is disabled for {sys_name}. '
+                                 f'{sys_name} will replenish preempted spot '
                                  f'with {policy_description} instances.')

+    if pool:
+        accelerators = set()
+        for resource in task.resources:
+            if resource.accelerators is not None:
+                if isinstance(resource.accelerators, str):
+                    accelerators.add(resource.accelerators)
+                elif isinstance(resource.accelerators, dict):
+                    accelerators.update(resource.accelerators.keys())
+                elif isinstance(resource.accelerators, list):
+                    accelerators.update(resource.accelerators)
+        if len(accelerators) > 1:
+            with ux_utils.print_exception_no_traceback():
+                raise ValueError('Heterogeneous clusters are not supported for '
+                                 'pools please specify one accelerator '
+                                 'for all workers.')
+
     # Try to create a spot placer from the task yaml. Check if the task yaml
     # is valid for spot placer.
     spot_placer.SpotPlacer.from_task(task.service, task)
```
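The pool validation added above must first normalize `resources.accelerators`, which can arrive as a string, a dict of name to count, or a list of names, before checking that all workers request the same accelerator. That normalization in isolation (the sample specs are invented):

```python
from typing import Dict, List, Optional, Set, Union

AccSpec = Optional[Union[str, Dict[str, int], List[str]]]


def accelerator_names(specs: List[AccSpec]) -> Set[str]:
    """Flatten mixed accelerator specs into a set of accelerator names."""
    names: Set[str] = set()
    for spec in specs:
        if spec is None:
            continue
        if isinstance(spec, str):
            names.add(spec)
        elif isinstance(spec, dict):
            names.update(spec.keys())  # {'A100': 4} contributes 'A100'
        elif isinstance(spec, list):
            names.update(spec)
    return names


# Two distinct names -> the pool would be rejected as heterogeneous.
print(accelerator_names(['L4', {'A100': 1}, None]))  # {'L4', 'A100'}
```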
```diff
@@ -300,7 +396,7 @@ def validate_service_task(task: 'sky.Task') -> None:
                     raise ValueError(
                         '`spot_placer` is only supported for spot resources. '
                         'Please explicitly specify `use_spot: true` in resources.')
-    if task.service.ports is None:
+    if not pool and task.service.ports is None:
         requested_ports = list(
             resources_utils.port_ranges_to_set(requested_resources.ports))
         if len(requested_ports) != 1:
@@ -320,10 +416,16 @@ def validate_service_task(task: 'sky.Task') -> None:
                     f'Got multiple ports: {service_port} and '
                     f'{replica_ingress_port} in different resources. '
                     'Please specify the same port instead.')
+    if pool:
+        if (task.service.ports is not None or
+                requested_resources.ports is not None):
+            with ux_utils.print_exception_no_traceback():
+                raise ValueError('Cannot specify ports in a pool.')


-def generate_service_name():
-
+def generate_service_name(pool: bool = False):
+    noun = 'pool' if pool else 'service'
+    return f'sky-{noun}-{uuid.uuid4().hex[:4]}'


 def generate_remote_service_dir_name(service_name: str) -> str:
@@ -390,6 +492,8 @@ def generate_remote_tls_certfile_name(service_name: str) -> str:


 def generate_replica_cluster_name(service_name: str, replica_id: int) -> str:
+    # NOTE(dev): This format is used in sky/serve/service.py::_cleanup, for
+    # checking replica cluster existence. Be careful when changing it.
     return f'{service_name}-{replica_id}'


@@ -425,26 +529,63 @@ def set_service_status_and_active_versions_from_replica(
         active_versions=active_versions)


-def update_service_status() -> None:
-
-
-
+def update_service_status(pool: bool) -> None:
+    noun = 'pool' if pool else 'serve'
+    capnoun = noun.capitalize()
+    service_names = serve_state.get_glob_service_names(None)
+    for service_name in service_names:
+        record = _get_service_status(service_name,
+                                     pool=pool,
+                                     with_replica_info=False)
+        if record is None:
+            continue
+        service_status = record['status']
+        if service_status == serve_state.ServiceStatus.SHUTTING_DOWN:
             # Skip services that is shutting down.
             continue
-        controller_job_id = record['controller_job_id']
-        assert controller_job_id is not None
-        controller_status = job_lib.get_status(controller_job_id)
-        if controller_status is None or controller_status.is_terminal():
-            # If controller job is not running, set it as controller failed.
-            serve_state.set_service_status_and_active_versions(
-                record['name'], serve_state.ServiceStatus.CONTROLLER_FAILED)

+        logger.info(f'Update {noun} status for {service_name!r} '
+                    f'with status {service_status}')
+
+        controller_pid = record['controller_pid']
+        if controller_pid is None:
+            logger.info(f'{capnoun} {service_name!r} controller pid is None. '
+                        f'Unexpected status {service_status}. Set to failure.')
+        elif controller_pid < 0:
+            # Backwards compatibility: this service was submitted when ray was
+            # still used for controller process management. We set the
+            # value_to_replace_existing_entries to -1 to indicate historical
+            # services.
+            # TODO(tian): Remove before 0.13.0.
+            controller_job_id = record['controller_job_id']
+            assert controller_job_id is not None
+            controller_status = job_lib.get_status(controller_job_id)
+            if (controller_status is not None and
+                    not controller_status.is_terminal()):
+                continue
+            logger.info(f'Updating {noun} {service_name!r} in old version. '
+                        f'SkyPilot job status: {controller_status}. '
+                        'Set to failure.')
+        else:
+            if _controller_process_alive(controller_pid, service_name):
+                # The controller is still running.
+                continue
+            logger.info(f'{capnoun} {service_name!r} controller pid '
+                        f'{controller_pid} is not alive. Set to failure.')
+
+        # If controller job is not running, set it as controller failed.
+        serve_state.set_service_status_and_active_versions(
+            service_name, serve_state.ServiceStatus.CONTROLLER_FAILED)

-def update_service_encoded(service_name: str, version: int, mode: str) -> str:
-    service_status = _get_service_status(service_name)
+
+def update_service_encoded(service_name: str, version: int, mode: str,
+                           pool: bool) -> str:
+    noun = 'pool' if pool else 'service'
+    capnoun = noun.capitalize()
+    service_status = _get_service_status(service_name, pool=pool)
     if service_status is None:
         with ux_utils.print_exception_no_traceback():
-            raise ValueError(f'
+            raise ValueError(f'{capnoun} {service_name!r} does not exist.')
     controller_port = service_status['controller_port']
     resp = requests.post(
         _CONTROLLER_URL.format(CONTROLLER_PORT=controller_port) +
@@ -455,27 +596,30 @@ def update_service_encoded(service_name: str, version: int, mode: str) -> str:
         })
     if resp.status_code == 404:
         with ux_utils.print_exception_no_traceback():
+            # This only happens for services since pool is added after the
+            # update feature is introduced.
             raise ValueError(
                 'The service is up-ed in an old version and does not '
                 'support update. Please `sky serve down` '
                 'it first and relaunch the service. ')
     elif resp.status_code == 400:
         with ux_utils.print_exception_no_traceback():
-            raise ValueError(f'Client error during
+            raise ValueError(f'Client error during {noun} update: {resp.text}')
     elif resp.status_code == 500:
         with ux_utils.print_exception_no_traceback():
             raise RuntimeError(
-                f'Server error during
+                f'Server error during {noun} update: {resp.text}')
     elif resp.status_code != 200:
         with ux_utils.print_exception_no_traceback():
-            raise ValueError(f'Failed to update
+            raise ValueError(f'Failed to update {noun}: {resp.text}')

     service_msg = resp.json()['message']
     return message_utils.encode_payload(service_msg)


 def terminate_replica(service_name: str, replica_id: int, purge: bool) -> str:
-
+    # TODO(tian): Currently pool does not support terminating replica.
+    service_status = _get_service_status(service_name, pool=False)
     if service_status is None:
         with ux_utils.print_exception_no_traceback():
             raise ValueError(f'Service {service_name!r} does not exist.')
```
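`update_service_encoded` above turns controller HTTP responses into exceptions: 404 means the controller predates the update API, 400 surfaces as a client error, 500 as a server error, and any other non-200 as a generic failure. A condensed sketch of that dispatch, decoupled from SkyPilot types (URL and payload are placeholders):

```python
import requests


def post_update(url: str, payload: dict) -> str:
    """POST an update and map HTTP status codes onto exceptions."""
    resp = requests.post(url, json=payload)
    if resp.status_code == 404:
        # Endpoint missing: the server is too old to support updates.
        raise ValueError('Controller does not support update.')
    if resp.status_code == 400:
        raise ValueError(f'Client error during update: {resp.text}')
    if resp.status_code == 500:
        raise RuntimeError(f'Server error during update: {resp.text}')
    if resp.status_code != 200:
        raise ValueError(f'Failed to update: {resp.text}')
    return resp.json()['message']
```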
```diff
@@ -504,8 +648,21 @@ def terminate_replica(service_name: str, replica_id: int, purge: bool) -> str:
     return message


+def get_yaml_content(service_name: str, version: int) -> str:
+    yaml_content = serve_state.get_yaml_content(service_name, version)
+    if yaml_content is not None:
+        return yaml_content
+    # Backward compatibility for old service records that
+    # does not dump the yaml content to version database.
+    # TODO(tian): Remove this after 2 minor releases, i.e. 0.13.0.
+    latest_yaml_path = generate_task_yaml_file_name(service_name, version)
+    with open(latest_yaml_path, 'r', encoding='utf-8') as f:
+        return f.read()
+
+
 def _get_service_status(
         service_name: str,
+        pool: bool,
         with_replica_info: bool = True) -> Optional[Dict[str, Any]]:
     """Get the status dict of the service.

@@ -520,34 +677,105 @@ def _get_service_status(
     record = serve_state.get_service_from_name(service_name)
     if record is None:
         return None
+    if record['pool'] != pool:
+        return None
+
+    record['pool_yaml'] = ''
+    if record['pool']:
+        version = record['version']
+        try:
+            yaml_content = get_yaml_content(service_name, version)
+            raw_yaml_config = yaml_utils.read_yaml_str(yaml_content)
+        except Exception as e:  # pylint: disable=broad-except
+            # If this is a consolidation mode running without an PVC, the file
+            # might lost after an API server update (restart). In such case, we
+            # don't want it to crash the command. Fall back to an empty string.
+            logger.error(f'Failed to read YAML for service {service_name} '
+                         f'with version {version}: {e}')
+            record['pool_yaml'] = ''
+        else:
+            original_config = raw_yaml_config.get('_user_specified_yaml')
+            if original_config is None:
+                # Fall back to old display format.
+                original_config = raw_yaml_config
+                original_config.pop('run', None)
+                svc: Dict[str, Any] = original_config.pop('service')
+                if svc is not None:
+                    svc.pop('pool', None)  # Remove pool from service config
+                    original_config['pool'] = svc  # Add pool to root config
+            else:
+                original_config = yaml_utils.safe_load(original_config)
+            record['pool_yaml'] = yaml_utils.dump_yaml_str(original_config)
+
+    record['target_num_replicas'] = 0
+    try:
+        controller_port = record['controller_port']
+        resp = requests.get(
+            _CONTROLLER_URL.format(CONTROLLER_PORT=controller_port) +
+            '/autoscaler/info')
+        record['target_num_replicas'] = resp.json()['target_num_replicas']
+    except requests.exceptions.RequestException:
+        record['target_num_replicas'] = None
+    except Exception as e:  # pylint: disable=broad-except
+        logger.error(f'Failed to get autoscaler info for {service_name}: '
+                     f'{common_utils.format_exception(e)}\n'
+                     f'Traceback: {traceback.format_exc()}')
+
     if with_replica_info:
         record['replica_info'] = [
-            info.to_info_dict(with_handle=True)
+            info.to_info_dict(with_handle=True, with_url=not pool)
             for info in serve_state.get_replica_infos(service_name)
         ]
+        if pool:
+            for replica_info in record['replica_info']:
+                job_ids = managed_job_state.get_nonterminal_job_ids_by_pool(
+                    service_name, replica_info['name'])
+                replica_info['used_by'] = job_ids[0] if job_ids else None
     return record


-def
+def get_service_status_pickled(service_names: Optional[List[str]],
+                               pool: bool) -> List[Dict[str, str]]:
     service_statuses: List[Dict[str, str]] = []
     if service_names is None:
         # Get all service names
         service_names = serve_state.get_glob_service_names(None)
     for service_name in service_names:
-        service_status = _get_service_status(service_name)
+        service_status = _get_service_status(service_name, pool=pool)
         if service_status is None:
             continue
         service_statuses.append({
             k: base64.b64encode(pickle.dumps(v)).decode('utf-8')
             for k, v in service_status.items()
         })
+    return sorted(service_statuses, key=lambda x: x['name'])
+
+
+# TODO (kyuds): remove when serve codegen is removed
+def get_service_status_encoded(service_names: Optional[List[str]],
+                               pool: bool) -> str:
     # We have to use payload_type here to avoid the issue of
     # message_utils.decode_payload() not being able to correctly decode the
     # message with <sky-payload> tags.
+    service_statuses = get_service_status_pickled(service_names, pool)
     return message_utils.encode_payload(service_statuses,
                                         payload_type='service_status')


+def unpickle_service_status(
+        payload: List[Dict[str, str]]) -> List[Dict[str, Any]]:
+    service_statuses: List[Dict[str, Any]] = []
+    for service_status in payload:
+        if not isinstance(service_status, dict):
+            raise ValueError(f'Invalid service status: {service_status}')
+        service_statuses.append({
+            k: pickle.loads(base64.b64decode(v))
+            for k, v in service_status.items()
+        })
+    return service_statuses
+
+
+# TODO (kyuds): remove when serve codegen is removed
 def load_service_status(payload: str) -> List[Dict[str, Any]]:
     try:
         service_statuses_encoded = message_utils.decode_payload(
```
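`get_service_status_pickled` and the new `unpickle_service_status` above are inverses: each value in a status record is pickled and base64-encoded into plain text so whole records can ride inside a string payload, then decoded and unpickled on the other side. The round trip in isolation:

```python
import base64
import pickle
from typing import Any, Dict


def encode_record(record: Dict[str, Any]) -> Dict[str, str]:
    """Pickle then base64-encode every value, yielding a text-only dict."""
    return {
        k: base64.b64encode(pickle.dumps(v)).decode('utf-8')
        for k, v in record.items()
    }


def decode_record(encoded: Dict[str, str]) -> Dict[str, Any]:
    """Inverse of encode_record: base64-decode then unpickle each value."""
    return {k: pickle.loads(base64.b64decode(v)) for k, v in encoded.items()}


# Invented record; any picklable values survive the round trip.
record = {'name': 'sky-service-ab12', 'active_versions': [1, 2]}
assert decode_record(encode_record(record)) == record
```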
@@ -559,26 +787,85 @@ def load_service_status(payload: str) -> List[Dict[str, Any]]:
             service_statuses_encoded = message_utils.decode_payload(payload)
         else:
             raise
-
-    for service_status in service_statuses_encoded:
-        if not isinstance(service_status, dict):
-            raise ValueError(f'Invalid service status: {service_status}')
-        service_statuses.append({
-            k: pickle.loads(base64.b64decode(v))
-            for k, v in service_status.items()
-        })
-    return service_statuses
+    return unpickle_service_status(service_statuses_encoded)
 
 
+# TODO (kyuds): remove when serve codegen is removed
 def add_version_encoded(service_name: str) -> str:
     new_version = serve_state.add_version(service_name)
     return message_utils.encode_payload(new_version)
 
 
+# TODO (kyuds): remove when serve codegen is removed
 def load_version_string(payload: str) -> str:
     return message_utils.decode_payload(payload)
 
 
+def get_ready_replicas(
+        service_name: str) -> List['replica_managers.ReplicaInfo']:
+    logger.info(f'Get number of replicas for pool {service_name!r}')
+    return [
+        info for info in serve_state.get_replica_infos(service_name)
+        if info.status == serve_state.ReplicaStatus.READY
+    ]
+
+
+def get_next_cluster_name(service_name: str, job_id: int) -> Optional[str]:
+    """Get the next available cluster name from idle replicas.
+
+    Args:
+        service_name: The name of the service.
+        job_id: Optional job ID to associate with the acquired cluster.
+            If None, a placeholder will be used.
+
+    Returns:
+        The cluster name if an idle replica is found, None otherwise.
+    """
+    # Check if service exists
+    service_status = _get_service_status(service_name,
+                                         pool=True,
+                                         with_replica_info=False)
+    if service_status is None:
+        logger.error(f'Service {service_name!r} does not exist.')
+        return None
+    if not service_status['pool']:
+        logger.error(f'Service {service_name!r} is not a pool.')
+        return None
+    with filelock.FileLock(get_service_filelock_path(service_name)):
+        logger.debug(f'Get next cluster name for pool {service_name!r}')
+        ready_replicas = get_ready_replicas(service_name)
+        idle_replicas: List['replica_managers.ReplicaInfo'] = []
+        for replica_info in ready_replicas:
+            jobs_on_replica = managed_job_state.get_nonterminal_job_ids_by_pool(
+                service_name, replica_info.cluster_name)
+            # TODO(tian): Make it resources aware. Currently we allow and only
+            # allow one job per replica. In the following PR, we should:
+            # i) When the replica is launched with `any_of` resources (
+            #    replicas can have different resources), we should check if
+            #    the resources that jobs require are available on the replica.
+            #    e.g., if a job requires A100:1 on a {L4:1, A100:1} pool, it
+            #    should only goes to replica with A100.
+            # ii) When a job only requires a subset of the resources on the
+            #     replica, each replica should be able to handle multiple jobs
+            #     at the same time. e.g., if a job requires A100:1 on a A100:8
+            #     pool, it should be able to run 4 jobs at the same time.
+            if not jobs_on_replica:
+                idle_replicas.append(replica_info)
+        if not idle_replicas:
+            logger.info(f'No idle replicas found for pool {service_name!r}')
+            return None
+
+        # Select the first idle replica.
+        # TODO(tian): "Load balancing" policy.
+        replica_info = idle_replicas[0]
+        logger.info(f'Selected replica {replica_info.replica_id} with cluster '
+                    f'{replica_info.cluster_name!r} for job {job_id!r} in pool '
+                    f'{service_name!r}')
+        managed_job_state.set_current_cluster_name(job_id,
+                                                   replica_info.cluster_name)
+        return replica_info.cluster_name
+
+
 def _terminate_failed_services(
     service_name: str,
     service_status: Optional[serve_state.ServiceStatus]) -> Optional[str]:
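The pool scheduler added in this hunk picks the first READY replica whose cluster has no nonterminal jobs. A minimal sketch of that selection policy, with ReplicaInfo and jobs_by_cluster as simplified stand-ins for the real serve_state / managed_job_state tables:

    import dataclasses
    from typing import Dict, List, Optional

    @dataclasses.dataclass
    class ReplicaInfo:
        replica_id: int
        cluster_name: str
        ready: bool

    def pick_idle_cluster(replicas: List[ReplicaInfo],
                          jobs_by_cluster: Dict[str, List[int]]) -> Optional[str]:
        """Return the first READY replica whose cluster runs no jobs."""
        for info in replicas:
            # One job per replica for now, mirroring the TODO in the diff.
            if info.ready and not jobs_by_cluster.get(info.cluster_name):
                return info.cluster_name
        return None

    replicas = [
        ReplicaInfo(1, 'pool-1-worker-1', ready=True),
        ReplicaInfo(2, 'pool-1-worker-2', ready=True),
    ]
    # Worker 1 is busy with job 7, so worker 2 is selected.
    assert pick_idle_cluster(replicas,
                             {'pool-1-worker-1': [7]}) == 'pool-1-worker-2'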
@@ -598,8 +885,8 @@ def _terminate_failed_services(
     # replicas, so we don't need to try again here.
     for replica_info in serve_state.get_replica_infos(service_name):
         # TODO(tian): Refresh latest status of the cluster.
-        if global_user_state.
-                replica_info.cluster_name)
+        if global_user_state.cluster_with_name_exists(
+                replica_info.cluster_name):
             remaining_replica_clusters.append(f'{replica_info.cluster_name!r}')
         serve_state.remove_replica(service_name, replica_info.replica_id)
@@ -608,9 +895,11 @@ def _terminate_failed_services(
     shutil.rmtree(service_dir)
     serve_state.remove_service(service_name)
     serve_state.delete_all_versions(service_name)
+    serve_state.remove_ha_recovery_script(service_name)
 
     if not remaining_replica_clusters:
         return None
+    # TODO(tian): Try to terminate those replica clusters.
     remaining_identity = ', '.join(remaining_replica_clusters)
     return (f'{colorama.Fore.YELLOW}terminate service {service_name!r} with '
             f'failed status ({service_status}). This may indicate a resource '
@@ -618,17 +907,38 @@ def _terminate_failed_services(
             f'controller: {remaining_identity}{colorama.Style.RESET_ALL}')
 
 
-def terminate_services(service_names: Optional[List[str]], purge: bool
+def terminate_services(service_names: Optional[List[str]], purge: bool,
+                       pool: bool) -> str:
+    noun = 'pool' if pool else 'service'
+    capnoun = noun.capitalize()
     service_names = serve_state.get_glob_service_names(service_names)
     terminated_service_names: List[str] = []
     messages: List[str] = []
     for service_name in service_names:
         service_status = _get_service_status(service_name,
+                                             pool=pool,
                                              with_replica_info=False)
+        if service_status is None:
+            continue
         if (service_status is not None and service_status['status']
                 == serve_state.ServiceStatus.SHUTTING_DOWN):
             # Already scheduled to be terminated.
             continue
+        if pool:
+            nonterminal_job_ids = (
+                managed_job_state.get_nonterminal_job_ids_by_pool(service_name))
+            if nonterminal_job_ids:
+                nonterminal_job_ids_str = ','.join(
+                    str(job_id) for job_id in nonterminal_job_ids)
+                num_nonterminal_jobs = len(nonterminal_job_ids)
+                messages.append(
+                    f'{colorama.Fore.YELLOW}{capnoun} {service_name!r} has '
+                    f'{num_nonterminal_jobs} nonterminal jobs: '
+                    f'{nonterminal_job_ids_str}. To terminate the {noun}, '
+                    f'please run `sky jobs cancel --pool {service_name}` to '
+                    'cancel all jobs in the pool first.'
+                    f'{colorama.Style.RESET_ALL}')
+                continue
         # If the `services` and `version_specs` table are not aligned, it might
         # result in a None service status. In this case, the controller process
         # is not functioning as well and we should also use the
@@ -636,10 +946,11 @@ def terminate_services(service_names: Optional[List[str]], purge: bool) -> str:
         # This is a safeguard for a rare case, that is accidentally abort
         # between `serve_state.add_service` and
         # `serve_state.add_or_update_version` in service.py.
-
+        purge_cmd = (f'sky jobs pool down {service_name} --purge'
+                     if pool else f'sky serve down {service_name} --purge')
+        if (service_status['status']
                 in serve_state.ServiceStatus.failed_statuses()):
-            failed_status =
-            if service_status is not None else None)
+            failed_status = service_status['status']
            if purge:
                 message = _terminate_failed_services(service_name,
                                                      failed_status)
@@ -647,11 +958,10 @@ def terminate_services(service_names: Optional[List[str]], purge: bool) -> str:
                 messages.append(message)
             else:
                 messages.append(
-                    f'{colorama.Fore.YELLOW}
+                    f'{colorama.Fore.YELLOW}{capnoun} {service_name!r} is in '
                     f'failed status ({failed_status}). Skipping '
                     'its termination as it could lead to a resource leak. '
-                    f'(Use `
-                    'forcefully terminate the service.)'
+                    f'(Use `{purge_cmd}` to forcefully terminate the {noun}.)'
                     f'{colorama.Style.RESET_ALL}')
                 # Don't add to terminated_service_names since it's not
                 # actually terminated.
@@ -668,17 +978,18 @@ def terminate_services(service_names: Optional[List[str]], purge: bool) -> str:
             f.flush()
         terminated_service_names.append(f'{service_name!r}')
     if not terminated_service_names:
-        messages.append('No
+        messages.append(f'No {noun} to terminate.')
     else:
-        identity_str = f'
+        identity_str = f'{capnoun} {terminated_service_names[0]} is'
         if len(terminated_service_names) > 1:
             terminated_service_names_str = ', '.join(terminated_service_names)
-            identity_str = f'
+            identity_str = f'{capnoun}s {terminated_service_names_str} are'
     messages.append(f'{identity_str} scheduled to be terminated.')
     return '\n'.join(messages)
 
 
-def wait_service_registration(service_name: str, job_id: int
+def wait_service_registration(service_name: str, job_id: int,
+                              pool: bool) -> str:
     """Util function to call at the end of `sky.serve.up()`.
 
     This function will:
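The termination path above now refuses to tear down a pool while it still has nonterminal jobs, steering the user to cancel them first. A minimal sketch of that guard, with the job list passed in directly instead of read from managed_job_state:

    from typing import List

    def can_terminate_pool(nonterminal_job_ids: List[int],
                           pool_name: str) -> str:
        """Mirror the guard: block teardown while jobs are nonterminal."""
        if nonterminal_job_ids:
            ids = ','.join(str(j) for j in nonterminal_job_ids)
            return (f'Pool {pool_name!r} has {len(nonterminal_job_ids)} '
                    f'nonterminal jobs: {ids}. Run '
                    f'`sky jobs cancel --pool {pool_name}` first.')
        return f'Pool {pool_name!r} is scheduled to be terminated.'

    print(can_terminate_pool([3, 5], 'my-pool'))  # Blocked with a hint.
    print(can_terminate_pool([], 'my-pool'))      # Proceeds.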
@@ -691,49 +1002,67 @@ def wait_service_registration(service_name: str, job_id: int) -> str:
     Returns:
         Encoded load balancer port assigned to the service.
     """
+    # TODO (kyuds): when codegen is fully deprecated, return the lb port
+    # as an int directly instead of encoding it.
     start_time = time.time()
     setup_completed = False
+    noun = 'pool' if pool else 'service'
     while True:
-
-
-
-
-        if
-
-
-
-
-
-
-
-
-
-
-
-
+        # Only do this check for non-consolidation mode as consolidation mode
+        # has no setup process.
+        if not is_consolidation_mode(pool):
+            job_status = job_lib.get_status(job_id)
+            if job_status is None or job_status < job_lib.JobStatus.RUNNING:
+                # Wait for the controller process to finish setting up. It
+                # can be slow if a lot cloud dependencies are being installed.
+                if (time.time() - start_time >
+                        constants.CONTROLLER_SETUP_TIMEOUT_SECONDS):
+                    with ux_utils.print_exception_no_traceback():
+                        raise RuntimeError(
+                            f'Failed to start the controller process for '
+                            f'the {noun} {service_name!r} within '
+                            f'{constants.CONTROLLER_SETUP_TIMEOUT_SECONDS}'
+                            f' seconds.')
+                # No need to check the service status as the controller process
+                # is still setting up.
+                time.sleep(1)
+                continue
 
         if not setup_completed:
             setup_completed = True
             # Reset the start time to wait for the service to be registered.
             start_time = time.time()
 
-        record =
+        record = _get_service_status(service_name,
+                                     pool=pool,
+                                     with_replica_info=False)
         if record is not None:
             if job_id != record['controller_job_id']:
+                if pool:
+                    command_to_run = 'sky jobs pool apply --pool'
+                else:
+                    command_to_run = 'sky serve update'
                 with ux_utils.print_exception_no_traceback():
                     raise ValueError(
-                        f'The
-                        'Please specify a different name for your
-                        'To update an existing
-                        f'{service_name} <new-
+                        f'The {noun} {service_name!r} is already running. '
+                        f'Please specify a different name for your {noun}. '
+                        f'To update an existing {noun}, run: {command_to_run}'
+                        f' {service_name} <new-{noun}-yaml>')
             lb_port = record['load_balancer_port']
             if lb_port is not None:
                 return message_utils.encode_payload(lb_port)
-
-
-
-
-
+            else:
+                controller_log_path = os.path.expanduser(
+                    generate_remote_controller_log_file_name(service_name))
+                if os.path.exists(controller_log_path):
+                    with open(controller_log_path, 'r', encoding='utf-8') as f:
+                        log_content = f.read()
+                    if (constants.MAX_NUMBER_OF_SERVICES_REACHED_ERROR
+                            in log_content):
+                        with ux_utils.print_exception_no_traceback():
+                            raise RuntimeError('Max number of services reached. '
+                                               'To spin up more services, please '
+                                               'tear down some existing services.')
         elapsed = time.time() - start_time
         if elapsed > constants.SERVICE_REGISTER_TIMEOUT_SECONDS:
             # Print the controller log to help user debug.
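The registration wait above is a two-phase poll: first wait (against a setup timeout) for the controller job to reach RUNNING, then reset the clock and wait for the service record to appear. A minimal, self-contained sketch of that pattern, where poll_job_running and poll_service_record are hypothetical stand-ins for the job_lib / serve_state lookups:

    import time
    from typing import Callable, Optional

    def two_phase_wait(poll_job_running: Callable[[], bool],
                       poll_service_record: Callable[[], Optional[int]],
                       setup_timeout: float = 300.0,
                       register_timeout: float = 60.0) -> int:
        start = time.time()
        setup_completed = False
        while True:
            # Phase 1: controller process setup.
            if not poll_job_running():
                if time.time() - start > setup_timeout:
                    raise RuntimeError('Controller process failed to start.')
                time.sleep(1)
                continue
            if not setup_completed:
                setup_completed = True
                start = time.time()  # Reset the clock for phase 2.
            # Phase 2: service registration.
            port = poll_service_record()
            if port is not None:
                return port
            if time.time() - start > register_timeout:
                raise RuntimeError('Service did not register in time.')
            time.sleep(1)

    # Both conditions hold immediately, so this returns without sleeping.
    assert two_phase_wait(lambda: True, lambda: 30001) == 30001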
@@ -754,12 +1083,16 @@ def load_service_initialization_result(payload: str) -> int:
     return message_utils.decode_payload(payload)
 
 
-def
-
+def _check_service_status_healthy(service_name: str,
+                                  pool: bool) -> Optional[str]:
+    service_record = _get_service_status(service_name,
+                                         pool,
+                                         with_replica_info=False)
+    capnoun = 'Service' if not pool else 'Pool'
     if service_record is None:
-        return f'
+        return f'{capnoun} {service_name!r} does not exist.'
     if service_record['status'] == serve_state.ServiceStatus.CONTROLLER_INIT:
-        return (f'
+        return (f'{capnoun} {service_name!r} is still initializing its '
                 'controller. Please try again later.')
     return None
@@ -782,6 +1115,89 @@ def get_latest_version_with_min_replicas(
     return active_versions[-1] if active_versions else None
 
 
+def _process_line(
+        line: str,
+        cluster_name: str,
+        stop_on_eof: bool = False,
+        streamed_provision_log_paths: Optional[set] = None) -> Iterator[str]:
+    # The line might be directing users to view logs, like
+    # `✓ Cluster launched: new-http. View logs at: *.log`
+    # We should tail the detailed logs for user.
+    def cluster_is_up() -> bool:
+        status = global_user_state.get_status_from_cluster_name(cluster_name)
+        return status == status_lib.ClusterStatus.UP
+
+    provision_api_log_prompt = re.match(_SKYPILOT_PROVISION_API_LOG_PATTERN,
+                                        line)
+    provision_log_cmd_prompt = re.match(_SKYPILOT_PROVISION_LOG_CMD_PATTERN,
+                                        line)
+    log_prompt = re.match(_SKYPILOT_LOG_PATTERN, line)
+
+    def _stream_provision_path(p: pathlib.Path) -> Iterator[str]:
+        # Check if this provision log has already been streamed to avoid
+        # duplicate expansion. When a Kubernetes cluster needs to pull a Docker
+        # image, rich spinner updates can produce hundreds of lines matching
+        # _SKYPILOT_PROVISION_LOG_CMD_PATTERN (e.g., "Launching (1 pod(s)
+        # pending due to Pulling)... View logs: sky logs --provision ...").
+        # Without this check, the same provision log would be expanded hundreds
+        # of times, creating huge log files (30M+) and making users think the
+        # system is stuck in an infinite loop.
+        if streamed_provision_log_paths is not None:
+            resolved_path = str(p.resolve())
+            if resolved_path in streamed_provision_log_paths:
+                return
+            streamed_provision_log_paths.add(resolved_path)
+
+        try:
+            with open(p, 'r', newline='', encoding='utf-8') as f:
+                # Exit if >10s without new content to avoid hanging when INIT
+                yield from log_utils.follow_logs(f,
+                                                 should_stop=cluster_is_up,
+                                                 stop_on_eof=stop_on_eof,
+                                                 idle_timeout_seconds=10)
+        except FileNotFoundError:
+            # Fall back cleanly if the hinted path doesn't exist
+            yield line
+            yield (f'{colorama.Fore.YELLOW}{colorama.Style.BRIGHT}'
+                   f'Try to expand log file {p} but not found. Skipping...'
+                   f'{colorama.Style.RESET_ALL}')
+        return
+
+    if provision_api_log_prompt is not None:
+        rel_path = provision_api_log_prompt.group(1)
+        nested_log_path = pathlib.Path(
+            skylet_constants.SKY_LOGS_DIRECTORY).expanduser().joinpath(
+                rel_path).resolve()
+        yield from _stream_provision_path(nested_log_path)
+        return
+
+    if provision_log_cmd_prompt is not None:
+        # Resolve provision log via cluster table first, then history.
+        log_path_str = global_user_state.get_cluster_provision_log_path(
+            cluster_name)
+        if not log_path_str:
+            log_path_str = (
+                global_user_state.get_cluster_history_provision_log_path(
+                    cluster_name))
+        if not log_path_str:
+            yield line
+            return
+        yield from _stream_provision_path(
+            pathlib.Path(log_path_str).expanduser().resolve())
+        return
+
+    if log_prompt is not None:
+        # Now we skip other logs (file sync logs) since we lack
+        # utility to determine when these log files are finished
+        # writing.
+        # TODO(tian): We should not skip these logs since there are
+        # small chance that error will happen in file sync. Need to
+        # find a better way to do this.
+        return
+
+    yield line
+
+
 def _follow_logs_with_provision_expanding(
     file: TextIO,
     cluster_name: str,
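The deduplication described in the comment above hinges on a shared set of resolved paths: each provision log is expanded at most once even when hundreds of spinner lines reference it. A minimal sketch of just that guard, with file reading standing in for the real follow_logs call:

    import pathlib
    from typing import Iterator, Set

    def expand_once(path: pathlib.Path, seen: Set[str]) -> Iterator[str]:
        resolved = str(path.resolve())
        if resolved in seen:
            return  # Already streamed; skip re-expansion.
        seen.add(resolved)
        try:
            with open(resolved, 'r', encoding='utf-8') as f:
                yield from f
        except FileNotFoundError:
            yield f'Try to expand log file {path} but not found. Skipping...\n'

    seen: Set[str] = set()
    hint = pathlib.Path('provision.log')
    first = list(expand_once(hint, seen))   # Streams (or reports missing file).
    second = list(expand_once(hint, seen))  # Deduplicated: yields nothing.
    assert second == []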
@@ -803,52 +1219,14 @@ def _follow_logs_with_provision_expanding(
     Yields:
         Log lines, including expanded content from referenced provision logs.
     """
-
-    def cluster_is_up() -> bool:
-        cluster_record = global_user_state.get_cluster_from_name(cluster_name)
-        if cluster_record is None:
-            return False
-        return cluster_record['status'] == status_lib.ClusterStatus.UP
+    streamed_provision_log_paths: set = set()
 
     def process_line(line: str) -> Iterator[str]:
-
-
-
-
-
-
-        if provision_log_prompt is not None:
-            nested_log_path = os.path.expanduser(provision_log_prompt.group(1))
-
-            try:
-                with open(nested_log_path, 'r', newline='',
-                          encoding='utf-8') as f:
-                    # We still exit if more than 10 seconds without new content
-                    # to avoid any internal bug that causes the launch to fail
-                    # while cluster status remains INIT.
-                    yield from log_utils.follow_logs(f,
-                                                     should_stop=cluster_is_up,
-                                                     stop_on_eof=stop_on_eof,
-                                                     idle_timeout_seconds=10)
-            except FileNotFoundError:
-                yield line
-
-                yield (f'{colorama.Fore.YELLOW}{colorama.Style.BRIGHT}'
-                       f'Try to expand log file {nested_log_path} but not '
-                       f'found. Skipping...{colorama.Style.RESET_ALL}')
-                pass
-            return
-
-        if log_prompt is not None:
-            # Now we skip other logs (file sync logs) since we lack
-            # utility to determine when these log files are finished
-            # writing.
-            # TODO(tian): We should not skip these logs since there are
-            # small chance that error will happen in file sync. Need to
-            # find a better way to do this.
-            return
-
-        yield line
+        yield from _process_line(
+            line,
+            cluster_name,
+            stop_on_eof=stop_on_eof,
+            streamed_provision_log_paths=streamed_provision_log_paths)
 
     return log_utils.follow_logs(file,
                                  should_stop=should_stop,
@@ -857,24 +1235,62 @@ def _follow_logs_with_provision_expanding(
                                  idle_timeout_seconds=idle_timeout_seconds)
 
 
-def
-
-
+def _capped_follow_logs_with_provision_expanding(
+    log_list: List[str],
+    cluster_name: str,
+    *,
+    line_cap: int = 100,
+) -> Iterator[str]:
+    """Follows logs and expands any provision.log references found.
+
+    Args:
+        log_list: List of Log Lines to read from.
+        cluster_name: Name of the cluster being launched.
+        line_cap: Number of last lines to return
+
+    Yields:
+        Log lines, including expanded content from referenced provision logs.
+    """
+    all_lines: Deque[str] = collections.deque(maxlen=line_cap)
+    streamed_provision_log_paths: set = set()
+
+    for line in log_list:
+        for processed in _process_line(
+                line=line,
+                cluster_name=cluster_name,
+                stop_on_eof=False,
+                streamed_provision_log_paths=streamed_provision_log_paths):
+            all_lines.append(processed)
+
+    yield from all_lines
+
+
+def stream_replica_logs(service_name: str, replica_id: int, follow: bool,
+                        tail: Optional[int], pool: bool) -> str:
+    msg = _check_service_status_healthy(service_name, pool=pool)
     if msg is not None:
         return msg
+    repnoun = 'worker' if pool else 'replica'
+    caprepnoun = repnoun.capitalize()
     print(f'{colorama.Fore.YELLOW}Start streaming logs for launching process '
-          f'of
-
+          f'of {repnoun} {replica_id}.{colorama.Style.RESET_ALL}')
     log_file_name = generate_replica_log_file_name(service_name, replica_id)
     if os.path.exists(log_file_name):
-
-
+        if tail is not None:
+            lines = common_utils.read_last_n_lines(log_file_name, tail)
+            for line in lines:
+                if not line.endswith('\n'):
+                    line += '\n'
+                print(line, end='', flush=True)
+        else:
+            with open(log_file_name, 'r', encoding='utf-8') as f:
+                print(f.read(), flush=True)
         return ''
 
     launch_log_file_name = generate_replica_launch_log_file_name(
         service_name, replica_id)
     if not os.path.exists(launch_log_file_name):
-        return (f'{colorama.Fore.RED}
+        return (f'{colorama.Fore.RED}{caprepnoun} {replica_id} doesn\'t exist.'
                 f'{colorama.Style.RESET_ALL}')
 
     replica_cluster_name = generate_replica_cluster_name(
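The capped variant above relies on collections.deque with maxlen: only the last line_cap processed lines survive, so expanded provision logs cannot blow up the output when a tail is requested. A minimal sketch of that mechanism, with lines passed through unchanged instead of going through _process_line:

    import collections
    from typing import Iterator, List

    def capped_tail(lines: List[str], line_cap: int = 100) -> Iterator[str]:
        # maxlen evicts the oldest entries automatically as new ones arrive.
        last_lines: collections.deque = collections.deque(maxlen=line_cap)
        for line in lines:
            last_lines.append(line)
        yield from last_lines

    assert list(capped_tail([f'line {i}' for i in range(10)], line_cap=3)) == [
        'line 7', 'line 8', 'line 9'
    ]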
@@ -891,42 +1307,89 @@ def stream_replica_logs(service_name: str, replica_id: int,
 
     replica_provisioned = (
         lambda: _get_replica_status() != serve_state.ReplicaStatus.PROVISIONING)
-
-
-
-
-
-
-
-
+
+    # Handle launch logs based on number parameter
+    final_lines_to_print = []
+    if tail is not None:
+        static_lines = common_utils.read_last_n_lines(launch_log_file_name,
+                                                      tail)
+        lines = list(
+            _capped_follow_logs_with_provision_expanding(
+                log_list=static_lines,
+                cluster_name=replica_cluster_name,
+                line_cap=tail,
+            ))
+        final_lines_to_print += lines
+    else:
+        with open(launch_log_file_name, 'r', newline='', encoding='utf-8') as f:
+            for line in _follow_logs_with_provision_expanding(
+                    f,
+                    replica_cluster_name,
+                    should_stop=replica_provisioned,
+                    stop_on_eof=not follow,
+            ):
+                print(line, end='', flush=True)
 
     if (not follow and
             _get_replica_status() == serve_state.ReplicaStatus.PROVISIONING):
         # Early exit if not following the logs.
+        if tail is not None:
+            for line in final_lines_to_print:
+                if not line.endswith('\n'):
+                    line += '\n'
+                print(line, end='', flush=True)
         return ''
 
     backend = backends.CloudVmRayBackend()
     handle = global_user_state.get_handle_from_cluster_name(
         replica_cluster_name)
     if handle is None:
+        if tail is not None:
+            for line in final_lines_to_print:
+                if not line.endswith('\n'):
+                    line += '\n'
+                print(line, end='', flush=True)
         return _FAILED_TO_FIND_REPLICA_MSG.format(replica_id=replica_id)
     assert isinstance(handle, backends.CloudVmRayResourceHandle), handle
 
     # Notify user here to make sure user won't think the log is finished.
     print(f'{colorama.Fore.YELLOW}Start streaming logs for task job '
-          f'of
+          f'of {repnoun} {replica_id}...{colorama.Style.RESET_ALL}')
 
     # Always tail the latest logs, which represent user setup & run.
-
-
-
-
+    if tail is None:
+        returncode = backend.tail_logs(handle, job_id=None, follow=follow)
+        if returncode != 0:
+            return (f'{colorama.Fore.RED}Failed to stream logs for {repnoun} '
+                    f'{replica_id}.{colorama.Style.RESET_ALL}')
+    elif not follow and tail > 0:
+        final = backend.tail_logs(handle,
+                                  job_id=None,
+                                  follow=follow,
+                                  tail=tail,
+                                  stream_logs=False,
+                                  require_outputs=True,
+                                  process_stream=True)
+        if isinstance(final, int) or (final[0] != 0 and final[0] != 101):
+            if tail is not None:
+                for line in final_lines_to_print:
+                    if not line.endswith('\n'):
+                        line += '\n'
+                    print(line, end='', flush=True)
+            return (f'{colorama.Fore.RED}Failed to stream logs for replica '
+                    f'{replica_id}.{colorama.Style.RESET_ALL}')
+        final_lines_to_print += final[1].splitlines()
+        for line in final_lines_to_print[-tail:]:
+            if not line.endswith('\n'):
+                line += '\n'
+            print(line, end='', flush=True)
     return ''
 
 
 def stream_serve_process_logs(service_name: str, stream_controller: bool,
-                              follow: bool
-
+                              follow: bool, tail: Optional[int],
+                              pool: bool) -> str:
+    msg = _check_service_status_healthy(service_name, pool)
     if msg is not None:
         return msg
     if stream_controller:
@@ -935,19 +1398,31 @@ def stream_serve_process_logs(service_name: str, stream_controller: bool,
         log_file = generate_remote_load_balancer_log_file_name(service_name)
 
     def _service_is_terminal() -> bool:
-        record =
+        record = _get_service_status(service_name,
+                                     pool,
+                                     with_replica_info=False)
         if record is None:
             return True
         return record['status'] in serve_state.ServiceStatus.failed_statuses()
 
-
-
-
-
-
-
-        ):
+    if tail is not None:
+        lines = common_utils.read_last_n_lines(os.path.expanduser(log_file),
+                                               tail)
+        for line in lines:
+            if not line.endswith('\n'):
+                line += '\n'
             print(line, end='', flush=True)
+    else:
+        with open(os.path.expanduser(log_file),
+                  'r',
+                  newline='',
+                  encoding='utf-8') as f:
+            for line in log_utils.follow_logs(
+                    f,
+                    should_stop=_service_is_terminal,
+                    stop_on_eof=not follow,
+            ):
+                print(line, end='', flush=True)
     return ''
@@ -965,18 +1440,25 @@ def _get_replicas(service_record: Dict[str, Any]) -> str:
     return f'{ready_replica_num}/{total_replica_num}'
 
 
-def format_service_table(service_records: List[Dict[str, Any]],
-
+def format_service_table(service_records: List[Dict[str, Any]], show_all: bool,
+                         pool: bool) -> str:
+    noun = 'pool' if pool else 'service'
     if not service_records:
-        return 'No existing
+        return f'No existing {noun}s.'
 
     service_columns = [
-        'NAME', 'VERSION', 'UPTIME', 'STATUS',
+        'NAME', 'VERSION', 'UPTIME', 'STATUS',
+        'REPLICAS' if not pool else 'WORKERS'
     ]
+    if not pool:
+        service_columns.append('ENDPOINT')
     if show_all:
         service_columns.extend([
             'AUTOSCALING_POLICY', 'LOAD_BALANCING_POLICY', 'REQUESTED_RESOURCES'
         ])
+        if pool:
+            # Remove the load balancing policy column for pools.
+            service_columns.pop(-2)
     service_table = log_utils.create_table(service_columns)
 
     replica_infos: List[Dict[str, Any]] = []
@@ -1007,37 +1489,44 @@ def format_service_table(service_records: List[Dict[str, Any]],
             uptime,
             status_str,
             replicas,
-            endpoint,
         ]
+        if not pool:
+            service_values.append(endpoint)
         if show_all:
             service_values.extend(
                 [policy, load_balancing_policy, requested_resources_str])
+            if pool:
+                service_values.pop(-2)
         service_table.add_row(service_values)
 
-    replica_table = _format_replica_table(replica_infos, show_all)
+    replica_table = _format_replica_table(replica_infos, show_all, pool)
+    replica_noun = 'Pool Workers' if pool else 'Service Replicas'
     return (f'{service_table}\n'
             f'\n{colorama.Fore.CYAN}{colorama.Style.BRIGHT}'
-            f'
+            f'{replica_noun}{colorama.Style.RESET_ALL}\n'
             f'{replica_table}')
 
 
-def _format_replica_table(replica_records: List[Dict[str, Any]],
-
+def _format_replica_table(replica_records: List[Dict[str, Any]], show_all: bool,
+                          pool: bool) -> str:
+    noun = 'worker' if pool else 'replica'
     if not replica_records:
-        return 'No existing
+        return f'No existing {noun}s.'
 
     replica_columns = [
-        'SERVICE_NAME', 'ID', 'VERSION', 'ENDPOINT',
-        '
+        'POOL_NAME' if pool else 'SERVICE_NAME', 'ID', 'VERSION', 'ENDPOINT',
+        'LAUNCHED', 'INFRA', 'RESOURCES', 'STATUS'
     ]
-    if
-        replica_columns.append('
+    if pool:
+        replica_columns.append('USED_BY')
+        # Remove the endpoint column for pool workers.
+        replica_columns.pop(3)
     replica_table = log_utils.create_table(replica_columns)
 
     truncate_hint = ''
     if not show_all:
         if len(replica_records) > _REPLICA_TRUNC_NUM:
-            truncate_hint = '\n... (use --all to show all
+            truncate_hint = f'\n... (use --all to show all {noun}s)'
         replica_records = replica_records[:_REPLICA_TRUNC_NUM]
 
     for record in replica_records:
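Pools reuse the service/replica tables above but rename and drop columns positionally, so the pop indexes must match the column order exactly. A minimal sketch of the column assembly for the replica table, mirroring the list built in this hunk:

    from typing import List

    def replica_columns(pool: bool) -> List[str]:
        columns = [
            'POOL_NAME' if pool else 'SERVICE_NAME', 'ID', 'VERSION',
            'ENDPOINT', 'LAUNCHED', 'INFRA', 'RESOURCES', 'STATUS'
        ]
        if pool:
            columns.append('USED_BY')
            columns.pop(3)  # Pool workers have no ENDPOINT column.
        return columns

    assert replica_columns(pool=False)[:4] == [
        'SERVICE_NAME', 'ID', 'VERSION', 'ENDPOINT'
    ]
    assert 'ENDPOINT' not in replica_columns(pool=True)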
@@ -1047,21 +1536,26 @@ def _format_replica_table(replica_records: List[Dict[str, Any]],
         version = (record['version'] if 'version' in record else '-')
         replica_endpoint = endpoint if endpoint else '-'
         launched_at = log_utils.readable_time_duration(record['launched_at'])
+        infra = '-'
         resources_str = '-'
         replica_status = record['status']
         status_str = replica_status.colored_str()
-
-
+        used_by = record.get('used_by', None)
+        used_by_str = str(used_by) if used_by is not None else '-'
 
         replica_handle: Optional['backends.CloudVmRayResourceHandle'] = record[
             'handle']
         if replica_handle is not None:
-
-
-
-
-
-
+            infra = replica_handle.launched_resources.infra.formatted_str()
+            simplified = not show_all
+            resources_str_simple, resources_str_full = (
+                resources_utils.get_readable_resources_repr(
+                    replica_handle, simplified_only=simplified))
+            if simplified:
+                resources_str = resources_str_simple
+            else:
+                assert resources_str_full is not None
+                resources_str = resources_str_full
 
         replica_values = [
             service_name,
@@ -1069,18 +1563,20 @@ def _format_replica_table(replica_records: List[Dict[str, Any]],
             version,
             replica_endpoint,
             launched_at,
+            infra,
             resources_str,
             status_str,
-            region,
         ]
-        if
-            replica_values.append(
+        if pool:
+            replica_values.append(used_by_str)
+            replica_values.pop(3)
         replica_table.add_row(replica_values)
 
     return f'{replica_table}{truncate_hint}'
 
 
 # =========================== CodeGen for Sky Serve ===========================
+# TODO (kyuds): deprecate and remove serve codegen entirely.
 
 
 # TODO(tian): Use REST API instead of SSH in the future. This codegen pattern
@@ -1099,13 +1595,16 @@ class ServeCodeGen:
         'from sky.serve import serve_state',
         'from sky.serve import serve_utils',
         'from sky.serve import constants',
+        'serve_version = constants.SERVE_VERSION',
     ]
 
     @classmethod
-    def get_service_status(cls, service_names: Optional[List[str]]
+    def get_service_status(cls, service_names: Optional[List[str]],
+                           pool: bool) -> str:
         code = [
-            f'
-            '
+            f'kwargs={{}} if serve_version < 3 else {{"pool": {pool}}}',
+            f'msg = serve_utils.get_service_status_encoded({service_names!r}, '
+            '**kwargs)', 'print(msg, end="", flush=True)'
         ]
         return cls._build(code)
@@ -1118,11 +1617,12 @@ class ServeCodeGen:
         return cls._build(code)
 
     @classmethod
-    def terminate_services(cls, service_names: Optional[List[str]],
-
+    def terminate_services(cls, service_names: Optional[List[str]], purge: bool,
+                           pool: bool) -> str:
         code = [
+            f'kwargs={{}} if serve_version < 3 else {{"pool": {pool}}}',
             f'msg = serve_utils.terminate_services({service_names!r}, '
-            f'purge={purge})', 'print(msg, end="", flush=True)'
+            f'purge={purge}, **kwargs)', 'print(msg, end="", flush=True)'
         ]
         return cls._build(code)
@@ -1139,29 +1639,48 @@ class ServeCodeGen:
         return cls._build(code)
 
     @classmethod
-    def wait_service_registration(cls, service_name: str, job_id: int
+    def wait_service_registration(cls, service_name: str, job_id: int,
+                                  pool: bool) -> str:
         code = [
+            f'kwargs={{}} if serve_version < 4 else {{"pool": {pool}}}',
            'msg = serve_utils.wait_service_registration('
-            f'{service_name!r}, {job_id}
+            f'{service_name!r}, {job_id}, **kwargs)',
+            'print(msg, end="", flush=True)'
         ]
         return cls._build(code)
 
     @classmethod
     def stream_replica_logs(cls, service_name: str, replica_id: int,
-                            follow: bool
+                            follow: bool, tail: Optional[int],
+                            pool: bool) -> str:
         code = [
+            f'kwargs={{}} if serve_version < 5 else {{"pool": {pool}}}',
             'msg = serve_utils.stream_replica_logs('
-            f'{service_name!r}, {replica_id!r}, follow={follow}
-            'print(msg, flush=True)'
+            f'{service_name!r}, {replica_id!r}, follow={follow}, tail={tail}, '
+            '**kwargs)', 'print(msg, flush=True)'
         ]
         return cls._build(code)
 
     @classmethod
     def stream_serve_process_logs(cls, service_name: str,
-                                  stream_controller: bool, follow: bool
+                                  stream_controller: bool, follow: bool,
+                                  tail: Optional[int], pool: bool) -> str:
         code = [
+            f'kwargs={{}} if serve_version < 5 else {{"pool": {pool}}}',
             f'msg = serve_utils.stream_serve_process_logs({service_name!r}, '
-            f'{stream_controller}, follow={follow}
+            f'{stream_controller}, follow={follow}, tail={tail}, **kwargs)',
+            'print(msg, flush=True)'
+        ]
+        return cls._build(code)
+
+    @classmethod
+    def update_service(cls, service_name: str, version: int, mode: str,
+                       pool: bool) -> str:
+        code = [
+            f'kwargs={{}} if serve_version < 3 else {{"pool": {pool}}}',
+            f'msg = serve_utils.update_service_encoded({service_name!r}, '
+            f'{version}, mode={mode!r}, **kwargs)',
+            'print(msg, end="", flush=True)',
         ]
         return cls._build(code)
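Every ServeCodeGen method above gates the new `pool` kwarg on the remote controller's SERVE_VERSION, so a new client never passes an argument an old controller cannot accept. A minimal sketch of that gating pattern; the min_version values (3, 4, or 5) are the ones shown in the diff, and get_service_status below is a simplified stand-in for the remote serve_utils call:

    def versioned_kwargs(serve_version: int, pool: bool,
                         min_version: int = 3) -> dict:
        # Only forward `pool` when the remote side is new enough.
        return {} if serve_version < min_version else {'pool': pool}

    def get_service_status(service_names, pool=False, **kwargs):
        # Stand-in for serve_utils.get_service_status_encoded remotely.
        return f'names={service_names} pool={pool}'

    # An old controller (version 2) never sees the kwarg; a new one does.
    assert get_service_status(['svc'], **versioned_kwargs(2, True)) == \
        "names=['svc'] pool=False"
    assert get_service_status(['svc'], **versioned_kwargs(3, True)) == \
        "names=['svc'] pool=True"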
@@ -1175,12 +1694,3 @@ class ServeCodeGen:
             f'"{common_utils.get_user_hash()}"; '
             f'{skylet_constants.SKY_PYTHON_CMD} '
             f'-u -c {shlex.quote(generated_code)}')
-
-    @classmethod
-    def update_service(cls, service_name: str, version: int, mode: str) -> str:
-        code = [
-            f'msg = serve_utils.update_service_encoded({service_name!r}, '
-            f'{version}, mode={mode!r})',
-            'print(msg, end="", flush=True)',
-        ]
-        return cls._build(code)