skypilot-nightly 1.0.0.dev20250502__py3-none-any.whl → 1.0.0.dev20251203__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sky/__init__.py +22 -6
- sky/adaptors/aws.py +81 -16
- sky/adaptors/common.py +25 -2
- sky/adaptors/coreweave.py +278 -0
- sky/adaptors/do.py +8 -2
- sky/adaptors/gcp.py +11 -0
- sky/adaptors/hyperbolic.py +8 -0
- sky/adaptors/ibm.py +5 -2
- sky/adaptors/kubernetes.py +149 -18
- sky/adaptors/nebius.py +173 -30
- sky/adaptors/primeintellect.py +1 -0
- sky/adaptors/runpod.py +68 -0
- sky/adaptors/seeweb.py +183 -0
- sky/adaptors/shadeform.py +89 -0
- sky/admin_policy.py +187 -4
- sky/authentication.py +179 -225
- sky/backends/__init__.py +4 -2
- sky/backends/backend.py +22 -9
- sky/backends/backend_utils.py +1323 -397
- sky/backends/cloud_vm_ray_backend.py +1749 -1029
- sky/backends/docker_utils.py +1 -1
- sky/backends/local_docker_backend.py +11 -6
- sky/backends/task_codegen.py +633 -0
- sky/backends/wheel_utils.py +55 -9
- sky/{clouds/service_catalog → catalog}/__init__.py +21 -19
- sky/{clouds/service_catalog → catalog}/aws_catalog.py +27 -8
- sky/{clouds/service_catalog → catalog}/azure_catalog.py +10 -7
- sky/{clouds/service_catalog → catalog}/common.py +90 -49
- sky/{clouds/service_catalog → catalog}/cudo_catalog.py +8 -5
- sky/{clouds/service_catalog → catalog}/data_fetchers/analyze.py +1 -1
- sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_aws.py +116 -80
- sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_cudo.py +38 -38
- sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_gcp.py +70 -16
- sky/catalog/data_fetchers/fetch_hyperbolic.py +136 -0
- sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_lambda_cloud.py +1 -0
- sky/catalog/data_fetchers/fetch_nebius.py +338 -0
- sky/catalog/data_fetchers/fetch_runpod.py +698 -0
- sky/catalog/data_fetchers/fetch_seeweb.py +329 -0
- sky/catalog/data_fetchers/fetch_shadeform.py +142 -0
- sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_vast.py +1 -1
- sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_vsphere.py +1 -1
- sky/{clouds/service_catalog → catalog}/do_catalog.py +5 -2
- sky/{clouds/service_catalog → catalog}/fluidstack_catalog.py +6 -3
- sky/{clouds/service_catalog → catalog}/gcp_catalog.py +41 -15
- sky/catalog/hyperbolic_catalog.py +136 -0
- sky/{clouds/service_catalog → catalog}/ibm_catalog.py +9 -6
- sky/{clouds/service_catalog → catalog}/kubernetes_catalog.py +36 -24
- sky/{clouds/service_catalog → catalog}/lambda_catalog.py +9 -6
- sky/{clouds/service_catalog → catalog}/nebius_catalog.py +9 -7
- sky/{clouds/service_catalog → catalog}/oci_catalog.py +9 -6
- sky/{clouds/service_catalog → catalog}/paperspace_catalog.py +5 -2
- sky/catalog/primeintellect_catalog.py +95 -0
- sky/{clouds/service_catalog → catalog}/runpod_catalog.py +11 -4
- sky/{clouds/service_catalog → catalog}/scp_catalog.py +9 -6
- sky/catalog/seeweb_catalog.py +184 -0
- sky/catalog/shadeform_catalog.py +165 -0
- sky/catalog/ssh_catalog.py +167 -0
- sky/{clouds/service_catalog → catalog}/vast_catalog.py +6 -3
- sky/{clouds/service_catalog → catalog}/vsphere_catalog.py +5 -2
- sky/check.py +533 -185
- sky/cli.py +5 -5975
- sky/client/{cli.py → cli/command.py} +2591 -1956
- sky/client/cli/deprecation_utils.py +99 -0
- sky/client/cli/flags.py +359 -0
- sky/client/cli/table_utils.py +322 -0
- sky/client/cli/utils.py +79 -0
- sky/client/common.py +78 -32
- sky/client/oauth.py +82 -0
- sky/client/sdk.py +1219 -319
- sky/client/sdk_async.py +827 -0
- sky/client/service_account_auth.py +47 -0
- sky/cloud_stores.py +82 -3
- sky/clouds/__init__.py +13 -0
- sky/clouds/aws.py +564 -164
- sky/clouds/azure.py +105 -83
- sky/clouds/cloud.py +140 -40
- sky/clouds/cudo.py +68 -50
- sky/clouds/do.py +66 -48
- sky/clouds/fluidstack.py +63 -44
- sky/clouds/gcp.py +339 -110
- sky/clouds/hyperbolic.py +293 -0
- sky/clouds/ibm.py +70 -49
- sky/clouds/kubernetes.py +570 -162
- sky/clouds/lambda_cloud.py +74 -54
- sky/clouds/nebius.py +210 -81
- sky/clouds/oci.py +88 -66
- sky/clouds/paperspace.py +61 -44
- sky/clouds/primeintellect.py +317 -0
- sky/clouds/runpod.py +164 -74
- sky/clouds/scp.py +89 -86
- sky/clouds/seeweb.py +477 -0
- sky/clouds/shadeform.py +400 -0
- sky/clouds/ssh.py +263 -0
- sky/clouds/utils/aws_utils.py +10 -4
- sky/clouds/utils/gcp_utils.py +87 -11
- sky/clouds/utils/oci_utils.py +38 -14
- sky/clouds/utils/scp_utils.py +231 -167
- sky/clouds/vast.py +99 -77
- sky/clouds/vsphere.py +51 -40
- sky/core.py +375 -173
- sky/dag.py +15 -0
- sky/dashboard/out/404.html +1 -1
- sky/dashboard/out/_next/static/96_E2yl3QAiIJGOYCkSpB/_buildManifest.js +1 -0
- sky/dashboard/out/_next/static/chunks/1141-e6aa9ab418717c59.js +11 -0
- sky/dashboard/out/_next/static/chunks/1272-1ef0bf0237faccdb.js +1 -0
- sky/dashboard/out/_next/static/chunks/1871-7e202677c42f43fe.js +6 -0
- sky/dashboard/out/_next/static/chunks/2260-7703229c33c5ebd5.js +1 -0
- sky/dashboard/out/_next/static/chunks/2350.fab69e61bac57b23.js +1 -0
- sky/dashboard/out/_next/static/chunks/2369.fc20f0c2c8ed9fe7.js +15 -0
- sky/dashboard/out/_next/static/chunks/2755.edd818326d489a1d.js +26 -0
- sky/dashboard/out/_next/static/chunks/3294.20a8540fe697d5ee.js +1 -0
- sky/dashboard/out/_next/static/chunks/3785.7e245f318f9d1121.js +1 -0
- sky/dashboard/out/_next/static/chunks/3800-7b45f9fbb6308557.js +1 -0
- sky/dashboard/out/_next/static/chunks/3850-ff4a9a69d978632b.js +1 -0
- sky/dashboard/out/_next/static/chunks/3937.210053269f121201.js +1 -0
- sky/dashboard/out/_next/static/chunks/4725.172ede95d1b21022.js +1 -0
- sky/dashboard/out/_next/static/chunks/4937.a2baa2df5572a276.js +15 -0
- sky/dashboard/out/_next/static/chunks/5739-d67458fcb1386c92.js +8 -0
- sky/dashboard/out/_next/static/chunks/6130-2be46d70a38f1e82.js +1 -0
- sky/dashboard/out/_next/static/chunks/616-3d59f75e2ccf9321.js +39 -0
- sky/dashboard/out/_next/static/chunks/6212-7bd06f60ba693125.js +13 -0
- sky/dashboard/out/_next/static/chunks/6856-8f27d1c10c98def8.js +1 -0
- sky/dashboard/out/_next/static/chunks/6989-01359c57e018caa4.js +1 -0
- sky/dashboard/out/_next/static/chunks/6990-9146207c4567fdfd.js +1 -0
- sky/dashboard/out/_next/static/chunks/7359-c8d04e06886000b3.js +30 -0
- sky/dashboard/out/_next/static/chunks/7411-b15471acd2cba716.js +41 -0
- sky/dashboard/out/_next/static/chunks/7615-019513abc55b3b47.js +1 -0
- sky/dashboard/out/_next/static/chunks/8640.5b9475a2d18c5416.js +16 -0
- sky/dashboard/out/_next/static/chunks/8969-452f9d5cbdd2dc73.js +1 -0
- sky/dashboard/out/_next/static/chunks/9025.fa408f3242e9028d.js +6 -0
- sky/dashboard/out/_next/static/chunks/9353-cff34f7e773b2e2b.js +1 -0
- sky/dashboard/out/_next/static/chunks/9360.a536cf6b1fa42355.js +31 -0
- sky/dashboard/out/_next/static/chunks/9847.3aaca6bb33455140.js +30 -0
- sky/dashboard/out/_next/static/chunks/fd9d1056-86323a29a8f7e46a.js +1 -0
- sky/dashboard/out/_next/static/chunks/framework-cf60a09ccd051a10.js +33 -0
- sky/dashboard/out/_next/static/chunks/main-app-587214043926b3cc.js +1 -0
- sky/dashboard/out/_next/static/chunks/main-f15ccb73239a3bf1.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/_app-bde01e4a2beec258.js +34 -0
- sky/dashboard/out/_next/static/chunks/pages/_error-c66a4e8afc46f17b.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-792db96d918c98c9.js +16 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-abfcac9c137aa543.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters-ee39056f9851a3ff.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/config-dfb9bf07b13045f4.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/index-444f1804401f04ea.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/infra/[context]-c0b5935149902e6f.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/infra-aed0ea19df7cf961.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-d66997e2bfc837cf.js +16 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs/pools/[pool]-9faf940b253e3e06.js +21 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs-2072b48b617989c9.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/users-f42674164aa73423.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/volumes-b84b948ff357c43e.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/workspace/new-3f88a1c7e86a3f86.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/workspaces/[name]-84a40f8c7c627fe4.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/workspaces-531b2f8c4bf89f82.js +1 -0
- sky/dashboard/out/_next/static/chunks/webpack-64e05f17bf2cf8ce.js +1 -0
- sky/dashboard/out/_next/static/css/0748ce22df867032.css +3 -0
- sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
- sky/dashboard/out/clusters/[cluster].html +1 -1
- sky/dashboard/out/clusters.html +1 -1
- sky/dashboard/out/config.html +1 -0
- sky/dashboard/out/index.html +1 -1
- sky/dashboard/out/infra/[context].html +1 -0
- sky/dashboard/out/infra.html +1 -0
- sky/dashboard/out/jobs/[job].html +1 -1
- sky/dashboard/out/jobs/pools/[pool].html +1 -0
- sky/dashboard/out/jobs.html +1 -1
- sky/dashboard/out/users.html +1 -0
- sky/dashboard/out/volumes.html +1 -0
- sky/dashboard/out/workspace/new.html +1 -0
- sky/dashboard/out/workspaces/[name].html +1 -0
- sky/dashboard/out/workspaces.html +1 -0
- sky/data/data_utils.py +137 -1
- sky/data/mounting_utils.py +269 -84
- sky/data/storage.py +1460 -1807
- sky/data/storage_utils.py +43 -57
- sky/exceptions.py +126 -2
- sky/execution.py +216 -63
- sky/global_user_state.py +2390 -586
- sky/jobs/__init__.py +7 -0
- sky/jobs/client/sdk.py +300 -58
- sky/jobs/client/sdk_async.py +161 -0
- sky/jobs/constants.py +15 -8
- sky/jobs/controller.py +848 -275
- sky/jobs/file_content_utils.py +128 -0
- sky/jobs/log_gc.py +193 -0
- sky/jobs/recovery_strategy.py +402 -152
- sky/jobs/scheduler.py +314 -189
- sky/jobs/server/core.py +836 -255
- sky/jobs/server/server.py +156 -115
- sky/jobs/server/utils.py +136 -0
- sky/jobs/state.py +2109 -706
- sky/jobs/utils.py +1306 -215
- sky/logs/__init__.py +21 -0
- sky/logs/agent.py +108 -0
- sky/logs/aws.py +243 -0
- sky/logs/gcp.py +91 -0
- sky/metrics/__init__.py +0 -0
- sky/metrics/utils.py +453 -0
- sky/models.py +78 -1
- sky/optimizer.py +164 -70
- sky/provision/__init__.py +90 -4
- sky/provision/aws/config.py +147 -26
- sky/provision/aws/instance.py +136 -50
- sky/provision/azure/instance.py +11 -6
- sky/provision/common.py +13 -1
- sky/provision/cudo/cudo_machine_type.py +1 -1
- sky/provision/cudo/cudo_utils.py +14 -8
- sky/provision/cudo/cudo_wrapper.py +72 -71
- sky/provision/cudo/instance.py +10 -6
- sky/provision/do/instance.py +10 -6
- sky/provision/do/utils.py +4 -3
- sky/provision/docker_utils.py +140 -33
- sky/provision/fluidstack/instance.py +13 -8
- sky/provision/gcp/__init__.py +1 -0
- sky/provision/gcp/config.py +301 -19
- sky/provision/gcp/constants.py +218 -0
- sky/provision/gcp/instance.py +36 -8
- sky/provision/gcp/instance_utils.py +18 -4
- sky/provision/gcp/volume_utils.py +247 -0
- sky/provision/hyperbolic/__init__.py +12 -0
- sky/provision/hyperbolic/config.py +10 -0
- sky/provision/hyperbolic/instance.py +437 -0
- sky/provision/hyperbolic/utils.py +373 -0
- sky/provision/instance_setup.py +101 -20
- sky/provision/kubernetes/__init__.py +5 -0
- sky/provision/kubernetes/config.py +9 -52
- sky/provision/kubernetes/constants.py +17 -0
- sky/provision/kubernetes/instance.py +919 -280
- sky/provision/kubernetes/manifests/fusermount-server-daemonset.yaml +1 -2
- sky/provision/kubernetes/network.py +27 -17
- sky/provision/kubernetes/network_utils.py +44 -43
- sky/provision/kubernetes/utils.py +1221 -534
- sky/provision/kubernetes/volume.py +343 -0
- sky/provision/lambda_cloud/instance.py +22 -16
- sky/provision/nebius/constants.py +50 -0
- sky/provision/nebius/instance.py +19 -6
- sky/provision/nebius/utils.py +237 -137
- sky/provision/oci/instance.py +10 -5
- sky/provision/paperspace/instance.py +10 -7
- sky/provision/paperspace/utils.py +1 -1
- sky/provision/primeintellect/__init__.py +10 -0
- sky/provision/primeintellect/config.py +11 -0
- sky/provision/primeintellect/instance.py +454 -0
- sky/provision/primeintellect/utils.py +398 -0
- sky/provision/provisioner.py +117 -36
- sky/provision/runpod/__init__.py +5 -0
- sky/provision/runpod/instance.py +27 -6
- sky/provision/runpod/utils.py +51 -18
- sky/provision/runpod/volume.py +214 -0
- sky/provision/scp/__init__.py +15 -0
- sky/provision/scp/config.py +93 -0
- sky/provision/scp/instance.py +707 -0
- sky/provision/seeweb/__init__.py +11 -0
- sky/provision/seeweb/config.py +13 -0
- sky/provision/seeweb/instance.py +812 -0
- sky/provision/shadeform/__init__.py +11 -0
- sky/provision/shadeform/config.py +12 -0
- sky/provision/shadeform/instance.py +351 -0
- sky/provision/shadeform/shadeform_utils.py +83 -0
- sky/provision/ssh/__init__.py +18 -0
- sky/provision/vast/instance.py +13 -8
- sky/provision/vast/utils.py +10 -7
- sky/provision/volume.py +164 -0
- sky/provision/vsphere/common/ssl_helper.py +1 -1
- sky/provision/vsphere/common/vapiconnect.py +2 -1
- sky/provision/vsphere/common/vim_utils.py +4 -4
- sky/provision/vsphere/instance.py +15 -10
- sky/provision/vsphere/vsphere_utils.py +17 -20
- sky/py.typed +0 -0
- sky/resources.py +845 -119
- sky/schemas/__init__.py +0 -0
- sky/schemas/api/__init__.py +0 -0
- sky/schemas/api/responses.py +227 -0
- sky/schemas/db/README +4 -0
- sky/schemas/db/env.py +90 -0
- sky/schemas/db/global_user_state/001_initial_schema.py +124 -0
- sky/schemas/db/global_user_state/002_add_workspace_to_cluster_history.py +35 -0
- sky/schemas/db/global_user_state/003_fix_initial_revision.py +61 -0
- sky/schemas/db/global_user_state/004_is_managed.py +34 -0
- sky/schemas/db/global_user_state/005_cluster_event.py +32 -0
- sky/schemas/db/global_user_state/006_provision_log.py +41 -0
- sky/schemas/db/global_user_state/007_cluster_event_request_id.py +34 -0
- sky/schemas/db/global_user_state/008_skylet_ssh_tunnel_metadata.py +34 -0
- sky/schemas/db/global_user_state/009_last_activity_and_launched_at.py +89 -0
- sky/schemas/db/global_user_state/010_save_ssh_key.py +66 -0
- sky/schemas/db/global_user_state/011_is_ephemeral.py +34 -0
- sky/schemas/db/kv_cache/001_initial_schema.py +29 -0
- sky/schemas/db/script.py.mako +28 -0
- sky/schemas/db/serve_state/001_initial_schema.py +67 -0
- sky/schemas/db/serve_state/002_yaml_content.py +34 -0
- sky/schemas/db/skypilot_config/001_initial_schema.py +30 -0
- sky/schemas/db/spot_jobs/001_initial_schema.py +97 -0
- sky/schemas/db/spot_jobs/002_cluster_pool.py +42 -0
- sky/schemas/db/spot_jobs/003_pool_hash.py +34 -0
- sky/schemas/db/spot_jobs/004_job_file_contents.py +42 -0
- sky/schemas/db/spot_jobs/005_logs_gc.py +38 -0
- sky/schemas/db/spot_jobs/006_controller_pid_started_at.py +34 -0
- sky/schemas/db/spot_jobs/007_config_file_content.py +34 -0
- sky/schemas/generated/__init__.py +0 -0
- sky/schemas/generated/autostopv1_pb2.py +36 -0
- sky/schemas/generated/autostopv1_pb2.pyi +43 -0
- sky/schemas/generated/autostopv1_pb2_grpc.py +146 -0
- sky/schemas/generated/jobsv1_pb2.py +86 -0
- sky/schemas/generated/jobsv1_pb2.pyi +254 -0
- sky/schemas/generated/jobsv1_pb2_grpc.py +542 -0
- sky/schemas/generated/managed_jobsv1_pb2.py +76 -0
- sky/schemas/generated/managed_jobsv1_pb2.pyi +278 -0
- sky/schemas/generated/managed_jobsv1_pb2_grpc.py +278 -0
- sky/schemas/generated/servev1_pb2.py +58 -0
- sky/schemas/generated/servev1_pb2.pyi +115 -0
- sky/schemas/generated/servev1_pb2_grpc.py +322 -0
- sky/serve/autoscalers.py +357 -5
- sky/serve/client/impl.py +310 -0
- sky/serve/client/sdk.py +47 -139
- sky/serve/client/sdk_async.py +130 -0
- sky/serve/constants.py +12 -9
- sky/serve/controller.py +68 -17
- sky/serve/load_balancer.py +106 -60
- sky/serve/load_balancing_policies.py +116 -2
- sky/serve/replica_managers.py +434 -249
- sky/serve/serve_rpc_utils.py +179 -0
- sky/serve/serve_state.py +569 -257
- sky/serve/serve_utils.py +775 -265
- sky/serve/server/core.py +66 -711
- sky/serve/server/impl.py +1093 -0
- sky/serve/server/server.py +21 -18
- sky/serve/service.py +192 -89
- sky/serve/service_spec.py +144 -20
- sky/serve/spot_placer.py +3 -0
- sky/server/auth/__init__.py +0 -0
- sky/server/auth/authn.py +50 -0
- sky/server/auth/loopback.py +38 -0
- sky/server/auth/oauth2_proxy.py +202 -0
- sky/server/common.py +478 -182
- sky/server/config.py +85 -23
- sky/server/constants.py +44 -6
- sky/server/daemons.py +295 -0
- sky/server/html/token_page.html +185 -0
- sky/server/metrics.py +160 -0
- sky/server/middleware_utils.py +166 -0
- sky/server/requests/executor.py +558 -138
- sky/server/requests/payloads.py +364 -24
- sky/server/requests/preconditions.py +21 -17
- sky/server/requests/process.py +112 -29
- sky/server/requests/request_names.py +121 -0
- sky/server/requests/requests.py +822 -226
- sky/server/requests/serializers/decoders.py +82 -31
- sky/server/requests/serializers/encoders.py +140 -22
- sky/server/requests/threads.py +117 -0
- sky/server/rest.py +455 -0
- sky/server/server.py +1309 -285
- sky/server/state.py +20 -0
- sky/server/stream_utils.py +327 -61
- sky/server/uvicorn.py +217 -3
- sky/server/versions.py +270 -0
- sky/setup_files/MANIFEST.in +11 -1
- sky/setup_files/alembic.ini +160 -0
- sky/setup_files/dependencies.py +139 -31
- sky/setup_files/setup.py +44 -42
- sky/sky_logging.py +114 -7
- sky/skylet/attempt_skylet.py +106 -24
- sky/skylet/autostop_lib.py +129 -8
- sky/skylet/configs.py +29 -20
- sky/skylet/constants.py +216 -25
- sky/skylet/events.py +101 -21
- sky/skylet/job_lib.py +345 -164
- sky/skylet/log_lib.py +297 -18
- sky/skylet/log_lib.pyi +44 -1
- sky/skylet/providers/ibm/node_provider.py +12 -8
- sky/skylet/providers/ibm/vpc_provider.py +13 -12
- sky/skylet/ray_patches/__init__.py +17 -3
- sky/skylet/ray_patches/autoscaler.py.diff +18 -0
- sky/skylet/ray_patches/cli.py.diff +19 -0
- sky/skylet/ray_patches/command_runner.py.diff +17 -0
- sky/skylet/ray_patches/log_monitor.py.diff +20 -0
- sky/skylet/ray_patches/resource_demand_scheduler.py.diff +32 -0
- sky/skylet/ray_patches/updater.py.diff +18 -0
- sky/skylet/ray_patches/worker.py.diff +41 -0
- sky/skylet/runtime_utils.py +21 -0
- sky/skylet/services.py +568 -0
- sky/skylet/skylet.py +72 -4
- sky/skylet/subprocess_daemon.py +104 -29
- sky/skypilot_config.py +506 -99
- sky/ssh_node_pools/__init__.py +1 -0
- sky/ssh_node_pools/core.py +135 -0
- sky/ssh_node_pools/server.py +233 -0
- sky/task.py +685 -163
- sky/templates/aws-ray.yml.j2 +11 -3
- sky/templates/azure-ray.yml.j2 +2 -1
- sky/templates/cudo-ray.yml.j2 +1 -0
- sky/templates/do-ray.yml.j2 +2 -1
- sky/templates/fluidstack-ray.yml.j2 +1 -0
- sky/templates/gcp-ray.yml.j2 +62 -1
- sky/templates/hyperbolic-ray.yml.j2 +68 -0
- sky/templates/ibm-ray.yml.j2 +2 -1
- sky/templates/jobs-controller.yaml.j2 +27 -24
- sky/templates/kubernetes-loadbalancer.yml.j2 +2 -0
- sky/templates/kubernetes-ray.yml.j2 +611 -50
- sky/templates/lambda-ray.yml.j2 +2 -1
- sky/templates/nebius-ray.yml.j2 +34 -12
- sky/templates/oci-ray.yml.j2 +1 -0
- sky/templates/paperspace-ray.yml.j2 +2 -1
- sky/templates/primeintellect-ray.yml.j2 +72 -0
- sky/templates/runpod-ray.yml.j2 +10 -1
- sky/templates/scp-ray.yml.j2 +4 -50
- sky/templates/seeweb-ray.yml.j2 +171 -0
- sky/templates/shadeform-ray.yml.j2 +73 -0
- sky/templates/sky-serve-controller.yaml.j2 +22 -2
- sky/templates/vast-ray.yml.j2 +1 -0
- sky/templates/vsphere-ray.yml.j2 +1 -0
- sky/templates/websocket_proxy.py +212 -37
- sky/usage/usage_lib.py +31 -15
- sky/users/__init__.py +0 -0
- sky/users/model.conf +15 -0
- sky/users/permission.py +397 -0
- sky/users/rbac.py +121 -0
- sky/users/server.py +720 -0
- sky/users/token_service.py +218 -0
- sky/utils/accelerator_registry.py +35 -5
- sky/utils/admin_policy_utils.py +84 -38
- sky/utils/annotations.py +38 -5
- sky/utils/asyncio_utils.py +78 -0
- sky/utils/atomic.py +1 -1
- sky/utils/auth_utils.py +153 -0
- sky/utils/benchmark_utils.py +60 -0
- sky/utils/cli_utils/status_utils.py +159 -86
- sky/utils/cluster_utils.py +31 -9
- sky/utils/command_runner.py +354 -68
- sky/utils/command_runner.pyi +93 -3
- sky/utils/common.py +35 -8
- sky/utils/common_utils.py +314 -91
- sky/utils/config_utils.py +74 -5
- sky/utils/context.py +403 -0
- sky/utils/context_utils.py +242 -0
- sky/utils/controller_utils.py +383 -89
- sky/utils/dag_utils.py +31 -12
- sky/utils/db/__init__.py +0 -0
- sky/utils/db/db_utils.py +485 -0
- sky/utils/db/kv_cache.py +149 -0
- sky/utils/db/migration_utils.py +137 -0
- sky/utils/directory_utils.py +12 -0
- sky/utils/env_options.py +13 -0
- sky/utils/git.py +567 -0
- sky/utils/git_clone.sh +460 -0
- sky/utils/infra_utils.py +195 -0
- sky/utils/kubernetes/cleanup-tunnel.sh +62 -0
- sky/utils/kubernetes/config_map_utils.py +133 -0
- sky/utils/kubernetes/create_cluster.sh +15 -29
- sky/utils/kubernetes/delete_cluster.sh +10 -7
- sky/utils/kubernetes/deploy_ssh_node_pools.py +1177 -0
- sky/utils/kubernetes/exec_kubeconfig_converter.py +22 -31
- sky/utils/kubernetes/generate_kind_config.py +6 -66
- sky/utils/kubernetes/generate_kubeconfig.sh +4 -1
- sky/utils/kubernetes/gpu_labeler.py +18 -8
- sky/utils/kubernetes/k8s_gpu_labeler_job.yaml +2 -1
- sky/utils/kubernetes/k8s_gpu_labeler_setup.yaml +16 -16
- sky/utils/kubernetes/kubernetes_deploy_utils.py +284 -114
- sky/utils/kubernetes/rsync_helper.sh +11 -3
- sky/utils/kubernetes/ssh-tunnel.sh +379 -0
- sky/utils/kubernetes/ssh_utils.py +221 -0
- sky/utils/kubernetes_enums.py +8 -15
- sky/utils/lock_events.py +94 -0
- sky/utils/locks.py +416 -0
- sky/utils/log_utils.py +82 -107
- sky/utils/perf_utils.py +22 -0
- sky/utils/resource_checker.py +298 -0
- sky/utils/resources_utils.py +249 -32
- sky/utils/rich_utils.py +217 -39
- sky/utils/schemas.py +955 -160
- sky/utils/serialize_utils.py +16 -0
- sky/utils/status_lib.py +10 -0
- sky/utils/subprocess_utils.py +29 -15
- sky/utils/tempstore.py +70 -0
- sky/utils/thread_utils.py +91 -0
- sky/utils/timeline.py +26 -53
- sky/utils/ux_utils.py +84 -15
- sky/utils/validator.py +11 -1
- sky/utils/volume.py +165 -0
- sky/utils/yaml_utils.py +111 -0
- sky/volumes/__init__.py +13 -0
- sky/volumes/client/__init__.py +0 -0
- sky/volumes/client/sdk.py +150 -0
- sky/volumes/server/__init__.py +0 -0
- sky/volumes/server/core.py +270 -0
- sky/volumes/server/server.py +124 -0
- sky/volumes/volume.py +215 -0
- sky/workspaces/__init__.py +0 -0
- sky/workspaces/core.py +655 -0
- sky/workspaces/server.py +101 -0
- sky/workspaces/utils.py +56 -0
- sky_templates/README.md +3 -0
- sky_templates/__init__.py +3 -0
- sky_templates/ray/__init__.py +0 -0
- sky_templates/ray/start_cluster +183 -0
- sky_templates/ray/stop_cluster +75 -0
- skypilot_nightly-1.0.0.dev20251203.dist-info/METADATA +676 -0
- skypilot_nightly-1.0.0.dev20251203.dist-info/RECORD +611 -0
- {skypilot_nightly-1.0.0.dev20250502.dist-info → skypilot_nightly-1.0.0.dev20251203.dist-info}/WHEEL +1 -1
- skypilot_nightly-1.0.0.dev20251203.dist-info/top_level.txt +2 -0
- sky/benchmark/benchmark_state.py +0 -256
- sky/benchmark/benchmark_utils.py +0 -641
- sky/clouds/service_catalog/constants.py +0 -7
- sky/dashboard/out/_next/static/GWvVBSCS7FmUiVmjaL1a7/_buildManifest.js +0 -1
- sky/dashboard/out/_next/static/chunks/236-2db3ee3fba33dd9e.js +0 -6
- sky/dashboard/out/_next/static/chunks/312-c3c8845990db8ffc.js +0 -15
- sky/dashboard/out/_next/static/chunks/37-0a572fe0dbb89c4d.js +0 -6
- sky/dashboard/out/_next/static/chunks/678-206dddca808e6d16.js +0 -59
- sky/dashboard/out/_next/static/chunks/845-9e60713e0c441abc.js +0 -1
- sky/dashboard/out/_next/static/chunks/979-7bf73a4c7cea0f5c.js +0 -1
- sky/dashboard/out/_next/static/chunks/fd9d1056-2821b0f0cabcd8bd.js +0 -1
- sky/dashboard/out/_next/static/chunks/framework-87d061ee6ed71b28.js +0 -33
- sky/dashboard/out/_next/static/chunks/main-app-241eb28595532291.js +0 -1
- sky/dashboard/out/_next/static/chunks/main-e0e2335212e72357.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/_app-e6b013bc3f77ad60.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/_error-1be831200e60c5c0.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-6ac338bc2239cb45.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-f383db7389368ea7.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/clusters-a93b93e10b8b074e.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/index-f9f039532ca8cbc4.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-1c519e1afc523dc9.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/jobs-a75029b67aab6a2e.js +0 -1
- sky/dashboard/out/_next/static/chunks/webpack-830f59b8404e96b8.js +0 -1
- sky/dashboard/out/_next/static/css/c6933bbb2ce7f4dd.css +0 -3
- sky/jobs/dashboard/dashboard.py +0 -223
- sky/jobs/dashboard/static/favicon.ico +0 -0
- sky/jobs/dashboard/templates/index.html +0 -831
- sky/jobs/server/dashboard_utils.py +0 -69
- sky/skylet/providers/scp/__init__.py +0 -2
- sky/skylet/providers/scp/config.py +0 -149
- sky/skylet/providers/scp/node_provider.py +0 -578
- sky/templates/kubernetes-ssh-jump.yml.j2 +0 -94
- sky/utils/db_utils.py +0 -100
- sky/utils/kubernetes/deploy_remote_cluster.sh +0 -308
- sky/utils/kubernetes/ssh_jump_lifecycle_manager.py +0 -191
- skypilot_nightly-1.0.0.dev20250502.dist-info/METADATA +0 -361
- skypilot_nightly-1.0.0.dev20250502.dist-info/RECORD +0 -396
- skypilot_nightly-1.0.0.dev20250502.dist-info/top_level.txt +0 -1
- /sky/{clouds/service_catalog → catalog}/config.py +0 -0
- /sky/{benchmark → catalog/data_fetchers}/__init__.py +0 -0
- /sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_azure.py +0 -0
- /sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_fluidstack.py +0 -0
- /sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_ibm.py +0 -0
- /sky/{clouds/service_catalog/data_fetchers → client/cli}/__init__.py +0 -0
- /sky/dashboard/out/_next/static/{GWvVBSCS7FmUiVmjaL1a7 → 96_E2yl3QAiIJGOYCkSpB}/_ssgManifest.js +0 -0
- {skypilot_nightly-1.0.0.dev20250502.dist-info → skypilot_nightly-1.0.0.dev20251203.dist-info}/entry_points.txt +0 -0
- {skypilot_nightly-1.0.0.dev20250502.dist-info → skypilot_nightly-1.0.0.dev20251203.dist-info}/licenses/LICENSE +0 -0
sky/serve/server/core.py
CHANGED
@@ -1,107 +1,27 @@
 """SkyServe core APIs."""
-import pathlib
-import re
-import signal
-import tempfile
-import threading
 import typing
-from typing import Any, Dict, List, Optional, Set, Tuple, Union
+from typing import Any, Dict, List, Optional, Tuple, Union
 
-import colorama
-
-import sky
 from sky import backends
 from sky import exceptions
-from sky import execution
 from sky import sky_logging
-from sky import task as task_lib
+from sky.adaptors import common as adaptors_common
 from sky.backends import backend_utils
-from sky.clouds.service_catalog import common as service_catalog_common
-from sky.serve import constants as serve_constants
-from sky.serve import serve_state
+from sky.serve import serve_rpc_utils
 from sky.serve import serve_utils
-from sky.skylet import constants
+from sky.serve.server import impl
 from sky.usage import usage_lib
-from sky.utils import admin_policy_utils
-from sky.utils import command_runner
-from sky.utils import common
-from sky.utils import common_utils
 from sky.utils import controller_utils
-from sky.utils import rich_utils
 from sky.utils import subprocess_utils
-from sky.utils import ux_utils
 
 if typing.TYPE_CHECKING:
-
-
-logger = sky_logging.init_logger(__name__)
+    import grpc
 
+    import sky
+else:
+    grpc = adaptors_common.LazyImport('grpc')
 
-def _rewrite_tls_credential_paths_and_get_tls_env_vars(
-        service_name: str, task: 'sky.Task') -> Dict[str, Any]:
-    """Rewrite the paths of TLS credentials in the task.
-
-    Args:
-        service_name: Name of the service.
-        task: sky.Task to rewrite.
-
-    Returns:
-        The generated template variables for TLS.
-    """
-    service_spec = task.service
-    # Already checked by validate_service_task
-    assert service_spec is not None
-    if service_spec.tls_credential is None:
-        return {'use_tls': False}
-    remote_tls_keyfile = (
-        serve_utils.generate_remote_tls_keyfile_name(service_name))
-    remote_tls_certfile = (
-        serve_utils.generate_remote_tls_certfile_name(service_name))
-    tls_template_vars = {
-        'use_tls': True,
-        'remote_tls_keyfile': remote_tls_keyfile,
-        'remote_tls_certfile': remote_tls_certfile,
-        'local_tls_keyfile': service_spec.tls_credential.keyfile,
-        'local_tls_certfile': service_spec.tls_credential.certfile,
-    }
-    service_spec.tls_credential = serve_utils.TLSCredential(
-        remote_tls_keyfile, remote_tls_certfile)
-    return tls_template_vars
-
-
-def _get_all_replica_targets(
-        service_name: str, backend: backends.CloudVmRayBackend,
-        handle: backends.CloudVmRayResourceHandle
-) -> Set[serve_utils.ServiceComponentTarget]:
-    """Helper function to get targets for all live replicas."""
-    code = serve_utils.ServeCodeGen.get_service_status([service_name])
-    returncode, serve_status_payload, stderr = backend.run_on_head(
-        handle,
-        code,
-        require_outputs=True,
-        stream_logs=False,
-        separate_stderr=True)
-
-    try:
-        subprocess_utils.handle_returncode(returncode,
-                                           code,
-                                           'Failed to fetch services',
-                                           stderr,
-                                           stream_logs=True)
-    except exceptions.CommandError as e:
-        raise RuntimeError(e.error_msg) from e
-
-    service_records = serve_utils.load_service_status(serve_status_payload)
-    if not service_records:
-        raise ValueError(f'Service {service_name!r} not found.')
-    assert len(service_records) == 1
-    service_record = service_records[0]
-
-    return {
-        serve_utils.ServiceComponentTarget(serve_utils.ServiceComponent.REPLICA,
-                                           replica_info['replica_id'])
-        for replica_info in service_record['replica_info']
-    }
+logger = sky_logging.init_logger(__name__)
 
 
 @usage_lib.entrypoint
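The new import block is the lazy-import pattern SkyPilot uses throughout its adaptors: the type checker sees a real `import grpc`, while at runtime the module is only loaded on first attribute access. Below is a minimal, self-contained sketch of the general idea, using `json` as a stand-in dependency; the real helper is `sky.adaptors.common.LazyImport`, whose internals are not shown in this diff, so treat this as an illustration rather than the actual implementation.

import importlib
import typing


class LazyImport:
    """Defer a module import until an attribute is first accessed.

    Minimal sketch of the pattern; the real helper in sky/adaptors/common.py
    has more features (custom import-error messages, etc.).
    """

    def __init__(self, module_name: str):
        self._module_name = module_name
        self._module = None

    def __getattr__(self, name: str):
        # __getattr__ only fires for attributes not already set, so the
        # import cost is paid exactly once, on first real use.
        if self._module is None:
            self._module = importlib.import_module(self._module_name)
        return getattr(self._module, name)


if typing.TYPE_CHECKING:
    import json  # Type checkers resolve the real module and its stubs.
else:
    json = LazyImport('json')  # Runtime: imported only when first touched.

print(json.dumps({'lazy': True}))  # Triggers the actual import here.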
@@ -122,381 +42,27 @@ def up(
             argument.
         endpoint: str; The service endpoint.
     """
-
-    if service_name is None:
-        service_name = serve_utils.generate_service_name()
-
-    # The service name will be used as:
-    # 1. controller cluster name: 'sky-serve-controller-<service_name>'
-    # 2. replica cluster name: '<service_name>-<replica_id>'
-    # In both cases, service name shares the same regex with cluster name.
-    if re.fullmatch(constants.CLUSTER_NAME_VALID_REGEX, service_name) is None:
-        with ux_utils.print_exception_no_traceback():
-            raise ValueError(f'Service name {service_name!r} is invalid: '
-                             f'ensure it is fully matched by regex (e.g., '
-                             'only contains lower letters, numbers and dash): '
-                             f'{constants.CLUSTER_NAME_VALID_REGEX}')
-
-    serve_utils.validate_service_task(task)
-    # Always apply the policy again here, even though it might have been applied
-    # in the CLI. This is to ensure that we apply the policy to the final DAG
-    # and get the mutated config.
-    dag, mutated_user_config = admin_policy_utils.apply(
-        task, use_mutated_config_in_current_request=False)
-    task = dag.tasks[0]
-
-    with rich_utils.safe_status(
-            ux_utils.spinner_message('Initializing service')):
-        controller_utils.maybe_translate_local_file_mounts_and_sync_up(
-            task, task_type='serve')
-
-    tls_template_vars = _rewrite_tls_credential_paths_and_get_tls_env_vars(
-        service_name, task)
-
-    with tempfile.NamedTemporaryFile(
-            prefix=f'service-task-{service_name}-',
-            mode='w',
-    ) as service_file, tempfile.NamedTemporaryFile(
-            prefix=f'controller-task-{service_name}-',
-            mode='w',
-    ) as controller_file:
-        controller_name = common.SKY_SERVE_CONTROLLER_NAME
-        task_config = task.to_yaml_config()
-        common_utils.dump_yaml(service_file.name, task_config)
-        remote_tmp_task_yaml_path = (
-            serve_utils.generate_remote_tmp_task_yaml_file_name(service_name))
-        remote_config_yaml_path = (
-            serve_utils.generate_remote_config_yaml_file_name(service_name))
-        controller_log_file = (
-            serve_utils.generate_remote_controller_log_file_name(service_name))
-        controller_resources = controller_utils.get_controller_resources(
-            controller=controller_utils.Controllers.SKY_SERVE_CONTROLLER,
-            task_resources=task.resources)
-
-        vars_to_fill = {
-            'remote_task_yaml_path': remote_tmp_task_yaml_path,
-            'local_task_yaml_path': service_file.name,
-            'service_name': service_name,
-            'controller_log_file': controller_log_file,
-            'remote_user_config_path': remote_config_yaml_path,
-            'modified_catalogs':
-                service_catalog_common.get_modified_catalog_file_mounts(),
-            **tls_template_vars,
-            **controller_utils.shared_controller_vars_to_fill(
-                controller=controller_utils.Controllers.SKY_SERVE_CONTROLLER,
-                remote_user_config_path=remote_config_yaml_path,
-                local_user_config=mutated_user_config,
-            ),
-        }
-        common_utils.fill_template(serve_constants.CONTROLLER_TEMPLATE,
-                                   vars_to_fill,
-                                   output_path=controller_file.name)
-        controller_task = task_lib.Task.from_yaml(controller_file.name)
-        # TODO(tian): Probably run another sky.launch after we get the load
-        # balancer port from the controller? So we don't need to open so many
-        # ports here. Or, we should have a nginx traffic control to refuse
-        # any connection to the unregistered ports.
-        controller_resources = {
-            r.copy(ports=[serve_constants.LOAD_BALANCER_PORT_RANGE])
-            for r in controller_resources
-        }
-        controller_task.set_resources(controller_resources)
-
-        # # Set service_name so the backend will know to modify default ray
-        # task CPU usage to custom value instead of default 0.5 vCPU. We need
-        # to set it to a smaller value to support a larger number of services.
-        controller_task.service_name = service_name
-
-        print(f'{colorama.Fore.YELLOW}Launching controller for '
-              f'{service_name!r}...{colorama.Style.RESET_ALL}')
-        # We directly submit the request to the controller and let the
-        # controller to check name conflict. Suppose we have multiple
-        # sky.serve.up() with same service name, the first one will
-        # successfully write its job id to controller service database;
-        # and for all following sky.serve.up(), the controller will throw
-        # an exception (name conflict detected) and exit. Therefore the
-        # controller job id in database could be use as an indicator of
-        # whether the service is already running. If the id is the same
-        # with the current job id, we know the service is up and running
-        # for the first time; otherwise it is a name conflict.
-        controller_idle_minutes_to_autostop, controller_down = (
-            controller_utils.get_controller_autostop_config(
-                controller=controller_utils.Controllers.SKY_SERVE_CONTROLLER))
-        # Since the controller may be shared among multiple users, launch the
-        # controller with the API server's user hash.
-        with common.with_server_user_hash():
-            controller_job_id, controller_handle = execution.launch(
-                task=controller_task,
-                cluster_name=controller_name,
-                idle_minutes_to_autostop=controller_idle_minutes_to_autostop,
-                down=controller_down,
-                retry_until_up=True,
-                _disable_controller_check=True,
-            )
-
-        style = colorama.Style
-        fore = colorama.Fore
-
-        assert controller_job_id is not None and controller_handle is not None
-        # TODO(tian): Cache endpoint locally to speedup. Endpoint won't
-        # change after the first time, so there is no consistency issue.
-        with rich_utils.safe_status(
-                ux_utils.spinner_message(
-                    'Waiting for the service to register')):
-            # This function will check the controller job id in the database
-            # and return the endpoint if the job id matches. Otherwise it will
-            # return None.
-            code = serve_utils.ServeCodeGen.wait_service_registration(
-                service_name, controller_job_id)
-            backend = backend_utils.get_backend_from_handle(controller_handle)
-            assert isinstance(backend, backends.CloudVmRayBackend)
-            assert isinstance(controller_handle,
-                              backends.CloudVmRayResourceHandle)
-            returncode, lb_port_payload, _ = backend.run_on_head(
-                controller_handle,
-                code,
-                require_outputs=True,
-                stream_logs=False)
-        try:
-            subprocess_utils.handle_returncode(
-                returncode, code, 'Failed to wait for service initialization',
-                lb_port_payload)
-        except exceptions.CommandError:
-            statuses = backend.get_job_status(controller_handle,
-                                              [controller_job_id],
-                                              stream_logs=False)
-            controller_job_status = list(statuses.values())[0]
-            if controller_job_status == sky.JobStatus.PENDING:
-                # Max number of services reached due to vCPU constraint.
-                # The controller job is pending due to ray job scheduling.
-                # We manually cancel the job here.
-                backend.cancel_jobs(controller_handle, [controller_job_id])
-                with ux_utils.print_exception_no_traceback():
-                    raise RuntimeError(
-                        'Max number of services reached. '
-                        'To spin up more services, please '
-                        'tear down some existing services.') from None
-            else:
-                # Possible cases:
-                # (1) name conflict;
-                # (2) max number of services reached due to memory
-                # constraint. The job will successfully run on the
-                # controller, but there will be an error thrown due
-                # to memory constraint check in the controller.
-                # See sky/serve/service.py for more details.
-                with ux_utils.print_exception_no_traceback():
-                    raise RuntimeError(
-                        'Failed to spin up the service. Please '
-                        'check the logs above for more details.') from None
-        else:
-            lb_port = serve_utils.load_service_initialization_result(
-                lb_port_payload)
-            socket_endpoint = backend_utils.get_endpoints(
-                controller_handle.cluster_name, lb_port,
-                skip_status_check=True).get(lb_port)
-            assert socket_endpoint is not None, (
-                'Did not get endpoint for controller.')
-            # Already checked by validate_service_task
-            assert task.service is not None
-            protocol = ('http'
-                        if task.service.tls_credential is None else 'https')
-            endpoint = f'{protocol}://{socket_endpoint}'
-
-        logger.info(
-            f'{fore.CYAN}Service name: '
-            f'{style.BRIGHT}{service_name}{style.RESET_ALL}'
-            f'\n{fore.CYAN}Endpoint URL: '
-            f'{style.BRIGHT}{endpoint}{style.RESET_ALL}'
-            f'\n📋 Useful Commands'
-            f'\n{ux_utils.INDENT_SYMBOL}To check service status:\t'
-            f'{ux_utils.BOLD}sky serve status {service_name} '
-            f'[--endpoint]{ux_utils.RESET_BOLD}'
-            f'\n{ux_utils.INDENT_SYMBOL}To teardown the service:\t'
-            f'{ux_utils.BOLD}sky serve down {service_name}'
-            f'{ux_utils.RESET_BOLD}'
-            f'\n{ux_utils.INDENT_SYMBOL}To see replica logs:\t'
-            f'{ux_utils.BOLD}sky serve logs {service_name} [REPLICA_ID]'
-            f'{ux_utils.RESET_BOLD}'
-            f'\n{ux_utils.INDENT_SYMBOL}To see load balancer logs:\t'
-            f'{ux_utils.BOLD}sky serve logs --load-balancer {service_name}'
-            f'{ux_utils.RESET_BOLD}'
-            f'\n{ux_utils.INDENT_SYMBOL}To see controller logs:\t'
-            f'{ux_utils.BOLD}sky serve logs --controller {service_name}'
-            f'{ux_utils.RESET_BOLD}'
-            f'\n{ux_utils.INDENT_SYMBOL}To monitor the status:\t'
-            f'{ux_utils.BOLD}watch -n10 sky serve status {service_name}'
-            f'{ux_utils.RESET_BOLD}'
-            f'\n{ux_utils.INDENT_LAST_SYMBOL}To send a test request:\t'
-            f'{ux_utils.BOLD}curl {endpoint}'
-            f'{ux_utils.RESET_BOLD}'
-            '\n\n' +
-            ux_utils.finishing_message('Service is spinning up and replicas '
                                       'will be ready shortly.'))
-        return service_name, endpoint
+    return impl.up(task, service_name, pool=False)
 
 
 @usage_lib.entrypoint
-def update(
-        task: 'sky.Task',
-        service_name: str,
-        mode: serve_utils.UpdateMode = serve_utils.DEFAULT_UPDATE_MODE) -> None:
+def update(task: Optional['sky.Task'],
+           service_name: str,
+           mode: serve_utils.UpdateMode = serve_utils.DEFAULT_UPDATE_MODE,
+           workers: Optional[int] = None) -> None:
     """Updates an existing service.
 
     Please refer to the sky.cli.serve_update for the document.
 
     Args:
-        task: sky.Task to update
+        task: sky.Task to update, or None if updating
+            the number of workers/replicas.
         service_name: Name of the service.
         mode: Update mode.
+        workers: Number of workers/replicas to set for the service when
+            task is None.
     """
-
-    serve_utils.validate_service_task(task)
-
-    # Always apply the policy again here, even though it might have been applied
-    # in the CLI. This is to ensure that we apply the policy to the final DAG
-    # and get the mutated config.
-    # TODO(cblmemo,zhwu): If a user sets a new skypilot_config, the update
-    # will not apply the config.
-    dag, _ = admin_policy_utils.apply(
-        task, use_mutated_config_in_current_request=False)
-    task = dag.tasks[0]
-
-    assert task.service is not None
-    if task.service.tls_credential is not None:
-        logger.warning('Updating TLS keyfile and certfile is not supported. '
-                       'Any updates to the keyfile and certfile will not take '
-                       'effect. To update TLS keyfile and certfile, please '
-                       'tear down the service and spin up a new one.')
-
-    handle = backend_utils.is_controller_accessible(
-        controller=controller_utils.Controllers.SKY_SERVE_CONTROLLER,
-        stopped_message=
-        'Service controller is stopped. There is no service to update. '
-        f'To spin up a new service, use {ux_utils.BOLD}'
-        f'sky serve up{ux_utils.RESET_BOLD}',
-        non_existent_message='Service does not exist. '
-        'To spin up a new service, '
-        f'use {ux_utils.BOLD}sky serve up{ux_utils.RESET_BOLD}',
-    )
-
-    backend = backend_utils.get_backend_from_handle(handle)
-    assert isinstance(backend, backends.CloudVmRayBackend)
-
-    code = serve_utils.ServeCodeGen.get_service_status([service_name])
-    returncode, serve_status_payload, stderr = backend.run_on_head(
-        handle,
-        code,
-        require_outputs=True,
-        stream_logs=False,
-        separate_stderr=True)
-    try:
-        subprocess_utils.handle_returncode(returncode,
-                                           code, 'Failed to get service status '
-                                           'when update service',
-                                           stderr,
-                                           stream_logs=True)
-    except exceptions.CommandError as e:
-        raise RuntimeError(e.error_msg) from e
-
-    service_statuses = serve_utils.load_service_status(serve_status_payload)
-    if not service_statuses:
-        with ux_utils.print_exception_no_traceback():
-            raise RuntimeError(f'Cannot find service {service_name!r}.'
-                               f'To spin up a service, use {ux_utils.BOLD}'
-                               f'sky serve up{ux_utils.RESET_BOLD}')
-
-    if len(service_statuses) > 1:
-        with ux_utils.print_exception_no_traceback():
-            raise RuntimeError(
-                f'Multiple services found for {service_name!r}. ')
-    service_record = service_statuses[0]
-    prompt = None
-    if (service_record['status'] == serve_state.ServiceStatus.CONTROLLER_FAILED
-       ):
-        prompt = (f'Service {service_name!r} has a failed controller. '
-                  'Please clean up the service and try again.')
-    elif (service_record['status'] == serve_state.ServiceStatus.CONTROLLER_INIT
-         ):
-        prompt = (f'Service {service_name!r} is still initializing '
-                  'its controller. Please try again later.')
-    if prompt is not None:
-        with ux_utils.print_exception_no_traceback():
-            raise RuntimeError(prompt)
-
-    original_lb_policy = service_record['load_balancing_policy']
-    assert task.service is not None, 'Service section not found.'
-    if original_lb_policy != task.service.load_balancing_policy:
-        logger.warning(
-            f'{colorama.Fore.YELLOW}Current load balancing policy '
-            f'{original_lb_policy!r} is different from the new policy '
-            f'{task.service.load_balancing_policy!r}. Updating the load '
-            'balancing policy is not supported yet and it will be ignored. '
-            'The service will continue to use the current load balancing '
-            f'policy.{colorama.Style.RESET_ALL}')
-
-    with rich_utils.safe_status(
-            ux_utils.spinner_message('Initializing service')):
-        controller_utils.maybe_translate_local_file_mounts_and_sync_up(
-            task, task_type='serve')
-
-    code = serve_utils.ServeCodeGen.add_version(service_name)
-    returncode, version_string_payload, stderr = backend.run_on_head(
-        handle,
-        code,
-        require_outputs=True,
-        stream_logs=False,
-        separate_stderr=True)
-    try:
-        subprocess_utils.handle_returncode(returncode,
-                                           code,
-                                           'Failed to add version',
-                                           stderr,
-                                           stream_logs=True)
-    except exceptions.CommandError as e:
-        raise RuntimeError(e.error_msg) from e
-
-    version_string = serve_utils.load_version_string(version_string_payload)
-    try:
-        current_version = int(version_string)
-    except ValueError as e:
-        with ux_utils.print_exception_no_traceback():
-            raise ValueError(f'Failed to parse version: {version_string}; '
-                             f'Returncode: {returncode}') from e
-
-    print(f'New version: {current_version}')
-    with tempfile.NamedTemporaryFile(
-            prefix=f'{service_name}-v{current_version}',
-            mode='w') as service_file:
-        task_config = task.to_yaml_config()
-        common_utils.dump_yaml(service_file.name, task_config)
-        remote_task_yaml_path = serve_utils.generate_task_yaml_file_name(
-            service_name, current_version, expand_user=False)
-
-        backend.sync_file_mounts(handle,
-                                 {remote_task_yaml_path: service_file.name},
-                                 storage_mounts=None)
-
-        code = serve_utils.ServeCodeGen.update_service(service_name,
-                                                       current_version,
-                                                       mode=mode.value)
-        returncode, _, stderr = backend.run_on_head(handle,
-                                                    code,
-                                                    require_outputs=True,
-                                                    stream_logs=False,
-                                                    separate_stderr=True)
-        try:
-            subprocess_utils.handle_returncode(returncode,
-                                               code,
-                                               'Failed to update services',
-                                               stderr,
-                                               stream_logs=True)
-        except exceptions.CommandError as e:
-            raise RuntimeError(e.error_msg) from e
-
-    print(f'{colorama.Fore.GREEN}Service {service_name!r} update scheduled.'
-          f'{colorama.Style.RESET_ALL}\n'
-          f'Please use {ux_utils.BOLD}sky serve status {service_name} '
-          f'{ux_utils.RESET_BOLD}to check the latest status.')
+    return impl.update(task, service_name, mode, pool=False, workers=workers)
 
 
 @usage_lib.entrypoint
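Both `up()` and `update()` now reduce to one-line calls into `sky.serve.server.impl` with `pool=False`, so SkyServe services and the newer cluster pools can share a single implementation. A toy sketch of this thin-wrapper refactor follows; the names here are illustrative stand-ins, not the actual `impl` API.

from typing import Optional


def _up_impl(task: dict, name: Optional[str], pool: bool) -> str:
    # Stand-in for the shared implementation in sky.serve.server.impl.
    kind = 'pool' if pool else 'service'
    return f'launched {kind} {name!r} for task {task["name"]!r}'


def up(task: dict, service_name: Optional[str] = None) -> str:
    """Public serve entrypoint: a thin wrapper over the shared impl."""
    return _up_impl(task, service_name, pool=False)


def pool_up(task: dict, pool_name: Optional[str] = None) -> str:
    """A pools entrypoint can reuse the exact same implementation."""
    return _up_impl(task, pool_name, pool=True)


print(up({'name': 'demo'}, 'my-service'))

The design keeps the public API (and its `@usage_lib.entrypoint` decorators and docstrings) stable while the two product surfaces diverge only through the `pool` flag.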
@@ -521,46 +87,7 @@ def down(
         ValueError: if the arguments are invalid.
         RuntimeError: if failed to terminate the service.
     """
-    if service_names is None:
-        service_names = []
-    if isinstance(service_names, str):
-        service_names = [service_names]
-    handle = backend_utils.is_controller_accessible(
-        controller=controller_utils.Controllers.SKY_SERVE_CONTROLLER,
-        stopped_message='All services should have terminated.')
-
-    service_names_str = ','.join(service_names)
-    if sum([bool(service_names), all]) != 1:
-        argument_str = (f'service_names={service_names_str}'
-                        if service_names else '')
-        argument_str += ' all' if all else ''
-        raise ValueError('Can only specify one of service_names or all. '
-                         f'Provided {argument_str!r}.')
-
-    backend = backend_utils.get_backend_from_handle(handle)
-    assert isinstance(backend, backends.CloudVmRayBackend)
-    service_names = None if all else service_names
-    code = serve_utils.ServeCodeGen.terminate_services(service_names, purge)
-
-    try:
-        returncode, stdout, _ = backend.run_on_head(handle,
-                                                    code,
-                                                    require_outputs=True,
-                                                    stream_logs=False)
-    except exceptions.FetchClusterInfoError as e:
-        raise RuntimeError(
-            'Failed to fetch controller IP. Please refresh controller status '
-            f'by `sky status -r {common.SKY_SERVE_CONTROLLER_NAME}` '
-            'and try again.') from e
-
-    try:
-        subprocess_utils.handle_returncode(returncode, code,
-                                           'Failed to terminate service',
-                                           stdout)
-    except exceptions.CommandError as e:
-        raise RuntimeError(e.error_msg) from e
-
-    logger.info(stdout)
+    return impl.down(service_names, all, purge, pool=False)
 
 
 @usage_lib.entrypoint
@@ -587,25 +114,37 @@ def terminate_replica(service_name: str, replica_id: int, purge: bool) -> None:
         'Please spin up a service first.',
     )
 
-    backend = backend_utils.get_backend_from_handle(handle)
-    assert isinstance(backend, backends.CloudVmRayBackend)
-
-    code = serve_utils.ServeCodeGen.terminate_replica(service_name, replica_id,
-                                                      purge)
-    returncode, stdout, stderr = backend.run_on_head(handle,
-                                                     code,
-                                                     require_outputs=True,
-                                                     stream_logs=False,
-                                                     separate_stderr=True)
-
-    try:
-        subprocess_utils.handle_returncode(returncode,
-                                           code,
-                                           'Failed to terminate the replica',
-                                           stderr,
-                                           stream_logs=True)
-    except exceptions.CommandError as e:
-        raise RuntimeError(e.error_msg) from e
+    assert isinstance(handle, backends.CloudVmRayResourceHandle)
+    use_legacy = not handle.is_grpc_enabled_with_flag
+
+    if not use_legacy:
+        try:
+            stdout = serve_rpc_utils.RpcRunner.terminate_replica(
+                handle, service_name, replica_id, purge)
+        except exceptions.SkyletMethodNotImplementedError:
+            use_legacy = True
+
+    if use_legacy:
+        backend = backend_utils.get_backend_from_handle(handle)
+        assert isinstance(backend, backends.CloudVmRayBackend)
+
+        code = serve_utils.ServeCodeGen.terminate_replica(
+            service_name, replica_id, purge)
+        returncode, stdout, stderr = backend.run_on_head(handle,
+                                                         code,
+                                                         require_outputs=True,
+                                                         stream_logs=False,
+                                                         separate_stderr=True)
+
+        try:
+            subprocess_utils.handle_returncode(
+                returncode,
+                code,
+                'Failed to terminate the replica',
+                stderr,
+                stream_logs=True)
+        except exceptions.CommandError as e:
+            raise RuntimeError(e.error_msg) from e
 
     sky_logging.print(stdout)
 
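The rewritten `terminate_replica()` illustrates the version-compatibility pattern used across this release: try the new gRPC path first, and fall back to the legacy SSH/code-generation path when the remote skylet predates the RPC. A self-contained sketch of that probe-and-fallback shape follows; all names below are illustrative stand-ins, not SkyPilot APIs.

class MethodNotImplemented(Exception):
    """Stand-in for exceptions.SkyletMethodNotImplementedError."""


def rpc_terminate(replica_id: int) -> str:
    # Simulate an older remote server that lacks the new RPC.
    raise MethodNotImplemented()


def legacy_terminate(replica_id: int) -> str:
    return f'replica {replica_id} terminated via legacy codegen path'


def terminate_replica(replica_id: int, grpc_enabled: bool) -> str:
    use_legacy = not grpc_enabled
    if not use_legacy:
        try:
            return rpc_terminate(replica_id)
        except MethodNotImplemented:
            # The server is too old for the gRPC call: degrade gracefully.
            use_legacy = True
    return legacy_terminate(replica_id)


print(terminate_replica(1, grpc_enabled=True))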
@@ -669,60 +208,7 @@ def status(
         RuntimeError: if failed to get the service status.
         exceptions.ClusterNotUpError: if the sky serve controller is not up.
     """
-
-    if isinstance(service_names, str):
-        service_names = [service_names]
-
-    try:
-        backend_utils.check_network_connection()
-    except exceptions.NetworkError as e:
-        with ux_utils.print_exception_no_traceback():
-            raise RuntimeError(
-                'Failed to refresh service status due to network error.') from e
-
-    controller_type = controller_utils.Controllers.SKY_SERVE_CONTROLLER
-    handle = backend_utils.is_controller_accessible(
-        controller=controller_type,
-        stopped_message=controller_type.value.default_hint_if_non_existent)
-
-    backend = backend_utils.get_backend_from_handle(handle)
-    assert isinstance(backend, backends.CloudVmRayBackend)
-
-    code = serve_utils.ServeCodeGen.get_service_status(service_names)
-    returncode, serve_status_payload, stderr = backend.run_on_head(
-        handle,
-        code,
-        require_outputs=True,
-        stream_logs=False,
-        separate_stderr=True)
-
-    try:
-        subprocess_utils.handle_returncode(returncode,
-                                           code,
-                                           'Failed to fetch services',
-                                           stderr,
-                                           stream_logs=True)
-    except exceptions.CommandError as e:
-        raise RuntimeError(e.error_msg) from e
-
-    service_records = serve_utils.load_service_status(serve_status_payload)
-    # Get the endpoint for each service
-    for service_record in service_records:
-        service_record['endpoint'] = None
-        if service_record['load_balancer_port'] is not None:
-            try:
-                endpoint = backend_utils.get_endpoints(
-                    cluster=common.SKY_SERVE_CONTROLLER_NAME,
-                    port=service_record['load_balancer_port']).get(
-                        service_record['load_balancer_port'], None)
-            except exceptions.ClusterNotUpError:
-                pass
-            else:
-                protocol = ('https'
-                            if service_record['tls_encrypted'] else 'http')
-                service_record['endpoint'] = f'{protocol}://{endpoint}'
-
-    return service_records
+    return impl.status(service_names, pool=False)
 
 
 ServiceComponentOrStr = Union[str, serve_utils.ServiceComponent]
@@ -735,6 +221,7 @@ def tail_logs(
|
|
|
735
221
|
target: ServiceComponentOrStr,
|
|
736
222
|
replica_id: Optional[int] = None,
|
|
737
223
|
follow: bool = True,
|
|
224
|
+
tail: Optional[int] = None,
|
|
738
225
|
) -> None:
|
|
739
226
|
"""Tails logs for a service.
|
|
740
227
|
|
|
@@ -769,56 +256,12 @@ def tail_logs(
         sky.exceptions.ClusterNotUpError: the sky serve controller is not up.
         ValueError: arguments not valid, or failed to tail the logs.
     """
-
-
-
-
-
-
-
-    if target == serve_utils.ServiceComponent.REPLICA:
-        if replica_id is None:
-            with ux_utils.print_exception_no_traceback():
-                raise ValueError(
-                    '`replica_id` must be specified when using target=REPLICA.')
-    else:
-        if replica_id is not None:
-            with ux_utils.print_exception_no_traceback():
-                raise ValueError('`replica_id` must be None when using '
-                                 'target=CONTROLLER/LOAD_BALANCER.')
-
-    controller_type = controller_utils.Controllers.SKY_SERVE_CONTROLLER
-    handle = backend_utils.is_controller_accessible(
-        controller=controller_type,
-        stopped_message=controller_type.value.default_hint_if_non_existent)
-
-    backend = backend_utils.get_backend_from_handle(handle)
-    assert isinstance(backend, backends.CloudVmRayBackend), backend
-
-    if target != serve_utils.ServiceComponent.REPLICA:
-        code = serve_utils.ServeCodeGen.stream_serve_process_logs(
-            service_name,
-            stream_controller=(
-                target == serve_utils.ServiceComponent.CONTROLLER),
-            follow=follow)
-    else:
-        assert replica_id is not None, service_name
-        code = serve_utils.ServeCodeGen.stream_replica_logs(
-            service_name, replica_id, follow)
-
-    # With the stdin=subprocess.DEVNULL, the ctrl-c will not directly
-    # kill the process, so we need to handle it manually here.
-    if threading.current_thread() is threading.main_thread():
-        signal.signal(signal.SIGINT, backend_utils.interrupt_handler)
-        signal.signal(signal.SIGTSTP, backend_utils.stop_handler)
-
-    # Refer to the notes in
-    # sky/backends/cloud_vm_ray_backend.py::CloudVmRayBackend::tail_logs.
-    backend.run_on_head(handle,
-                        code,
-                        stream_logs=True,
-                        process_stream=False,
-                        ssh_mode=command_runner.SshMode.INTERACTIVE)
+    return impl.tail_logs(service_name,
+                          target=target,
+                          replica_id=replica_id,
+                          follow=follow,
+                          tail=tail,
+                          pool=False)
 
 
 @usage_lib.entrypoint
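Combined with the signature hunk above, tail_logs now accepts a tail argument and forwards everything to impl.tail_logs with pool=False. A hypothetical call against this module's function; treating tail=100 as "start from the last 100 lines" is an assumption, since the hunk only shows the parameter being threaded through:

    from sky.serve import serve_utils

    # Follow replica 1's log, starting from (assumed) the last 100 lines.
    tail_logs('my-service',
              target=serve_utils.ServiceComponent.REPLICA,
              replica_id=1,
              follow=True,
              tail=100)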
@@ -829,6 +272,7 @@ def sync_down_logs(
     targets: Union[ServiceComponentOrStr, List[ServiceComponentOrStr],
                    None] = None,
     replica_ids: Optional[List[int]] = None,
+    tail: Optional[int] = None,
 ) -> str:
     """Sync down logs from the controller for the given service.
 
@@ -862,98 +306,9 @@ def sync_down_logs(
         sky.exceptions.ClusterNotUpError: If the controller is not up.
         ValueError: Arguments not valid.
     """
-
-
-
-
-
-
-        stopped_message=controller_type.value.default_hint_if_non_existent)
-    backend: backends.CloudVmRayBackend = (
-        backend_utils.get_backend_from_handle(handle))
-
-    requested_components: Set[serve_utils.ServiceComponent] = set()
-    if not targets:
-        # No targets specified -> request all components
-        requested_components = {
-            serve_utils.ServiceComponent.CONTROLLER,
-            serve_utils.ServiceComponent.LOAD_BALANCER,
-            serve_utils.ServiceComponent.REPLICA
-        }
-    else:
-        # Parse provided targets
-        if isinstance(targets, (str, serve_utils.ServiceComponent)):
-            requested_components = {serve_utils.ServiceComponent(targets)}
-        else:  # list
-            requested_components = {
-                serve_utils.ServiceComponent(t) for t in targets
-            }
-
-    normalized_targets: Set[serve_utils.ServiceComponentTarget] = set()
-    if serve_utils.ServiceComponent.CONTROLLER in requested_components:
-        normalized_targets.add(
-            serve_utils.ServiceComponentTarget(
-                serve_utils.ServiceComponent.CONTROLLER))
-    if serve_utils.ServiceComponent.LOAD_BALANCER in requested_components:
-        normalized_targets.add(
-            serve_utils.ServiceComponentTarget(
-                serve_utils.ServiceComponent.LOAD_BALANCER))
-    if serve_utils.ServiceComponent.REPLICA in requested_components:
-        with rich_utils.safe_status(
-                ux_utils.spinner_message('Getting live replica infos...')):
-            replica_targets = _get_all_replica_targets(service_name, backend,
-                                                       handle)
-        if not replica_ids:
-            # Replica target requested but no specific IDs
-            # -> Get all replica logs
-            normalized_targets.update(replica_targets)
-        else:
-            # Replica target requested with specific IDs
-            requested_replica_targets = [
-                serve_utils.ServiceComponentTarget(
-                    serve_utils.ServiceComponent.REPLICA, rid)
-                for rid in replica_ids
-            ]
-            for target in requested_replica_targets:
-                if target not in replica_targets:
-                    logger.warning(f'Replica ID {target.replica_id} not found '
-                                   f'for {service_name}. Skipping...')
-                else:
-                    normalized_targets.add(target)
-
-    def sync_down_logs_by_target(target: serve_utils.ServiceComponentTarget):
-        component = target.component
-        # We need to set one side of the pipe to a logs stream, and the other
-        # side to a file.
-        log_path = str(pathlib.Path(local_dir) / f'{target}.log')
-        stream_logs_code: str
-
-        if component == serve_utils.ServiceComponent.CONTROLLER:
-            stream_logs_code = (
-                serve_utils.ServeCodeGen.stream_serve_process_logs(
-                    service_name, stream_controller=True, follow=False))
-        elif component == serve_utils.ServiceComponent.LOAD_BALANCER:
-            stream_logs_code = (
-                serve_utils.ServeCodeGen.stream_serve_process_logs(
-                    service_name, stream_controller=False, follow=False))
-        elif component == serve_utils.ServiceComponent.REPLICA:
-            replica_id = target.replica_id
-            assert replica_id is not None, service_name
-            stream_logs_code = serve_utils.ServeCodeGen.stream_replica_logs(
-                service_name, replica_id, follow=False)
-        else:
-            assert False, component
-
-        # Refer to the notes in
-        # sky/backends/cloud_vm_ray_backend.py::CloudVmRayBackend::tail_logs.
-        backend.run_on_head(handle,
-                            stream_logs_code,
-                            stream_logs=False,
-                            process_stream=False,
-                            ssh_mode=command_runner.SshMode.INTERACTIVE,
-                            log_path=log_path)
-
-    subprocess_utils.run_in_parallel(sync_down_logs_by_target,
-                                     list(normalized_targets))
-
-    return local_dir
+    return impl.sync_down_logs(service_name,
+                               local_dir=local_dir,
+                               targets=targets,
+                               replica_ids=replica_ids,
+                               tail=tail,
+                               pool=False)
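As with tail_logs, sync_down_logs is now a thin wrapper over impl.sync_down_logs. A hypothetical call; reading tail as "limit each synced log to its last N lines" is an assumption, while targets=None requesting all components and the directory return value both match the removed body:

    # Download controller, load-balancer, and all replica logs; the removed
    # body returned local_dir, so the wrapper is expected to as well.
    log_dir = sync_down_logs('my-service',
                             local_dir='/tmp/my-service-logs',
                             targets=None,
                             replica_ids=None,
                             tail=1000)
    print(f'Logs synced to {log_dir}')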