skypilot-nightly 1.0.0.dev20250905__py3-none-any.whl → 1.0.0.dev20251203__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sky/__init__.py +10 -2
- sky/adaptors/aws.py +81 -16
- sky/adaptors/common.py +25 -2
- sky/adaptors/coreweave.py +278 -0
- sky/adaptors/do.py +8 -2
- sky/adaptors/gcp.py +11 -0
- sky/adaptors/ibm.py +5 -2
- sky/adaptors/kubernetes.py +64 -0
- sky/adaptors/nebius.py +3 -1
- sky/adaptors/primeintellect.py +1 -0
- sky/adaptors/seeweb.py +183 -0
- sky/adaptors/shadeform.py +89 -0
- sky/admin_policy.py +20 -0
- sky/authentication.py +157 -263
- sky/backends/__init__.py +3 -2
- sky/backends/backend.py +11 -3
- sky/backends/backend_utils.py +588 -184
- sky/backends/cloud_vm_ray_backend.py +1088 -904
- sky/backends/local_docker_backend.py +9 -5
- sky/backends/task_codegen.py +633 -0
- sky/backends/wheel_utils.py +18 -0
- sky/catalog/__init__.py +8 -0
- sky/catalog/aws_catalog.py +4 -0
- sky/catalog/common.py +19 -1
- sky/catalog/data_fetchers/fetch_aws.py +102 -80
- sky/catalog/data_fetchers/fetch_gcp.py +30 -3
- sky/catalog/data_fetchers/fetch_nebius.py +9 -6
- sky/catalog/data_fetchers/fetch_runpod.py +698 -0
- sky/catalog/data_fetchers/fetch_seeweb.py +329 -0
- sky/catalog/data_fetchers/fetch_shadeform.py +142 -0
- sky/catalog/kubernetes_catalog.py +24 -28
- sky/catalog/primeintellect_catalog.py +95 -0
- sky/catalog/runpod_catalog.py +5 -1
- sky/catalog/seeweb_catalog.py +184 -0
- sky/catalog/shadeform_catalog.py +165 -0
- sky/check.py +73 -43
- sky/client/cli/command.py +675 -412
- sky/client/cli/flags.py +4 -2
- sky/{volumes/utils.py → client/cli/table_utils.py} +111 -13
- sky/client/cli/utils.py +79 -0
- sky/client/common.py +12 -2
- sky/client/sdk.py +132 -63
- sky/client/sdk_async.py +34 -33
- sky/cloud_stores.py +82 -3
- sky/clouds/__init__.py +6 -0
- sky/clouds/aws.py +337 -129
- sky/clouds/azure.py +24 -18
- sky/clouds/cloud.py +40 -13
- sky/clouds/cudo.py +16 -13
- sky/clouds/do.py +9 -7
- sky/clouds/fluidstack.py +12 -5
- sky/clouds/gcp.py +14 -7
- sky/clouds/hyperbolic.py +12 -5
- sky/clouds/ibm.py +12 -5
- sky/clouds/kubernetes.py +80 -45
- sky/clouds/lambda_cloud.py +12 -5
- sky/clouds/nebius.py +23 -9
- sky/clouds/oci.py +19 -12
- sky/clouds/paperspace.py +4 -1
- sky/clouds/primeintellect.py +317 -0
- sky/clouds/runpod.py +85 -24
- sky/clouds/scp.py +12 -8
- sky/clouds/seeweb.py +477 -0
- sky/clouds/shadeform.py +400 -0
- sky/clouds/ssh.py +4 -2
- sky/clouds/utils/scp_utils.py +61 -50
- sky/clouds/vast.py +33 -27
- sky/clouds/vsphere.py +14 -16
- sky/core.py +174 -165
- sky/dashboard/out/404.html +1 -1
- sky/dashboard/out/_next/static/96_E2yl3QAiIJGOYCkSpB/_buildManifest.js +1 -0
- sky/dashboard/out/_next/static/chunks/1141-e6aa9ab418717c59.js +11 -0
- sky/dashboard/out/_next/static/chunks/1871-7e202677c42f43fe.js +6 -0
- sky/dashboard/out/_next/static/chunks/2260-7703229c33c5ebd5.js +1 -0
- sky/dashboard/out/_next/static/chunks/2369.fc20f0c2c8ed9fe7.js +15 -0
- sky/dashboard/out/_next/static/chunks/2755.edd818326d489a1d.js +26 -0
- sky/dashboard/out/_next/static/chunks/3294.20a8540fe697d5ee.js +1 -0
- sky/dashboard/out/_next/static/chunks/3785.7e245f318f9d1121.js +1 -0
- sky/dashboard/out/_next/static/chunks/{6601-06114c982db410b6.js → 3800-7b45f9fbb6308557.js} +1 -1
- sky/dashboard/out/_next/static/chunks/4725.172ede95d1b21022.js +1 -0
- sky/dashboard/out/_next/static/chunks/4937.a2baa2df5572a276.js +15 -0
- sky/dashboard/out/_next/static/chunks/6212-7bd06f60ba693125.js +13 -0
- sky/dashboard/out/_next/static/chunks/6856-8f27d1c10c98def8.js +1 -0
- sky/dashboard/out/_next/static/chunks/6990-9146207c4567fdfd.js +1 -0
- sky/dashboard/out/_next/static/chunks/7359-c8d04e06886000b3.js +30 -0
- sky/dashboard/out/_next/static/chunks/7615-019513abc55b3b47.js +1 -0
- sky/dashboard/out/_next/static/chunks/8640.5b9475a2d18c5416.js +16 -0
- sky/dashboard/out/_next/static/chunks/8969-452f9d5cbdd2dc73.js +1 -0
- sky/dashboard/out/_next/static/chunks/9025.fa408f3242e9028d.js +6 -0
- sky/dashboard/out/_next/static/chunks/9353-cff34f7e773b2e2b.js +1 -0
- sky/dashboard/out/_next/static/chunks/9360.a536cf6b1fa42355.js +31 -0
- sky/dashboard/out/_next/static/chunks/9847.3aaca6bb33455140.js +30 -0
- sky/dashboard/out/_next/static/chunks/pages/{_app-ce361c6959bc2001.js → _app-bde01e4a2beec258.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-792db96d918c98c9.js +16 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-abfcac9c137aa543.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters-ee39056f9851a3ff.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/infra/{[context]-6563820e094f68ca.js → [context]-c0b5935149902e6f.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/{infra-aabba60d57826e0f.js → infra-aed0ea19df7cf961.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-d66997e2bfc837cf.js +16 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs/pools/[pool]-9faf940b253e3e06.js +21 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs-2072b48b617989c9.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/users-f42674164aa73423.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/volumes-b84b948ff357c43e.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/workspaces/{[name]-af76bb06dbb3954f.js → [name]-84a40f8c7c627fe4.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/{workspaces-7598c33a746cdc91.js → workspaces-531b2f8c4bf89f82.js} +1 -1
- sky/dashboard/out/_next/static/chunks/webpack-64e05f17bf2cf8ce.js +1 -0
- sky/dashboard/out/_next/static/css/0748ce22df867032.css +3 -0
- sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
- sky/dashboard/out/clusters/[cluster].html +1 -1
- sky/dashboard/out/clusters.html +1 -1
- sky/dashboard/out/config.html +1 -1
- sky/dashboard/out/index.html +1 -1
- sky/dashboard/out/infra/[context].html +1 -1
- sky/dashboard/out/infra.html +1 -1
- sky/dashboard/out/jobs/[job].html +1 -1
- sky/dashboard/out/jobs/pools/[pool].html +1 -1
- sky/dashboard/out/jobs.html +1 -1
- sky/dashboard/out/users.html +1 -1
- sky/dashboard/out/volumes.html +1 -1
- sky/dashboard/out/workspace/new.html +1 -1
- sky/dashboard/out/workspaces/[name].html +1 -1
- sky/dashboard/out/workspaces.html +1 -1
- sky/data/data_utils.py +92 -1
- sky/data/mounting_utils.py +162 -29
- sky/data/storage.py +200 -19
- sky/data/storage_utils.py +10 -45
- sky/exceptions.py +18 -7
- sky/execution.py +74 -31
- sky/global_user_state.py +605 -191
- sky/jobs/__init__.py +2 -0
- sky/jobs/client/sdk.py +101 -4
- sky/jobs/client/sdk_async.py +31 -5
- sky/jobs/constants.py +15 -8
- sky/jobs/controller.py +726 -284
- sky/jobs/file_content_utils.py +128 -0
- sky/jobs/log_gc.py +193 -0
- sky/jobs/recovery_strategy.py +250 -100
- sky/jobs/scheduler.py +271 -173
- sky/jobs/server/core.py +367 -114
- sky/jobs/server/server.py +81 -35
- sky/jobs/server/utils.py +89 -35
- sky/jobs/state.py +1498 -620
- sky/jobs/utils.py +771 -306
- sky/logs/agent.py +40 -5
- sky/logs/aws.py +9 -19
- sky/metrics/utils.py +282 -39
- sky/optimizer.py +1 -1
- sky/provision/__init__.py +37 -1
- sky/provision/aws/config.py +34 -13
- sky/provision/aws/instance.py +5 -2
- sky/provision/azure/instance.py +5 -3
- sky/provision/common.py +2 -0
- sky/provision/cudo/instance.py +4 -3
- sky/provision/do/instance.py +4 -3
- sky/provision/docker_utils.py +97 -26
- sky/provision/fluidstack/instance.py +6 -5
- sky/provision/gcp/config.py +6 -1
- sky/provision/gcp/instance.py +4 -2
- sky/provision/hyperbolic/instance.py +4 -2
- sky/provision/instance_setup.py +66 -20
- sky/provision/kubernetes/__init__.py +2 -0
- sky/provision/kubernetes/config.py +7 -44
- sky/provision/kubernetes/constants.py +0 -1
- sky/provision/kubernetes/instance.py +609 -213
- sky/provision/kubernetes/manifests/fusermount-server-daemonset.yaml +1 -2
- sky/provision/kubernetes/network.py +12 -8
- sky/provision/kubernetes/network_utils.py +8 -25
- sky/provision/kubernetes/utils.py +382 -418
- sky/provision/kubernetes/volume.py +150 -18
- sky/provision/lambda_cloud/instance.py +16 -13
- sky/provision/nebius/instance.py +6 -2
- sky/provision/nebius/utils.py +103 -86
- sky/provision/oci/instance.py +4 -2
- sky/provision/paperspace/instance.py +4 -3
- sky/provision/primeintellect/__init__.py +10 -0
- sky/provision/primeintellect/config.py +11 -0
- sky/provision/primeintellect/instance.py +454 -0
- sky/provision/primeintellect/utils.py +398 -0
- sky/provision/provisioner.py +30 -9
- sky/provision/runpod/__init__.py +2 -0
- sky/provision/runpod/instance.py +4 -3
- sky/provision/runpod/volume.py +69 -13
- sky/provision/scp/instance.py +307 -130
- sky/provision/seeweb/__init__.py +11 -0
- sky/provision/seeweb/config.py +13 -0
- sky/provision/seeweb/instance.py +812 -0
- sky/provision/shadeform/__init__.py +11 -0
- sky/provision/shadeform/config.py +12 -0
- sky/provision/shadeform/instance.py +351 -0
- sky/provision/shadeform/shadeform_utils.py +83 -0
- sky/provision/vast/instance.py +5 -3
- sky/provision/volume.py +164 -0
- sky/provision/vsphere/common/ssl_helper.py +1 -1
- sky/provision/vsphere/common/vapiconnect.py +2 -1
- sky/provision/vsphere/common/vim_utils.py +3 -2
- sky/provision/vsphere/instance.py +8 -6
- sky/provision/vsphere/vsphere_utils.py +8 -1
- sky/resources.py +11 -3
- sky/schemas/api/responses.py +107 -6
- sky/schemas/db/global_user_state/008_skylet_ssh_tunnel_metadata.py +34 -0
- sky/schemas/db/global_user_state/009_last_activity_and_launched_at.py +89 -0
- sky/schemas/db/global_user_state/010_save_ssh_key.py +66 -0
- sky/schemas/db/global_user_state/011_is_ephemeral.py +34 -0
- sky/schemas/db/kv_cache/001_initial_schema.py +29 -0
- sky/schemas/db/serve_state/002_yaml_content.py +34 -0
- sky/schemas/db/skypilot_config/001_initial_schema.py +30 -0
- sky/schemas/db/spot_jobs/002_cluster_pool.py +3 -3
- sky/schemas/db/spot_jobs/004_job_file_contents.py +42 -0
- sky/schemas/db/spot_jobs/005_logs_gc.py +38 -0
- sky/schemas/db/spot_jobs/006_controller_pid_started_at.py +34 -0
- sky/schemas/db/spot_jobs/007_config_file_content.py +34 -0
- sky/schemas/generated/jobsv1_pb2.py +86 -0
- sky/schemas/generated/jobsv1_pb2.pyi +254 -0
- sky/schemas/generated/jobsv1_pb2_grpc.py +542 -0
- sky/schemas/generated/managed_jobsv1_pb2.py +76 -0
- sky/schemas/generated/managed_jobsv1_pb2.pyi +278 -0
- sky/schemas/generated/managed_jobsv1_pb2_grpc.py +278 -0
- sky/schemas/generated/servev1_pb2.py +58 -0
- sky/schemas/generated/servev1_pb2.pyi +115 -0
- sky/schemas/generated/servev1_pb2_grpc.py +322 -0
- sky/serve/autoscalers.py +2 -0
- sky/serve/client/impl.py +55 -21
- sky/serve/constants.py +4 -3
- sky/serve/controller.py +17 -11
- sky/serve/load_balancing_policies.py +1 -1
- sky/serve/replica_managers.py +219 -142
- sky/serve/serve_rpc_utils.py +179 -0
- sky/serve/serve_state.py +63 -54
- sky/serve/serve_utils.py +145 -109
- sky/serve/server/core.py +46 -25
- sky/serve/server/impl.py +311 -162
- sky/serve/server/server.py +21 -19
- sky/serve/service.py +84 -68
- sky/serve/service_spec.py +45 -7
- sky/server/auth/loopback.py +38 -0
- sky/server/auth/oauth2_proxy.py +12 -7
- sky/server/common.py +47 -24
- sky/server/config.py +62 -28
- sky/server/constants.py +9 -1
- sky/server/daemons.py +109 -38
- sky/server/metrics.py +76 -96
- sky/server/middleware_utils.py +166 -0
- sky/server/requests/executor.py +381 -145
- sky/server/requests/payloads.py +71 -18
- sky/server/requests/preconditions.py +15 -13
- sky/server/requests/request_names.py +121 -0
- sky/server/requests/requests.py +507 -157
- sky/server/requests/serializers/decoders.py +48 -17
- sky/server/requests/serializers/encoders.py +85 -20
- sky/server/requests/threads.py +117 -0
- sky/server/rest.py +116 -24
- sky/server/server.py +420 -172
- sky/server/stream_utils.py +219 -45
- sky/server/uvicorn.py +30 -19
- sky/setup_files/MANIFEST.in +6 -1
- sky/setup_files/alembic.ini +8 -0
- sky/setup_files/dependencies.py +62 -19
- sky/setup_files/setup.py +44 -44
- sky/sky_logging.py +13 -5
- sky/skylet/attempt_skylet.py +106 -24
- sky/skylet/configs.py +3 -1
- sky/skylet/constants.py +111 -26
- sky/skylet/events.py +64 -10
- sky/skylet/job_lib.py +141 -104
- sky/skylet/log_lib.py +233 -5
- sky/skylet/log_lib.pyi +40 -2
- sky/skylet/providers/ibm/node_provider.py +12 -8
- sky/skylet/providers/ibm/vpc_provider.py +13 -12
- sky/skylet/runtime_utils.py +21 -0
- sky/skylet/services.py +524 -0
- sky/skylet/skylet.py +22 -1
- sky/skylet/subprocess_daemon.py +104 -29
- sky/skypilot_config.py +99 -79
- sky/ssh_node_pools/server.py +9 -8
- sky/task.py +221 -104
- sky/templates/aws-ray.yml.j2 +1 -0
- sky/templates/azure-ray.yml.j2 +1 -0
- sky/templates/cudo-ray.yml.j2 +1 -0
- sky/templates/do-ray.yml.j2 +1 -0
- sky/templates/fluidstack-ray.yml.j2 +1 -0
- sky/templates/gcp-ray.yml.j2 +1 -0
- sky/templates/hyperbolic-ray.yml.j2 +1 -0
- sky/templates/ibm-ray.yml.j2 +2 -1
- sky/templates/jobs-controller.yaml.j2 +3 -0
- sky/templates/kubernetes-ray.yml.j2 +196 -55
- sky/templates/lambda-ray.yml.j2 +1 -0
- sky/templates/nebius-ray.yml.j2 +3 -0
- sky/templates/oci-ray.yml.j2 +1 -0
- sky/templates/paperspace-ray.yml.j2 +1 -0
- sky/templates/primeintellect-ray.yml.j2 +72 -0
- sky/templates/runpod-ray.yml.j2 +1 -0
- sky/templates/scp-ray.yml.j2 +1 -0
- sky/templates/seeweb-ray.yml.j2 +171 -0
- sky/templates/shadeform-ray.yml.j2 +73 -0
- sky/templates/vast-ray.yml.j2 +1 -0
- sky/templates/vsphere-ray.yml.j2 +1 -0
- sky/templates/websocket_proxy.py +188 -43
- sky/usage/usage_lib.py +16 -4
- sky/users/permission.py +60 -43
- sky/utils/accelerator_registry.py +6 -3
- sky/utils/admin_policy_utils.py +18 -5
- sky/utils/annotations.py +22 -0
- sky/utils/asyncio_utils.py +78 -0
- sky/utils/atomic.py +1 -1
- sky/utils/auth_utils.py +153 -0
- sky/utils/cli_utils/status_utils.py +12 -7
- sky/utils/cluster_utils.py +28 -6
- sky/utils/command_runner.py +88 -27
- sky/utils/command_runner.pyi +36 -3
- sky/utils/common.py +3 -1
- sky/utils/common_utils.py +37 -4
- sky/utils/config_utils.py +1 -14
- sky/utils/context.py +127 -40
- sky/utils/context_utils.py +73 -18
- sky/utils/controller_utils.py +229 -70
- sky/utils/db/db_utils.py +95 -18
- sky/utils/db/kv_cache.py +149 -0
- sky/utils/db/migration_utils.py +24 -7
- sky/utils/env_options.py +4 -0
- sky/utils/git.py +559 -1
- sky/utils/kubernetes/create_cluster.sh +15 -30
- sky/utils/kubernetes/delete_cluster.sh +10 -7
- sky/utils/kubernetes/{deploy_remote_cluster.py → deploy_ssh_node_pools.py} +258 -380
- sky/utils/kubernetes/generate_kind_config.py +6 -66
- sky/utils/kubernetes/gpu_labeler.py +13 -3
- sky/utils/kubernetes/k8s_gpu_labeler_job.yaml +2 -1
- sky/utils/kubernetes/k8s_gpu_labeler_setup.yaml +16 -16
- sky/utils/kubernetes/kubernetes_deploy_utils.py +213 -194
- sky/utils/kubernetes/rsync_helper.sh +11 -3
- sky/utils/kubernetes_enums.py +7 -15
- sky/utils/lock_events.py +4 -4
- sky/utils/locks.py +128 -31
- sky/utils/log_utils.py +0 -319
- sky/utils/resource_checker.py +13 -10
- sky/utils/resources_utils.py +53 -29
- sky/utils/rich_utils.py +8 -4
- sky/utils/schemas.py +107 -52
- sky/utils/subprocess_utils.py +17 -4
- sky/utils/thread_utils.py +91 -0
- sky/utils/timeline.py +2 -1
- sky/utils/ux_utils.py +35 -1
- sky/utils/volume.py +88 -4
- sky/utils/yaml_utils.py +9 -0
- sky/volumes/client/sdk.py +48 -10
- sky/volumes/server/core.py +59 -22
- sky/volumes/server/server.py +46 -17
- sky/volumes/volume.py +54 -42
- sky/workspaces/core.py +57 -21
- sky/workspaces/server.py +13 -12
- sky_templates/README.md +3 -0
- sky_templates/__init__.py +3 -0
- sky_templates/ray/__init__.py +0 -0
- sky_templates/ray/start_cluster +183 -0
- sky_templates/ray/stop_cluster +75 -0
- {skypilot_nightly-1.0.0.dev20250905.dist-info → skypilot_nightly-1.0.0.dev20251203.dist-info}/METADATA +331 -65
- skypilot_nightly-1.0.0.dev20251203.dist-info/RECORD +611 -0
- skypilot_nightly-1.0.0.dev20251203.dist-info/top_level.txt +2 -0
- sky/client/cli/git.py +0 -549
- sky/dashboard/out/_next/static/chunks/1121-408ed10b2f9fce17.js +0 -1
- sky/dashboard/out/_next/static/chunks/1141-943efc7aff0f0c06.js +0 -1
- sky/dashboard/out/_next/static/chunks/1836-37fede578e2da5f8.js +0 -40
- sky/dashboard/out/_next/static/chunks/3015-86cabed5d4669ad0.js +0 -1
- sky/dashboard/out/_next/static/chunks/3294.c80326aec9bfed40.js +0 -6
- sky/dashboard/out/_next/static/chunks/3785.4872a2f3aa489880.js +0 -1
- sky/dashboard/out/_next/static/chunks/4045.b30465273dc5e468.js +0 -21
- sky/dashboard/out/_next/static/chunks/4676-9da7fdbde90b5549.js +0 -10
- sky/dashboard/out/_next/static/chunks/4725.10f7a9a5d3ea8208.js +0 -1
- sky/dashboard/out/_next/static/chunks/5339.3fda4a4010ff4e06.js +0 -51
- sky/dashboard/out/_next/static/chunks/6135-4b4d5e824b7f9d3c.js +0 -1
- sky/dashboard/out/_next/static/chunks/649.b9d7f7d10c1b8c53.js +0 -45
- sky/dashboard/out/_next/static/chunks/6856-dca7962af4814e1b.js +0 -1
- sky/dashboard/out/_next/static/chunks/6990-08b2a1cae076a943.js +0 -1
- sky/dashboard/out/_next/static/chunks/7325.b4bc99ce0892dcd5.js +0 -6
- sky/dashboard/out/_next/static/chunks/754-d0da8ab45f9509e9.js +0 -18
- sky/dashboard/out/_next/static/chunks/7669.1f5d9a402bf5cc42.js +0 -36
- sky/dashboard/out/_next/static/chunks/8969-0be3036bf86f8256.js +0 -1
- sky/dashboard/out/_next/static/chunks/9025.c12318fb6a1a9093.js +0 -6
- sky/dashboard/out/_next/static/chunks/9037-fa1737818d0a0969.js +0 -6
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-1cbba24bd1bd35f8.js +0 -16
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-0b4b35dc1dfe046c.js +0 -16
- sky/dashboard/out/_next/static/chunks/pages/clusters-469814d711d63b1b.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-dd64309c3fe67ed2.js +0 -11
- sky/dashboard/out/_next/static/chunks/pages/jobs/pools/[pool]-07349868f7905d37.js +0 -16
- sky/dashboard/out/_next/static/chunks/pages/jobs-1f70d9faa564804f.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/users-018bf31cda52e11b.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/volumes-739726d6b823f532.js +0 -1
- sky/dashboard/out/_next/static/chunks/webpack-4fe903277b57b523.js +0 -1
- sky/dashboard/out/_next/static/css/4614e06482d7309e.css +0 -3
- sky/dashboard/out/_next/static/mS-4qZPSkRuA1u-g2wQhg/_buildManifest.js +0 -1
- sky/templates/kubernetes-ssh-jump.yml.j2 +0 -94
- sky/utils/kubernetes/ssh_jump_lifecycle_manager.py +0 -191
- skypilot_nightly-1.0.0.dev20250905.dist-info/RECORD +0 -547
- skypilot_nightly-1.0.0.dev20250905.dist-info/top_level.txt +0 -1
- /sky/dashboard/out/_next/static/{mS-4qZPSkRuA1u-g2wQhg → 96_E2yl3QAiIJGOYCkSpB}/_ssgManifest.js +0 -0
- {skypilot_nightly-1.0.0.dev20250905.dist-info → skypilot_nightly-1.0.0.dev20251203.dist-info}/WHEEL +0 -0
- {skypilot_nightly-1.0.0.dev20250905.dist-info → skypilot_nightly-1.0.0.dev20251203.dist-info}/entry_points.txt +0 -0
- {skypilot_nightly-1.0.0.dev20250905.dist-info → skypilot_nightly-1.0.0.dev20251203.dist-info}/licenses/LICENSE +0 -0
sky/serve/server/impl.py
CHANGED
|
@@ -5,6 +5,7 @@ import shlex
|
|
|
5
5
|
import signal
|
|
6
6
|
import tempfile
|
|
7
7
|
import threading
|
|
8
|
+
import typing
|
|
8
9
|
from typing import Any, Dict, List, Optional, Set, Tuple, Union
|
|
9
10
|
import uuid
|
|
10
11
|
|
|
@@ -17,12 +18,15 @@ from sky import execution
|
|
|
17
18
|
from sky import sky_logging
|
|
18
19
|
from sky import skypilot_config
|
|
19
20
|
from sky import task as task_lib
|
|
21
|
+
from sky.adaptors import common as adaptors_common
|
|
20
22
|
from sky.backends import backend_utils
|
|
21
23
|
from sky.catalog import common as service_catalog_common
|
|
22
24
|
from sky.data import storage as storage_lib
|
|
23
25
|
from sky.serve import constants as serve_constants
|
|
26
|
+
from sky.serve import serve_rpc_utils
|
|
24
27
|
from sky.serve import serve_state
|
|
25
28
|
from sky.serve import serve_utils
|
|
29
|
+
from sky.server.requests import request_names
|
|
26
30
|
from sky.skylet import constants
|
|
27
31
|
from sky.skylet import job_lib
|
|
28
32
|
from sky.utils import admin_policy_utils
|
|
@@ -36,6 +40,11 @@ from sky.utils import subprocess_utils
|
|
|
36
40
|
from sky.utils import ux_utils
|
|
37
41
|
from sky.utils import yaml_utils
|
|
38
42
|
|
|
43
|
+
if typing.TYPE_CHECKING:
|
|
44
|
+
import grpc
|
|
45
|
+
else:
|
|
46
|
+
grpc = adaptors_common.LazyImport('grpc')
|
|
47
|
+
|
|
39
48
|
logger = sky_logging.init_logger(__name__)
|
|
40
49
|
|
|
41
50
|
|
|
@@ -78,24 +87,35 @@ def _get_service_record(
|
|
|
78
87
|
"""Get the service record."""
|
|
79
88
|
noun = 'pool' if pool else 'service'
|
|
80
89
|
|
|
81
|
-
|
|
82
|
-
|
|
83
|
-
returncode, serve_status_payload, stderr = backend.run_on_head(
|
|
84
|
-
handle,
|
|
85
|
-
code,
|
|
86
|
-
require_outputs=True,
|
|
87
|
-
stream_logs=False,
|
|
88
|
-
separate_stderr=True)
|
|
89
|
-
try:
|
|
90
|
-
subprocess_utils.handle_returncode(returncode,
|
|
91
|
-
code,
|
|
92
|
-
f'Failed to get {noun} status',
|
|
93
|
-
stderr,
|
|
94
|
-
stream_logs=True)
|
|
95
|
-
except exceptions.CommandError as e:
|
|
96
|
-
raise RuntimeError(e.error_msg) from e
|
|
90
|
+
assert isinstance(handle, backends.CloudVmRayResourceHandle)
|
|
91
|
+
use_legacy = not handle.is_grpc_enabled_with_flag
|
|
97
92
|
|
|
98
|
-
|
|
93
|
+
if not use_legacy:
|
|
94
|
+
try:
|
|
95
|
+
service_statuses = serve_rpc_utils.RpcRunner.get_service_status(
|
|
96
|
+
handle, [service_name], pool)
|
|
97
|
+
except exceptions.SkyletMethodNotImplementedError:
|
|
98
|
+
use_legacy = True
|
|
99
|
+
|
|
100
|
+
if use_legacy:
|
|
101
|
+
code = serve_utils.ServeCodeGen.get_service_status([service_name],
|
|
102
|
+
pool=pool)
|
|
103
|
+
returncode, serve_status_payload, stderr = backend.run_on_head(
|
|
104
|
+
handle,
|
|
105
|
+
code,
|
|
106
|
+
require_outputs=True,
|
|
107
|
+
stream_logs=False,
|
|
108
|
+
separate_stderr=True)
|
|
109
|
+
try:
|
|
110
|
+
subprocess_utils.handle_returncode(returncode,
|
|
111
|
+
code,
|
|
112
|
+
f'Failed to get {noun} status',
|
|
113
|
+
stderr,
|
|
114
|
+
stream_logs=True)
|
|
115
|
+
except exceptions.CommandError as e:
|
|
116
|
+
raise RuntimeError(e.error_msg) from e
|
|
117
|
+
|
|
118
|
+
service_statuses = serve_utils.load_service_status(serve_status_payload)
|
|
99
119
|
|
|
100
120
|
assert len(service_statuses) <= 1, service_statuses
|
|
101
121
|
if not service_statuses:
|
|
@@ -103,6 +123,18 @@ def _get_service_record(
|
|
|
103
123
|
return service_statuses[0]
|
|
104
124
|
|
|
105
125
|
|
|
126
|
+
def _maybe_display_run_warning(task: 'task_lib.Task') -> None:
|
|
127
|
+
# We do not block the user from creating a pool with a run section
|
|
128
|
+
# in order to enable using the same yaml for pool creation
|
|
129
|
+
# and job submission. But we want to make it clear that 'run' will not
|
|
130
|
+
# be respected here.
|
|
131
|
+
if task.run is not None:
|
|
132
|
+
logger.warning(
|
|
133
|
+
f'{colorama.Fore.YELLOW} Pool creation does not support the '
|
|
134
|
+
'`run` section. Creating the pool while ignoring the '
|
|
135
|
+
f'`run` section.{colorama.Style.RESET_ALL}')
|
|
136
|
+
|
|
137
|
+
|
|
106
138
|
def up(
|
|
107
139
|
task: 'task_lib.Task',
|
|
108
140
|
service_name: Optional[str] = None,
|
|
@@ -133,16 +165,15 @@ def up(
|
|
|
133
165
|
# Always apply the policy again here, even though it might have been applied
|
|
134
166
|
# in the CLI. This is to ensure that we apply the policy to the final DAG
|
|
135
167
|
# and get the mutated config.
|
|
136
|
-
dag, mutated_user_config = admin_policy_utils.apply(
|
|
168
|
+
dag, mutated_user_config = admin_policy_utils.apply(
|
|
169
|
+
dag, request_name=request_names.AdminPolicyRequestName.SERVE_UP)
|
|
137
170
|
dag.resolve_and_validate_volumes()
|
|
138
171
|
dag.pre_mount_volumes()
|
|
139
172
|
task = dag.tasks[0]
|
|
140
173
|
assert task.service is not None
|
|
141
174
|
if pool:
|
|
142
|
-
|
|
143
|
-
|
|
144
|
-
f'ignored for pool.{colorama.Style.RESET_ALL}')
|
|
145
|
-
# Use dummy run script for cluster pool.
|
|
175
|
+
_maybe_display_run_warning(task)
|
|
176
|
+
# Use dummy run script for pool.
|
|
146
177
|
task.run = serve_constants.POOL_DUMMY_RUN_COMMAND
|
|
147
178
|
|
|
148
179
|
with rich_utils.safe_status(
|
|
@@ -258,6 +289,8 @@ def up(
|
|
|
258
289
|
task=controller_task,
|
|
259
290
|
cluster_name=controller_name,
|
|
260
291
|
retry_until_up=True,
|
|
292
|
+
_request_name=request_names.AdminPolicyRequestName.
|
|
293
|
+
SERVE_LAUNCH_CONTROLLER,
|
|
261
294
|
_disable_controller_check=True,
|
|
262
295
|
)
|
|
263
296
|
else:
|
|
@@ -280,38 +313,51 @@ def up(
|
|
|
280
313
|
]
|
|
281
314
|
run_script = '\n'.join(env_cmds + [run_script])
|
|
282
315
|
# Dump script for high availability recovery.
|
|
283
|
-
|
|
284
|
-
serve_state.set_ha_recovery_script(service_name, run_script)
|
|
316
|
+
serve_state.set_ha_recovery_script(service_name, run_script)
|
|
285
317
|
backend.run_on_head(controller_handle, run_script)
|
|
286
318
|
|
|
287
319
|
style = colorama.Style
|
|
288
320
|
fore = colorama.Fore
|
|
289
321
|
|
|
290
322
|
assert controller_job_id is not None and controller_handle is not None
|
|
323
|
+
assert isinstance(controller_handle, backends.CloudVmRayResourceHandle)
|
|
324
|
+
backend = backend_utils.get_backend_from_handle(controller_handle)
|
|
325
|
+
assert isinstance(backend, backends.CloudVmRayBackend)
|
|
291
326
|
# TODO(tian): Cache endpoint locally to speedup. Endpoint won't
|
|
292
327
|
# change after the first time, so there is no consistency issue.
|
|
293
|
-
with rich_utils.safe_status(
|
|
294
|
-
ux_utils.spinner_message(
|
|
295
|
-
f'Waiting for the {noun} to register')):
|
|
296
|
-
# This function will check the controller job id in the database
|
|
297
|
-
# and return the endpoint if the job id matches. Otherwise it will
|
|
298
|
-
# return None.
|
|
299
|
-
code = serve_utils.ServeCodeGen.wait_service_registration(
|
|
300
|
-
service_name, controller_job_id, pool)
|
|
301
|
-
backend = backend_utils.get_backend_from_handle(controller_handle)
|
|
302
|
-
assert isinstance(backend, backends.CloudVmRayBackend)
|
|
303
|
-
assert isinstance(controller_handle,
|
|
304
|
-
backends.CloudVmRayResourceHandle)
|
|
305
|
-
returncode, lb_port_payload, _ = backend.run_on_head(
|
|
306
|
-
controller_handle,
|
|
307
|
-
code,
|
|
308
|
-
require_outputs=True,
|
|
309
|
-
stream_logs=False)
|
|
310
328
|
try:
|
|
311
|
-
|
|
312
|
-
|
|
313
|
-
|
|
314
|
-
|
|
329
|
+
with rich_utils.safe_status(
|
|
330
|
+
ux_utils.spinner_message(
|
|
331
|
+
f'Waiting for the {noun} to register')):
|
|
332
|
+
# This function will check the controller job id in the database
|
|
333
|
+
# and return the endpoint if the job id matches. Otherwise it
|
|
334
|
+
# will return None.
|
|
335
|
+
use_legacy = not controller_handle.is_grpc_enabled_with_flag
|
|
336
|
+
|
|
337
|
+
if controller_handle.is_grpc_enabled_with_flag:
|
|
338
|
+
try:
|
|
339
|
+
lb_port = serve_rpc_utils.RpcRunner.wait_service_registration( # pylint: disable=line-too-long
|
|
340
|
+
controller_handle, service_name, controller_job_id,
|
|
341
|
+
pool)
|
|
342
|
+
except exceptions.SkyletMethodNotImplementedError:
|
|
343
|
+
use_legacy = True
|
|
344
|
+
|
|
345
|
+
if use_legacy:
|
|
346
|
+
code = serve_utils.ServeCodeGen.wait_service_registration(
|
|
347
|
+
service_name, controller_job_id, pool)
|
|
348
|
+
returncode, lb_port_payload, _ = backend.run_on_head(
|
|
349
|
+
controller_handle,
|
|
350
|
+
code,
|
|
351
|
+
require_outputs=True,
|
|
352
|
+
stream_logs=False)
|
|
353
|
+
subprocess_utils.handle_returncode(
|
|
354
|
+
returncode, code,
|
|
355
|
+
f'Failed to wait for {noun} initialization',
|
|
356
|
+
lb_port_payload)
|
|
357
|
+
lb_port = serve_utils.load_service_initialization_result(
|
|
358
|
+
lb_port_payload)
|
|
359
|
+
except (exceptions.CommandError, grpc.FutureTimeoutError,
|
|
360
|
+
grpc.RpcError):
|
|
315
361
|
if serve_utils.is_consolidation_mode(pool):
|
|
316
362
|
with ux_utils.print_exception_no_traceback():
|
|
317
363
|
raise RuntimeError(
|
|
@@ -345,8 +391,6 @@ def up(
|
|
|
345
391
|
'Failed to spin up the service. Please '
|
|
346
392
|
'check the logs above for more details.') from None
|
|
347
393
|
else:
|
|
348
|
-
lb_port = serve_utils.load_service_initialization_result(
|
|
349
|
-
lb_port_payload)
|
|
350
394
|
if not serve_utils.is_consolidation_mode(pool) and not pool:
|
|
351
395
|
socket_endpoint = backend_utils.get_endpoints(
|
|
352
396
|
controller_handle.cluster_name,
|
|
@@ -381,6 +425,9 @@ def up(
|
|
|
381
425
|
f'\n{ux_utils.INDENT_LAST_SYMBOL}To terminate the pool:\t'
|
|
382
426
|
f'{ux_utils.BOLD}sky jobs pool down {service_name}'
|
|
383
427
|
f'{ux_utils.RESET_BOLD}'
|
|
428
|
+
f'\n{ux_utils.INDENT_SYMBOL}To update the number of workers:\t'
|
|
429
|
+
f'{ux_utils.BOLD}sky jobs pool apply --pool {service_name} '
|
|
430
|
+
f'--workers 5{ux_utils.RESET_BOLD}'
|
|
384
431
|
'\n\n' + ux_utils.finishing_message('Successfully created pool '
|
|
385
432
|
f'{service_name!r}.'))
|
|
386
433
|
else:
|
|
@@ -418,37 +465,15 @@ def up(
|
|
|
418
465
|
|
|
419
466
|
|
|
420
467
|
def update(
|
|
421
|
-
task: 'task_lib.Task',
|
|
468
|
+
task: Optional['task_lib.Task'],
|
|
422
469
|
service_name: str,
|
|
423
470
|
mode: serve_utils.UpdateMode = serve_utils.DEFAULT_UPDATE_MODE,
|
|
424
471
|
pool: bool = False,
|
|
472
|
+
workers: Optional[int] = None,
|
|
425
473
|
) -> None:
|
|
426
474
|
"""Updates an existing service or pool."""
|
|
427
475
|
noun = 'pool' if pool else 'service'
|
|
428
476
|
capnoun = noun.capitalize()
|
|
429
|
-
task.validate()
|
|
430
|
-
serve_utils.validate_service_task(task, pool=pool)
|
|
431
|
-
|
|
432
|
-
# Always apply the policy again here, even though it might have been applied
|
|
433
|
-
# in the CLI. This is to ensure that we apply the policy to the final DAG
|
|
434
|
-
# and get the mutated config.
|
|
435
|
-
# TODO(cblmemo,zhwu): If a user sets a new skypilot_config, the update
|
|
436
|
-
# will not apply the config.
|
|
437
|
-
dag, _ = admin_policy_utils.apply(task)
|
|
438
|
-
task = dag.tasks[0]
|
|
439
|
-
if pool:
|
|
440
|
-
if task.run is not None:
|
|
441
|
-
logger.warning(f'{colorama.Fore.YELLOW}The `run` section will be '
|
|
442
|
-
f'ignored for pool.{colorama.Style.RESET_ALL}')
|
|
443
|
-
# Use dummy run script for cluster pool.
|
|
444
|
-
task.run = serve_constants.POOL_DUMMY_RUN_COMMAND
|
|
445
|
-
|
|
446
|
-
assert task.service is not None
|
|
447
|
-
if not pool and task.service.tls_credential is not None:
|
|
448
|
-
logger.warning('Updating TLS keyfile and certfile is not supported. '
|
|
449
|
-
'Any updates to the keyfile and certfile will not take '
|
|
450
|
-
'effect. To update TLS keyfile and certfile, please '
|
|
451
|
-
'tear down the service and spin up a new one.')
|
|
452
477
|
|
|
453
478
|
controller_type = controller_utils.get_controller_for_pool(pool)
|
|
454
479
|
handle = backend_utils.is_controller_accessible(
|
|
@@ -462,6 +487,7 @@ def update(
|
|
|
462
487
|
f'use {ux_utils.BOLD}sky serve up{ux_utils.RESET_BOLD}',
|
|
463
488
|
)
|
|
464
489
|
|
|
490
|
+
assert isinstance(handle, backends.CloudVmRayResourceHandle)
|
|
465
491
|
backend = backend_utils.get_backend_from_handle(handle)
|
|
466
492
|
assert isinstance(backend, backends.CloudVmRayBackend)
|
|
467
493
|
|
|
@@ -474,6 +500,58 @@ def update(
|
|
|
474
500
|
f'To spin up a {noun}, use {ux_utils.BOLD}'
|
|
475
501
|
f'{cmd}{ux_utils.RESET_BOLD}')
|
|
476
502
|
|
|
503
|
+
# If task is None and workers is specified, load existing configuration
|
|
504
|
+
# and update replica count.
|
|
505
|
+
if task is None:
|
|
506
|
+
if workers is None:
|
|
507
|
+
with ux_utils.print_exception_no_traceback():
|
|
508
|
+
raise ValueError(
|
|
509
|
+
f'Cannot update {noun} without specifying '
|
|
510
|
+
f'task or workers. Please provide either a task '
|
|
511
|
+
f'or specify the number of workers.')
|
|
512
|
+
|
|
513
|
+
if not pool:
|
|
514
|
+
with ux_utils.print_exception_no_traceback():
|
|
515
|
+
raise ValueError(
|
|
516
|
+
'Non-pool service, trying to update replicas to '
|
|
517
|
+
f'{workers} is not supported. Ignoring the update.')
|
|
518
|
+
|
|
519
|
+
# Load the existing task configuration from the service's YAML file
|
|
520
|
+
yaml_content = service_record['yaml_content']
|
|
521
|
+
|
|
522
|
+
# Load the existing task configuration
|
|
523
|
+
task = task_lib.Task.from_yaml_str(yaml_content)
|
|
524
|
+
|
|
525
|
+
if task.service is None:
|
|
526
|
+
with ux_utils.print_exception_no_traceback():
|
|
527
|
+
raise RuntimeError('No service configuration found in '
|
|
528
|
+
f'existing {noun} {service_name!r}')
|
|
529
|
+
task.set_service(task.service.copy(min_replicas=workers))
|
|
530
|
+
|
|
531
|
+
task.validate()
|
|
532
|
+
serve_utils.validate_service_task(task, pool=pool)
|
|
533
|
+
|
|
534
|
+
# Now apply the policy and handle task-specific logic
|
|
535
|
+
# Always apply the policy again here, even though it might have been applied
|
|
536
|
+
# in the CLI. This is to ensure that we apply the policy to the final DAG
|
|
537
|
+
# and get the mutated config.
|
|
538
|
+
# TODO(cblmemo,zhwu): If a user sets a new skypilot_config, the update
|
|
539
|
+
# will not apply the config.
|
|
540
|
+
dag, _ = admin_policy_utils.apply(
|
|
541
|
+
task, request_name=request_names.AdminPolicyRequestName.SERVE_UPDATE)
|
|
542
|
+
task = dag.tasks[0]
|
|
543
|
+
if pool:
|
|
544
|
+
_maybe_display_run_warning(task)
|
|
545
|
+
# Use dummy run script for pool.
|
|
546
|
+
task.run = serve_constants.POOL_DUMMY_RUN_COMMAND
|
|
547
|
+
|
|
548
|
+
assert task.service is not None
|
|
549
|
+
if not pool and task.service.tls_credential is not None:
|
|
550
|
+
logger.warning('Updating TLS keyfile and certfile is not supported. '
|
|
551
|
+
'Any updates to the keyfile and certfile will not take '
|
|
552
|
+
'effect. To update TLS keyfile and certfile, please '
|
|
553
|
+
'tear down the service and spin up a new one.')
|
|
554
|
+
|
|
477
555
|
prompt = None
|
|
478
556
|
if (service_record['status'] == serve_state.ServiceStatus.CONTROLLER_FAILED
|
|
479
557
|
):
|
|
@@ -504,29 +582,39 @@ def update(
|
|
|
504
582
|
controller_utils.maybe_translate_local_file_mounts_and_sync_up(
|
|
505
583
|
task, task_type='serve')
|
|
506
584
|
|
|
507
|
-
|
|
508
|
-
returncode, version_string_payload, stderr = backend.run_on_head(
|
|
509
|
-
handle,
|
|
510
|
-
code,
|
|
511
|
-
require_outputs=True,
|
|
512
|
-
stream_logs=False,
|
|
513
|
-
separate_stderr=True)
|
|
514
|
-
try:
|
|
515
|
-
subprocess_utils.handle_returncode(returncode,
|
|
516
|
-
code,
|
|
517
|
-
'Failed to add version',
|
|
518
|
-
stderr,
|
|
519
|
-
stream_logs=True)
|
|
520
|
-
except exceptions.CommandError as e:
|
|
521
|
-
raise RuntimeError(e.error_msg) from e
|
|
585
|
+
use_legacy = not handle.is_grpc_enabled_with_flag
|
|
522
586
|
|
|
523
|
-
|
|
524
|
-
|
|
525
|
-
|
|
526
|
-
|
|
527
|
-
|
|
528
|
-
|
|
529
|
-
|
|
587
|
+
if not use_legacy:
|
|
588
|
+
try:
|
|
589
|
+
current_version = serve_rpc_utils.RpcRunner.add_version(
|
|
590
|
+
handle, service_name)
|
|
591
|
+
except exceptions.SkyletMethodNotImplementedError:
|
|
592
|
+
use_legacy = True
|
|
593
|
+
|
|
594
|
+
if use_legacy:
|
|
595
|
+
code = serve_utils.ServeCodeGen.add_version(service_name)
|
|
596
|
+
returncode, version_string_payload, stderr = backend.run_on_head(
|
|
597
|
+
handle,
|
|
598
|
+
code,
|
|
599
|
+
require_outputs=True,
|
|
600
|
+
stream_logs=False,
|
|
601
|
+
separate_stderr=True)
|
|
602
|
+
try:
|
|
603
|
+
subprocess_utils.handle_returncode(returncode,
|
|
604
|
+
code,
|
|
605
|
+
'Failed to add version',
|
|
606
|
+
stderr,
|
|
607
|
+
stream_logs=True)
|
|
608
|
+
except exceptions.CommandError as e:
|
|
609
|
+
raise RuntimeError(e.error_msg) from e
|
|
610
|
+
|
|
611
|
+
version_string = serve_utils.load_version_string(version_string_payload)
|
|
612
|
+
try:
|
|
613
|
+
current_version = int(version_string)
|
|
614
|
+
except ValueError as e:
|
|
615
|
+
with ux_utils.print_exception_no_traceback():
|
|
616
|
+
raise ValueError(f'Failed to parse version: {version_string}; '
|
|
617
|
+
f'Returncode: {returncode}') from e
|
|
530
618
|
|
|
531
619
|
with tempfile.NamedTemporaryFile(
|
|
532
620
|
prefix=f'{service_name}-v{current_version}',
|
|
@@ -541,23 +629,33 @@ def update(
|
|
|
541
629
|
{remote_task_yaml_path: service_file.name},
|
|
542
630
|
storage_mounts=None)
|
|
543
631
|
|
|
544
|
-
|
|
545
|
-
|
|
546
|
-
|
|
547
|
-
|
|
548
|
-
|
|
549
|
-
|
|
550
|
-
|
|
551
|
-
|
|
552
|
-
|
|
553
|
-
|
|
554
|
-
|
|
555
|
-
|
|
556
|
-
|
|
557
|
-
|
|
558
|
-
|
|
559
|
-
|
|
560
|
-
|
|
632
|
+
use_legacy = not handle.is_grpc_enabled_with_flag
|
|
633
|
+
|
|
634
|
+
if not use_legacy:
|
|
635
|
+
try:
|
|
636
|
+
serve_rpc_utils.RpcRunner.update_service(
|
|
637
|
+
handle, service_name, current_version, mode, pool)
|
|
638
|
+
except exceptions.SkyletMethodNotImplementedError:
|
|
639
|
+
use_legacy = True
|
|
640
|
+
|
|
641
|
+
if use_legacy:
|
|
642
|
+
code = serve_utils.ServeCodeGen.update_service(service_name,
|
|
643
|
+
current_version,
|
|
644
|
+
mode=mode.value,
|
|
645
|
+
pool=pool)
|
|
646
|
+
returncode, _, stderr = backend.run_on_head(handle,
|
|
647
|
+
code,
|
|
648
|
+
require_outputs=True,
|
|
649
|
+
stream_logs=False,
|
|
650
|
+
separate_stderr=True)
|
|
651
|
+
try:
|
|
652
|
+
subprocess_utils.handle_returncode(returncode,
|
|
653
|
+
code,
|
|
654
|
+
f'Failed to update {noun}s',
|
|
655
|
+
stderr,
|
|
656
|
+
stream_logs=True)
|
|
657
|
+
except exceptions.CommandError as e:
|
|
658
|
+
raise RuntimeError(e.error_msg) from e
|
|
561
659
|
|
|
562
660
|
cmd = 'sky jobs pool status' if pool else 'sky serve status'
|
|
563
661
|
logger.info(
|
|
@@ -566,14 +664,25 @@ def update(
|
|
|
566
664
|
f'Please use {ux_utils.BOLD}{cmd} {service_name} '
|
|
567
665
|
f'{ux_utils.RESET_BOLD}to check the latest status.')
|
|
568
666
|
|
|
667
|
+
if pool:
|
|
668
|
+
logs_cmd = f'`sky jobs pool logs {service_name} <worker_id>`'
|
|
669
|
+
unit_noun = 'Workers'
|
|
670
|
+
|
|
671
|
+
else:
|
|
672
|
+
logs_cmd = f'`sky serve logs {service_name} <replica_id>`'
|
|
673
|
+
unit_noun = 'Replicas'
|
|
569
674
|
logger.info(
|
|
570
675
|
ux_utils.finishing_message(
|
|
571
676
|
f'Successfully updated {noun} {service_name!r} '
|
|
572
|
-
f'to version {current_version}.'
|
|
677
|
+
f'to version {current_version}.',
|
|
678
|
+
follow_up_message=
|
|
679
|
+
f'\n{unit_noun} are updating, use {ux_utils.BOLD}{logs_cmd}'
|
|
680
|
+
f'{ux_utils.RESET_BOLD} to check their status.'))
|
|
573
681
|
|
|
574
682
|
|
|
575
683
|
def apply(
|
|
576
684
|
task: 'task_lib.Task',
|
|
685
|
+
workers: Optional[int],
|
|
577
686
|
service_name: str,
|
|
578
687
|
mode: serve_utils.UpdateMode = serve_utils.DEFAULT_UPDATE_MODE,
|
|
579
688
|
pool: bool = False,
|
|
@@ -589,7 +698,7 @@ def apply(
|
|
|
589
698
|
service_record = _get_service_record(service_name, pool, handle,
|
|
590
699
|
backend)
|
|
591
700
|
if service_record is not None:
|
|
592
|
-
return update(task, service_name, mode, pool)
|
|
701
|
+
return update(task, service_name, mode, pool, workers)
|
|
593
702
|
except exceptions.ClusterNotUpError:
|
|
594
703
|
pass
|
|
595
704
|
up(task, service_name, pool)
|
|
@@ -620,29 +729,44 @@ def down(
|
|
|
620
729
|
raise ValueError(f'Can only specify one of {noun}_names or all. '
|
|
621
730
|
f'Provided {argument_str!r}.')
|
|
622
731
|
|
|
623
|
-
backend = backend_utils.get_backend_from_handle(handle)
|
|
624
|
-
assert isinstance(backend, backends.CloudVmRayBackend)
|
|
625
732
|
service_names = None if all else service_names
|
|
626
|
-
code = serve_utils.ServeCodeGen.terminate_services(service_names, purge,
|
|
627
|
-
pool)
|
|
628
733
|
|
|
629
734
|
try:
|
|
630
|
-
|
|
631
|
-
|
|
632
|
-
|
|
633
|
-
|
|
735
|
+
assert isinstance(handle, backends.CloudVmRayResourceHandle)
|
|
736
|
+
use_legacy = not handle.is_grpc_enabled_with_flag
|
|
737
|
+
|
|
738
|
+
if not use_legacy:
|
|
739
|
+
try:
|
|
740
|
+
stdout = serve_rpc_utils.RpcRunner.terminate_services(
|
|
741
|
+
handle, service_names, purge, pool)
|
|
742
|
+
except exceptions.SkyletMethodNotImplementedError:
|
|
743
|
+
use_legacy = True
|
|
744
|
+
|
|
745
|
+
if use_legacy:
|
|
746
|
+
backend = backend_utils.get_backend_from_handle(handle)
|
|
747
|
+
assert isinstance(backend, backends.CloudVmRayBackend)
|
|
748
|
+
code = serve_utils.ServeCodeGen.terminate_services(
|
|
749
|
+
service_names, purge, pool)
|
|
750
|
+
|
|
751
|
+
returncode, stdout, _ = backend.run_on_head(handle,
|
|
752
|
+
code,
|
|
753
|
+
require_outputs=True,
|
|
754
|
+
stream_logs=False)
|
|
755
|
+
|
|
756
|
+
subprocess_utils.handle_returncode(returncode, code,
|
|
757
|
+
f'Failed to terminate {noun}',
|
|
758
|
+
stdout)
|
|
634
759
|
except exceptions.FetchClusterInfoError as e:
|
|
635
760
|
raise RuntimeError(
|
|
636
761
|
'Failed to fetch controller IP. Please refresh controller status '
|
|
637
|
-
f'by `sky status -r {controller_type.value.cluster_name}` '
|
|
638
|
-
'
|
|
639
|
-
|
|
640
|
-
try:
|
|
641
|
-
subprocess_utils.handle_returncode(returncode, code,
|
|
642
|
-
f'Failed to terminate {noun}',
|
|
643
|
-
stdout)
|
|
762
|
+
f'by `sky status -r {controller_type.value.cluster_name}` and try '
|
|
763
|
+
'again.') from e
|
|
644
764
|
except exceptions.CommandError as e:
|
|
645
765
|
raise RuntimeError(e.error_msg) from e
|
|
766
|
+
except grpc.RpcError as e:
|
|
767
|
+
raise RuntimeError(f'{e.details()} ({e.code()})') from e
|
|
768
|
+
except grpc.FutureTimeoutError as e:
|
|
769
|
+
raise RuntimeError('gRPC timed out') from e
|
|
646
770
|
|
|
647
771
|
logger.info(stdout)
|
|
648
772
|
|
|
@@ -670,27 +794,40 @@ def status(
|
|
|
670
794
|
stopped_message=controller_type.value.default_hint_if_non_existent.
|
|
671
795
|
replace('service', noun))
|
|
672
796
|
|
|
673
|
-
|
|
674
|
-
|
|
797
|
+
assert isinstance(handle, backends.CloudVmRayResourceHandle)
|
|
798
|
+
use_legacy = not handle.is_grpc_enabled_with_flag
|
|
675
799
|
|
|
676
|
-
|
|
677
|
-
|
|
678
|
-
|
|
679
|
-
|
|
680
|
-
|
|
681
|
-
|
|
682
|
-
|
|
800
|
+
if not use_legacy:
|
|
801
|
+
try:
|
|
802
|
+
service_records = serve_rpc_utils.RpcRunner.get_service_status(
|
|
803
|
+
handle, service_names, pool)
|
|
804
|
+
except exceptions.SkyletMethodNotImplementedError:
|
|
805
|
+
use_legacy = True
|
|
806
|
+
|
|
807
|
+
if use_legacy:
|
|
808
|
+
backend = backend_utils.get_backend_from_handle(handle)
|
|
809
|
+
assert isinstance(backend, backends.CloudVmRayBackend)
|
|
810
|
+
|
|
811
|
+
code = serve_utils.ServeCodeGen.get_service_status(service_names,
|
|
812
|
+
pool=pool)
|
|
813
|
+
returncode, serve_status_payload, stderr = backend.run_on_head(
|
|
814
|
+
handle,
|
|
815
|
+
code,
|
|
816
|
+
require_outputs=True,
|
|
817
|
+
stream_logs=False,
|
|
818
|
+
separate_stderr=True)
|
|
683
819
|
|
|
684
|
-
|
|
685
|
-
|
|
686
|
-
|
|
687
|
-
|
|
688
|
-
|
|
689
|
-
|
|
690
|
-
|
|
691
|
-
|
|
820
|
+
try:
|
|
821
|
+
subprocess_utils.handle_returncode(returncode,
|
|
822
|
+
code,
|
|
823
|
+
f'Failed to fetch {noun}s',
|
|
824
|
+
stderr,
|
|
825
|
+
stream_logs=True)
|
|
826
|
+
except exceptions.CommandError as e:
|
|
827
|
+
raise RuntimeError(e.error_msg) from e
|
|
828
|
+
|
|
829
|
+
service_records = serve_utils.load_service_status(serve_status_payload)
|
|
692
830
|
|
|
693
|
-
service_records = serve_utils.load_service_status(serve_status_payload)
|
|
694
831
|
# Get the endpoint for each service
|
|
695
832
|
for service_record in service_records:
|
|
696
833
|
service_record['endpoint'] = None
|
|
@@ -793,25 +930,37 @@ def _get_all_replica_targets(
|
|
|
793
930
|
handle: backends.CloudVmRayResourceHandle,
|
|
794
931
|
pool: bool) -> Set[serve_utils.ServiceComponentTarget]:
|
|
795
932
|
"""Helper function to get targets for all live replicas."""
|
|
796
|
-
|
|
797
|
-
|
|
798
|
-
returncode, serve_status_payload, stderr = backend.run_on_head(
|
|
799
|
-
handle,
|
|
800
|
-
code,
|
|
801
|
-
require_outputs=True,
|
|
802
|
-
stream_logs=False,
|
|
803
|
-
separate_stderr=True)
|
|
933
|
+
assert isinstance(handle, backends.CloudVmRayResourceHandle)
|
|
934
|
+
use_legacy = not handle.is_grpc_enabled_with_flag
|
|
804
935
|
|
|
805
|
-
|
|
806
|
-
|
|
807
|
-
|
|
808
|
-
|
|
809
|
-
|
|
810
|
-
|
|
811
|
-
|
|
812
|
-
|
|
936
|
+
if not use_legacy:
|
|
937
|
+
try:
|
|
938
|
+
service_records = serve_rpc_utils.RpcRunner.get_service_status(
|
|
939
|
+
handle, [service_name], pool)
|
|
940
|
+
except exceptions.SkyletMethodNotImplementedError:
|
|
941
|
+
use_legacy = True
|
|
942
|
+
|
|
943
|
+
if use_legacy:
|
|
944
|
+
code = serve_utils.ServeCodeGen.get_service_status([service_name],
|
|
945
|
+
pool=pool)
|
|
946
|
+
returncode, serve_status_payload, stderr = backend.run_on_head(
|
|
947
|
+
handle,
|
|
948
|
+
code,
|
|
949
|
+
require_outputs=True,
|
|
950
|
+
stream_logs=False,
|
|
951
|
+
separate_stderr=True)
|
|
952
|
+
|
|
953
|
+
try:
|
|
954
|
+
subprocess_utils.handle_returncode(returncode,
|
|
955
|
+
code,
|
|
956
|
+
'Failed to fetch services',
|
|
957
|
+
stderr,
|
|
958
|
+
stream_logs=True)
|
|
959
|
+
except exceptions.CommandError as e:
|
|
960
|
+
raise RuntimeError(e.error_msg) from e
|
|
961
|
+
|
|
962
|
+
service_records = serve_utils.load_service_status(serve_status_payload)
|
|
813
963
|
|
|
814
|
-
service_records = serve_utils.load_service_status(serve_status_payload)
|
|
815
964
|
if not service_records:
|
|
816
965
|
raise ValueError(f'Service {service_name!r} not found.')
|
|
817
966
|
assert len(service_records) == 1
|