skypilot-nightly 1.0.0.dev20250905__py3-none-any.whl → 1.0.0.dev20251210__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sky/__init__.py +12 -2
- sky/adaptors/aws.py +27 -22
- sky/adaptors/common.py +25 -2
- sky/adaptors/coreweave.py +278 -0
- sky/adaptors/do.py +8 -2
- sky/adaptors/gcp.py +11 -0
- sky/adaptors/ibm.py +5 -2
- sky/adaptors/kubernetes.py +64 -0
- sky/adaptors/nebius.py +3 -1
- sky/adaptors/primeintellect.py +1 -0
- sky/adaptors/seeweb.py +183 -0
- sky/adaptors/shadeform.py +89 -0
- sky/adaptors/slurm.py +478 -0
- sky/admin_policy.py +20 -0
- sky/authentication.py +157 -263
- sky/backends/__init__.py +3 -2
- sky/backends/backend.py +11 -3
- sky/backends/backend_utils.py +630 -185
- sky/backends/cloud_vm_ray_backend.py +1111 -928
- sky/backends/local_docker_backend.py +9 -5
- sky/backends/task_codegen.py +971 -0
- sky/backends/wheel_utils.py +18 -0
- sky/catalog/__init__.py +8 -3
- sky/catalog/aws_catalog.py +4 -0
- sky/catalog/common.py +19 -1
- sky/catalog/data_fetchers/fetch_aws.py +102 -80
- sky/catalog/data_fetchers/fetch_gcp.py +30 -3
- sky/catalog/data_fetchers/fetch_nebius.py +9 -6
- sky/catalog/data_fetchers/fetch_runpod.py +698 -0
- sky/catalog/data_fetchers/fetch_seeweb.py +329 -0
- sky/catalog/data_fetchers/fetch_shadeform.py +142 -0
- sky/catalog/kubernetes_catalog.py +36 -32
- sky/catalog/primeintellect_catalog.py +95 -0
- sky/catalog/runpod_catalog.py +5 -1
- sky/catalog/seeweb_catalog.py +184 -0
- sky/catalog/shadeform_catalog.py +165 -0
- sky/catalog/slurm_catalog.py +243 -0
- sky/check.py +87 -46
- sky/client/cli/command.py +1004 -434
- sky/client/cli/flags.py +4 -2
- sky/{volumes/utils.py → client/cli/table_utils.py} +111 -13
- sky/client/cli/utils.py +79 -0
- sky/client/common.py +12 -2
- sky/client/sdk.py +188 -65
- sky/client/sdk_async.py +34 -33
- sky/cloud_stores.py +82 -3
- sky/clouds/__init__.py +8 -0
- sky/clouds/aws.py +337 -129
- sky/clouds/azure.py +24 -18
- sky/clouds/cloud.py +47 -13
- sky/clouds/cudo.py +16 -13
- sky/clouds/do.py +9 -7
- sky/clouds/fluidstack.py +12 -5
- sky/clouds/gcp.py +14 -7
- sky/clouds/hyperbolic.py +12 -5
- sky/clouds/ibm.py +12 -5
- sky/clouds/kubernetes.py +80 -45
- sky/clouds/lambda_cloud.py +12 -5
- sky/clouds/nebius.py +23 -9
- sky/clouds/oci.py +19 -12
- sky/clouds/paperspace.py +4 -1
- sky/clouds/primeintellect.py +317 -0
- sky/clouds/runpod.py +85 -24
- sky/clouds/scp.py +12 -8
- sky/clouds/seeweb.py +477 -0
- sky/clouds/shadeform.py +400 -0
- sky/clouds/slurm.py +578 -0
- sky/clouds/ssh.py +6 -3
- sky/clouds/utils/scp_utils.py +61 -50
- sky/clouds/vast.py +43 -27
- sky/clouds/vsphere.py +14 -16
- sky/core.py +296 -195
- sky/dashboard/out/404.html +1 -1
- sky/dashboard/out/_next/static/KYAhEFa3FTfq4JyKVgo-s/_buildManifest.js +1 -0
- sky/dashboard/out/_next/static/chunks/1141-9c810f01ff4f398a.js +11 -0
- sky/dashboard/out/_next/static/chunks/1871-7e202677c42f43fe.js +6 -0
- sky/dashboard/out/_next/static/chunks/2260-7703229c33c5ebd5.js +1 -0
- sky/dashboard/out/_next/static/chunks/2369.fc20f0c2c8ed9fe7.js +15 -0
- sky/dashboard/out/_next/static/chunks/2755.edd818326d489a1d.js +26 -0
- sky/dashboard/out/_next/static/chunks/3294.ddda8c6c6f9f24dc.js +1 -0
- sky/dashboard/out/_next/static/chunks/3785.7e245f318f9d1121.js +1 -0
- sky/dashboard/out/_next/static/chunks/{6601-06114c982db410b6.js → 3800-b589397dc09c5b4e.js} +1 -1
- sky/dashboard/out/_next/static/chunks/3850-fd5696f3bbbaddae.js +1 -0
- sky/dashboard/out/_next/static/chunks/4725.172ede95d1b21022.js +1 -0
- sky/dashboard/out/_next/static/chunks/4937.a2baa2df5572a276.js +15 -0
- sky/dashboard/out/_next/static/chunks/6212-7bd06f60ba693125.js +13 -0
- sky/dashboard/out/_next/static/chunks/6856-da20c5fd999f319c.js +1 -0
- sky/dashboard/out/_next/static/chunks/6990-09cbf02d3cd518c3.js +1 -0
- sky/dashboard/out/_next/static/chunks/7359-c8d04e06886000b3.js +30 -0
- sky/dashboard/out/_next/static/chunks/7615-019513abc55b3b47.js +1 -0
- sky/dashboard/out/_next/static/chunks/8640.5b9475a2d18c5416.js +16 -0
- sky/dashboard/out/_next/static/chunks/8969-452f9d5cbdd2dc73.js +1 -0
- sky/dashboard/out/_next/static/chunks/9025.fa408f3242e9028d.js +6 -0
- sky/dashboard/out/_next/static/chunks/9353-8369df1cf105221c.js +1 -0
- sky/dashboard/out/_next/static/chunks/9360.a536cf6b1fa42355.js +31 -0
- sky/dashboard/out/_next/static/chunks/9847.3aaca6bb33455140.js +30 -0
- sky/dashboard/out/_next/static/chunks/pages/_app-68b647e26f9d2793.js +34 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-33f525539665fdfd.js +16 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-a7565f586ef86467.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters-9e5d47818b9bdadd.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/{config-dfb9bf07b13045f4.js → config-718cdc365de82689.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/infra/{[context]-6563820e094f68ca.js → [context]-12c559ec4d81fdbd.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/{infra-aabba60d57826e0f.js → infra-d187cd0413d72475.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-895847b6cf200b04.js +16 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs/pools/[pool]-8d0f4655400b4eb9.js +21 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs-e5a98f17f8513a96.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/plugins/[...slug]-4f46050ca065d8f8.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/users-2f7646eb77785a2c.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/volumes-ef19d49c6d0e8500.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/workspaces/{[name]-af76bb06dbb3954f.js → [name]-96e0f298308da7e2.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/{workspaces-7598c33a746cdc91.js → workspaces-cb4da3abe08ebf19.js} +1 -1
- sky/dashboard/out/_next/static/chunks/webpack-fba3de387ff6bb08.js +1 -0
- sky/dashboard/out/_next/static/css/c5a4cfd2600fc715.css +3 -0
- sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
- sky/dashboard/out/clusters/[cluster].html +1 -1
- sky/dashboard/out/clusters.html +1 -1
- sky/dashboard/out/config.html +1 -1
- sky/dashboard/out/index.html +1 -1
- sky/dashboard/out/infra/[context].html +1 -1
- sky/dashboard/out/infra.html +1 -1
- sky/dashboard/out/jobs/[job].html +1 -1
- sky/dashboard/out/jobs/pools/[pool].html +1 -1
- sky/dashboard/out/jobs.html +1 -1
- sky/dashboard/out/plugins/[...slug].html +1 -0
- sky/dashboard/out/users.html +1 -1
- sky/dashboard/out/volumes.html +1 -1
- sky/dashboard/out/workspace/new.html +1 -1
- sky/dashboard/out/workspaces/[name].html +1 -1
- sky/dashboard/out/workspaces.html +1 -1
- sky/data/data_utils.py +92 -1
- sky/data/mounting_utils.py +177 -30
- sky/data/storage.py +200 -19
- sky/data/storage_utils.py +10 -45
- sky/exceptions.py +18 -7
- sky/execution.py +74 -31
- sky/global_user_state.py +605 -191
- sky/jobs/__init__.py +2 -0
- sky/jobs/client/sdk.py +101 -4
- sky/jobs/client/sdk_async.py +31 -5
- sky/jobs/constants.py +15 -8
- sky/jobs/controller.py +726 -284
- sky/jobs/file_content_utils.py +128 -0
- sky/jobs/log_gc.py +193 -0
- sky/jobs/recovery_strategy.py +250 -100
- sky/jobs/scheduler.py +271 -173
- sky/jobs/server/core.py +367 -114
- sky/jobs/server/server.py +81 -35
- sky/jobs/server/utils.py +89 -35
- sky/jobs/state.py +1498 -620
- sky/jobs/utils.py +771 -306
- sky/logs/agent.py +40 -5
- sky/logs/aws.py +9 -19
- sky/metrics/utils.py +282 -39
- sky/models.py +2 -0
- sky/optimizer.py +7 -6
- sky/provision/__init__.py +38 -1
- sky/provision/aws/config.py +34 -13
- sky/provision/aws/instance.py +5 -2
- sky/provision/azure/instance.py +5 -3
- sky/provision/common.py +22 -0
- sky/provision/cudo/instance.py +4 -3
- sky/provision/do/instance.py +4 -3
- sky/provision/docker_utils.py +112 -28
- sky/provision/fluidstack/instance.py +6 -5
- sky/provision/gcp/config.py +6 -1
- sky/provision/gcp/instance.py +4 -2
- sky/provision/hyperbolic/instance.py +4 -2
- sky/provision/instance_setup.py +66 -20
- sky/provision/kubernetes/__init__.py +2 -0
- sky/provision/kubernetes/config.py +7 -44
- sky/provision/kubernetes/constants.py +0 -1
- sky/provision/kubernetes/instance.py +609 -213
- sky/provision/kubernetes/manifests/fusermount-server-daemonset.yaml +1 -2
- sky/provision/kubernetes/network.py +12 -8
- sky/provision/kubernetes/network_utils.py +8 -25
- sky/provision/kubernetes/utils.py +422 -422
- sky/provision/kubernetes/volume.py +150 -18
- sky/provision/lambda_cloud/instance.py +16 -13
- sky/provision/nebius/instance.py +6 -2
- sky/provision/nebius/utils.py +103 -86
- sky/provision/oci/instance.py +4 -2
- sky/provision/paperspace/instance.py +4 -3
- sky/provision/primeintellect/__init__.py +10 -0
- sky/provision/primeintellect/config.py +11 -0
- sky/provision/primeintellect/instance.py +454 -0
- sky/provision/primeintellect/utils.py +398 -0
- sky/provision/provisioner.py +45 -15
- sky/provision/runpod/__init__.py +2 -0
- sky/provision/runpod/instance.py +4 -3
- sky/provision/runpod/volume.py +69 -13
- sky/provision/scp/instance.py +307 -130
- sky/provision/seeweb/__init__.py +11 -0
- sky/provision/seeweb/config.py +13 -0
- sky/provision/seeweb/instance.py +812 -0
- sky/provision/shadeform/__init__.py +11 -0
- sky/provision/shadeform/config.py +12 -0
- sky/provision/shadeform/instance.py +351 -0
- sky/provision/shadeform/shadeform_utils.py +83 -0
- sky/provision/slurm/__init__.py +12 -0
- sky/provision/slurm/config.py +13 -0
- sky/provision/slurm/instance.py +572 -0
- sky/provision/slurm/utils.py +583 -0
- sky/provision/vast/instance.py +9 -4
- sky/provision/vast/utils.py +10 -6
- sky/provision/volume.py +164 -0
- sky/provision/vsphere/common/ssl_helper.py +1 -1
- sky/provision/vsphere/common/vapiconnect.py +2 -1
- sky/provision/vsphere/common/vim_utils.py +3 -2
- sky/provision/vsphere/instance.py +8 -6
- sky/provision/vsphere/vsphere_utils.py +8 -1
- sky/resources.py +11 -3
- sky/schemas/api/responses.py +107 -6
- sky/schemas/db/global_user_state/008_skylet_ssh_tunnel_metadata.py +34 -0
- sky/schemas/db/global_user_state/009_last_activity_and_launched_at.py +89 -0
- sky/schemas/db/global_user_state/010_save_ssh_key.py +66 -0
- sky/schemas/db/global_user_state/011_is_ephemeral.py +34 -0
- sky/schemas/db/kv_cache/001_initial_schema.py +29 -0
- sky/schemas/db/serve_state/002_yaml_content.py +34 -0
- sky/schemas/db/skypilot_config/001_initial_schema.py +30 -0
- sky/schemas/db/spot_jobs/002_cluster_pool.py +3 -3
- sky/schemas/db/spot_jobs/004_job_file_contents.py +42 -0
- sky/schemas/db/spot_jobs/005_logs_gc.py +38 -0
- sky/schemas/db/spot_jobs/006_controller_pid_started_at.py +34 -0
- sky/schemas/db/spot_jobs/007_config_file_content.py +34 -0
- sky/schemas/generated/jobsv1_pb2.py +86 -0
- sky/schemas/generated/jobsv1_pb2.pyi +254 -0
- sky/schemas/generated/jobsv1_pb2_grpc.py +542 -0
- sky/schemas/generated/managed_jobsv1_pb2.py +76 -0
- sky/schemas/generated/managed_jobsv1_pb2.pyi +278 -0
- sky/schemas/generated/managed_jobsv1_pb2_grpc.py +278 -0
- sky/schemas/generated/servev1_pb2.py +58 -0
- sky/schemas/generated/servev1_pb2.pyi +115 -0
- sky/schemas/generated/servev1_pb2_grpc.py +322 -0
- sky/serve/autoscalers.py +2 -0
- sky/serve/client/impl.py +55 -21
- sky/serve/constants.py +4 -3
- sky/serve/controller.py +17 -11
- sky/serve/load_balancing_policies.py +1 -1
- sky/serve/replica_managers.py +219 -142
- sky/serve/serve_rpc_utils.py +179 -0
- sky/serve/serve_state.py +63 -54
- sky/serve/serve_utils.py +145 -109
- sky/serve/server/core.py +46 -25
- sky/serve/server/impl.py +311 -162
- sky/serve/server/server.py +21 -19
- sky/serve/service.py +84 -68
- sky/serve/service_spec.py +45 -7
- sky/server/auth/loopback.py +38 -0
- sky/server/auth/oauth2_proxy.py +12 -7
- sky/server/common.py +47 -24
- sky/server/config.py +62 -28
- sky/server/constants.py +9 -1
- sky/server/daemons.py +109 -38
- sky/server/metrics.py +76 -96
- sky/server/middleware_utils.py +166 -0
- sky/server/plugins.py +222 -0
- sky/server/requests/executor.py +384 -145
- sky/server/requests/payloads.py +83 -19
- sky/server/requests/preconditions.py +15 -13
- sky/server/requests/request_names.py +123 -0
- sky/server/requests/requests.py +511 -157
- sky/server/requests/serializers/decoders.py +48 -17
- sky/server/requests/serializers/encoders.py +102 -20
- sky/server/requests/serializers/return_value_serializers.py +60 -0
- sky/server/requests/threads.py +117 -0
- sky/server/rest.py +116 -24
- sky/server/server.py +497 -179
- sky/server/server_utils.py +30 -0
- sky/server/stream_utils.py +219 -45
- sky/server/uvicorn.py +30 -19
- sky/setup_files/MANIFEST.in +6 -1
- sky/setup_files/alembic.ini +8 -0
- sky/setup_files/dependencies.py +64 -19
- sky/setup_files/setup.py +44 -44
- sky/sky_logging.py +13 -5
- sky/skylet/attempt_skylet.py +116 -24
- sky/skylet/configs.py +3 -1
- sky/skylet/constants.py +139 -29
- sky/skylet/events.py +74 -14
- sky/skylet/executor/__init__.py +1 -0
- sky/skylet/executor/slurm.py +189 -0
- sky/skylet/job_lib.py +143 -105
- sky/skylet/log_lib.py +252 -8
- sky/skylet/log_lib.pyi +47 -7
- sky/skylet/providers/ibm/node_provider.py +12 -8
- sky/skylet/providers/ibm/vpc_provider.py +13 -12
- sky/skylet/runtime_utils.py +21 -0
- sky/skylet/services.py +524 -0
- sky/skylet/skylet.py +27 -2
- sky/skylet/subprocess_daemon.py +104 -28
- sky/skypilot_config.py +99 -79
- sky/ssh_node_pools/constants.py +12 -0
- sky/ssh_node_pools/core.py +40 -3
- sky/ssh_node_pools/deploy/__init__.py +4 -0
- sky/ssh_node_pools/deploy/deploy.py +952 -0
- sky/ssh_node_pools/deploy/tunnel_utils.py +199 -0
- sky/ssh_node_pools/deploy/utils.py +173 -0
- sky/ssh_node_pools/server.py +20 -21
- sky/{utils/kubernetes/ssh_utils.py → ssh_node_pools/utils.py} +9 -6
- sky/task.py +221 -104
- sky/templates/aws-ray.yml.j2 +1 -0
- sky/templates/azure-ray.yml.j2 +1 -0
- sky/templates/cudo-ray.yml.j2 +1 -0
- sky/templates/do-ray.yml.j2 +1 -0
- sky/templates/fluidstack-ray.yml.j2 +1 -0
- sky/templates/gcp-ray.yml.j2 +1 -0
- sky/templates/hyperbolic-ray.yml.j2 +1 -0
- sky/templates/ibm-ray.yml.j2 +2 -1
- sky/templates/jobs-controller.yaml.j2 +3 -0
- sky/templates/kubernetes-ray.yml.j2 +204 -55
- sky/templates/lambda-ray.yml.j2 +1 -0
- sky/templates/nebius-ray.yml.j2 +3 -0
- sky/templates/oci-ray.yml.j2 +1 -0
- sky/templates/paperspace-ray.yml.j2 +1 -0
- sky/templates/primeintellect-ray.yml.j2 +72 -0
- sky/templates/runpod-ray.yml.j2 +1 -0
- sky/templates/scp-ray.yml.j2 +1 -0
- sky/templates/seeweb-ray.yml.j2 +171 -0
- sky/templates/shadeform-ray.yml.j2 +73 -0
- sky/templates/slurm-ray.yml.j2 +85 -0
- sky/templates/vast-ray.yml.j2 +2 -0
- sky/templates/vsphere-ray.yml.j2 +1 -0
- sky/templates/websocket_proxy.py +188 -43
- sky/usage/usage_lib.py +16 -4
- sky/users/model.conf +1 -1
- sky/users/permission.py +84 -44
- sky/users/rbac.py +31 -3
- sky/utils/accelerator_registry.py +6 -3
- sky/utils/admin_policy_utils.py +18 -5
- sky/utils/annotations.py +128 -6
- sky/utils/asyncio_utils.py +78 -0
- sky/utils/atomic.py +1 -1
- sky/utils/auth_utils.py +153 -0
- sky/utils/cli_utils/status_utils.py +12 -7
- sky/utils/cluster_utils.py +28 -6
- sky/utils/command_runner.py +283 -30
- sky/utils/command_runner.pyi +63 -7
- sky/utils/common.py +3 -1
- sky/utils/common_utils.py +55 -7
- sky/utils/config_utils.py +1 -14
- sky/utils/context.py +127 -40
- sky/utils/context_utils.py +73 -18
- sky/utils/controller_utils.py +229 -70
- sky/utils/db/db_utils.py +95 -18
- sky/utils/db/kv_cache.py +149 -0
- sky/utils/db/migration_utils.py +24 -7
- sky/utils/env_options.py +4 -0
- sky/utils/git.py +559 -1
- sky/utils/kubernetes/create_cluster.sh +15 -30
- sky/utils/kubernetes/delete_cluster.sh +10 -7
- sky/utils/kubernetes/generate_kind_config.py +6 -66
- sky/utils/kubernetes/gpu_labeler.py +13 -3
- sky/utils/kubernetes/k8s_gpu_labeler_job.yaml +2 -1
- sky/utils/kubernetes/k8s_gpu_labeler_setup.yaml +16 -16
- sky/utils/kubernetes/kubernetes_deploy_utils.py +187 -260
- sky/utils/kubernetes/rsync_helper.sh +11 -3
- sky/utils/kubernetes/ssh-tunnel.sh +7 -376
- sky/utils/kubernetes_enums.py +7 -15
- sky/utils/lock_events.py +4 -4
- sky/utils/locks.py +128 -31
- sky/utils/log_utils.py +0 -319
- sky/utils/resource_checker.py +13 -10
- sky/utils/resources_utils.py +53 -29
- sky/utils/rich_utils.py +8 -4
- sky/utils/schemas.py +138 -52
- sky/utils/subprocess_utils.py +17 -4
- sky/utils/thread_utils.py +91 -0
- sky/utils/timeline.py +2 -1
- sky/utils/ux_utils.py +35 -1
- sky/utils/volume.py +88 -4
- sky/utils/yaml_utils.py +9 -0
- sky/volumes/client/sdk.py +48 -10
- sky/volumes/server/core.py +59 -22
- sky/volumes/server/server.py +46 -17
- sky/volumes/volume.py +54 -42
- sky/workspaces/core.py +57 -21
- sky/workspaces/server.py +13 -12
- sky_templates/README.md +3 -0
- sky_templates/__init__.py +3 -0
- sky_templates/ray/__init__.py +0 -0
- sky_templates/ray/start_cluster +183 -0
- sky_templates/ray/stop_cluster +75 -0
- {skypilot_nightly-1.0.0.dev20250905.dist-info → skypilot_nightly-1.0.0.dev20251210.dist-info}/METADATA +343 -65
- skypilot_nightly-1.0.0.dev20251210.dist-info/RECORD +629 -0
- skypilot_nightly-1.0.0.dev20251210.dist-info/top_level.txt +2 -0
- sky/client/cli/git.py +0 -549
- sky/dashboard/out/_next/static/chunks/1121-408ed10b2f9fce17.js +0 -1
- sky/dashboard/out/_next/static/chunks/1141-943efc7aff0f0c06.js +0 -1
- sky/dashboard/out/_next/static/chunks/1836-37fede578e2da5f8.js +0 -40
- sky/dashboard/out/_next/static/chunks/3015-86cabed5d4669ad0.js +0 -1
- sky/dashboard/out/_next/static/chunks/3294.c80326aec9bfed40.js +0 -6
- sky/dashboard/out/_next/static/chunks/3785.4872a2f3aa489880.js +0 -1
- sky/dashboard/out/_next/static/chunks/3850-ff4a9a69d978632b.js +0 -1
- sky/dashboard/out/_next/static/chunks/4045.b30465273dc5e468.js +0 -21
- sky/dashboard/out/_next/static/chunks/4676-9da7fdbde90b5549.js +0 -10
- sky/dashboard/out/_next/static/chunks/4725.10f7a9a5d3ea8208.js +0 -1
- sky/dashboard/out/_next/static/chunks/5339.3fda4a4010ff4e06.js +0 -51
- sky/dashboard/out/_next/static/chunks/6135-4b4d5e824b7f9d3c.js +0 -1
- sky/dashboard/out/_next/static/chunks/649.b9d7f7d10c1b8c53.js +0 -45
- sky/dashboard/out/_next/static/chunks/6856-dca7962af4814e1b.js +0 -1
- sky/dashboard/out/_next/static/chunks/6990-08b2a1cae076a943.js +0 -1
- sky/dashboard/out/_next/static/chunks/7325.b4bc99ce0892dcd5.js +0 -6
- sky/dashboard/out/_next/static/chunks/754-d0da8ab45f9509e9.js +0 -18
- sky/dashboard/out/_next/static/chunks/7669.1f5d9a402bf5cc42.js +0 -36
- sky/dashboard/out/_next/static/chunks/8969-0be3036bf86f8256.js +0 -1
- sky/dashboard/out/_next/static/chunks/9025.c12318fb6a1a9093.js +0 -6
- sky/dashboard/out/_next/static/chunks/9037-fa1737818d0a0969.js +0 -6
- sky/dashboard/out/_next/static/chunks/pages/_app-ce361c6959bc2001.js +0 -34
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-1cbba24bd1bd35f8.js +0 -16
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-0b4b35dc1dfe046c.js +0 -16
- sky/dashboard/out/_next/static/chunks/pages/clusters-469814d711d63b1b.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-dd64309c3fe67ed2.js +0 -11
- sky/dashboard/out/_next/static/chunks/pages/jobs/pools/[pool]-07349868f7905d37.js +0 -16
- sky/dashboard/out/_next/static/chunks/pages/jobs-1f70d9faa564804f.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/users-018bf31cda52e11b.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/volumes-739726d6b823f532.js +0 -1
- sky/dashboard/out/_next/static/chunks/webpack-4fe903277b57b523.js +0 -1
- sky/dashboard/out/_next/static/css/4614e06482d7309e.css +0 -3
- sky/dashboard/out/_next/static/mS-4qZPSkRuA1u-g2wQhg/_buildManifest.js +0 -1
- sky/templates/kubernetes-ssh-jump.yml.j2 +0 -94
- sky/utils/kubernetes/cleanup-tunnel.sh +0 -62
- sky/utils/kubernetes/deploy_remote_cluster.py +0 -1299
- sky/utils/kubernetes/ssh_jump_lifecycle_manager.py +0 -191
- skypilot_nightly-1.0.0.dev20250905.dist-info/RECORD +0 -547
- skypilot_nightly-1.0.0.dev20250905.dist-info/top_level.txt +0 -1
- /sky/dashboard/out/_next/static/{mS-4qZPSkRuA1u-g2wQhg → KYAhEFa3FTfq4JyKVgo-s}/_ssgManifest.js +0 -0
- {skypilot_nightly-1.0.0.dev20250905.dist-info → skypilot_nightly-1.0.0.dev20251210.dist-info}/WHEEL +0 -0
- {skypilot_nightly-1.0.0.dev20250905.dist-info → skypilot_nightly-1.0.0.dev20251210.dist-info}/entry_points.txt +0 -0
- {skypilot_nightly-1.0.0.dev20250905.dist-info → skypilot_nightly-1.0.0.dev20251210.dist-info}/licenses/LICENSE +0 -0
sky/serve/serve_utils.py
CHANGED
@@ -10,12 +10,11 @@ import pickle
 import re
 import shlex
 import shutil
-import threading
 import time
 import traceback
 import typing
-from typing import (Any, Callable, DefaultDict, Deque, Dict,
-… (continuation line truncated in this view)
+from typing import (Any, Callable, DefaultDict, Deque, Dict, Iterator, List,
+                    Optional, TextIO, Type, Union)
 import uuid

 import colorama

@@ -158,50 +157,6 @@ _SIGNAL_TO_ERROR = {
     UserSignal.TERMINATE: exceptions.ServeUserTerminatedError,
 }

-# pylint: disable=invalid-name
-KeyType = TypeVar('KeyType')
-ValueType = TypeVar('ValueType')
-
-
-# Google style guide: Do not rely on the atomicity of built-in types.
-# Our launch and down process pool will be used by multiple threads,
-# therefore we need to use a thread-safe dict.
-# see https://google.github.io/styleguide/pyguide.html#218-threading
-class ThreadSafeDict(Generic[KeyType, ValueType]):
-    """A thread-safe dict."""
-
-    def __init__(self, *args: Any, **kwargs: Any) -> None:
-        self._dict: Dict[KeyType, ValueType] = dict(*args, **kwargs)
-        self._lock = threading.Lock()
-
-    def __getitem__(self, key: KeyType) -> ValueType:
-        with self._lock:
-            return self._dict.__getitem__(key)
-
-    def __setitem__(self, key: KeyType, value: ValueType) -> None:
-        with self._lock:
-            return self._dict.__setitem__(key, value)
-
-    def __delitem__(self, key: KeyType) -> None:
-        with self._lock:
-            return self._dict.__delitem__(key)
-
-    def __len__(self) -> int:
-        with self._lock:
-            return self._dict.__len__()
-
-    def __contains__(self, key: KeyType) -> bool:
-        with self._lock:
-            return self._dict.__contains__(key)
-
-    def items(self):
-        with self._lock:
-            return self._dict.items()
-
-    def values(self):
-        with self._lock:
-            return self._dict.values()
-

 class RequestsAggregator:
     """Base class for request aggregator."""
@@ -262,26 +217,24 @@ def _validate_consolidation_mode_config(current_is_consolidation_mode: bool,
     controller = controller_utils.get_controller_for_pool(pool).value
     if current_is_consolidation_mode:
         controller_cn = controller.cluster_name
-        if global_user_state.…
-        … (truncated in this view)
-            f'{colorama.Style.RESET_ALL}')
+        if global_user_state.cluster_with_name_exists(controller_cn):
+            logger.warning(
+                f'{colorama.Fore.RED}Consolidation mode for '
+                f'{controller.controller_type} is enabled, but the controller '
+                f'cluster {controller_cn} is still running. Please terminate '
+                'the controller cluster first.'
+                f'{colorama.Style.RESET_ALL}')
     else:
         noun = 'pool' if pool else 'service'
         all_services = [
             svc for svc in serve_state.get_services() if svc['pool'] == pool
         ]
         if all_services:
-            … (truncated in this view)
-            f'terminate those {noun}s first.{colorama.Style.RESET_ALL}')
+            logger.warning(
+                f'{colorama.Fore.RED}Consolidation mode for '
+                f'{controller.controller_type} is disabled, but there are '
+                f'still {len(all_services)} {noun}s running. Please terminate '
+                f'those {noun}s first.{colorama.Style.RESET_ALL}')


 @annotations.lru_cache(scope='request', maxsize=1)

@@ -291,6 +244,10 @@ def is_consolidation_mode(pool: bool = False) -> bool:
     consolidation_mode = skypilot_config.get_nested(
         (controller.controller_type, 'controller', 'consolidation_mode'),
         default_value=False)
+    if os.environ.get(skylet_constants.OVERRIDE_CONSOLIDATION_MODE) is not None:
+        # if we are in the job controller, we must always be in consolidation
+        # mode.
+        return True
     # We should only do this check on API server, as the controller will not
     # have related config and will always seemingly disabled for consolidation
     # mode. Check #6611 for more details.

@@ -397,12 +354,28 @@ def validate_service_task(task: 'sky.Task', pool: bool) -> None:
         if task.service.dynamic_ondemand_fallback else 'spot')
     for resource in list(task.resources):
         if resource.job_recovery is not None:
-            sys_name = 'SkyServe' if not pool else '…
+            sys_name = 'SkyServe' if not pool else 'Pool'
             with ux_utils.print_exception_no_traceback():
                 raise ValueError(f'job_recovery is disabled for {sys_name}. '
                                  f'{sys_name} will replenish preempted spot '
                                  f'with {policy_description} instances.')

+    if pool:
+        accelerators = set()
+        for resource in task.resources:
+            if resource.accelerators is not None:
+                if isinstance(resource.accelerators, str):
+                    accelerators.add(resource.accelerators)
+                elif isinstance(resource.accelerators, dict):
+                    accelerators.update(resource.accelerators.keys())
+                elif isinstance(resource.accelerators, list):
+                    accelerators.update(resource.accelerators)
+        if len(accelerators) > 1:
+            with ux_utils.print_exception_no_traceback():
+                raise ValueError('Heterogeneous clusters are not supported for '
+                                 'pools please specify one accelerator '
+                                 'for all workers.')
+
     # Try to create a spot placer from the task yaml. Check if the task yaml
     # is valid for spot placer.
     spot_placer.SpotPlacer.from_task(task.service, task)
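The pool validation added in the hunk above normalizes the three shapes an `accelerators` spec can take on a `Resources` object (a plain string, a count dict, or a list) into a set of accelerator type names, then rejects pools that mix types. A minimal standalone sketch of that normalization, with hypothetical example inputs:

import typing
from typing import Dict, List, Optional, Set, Union

AccSpec = Union[str, Dict[str, int], List[str]]

def collect_accelerator_types(spec: Optional[AccSpec]) -> Set[str]:
    """Normalize an accelerator spec into a set of accelerator type names."""
    types: Set[str] = set()
    if spec is None:
        return types
    if isinstance(spec, str):
        types.add(spec)            # e.g. 'A100'
    elif isinstance(spec, dict):
        types.update(spec.keys())  # e.g. {'A100': 8} -> {'A100'}
    elif isinstance(spec, list):
        types.update(spec)         # e.g. ['A100', 'H100']
    return types

# A pool is accepted only when all resources resolve to at most one type:
assert collect_accelerator_types({'A100': 8}) == {'A100'}
assert len(collect_accelerator_types(['A100', 'H100'])) > 1  # would be rejected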
@@ -447,7 +420,7 @@ def validate_service_task(task: 'sky.Task', pool: bool) -> None:
     if (task.service.ports is not None or
             requested_resources.ports is not None):
         with ux_utils.print_exception_no_traceback():
-            raise ValueError('Cannot specify ports in a…
+            raise ValueError('Cannot specify ports in a pool.')


 def generate_service_name(pool: bool = False):

@@ -675,6 +648,18 @@ def terminate_replica(service_name: str, replica_id: int, purge: bool) -> str:
     return message


+def get_yaml_content(service_name: str, version: int) -> str:
+    yaml_content = serve_state.get_yaml_content(service_name, version)
+    if yaml_content is not None:
+        return yaml_content
+    # Backward compatibility for old service records that
+    # does not dump the yaml content to version database.
+    # TODO(tian): Remove this after 2 minor releases, i.e. 0.13.0.
+    latest_yaml_path = generate_task_yaml_file_name(service_name, version)
+    with open(latest_yaml_path, 'r', encoding='utf-8') as f:
+        return f.read()
+
+
 def _get_service_status(
     service_name: str,
     pool: bool,

@@ -697,21 +682,30 @@ def _get_service_status(

     record['pool_yaml'] = ''
     if record['pool']:
-        … (truncated in this view)
-        original_config['pool'] = svc  # Add pool to root config
+        version = record['version']
+        try:
+            yaml_content = get_yaml_content(service_name, version)
+            raw_yaml_config = yaml_utils.read_yaml_str(yaml_content)
+        except Exception as e:  # pylint: disable=broad-except
+            # If this is a consolidation mode running without an PVC, the file
+            # might lost after an API server update (restart). In such case, we
+            # don't want it to crash the command. Fall back to an empty string.
+            logger.error(f'Failed to read YAML for service {service_name} '
+                         f'with version {version}: {e}')
+            record['pool_yaml'] = ''
         else:
-            original_config = …
-            … (truncated in this view)
+            original_config = raw_yaml_config.get('_user_specified_yaml')
+            if original_config is None:
+                # Fall back to old display format.
+                original_config = raw_yaml_config
+                original_config.pop('run', None)
+                svc: Dict[str, Any] = original_config.pop('service')
+                if svc is not None:
+                    svc.pop('pool', None)  # Remove pool from service config
+                    original_config['pool'] = svc  # Add pool to root config
+            else:
+                original_config = yaml_utils.safe_load(original_config)
+            record['pool_yaml'] = yaml_utils.dump_yaml_str(original_config)

     record['target_num_replicas'] = 0
     try:

@@ -740,8 +734,8 @@ def _get_service_status(
     return record


-def …
-        pool: bool) -> str:
+def get_service_status_pickled(service_names: Optional[List[str]],
+                               pool: bool) -> List[Dict[str, str]]:
     service_statuses: List[Dict[str, str]] = []
     if service_names is None:
         # Get all service names

@@ -754,14 +748,34 @@ def get_service_status_encoded(service_names: Optional[List[str]],
             k: base64.b64encode(pickle.dumps(v)).decode('utf-8')
             for k, v in service_status.items()
         })
-    … (truncated in this view)
+    return sorted(service_statuses, key=lambda x: x['name'])
+
+
+# TODO (kyuds): remove when serve codegen is removed
+def get_service_status_encoded(service_names: Optional[List[str]],
+                               pool: bool) -> str:
     # We have to use payload_type here to avoid the issue of
     # message_utils.decode_payload() not being able to correctly decode the
     # message with <sky-payload> tags.
+    service_statuses = get_service_status_pickled(service_names, pool)
     return message_utils.encode_payload(service_statuses,
                                         payload_type='service_status')


+def unpickle_service_status(
+        payload: List[Dict[str, str]]) -> List[Dict[str, Any]]:
+    service_statuses: List[Dict[str, Any]] = []
+    for service_status in payload:
+        if not isinstance(service_status, dict):
+            raise ValueError(f'Invalid service status: {service_status}')
+        service_statuses.append({
+            k: pickle.loads(base64.b64decode(v))
+            for k, v in service_status.items()
+        })
+    return service_statuses
+
+
+# TODO (kyuds): remove when serve codegen is removed
 def load_service_status(payload: str) -> List[Dict[str, Any]]:
     try:
         service_statuses_encoded = message_utils.decode_payload(

@@ -773,22 +787,16 @@ def load_service_status(payload: str) -> List[Dict[str, Any]]:
         service_statuses_encoded = message_utils.decode_payload(payload)
     else:
         raise
-    … (truncated in this view)
-    for service_status in service_statuses_encoded:
-        if not isinstance(service_status, dict):
-            raise ValueError(f'Invalid service status: {service_status}')
-        service_statuses.append({
-            k: pickle.loads(base64.b64decode(v))
-            for k, v in service_status.items()
-        })
-    return service_statuses
+    return unpickle_service_status(service_statuses_encoded)


+# TODO (kyuds): remove when serve codegen is removed
 def add_version_encoded(service_name: str) -> str:
     new_version = serve_state.add_version(service_name)
     return message_utils.encode_payload(new_version)


+# TODO (kyuds): remove when serve codegen is removed
 def load_version_string(payload: str) -> str:
     return message_utils.decode_payload(payload)
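The refactor above splits the raw pickle/base64 encoding (`get_service_status_pickled` / `unpickle_service_status`) out of the codegen-only `<sky-payload>` wrapping, so the new RPC path can reuse the encoding without the payload tags. For context, a self-contained sketch of the round-trip the pair implements, using only the stdlib calls visible in the diff:

import base64
import pickle
from typing import Any, Dict, List

def encode_statuses(statuses: List[Dict[str, Any]]) -> List[Dict[str, str]]:
    # Each value is pickled, then base64-encoded so it survives text transport.
    return [{k: base64.b64encode(pickle.dumps(v)).decode('utf-8')
             for k, v in status.items()} for status in statuses]

def decode_statuses(payload: List[Dict[str, str]]) -> List[Dict[str, Any]]:
    # Inverse of encode_statuses: base64-decode, then unpickle each value.
    return [{k: pickle.loads(base64.b64decode(v))
             for k, v in status.items()} for status in payload]

original = [{'name': 'svc-a', 'status': 'READY'}]
assert decode_statuses(encode_statuses(original)) == original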
@@ -821,7 +829,7 @@ def get_next_cluster_name(service_name: str, job_id: int) -> Optional[str]:
         logger.error(f'Service {service_name!r} does not exist.')
         return None
     if not service_status['pool']:
-        logger.error(f'Service {service_name!r} is not a…
+        logger.error(f'Service {service_name!r} is not a pool.')
         return None
     with filelock.FileLock(get_service_filelock_path(service_name)):
         logger.debug(f'Get next cluster name for pool {service_name!r}')

@@ -877,8 +885,8 @@ def _terminate_failed_services(
     # replicas, so we don't need to try again here.
     for replica_info in serve_state.get_replica_infos(service_name):
         # TODO(tian): Refresh latest status of the cluster.
-        if global_user_state.…
-                replica_info.cluster_name)
+        if global_user_state.cluster_with_name_exists(
+                replica_info.cluster_name):
             remaining_replica_clusters.append(f'{replica_info.cluster_name!r}')
         serve_state.remove_replica(service_name, replica_info.replica_id)

@@ -994,6 +1002,8 @@ def wait_service_registration(service_name: str, job_id: int,
     Returns:
         Encoded load balancer port assigned to the service.
     """
+    # TODO (kyuds): when codegen is fully deprecated, return the lb port
+    # as an int directly instead of encoding it.
     start_time = time.time()
     setup_completed = False
     noun = 'pool' if pool else 'service'

@@ -1105,17 +1115,17 @@ def get_latest_version_with_min_replicas(
     return active_versions[-1] if active_versions else None


-def _process_line(line: str,
-… (truncated in this view)
+def _process_line(
+        line: str,
+        cluster_name: str,
+        stop_on_eof: bool = False,
+        streamed_provision_log_paths: Optional[set] = None) -> Iterator[str]:
     # The line might be directing users to view logs, like
     # `✓ Cluster launched: new-http. View logs at: *.log`
     # We should tail the detailed logs for user.
     def cluster_is_up() -> bool:
-        … (truncated in this view)
-            return False
-        return cluster_record['status'] == status_lib.ClusterStatus.UP
+        status = global_user_state.get_status_from_cluster_name(cluster_name)
+        return status == status_lib.ClusterStatus.UP

     provision_api_log_prompt = re.match(_SKYPILOT_PROVISION_API_LOG_PATTERN,
                                         line)

@@ -1124,6 +1134,20 @@ def _process_line(line: str,
     log_prompt = re.match(_SKYPILOT_LOG_PATTERN, line)

     def _stream_provision_path(p: pathlib.Path) -> Iterator[str]:
+        # Check if this provision log has already been streamed to avoid
+        # duplicate expansion. When a Kubernetes cluster needs to pull a Docker
+        # image, rich spinner updates can produce hundreds of lines matching
+        # _SKYPILOT_PROVISION_LOG_CMD_PATTERN (e.g., "Launching (1 pod(s)
+        # pending due to Pulling)... View logs: sky logs --provision ...").
+        # Without this check, the same provision log would be expanded hundreds
+        # of times, creating huge log files (30M+) and making users think the
+        # system is stuck in an infinite loop.
+        if streamed_provision_log_paths is not None:
+            resolved_path = str(p.resolve())
+            if resolved_path in streamed_provision_log_paths:
+                return
+            streamed_provision_log_paths.add(resolved_path)
+
         try:
             with open(p, 'r', newline='', encoding='utf-8') as f:
                 # Exit if >10s without new content to avoid hanging when INIT

@@ -1195,9 +1219,14 @@ def _follow_logs_with_provision_expanding(
     Yields:
         Log lines, including expanded content from referenced provision logs.
     """
+    streamed_provision_log_paths: set = set()

     def process_line(line: str) -> Iterator[str]:
-        yield from _process_line(…
+        yield from _process_line(
+            line,
+            cluster_name,
+            stop_on_eof=stop_on_eof,
+            streamed_provision_log_paths=streamed_provision_log_paths)

     return log_utils.follow_logs(file,
                                  should_stop=should_stop,

@@ -1223,11 +1252,14 @@ def _capped_follow_logs_with_provision_expanding(
         Log lines, including expanded content from referenced provision logs.
     """
     all_lines: Deque[str] = collections.deque(maxlen=line_cap)
+    streamed_provision_log_paths: set = set()

     for line in log_list:
-        for processed in _process_line(…
-        … (truncated in this view)
+        for processed in _process_line(
+                line=line,
+                cluster_name=cluster_name,
+                stop_on_eof=False,
+                streamed_provision_log_paths=streamed_provision_log_paths):
             all_lines.append(processed)

     yield from all_lines

@@ -1308,10 +1340,6 @@ def stream_replica_logs(service_name: str, replica_id: int, follow: bool,
         print(line, end='', flush=True)
         return ''

-    # For pools, we don't stream the job logs as the run section is ignored.
-    if pool:
-        return ''
-
     backend = backends.CloudVmRayBackend()
     handle = global_user_state.get_handle_from_cluster_name(
         replica_cluster_name)

@@ -1519,8 +1547,15 @@ def _format_replica_table(replica_records: List[Dict[str, Any]], show_all: bool,
             'handle']
         if replica_handle is not None:
             infra = replica_handle.launched_resources.infra.formatted_str()
-            … (truncated in this view)
+            simplified = not show_all
+            resources_str_simple, resources_str_full = (
+                resources_utils.get_readable_resources_repr(
+                    replica_handle, simplified_only=simplified))
+            if simplified:
+                resources_str = resources_str_simple
+            else:
+                assert resources_str_full is not None
+                resources_str = resources_str_full

         replica_values = [
             service_name,

@@ -1541,6 +1576,7 @@ def _format_replica_table(replica_records: List[Dict[str, Any]], show_all: bool,


 # =========================== CodeGen for Sky Serve ===========================
+# TODO (kyuds): deprecate and remove serve codegen entirely.


 # TODO(tian): Use REST API instead of SSH in the future. This codegen pattern
sky/serve/server/core.py
CHANGED
@@ -5,7 +5,9 @@ from typing import Any, Dict, List, Optional, Tuple, Union
 from sky import backends
 from sky import exceptions
 from sky import sky_logging
+from sky.adaptors import common as adaptors_common
 from sky.backends import backend_utils
+from sky.serve import serve_rpc_utils
 from sky.serve import serve_utils
 from sky.serve.server import impl
 from sky.usage import usage_lib

@@ -13,7 +15,11 @@ from sky.utils import controller_utils
 from sky.utils import subprocess_utils

 if typing.TYPE_CHECKING:
+    import grpc
+
     import sky
+else:
+    grpc = adaptors_common.LazyImport('grpc')

 logger = sky_logging.init_logger(__name__)
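The `typing.TYPE_CHECKING` / `LazyImport` split above lets type checkers see the real `grpc` module while deferring the actual import at runtime until first attribute access, so importing this module stays cheap. A minimal sketch of the idiom; this `LazyImport` is a simplified stand-in for illustration, not the `sky.adaptors.common` implementation:

import importlib
import typing

class LazyImport:
    """Defers importing a module until an attribute is first accessed."""

    def __init__(self, module_name: str):
        self._module_name = module_name
        self._module = None

    def __getattr__(self, name: str):
        if self._module is None:
            self._module = importlib.import_module(self._module_name)
        return getattr(self._module, name)

if typing.TYPE_CHECKING:
    import grpc  # Real import, seen only by type checkers.
else:
    grpc = LazyImport('grpc')  # At runtime, imported on first use.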
@@ -40,20 +46,23 @@ def up(


 @usage_lib.entrypoint
-def update(…
-… (truncated in this view)
+def update(task: Optional['sky.Task'],
+           service_name: str,
+           mode: serve_utils.UpdateMode = serve_utils.DEFAULT_UPDATE_MODE,
+           workers: Optional[int] = None) -> None:
     """Updates an existing service.

     Please refer to the sky.cli.serve_update for the document.

     Args:
-        task: sky.Task to update
+        task: sky.Task to update, or None if updating
+            the number of workers/replicas.
         service_name: Name of the service.
         mode: Update mode.
+        workers: Number of workers/replicas to set for the service when
+            task is None.
     """
-    return impl.update(task, service_name, mode, pool=False)
+    return impl.update(task, service_name, mode, pool=False, workers=workers)


 @usage_lib.entrypoint

@@ -105,25 +114,37 @@ def terminate_replica(service_name: str, replica_id: int, purge: bool) -> None:
         'Please spin up a service first.',
     )

-    … (truncated in this view)
+    assert isinstance(handle, backends.CloudVmRayResourceHandle)
+    use_legacy = not handle.is_grpc_enabled_with_flag
+
+    if not use_legacy:
+        try:
+            stdout = serve_rpc_utils.RpcRunner.terminate_replica(
+                handle, service_name, replica_id, purge)
+        except exceptions.SkyletMethodNotImplementedError:
+            use_legacy = True
+
+    if use_legacy:
+        backend = backend_utils.get_backend_from_handle(handle)
+        assert isinstance(backend, backends.CloudVmRayBackend)
+
+        code = serve_utils.ServeCodeGen.terminate_replica(
+            service_name, replica_id, purge)
+        returncode, stdout, stderr = backend.run_on_head(handle,
+                                                         code,
+                                                         require_outputs=True,
+                                                         stream_logs=False,
+                                                         separate_stderr=True)
+
+        try:
+            subprocess_utils.handle_returncode(
+                returncode,
+                code,
+                'Failed to terminate the replica',
+                stderr,
+                stream_logs=True)
+        except exceptions.CommandError as e:
+            raise RuntimeError(e.error_msg) from e

     sky_logging.print(stdout)
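The new `terminate_replica` body shows the migration pattern used throughout this release: attempt the skylet gRPC path when the cluster handle advertises support, and fall back to the legacy SSH codegen path when the remote skylet predates the RPC. A condensed, self-contained sketch of that control flow; the dispatcher and its callbacks are hypothetical stand-ins for the diff's inline logic:

from typing import Any, Callable

class SkyletMethodNotImplementedError(Exception):
    """Raised when the remote skylet does not implement the requested RPC."""

def terminate_with_fallback(handle: Any,
                            rpc_call: Callable[[Any], str],
                            legacy_call: Callable[[Any], str]) -> str:
    try:
        # Prefer gRPC when the handle says the remote skylet supports it.
        if getattr(handle, 'is_grpc_enabled_with_flag', False):
            return rpc_call(handle)
    except SkyletMethodNotImplementedError:
        pass  # Remote skylet is older than the client: fall back below.
    # Legacy path: generate Python code and run it on the head node via SSH.
    return legacy_call(handle)

Because the fallback triggers only on `SkyletMethodNotImplementedError`, genuine RPC failures still surface to the caller instead of being silently retried over SSH.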