skypilot-nightly 1.0.0.dev20250905__py3-none-any.whl → 1.0.0.dev20251203__py3-none-any.whl
This diff shows the changes between two publicly available versions of a package that have been released to one of the supported registries. It is provided for informational purposes only and reflects the package contents exactly as they appear in their respective public registries.
- sky/__init__.py +10 -2
- sky/adaptors/aws.py +81 -16
- sky/adaptors/common.py +25 -2
- sky/adaptors/coreweave.py +278 -0
- sky/adaptors/do.py +8 -2
- sky/adaptors/gcp.py +11 -0
- sky/adaptors/ibm.py +5 -2
- sky/adaptors/kubernetes.py +64 -0
- sky/adaptors/nebius.py +3 -1
- sky/adaptors/primeintellect.py +1 -0
- sky/adaptors/seeweb.py +183 -0
- sky/adaptors/shadeform.py +89 -0
- sky/admin_policy.py +20 -0
- sky/authentication.py +157 -263
- sky/backends/__init__.py +3 -2
- sky/backends/backend.py +11 -3
- sky/backends/backend_utils.py +588 -184
- sky/backends/cloud_vm_ray_backend.py +1088 -904
- sky/backends/local_docker_backend.py +9 -5
- sky/backends/task_codegen.py +633 -0
- sky/backends/wheel_utils.py +18 -0
- sky/catalog/__init__.py +8 -0
- sky/catalog/aws_catalog.py +4 -0
- sky/catalog/common.py +19 -1
- sky/catalog/data_fetchers/fetch_aws.py +102 -80
- sky/catalog/data_fetchers/fetch_gcp.py +30 -3
- sky/catalog/data_fetchers/fetch_nebius.py +9 -6
- sky/catalog/data_fetchers/fetch_runpod.py +698 -0
- sky/catalog/data_fetchers/fetch_seeweb.py +329 -0
- sky/catalog/data_fetchers/fetch_shadeform.py +142 -0
- sky/catalog/kubernetes_catalog.py +24 -28
- sky/catalog/primeintellect_catalog.py +95 -0
- sky/catalog/runpod_catalog.py +5 -1
- sky/catalog/seeweb_catalog.py +184 -0
- sky/catalog/shadeform_catalog.py +165 -0
- sky/check.py +73 -43
- sky/client/cli/command.py +675 -412
- sky/client/cli/flags.py +4 -2
- sky/{volumes/utils.py → client/cli/table_utils.py} +111 -13
- sky/client/cli/utils.py +79 -0
- sky/client/common.py +12 -2
- sky/client/sdk.py +132 -63
- sky/client/sdk_async.py +34 -33
- sky/cloud_stores.py +82 -3
- sky/clouds/__init__.py +6 -0
- sky/clouds/aws.py +337 -129
- sky/clouds/azure.py +24 -18
- sky/clouds/cloud.py +40 -13
- sky/clouds/cudo.py +16 -13
- sky/clouds/do.py +9 -7
- sky/clouds/fluidstack.py +12 -5
- sky/clouds/gcp.py +14 -7
- sky/clouds/hyperbolic.py +12 -5
- sky/clouds/ibm.py +12 -5
- sky/clouds/kubernetes.py +80 -45
- sky/clouds/lambda_cloud.py +12 -5
- sky/clouds/nebius.py +23 -9
- sky/clouds/oci.py +19 -12
- sky/clouds/paperspace.py +4 -1
- sky/clouds/primeintellect.py +317 -0
- sky/clouds/runpod.py +85 -24
- sky/clouds/scp.py +12 -8
- sky/clouds/seeweb.py +477 -0
- sky/clouds/shadeform.py +400 -0
- sky/clouds/ssh.py +4 -2
- sky/clouds/utils/scp_utils.py +61 -50
- sky/clouds/vast.py +33 -27
- sky/clouds/vsphere.py +14 -16
- sky/core.py +174 -165
- sky/dashboard/out/404.html +1 -1
- sky/dashboard/out/_next/static/96_E2yl3QAiIJGOYCkSpB/_buildManifest.js +1 -0
- sky/dashboard/out/_next/static/chunks/1141-e6aa9ab418717c59.js +11 -0
- sky/dashboard/out/_next/static/chunks/1871-7e202677c42f43fe.js +6 -0
- sky/dashboard/out/_next/static/chunks/2260-7703229c33c5ebd5.js +1 -0
- sky/dashboard/out/_next/static/chunks/2369.fc20f0c2c8ed9fe7.js +15 -0
- sky/dashboard/out/_next/static/chunks/2755.edd818326d489a1d.js +26 -0
- sky/dashboard/out/_next/static/chunks/3294.20a8540fe697d5ee.js +1 -0
- sky/dashboard/out/_next/static/chunks/3785.7e245f318f9d1121.js +1 -0
- sky/dashboard/out/_next/static/chunks/{6601-06114c982db410b6.js → 3800-7b45f9fbb6308557.js} +1 -1
- sky/dashboard/out/_next/static/chunks/4725.172ede95d1b21022.js +1 -0
- sky/dashboard/out/_next/static/chunks/4937.a2baa2df5572a276.js +15 -0
- sky/dashboard/out/_next/static/chunks/6212-7bd06f60ba693125.js +13 -0
- sky/dashboard/out/_next/static/chunks/6856-8f27d1c10c98def8.js +1 -0
- sky/dashboard/out/_next/static/chunks/6990-9146207c4567fdfd.js +1 -0
- sky/dashboard/out/_next/static/chunks/7359-c8d04e06886000b3.js +30 -0
- sky/dashboard/out/_next/static/chunks/7615-019513abc55b3b47.js +1 -0
- sky/dashboard/out/_next/static/chunks/8640.5b9475a2d18c5416.js +16 -0
- sky/dashboard/out/_next/static/chunks/8969-452f9d5cbdd2dc73.js +1 -0
- sky/dashboard/out/_next/static/chunks/9025.fa408f3242e9028d.js +6 -0
- sky/dashboard/out/_next/static/chunks/9353-cff34f7e773b2e2b.js +1 -0
- sky/dashboard/out/_next/static/chunks/9360.a536cf6b1fa42355.js +31 -0
- sky/dashboard/out/_next/static/chunks/9847.3aaca6bb33455140.js +30 -0
- sky/dashboard/out/_next/static/chunks/pages/{_app-ce361c6959bc2001.js → _app-bde01e4a2beec258.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-792db96d918c98c9.js +16 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-abfcac9c137aa543.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters-ee39056f9851a3ff.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/infra/{[context]-6563820e094f68ca.js → [context]-c0b5935149902e6f.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/{infra-aabba60d57826e0f.js → infra-aed0ea19df7cf961.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-d66997e2bfc837cf.js +16 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs/pools/[pool]-9faf940b253e3e06.js +21 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs-2072b48b617989c9.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/users-f42674164aa73423.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/volumes-b84b948ff357c43e.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/workspaces/{[name]-af76bb06dbb3954f.js → [name]-84a40f8c7c627fe4.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/{workspaces-7598c33a746cdc91.js → workspaces-531b2f8c4bf89f82.js} +1 -1
- sky/dashboard/out/_next/static/chunks/webpack-64e05f17bf2cf8ce.js +1 -0
- sky/dashboard/out/_next/static/css/0748ce22df867032.css +3 -0
- sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
- sky/dashboard/out/clusters/[cluster].html +1 -1
- sky/dashboard/out/clusters.html +1 -1
- sky/dashboard/out/config.html +1 -1
- sky/dashboard/out/index.html +1 -1
- sky/dashboard/out/infra/[context].html +1 -1
- sky/dashboard/out/infra.html +1 -1
- sky/dashboard/out/jobs/[job].html +1 -1
- sky/dashboard/out/jobs/pools/[pool].html +1 -1
- sky/dashboard/out/jobs.html +1 -1
- sky/dashboard/out/users.html +1 -1
- sky/dashboard/out/volumes.html +1 -1
- sky/dashboard/out/workspace/new.html +1 -1
- sky/dashboard/out/workspaces/[name].html +1 -1
- sky/dashboard/out/workspaces.html +1 -1
- sky/data/data_utils.py +92 -1
- sky/data/mounting_utils.py +162 -29
- sky/data/storage.py +200 -19
- sky/data/storage_utils.py +10 -45
- sky/exceptions.py +18 -7
- sky/execution.py +74 -31
- sky/global_user_state.py +605 -191
- sky/jobs/__init__.py +2 -0
- sky/jobs/client/sdk.py +101 -4
- sky/jobs/client/sdk_async.py +31 -5
- sky/jobs/constants.py +15 -8
- sky/jobs/controller.py +726 -284
- sky/jobs/file_content_utils.py +128 -0
- sky/jobs/log_gc.py +193 -0
- sky/jobs/recovery_strategy.py +250 -100
- sky/jobs/scheduler.py +271 -173
- sky/jobs/server/core.py +367 -114
- sky/jobs/server/server.py +81 -35
- sky/jobs/server/utils.py +89 -35
- sky/jobs/state.py +1498 -620
- sky/jobs/utils.py +771 -306
- sky/logs/agent.py +40 -5
- sky/logs/aws.py +9 -19
- sky/metrics/utils.py +282 -39
- sky/optimizer.py +1 -1
- sky/provision/__init__.py +37 -1
- sky/provision/aws/config.py +34 -13
- sky/provision/aws/instance.py +5 -2
- sky/provision/azure/instance.py +5 -3
- sky/provision/common.py +2 -0
- sky/provision/cudo/instance.py +4 -3
- sky/provision/do/instance.py +4 -3
- sky/provision/docker_utils.py +97 -26
- sky/provision/fluidstack/instance.py +6 -5
- sky/provision/gcp/config.py +6 -1
- sky/provision/gcp/instance.py +4 -2
- sky/provision/hyperbolic/instance.py +4 -2
- sky/provision/instance_setup.py +66 -20
- sky/provision/kubernetes/__init__.py +2 -0
- sky/provision/kubernetes/config.py +7 -44
- sky/provision/kubernetes/constants.py +0 -1
- sky/provision/kubernetes/instance.py +609 -213
- sky/provision/kubernetes/manifests/fusermount-server-daemonset.yaml +1 -2
- sky/provision/kubernetes/network.py +12 -8
- sky/provision/kubernetes/network_utils.py +8 -25
- sky/provision/kubernetes/utils.py +382 -418
- sky/provision/kubernetes/volume.py +150 -18
- sky/provision/lambda_cloud/instance.py +16 -13
- sky/provision/nebius/instance.py +6 -2
- sky/provision/nebius/utils.py +103 -86
- sky/provision/oci/instance.py +4 -2
- sky/provision/paperspace/instance.py +4 -3
- sky/provision/primeintellect/__init__.py +10 -0
- sky/provision/primeintellect/config.py +11 -0
- sky/provision/primeintellect/instance.py +454 -0
- sky/provision/primeintellect/utils.py +398 -0
- sky/provision/provisioner.py +30 -9
- sky/provision/runpod/__init__.py +2 -0
- sky/provision/runpod/instance.py +4 -3
- sky/provision/runpod/volume.py +69 -13
- sky/provision/scp/instance.py +307 -130
- sky/provision/seeweb/__init__.py +11 -0
- sky/provision/seeweb/config.py +13 -0
- sky/provision/seeweb/instance.py +812 -0
- sky/provision/shadeform/__init__.py +11 -0
- sky/provision/shadeform/config.py +12 -0
- sky/provision/shadeform/instance.py +351 -0
- sky/provision/shadeform/shadeform_utils.py +83 -0
- sky/provision/vast/instance.py +5 -3
- sky/provision/volume.py +164 -0
- sky/provision/vsphere/common/ssl_helper.py +1 -1
- sky/provision/vsphere/common/vapiconnect.py +2 -1
- sky/provision/vsphere/common/vim_utils.py +3 -2
- sky/provision/vsphere/instance.py +8 -6
- sky/provision/vsphere/vsphere_utils.py +8 -1
- sky/resources.py +11 -3
- sky/schemas/api/responses.py +107 -6
- sky/schemas/db/global_user_state/008_skylet_ssh_tunnel_metadata.py +34 -0
- sky/schemas/db/global_user_state/009_last_activity_and_launched_at.py +89 -0
- sky/schemas/db/global_user_state/010_save_ssh_key.py +66 -0
- sky/schemas/db/global_user_state/011_is_ephemeral.py +34 -0
- sky/schemas/db/kv_cache/001_initial_schema.py +29 -0
- sky/schemas/db/serve_state/002_yaml_content.py +34 -0
- sky/schemas/db/skypilot_config/001_initial_schema.py +30 -0
- sky/schemas/db/spot_jobs/002_cluster_pool.py +3 -3
- sky/schemas/db/spot_jobs/004_job_file_contents.py +42 -0
- sky/schemas/db/spot_jobs/005_logs_gc.py +38 -0
- sky/schemas/db/spot_jobs/006_controller_pid_started_at.py +34 -0
- sky/schemas/db/spot_jobs/007_config_file_content.py +34 -0
- sky/schemas/generated/jobsv1_pb2.py +86 -0
- sky/schemas/generated/jobsv1_pb2.pyi +254 -0
- sky/schemas/generated/jobsv1_pb2_grpc.py +542 -0
- sky/schemas/generated/managed_jobsv1_pb2.py +76 -0
- sky/schemas/generated/managed_jobsv1_pb2.pyi +278 -0
- sky/schemas/generated/managed_jobsv1_pb2_grpc.py +278 -0
- sky/schemas/generated/servev1_pb2.py +58 -0
- sky/schemas/generated/servev1_pb2.pyi +115 -0
- sky/schemas/generated/servev1_pb2_grpc.py +322 -0
- sky/serve/autoscalers.py +2 -0
- sky/serve/client/impl.py +55 -21
- sky/serve/constants.py +4 -3
- sky/serve/controller.py +17 -11
- sky/serve/load_balancing_policies.py +1 -1
- sky/serve/replica_managers.py +219 -142
- sky/serve/serve_rpc_utils.py +179 -0
- sky/serve/serve_state.py +63 -54
- sky/serve/serve_utils.py +145 -109
- sky/serve/server/core.py +46 -25
- sky/serve/server/impl.py +311 -162
- sky/serve/server/server.py +21 -19
- sky/serve/service.py +84 -68
- sky/serve/service_spec.py +45 -7
- sky/server/auth/loopback.py +38 -0
- sky/server/auth/oauth2_proxy.py +12 -7
- sky/server/common.py +47 -24
- sky/server/config.py +62 -28
- sky/server/constants.py +9 -1
- sky/server/daemons.py +109 -38
- sky/server/metrics.py +76 -96
- sky/server/middleware_utils.py +166 -0
- sky/server/requests/executor.py +381 -145
- sky/server/requests/payloads.py +71 -18
- sky/server/requests/preconditions.py +15 -13
- sky/server/requests/request_names.py +121 -0
- sky/server/requests/requests.py +507 -157
- sky/server/requests/serializers/decoders.py +48 -17
- sky/server/requests/serializers/encoders.py +85 -20
- sky/server/requests/threads.py +117 -0
- sky/server/rest.py +116 -24
- sky/server/server.py +420 -172
- sky/server/stream_utils.py +219 -45
- sky/server/uvicorn.py +30 -19
- sky/setup_files/MANIFEST.in +6 -1
- sky/setup_files/alembic.ini +8 -0
- sky/setup_files/dependencies.py +62 -19
- sky/setup_files/setup.py +44 -44
- sky/sky_logging.py +13 -5
- sky/skylet/attempt_skylet.py +106 -24
- sky/skylet/configs.py +3 -1
- sky/skylet/constants.py +111 -26
- sky/skylet/events.py +64 -10
- sky/skylet/job_lib.py +141 -104
- sky/skylet/log_lib.py +233 -5
- sky/skylet/log_lib.pyi +40 -2
- sky/skylet/providers/ibm/node_provider.py +12 -8
- sky/skylet/providers/ibm/vpc_provider.py +13 -12
- sky/skylet/runtime_utils.py +21 -0
- sky/skylet/services.py +524 -0
- sky/skylet/skylet.py +22 -1
- sky/skylet/subprocess_daemon.py +104 -29
- sky/skypilot_config.py +99 -79
- sky/ssh_node_pools/server.py +9 -8
- sky/task.py +221 -104
- sky/templates/aws-ray.yml.j2 +1 -0
- sky/templates/azure-ray.yml.j2 +1 -0
- sky/templates/cudo-ray.yml.j2 +1 -0
- sky/templates/do-ray.yml.j2 +1 -0
- sky/templates/fluidstack-ray.yml.j2 +1 -0
- sky/templates/gcp-ray.yml.j2 +1 -0
- sky/templates/hyperbolic-ray.yml.j2 +1 -0
- sky/templates/ibm-ray.yml.j2 +2 -1
- sky/templates/jobs-controller.yaml.j2 +3 -0
- sky/templates/kubernetes-ray.yml.j2 +196 -55
- sky/templates/lambda-ray.yml.j2 +1 -0
- sky/templates/nebius-ray.yml.j2 +3 -0
- sky/templates/oci-ray.yml.j2 +1 -0
- sky/templates/paperspace-ray.yml.j2 +1 -0
- sky/templates/primeintellect-ray.yml.j2 +72 -0
- sky/templates/runpod-ray.yml.j2 +1 -0
- sky/templates/scp-ray.yml.j2 +1 -0
- sky/templates/seeweb-ray.yml.j2 +171 -0
- sky/templates/shadeform-ray.yml.j2 +73 -0
- sky/templates/vast-ray.yml.j2 +1 -0
- sky/templates/vsphere-ray.yml.j2 +1 -0
- sky/templates/websocket_proxy.py +188 -43
- sky/usage/usage_lib.py +16 -4
- sky/users/permission.py +60 -43
- sky/utils/accelerator_registry.py +6 -3
- sky/utils/admin_policy_utils.py +18 -5
- sky/utils/annotations.py +22 -0
- sky/utils/asyncio_utils.py +78 -0
- sky/utils/atomic.py +1 -1
- sky/utils/auth_utils.py +153 -0
- sky/utils/cli_utils/status_utils.py +12 -7
- sky/utils/cluster_utils.py +28 -6
- sky/utils/command_runner.py +88 -27
- sky/utils/command_runner.pyi +36 -3
- sky/utils/common.py +3 -1
- sky/utils/common_utils.py +37 -4
- sky/utils/config_utils.py +1 -14
- sky/utils/context.py +127 -40
- sky/utils/context_utils.py +73 -18
- sky/utils/controller_utils.py +229 -70
- sky/utils/db/db_utils.py +95 -18
- sky/utils/db/kv_cache.py +149 -0
- sky/utils/db/migration_utils.py +24 -7
- sky/utils/env_options.py +4 -0
- sky/utils/git.py +559 -1
- sky/utils/kubernetes/create_cluster.sh +15 -30
- sky/utils/kubernetes/delete_cluster.sh +10 -7
- sky/utils/kubernetes/{deploy_remote_cluster.py → deploy_ssh_node_pools.py} +258 -380
- sky/utils/kubernetes/generate_kind_config.py +6 -66
- sky/utils/kubernetes/gpu_labeler.py +13 -3
- sky/utils/kubernetes/k8s_gpu_labeler_job.yaml +2 -1
- sky/utils/kubernetes/k8s_gpu_labeler_setup.yaml +16 -16
- sky/utils/kubernetes/kubernetes_deploy_utils.py +213 -194
- sky/utils/kubernetes/rsync_helper.sh +11 -3
- sky/utils/kubernetes_enums.py +7 -15
- sky/utils/lock_events.py +4 -4
- sky/utils/locks.py +128 -31
- sky/utils/log_utils.py +0 -319
- sky/utils/resource_checker.py +13 -10
- sky/utils/resources_utils.py +53 -29
- sky/utils/rich_utils.py +8 -4
- sky/utils/schemas.py +107 -52
- sky/utils/subprocess_utils.py +17 -4
- sky/utils/thread_utils.py +91 -0
- sky/utils/timeline.py +2 -1
- sky/utils/ux_utils.py +35 -1
- sky/utils/volume.py +88 -4
- sky/utils/yaml_utils.py +9 -0
- sky/volumes/client/sdk.py +48 -10
- sky/volumes/server/core.py +59 -22
- sky/volumes/server/server.py +46 -17
- sky/volumes/volume.py +54 -42
- sky/workspaces/core.py +57 -21
- sky/workspaces/server.py +13 -12
- sky_templates/README.md +3 -0
- sky_templates/__init__.py +3 -0
- sky_templates/ray/__init__.py +0 -0
- sky_templates/ray/start_cluster +183 -0
- sky_templates/ray/stop_cluster +75 -0
- {skypilot_nightly-1.0.0.dev20250905.dist-info → skypilot_nightly-1.0.0.dev20251203.dist-info}/METADATA +331 -65
- skypilot_nightly-1.0.0.dev20251203.dist-info/RECORD +611 -0
- skypilot_nightly-1.0.0.dev20251203.dist-info/top_level.txt +2 -0
- sky/client/cli/git.py +0 -549
- sky/dashboard/out/_next/static/chunks/1121-408ed10b2f9fce17.js +0 -1
- sky/dashboard/out/_next/static/chunks/1141-943efc7aff0f0c06.js +0 -1
- sky/dashboard/out/_next/static/chunks/1836-37fede578e2da5f8.js +0 -40
- sky/dashboard/out/_next/static/chunks/3015-86cabed5d4669ad0.js +0 -1
- sky/dashboard/out/_next/static/chunks/3294.c80326aec9bfed40.js +0 -6
- sky/dashboard/out/_next/static/chunks/3785.4872a2f3aa489880.js +0 -1
- sky/dashboard/out/_next/static/chunks/4045.b30465273dc5e468.js +0 -21
- sky/dashboard/out/_next/static/chunks/4676-9da7fdbde90b5549.js +0 -10
- sky/dashboard/out/_next/static/chunks/4725.10f7a9a5d3ea8208.js +0 -1
- sky/dashboard/out/_next/static/chunks/5339.3fda4a4010ff4e06.js +0 -51
- sky/dashboard/out/_next/static/chunks/6135-4b4d5e824b7f9d3c.js +0 -1
- sky/dashboard/out/_next/static/chunks/649.b9d7f7d10c1b8c53.js +0 -45
- sky/dashboard/out/_next/static/chunks/6856-dca7962af4814e1b.js +0 -1
- sky/dashboard/out/_next/static/chunks/6990-08b2a1cae076a943.js +0 -1
- sky/dashboard/out/_next/static/chunks/7325.b4bc99ce0892dcd5.js +0 -6
- sky/dashboard/out/_next/static/chunks/754-d0da8ab45f9509e9.js +0 -18
- sky/dashboard/out/_next/static/chunks/7669.1f5d9a402bf5cc42.js +0 -36
- sky/dashboard/out/_next/static/chunks/8969-0be3036bf86f8256.js +0 -1
- sky/dashboard/out/_next/static/chunks/9025.c12318fb6a1a9093.js +0 -6
- sky/dashboard/out/_next/static/chunks/9037-fa1737818d0a0969.js +0 -6
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-1cbba24bd1bd35f8.js +0 -16
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-0b4b35dc1dfe046c.js +0 -16
- sky/dashboard/out/_next/static/chunks/pages/clusters-469814d711d63b1b.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-dd64309c3fe67ed2.js +0 -11
- sky/dashboard/out/_next/static/chunks/pages/jobs/pools/[pool]-07349868f7905d37.js +0 -16
- sky/dashboard/out/_next/static/chunks/pages/jobs-1f70d9faa564804f.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/users-018bf31cda52e11b.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/volumes-739726d6b823f532.js +0 -1
- sky/dashboard/out/_next/static/chunks/webpack-4fe903277b57b523.js +0 -1
- sky/dashboard/out/_next/static/css/4614e06482d7309e.css +0 -3
- sky/dashboard/out/_next/static/mS-4qZPSkRuA1u-g2wQhg/_buildManifest.js +0 -1
- sky/templates/kubernetes-ssh-jump.yml.j2 +0 -94
- sky/utils/kubernetes/ssh_jump_lifecycle_manager.py +0 -191
- skypilot_nightly-1.0.0.dev20250905.dist-info/RECORD +0 -547
- skypilot_nightly-1.0.0.dev20250905.dist-info/top_level.txt +0 -1
- /sky/dashboard/out/_next/static/{mS-4qZPSkRuA1u-g2wQhg → 96_E2yl3QAiIJGOYCkSpB}/_ssgManifest.js +0 -0
- {skypilot_nightly-1.0.0.dev20250905.dist-info → skypilot_nightly-1.0.0.dev20251203.dist-info}/WHEEL +0 -0
- {skypilot_nightly-1.0.0.dev20250905.dist-info → skypilot_nightly-1.0.0.dev20251203.dist-info}/entry_points.txt +0 -0
- {skypilot_nightly-1.0.0.dev20250905.dist-info → skypilot_nightly-1.0.0.dev20251203.dist-info}/licenses/LICENSE +0 -0
sky/logs/agent.py
CHANGED
|
@@ -34,23 +34,50 @@ class FluentbitAgent(LoggingAgent):
|
|
|
34
34
|
def get_setup_command(self,
|
|
35
35
|
cluster_name: resources_utils.ClusterName) -> str:
|
|
36
36
|
install_cmd = (
|
|
37
|
-
'if ! command -v fluent-bit >/dev/null 2>&1; then '
|
|
38
|
-
'sudo apt-get install -y gnupg; '
|
|
39
37
|
# pylint: disable=line-too-long
|
|
40
|
-
'
|
|
38
|
+
'if ! command -v fluent-bit >/dev/null 2>&1 && [ ! -f /opt/fluent-bit/bin/fluent-bit ]; then '
|
|
39
|
+
'sudo apt-get update; sudo apt-get install -y gnupg; '
|
|
40
|
+
# pylint: disable=line-too-long
|
|
41
|
+
'sudo sh -c \'curl -L https://packages.fluentbit.io/fluentbit.key | gpg --dearmor > /usr/share/keyrings/fluentbit-keyring.gpg\'; '
|
|
42
|
+
# pylint: disable=line-too-long
|
|
43
|
+
'os_id=$(grep -oP \'(?<=^ID=).*\' /etc/os-release 2>/dev/null || lsb_release -is 2>/dev/null | tr \'[:upper:]\' \'[:lower:]\'); '
|
|
44
|
+
# pylint: disable=line-too-long
|
|
45
|
+
'codename=$(grep -oP \'(?<=VERSION_CODENAME=).*\' /etc/os-release 2>/dev/null || lsb_release -cs 2>/dev/null); '
|
|
46
|
+
# pylint: disable=line-too-long
|
|
47
|
+
'echo "deb [signed-by=/usr/share/keyrings/fluentbit-keyring.gpg] https://packages.fluentbit.io/$os_id/$codename $codename main" | sudo tee /etc/apt/sources.list.d/fluent-bit.list; '
|
|
48
|
+
'sudo apt-get update; '
|
|
49
|
+
'sudo apt-get install -y fluent-bit; '
|
|
41
50
|
'fi')
|
|
42
51
|
cfg = self.fluentbit_config(cluster_name)
|
|
43
52
|
cfg_path = os.path.join(constants.LOGGING_CONFIG_DIR, 'fluentbit.yaml')
|
|
44
53
|
config_cmd = (f'mkdir -p {constants.LOGGING_CONFIG_DIR} && '
|
|
45
54
|
f'echo {shlex.quote(cfg)} > {cfg_path}')
|
|
55
|
+
kill_prior_cmd = (
|
|
56
|
+
'if [ -f "/tmp/fluentbit.pid" ]; then '
|
|
57
|
+
# pylint: disable=line-too-long
|
|
58
|
+
'echo "Killing prior fluent-bit process $(cat /tmp/fluentbit.pid)"; '
|
|
59
|
+
'kill "$(cat /tmp/fluentbit.pid)" || true; '
|
|
60
|
+
'fi')
|
|
46
61
|
start_cmd = ('nohup $(command -v fluent-bit || '
|
|
47
62
|
'echo "/opt/fluent-bit/bin/fluent-bit") '
|
|
48
|
-
f'-c {cfg_path} > /tmp/fluentbit.log 2>&1 &'
|
|
49
|
-
|
|
63
|
+
f'-c {cfg_path} > /tmp/fluentbit.log 2>&1 & '
|
|
64
|
+
'echo $! > /tmp/fluentbit.pid')
|
|
65
|
+
return ('set -e; '
|
|
66
|
+
f'{install_cmd}; '
|
|
67
|
+
f'{config_cmd}; '
|
|
68
|
+
f'{kill_prior_cmd}; '
|
|
69
|
+
f'{start_cmd}')
|
|
50
70
|
|
|
51
71
|
def fluentbit_config(self,
|
|
52
72
|
cluster_name: resources_utils.ClusterName) -> str:
|
|
53
73
|
cfg_dict = {
|
|
74
|
+
'parsers': [{
|
|
75
|
+
'name': 'sky-ray-parser',
|
|
76
|
+
'format': 'regex',
|
|
77
|
+
# pylint: disable=line-too-long
|
|
78
|
+
'regex': r'(?:\x1b\[[\d;]+m)?\((?<worker_name>[^,]+)(?:,\s*rank=(?<rank>\d+))?(?:,\s*pid=(?<pid>\d+))(?:,\s*ip=(?<ip>[\d.]+))?\)(?:\x1b\[[\d;]+m)?\s*(?<log_line>.*)',
|
|
79
|
+
'types': 'rank:integer pid:integer',
|
|
80
|
+
}],
|
|
54
81
|
'pipeline': {
|
|
55
82
|
'inputs': [{
|
|
56
83
|
'name': 'tail',
|
|
@@ -62,6 +89,14 @@ class FluentbitAgent(LoggingAgent):
|
|
|
62
89
|
# right after the job completion.
|
|
63
90
|
'refresh_interval': 1,
|
|
64
91
|
}],
|
|
92
|
+
'filters': [{
|
|
93
|
+
'name': 'parser',
|
|
94
|
+
'match': '*',
|
|
95
|
+
'key_name': 'log',
|
|
96
|
+
'parser': 'sky-ray-parser',
|
|
97
|
+
'preserve_key': 'on', # preserve field for backwards compat
|
|
98
|
+
'reserve_data': 'on',
|
|
99
|
+
}],
|
|
65
100
|
'outputs': [self.fluentbit_output_config(cluster_name)],
|
|
66
101
|
}
|
|
67
102
|
}
|
sky/logs/aws.py
CHANGED
|
@@ -5,7 +5,6 @@ from typing import Any, Dict, Optional
|
|
|
5
5
|
import pydantic
|
|
6
6
|
|
|
7
7
|
from sky.logs.agent import FluentbitAgent
|
|
8
|
-
from sky.skylet import constants
|
|
9
8
|
from sky.utils import resources_utils
|
|
10
9
|
from sky.utils import yaml_utils
|
|
11
10
|
|
|
@@ -176,6 +175,8 @@ class CloudwatchLoggingAgent(FluentbitAgent):
|
|
|
176
175
|
Returns:
|
|
177
176
|
The Fluent Bit configuration as a YAML string.
|
|
178
177
|
"""
|
|
178
|
+
cfg_dict = yaml_utils.read_yaml_str(
|
|
179
|
+
super().fluentbit_config(cluster_name))
|
|
179
180
|
display_name = cluster_name.display_name
|
|
180
181
|
unique_name = cluster_name.name_on_cloud
|
|
181
182
|
# Build tags for the log stream
|
|
@@ -197,24 +198,13 @@ class CloudwatchLoggingAgent(FluentbitAgent):
|
|
|
197
198
|
'value': value
|
|
198
199
|
})
|
|
199
200
|
|
|
200
|
-
|
|
201
|
-
|
|
202
|
-
|
|
203
|
-
|
|
204
|
-
|
|
205
|
-
|
|
206
|
-
|
|
207
|
-
# job creates a new log file and we must be responsive
|
|
208
|
-
# for this: the VM might be autodown within a minute
|
|
209
|
-
# right after the job completion.
|
|
210
|
-
'refresh_interval': 1,
|
|
211
|
-
'processors': {
|
|
212
|
-
'logs': log_processors,
|
|
213
|
-
}
|
|
214
|
-
}],
|
|
215
|
-
'outputs': [self.fluentbit_output_config(cluster_name)],
|
|
216
|
-
}
|
|
217
|
-
}
|
|
201
|
+
# Add log processors to config
|
|
202
|
+
processors_config = cfg_dict['pipeline']['inputs'][0].get(
|
|
203
|
+
'processors', {})
|
|
204
|
+
processors_logs_config = processors_config.get('logs', [])
|
|
205
|
+
processors_logs_config.extend(log_processors)
|
|
206
|
+
processors_config['logs'] = processors_logs_config
|
|
207
|
+
cfg_dict['pipeline']['inputs'][0]['processors'] = processors_config
|
|
218
208
|
|
|
219
209
|
return yaml_utils.dump_yaml_str(cfg_dict)
|
|
220
210
|
|
sky/metrics/utils.py
CHANGED
|
@@ -1,11 +1,218 @@
|
|
|
1
1
|
"""Utilities for processing GPU metrics from Kubernetes clusters."""
|
|
2
|
+
import contextlib
|
|
3
|
+
import functools
|
|
2
4
|
import os
|
|
3
5
|
import re
|
|
6
|
+
import select
|
|
4
7
|
import subprocess
|
|
5
8
|
import time
|
|
6
9
|
from typing import List, Optional, Tuple
|
|
7
10
|
|
|
8
11
|
import httpx
|
|
12
|
+
import prometheus_client as prom
|
|
13
|
+
|
|
14
|
+
from sky import sky_logging
|
|
15
|
+
from sky.skylet import constants
|
|
16
|
+
from sky.utils import common_utils
|
|
17
|
+
from sky.utils import context_utils
|
|
18
|
+
|
|
19
|
+
_SELECT_TIMEOUT = 1
|
|
20
|
+
_SELECT_BUFFER_SIZE = 4096
|
|
21
|
+
|
|
22
|
+
_KB = 2**10
|
|
23
|
+
_MB = 2**20
|
|
24
|
+
_MEM_BUCKETS = [
|
|
25
|
+
_KB,
|
|
26
|
+
256 * _KB,
|
|
27
|
+
512 * _KB,
|
|
28
|
+
_MB,
|
|
29
|
+
2 * _MB,
|
|
30
|
+
4 * _MB,
|
|
31
|
+
8 * _MB,
|
|
32
|
+
16 * _MB,
|
|
33
|
+
32 * _MB,
|
|
34
|
+
64 * _MB,
|
|
35
|
+
128 * _MB,
|
|
36
|
+
256 * _MB,
|
|
37
|
+
float('inf'),
|
|
38
|
+
]
|
|
39
|
+
|
|
40
|
+
logger = sky_logging.init_logger(__name__)
|
|
41
|
+
|
|
42
|
+
# Whether the metrics are enabled, cannot be changed at runtime.
|
|
43
|
+
METRICS_ENABLED = os.environ.get(constants.ENV_VAR_SERVER_METRICS_ENABLED,
|
|
44
|
+
'false').lower() == 'true'
|
|
45
|
+
|
|
46
|
+
# Time spent processing a piece of code, refer to time_it().
|
|
47
|
+
SKY_APISERVER_CODE_DURATION_SECONDS = prom.Histogram(
|
|
48
|
+
'sky_apiserver_code_duration_seconds',
|
|
49
|
+
'Time spent processing code',
|
|
50
|
+
['name', 'group'],
|
|
51
|
+
buckets=(0.001, 0.005, 0.01, 0.025, 0.05, 0.075, 0.1, 0.125, 0.15, 0.25,
|
|
52
|
+
0.35, 0.5, 0.75, 1, 1.25, 1.5, 1.75, 2, 2.5, 2.75, 3, 3.5, 4, 4.5,
|
|
53
|
+
5, 7.5, 10.0, 12.5, 15.0, 17.5, 20.0, 25.0, 30.0, 35.0, 40.0, 45.0,
|
|
54
|
+
50.0, 55.0, 60.0, 80.0, 120.0, 140.0, 160.0, 180.0, 200.0, 220.0,
|
|
55
|
+
240.0, 260.0, 280.0, 300.0, 320.0, 340.0, 360.0, 380.0, 400.0,
|
|
56
|
+
420.0, 440.0, 460.0, 480.0, 500.0, 520.0, 540.0, 560.0, 580.0,
|
|
57
|
+
600.0, 620.0, 640.0, 660.0, 680.0, 700.0, 720.0, 740.0, 760.0,
|
|
58
|
+
780.0, 800.0, 820.0, 840.0, 860.0, 880.0, 900.0, 920.0, 940.0,
|
|
59
|
+
960.0, 980.0, 1000.0, float('inf')),
|
|
60
|
+
)
|
|
61
|
+
|
|
62
|
+
# Total number of API server requests, grouped by path, method, and status.
|
|
63
|
+
SKY_APISERVER_REQUESTS_TOTAL = prom.Counter(
|
|
64
|
+
'sky_apiserver_requests_total',
|
|
65
|
+
'Total number of API server requests',
|
|
66
|
+
['path', 'method', 'status'],
|
|
67
|
+
)
|
|
68
|
+
|
|
69
|
+
# Time spent processing API server requests, grouped by path, method, and
|
|
70
|
+
# status.
|
|
71
|
+
SKY_APISERVER_REQUEST_DURATION_SECONDS = prom.Histogram(
|
|
72
|
+
'sky_apiserver_request_duration_seconds',
|
|
73
|
+
'Time spent processing API server requests',
|
|
74
|
+
['path', 'method', 'status'],
|
|
75
|
+
buckets=(0.001, 0.005, 0.01, 0.025, 0.05, 0.075, 0.1, 0.125, 0.15, 0.25,
|
|
76
|
+
0.35, 0.5, 0.75, 1, 1.25, 1.5, 1.75, 2, 2.5, 2.75, 3, 3.5, 4, 4.5,
|
|
77
|
+
5, 7.5, 10.0, 12.5, 15.0, 17.5, 20.0, 25.0, 30.0, 35.0, 40.0, 45.0,
|
|
78
|
+
50.0, 55.0, 60.0, 80.0, 120.0, 140.0, 160.0, 180.0, 200.0, 220.0,
|
|
79
|
+
240.0, 260.0, 280.0, 300.0, 320.0, 340.0, 360.0, 380.0, 400.0,
|
|
80
|
+
420.0, 440.0, 460.0, 480.0, 500.0, 520.0, 540.0, 560.0, 580.0,
|
|
81
|
+
600.0, 620.0, 640.0, 660.0, 680.0, 700.0, 720.0, 740.0, 760.0,
|
|
82
|
+
780.0, 800.0, 820.0, 840.0, 860.0, 880.0, 900.0, 920.0, 940.0,
|
|
83
|
+
960.0, 980.0, 1000.0, float('inf')),
|
|
84
|
+
)
|
|
85
|
+
|
|
86
|
+
SKY_APISERVER_EVENT_LOOP_LAG_SECONDS = prom.Histogram(
|
|
87
|
+
'sky_apiserver_event_loop_lag_seconds',
|
|
88
|
+
'Scheduling delay of the server event loop',
|
|
89
|
+
['pid'],
|
|
90
|
+
buckets=(0.001, 0.005, 0.01, 0.025, 0.05, 0.075, 0.1, 0.125, 0.15, 0.25,
|
|
91
|
+
0.35, 0.5, 0.75, 1, 1.25, 1.5, 1.75, 2, 2.5, 2.75, 3, 3.5, 4, 4.5,
|
|
92
|
+
5, 7.5, 10.0, 12.5, 15.0, 17.5, 20.0, 25.0, 30.0, 35.0, 40.0, 45.0,
|
|
93
|
+
50.0, 55.0, 60.0, 80.0, 120.0, 140.0, 160.0, 180.0, 200.0, 220.0,
|
|
94
|
+
240.0, 260.0, 280.0, 300.0, 320.0, 340.0, 360.0, 380.0, 400.0,
|
|
95
|
+
420.0, 440.0, 460.0, 480.0, 500.0, 520.0, 540.0, 560.0, 580.0,
|
|
96
|
+
600.0, 620.0, 640.0, 660.0, 680.0, 700.0, 720.0, 740.0, 760.0,
|
|
97
|
+
780.0, 800.0, 820.0, 840.0, 860.0, 880.0, 900.0, 920.0, 940.0,
|
|
98
|
+
960.0, 980.0, 1000.0, float('inf')),
|
|
99
|
+
)
|
|
100
|
+
|
|
101
|
+
SKY_APISERVER_WEBSOCKET_CONNECTIONS = prom.Gauge(
|
|
102
|
+
'sky_apiserver_websocket_connections',
|
|
103
|
+
'Number of websocket connections',
|
|
104
|
+
['pid'],
|
|
105
|
+
multiprocess_mode='livesum',
|
|
106
|
+
)
|
|
107
|
+
|
|
108
|
+
SKY_APISERVER_WEBSOCKET_CLOSED_TOTAL = prom.Counter(
|
|
109
|
+
'sky_apiserver_websocket_closed_total',
|
|
110
|
+
'Number of websocket closed',
|
|
111
|
+
['pid', 'reason'],
|
|
112
|
+
)
|
|
113
|
+
|
|
114
|
+
# The number of execution starts in each worker process, we do not record
|
|
115
|
+
# histogram here as the duration has been measured in
|
|
116
|
+
# SKY_APISERVER_CODE_DURATION_SECONDS without the worker label (process id).
|
|
117
|
+
# Recording histogram WITH worker label will cause high cardinality.
|
|
118
|
+
SKY_APISERVER_PROCESS_EXECUTION_START_TOTAL = prom.Counter(
|
|
119
|
+
'sky_apiserver_process_execution_start_total',
|
|
120
|
+
'Total number of execution starts in each worker process',
|
|
121
|
+
['request', 'pid'],
|
|
122
|
+
)
|
|
123
|
+
|
|
124
|
+
SKY_APISERVER_PROCESS_PEAK_RSS = prom.Gauge(
|
|
125
|
+
'sky_apiserver_process_peak_rss',
|
|
126
|
+
'Peak RSS we saw in each process in last 30 seconds',
|
|
127
|
+
['pid', 'type'],
|
|
128
|
+
)
|
|
129
|
+
|
|
130
|
+
SKY_APISERVER_PROCESS_CPU_TOTAL = prom.Gauge(
|
|
131
|
+
'sky_apiserver_process_cpu_total',
|
|
132
|
+
'Total CPU times a worker process has been running',
|
|
133
|
+
['pid', 'type', 'mode'],
|
|
134
|
+
)
|
|
135
|
+
|
|
136
|
+
SKY_APISERVER_REQUEST_MEMORY_USAGE_BYTES = prom.Histogram(
|
|
137
|
+
'sky_apiserver_request_memory_usage_bytes',
|
|
138
|
+
'Peak memory usage of requests', ['name'],
|
|
139
|
+
buckets=_MEM_BUCKETS)
|
|
140
|
+
|
|
141
|
+
SKY_APISERVER_REQUEST_RSS_INCR_BYTES = prom.Histogram(
|
|
142
|
+
'sky_apiserver_request_rss_incr_bytes',
|
|
143
|
+
'RSS increment after requests', ['name'],
|
|
144
|
+
buckets=_MEM_BUCKETS)
|
|
145
|
+
|
|
146
|
+
SKY_APISERVER_WEBSOCKET_SSH_LATENCY_SECONDS = prom.Histogram(
|
|
147
|
+
'sky_apiserver_websocket_ssh_latency_seconds',
|
|
148
|
+
('Time taken for ssh message to go from client to API server and back'
|
|
149
|
+
'to the client. This does not include: latency to reach the pod, '
|
|
150
|
+
'overhead from sending through the k8s port-forward tunnel, or '
|
|
151
|
+
'ssh server lag on the destination pod.'),
|
|
152
|
+
['pid'],
|
|
153
|
+
buckets=(0.001, 0.005, 0.01, 0.025, 0.05, 0.075, 0.1, 0.125, 0.15, 0.25,
|
|
154
|
+
0.35, 0.5, 0.75, 1, 1.25, 1.5, 1.75, 2, 2.5, 2.75, 3, 3.5, 4, 4.5,
|
|
155
|
+
5, 7.5, 10.0, 12.5, 15.0, 17.5, 20.0, 25.0, 30.0, 35.0, 40.0, 45.0,
|
|
156
|
+
50.0, 55.0, 60.0, 80.0, 120.0, 140.0, 160.0, 180.0, 200.0, 220.0,
|
|
157
|
+
240.0, 260.0, 280.0, 300.0, 320.0, 340.0, 360.0, 380.0, 400.0,
|
|
158
|
+
420.0, 440.0, 460.0, 480.0, 500.0, 520.0, 540.0, 560.0, 580.0,
|
|
159
|
+
600.0, 620.0, 640.0, 660.0, 680.0, 700.0, 720.0, 740.0, 760.0,
|
|
160
|
+
780.0, 800.0, 820.0, 840.0, 860.0, 880.0, 900.0, 920.0, 940.0,
|
|
161
|
+
960.0, 980.0, 1000.0, float('inf')),
|
|
162
|
+
)
|
|
163
|
+
|
|
164
|
+
SKY_APISERVER_LONG_EXECUTORS = prom.Gauge(
|
|
165
|
+
'sky_apiserver_long_executors',
|
|
166
|
+
'Total number of long-running request executors in the API server',
|
|
167
|
+
)
|
|
168
|
+
|
|
169
|
+
SKY_APISERVER_SHORT_EXECUTORS = prom.Gauge(
|
|
170
|
+
'sky_apiserver_short_executors',
|
|
171
|
+
'Total number of short-running request executors in the API server',
|
|
172
|
+
)
|
|
173
|
+
|
|
174
|
+
|
|
175
|
+
@contextlib.contextmanager
|
|
176
|
+
def time_it(name: str, group: str = 'default'):
|
|
177
|
+
"""Context manager to measure and record code execution duration."""
|
|
178
|
+
if not METRICS_ENABLED:
|
|
179
|
+
yield
|
|
180
|
+
else:
|
|
181
|
+
start_time = time.time()
|
|
182
|
+
try:
|
|
183
|
+
yield
|
|
184
|
+
finally:
|
|
185
|
+
duration = time.time() - start_time
|
|
186
|
+
SKY_APISERVER_CODE_DURATION_SECONDS.labels(
|
|
187
|
+
name=name, group=group).observe(duration)
|
|
188
|
+
|
|
189
|
+
|
|
190
|
+
def time_me(func):
|
|
191
|
+
"""Measure the duration of decorated function."""
|
|
192
|
+
|
|
193
|
+
@functools.wraps(func)
|
|
194
|
+
def wrapper(*args, **kwargs):
|
|
195
|
+
if not METRICS_ENABLED:
|
|
196
|
+
return func(*args, **kwargs)
|
|
197
|
+
name = f'{func.__module__}/{func.__name__}'
|
|
198
|
+
with time_it(name, group='function'):
|
|
199
|
+
return func(*args, **kwargs)
|
|
200
|
+
|
|
201
|
+
return wrapper
|
|
202
|
+
|
|
203
|
+
|
|
204
|
+
def time_me_async(func):
|
|
205
|
+
"""Measure the duration of decorated async function."""
|
|
206
|
+
|
|
207
|
+
@functools.wraps(func)
|
|
208
|
+
async def async_wrapper(*args, **kwargs):
|
|
209
|
+
if not METRICS_ENABLED:
|
|
210
|
+
return await func(*args, **kwargs)
|
|
211
|
+
name = f'{func.__module__}/{func.__name__}'
|
|
212
|
+
with time_it(name, group='function'):
|
|
213
|
+
return await func(*args, **kwargs)
|
|
214
|
+
|
|
215
|
+
return async_wrapper
|
|
9
216
|
|
|
10
217
|
|
|
11
218
|
def start_svc_port_forward(context: str, namespace: str, service: str,
|
|
@@ -34,46 +241,72 @@ def start_svc_port_forward(context: str, namespace: str, service: str,
|
|
|
34
241
|
if 'KUBECONFIG' not in env:
|
|
35
242
|
env['KUBECONFIG'] = os.path.expanduser('~/.kube/config')
|
|
36
243
|
|
|
37
|
-
|
|
38
|
-
|
|
39
|
-
stdout=subprocess.PIPE,
|
|
40
|
-
stderr=subprocess.STDOUT,
|
|
41
|
-
text=True,
|
|
42
|
-
env=env)
|
|
43
|
-
|
|
244
|
+
port_forward_process = None
|
|
245
|
+
port_forward_exit = False
|
|
44
246
|
local_port = None
|
|
45
|
-
|
|
46
|
-
|
|
47
|
-
|
|
48
|
-
|
|
49
|
-
|
|
50
|
-
|
|
51
|
-
|
|
52
|
-
|
|
53
|
-
|
|
54
|
-
|
|
55
|
-
|
|
56
|
-
|
|
57
|
-
|
|
58
|
-
|
|
59
|
-
|
|
60
|
-
|
|
61
|
-
|
|
62
|
-
|
|
247
|
+
poller = None
|
|
248
|
+
fd = None
|
|
249
|
+
|
|
250
|
+
try:
|
|
251
|
+
# start the port forward process
|
|
252
|
+
port_forward_process = subprocess.Popen(cmd,
|
|
253
|
+
stdout=subprocess.PIPE,
|
|
254
|
+
stderr=subprocess.STDOUT,
|
|
255
|
+
text=True,
|
|
256
|
+
env=env)
|
|
257
|
+
|
|
258
|
+
# Use poll() instead of select() to avoid FD_SETSIZE limit
|
|
259
|
+
poller = select.poll()
|
|
260
|
+
assert port_forward_process.stdout is not None
|
|
261
|
+
fd = port_forward_process.stdout.fileno()
|
|
262
|
+
poller.register(fd, select.POLLIN)
|
|
263
|
+
|
|
264
|
+
start_time = time.time()
|
|
265
|
+
buffer = ''
|
|
266
|
+
# wait for the port forward to start and extract the local port
|
|
267
|
+
while time.time() - start_time < start_port_forward_timeout:
|
|
268
|
+
if port_forward_process.poll() is not None:
|
|
269
|
+
# port forward process has terminated
|
|
270
|
+
if port_forward_process.returncode != 0:
|
|
271
|
+
port_forward_exit = True
|
|
272
|
+
break
|
|
273
|
+
|
|
274
|
+
# Wait up to 1000ms for data to be available without blocking
|
|
275
|
+
# poll() takes timeout in milliseconds
|
|
276
|
+
events = poller.poll(_SELECT_TIMEOUT * 1000)
|
|
277
|
+
|
|
278
|
+
if events:
|
|
279
|
+
# Read available bytes from the FD without blocking
|
|
280
|
+
raw = os.read(fd, _SELECT_BUFFER_SIZE)
|
|
281
|
+
chunk = raw.decode(errors='ignore')
|
|
282
|
+
buffer += chunk
|
|
283
|
+
match = re.search(r'Forwarding from 127\.0\.0\.1:(\d+)', buffer)
|
|
63
284
|
if match:
|
|
64
285
|
local_port = int(match.group(1))
|
|
65
286
|
break
|
|
66
287
|
|
|
67
|
-
|
|
68
|
-
|
|
69
|
-
|
|
288
|
+
# sleep for 100ms to avoid busy-waiting
|
|
289
|
+
time.sleep(0.1)
|
|
290
|
+
except BaseException: # pylint: disable=broad-exception-caught
|
|
291
|
+
if port_forward_process:
|
|
292
|
+
stop_svc_port_forward(port_forward_process,
|
|
293
|
+
timeout=terminate_port_forward_timeout)
|
|
294
|
+
raise
|
|
295
|
+
finally:
|
|
296
|
+
if poller is not None and fd is not None:
|
|
297
|
+
try:
|
|
298
|
+
poller.unregister(fd)
|
|
299
|
+
except (OSError, ValueError):
|
|
300
|
+
# FD may already be unregistered or invalid
|
|
301
|
+
pass
|
|
302
|
+
if port_forward_exit:
|
|
303
|
+
raise RuntimeError(f'Port forward failed for service {service} in '
|
|
304
|
+
f'namespace {namespace} on context {context}')
|
|
70
305
|
if local_port is None:
|
|
71
306
|
try:
|
|
72
|
-
port_forward_process
|
|
73
|
-
|
|
74
|
-
|
|
75
|
-
port_forward_process.kill()
|
|
76
|
-
port_forward_process.wait()
|
|
307
|
+
if port_forward_process:
|
|
308
|
+
stop_svc_port_forward(port_forward_process,
|
|
309
|
+
timeout=terminate_port_forward_timeout)
|
|
77
310
|
finally:
|
|
78
311
|
raise RuntimeError(
|
|
79
312
|
f'Failed to extract local port for service {service} in '
|
|
@@ -82,14 +315,15 @@ def start_svc_port_forward(context: str, namespace: str, service: str,
|
|
|
82
315
|
return port_forward_process, local_port
|
|
83
316
|
|
|
84
317
|
|
|
85
|
-
def stop_svc_port_forward(port_forward_process: subprocess.Popen
|
|
318
|
+
def stop_svc_port_forward(port_forward_process: subprocess.Popen,
|
|
319
|
+
timeout: int = 5) -> None:
|
|
86
320
|
"""Stops a port forward to a service in a Kubernetes cluster.
|
|
87
321
|
Args:
|
|
88
322
|
port_forward_process: The subprocess.Popen process to terminate
|
|
89
323
|
"""
|
|
90
324
|
try:
|
|
91
325
|
port_forward_process.terminate()
|
|
92
|
-
port_forward_process.wait(timeout=
|
|
326
|
+
port_forward_process.wait(timeout=timeout)
|
|
93
327
|
except subprocess.TimeoutExpired:
|
|
94
328
|
port_forward_process.kill()
|
|
95
329
|
port_forward_process.wait()
|
|
@@ -122,8 +356,8 @@ async def send_metrics_request_with_port_forward(
|
|
|
122
356
|
port_forward_process = None
|
|
123
357
|
try:
|
|
124
358
|
# Start port forward
|
|
125
|
-
port_forward_process, local_port =
|
|
126
|
-
context, namespace, service, service_port)
|
|
359
|
+
port_forward_process, local_port = await context_utils.to_thread(
|
|
360
|
+
start_svc_port_forward, context, namespace, service, service_port)
|
|
127
361
|
|
|
128
362
|
# Build endpoint URL
|
|
129
363
|
endpoint = f'http://localhost:{local_port}{endpoint_path}'
|
|
@@ -140,10 +374,15 @@ async def send_metrics_request_with_port_forward(
|
|
|
140
374
|
response.raise_for_status()
|
|
141
375
|
return response.text
|
|
142
376
|
|
|
377
|
+
except Exception as e: # pylint: disable=broad-exception-caught
|
|
378
|
+
logger.error(f'Failed to send metrics request with port forward: '
|
|
379
|
+
f'{common_utils.format_exception(e)}')
|
|
380
|
+
raise
|
|
143
381
|
finally:
|
|
144
382
|
# Always clean up port forward
|
|
145
383
|
if port_forward_process:
|
|
146
|
-
stop_svc_port_forward
|
|
384
|
+
await context_utils.to_thread(stop_svc_port_forward,
|
|
385
|
+
port_forward_process)
|
|
147
386
|
|
|
148
387
|
|
|
149
388
|
async def add_cluster_name_label(metrics_text: str, context: str) -> str:
|
|
@@ -193,7 +432,11 @@ async def get_metrics_for_context(context: str) -> str:
|
|
|
193
432
|
"""
|
|
194
433
|
# Query both DCGM metrics and kube_pod_labels metrics
|
|
195
434
|
# This ensures the dashboard can perform joins to filter by skypilot cluster
|
|
196
|
-
match_patterns = [
|
|
435
|
+
match_patterns = [
|
|
436
|
+
'{__name__=~"node_memory_MemAvailable_bytes|node_memory_MemTotal_bytes|DCGM_.*"}', # pylint: disable=line-too-long
|
|
437
|
+
'kube_pod_labels',
|
|
438
|
+
'node_cpu_seconds_total{mode="idle"}'
|
|
439
|
+
]
|
|
197
440
|
|
|
198
441
|
# TODO(rohan): don't hardcode the namespace and service name
|
|
199
442
|
metrics_text = await send_metrics_request_with_port_forward(
|
sky/optimizer.py
CHANGED
|
@@ -1019,7 +1019,7 @@ class Optimizer:
|
|
|
1019
1019
|
if res.instance_type is not None
|
|
1020
1020
|
])
|
|
1021
1021
|
candidate_str = resources_utils.format_resource(
|
|
1022
|
-
best_resources,
|
|
1022
|
+
best_resources, simplified_only=True)[0]
|
|
1023
1023
|
|
|
1024
1024
|
logger.info(
|
|
1025
1025
|
f'{colorama.Style.DIM}🔍 Multiple {cloud} instances '
|
sky/provision/__init__.py
CHANGED
|
@@ -24,8 +24,11 @@ from sky.provision import kubernetes
|
|
|
24
24
|
from sky.provision import lambda_cloud
|
|
25
25
|
from sky.provision import nebius
|
|
26
26
|
from sky.provision import oci
|
|
27
|
+
from sky.provision import primeintellect
|
|
27
28
|
from sky.provision import runpod
|
|
28
29
|
from sky.provision import scp
|
|
30
|
+
from sky.provision import seeweb
|
|
31
|
+
from sky.provision import shadeform
|
|
29
32
|
from sky.provision import ssh
|
|
30
33
|
from sky.provision import vast
|
|
31
34
|
from sky.provision import vsphere
|
|
@@ -77,6 +80,7 @@ def query_instances(
|
|
|
77
80
|
cluster_name_on_cloud: str,
|
|
78
81
|
provider_config: Optional[Dict[str, Any]] = None,
|
|
79
82
|
non_terminated_only: bool = True,
|
|
83
|
+
retry_if_missing: bool = False,
|
|
80
84
|
) -> Dict[str, Tuple[Optional['status_lib.ClusterStatus'], Optional[str]]]:
|
|
81
85
|
"""Query instances.
|
|
82
86
|
|
|
@@ -85,6 +89,11 @@ def query_instances(
|
|
|
85
89
|
|
|
86
90
|
A None status means the instance is marked as "terminated"
|
|
87
91
|
or "terminating".
|
|
92
|
+
|
|
93
|
+
Args:
|
|
94
|
+
retry_if_missing: Whether to retry the call to the cloud api if the
|
|
95
|
+
cluster is not found when querying the live status on the cloud.
|
|
96
|
+
NOTE: This is currently only used on kubernetes.
|
|
88
97
|
"""
|
|
89
98
|
raise NotImplementedError
|
|
90
99
|
|
|
@@ -140,7 +149,34 @@ def get_volume_usedby(
|
|
|
140
149
|
|
|
141
150
|
|
|
142
151
|
@_route_to_cloud_impl
|
|
143
|
-
def
|
|
152
|
+
def get_all_volumes_usedby(
|
|
153
|
+
provider_name: str, configs: List[models.VolumeConfig]
|
|
154
|
+
) -> Tuple[Dict[str, Any], Dict[str, Any]]:
|
|
155
|
+
"""Get the usedby of a volume.
|
|
156
|
+
|
|
157
|
+
Returns:
|
|
158
|
+
usedby_pods: List of dictionaries, each containing the config keys for
|
|
159
|
+
a volume and a key containing pods using the volume.
|
|
160
|
+
These may include pods not created by SkyPilot.
|
|
161
|
+
usedby_clusters: List of dictionaries, each containing the config keys
|
|
162
|
+
for a volume and a key containing clusters using
|
|
163
|
+
the volume.
|
|
164
|
+
"""
|
|
165
|
+
raise NotImplementedError
|
|
166
|
+
|
|
167
|
+
|
|
168
|
+
@_route_to_cloud_impl
|
|
169
|
+
def map_all_volumes_usedby(
|
|
170
|
+
provider_name: str, used_by_pods: Dict[str, Any],
|
|
171
|
+
used_by_clusters: Dict[str, Any],
|
|
172
|
+
config: models.VolumeConfig) -> Tuple[List[str], List[str]]:
|
|
173
|
+
"""Map the usedby resources of a volume."""
|
|
174
|
+
raise NotImplementedError
|
|
175
|
+
|
|
176
|
+
|
|
177
|
+
@_route_to_cloud_impl
|
|
178
|
+
def run_instances(provider_name: str, region: str, cluster_name: str,
|
|
179
|
+
cluster_name_on_cloud: str,
|
|
144
180
|
config: common.ProvisionConfig) -> common.ProvisionRecord:
|
|
145
181
|
"""Start instances with bootstrapped configuration."""
|
|
146
182
|
raise NotImplementedError
|
sky/provision/aws/config.py
CHANGED
|
@@ -305,7 +305,10 @@ def _get_route_tables(ec2: 'mypy_boto3_ec2.ServiceResource',
|
|
|
305
305
|
Returns:
|
|
306
306
|
A list of route tables associated with the options VPC and region
|
|
307
307
|
"""
|
|
308
|
-
filters
|
|
308
|
+
filters: List['ec2_type_defs.FilterTypeDef'] = [{
|
|
309
|
+
'Name': 'association.main',
|
|
310
|
+
'Values': [str(main).lower()],
|
|
311
|
+
}]
|
|
309
312
|
if vpc_id is not None:
|
|
310
313
|
filters.append({'Name': 'vpc-id', 'Values': [vpc_id]})
|
|
311
314
|
logger.debug(
|
|
@@ -406,10 +409,26 @@ def _usable_subnets(
|
|
|
406
409
|
s for s in candidate_subnets if s.vpc_id == vpc_id_of_sg
|
|
407
410
|
]
|
|
408
411
|
|
|
412
|
+
if not candidate_subnets:
|
|
413
|
+
_skypilot_log_error_and_exit_for_failover(
|
|
414
|
+
'No candidate subnets found in specified VPC '
|
|
415
|
+
f'{vpc_id_of_sg}.')
|
|
416
|
+
|
|
409
417
|
available_subnets = [
|
|
410
418
|
s for s in candidate_subnets if s.state == 'available'
|
|
411
419
|
]
|
|
412
420
|
|
|
421
|
+
if not available_subnets:
|
|
422
|
+
_skypilot_log_error_and_exit_for_failover(
|
|
423
|
+
'All candidate subnets are pending in specified VPC '
|
|
424
|
+
f'{vpc_id_of_sg}.')
|
|
425
|
+
|
|
426
|
+
if len(candidate_subnets) > len(available_subnets):
|
|
427
|
+
num_pruned = len(candidate_subnets) - len(available_subnets)
|
|
428
|
+
logger.debug(
|
|
429
|
+
f'{num_pruned} candidate subnets pruned since they are not '
|
|
430
|
+
'available.')
|
|
431
|
+
|
|
413
432
|
if use_internal_ips:
|
|
414
433
|
# Get private subnets.
|
|
415
434
|
#
|
|
@@ -421,6 +440,10 @@ def _usable_subnets(
|
|
|
421
440
|
if not _is_subnet_public(ec2, s.subnet_id, vpc_id_of_sg) and
|
|
422
441
|
not s.map_public_ip_on_launch
|
|
423
442
|
]
|
|
443
|
+
if not subnets:
|
|
444
|
+
_skypilot_log_error_and_exit_for_failover(
|
|
445
|
+
'The use_internal_ips option is set to True, but all '
|
|
446
|
+
'candidate subnets are public.')
|
|
424
447
|
else:
|
|
425
448
|
# Get public subnets.
|
|
426
449
|
#
|
|
@@ -436,6 +459,10 @@ def _usable_subnets(
|
|
|
436
459
|
s for s in available_subnets
|
|
437
460
|
if _is_subnet_public(ec2, s.subnet_id, vpc_id_of_sg)
|
|
438
461
|
]
|
|
462
|
+
if not subnets:
|
|
463
|
+
_skypilot_log_error_and_exit_for_failover(
|
|
464
|
+
'All candidate subnets are private, did you mean to '
|
|
465
|
+
'set use_internal_ips to True?')
|
|
439
466
|
|
|
440
467
|
subnets = sorted(
|
|
441
468
|
subnets,
|
|
@@ -449,18 +476,7 @@ def _usable_subnets(
|
|
|
449
476
|
'Failed to fetch available subnets from AWS.')
|
|
450
477
|
raise exc
|
|
451
478
|
|
|
452
|
-
if
|
|
453
|
-
vpc_msg = (f'Does a default VPC exist in region '
|
|
454
|
-
f'{ec2.meta.client.meta.region_name}? ') if (
|
|
455
|
-
vpc_id_of_sg is None) else ''
|
|
456
|
-
_skypilot_log_error_and_exit_for_failover(
|
|
457
|
-
f'No usable subnets found. {vpc_msg}'
|
|
458
|
-
'Try manually creating an instance in your specified region to '
|
|
459
|
-
'populate the list of subnets and try again. '
|
|
460
|
-
'Note that the subnet must map public IPs '
|
|
461
|
-
'on instance launch unless you set `use_internal_ips: true` in '
|
|
462
|
-
'the `provider` config.')
|
|
463
|
-
elif _are_user_subnets_pruned(subnets):
|
|
479
|
+
if _are_user_subnets_pruned(subnets):
|
|
464
480
|
_skypilot_log_error_and_exit_for_failover(
|
|
465
481
|
f'The specified subnets are not '
|
|
466
482
|
f'usable: {_get_pruned_subnets(subnets)}')
|
|
@@ -579,6 +595,11 @@ def _get_subnet_and_vpc_id(ec2: 'mypy_boto3_ec2.ServiceResource',
|
|
|
579
595
|
# not want SkyPilot to use.
|
|
580
596
|
if vpc_id_of_sg is None:
|
|
581
597
|
all_subnets = [s for s in all_subnets if s.vpc.is_default]
|
|
598
|
+
if not all_subnets:
|
|
599
|
+
_skypilot_log_error_and_exit_for_failover(
|
|
600
|
+
f'The default VPC in {region} either does not exist or '
|
|
601
|
+
'has no subnets.')
|
|
602
|
+
|
|
582
603
|
subnets, vpc_id = _usable_subnets(
|
|
583
604
|
ec2,
|
|
584
605
|
user_specified_subnets=None,
|