skypilot-nightly 1.0.0.dev20250905__py3-none-any.whl → 1.0.0.dev20251210__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sky/__init__.py +12 -2
- sky/adaptors/aws.py +27 -22
- sky/adaptors/common.py +25 -2
- sky/adaptors/coreweave.py +278 -0
- sky/adaptors/do.py +8 -2
- sky/adaptors/gcp.py +11 -0
- sky/adaptors/ibm.py +5 -2
- sky/adaptors/kubernetes.py +64 -0
- sky/adaptors/nebius.py +3 -1
- sky/adaptors/primeintellect.py +1 -0
- sky/adaptors/seeweb.py +183 -0
- sky/adaptors/shadeform.py +89 -0
- sky/adaptors/slurm.py +478 -0
- sky/admin_policy.py +20 -0
- sky/authentication.py +157 -263
- sky/backends/__init__.py +3 -2
- sky/backends/backend.py +11 -3
- sky/backends/backend_utils.py +630 -185
- sky/backends/cloud_vm_ray_backend.py +1111 -928
- sky/backends/local_docker_backend.py +9 -5
- sky/backends/task_codegen.py +971 -0
- sky/backends/wheel_utils.py +18 -0
- sky/catalog/__init__.py +8 -3
- sky/catalog/aws_catalog.py +4 -0
- sky/catalog/common.py +19 -1
- sky/catalog/data_fetchers/fetch_aws.py +102 -80
- sky/catalog/data_fetchers/fetch_gcp.py +30 -3
- sky/catalog/data_fetchers/fetch_nebius.py +9 -6
- sky/catalog/data_fetchers/fetch_runpod.py +698 -0
- sky/catalog/data_fetchers/fetch_seeweb.py +329 -0
- sky/catalog/data_fetchers/fetch_shadeform.py +142 -0
- sky/catalog/kubernetes_catalog.py +36 -32
- sky/catalog/primeintellect_catalog.py +95 -0
- sky/catalog/runpod_catalog.py +5 -1
- sky/catalog/seeweb_catalog.py +184 -0
- sky/catalog/shadeform_catalog.py +165 -0
- sky/catalog/slurm_catalog.py +243 -0
- sky/check.py +87 -46
- sky/client/cli/command.py +1004 -434
- sky/client/cli/flags.py +4 -2
- sky/{volumes/utils.py → client/cli/table_utils.py} +111 -13
- sky/client/cli/utils.py +79 -0
- sky/client/common.py +12 -2
- sky/client/sdk.py +188 -65
- sky/client/sdk_async.py +34 -33
- sky/cloud_stores.py +82 -3
- sky/clouds/__init__.py +8 -0
- sky/clouds/aws.py +337 -129
- sky/clouds/azure.py +24 -18
- sky/clouds/cloud.py +47 -13
- sky/clouds/cudo.py +16 -13
- sky/clouds/do.py +9 -7
- sky/clouds/fluidstack.py +12 -5
- sky/clouds/gcp.py +14 -7
- sky/clouds/hyperbolic.py +12 -5
- sky/clouds/ibm.py +12 -5
- sky/clouds/kubernetes.py +80 -45
- sky/clouds/lambda_cloud.py +12 -5
- sky/clouds/nebius.py +23 -9
- sky/clouds/oci.py +19 -12
- sky/clouds/paperspace.py +4 -1
- sky/clouds/primeintellect.py +317 -0
- sky/clouds/runpod.py +85 -24
- sky/clouds/scp.py +12 -8
- sky/clouds/seeweb.py +477 -0
- sky/clouds/shadeform.py +400 -0
- sky/clouds/slurm.py +578 -0
- sky/clouds/ssh.py +6 -3
- sky/clouds/utils/scp_utils.py +61 -50
- sky/clouds/vast.py +43 -27
- sky/clouds/vsphere.py +14 -16
- sky/core.py +296 -195
- sky/dashboard/out/404.html +1 -1
- sky/dashboard/out/_next/static/KYAhEFa3FTfq4JyKVgo-s/_buildManifest.js +1 -0
- sky/dashboard/out/_next/static/chunks/1141-9c810f01ff4f398a.js +11 -0
- sky/dashboard/out/_next/static/chunks/1871-7e202677c42f43fe.js +6 -0
- sky/dashboard/out/_next/static/chunks/2260-7703229c33c5ebd5.js +1 -0
- sky/dashboard/out/_next/static/chunks/2369.fc20f0c2c8ed9fe7.js +15 -0
- sky/dashboard/out/_next/static/chunks/2755.edd818326d489a1d.js +26 -0
- sky/dashboard/out/_next/static/chunks/3294.ddda8c6c6f9f24dc.js +1 -0
- sky/dashboard/out/_next/static/chunks/3785.7e245f318f9d1121.js +1 -0
- sky/dashboard/out/_next/static/chunks/{6601-06114c982db410b6.js → 3800-b589397dc09c5b4e.js} +1 -1
- sky/dashboard/out/_next/static/chunks/3850-fd5696f3bbbaddae.js +1 -0
- sky/dashboard/out/_next/static/chunks/4725.172ede95d1b21022.js +1 -0
- sky/dashboard/out/_next/static/chunks/4937.a2baa2df5572a276.js +15 -0
- sky/dashboard/out/_next/static/chunks/6212-7bd06f60ba693125.js +13 -0
- sky/dashboard/out/_next/static/chunks/6856-da20c5fd999f319c.js +1 -0
- sky/dashboard/out/_next/static/chunks/6990-09cbf02d3cd518c3.js +1 -0
- sky/dashboard/out/_next/static/chunks/7359-c8d04e06886000b3.js +30 -0
- sky/dashboard/out/_next/static/chunks/7615-019513abc55b3b47.js +1 -0
- sky/dashboard/out/_next/static/chunks/8640.5b9475a2d18c5416.js +16 -0
- sky/dashboard/out/_next/static/chunks/8969-452f9d5cbdd2dc73.js +1 -0
- sky/dashboard/out/_next/static/chunks/9025.fa408f3242e9028d.js +6 -0
- sky/dashboard/out/_next/static/chunks/9353-8369df1cf105221c.js +1 -0
- sky/dashboard/out/_next/static/chunks/9360.a536cf6b1fa42355.js +31 -0
- sky/dashboard/out/_next/static/chunks/9847.3aaca6bb33455140.js +30 -0
- sky/dashboard/out/_next/static/chunks/pages/_app-68b647e26f9d2793.js +34 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-33f525539665fdfd.js +16 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-a7565f586ef86467.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters-9e5d47818b9bdadd.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/{config-dfb9bf07b13045f4.js → config-718cdc365de82689.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/infra/{[context]-6563820e094f68ca.js → [context]-12c559ec4d81fdbd.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/{infra-aabba60d57826e0f.js → infra-d187cd0413d72475.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-895847b6cf200b04.js +16 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs/pools/[pool]-8d0f4655400b4eb9.js +21 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs-e5a98f17f8513a96.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/plugins/[...slug]-4f46050ca065d8f8.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/users-2f7646eb77785a2c.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/volumes-ef19d49c6d0e8500.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/workspaces/{[name]-af76bb06dbb3954f.js → [name]-96e0f298308da7e2.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/{workspaces-7598c33a746cdc91.js → workspaces-cb4da3abe08ebf19.js} +1 -1
- sky/dashboard/out/_next/static/chunks/webpack-fba3de387ff6bb08.js +1 -0
- sky/dashboard/out/_next/static/css/c5a4cfd2600fc715.css +3 -0
- sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
- sky/dashboard/out/clusters/[cluster].html +1 -1
- sky/dashboard/out/clusters.html +1 -1
- sky/dashboard/out/config.html +1 -1
- sky/dashboard/out/index.html +1 -1
- sky/dashboard/out/infra/[context].html +1 -1
- sky/dashboard/out/infra.html +1 -1
- sky/dashboard/out/jobs/[job].html +1 -1
- sky/dashboard/out/jobs/pools/[pool].html +1 -1
- sky/dashboard/out/jobs.html +1 -1
- sky/dashboard/out/plugins/[...slug].html +1 -0
- sky/dashboard/out/users.html +1 -1
- sky/dashboard/out/volumes.html +1 -1
- sky/dashboard/out/workspace/new.html +1 -1
- sky/dashboard/out/workspaces/[name].html +1 -1
- sky/dashboard/out/workspaces.html +1 -1
- sky/data/data_utils.py +92 -1
- sky/data/mounting_utils.py +177 -30
- sky/data/storage.py +200 -19
- sky/data/storage_utils.py +10 -45
- sky/exceptions.py +18 -7
- sky/execution.py +74 -31
- sky/global_user_state.py +605 -191
- sky/jobs/__init__.py +2 -0
- sky/jobs/client/sdk.py +101 -4
- sky/jobs/client/sdk_async.py +31 -5
- sky/jobs/constants.py +15 -8
- sky/jobs/controller.py +726 -284
- sky/jobs/file_content_utils.py +128 -0
- sky/jobs/log_gc.py +193 -0
- sky/jobs/recovery_strategy.py +250 -100
- sky/jobs/scheduler.py +271 -173
- sky/jobs/server/core.py +367 -114
- sky/jobs/server/server.py +81 -35
- sky/jobs/server/utils.py +89 -35
- sky/jobs/state.py +1498 -620
- sky/jobs/utils.py +771 -306
- sky/logs/agent.py +40 -5
- sky/logs/aws.py +9 -19
- sky/metrics/utils.py +282 -39
- sky/models.py +2 -0
- sky/optimizer.py +7 -6
- sky/provision/__init__.py +38 -1
- sky/provision/aws/config.py +34 -13
- sky/provision/aws/instance.py +5 -2
- sky/provision/azure/instance.py +5 -3
- sky/provision/common.py +22 -0
- sky/provision/cudo/instance.py +4 -3
- sky/provision/do/instance.py +4 -3
- sky/provision/docker_utils.py +112 -28
- sky/provision/fluidstack/instance.py +6 -5
- sky/provision/gcp/config.py +6 -1
- sky/provision/gcp/instance.py +4 -2
- sky/provision/hyperbolic/instance.py +4 -2
- sky/provision/instance_setup.py +66 -20
- sky/provision/kubernetes/__init__.py +2 -0
- sky/provision/kubernetes/config.py +7 -44
- sky/provision/kubernetes/constants.py +0 -1
- sky/provision/kubernetes/instance.py +609 -213
- sky/provision/kubernetes/manifests/fusermount-server-daemonset.yaml +1 -2
- sky/provision/kubernetes/network.py +12 -8
- sky/provision/kubernetes/network_utils.py +8 -25
- sky/provision/kubernetes/utils.py +422 -422
- sky/provision/kubernetes/volume.py +150 -18
- sky/provision/lambda_cloud/instance.py +16 -13
- sky/provision/nebius/instance.py +6 -2
- sky/provision/nebius/utils.py +103 -86
- sky/provision/oci/instance.py +4 -2
- sky/provision/paperspace/instance.py +4 -3
- sky/provision/primeintellect/__init__.py +10 -0
- sky/provision/primeintellect/config.py +11 -0
- sky/provision/primeintellect/instance.py +454 -0
- sky/provision/primeintellect/utils.py +398 -0
- sky/provision/provisioner.py +45 -15
- sky/provision/runpod/__init__.py +2 -0
- sky/provision/runpod/instance.py +4 -3
- sky/provision/runpod/volume.py +69 -13
- sky/provision/scp/instance.py +307 -130
- sky/provision/seeweb/__init__.py +11 -0
- sky/provision/seeweb/config.py +13 -0
- sky/provision/seeweb/instance.py +812 -0
- sky/provision/shadeform/__init__.py +11 -0
- sky/provision/shadeform/config.py +12 -0
- sky/provision/shadeform/instance.py +351 -0
- sky/provision/shadeform/shadeform_utils.py +83 -0
- sky/provision/slurm/__init__.py +12 -0
- sky/provision/slurm/config.py +13 -0
- sky/provision/slurm/instance.py +572 -0
- sky/provision/slurm/utils.py +583 -0
- sky/provision/vast/instance.py +9 -4
- sky/provision/vast/utils.py +10 -6
- sky/provision/volume.py +164 -0
- sky/provision/vsphere/common/ssl_helper.py +1 -1
- sky/provision/vsphere/common/vapiconnect.py +2 -1
- sky/provision/vsphere/common/vim_utils.py +3 -2
- sky/provision/vsphere/instance.py +8 -6
- sky/provision/vsphere/vsphere_utils.py +8 -1
- sky/resources.py +11 -3
- sky/schemas/api/responses.py +107 -6
- sky/schemas/db/global_user_state/008_skylet_ssh_tunnel_metadata.py +34 -0
- sky/schemas/db/global_user_state/009_last_activity_and_launched_at.py +89 -0
- sky/schemas/db/global_user_state/010_save_ssh_key.py +66 -0
- sky/schemas/db/global_user_state/011_is_ephemeral.py +34 -0
- sky/schemas/db/kv_cache/001_initial_schema.py +29 -0
- sky/schemas/db/serve_state/002_yaml_content.py +34 -0
- sky/schemas/db/skypilot_config/001_initial_schema.py +30 -0
- sky/schemas/db/spot_jobs/002_cluster_pool.py +3 -3
- sky/schemas/db/spot_jobs/004_job_file_contents.py +42 -0
- sky/schemas/db/spot_jobs/005_logs_gc.py +38 -0
- sky/schemas/db/spot_jobs/006_controller_pid_started_at.py +34 -0
- sky/schemas/db/spot_jobs/007_config_file_content.py +34 -0
- sky/schemas/generated/jobsv1_pb2.py +86 -0
- sky/schemas/generated/jobsv1_pb2.pyi +254 -0
- sky/schemas/generated/jobsv1_pb2_grpc.py +542 -0
- sky/schemas/generated/managed_jobsv1_pb2.py +76 -0
- sky/schemas/generated/managed_jobsv1_pb2.pyi +278 -0
- sky/schemas/generated/managed_jobsv1_pb2_grpc.py +278 -0
- sky/schemas/generated/servev1_pb2.py +58 -0
- sky/schemas/generated/servev1_pb2.pyi +115 -0
- sky/schemas/generated/servev1_pb2_grpc.py +322 -0
- sky/serve/autoscalers.py +2 -0
- sky/serve/client/impl.py +55 -21
- sky/serve/constants.py +4 -3
- sky/serve/controller.py +17 -11
- sky/serve/load_balancing_policies.py +1 -1
- sky/serve/replica_managers.py +219 -142
- sky/serve/serve_rpc_utils.py +179 -0
- sky/serve/serve_state.py +63 -54
- sky/serve/serve_utils.py +145 -109
- sky/serve/server/core.py +46 -25
- sky/serve/server/impl.py +311 -162
- sky/serve/server/server.py +21 -19
- sky/serve/service.py +84 -68
- sky/serve/service_spec.py +45 -7
- sky/server/auth/loopback.py +38 -0
- sky/server/auth/oauth2_proxy.py +12 -7
- sky/server/common.py +47 -24
- sky/server/config.py +62 -28
- sky/server/constants.py +9 -1
- sky/server/daemons.py +109 -38
- sky/server/metrics.py +76 -96
- sky/server/middleware_utils.py +166 -0
- sky/server/plugins.py +222 -0
- sky/server/requests/executor.py +384 -145
- sky/server/requests/payloads.py +83 -19
- sky/server/requests/preconditions.py +15 -13
- sky/server/requests/request_names.py +123 -0
- sky/server/requests/requests.py +511 -157
- sky/server/requests/serializers/decoders.py +48 -17
- sky/server/requests/serializers/encoders.py +102 -20
- sky/server/requests/serializers/return_value_serializers.py +60 -0
- sky/server/requests/threads.py +117 -0
- sky/server/rest.py +116 -24
- sky/server/server.py +497 -179
- sky/server/server_utils.py +30 -0
- sky/server/stream_utils.py +219 -45
- sky/server/uvicorn.py +30 -19
- sky/setup_files/MANIFEST.in +6 -1
- sky/setup_files/alembic.ini +8 -0
- sky/setup_files/dependencies.py +64 -19
- sky/setup_files/setup.py +44 -44
- sky/sky_logging.py +13 -5
- sky/skylet/attempt_skylet.py +116 -24
- sky/skylet/configs.py +3 -1
- sky/skylet/constants.py +139 -29
- sky/skylet/events.py +74 -14
- sky/skylet/executor/__init__.py +1 -0
- sky/skylet/executor/slurm.py +189 -0
- sky/skylet/job_lib.py +143 -105
- sky/skylet/log_lib.py +252 -8
- sky/skylet/log_lib.pyi +47 -7
- sky/skylet/providers/ibm/node_provider.py +12 -8
- sky/skylet/providers/ibm/vpc_provider.py +13 -12
- sky/skylet/runtime_utils.py +21 -0
- sky/skylet/services.py +524 -0
- sky/skylet/skylet.py +27 -2
- sky/skylet/subprocess_daemon.py +104 -28
- sky/skypilot_config.py +99 -79
- sky/ssh_node_pools/constants.py +12 -0
- sky/ssh_node_pools/core.py +40 -3
- sky/ssh_node_pools/deploy/__init__.py +4 -0
- sky/ssh_node_pools/deploy/deploy.py +952 -0
- sky/ssh_node_pools/deploy/tunnel_utils.py +199 -0
- sky/ssh_node_pools/deploy/utils.py +173 -0
- sky/ssh_node_pools/server.py +20 -21
- sky/{utils/kubernetes/ssh_utils.py → ssh_node_pools/utils.py} +9 -6
- sky/task.py +221 -104
- sky/templates/aws-ray.yml.j2 +1 -0
- sky/templates/azure-ray.yml.j2 +1 -0
- sky/templates/cudo-ray.yml.j2 +1 -0
- sky/templates/do-ray.yml.j2 +1 -0
- sky/templates/fluidstack-ray.yml.j2 +1 -0
- sky/templates/gcp-ray.yml.j2 +1 -0
- sky/templates/hyperbolic-ray.yml.j2 +1 -0
- sky/templates/ibm-ray.yml.j2 +2 -1
- sky/templates/jobs-controller.yaml.j2 +3 -0
- sky/templates/kubernetes-ray.yml.j2 +204 -55
- sky/templates/lambda-ray.yml.j2 +1 -0
- sky/templates/nebius-ray.yml.j2 +3 -0
- sky/templates/oci-ray.yml.j2 +1 -0
- sky/templates/paperspace-ray.yml.j2 +1 -0
- sky/templates/primeintellect-ray.yml.j2 +72 -0
- sky/templates/runpod-ray.yml.j2 +1 -0
- sky/templates/scp-ray.yml.j2 +1 -0
- sky/templates/seeweb-ray.yml.j2 +171 -0
- sky/templates/shadeform-ray.yml.j2 +73 -0
- sky/templates/slurm-ray.yml.j2 +85 -0
- sky/templates/vast-ray.yml.j2 +2 -0
- sky/templates/vsphere-ray.yml.j2 +1 -0
- sky/templates/websocket_proxy.py +188 -43
- sky/usage/usage_lib.py +16 -4
- sky/users/model.conf +1 -1
- sky/users/permission.py +84 -44
- sky/users/rbac.py +31 -3
- sky/utils/accelerator_registry.py +6 -3
- sky/utils/admin_policy_utils.py +18 -5
- sky/utils/annotations.py +128 -6
- sky/utils/asyncio_utils.py +78 -0
- sky/utils/atomic.py +1 -1
- sky/utils/auth_utils.py +153 -0
- sky/utils/cli_utils/status_utils.py +12 -7
- sky/utils/cluster_utils.py +28 -6
- sky/utils/command_runner.py +283 -30
- sky/utils/command_runner.pyi +63 -7
- sky/utils/common.py +3 -1
- sky/utils/common_utils.py +55 -7
- sky/utils/config_utils.py +1 -14
- sky/utils/context.py +127 -40
- sky/utils/context_utils.py +73 -18
- sky/utils/controller_utils.py +229 -70
- sky/utils/db/db_utils.py +95 -18
- sky/utils/db/kv_cache.py +149 -0
- sky/utils/db/migration_utils.py +24 -7
- sky/utils/env_options.py +4 -0
- sky/utils/git.py +559 -1
- sky/utils/kubernetes/create_cluster.sh +15 -30
- sky/utils/kubernetes/delete_cluster.sh +10 -7
- sky/utils/kubernetes/generate_kind_config.py +6 -66
- sky/utils/kubernetes/gpu_labeler.py +13 -3
- sky/utils/kubernetes/k8s_gpu_labeler_job.yaml +2 -1
- sky/utils/kubernetes/k8s_gpu_labeler_setup.yaml +16 -16
- sky/utils/kubernetes/kubernetes_deploy_utils.py +187 -260
- sky/utils/kubernetes/rsync_helper.sh +11 -3
- sky/utils/kubernetes/ssh-tunnel.sh +7 -376
- sky/utils/kubernetes_enums.py +7 -15
- sky/utils/lock_events.py +4 -4
- sky/utils/locks.py +128 -31
- sky/utils/log_utils.py +0 -319
- sky/utils/resource_checker.py +13 -10
- sky/utils/resources_utils.py +53 -29
- sky/utils/rich_utils.py +8 -4
- sky/utils/schemas.py +138 -52
- sky/utils/subprocess_utils.py +17 -4
- sky/utils/thread_utils.py +91 -0
- sky/utils/timeline.py +2 -1
- sky/utils/ux_utils.py +35 -1
- sky/utils/volume.py +88 -4
- sky/utils/yaml_utils.py +9 -0
- sky/volumes/client/sdk.py +48 -10
- sky/volumes/server/core.py +59 -22
- sky/volumes/server/server.py +46 -17
- sky/volumes/volume.py +54 -42
- sky/workspaces/core.py +57 -21
- sky/workspaces/server.py +13 -12
- sky_templates/README.md +3 -0
- sky_templates/__init__.py +3 -0
- sky_templates/ray/__init__.py +0 -0
- sky_templates/ray/start_cluster +183 -0
- sky_templates/ray/stop_cluster +75 -0
- {skypilot_nightly-1.0.0.dev20250905.dist-info → skypilot_nightly-1.0.0.dev20251210.dist-info}/METADATA +343 -65
- skypilot_nightly-1.0.0.dev20251210.dist-info/RECORD +629 -0
- skypilot_nightly-1.0.0.dev20251210.dist-info/top_level.txt +2 -0
- sky/client/cli/git.py +0 -549
- sky/dashboard/out/_next/static/chunks/1121-408ed10b2f9fce17.js +0 -1
- sky/dashboard/out/_next/static/chunks/1141-943efc7aff0f0c06.js +0 -1
- sky/dashboard/out/_next/static/chunks/1836-37fede578e2da5f8.js +0 -40
- sky/dashboard/out/_next/static/chunks/3015-86cabed5d4669ad0.js +0 -1
- sky/dashboard/out/_next/static/chunks/3294.c80326aec9bfed40.js +0 -6
- sky/dashboard/out/_next/static/chunks/3785.4872a2f3aa489880.js +0 -1
- sky/dashboard/out/_next/static/chunks/3850-ff4a9a69d978632b.js +0 -1
- sky/dashboard/out/_next/static/chunks/4045.b30465273dc5e468.js +0 -21
- sky/dashboard/out/_next/static/chunks/4676-9da7fdbde90b5549.js +0 -10
- sky/dashboard/out/_next/static/chunks/4725.10f7a9a5d3ea8208.js +0 -1
- sky/dashboard/out/_next/static/chunks/5339.3fda4a4010ff4e06.js +0 -51
- sky/dashboard/out/_next/static/chunks/6135-4b4d5e824b7f9d3c.js +0 -1
- sky/dashboard/out/_next/static/chunks/649.b9d7f7d10c1b8c53.js +0 -45
- sky/dashboard/out/_next/static/chunks/6856-dca7962af4814e1b.js +0 -1
- sky/dashboard/out/_next/static/chunks/6990-08b2a1cae076a943.js +0 -1
- sky/dashboard/out/_next/static/chunks/7325.b4bc99ce0892dcd5.js +0 -6
- sky/dashboard/out/_next/static/chunks/754-d0da8ab45f9509e9.js +0 -18
- sky/dashboard/out/_next/static/chunks/7669.1f5d9a402bf5cc42.js +0 -36
- sky/dashboard/out/_next/static/chunks/8969-0be3036bf86f8256.js +0 -1
- sky/dashboard/out/_next/static/chunks/9025.c12318fb6a1a9093.js +0 -6
- sky/dashboard/out/_next/static/chunks/9037-fa1737818d0a0969.js +0 -6
- sky/dashboard/out/_next/static/chunks/pages/_app-ce361c6959bc2001.js +0 -34
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-1cbba24bd1bd35f8.js +0 -16
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-0b4b35dc1dfe046c.js +0 -16
- sky/dashboard/out/_next/static/chunks/pages/clusters-469814d711d63b1b.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-dd64309c3fe67ed2.js +0 -11
- sky/dashboard/out/_next/static/chunks/pages/jobs/pools/[pool]-07349868f7905d37.js +0 -16
- sky/dashboard/out/_next/static/chunks/pages/jobs-1f70d9faa564804f.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/users-018bf31cda52e11b.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/volumes-739726d6b823f532.js +0 -1
- sky/dashboard/out/_next/static/chunks/webpack-4fe903277b57b523.js +0 -1
- sky/dashboard/out/_next/static/css/4614e06482d7309e.css +0 -3
- sky/dashboard/out/_next/static/mS-4qZPSkRuA1u-g2wQhg/_buildManifest.js +0 -1
- sky/templates/kubernetes-ssh-jump.yml.j2 +0 -94
- sky/utils/kubernetes/cleanup-tunnel.sh +0 -62
- sky/utils/kubernetes/deploy_remote_cluster.py +0 -1299
- sky/utils/kubernetes/ssh_jump_lifecycle_manager.py +0 -191
- skypilot_nightly-1.0.0.dev20250905.dist-info/RECORD +0 -547
- skypilot_nightly-1.0.0.dev20250905.dist-info/top_level.txt +0 -1
- /sky/dashboard/out/_next/static/{mS-4qZPSkRuA1u-g2wQhg → KYAhEFa3FTfq4JyKVgo-s}/_ssgManifest.js +0 -0
- {skypilot_nightly-1.0.0.dev20250905.dist-info → skypilot_nightly-1.0.0.dev20251210.dist-info}/WHEEL +0 -0
- {skypilot_nightly-1.0.0.dev20250905.dist-info → skypilot_nightly-1.0.0.dev20251210.dist-info}/entry_points.txt +0 -0
- {skypilot_nightly-1.0.0.dev20250905.dist-info → skypilot_nightly-1.0.0.dev20251210.dist-info}/licenses/LICENSE +0 -0
sky/logs/agent.py
CHANGED
@@ -34,23 +34,50 @@ class FluentbitAgent(LoggingAgent):
     def get_setup_command(self,
                           cluster_name: resources_utils.ClusterName) -> str:
         install_cmd = (
-            'if ! command -v fluent-bit >/dev/null 2>&1; then '
-            'sudo apt-get install -y gnupg; '
             # pylint: disable=line-too-long
-            '
+            'if ! command -v fluent-bit >/dev/null 2>&1 && [ ! -f /opt/fluent-bit/bin/fluent-bit ]; then '
+            'sudo apt-get update; sudo apt-get install -y gnupg; '
+            # pylint: disable=line-too-long
+            'sudo sh -c \'curl -L https://packages.fluentbit.io/fluentbit.key | gpg --dearmor > /usr/share/keyrings/fluentbit-keyring.gpg\'; '
+            # pylint: disable=line-too-long
+            'os_id=$(grep -oP \'(?<=^ID=).*\' /etc/os-release 2>/dev/null || lsb_release -is 2>/dev/null | tr \'[:upper:]\' \'[:lower:]\'); '
+            # pylint: disable=line-too-long
+            'codename=$(grep -oP \'(?<=VERSION_CODENAME=).*\' /etc/os-release 2>/dev/null || lsb_release -cs 2>/dev/null); '
+            # pylint: disable=line-too-long
+            'echo "deb [signed-by=/usr/share/keyrings/fluentbit-keyring.gpg] https://packages.fluentbit.io/$os_id/$codename $codename main" | sudo tee /etc/apt/sources.list.d/fluent-bit.list; '
+            'sudo apt-get update; '
+            'sudo apt-get install -y fluent-bit; '
             'fi')
         cfg = self.fluentbit_config(cluster_name)
         cfg_path = os.path.join(constants.LOGGING_CONFIG_DIR, 'fluentbit.yaml')
         config_cmd = (f'mkdir -p {constants.LOGGING_CONFIG_DIR} && '
                       f'echo {shlex.quote(cfg)} > {cfg_path}')
+        kill_prior_cmd = (
+            'if [ -f "/tmp/fluentbit.pid" ]; then '
+            # pylint: disable=line-too-long
+            'echo "Killing prior fluent-bit process $(cat /tmp/fluentbit.pid)"; '
+            'kill "$(cat /tmp/fluentbit.pid)" || true; '
+            'fi')
         start_cmd = ('nohup $(command -v fluent-bit || '
                      'echo "/opt/fluent-bit/bin/fluent-bit") '
-                     f'-c {cfg_path} > /tmp/fluentbit.log 2>&1 &'
-
+                     f'-c {cfg_path} > /tmp/fluentbit.log 2>&1 & '
+                     'echo $! > /tmp/fluentbit.pid')
+        return ('set -e; '
+                f'{install_cmd}; '
+                f'{config_cmd}; '
+                f'{kill_prior_cmd}; '
+                f'{start_cmd}')
 
     def fluentbit_config(self,
                          cluster_name: resources_utils.ClusterName) -> str:
         cfg_dict = {
+            'parsers': [{
+                'name': 'sky-ray-parser',
+                'format': 'regex',
+                # pylint: disable=line-too-long
+                'regex': r'(?:\x1b\[[\d;]+m)?\((?<worker_name>[^,]+)(?:,\s*rank=(?<rank>\d+))?(?:,\s*pid=(?<pid>\d+))(?:,\s*ip=(?<ip>[\d.]+))?\)(?:\x1b\[[\d;]+m)?\s*(?<log_line>.*)',
+                'types': 'rank:integer pid:integer',
+            }],
             'pipeline': {
                 'inputs': [{
                     'name': 'tail',
@@ -62,6 +89,14 @@ class FluentbitAgent(LoggingAgent):
                     # right after the job completion.
                     'refresh_interval': 1,
                 }],
+                'filters': [{
+                    'name': 'parser',
+                    'match': '*',
+                    'key_name': 'log',
+                    'parser': 'sky-ray-parser',
+                    'preserve_key': 'on',  # preserve field for backwards compat
+                    'reserve_data': 'on',
+                }],
                 'outputs': [self.fluentbit_output_config(cluster_name)],
             }
         }
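The `sky-ray-parser` added above splits Ray's `(worker, rank=…, pid=…, ip=…)` prefix out of each tailed log line, so downstream outputs receive structured `worker_name`, `rank`, `pid`, `ip`, and `log_line` fields instead of a single raw string. A minimal sketch of what that regex captures, translated to Python's `(?P<name>...)` named-group syntax (Fluent Bit itself uses Onigmo-style `(?<name>...)` groups); the sample log line is made up:

```python
import re

# Same structure as the 'sky-ray-parser' regex in the diff above,
# rewritten with Python named groups for illustration only.
RAY_PREFIX_RE = re.compile(
    r'(?:\x1b\[[\d;]+m)?'            # optional ANSI color prefix
    r'\((?P<worker_name>[^,]+)'      # "(WorkerName"
    r'(?:,\s*rank=(?P<rank>\d+))?'   # optional ", rank=N"
    r'(?:,\s*pid=(?P<pid>\d+))'      # ", pid=N" (required)
    r'(?:,\s*ip=(?P<ip>[\d.]+))?\)'  # optional ", ip=x.x.x.x", closing ")"
    r'(?:\x1b\[[\d;]+m)?\s*'         # optional ANSI reset
    r'(?P<log_line>.*)')             # the actual message

sample = '(RayTrainWorker, rank=0, pid=12345, ip=10.0.0.3) epoch 1 done'
match = RAY_PREFIX_RE.match(sample)
assert match is not None
print(match.groupdict())
# {'worker_name': 'RayTrainWorker', 'rank': '0', 'pid': '12345',
#  'ip': '10.0.0.3', 'log_line': 'epoch 1 done'}
```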
sky/logs/aws.py
CHANGED
@@ -5,7 +5,6 @@ from typing import Any, Dict, Optional
 import pydantic
 
 from sky.logs.agent import FluentbitAgent
-from sky.skylet import constants
 from sky.utils import resources_utils
 from sky.utils import yaml_utils
 
@@ -176,6 +175,8 @@ class CloudwatchLoggingAgent(FluentbitAgent):
         Returns:
             The Fluent Bit configuration as a YAML string.
         """
+        cfg_dict = yaml_utils.read_yaml_str(
+            super().fluentbit_config(cluster_name))
         display_name = cluster_name.display_name
         unique_name = cluster_name.name_on_cloud
         # Build tags for the log stream
@@ -197,24 +198,13 @@ class CloudwatchLoggingAgent(FluentbitAgent):
                 'value': value
             })
 
-
-
-
-
-
-
-
-                    # job creates a new log file and we must be responsive
-                    # for this: the VM might be autodown within a minute
-                    # right after the job completion.
-                    'refresh_interval': 1,
-                    'processors': {
-                        'logs': log_processors,
-                    }
-                }],
-                'outputs': [self.fluentbit_output_config(cluster_name)],
-            }
-        }
+        # Add log processors to config
+        processors_config = cfg_dict['pipeline']['inputs'][0].get(
+            'processors', {})
+        processors_logs_config = processors_config.get('logs', [])
+        processors_logs_config.extend(log_processors)
+        processors_config['logs'] = processors_logs_config
+        cfg_dict['pipeline']['inputs'][0]['processors'] = processors_config
 
         return yaml_utils.dump_yaml_str(cfg_dict)
 
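The CloudWatch agent no longer rebuilds the whole Fluent Bit pipeline by hand; it now parses the base class's YAML (`super().fluentbit_config(...)`) and only appends its CloudWatch-specific log processors, so the tail input and the new parser/filter sections stay in sync with `FluentbitAgent`. A standalone sketch of that extend-don't-overwrite merge, using made-up placeholder values in place of the real config:

```python
# Placeholder base config and processor list for illustration only.
base_cfg = {
    'pipeline': {
        'inputs': [{'name': 'tail', 'path': '/tmp/*.log', 'refresh_interval': 1}],
        'outputs': [{'name': 'cloudwatch_logs'}],
    }
}
log_processors = [{'name': 'content_modifier', 'action': 'upsert',
                   'key': 'skypilot.cluster', 'value': 'my-cluster'}]

# Same shape as the new fluentbit_config(): extend any existing processors
# on the first input rather than overwriting the whole pipeline.
processors = base_cfg['pipeline']['inputs'][0].get('processors', {})
logs = processors.get('logs', [])
logs.extend(log_processors)
processors['logs'] = logs
base_cfg['pipeline']['inputs'][0]['processors'] = processors
print(base_cfg['pipeline']['inputs'][0]['processors'])
```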
sky/metrics/utils.py
CHANGED
@@ -1,11 +1,218 @@
 """Utilities for processing GPU metrics from Kubernetes clusters."""
+import contextlib
+import functools
 import os
 import re
+import select
 import subprocess
 import time
 from typing import List, Optional, Tuple
 
 import httpx
+import prometheus_client as prom
+
+from sky import sky_logging
+from sky.skylet import constants
+from sky.utils import common_utils
+from sky.utils import context_utils
+
+_SELECT_TIMEOUT = 1
+_SELECT_BUFFER_SIZE = 4096
+
+_KB = 2**10
+_MB = 2**20
+_MEM_BUCKETS = [
+    _KB,
+    256 * _KB,
+    512 * _KB,
+    _MB,
+    2 * _MB,
+    4 * _MB,
+    8 * _MB,
+    16 * _MB,
+    32 * _MB,
+    64 * _MB,
+    128 * _MB,
+    256 * _MB,
+    float('inf'),
+]
+
+logger = sky_logging.init_logger(__name__)
+
+# Whether the metrics are enabled, cannot be changed at runtime.
+METRICS_ENABLED = os.environ.get(constants.ENV_VAR_SERVER_METRICS_ENABLED,
+                                 'false').lower() == 'true'
+
+# Time spent processing a piece of code, refer to time_it().
+SKY_APISERVER_CODE_DURATION_SECONDS = prom.Histogram(
+    'sky_apiserver_code_duration_seconds',
+    'Time spent processing code',
+    ['name', 'group'],
+    buckets=(0.001, 0.005, 0.01, 0.025, 0.05, 0.075, 0.1, 0.125, 0.15, 0.25,
+             0.35, 0.5, 0.75, 1, 1.25, 1.5, 1.75, 2, 2.5, 2.75, 3, 3.5, 4, 4.5,
+             5, 7.5, 10.0, 12.5, 15.0, 17.5, 20.0, 25.0, 30.0, 35.0, 40.0, 45.0,
+             50.0, 55.0, 60.0, 80.0, 120.0, 140.0, 160.0, 180.0, 200.0, 220.0,
+             240.0, 260.0, 280.0, 300.0, 320.0, 340.0, 360.0, 380.0, 400.0,
+             420.0, 440.0, 460.0, 480.0, 500.0, 520.0, 540.0, 560.0, 580.0,
+             600.0, 620.0, 640.0, 660.0, 680.0, 700.0, 720.0, 740.0, 760.0,
+             780.0, 800.0, 820.0, 840.0, 860.0, 880.0, 900.0, 920.0, 940.0,
+             960.0, 980.0, 1000.0, float('inf')),
+)
+
+# Total number of API server requests, grouped by path, method, and status.
+SKY_APISERVER_REQUESTS_TOTAL = prom.Counter(
+    'sky_apiserver_requests_total',
+    'Total number of API server requests',
+    ['path', 'method', 'status'],
+)
+
+# Time spent processing API server requests, grouped by path, method, and
+# status.
+SKY_APISERVER_REQUEST_DURATION_SECONDS = prom.Histogram(
+    'sky_apiserver_request_duration_seconds',
+    'Time spent processing API server requests',
+    ['path', 'method', 'status'],
+    buckets=(0.001, 0.005, 0.01, 0.025, 0.05, 0.075, 0.1, 0.125, 0.15, 0.25,
+             0.35, 0.5, 0.75, 1, 1.25, 1.5, 1.75, 2, 2.5, 2.75, 3, 3.5, 4, 4.5,
+             5, 7.5, 10.0, 12.5, 15.0, 17.5, 20.0, 25.0, 30.0, 35.0, 40.0, 45.0,
+             50.0, 55.0, 60.0, 80.0, 120.0, 140.0, 160.0, 180.0, 200.0, 220.0,
+             240.0, 260.0, 280.0, 300.0, 320.0, 340.0, 360.0, 380.0, 400.0,
+             420.0, 440.0, 460.0, 480.0, 500.0, 520.0, 540.0, 560.0, 580.0,
+             600.0, 620.0, 640.0, 660.0, 680.0, 700.0, 720.0, 740.0, 760.0,
+             780.0, 800.0, 820.0, 840.0, 860.0, 880.0, 900.0, 920.0, 940.0,
+             960.0, 980.0, 1000.0, float('inf')),
+)
+
+SKY_APISERVER_EVENT_LOOP_LAG_SECONDS = prom.Histogram(
+    'sky_apiserver_event_loop_lag_seconds',
+    'Scheduling delay of the server event loop',
+    ['pid'],
+    buckets=(0.001, 0.005, 0.01, 0.025, 0.05, 0.075, 0.1, 0.125, 0.15, 0.25,
+             0.35, 0.5, 0.75, 1, 1.25, 1.5, 1.75, 2, 2.5, 2.75, 3, 3.5, 4, 4.5,
+             5, 7.5, 10.0, 12.5, 15.0, 17.5, 20.0, 25.0, 30.0, 35.0, 40.0, 45.0,
+             50.0, 55.0, 60.0, 80.0, 120.0, 140.0, 160.0, 180.0, 200.0, 220.0,
+             240.0, 260.0, 280.0, 300.0, 320.0, 340.0, 360.0, 380.0, 400.0,
+             420.0, 440.0, 460.0, 480.0, 500.0, 520.0, 540.0, 560.0, 580.0,
+             600.0, 620.0, 640.0, 660.0, 680.0, 700.0, 720.0, 740.0, 760.0,
+             780.0, 800.0, 820.0, 840.0, 860.0, 880.0, 900.0, 920.0, 940.0,
+             960.0, 980.0, 1000.0, float('inf')),
+)
+
+SKY_APISERVER_WEBSOCKET_CONNECTIONS = prom.Gauge(
+    'sky_apiserver_websocket_connections',
+    'Number of websocket connections',
+    ['pid'],
+    multiprocess_mode='livesum',
+)
+
+SKY_APISERVER_WEBSOCKET_CLOSED_TOTAL = prom.Counter(
+    'sky_apiserver_websocket_closed_total',
+    'Number of websocket closed',
+    ['pid', 'reason'],
+)
+
+# The number of execution starts in each worker process, we do not record
+# histogram here as the duration has been measured in
+# SKY_APISERVER_CODE_DURATION_SECONDS without the worker label (process id).
+# Recording histogram WITH worker label will cause high cardinality.
+SKY_APISERVER_PROCESS_EXECUTION_START_TOTAL = prom.Counter(
+    'sky_apiserver_process_execution_start_total',
+    'Total number of execution starts in each worker process',
+    ['request', 'pid'],
+)
+
+SKY_APISERVER_PROCESS_PEAK_RSS = prom.Gauge(
+    'sky_apiserver_process_peak_rss',
+    'Peak RSS we saw in each process in last 30 seconds',
+    ['pid', 'type'],
+)
+
+SKY_APISERVER_PROCESS_CPU_TOTAL = prom.Gauge(
+    'sky_apiserver_process_cpu_total',
+    'Total CPU times a worker process has been running',
+    ['pid', 'type', 'mode'],
+)
+
+SKY_APISERVER_REQUEST_MEMORY_USAGE_BYTES = prom.Histogram(
+    'sky_apiserver_request_memory_usage_bytes',
+    'Peak memory usage of requests', ['name'],
+    buckets=_MEM_BUCKETS)
+
+SKY_APISERVER_REQUEST_RSS_INCR_BYTES = prom.Histogram(
+    'sky_apiserver_request_rss_incr_bytes',
+    'RSS increment after requests', ['name'],
+    buckets=_MEM_BUCKETS)
+
+SKY_APISERVER_WEBSOCKET_SSH_LATENCY_SECONDS = prom.Histogram(
+    'sky_apiserver_websocket_ssh_latency_seconds',
+    ('Time taken for ssh message to go from client to API server and back'
+     'to the client. This does not include: latency to reach the pod, '
+     'overhead from sending through the k8s port-forward tunnel, or '
+     'ssh server lag on the destination pod.'),
+    ['pid'],
+    buckets=(0.001, 0.005, 0.01, 0.025, 0.05, 0.075, 0.1, 0.125, 0.15, 0.25,
+             0.35, 0.5, 0.75, 1, 1.25, 1.5, 1.75, 2, 2.5, 2.75, 3, 3.5, 4, 4.5,
+             5, 7.5, 10.0, 12.5, 15.0, 17.5, 20.0, 25.0, 30.0, 35.0, 40.0, 45.0,
+             50.0, 55.0, 60.0, 80.0, 120.0, 140.0, 160.0, 180.0, 200.0, 220.0,
+             240.0, 260.0, 280.0, 300.0, 320.0, 340.0, 360.0, 380.0, 400.0,
+             420.0, 440.0, 460.0, 480.0, 500.0, 520.0, 540.0, 560.0, 580.0,
+             600.0, 620.0, 640.0, 660.0, 680.0, 700.0, 720.0, 740.0, 760.0,
+             780.0, 800.0, 820.0, 840.0, 860.0, 880.0, 900.0, 920.0, 940.0,
+             960.0, 980.0, 1000.0, float('inf')),
+)
+
+SKY_APISERVER_LONG_EXECUTORS = prom.Gauge(
+    'sky_apiserver_long_executors',
+    'Total number of long-running request executors in the API server',
+)
+
+SKY_APISERVER_SHORT_EXECUTORS = prom.Gauge(
+    'sky_apiserver_short_executors',
+    'Total number of short-running request executors in the API server',
+)
+
+
+@contextlib.contextmanager
+def time_it(name: str, group: str = 'default'):
+    """Context manager to measure and record code execution duration."""
+    if not METRICS_ENABLED:
+        yield
+    else:
+        start_time = time.time()
+        try:
+            yield
+        finally:
+            duration = time.time() - start_time
+            SKY_APISERVER_CODE_DURATION_SECONDS.labels(
+                name=name, group=group).observe(duration)
+
+
+def time_me(func):
+    """Measure the duration of decorated function."""
+
+    @functools.wraps(func)
+    def wrapper(*args, **kwargs):
+        if not METRICS_ENABLED:
+            return func(*args, **kwargs)
+        name = f'{func.__module__}/{func.__name__}'
+        with time_it(name, group='function'):
+            return func(*args, **kwargs)
+
+    return wrapper
+
+
+def time_me_async(func):
+    """Measure the duration of decorated async function."""
+
+    @functools.wraps(func)
+    async def async_wrapper(*args, **kwargs):
+        if not METRICS_ENABLED:
+            return await func(*args, **kwargs)
+        name = f'{func.__module__}/{func.__name__}'
+        with time_it(name, group='function'):
+            return await func(*args, **kwargs)
+
+    return async_wrapper
 
 
 def start_svc_port_forward(context: str, namespace: str, service: str,
@@ -34,46 +241,72 @@ def start_svc_port_forward(context: str, namespace: str, service: str,
     if 'KUBECONFIG' not in env:
         env['KUBECONFIG'] = os.path.expanduser('~/.kube/config')
 
-
-
-                                            stdout=subprocess.PIPE,
-                                            stderr=subprocess.STDOUT,
-                                            text=True,
-                                            env=env)
-
+    port_forward_process = None
+    port_forward_exit = False
     local_port = None
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+    poller = None
+    fd = None
+
+    try:
+        # start the port forward process
+        port_forward_process = subprocess.Popen(cmd,
+                                                stdout=subprocess.PIPE,
+                                                stderr=subprocess.STDOUT,
+                                                text=True,
+                                                env=env)
+
+        # Use poll() instead of select() to avoid FD_SETSIZE limit
+        poller = select.poll()
+        assert port_forward_process.stdout is not None
+        fd = port_forward_process.stdout.fileno()
+        poller.register(fd, select.POLLIN)
+
+        start_time = time.time()
+        buffer = ''
+        # wait for the port forward to start and extract the local port
+        while time.time() - start_time < start_port_forward_timeout:
+            if port_forward_process.poll() is not None:
+                # port forward process has terminated
+                if port_forward_process.returncode != 0:
+                    port_forward_exit = True
+                break
+
+            # Wait up to 1000ms for data to be available without blocking
+            # poll() takes timeout in milliseconds
+            events = poller.poll(_SELECT_TIMEOUT * 1000)
+
+            if events:
+                # Read available bytes from the FD without blocking
+                raw = os.read(fd, _SELECT_BUFFER_SIZE)
+                chunk = raw.decode(errors='ignore')
+                buffer += chunk
+                match = re.search(r'Forwarding from 127\.0\.0\.1:(\d+)', buffer)
                 if match:
                     local_port = int(match.group(1))
                     break
 
-
-
-
+            # sleep for 100ms to avoid busy-waiting
+            time.sleep(0.1)
+    except BaseException:  # pylint: disable=broad-exception-caught
+        if port_forward_process:
+            stop_svc_port_forward(port_forward_process,
+                                  timeout=terminate_port_forward_timeout)
+        raise
+    finally:
+        if poller is not None and fd is not None:
+            try:
+                poller.unregister(fd)
+            except (OSError, ValueError):
+                # FD may already be unregistered or invalid
+                pass
+    if port_forward_exit:
+        raise RuntimeError(f'Port forward failed for service {service} in '
+                           f'namespace {namespace} on context {context}')
     if local_port is None:
         try:
-            port_forward_process
-
-
-            port_forward_process.kill()
-            port_forward_process.wait()
+            if port_forward_process:
+                stop_svc_port_forward(port_forward_process,
+                                      timeout=terminate_port_forward_timeout)
         finally:
             raise RuntimeError(
                 f'Failed to extract local port for service {service} in '
@@ -82,14 +315,15 @@ def start_svc_port_forward(context: str, namespace: str, service: str,
     return port_forward_process, local_port
 
 
-def stop_svc_port_forward(port_forward_process: subprocess.Popen
+def stop_svc_port_forward(port_forward_process: subprocess.Popen,
+                          timeout: int = 5) -> None:
     """Stops a port forward to a service in a Kubernetes cluster.
     Args:
        port_forward_process: The subprocess.Popen process to terminate
    """
    try:
        port_forward_process.terminate()
-        port_forward_process.wait(timeout=
+        port_forward_process.wait(timeout=timeout)
    except subprocess.TimeoutExpired:
        port_forward_process.kill()
        port_forward_process.wait()
@@ -122,8 +356,8 @@ async def send_metrics_request_with_port_forward(
    port_forward_process = None
    try:
        # Start port forward
-        port_forward_process, local_port =
-            context, namespace, service, service_port)
+        port_forward_process, local_port = await context_utils.to_thread(
+            start_svc_port_forward, context, namespace, service, service_port)
 
        # Build endpoint URL
        endpoint = f'http://localhost:{local_port}{endpoint_path}'
@@ -140,10 +374,15 @@ async def send_metrics_request_with_port_forward(
            response.raise_for_status()
            return response.text
 
+    except Exception as e:  # pylint: disable=broad-exception-caught
+        logger.error(f'Failed to send metrics request with port forward: '
+                     f'{common_utils.format_exception(e)}')
+        raise
    finally:
        # Always clean up port forward
        if port_forward_process:
-            stop_svc_port_forward
+            await context_utils.to_thread(stop_svc_port_forward,
+                                          port_forward_process)
 
 
 async def add_cluster_name_label(metrics_text: str, context: str) -> str:
@@ -193,7 +432,11 @@ async def get_metrics_for_context(context: str) -> str:
    """
    # Query both DCGM metrics and kube_pod_labels metrics
    # This ensures the dashboard can perform joins to filter by skypilot cluster
-    match_patterns = [
+    match_patterns = [
+        '{__name__=~"node_memory_MemAvailable_bytes|node_memory_MemTotal_bytes|DCGM_.*"}',  # pylint: disable=line-too-long
+        'kube_pod_labels',
+        'node_cpu_seconds_total{mode="idle"}'
+    ]
 
    # TODO(rohan): don't hardcode the namespace and service name
    metrics_text = await send_metrics_request_with_port_forward(
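The new `time_it` / `time_me` / `time_me_async` helpers are no-ops unless the server is started with metrics enabled (the `METRICS_ENABLED` environment check above), and every observation lands in the `sky_apiserver_code_duration_seconds` histogram under a `name`/`group` label pair. A hypothetical usage sketch; the decorated function names below are made up, not real SkyPilot call sites:

```python
from sky.metrics import utils as metrics_utils

@metrics_utils.time_me
def refresh_cluster_records():
    # Recorded with name='<module>/refresh_cluster_records', group='function'.
    ...

@metrics_utils.time_me_async
async def poll_job_status():
    # Async variant; awaited calls are timed the same way.
    ...

def provision_cluster():
    # Explicit block timing with a custom group label.
    with metrics_utils.time_it('provision_cluster', group='provisioner'):
        ...
```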
sky/models.py
CHANGED
@@ -68,6 +68,8 @@ class KubernetesNodeInfo:
     free: Dict[str, int]
     # IP address of the node (external IP preferred, fallback to internal IP)
     ip_address: Optional[str] = None
+    # Whether the node is ready (all conditions are satisfied)
+    is_ready: bool = True
 
 
 @dataclasses.dataclass
|
sky/optimizer.py
CHANGED
|
@@ -781,7 +781,7 @@ class Optimizer:
|
|
|
781
781
|
def _instance_type_str(resources: 'resources_lib.Resources') -> str:
|
|
782
782
|
instance_type = resources.instance_type
|
|
783
783
|
assert instance_type is not None, 'Instance type must be specified'
|
|
784
|
-
if isinstance(resources.cloud, clouds.Kubernetes):
|
|
784
|
+
if isinstance(resources.cloud, (clouds.Kubernetes, clouds.Slurm)):
|
|
785
785
|
instance_type = '-'
|
|
786
786
|
if resources.use_spot:
|
|
787
787
|
instance_type = ''
|
|
@@ -865,11 +865,12 @@ class Optimizer:
|
|
|
865
865
|
'use_spot': resources.use_spot
|
|
866
866
|
}
|
|
867
867
|
|
|
868
|
-
# Handle special case for Kubernetes and
|
|
869
|
-
if isinstance(resources.cloud, clouds.Kubernetes):
|
|
868
|
+
# Handle special case for Kubernetes, SSH, and SLURM clouds
|
|
869
|
+
if isinstance(resources.cloud, (clouds.Kubernetes, clouds.Slurm)):
|
|
870
870
|
# Region for Kubernetes-like clouds (SSH, Kubernetes) is the
|
|
871
|
-
# context name, i.e. different Kubernetes clusters.
|
|
872
|
-
#
|
|
871
|
+
# context name, i.e. different Kubernetes clusters.
|
|
872
|
+
# Region for SLURM is the cluster name.
|
|
873
|
+
# We add region to the key to show all the clusters in the
|
|
873
874
|
# optimizer table for better UX.
|
|
874
875
|
|
|
875
876
|
if resources.cloud.__class__.__name__ == 'SSH':
|
|
@@ -1019,7 +1020,7 @@ class Optimizer:
|
|
|
1019
1020
|
if res.instance_type is not None
|
|
1020
1021
|
])
|
|
1021
1022
|
candidate_str = resources_utils.format_resource(
|
|
1022
|
-
best_resources,
|
|
1023
|
+
best_resources, simplified_only=True)[0]
|
|
1023
1024
|
|
|
1024
1025
|
logger.info(
|
|
1025
1026
|
f'{colorama.Style.DIM}🔍 Multiple {cloud} instances '
|
sky/provision/__init__.py
CHANGED
@@ -24,8 +24,12 @@ from sky.provision import kubernetes
 from sky.provision import lambda_cloud
 from sky.provision import nebius
 from sky.provision import oci
+from sky.provision import primeintellect
 from sky.provision import runpod
 from sky.provision import scp
+from sky.provision import seeweb
+from sky.provision import shadeform
+from sky.provision import slurm
 from sky.provision import ssh
 from sky.provision import vast
 from sky.provision import vsphere
@@ -77,6 +81,7 @@ def query_instances(
     cluster_name_on_cloud: str,
     provider_config: Optional[Dict[str, Any]] = None,
     non_terminated_only: bool = True,
+    retry_if_missing: bool = False,
 ) -> Dict[str, Tuple[Optional['status_lib.ClusterStatus'], Optional[str]]]:
     """Query instances.
 
@@ -85,6 +90,11 @@ def query_instances(
 
     A None status means the instance is marked as "terminated"
     or "terminating".
+
+    Args:
+        retry_if_missing: Whether to retry the call to the cloud api if the
+            cluster is not found when querying the live status on the cloud.
+            NOTE: This is currently only used on kubernetes.
     """
     raise NotImplementedError
 
@@ -140,7 +150,34 @@ def get_volume_usedby(
 
 
 @_route_to_cloud_impl
-def
+def get_all_volumes_usedby(
+    provider_name: str, configs: List[models.VolumeConfig]
+) -> Tuple[Dict[str, Any], Dict[str, Any]]:
+    """Get the usedby of a volume.
+
+    Returns:
+        usedby_pods: List of dictionaries, each containing the config keys for
+                     a volume and a key containing pods using the volume.
+                     These may include pods not created by SkyPilot.
+        usedby_clusters: List of dictionaries, each containing the config keys
+                         for a volume and a key containing clusters using
+                         the volume.
+    """
+    raise NotImplementedError
+
+
+@_route_to_cloud_impl
+def map_all_volumes_usedby(
+    provider_name: str, used_by_pods: Dict[str, Any],
+    used_by_clusters: Dict[str, Any],
+    config: models.VolumeConfig) -> Tuple[List[str], List[str]]:
+    """Map the usedby resources of a volume."""
+    raise NotImplementedError
+
+
+@_route_to_cloud_impl
+def run_instances(provider_name: str, region: str, cluster_name: str,
+                  cluster_name_on_cloud: str,
                   config: common.ProvisionConfig) -> common.ProvisionRecord:
     """Start instances with bootstrapped configuration."""
     raise NotImplementedError