skypilot-nightly 1.0.0.dev20250905__py3-none-any.whl → 1.0.0.dev20251203__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sky/__init__.py +10 -2
- sky/adaptors/aws.py +81 -16
- sky/adaptors/common.py +25 -2
- sky/adaptors/coreweave.py +278 -0
- sky/adaptors/do.py +8 -2
- sky/adaptors/gcp.py +11 -0
- sky/adaptors/ibm.py +5 -2
- sky/adaptors/kubernetes.py +64 -0
- sky/adaptors/nebius.py +3 -1
- sky/adaptors/primeintellect.py +1 -0
- sky/adaptors/seeweb.py +183 -0
- sky/adaptors/shadeform.py +89 -0
- sky/admin_policy.py +20 -0
- sky/authentication.py +157 -263
- sky/backends/__init__.py +3 -2
- sky/backends/backend.py +11 -3
- sky/backends/backend_utils.py +588 -184
- sky/backends/cloud_vm_ray_backend.py +1088 -904
- sky/backends/local_docker_backend.py +9 -5
- sky/backends/task_codegen.py +633 -0
- sky/backends/wheel_utils.py +18 -0
- sky/catalog/__init__.py +8 -0
- sky/catalog/aws_catalog.py +4 -0
- sky/catalog/common.py +19 -1
- sky/catalog/data_fetchers/fetch_aws.py +102 -80
- sky/catalog/data_fetchers/fetch_gcp.py +30 -3
- sky/catalog/data_fetchers/fetch_nebius.py +9 -6
- sky/catalog/data_fetchers/fetch_runpod.py +698 -0
- sky/catalog/data_fetchers/fetch_seeweb.py +329 -0
- sky/catalog/data_fetchers/fetch_shadeform.py +142 -0
- sky/catalog/kubernetes_catalog.py +24 -28
- sky/catalog/primeintellect_catalog.py +95 -0
- sky/catalog/runpod_catalog.py +5 -1
- sky/catalog/seeweb_catalog.py +184 -0
- sky/catalog/shadeform_catalog.py +165 -0
- sky/check.py +73 -43
- sky/client/cli/command.py +675 -412
- sky/client/cli/flags.py +4 -2
- sky/{volumes/utils.py → client/cli/table_utils.py} +111 -13
- sky/client/cli/utils.py +79 -0
- sky/client/common.py +12 -2
- sky/client/sdk.py +132 -63
- sky/client/sdk_async.py +34 -33
- sky/cloud_stores.py +82 -3
- sky/clouds/__init__.py +6 -0
- sky/clouds/aws.py +337 -129
- sky/clouds/azure.py +24 -18
- sky/clouds/cloud.py +40 -13
- sky/clouds/cudo.py +16 -13
- sky/clouds/do.py +9 -7
- sky/clouds/fluidstack.py +12 -5
- sky/clouds/gcp.py +14 -7
- sky/clouds/hyperbolic.py +12 -5
- sky/clouds/ibm.py +12 -5
- sky/clouds/kubernetes.py +80 -45
- sky/clouds/lambda_cloud.py +12 -5
- sky/clouds/nebius.py +23 -9
- sky/clouds/oci.py +19 -12
- sky/clouds/paperspace.py +4 -1
- sky/clouds/primeintellect.py +317 -0
- sky/clouds/runpod.py +85 -24
- sky/clouds/scp.py +12 -8
- sky/clouds/seeweb.py +477 -0
- sky/clouds/shadeform.py +400 -0
- sky/clouds/ssh.py +4 -2
- sky/clouds/utils/scp_utils.py +61 -50
- sky/clouds/vast.py +33 -27
- sky/clouds/vsphere.py +14 -16
- sky/core.py +174 -165
- sky/dashboard/out/404.html +1 -1
- sky/dashboard/out/_next/static/96_E2yl3QAiIJGOYCkSpB/_buildManifest.js +1 -0
- sky/dashboard/out/_next/static/chunks/1141-e6aa9ab418717c59.js +11 -0
- sky/dashboard/out/_next/static/chunks/1871-7e202677c42f43fe.js +6 -0
- sky/dashboard/out/_next/static/chunks/2260-7703229c33c5ebd5.js +1 -0
- sky/dashboard/out/_next/static/chunks/2369.fc20f0c2c8ed9fe7.js +15 -0
- sky/dashboard/out/_next/static/chunks/2755.edd818326d489a1d.js +26 -0
- sky/dashboard/out/_next/static/chunks/3294.20a8540fe697d5ee.js +1 -0
- sky/dashboard/out/_next/static/chunks/3785.7e245f318f9d1121.js +1 -0
- sky/dashboard/out/_next/static/chunks/{6601-06114c982db410b6.js → 3800-7b45f9fbb6308557.js} +1 -1
- sky/dashboard/out/_next/static/chunks/4725.172ede95d1b21022.js +1 -0
- sky/dashboard/out/_next/static/chunks/4937.a2baa2df5572a276.js +15 -0
- sky/dashboard/out/_next/static/chunks/6212-7bd06f60ba693125.js +13 -0
- sky/dashboard/out/_next/static/chunks/6856-8f27d1c10c98def8.js +1 -0
- sky/dashboard/out/_next/static/chunks/6990-9146207c4567fdfd.js +1 -0
- sky/dashboard/out/_next/static/chunks/7359-c8d04e06886000b3.js +30 -0
- sky/dashboard/out/_next/static/chunks/7615-019513abc55b3b47.js +1 -0
- sky/dashboard/out/_next/static/chunks/8640.5b9475a2d18c5416.js +16 -0
- sky/dashboard/out/_next/static/chunks/8969-452f9d5cbdd2dc73.js +1 -0
- sky/dashboard/out/_next/static/chunks/9025.fa408f3242e9028d.js +6 -0
- sky/dashboard/out/_next/static/chunks/9353-cff34f7e773b2e2b.js +1 -0
- sky/dashboard/out/_next/static/chunks/9360.a536cf6b1fa42355.js +31 -0
- sky/dashboard/out/_next/static/chunks/9847.3aaca6bb33455140.js +30 -0
- sky/dashboard/out/_next/static/chunks/pages/{_app-ce361c6959bc2001.js → _app-bde01e4a2beec258.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-792db96d918c98c9.js +16 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-abfcac9c137aa543.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters-ee39056f9851a3ff.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/infra/{[context]-6563820e094f68ca.js → [context]-c0b5935149902e6f.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/{infra-aabba60d57826e0f.js → infra-aed0ea19df7cf961.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-d66997e2bfc837cf.js +16 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs/pools/[pool]-9faf940b253e3e06.js +21 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs-2072b48b617989c9.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/users-f42674164aa73423.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/volumes-b84b948ff357c43e.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/workspaces/{[name]-af76bb06dbb3954f.js → [name]-84a40f8c7c627fe4.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/{workspaces-7598c33a746cdc91.js → workspaces-531b2f8c4bf89f82.js} +1 -1
- sky/dashboard/out/_next/static/chunks/webpack-64e05f17bf2cf8ce.js +1 -0
- sky/dashboard/out/_next/static/css/0748ce22df867032.css +3 -0
- sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
- sky/dashboard/out/clusters/[cluster].html +1 -1
- sky/dashboard/out/clusters.html +1 -1
- sky/dashboard/out/config.html +1 -1
- sky/dashboard/out/index.html +1 -1
- sky/dashboard/out/infra/[context].html +1 -1
- sky/dashboard/out/infra.html +1 -1
- sky/dashboard/out/jobs/[job].html +1 -1
- sky/dashboard/out/jobs/pools/[pool].html +1 -1
- sky/dashboard/out/jobs.html +1 -1
- sky/dashboard/out/users.html +1 -1
- sky/dashboard/out/volumes.html +1 -1
- sky/dashboard/out/workspace/new.html +1 -1
- sky/dashboard/out/workspaces/[name].html +1 -1
- sky/dashboard/out/workspaces.html +1 -1
- sky/data/data_utils.py +92 -1
- sky/data/mounting_utils.py +162 -29
- sky/data/storage.py +200 -19
- sky/data/storage_utils.py +10 -45
- sky/exceptions.py +18 -7
- sky/execution.py +74 -31
- sky/global_user_state.py +605 -191
- sky/jobs/__init__.py +2 -0
- sky/jobs/client/sdk.py +101 -4
- sky/jobs/client/sdk_async.py +31 -5
- sky/jobs/constants.py +15 -8
- sky/jobs/controller.py +726 -284
- sky/jobs/file_content_utils.py +128 -0
- sky/jobs/log_gc.py +193 -0
- sky/jobs/recovery_strategy.py +250 -100
- sky/jobs/scheduler.py +271 -173
- sky/jobs/server/core.py +367 -114
- sky/jobs/server/server.py +81 -35
- sky/jobs/server/utils.py +89 -35
- sky/jobs/state.py +1498 -620
- sky/jobs/utils.py +771 -306
- sky/logs/agent.py +40 -5
- sky/logs/aws.py +9 -19
- sky/metrics/utils.py +282 -39
- sky/optimizer.py +1 -1
- sky/provision/__init__.py +37 -1
- sky/provision/aws/config.py +34 -13
- sky/provision/aws/instance.py +5 -2
- sky/provision/azure/instance.py +5 -3
- sky/provision/common.py +2 -0
- sky/provision/cudo/instance.py +4 -3
- sky/provision/do/instance.py +4 -3
- sky/provision/docker_utils.py +97 -26
- sky/provision/fluidstack/instance.py +6 -5
- sky/provision/gcp/config.py +6 -1
- sky/provision/gcp/instance.py +4 -2
- sky/provision/hyperbolic/instance.py +4 -2
- sky/provision/instance_setup.py +66 -20
- sky/provision/kubernetes/__init__.py +2 -0
- sky/provision/kubernetes/config.py +7 -44
- sky/provision/kubernetes/constants.py +0 -1
- sky/provision/kubernetes/instance.py +609 -213
- sky/provision/kubernetes/manifests/fusermount-server-daemonset.yaml +1 -2
- sky/provision/kubernetes/network.py +12 -8
- sky/provision/kubernetes/network_utils.py +8 -25
- sky/provision/kubernetes/utils.py +382 -418
- sky/provision/kubernetes/volume.py +150 -18
- sky/provision/lambda_cloud/instance.py +16 -13
- sky/provision/nebius/instance.py +6 -2
- sky/provision/nebius/utils.py +103 -86
- sky/provision/oci/instance.py +4 -2
- sky/provision/paperspace/instance.py +4 -3
- sky/provision/primeintellect/__init__.py +10 -0
- sky/provision/primeintellect/config.py +11 -0
- sky/provision/primeintellect/instance.py +454 -0
- sky/provision/primeintellect/utils.py +398 -0
- sky/provision/provisioner.py +30 -9
- sky/provision/runpod/__init__.py +2 -0
- sky/provision/runpod/instance.py +4 -3
- sky/provision/runpod/volume.py +69 -13
- sky/provision/scp/instance.py +307 -130
- sky/provision/seeweb/__init__.py +11 -0
- sky/provision/seeweb/config.py +13 -0
- sky/provision/seeweb/instance.py +812 -0
- sky/provision/shadeform/__init__.py +11 -0
- sky/provision/shadeform/config.py +12 -0
- sky/provision/shadeform/instance.py +351 -0
- sky/provision/shadeform/shadeform_utils.py +83 -0
- sky/provision/vast/instance.py +5 -3
- sky/provision/volume.py +164 -0
- sky/provision/vsphere/common/ssl_helper.py +1 -1
- sky/provision/vsphere/common/vapiconnect.py +2 -1
- sky/provision/vsphere/common/vim_utils.py +3 -2
- sky/provision/vsphere/instance.py +8 -6
- sky/provision/vsphere/vsphere_utils.py +8 -1
- sky/resources.py +11 -3
- sky/schemas/api/responses.py +107 -6
- sky/schemas/db/global_user_state/008_skylet_ssh_tunnel_metadata.py +34 -0
- sky/schemas/db/global_user_state/009_last_activity_and_launched_at.py +89 -0
- sky/schemas/db/global_user_state/010_save_ssh_key.py +66 -0
- sky/schemas/db/global_user_state/011_is_ephemeral.py +34 -0
- sky/schemas/db/kv_cache/001_initial_schema.py +29 -0
- sky/schemas/db/serve_state/002_yaml_content.py +34 -0
- sky/schemas/db/skypilot_config/001_initial_schema.py +30 -0
- sky/schemas/db/spot_jobs/002_cluster_pool.py +3 -3
- sky/schemas/db/spot_jobs/004_job_file_contents.py +42 -0
- sky/schemas/db/spot_jobs/005_logs_gc.py +38 -0
- sky/schemas/db/spot_jobs/006_controller_pid_started_at.py +34 -0
- sky/schemas/db/spot_jobs/007_config_file_content.py +34 -0
- sky/schemas/generated/jobsv1_pb2.py +86 -0
- sky/schemas/generated/jobsv1_pb2.pyi +254 -0
- sky/schemas/generated/jobsv1_pb2_grpc.py +542 -0
- sky/schemas/generated/managed_jobsv1_pb2.py +76 -0
- sky/schemas/generated/managed_jobsv1_pb2.pyi +278 -0
- sky/schemas/generated/managed_jobsv1_pb2_grpc.py +278 -0
- sky/schemas/generated/servev1_pb2.py +58 -0
- sky/schemas/generated/servev1_pb2.pyi +115 -0
- sky/schemas/generated/servev1_pb2_grpc.py +322 -0
- sky/serve/autoscalers.py +2 -0
- sky/serve/client/impl.py +55 -21
- sky/serve/constants.py +4 -3
- sky/serve/controller.py +17 -11
- sky/serve/load_balancing_policies.py +1 -1
- sky/serve/replica_managers.py +219 -142
- sky/serve/serve_rpc_utils.py +179 -0
- sky/serve/serve_state.py +63 -54
- sky/serve/serve_utils.py +145 -109
- sky/serve/server/core.py +46 -25
- sky/serve/server/impl.py +311 -162
- sky/serve/server/server.py +21 -19
- sky/serve/service.py +84 -68
- sky/serve/service_spec.py +45 -7
- sky/server/auth/loopback.py +38 -0
- sky/server/auth/oauth2_proxy.py +12 -7
- sky/server/common.py +47 -24
- sky/server/config.py +62 -28
- sky/server/constants.py +9 -1
- sky/server/daemons.py +109 -38
- sky/server/metrics.py +76 -96
- sky/server/middleware_utils.py +166 -0
- sky/server/requests/executor.py +381 -145
- sky/server/requests/payloads.py +71 -18
- sky/server/requests/preconditions.py +15 -13
- sky/server/requests/request_names.py +121 -0
- sky/server/requests/requests.py +507 -157
- sky/server/requests/serializers/decoders.py +48 -17
- sky/server/requests/serializers/encoders.py +85 -20
- sky/server/requests/threads.py +117 -0
- sky/server/rest.py +116 -24
- sky/server/server.py +420 -172
- sky/server/stream_utils.py +219 -45
- sky/server/uvicorn.py +30 -19
- sky/setup_files/MANIFEST.in +6 -1
- sky/setup_files/alembic.ini +8 -0
- sky/setup_files/dependencies.py +62 -19
- sky/setup_files/setup.py +44 -44
- sky/sky_logging.py +13 -5
- sky/skylet/attempt_skylet.py +106 -24
- sky/skylet/configs.py +3 -1
- sky/skylet/constants.py +111 -26
- sky/skylet/events.py +64 -10
- sky/skylet/job_lib.py +141 -104
- sky/skylet/log_lib.py +233 -5
- sky/skylet/log_lib.pyi +40 -2
- sky/skylet/providers/ibm/node_provider.py +12 -8
- sky/skylet/providers/ibm/vpc_provider.py +13 -12
- sky/skylet/runtime_utils.py +21 -0
- sky/skylet/services.py +524 -0
- sky/skylet/skylet.py +22 -1
- sky/skylet/subprocess_daemon.py +104 -29
- sky/skypilot_config.py +99 -79
- sky/ssh_node_pools/server.py +9 -8
- sky/task.py +221 -104
- sky/templates/aws-ray.yml.j2 +1 -0
- sky/templates/azure-ray.yml.j2 +1 -0
- sky/templates/cudo-ray.yml.j2 +1 -0
- sky/templates/do-ray.yml.j2 +1 -0
- sky/templates/fluidstack-ray.yml.j2 +1 -0
- sky/templates/gcp-ray.yml.j2 +1 -0
- sky/templates/hyperbolic-ray.yml.j2 +1 -0
- sky/templates/ibm-ray.yml.j2 +2 -1
- sky/templates/jobs-controller.yaml.j2 +3 -0
- sky/templates/kubernetes-ray.yml.j2 +196 -55
- sky/templates/lambda-ray.yml.j2 +1 -0
- sky/templates/nebius-ray.yml.j2 +3 -0
- sky/templates/oci-ray.yml.j2 +1 -0
- sky/templates/paperspace-ray.yml.j2 +1 -0
- sky/templates/primeintellect-ray.yml.j2 +72 -0
- sky/templates/runpod-ray.yml.j2 +1 -0
- sky/templates/scp-ray.yml.j2 +1 -0
- sky/templates/seeweb-ray.yml.j2 +171 -0
- sky/templates/shadeform-ray.yml.j2 +73 -0
- sky/templates/vast-ray.yml.j2 +1 -0
- sky/templates/vsphere-ray.yml.j2 +1 -0
- sky/templates/websocket_proxy.py +188 -43
- sky/usage/usage_lib.py +16 -4
- sky/users/permission.py +60 -43
- sky/utils/accelerator_registry.py +6 -3
- sky/utils/admin_policy_utils.py +18 -5
- sky/utils/annotations.py +22 -0
- sky/utils/asyncio_utils.py +78 -0
- sky/utils/atomic.py +1 -1
- sky/utils/auth_utils.py +153 -0
- sky/utils/cli_utils/status_utils.py +12 -7
- sky/utils/cluster_utils.py +28 -6
- sky/utils/command_runner.py +88 -27
- sky/utils/command_runner.pyi +36 -3
- sky/utils/common.py +3 -1
- sky/utils/common_utils.py +37 -4
- sky/utils/config_utils.py +1 -14
- sky/utils/context.py +127 -40
- sky/utils/context_utils.py +73 -18
- sky/utils/controller_utils.py +229 -70
- sky/utils/db/db_utils.py +95 -18
- sky/utils/db/kv_cache.py +149 -0
- sky/utils/db/migration_utils.py +24 -7
- sky/utils/env_options.py +4 -0
- sky/utils/git.py +559 -1
- sky/utils/kubernetes/create_cluster.sh +15 -30
- sky/utils/kubernetes/delete_cluster.sh +10 -7
- sky/utils/kubernetes/{deploy_remote_cluster.py → deploy_ssh_node_pools.py} +258 -380
- sky/utils/kubernetes/generate_kind_config.py +6 -66
- sky/utils/kubernetes/gpu_labeler.py +13 -3
- sky/utils/kubernetes/k8s_gpu_labeler_job.yaml +2 -1
- sky/utils/kubernetes/k8s_gpu_labeler_setup.yaml +16 -16
- sky/utils/kubernetes/kubernetes_deploy_utils.py +213 -194
- sky/utils/kubernetes/rsync_helper.sh +11 -3
- sky/utils/kubernetes_enums.py +7 -15
- sky/utils/lock_events.py +4 -4
- sky/utils/locks.py +128 -31
- sky/utils/log_utils.py +0 -319
- sky/utils/resource_checker.py +13 -10
- sky/utils/resources_utils.py +53 -29
- sky/utils/rich_utils.py +8 -4
- sky/utils/schemas.py +107 -52
- sky/utils/subprocess_utils.py +17 -4
- sky/utils/thread_utils.py +91 -0
- sky/utils/timeline.py +2 -1
- sky/utils/ux_utils.py +35 -1
- sky/utils/volume.py +88 -4
- sky/utils/yaml_utils.py +9 -0
- sky/volumes/client/sdk.py +48 -10
- sky/volumes/server/core.py +59 -22
- sky/volumes/server/server.py +46 -17
- sky/volumes/volume.py +54 -42
- sky/workspaces/core.py +57 -21
- sky/workspaces/server.py +13 -12
- sky_templates/README.md +3 -0
- sky_templates/__init__.py +3 -0
- sky_templates/ray/__init__.py +0 -0
- sky_templates/ray/start_cluster +183 -0
- sky_templates/ray/stop_cluster +75 -0
- {skypilot_nightly-1.0.0.dev20250905.dist-info → skypilot_nightly-1.0.0.dev20251203.dist-info}/METADATA +331 -65
- skypilot_nightly-1.0.0.dev20251203.dist-info/RECORD +611 -0
- skypilot_nightly-1.0.0.dev20251203.dist-info/top_level.txt +2 -0
- sky/client/cli/git.py +0 -549
- sky/dashboard/out/_next/static/chunks/1121-408ed10b2f9fce17.js +0 -1
- sky/dashboard/out/_next/static/chunks/1141-943efc7aff0f0c06.js +0 -1
- sky/dashboard/out/_next/static/chunks/1836-37fede578e2da5f8.js +0 -40
- sky/dashboard/out/_next/static/chunks/3015-86cabed5d4669ad0.js +0 -1
- sky/dashboard/out/_next/static/chunks/3294.c80326aec9bfed40.js +0 -6
- sky/dashboard/out/_next/static/chunks/3785.4872a2f3aa489880.js +0 -1
- sky/dashboard/out/_next/static/chunks/4045.b30465273dc5e468.js +0 -21
- sky/dashboard/out/_next/static/chunks/4676-9da7fdbde90b5549.js +0 -10
- sky/dashboard/out/_next/static/chunks/4725.10f7a9a5d3ea8208.js +0 -1
- sky/dashboard/out/_next/static/chunks/5339.3fda4a4010ff4e06.js +0 -51
- sky/dashboard/out/_next/static/chunks/6135-4b4d5e824b7f9d3c.js +0 -1
- sky/dashboard/out/_next/static/chunks/649.b9d7f7d10c1b8c53.js +0 -45
- sky/dashboard/out/_next/static/chunks/6856-dca7962af4814e1b.js +0 -1
- sky/dashboard/out/_next/static/chunks/6990-08b2a1cae076a943.js +0 -1
- sky/dashboard/out/_next/static/chunks/7325.b4bc99ce0892dcd5.js +0 -6
- sky/dashboard/out/_next/static/chunks/754-d0da8ab45f9509e9.js +0 -18
- sky/dashboard/out/_next/static/chunks/7669.1f5d9a402bf5cc42.js +0 -36
- sky/dashboard/out/_next/static/chunks/8969-0be3036bf86f8256.js +0 -1
- sky/dashboard/out/_next/static/chunks/9025.c12318fb6a1a9093.js +0 -6
- sky/dashboard/out/_next/static/chunks/9037-fa1737818d0a0969.js +0 -6
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-1cbba24bd1bd35f8.js +0 -16
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-0b4b35dc1dfe046c.js +0 -16
- sky/dashboard/out/_next/static/chunks/pages/clusters-469814d711d63b1b.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-dd64309c3fe67ed2.js +0 -11
- sky/dashboard/out/_next/static/chunks/pages/jobs/pools/[pool]-07349868f7905d37.js +0 -16
- sky/dashboard/out/_next/static/chunks/pages/jobs-1f70d9faa564804f.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/users-018bf31cda52e11b.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/volumes-739726d6b823f532.js +0 -1
- sky/dashboard/out/_next/static/chunks/webpack-4fe903277b57b523.js +0 -1
- sky/dashboard/out/_next/static/css/4614e06482d7309e.css +0 -3
- sky/dashboard/out/_next/static/mS-4qZPSkRuA1u-g2wQhg/_buildManifest.js +0 -1
- sky/templates/kubernetes-ssh-jump.yml.j2 +0 -94
- sky/utils/kubernetes/ssh_jump_lifecycle_manager.py +0 -191
- skypilot_nightly-1.0.0.dev20250905.dist-info/RECORD +0 -547
- skypilot_nightly-1.0.0.dev20250905.dist-info/top_level.txt +0 -1
- /sky/dashboard/out/_next/static/{mS-4qZPSkRuA1u-g2wQhg → 96_E2yl3QAiIJGOYCkSpB}/_ssgManifest.js +0 -0
- {skypilot_nightly-1.0.0.dev20250905.dist-info → skypilot_nightly-1.0.0.dev20251203.dist-info}/WHEEL +0 -0
- {skypilot_nightly-1.0.0.dev20250905.dist-info → skypilot_nightly-1.0.0.dev20251203.dist-info}/entry_points.txt +0 -0
- {skypilot_nightly-1.0.0.dev20250905.dist-info → skypilot_nightly-1.0.0.dev20251203.dist-info}/licenses/LICENSE +0 -0
|
@@ -11,6 +11,7 @@ from sky.utils import common_utils
|
|
|
11
11
|
from sky.utils import log_utils
|
|
12
12
|
from sky.utils import resources_utils
|
|
13
13
|
from sky.utils import status_lib
|
|
14
|
+
from sky.utils import ux_utils
|
|
14
15
|
|
|
15
16
|
if typing.TYPE_CHECKING:
|
|
16
17
|
from sky.provision.kubernetes import utils as kubernetes_utils
|
|
@@ -105,11 +106,9 @@ def show_status_table(cluster_records: List[responses.StatusResponse],
|
|
|
105
106
|
|
|
106
107
|
if query_clusters:
|
|
107
108
|
cluster_names = {record['name'] for record in cluster_records}
|
|
108
|
-
not_found_clusters =
|
|
109
|
-
|
|
110
|
-
|
|
111
|
-
if cluster not in cluster_names
|
|
112
|
-
]
|
|
109
|
+
not_found_clusters = ux_utils.get_non_matched_query(
|
|
110
|
+
query_clusters, cluster_names)
|
|
111
|
+
not_found_clusters = [repr(cluster) for cluster in not_found_clusters]
|
|
113
112
|
if not_found_clusters:
|
|
114
113
|
cluster_str = 'Cluster'
|
|
115
114
|
if len(not_found_clusters) > 1:
|
|
@@ -283,8 +282,14 @@ def _get_resources(cluster_record: _ClusterRecord,
|
|
|
283
282
|
if resources_str_full is not None:
|
|
284
283
|
resources_str = resources_str_full
|
|
285
284
|
if resources_str is None:
|
|
286
|
-
|
|
287
|
-
|
|
285
|
+
resources_str_simple, resources_str_full = (
|
|
286
|
+
resources_utils.get_readable_resources_repr(
|
|
287
|
+
handle, simplified_only=truncate))
|
|
288
|
+
if truncate:
|
|
289
|
+
resources_str = resources_str_simple
|
|
290
|
+
else:
|
|
291
|
+
assert resources_str_full is not None
|
|
292
|
+
resources_str = resources_str_full
|
|
288
293
|
|
|
289
294
|
return resources_str
|
|
290
295
|
return '-'
|
sky/utils/cluster_utils.py
CHANGED
|
@@ -144,6 +144,9 @@ class SSHConfigHelper(object):
|
|
|
144
144
|
username = docker_user
|
|
145
145
|
|
|
146
146
|
key_path = cls.generate_local_key_file(cluster_name, auth_config)
|
|
147
|
+
# Keep the unexpanded path for SSH config (with ~)
|
|
148
|
+
key_path_for_config = key_path
|
|
149
|
+
# Expand the path for internal operations that need absolute path
|
|
147
150
|
key_path = os.path.expanduser(key_path)
|
|
148
151
|
sky_autogen_comment = ('# Added by sky (use `sky stop/down '
|
|
149
152
|
f'{cluster_name}` to remove)')
|
|
@@ -190,11 +193,29 @@ class SSHConfigHelper(object):
|
|
|
190
193
|
proxy_command = auth_config.get('ssh_proxy_command', None)
|
|
191
194
|
|
|
192
195
|
docker_proxy_command_generator = None
|
|
196
|
+
proxy_command_for_nodes = proxy_command
|
|
193
197
|
if docker_user is not None:
|
|
194
|
-
|
|
195
|
-
|
|
196
|
-
|
|
197
|
-
|
|
198
|
+
|
|
199
|
+
def _docker_proxy_cmd(ip: str, port: int) -> str:
|
|
200
|
+
inner_proxy = proxy_command
|
|
201
|
+
inner_port = port or 22
|
|
202
|
+
if inner_proxy is not None:
|
|
203
|
+
inner_proxy = inner_proxy.replace('%h', ip)
|
|
204
|
+
inner_proxy = inner_proxy.replace('%p', str(inner_port))
|
|
205
|
+
return ' '.join(['ssh'] + command_runner.ssh_options_list(
|
|
206
|
+
key_path,
|
|
207
|
+
ssh_control_name=None,
|
|
208
|
+
ssh_proxy_command=inner_proxy,
|
|
209
|
+
port=inner_port,
|
|
210
|
+
# ProxyCommand (ssh -W) is a forwarding tunnel, not an
|
|
211
|
+
# interactive session. ControlMaster would cache these
|
|
212
|
+
# processes, causing them to hang and block subsequent
|
|
213
|
+
# connections. Each ProxyCommand should be ephemeral.
|
|
214
|
+
disable_control_master=True
|
|
215
|
+
) + ['-W', '%h:%p', f'{auth_config["ssh_user"]}@{ip}'])
|
|
216
|
+
|
|
217
|
+
docker_proxy_command_generator = _docker_proxy_cmd
|
|
218
|
+
proxy_command_for_nodes = None
|
|
198
219
|
|
|
199
220
|
codegen = ''
|
|
200
221
|
# Add the nodes to the codegen
|
|
@@ -208,8 +229,9 @@ class SSHConfigHelper(object):
|
|
|
208
229
|
node_name = cluster_name if i == 0 else cluster_name + f'-worker{i}'
|
|
209
230
|
# TODO(romilb): Update port number when k8s supports multinode
|
|
210
231
|
codegen += cls._get_generated_config(
|
|
211
|
-
sky_autogen_comment, node_name, ip, username,
|
|
212
|
-
|
|
232
|
+
sky_autogen_comment, node_name, ip, username,
|
|
233
|
+
key_path_for_config, proxy_command_for_nodes, port,
|
|
234
|
+
docker_proxy_command) + '\n'
|
|
213
235
|
|
|
214
236
|
cluster_config_path = os.path.expanduser(
|
|
215
237
|
cls.ssh_cluster_path.format(cluster_name))
|
sky/utils/command_runner.py
CHANGED
|
@@ -3,6 +3,7 @@ import enum
|
|
|
3
3
|
import hashlib
|
|
4
4
|
import os
|
|
5
5
|
import pathlib
|
|
6
|
+
import re
|
|
6
7
|
import shlex
|
|
7
8
|
import sys
|
|
8
9
|
import time
|
|
@@ -13,6 +14,7 @@ from sky import exceptions
|
|
|
13
14
|
from sky import sky_logging
|
|
14
15
|
from sky.skylet import constants
|
|
15
16
|
from sky.skylet import log_lib
|
|
17
|
+
from sky.utils import auth_utils
|
|
16
18
|
from sky.utils import common_utils
|
|
17
19
|
from sky.utils import context_utils
|
|
18
20
|
from sky.utils import control_master_utils
|
|
@@ -22,6 +24,9 @@ from sky.utils import timeline
|
|
|
22
24
|
|
|
23
25
|
logger = sky_logging.init_logger(__name__)
|
|
24
26
|
|
|
27
|
+
# Pattern to extract home directory from command output
|
|
28
|
+
_HOME_DIR_PATTERN = re.compile(r'SKYPILOT_HOME_DIR: ([^\s\n]+)')
|
|
29
|
+
|
|
25
30
|
# Rsync options
|
|
26
31
|
# TODO(zhwu): This will print a per-file progress bar (with -P),
|
|
27
32
|
# shooting a lot of messages to the output. --info=progress2 is used
|
|
@@ -183,17 +188,25 @@ class CommandRunner:
|
|
|
183
188
|
return '-'.join(str(x) for x in self.node)
|
|
184
189
|
|
|
185
190
|
def _get_remote_home_dir(self) -> str:
|
|
186
|
-
# Use
|
|
187
|
-
#
|
|
188
|
-
#
|
|
189
|
-
|
|
190
|
-
|
|
191
|
-
|
|
192
|
-
|
|
191
|
+
# Use pattern matching to extract home directory.
|
|
192
|
+
# Some container images print MOTD when login shells start, which can
|
|
193
|
+
# contaminate command output. We use a unique pattern to extract the
|
|
194
|
+
# actual home directory reliably.
|
|
195
|
+
rc, output, stderr = self.run('echo "SKYPILOT_HOME_DIR: $(echo ~)"',
|
|
196
|
+
require_outputs=True,
|
|
197
|
+
separate_stderr=True,
|
|
198
|
+
stream_logs=False)
|
|
193
199
|
if rc != 0:
|
|
194
200
|
raise ValueError('Failed to get remote home directory: '
|
|
195
|
-
f'{
|
|
196
|
-
|
|
201
|
+
f'{output + stderr}')
|
|
202
|
+
|
|
203
|
+
# Extract home directory using pattern matching
|
|
204
|
+
home_dir_match = _HOME_DIR_PATTERN.search(output)
|
|
205
|
+
if home_dir_match:
|
|
206
|
+
remote_home_dir = home_dir_match.group(1)
|
|
207
|
+
else:
|
|
208
|
+
raise ValueError('Failed to find remote home directory identifier: '
|
|
209
|
+
f'{output + stderr}')
|
|
197
210
|
return remote_home_dir
|
|
198
211
|
|
|
199
212
|
def _get_command_to_run(
|
|
@@ -414,7 +427,6 @@ class CommandRunner:
|
|
|
414
427
|
SkyPilot but we still want to get rid of some warning messages,
|
|
415
428
|
such as SSH warnings.
|
|
416
429
|
|
|
417
|
-
|
|
418
430
|
Returns:
|
|
419
431
|
returncode
|
|
420
432
|
or
|
|
@@ -469,15 +481,19 @@ class CommandRunner:
|
|
|
469
481
|
"""Close the cached connection to the remote machine."""
|
|
470
482
|
pass
|
|
471
483
|
|
|
472
|
-
def port_forward_command(
|
|
473
|
-
|
|
474
|
-
|
|
484
|
+
def port_forward_command(
|
|
485
|
+
self,
|
|
486
|
+
port_forward: List[Tuple[int, int]],
|
|
487
|
+
connect_timeout: int = 1,
|
|
488
|
+
ssh_mode: SshMode = SshMode.INTERACTIVE) -> List[str]:
|
|
475
489
|
"""Command for forwarding ports from localhost to the remote machine.
|
|
476
490
|
|
|
477
491
|
Args:
|
|
478
492
|
port_forward: A list of ports to forward from the localhost to the
|
|
479
493
|
remote host.
|
|
480
494
|
connect_timeout: The timeout for the connection.
|
|
495
|
+
ssh_mode: The mode to use for ssh.
|
|
496
|
+
See SshMode for more details.
|
|
481
497
|
"""
|
|
482
498
|
raise NotImplementedError
|
|
483
499
|
|
|
@@ -592,6 +608,7 @@ class SSHCommandRunner(CommandRunner):
|
|
|
592
608
|
ssh_proxy_command: Optional[str] = None,
|
|
593
609
|
docker_user: Optional[str] = None,
|
|
594
610
|
disable_control_master: Optional[bool] = False,
|
|
611
|
+
port_forward_execute_remote_command: Optional[bool] = False,
|
|
595
612
|
):
|
|
596
613
|
"""Initialize SSHCommandRunner.
|
|
597
614
|
|
|
@@ -618,6 +635,10 @@ class SSHCommandRunner(CommandRunner):
|
|
|
618
635
|
disable_control_master: bool; specifies whether or not the ssh
|
|
619
636
|
command will utilize ControlMaster. We currently disable
|
|
620
637
|
it for k8s instance.
|
|
638
|
+
port_forward_execute_remote_command: bool; specifies whether to
|
|
639
|
+
omit -N from the port forwarding command. This is useful if you
|
|
640
|
+
want to run a command on the remote machine to make sure the
|
|
641
|
+
SSH tunnel is established.
|
|
621
642
|
"""
|
|
622
643
|
super().__init__(node)
|
|
623
644
|
ip, port = node
|
|
@@ -629,39 +650,63 @@ class SSHCommandRunner(CommandRunner):
|
|
|
629
650
|
self.disable_control_master = (
|
|
630
651
|
disable_control_master or
|
|
631
652
|
control_master_utils.should_disable_control_master())
|
|
653
|
+
# ensure the ssh key files are created from the database
|
|
654
|
+
auth_utils.create_ssh_key_files_from_db(ssh_private_key)
|
|
632
655
|
if docker_user is not None:
|
|
633
656
|
assert port is None or port == 22, (
|
|
634
657
|
f'port must be None or 22 for docker_user, got {port}.')
|
|
635
|
-
#
|
|
636
|
-
|
|
637
|
-
|
|
658
|
+
# When connecting via docker, the outer SSH hop points to the
|
|
659
|
+
# container's sshd (localhost). Preserve the user proxy for the
|
|
660
|
+
# inner hop that reaches the host VM, and clear the outer proxy to
|
|
661
|
+
# avoid forwarding localhost through the jump host.
|
|
662
|
+
inner_proxy_command = ssh_proxy_command
|
|
663
|
+
inner_proxy_port = port or 22
|
|
664
|
+
self._ssh_proxy_command = None
|
|
638
665
|
self.ip = 'localhost'
|
|
639
666
|
self.ssh_user = docker_user
|
|
640
667
|
self.port = constants.DEFAULT_DOCKER_PORT
|
|
668
|
+
if inner_proxy_command is not None:
|
|
669
|
+
# Replace %h/%p placeholders with actual host values, since the
|
|
670
|
+
# final destination from the perspective of the user proxy is
|
|
671
|
+
# the host VM (ip, inner_proxy_port).
|
|
672
|
+
inner_proxy_command = inner_proxy_command.replace('%h', ip)
|
|
673
|
+
inner_proxy_command = inner_proxy_command.replace(
|
|
674
|
+
'%p', str(inner_proxy_port))
|
|
641
675
|
self._docker_ssh_proxy_command = lambda ssh: ' '.join(
|
|
642
|
-
ssh + ssh_options_list(ssh_private_key,
|
|
643
|
-
|
|
676
|
+
ssh + ssh_options_list(ssh_private_key,
|
|
677
|
+
None,
|
|
678
|
+
ssh_proxy_command=inner_proxy_command,
|
|
679
|
+
port=inner_proxy_port,
|
|
680
|
+
disable_control_master=self.
|
|
681
|
+
disable_control_master) +
|
|
682
|
+
['-W', '%h:%p', f'{ssh_user}@{ip}'])
|
|
644
683
|
else:
|
|
645
684
|
self.ip = ip
|
|
646
685
|
self.ssh_user = ssh_user
|
|
647
686
|
self.port = port
|
|
648
687
|
self._docker_ssh_proxy_command = None
|
|
688
|
+
self.port_forward_execute_remote_command = (
|
|
689
|
+
port_forward_execute_remote_command)
|
|
649
690
|
|
|
650
|
-
def port_forward_command(
|
|
651
|
-
|
|
652
|
-
|
|
691
|
+
def port_forward_command(
|
|
692
|
+
self,
|
|
693
|
+
port_forward: List[Tuple[int, int]],
|
|
694
|
+
connect_timeout: int = 1,
|
|
695
|
+
ssh_mode: SshMode = SshMode.INTERACTIVE) -> List[str]:
|
|
653
696
|
"""Command for forwarding ports from localhost to the remote machine.
|
|
654
697
|
|
|
655
698
|
Args:
|
|
656
699
|
port_forward: A list of ports to forward from the local port to the
|
|
657
700
|
remote port.
|
|
658
701
|
connect_timeout: The timeout for the ssh connection.
|
|
702
|
+
ssh_mode: The mode to use for ssh.
|
|
703
|
+
See SshMode for more details.
|
|
659
704
|
|
|
660
705
|
Returns:
|
|
661
706
|
The command for forwarding ports from localhost to the remote
|
|
662
707
|
machine.
|
|
663
708
|
"""
|
|
664
|
-
return self.ssh_base_command(ssh_mode=
|
|
709
|
+
return self.ssh_base_command(ssh_mode=ssh_mode,
|
|
665
710
|
port_forward=port_forward,
|
|
666
711
|
connect_timeout=connect_timeout)
|
|
667
712
|
|
|
@@ -680,7 +725,11 @@ class SSHCommandRunner(CommandRunner):
|
|
|
680
725
|
for local, remote in port_forward:
|
|
681
726
|
logger.debug(
|
|
682
727
|
f'Forwarding local port {local} to remote port {remote}.')
|
|
683
|
-
|
|
728
|
+
if self.port_forward_execute_remote_command:
|
|
729
|
+
ssh += ['-L']
|
|
730
|
+
else:
|
|
731
|
+
ssh += ['-NL']
|
|
732
|
+
ssh += [f'{local}:localhost:{remote}']
|
|
684
733
|
if self._docker_ssh_proxy_command is not None:
|
|
685
734
|
docker_ssh_proxy_command = self._docker_ssh_proxy_command(ssh)
|
|
686
735
|
else:
|
|
@@ -894,9 +943,11 @@ class KubernetesCommandRunner(CommandRunner):
|
|
|
894
943
|
else:
|
|
895
944
|
return f'pod/{self.pod_name}'
|
|
896
945
|
|
|
897
|
-
def port_forward_command(
|
|
898
|
-
|
|
899
|
-
|
|
946
|
+
def port_forward_command(
|
|
947
|
+
self,
|
|
948
|
+
port_forward: List[Tuple[int, int]],
|
|
949
|
+
connect_timeout: int = 1,
|
|
950
|
+
ssh_mode: SshMode = SshMode.INTERACTIVE) -> List[str]:
|
|
900
951
|
"""Command for forwarding ports from localhost to the remote machine.
|
|
901
952
|
|
|
902
953
|
Args:
|
|
@@ -904,14 +955,25 @@ class KubernetesCommandRunner(CommandRunner):
|
|
|
904
955
|
remote port. Currently, only one port is supported, i.e. the
|
|
905
956
|
list should have only one element.
|
|
906
957
|
connect_timeout: The timeout for the ssh connection.
|
|
958
|
+
ssh_mode: The mode to use for ssh.
|
|
959
|
+
See SSHMode for more details.
|
|
907
960
|
"""
|
|
961
|
+
del ssh_mode # unused
|
|
908
962
|
assert port_forward and len(port_forward) == 1, (
|
|
909
963
|
'Only one port is supported for Kubernetes port-forward.')
|
|
910
964
|
kubectl_args = [
|
|
911
965
|
'--pod-running-timeout', f'{connect_timeout}s', '-n', self.namespace
|
|
912
966
|
]
|
|
967
|
+
# The same logic to either set `--context` to the k8s context where
|
|
968
|
+
# the sky cluster is hosted, or `--kubeconfig` to /dev/null for
|
|
969
|
+
# in-cluster k8s is used below in the `run()` method.
|
|
913
970
|
if self.context:
|
|
914
971
|
kubectl_args += ['--context', self.context]
|
|
972
|
+
# If context is none, it means the cluster is hosted on in-cluster k8s.
|
|
973
|
+
# In this case, we need to set KUBECONFIG to /dev/null to avoid looking
|
|
974
|
+
# for the cluster in whatever active context is set in the kubeconfig.
|
|
975
|
+
else:
|
|
976
|
+
kubectl_args += ['--kubeconfig', '/dev/null']
|
|
915
977
|
local_port, remote_port = port_forward[0]
|
|
916
978
|
local_port_str = f'{local_port}' if local_port is not None else ''
|
|
917
979
|
|
|
@@ -967,7 +1029,6 @@ class KubernetesCommandRunner(CommandRunner):
|
|
|
967
1029
|
SkyPilot but we still want to get rid of some warning messages,
|
|
968
1030
|
such as SSH warnings.
|
|
969
1031
|
|
|
970
|
-
|
|
971
1032
|
Returns:
|
|
972
1033
|
returncode
|
|
973
1034
|
or
|
sky/utils/command_runner.pyi
CHANGED
|
@@ -36,9 +36,9 @@ def ssh_options_list(
|
|
|
36
36
|
|
|
37
37
|
|
|
38
38
|
class SshMode(enum.Enum):
|
|
39
|
-
NON_INTERACTIVE
|
|
40
|
-
INTERACTIVE
|
|
41
|
-
LOGIN
|
|
39
|
+
NON_INTERACTIVE = ...
|
|
40
|
+
INTERACTIVE = ...
|
|
41
|
+
LOGIN = ...
|
|
42
42
|
|
|
43
43
|
|
|
44
44
|
class CommandRunner:
|
|
@@ -106,6 +106,13 @@ class CommandRunner:
|
|
|
106
106
|
max_retry: int = ...) -> None:
|
|
107
107
|
...
|
|
108
108
|
|
|
109
|
+
def port_forward_command(
|
|
110
|
+
self,
|
|
111
|
+
port_forward: List[Tuple[int, int]],
|
|
112
|
+
connect_timeout: int = 1,
|
|
113
|
+
ssh_mode: SshMode = SshMode.INTERACTIVE) -> List[str]:
|
|
114
|
+
...
|
|
115
|
+
|
|
109
116
|
@classmethod
|
|
110
117
|
def make_runner_list(cls: typing.Type[CommandRunner],
|
|
111
118
|
node_list: Iterable[Tuple[Any, ...]],
|
|
@@ -127,6 +134,7 @@ class SSHCommandRunner(CommandRunner):
|
|
|
127
134
|
ssh_control_name: Optional[str]
|
|
128
135
|
docker_user: str
|
|
129
136
|
disable_control_master: Optional[bool]
|
|
137
|
+
port_forward_execute_remote_command: Optional[bool]
|
|
130
138
|
|
|
131
139
|
def __init__(
|
|
132
140
|
self,
|
|
@@ -134,8 +142,10 @@ class SSHCommandRunner(CommandRunner):
|
|
|
134
142
|
ssh_user: str,
|
|
135
143
|
ssh_private_key: str,
|
|
136
144
|
ssh_control_name: Optional[str] = ...,
|
|
145
|
+
ssh_proxy_command: Optional[str] = ...,
|
|
137
146
|
docker_user: Optional[str] = ...,
|
|
138
147
|
disable_control_master: Optional[bool] = ...,
|
|
148
|
+
port_forward_execute_remote_command: Optional[bool] = ...,
|
|
139
149
|
) -> None:
|
|
140
150
|
...
|
|
141
151
|
|
|
@@ -190,6 +200,15 @@ class SSHCommandRunner(CommandRunner):
|
|
|
190
200
|
**kwargs) -> Union[Tuple[int, str, str], int]:
|
|
191
201
|
...
|
|
192
202
|
|
|
203
|
+
def ssh_base_command(
|
|
204
|
+
self,
|
|
205
|
+
*,
|
|
206
|
+
ssh_mode: SshMode,
|
|
207
|
+
port_forward: Optional[List[Tuple[int, int]]],
|
|
208
|
+
connect_timeout: Optional[int],
|
|
209
|
+
) -> List[str]:
|
|
210
|
+
...
|
|
211
|
+
|
|
193
212
|
def rsync(self,
|
|
194
213
|
source: str,
|
|
195
214
|
target: str,
|
|
@@ -200,6 +219,13 @@ class SSHCommandRunner(CommandRunner):
|
|
|
200
219
|
max_retry: int = ...) -> None:
|
|
201
220
|
...
|
|
202
221
|
|
|
222
|
+
def port_forward_command(
|
|
223
|
+
self,
|
|
224
|
+
port_forward: List[Tuple[int, int]],
|
|
225
|
+
connect_timeout: int = 1,
|
|
226
|
+
ssh_mode: SshMode = SshMode.INTERACTIVE) -> List[str]:
|
|
227
|
+
...
|
|
228
|
+
|
|
203
229
|
|
|
204
230
|
class KubernetesCommandRunner(CommandRunner):
|
|
205
231
|
|
|
@@ -272,6 +298,13 @@ class KubernetesCommandRunner(CommandRunner):
|
|
|
272
298
|
max_retry: int = ...) -> None:
|
|
273
299
|
...
|
|
274
300
|
|
|
301
|
+
def port_forward_command(
|
|
302
|
+
self,
|
|
303
|
+
port_forward: List[Tuple[int, int]],
|
|
304
|
+
connect_timeout: int = 1,
|
|
305
|
+
ssh_mode: SshMode = SshMode.INTERACTIVE) -> List[str]:
|
|
306
|
+
...
|
|
307
|
+
|
|
275
308
|
|
|
276
309
|
class LocalProcessCommandRunner(CommandRunner):
|
|
277
310
|
|
sky/utils/common.py
CHANGED
|
@@ -31,7 +31,7 @@ JOB_CONTROLLER_NAME: str
|
|
|
31
31
|
def refresh_server_id() -> None:
|
|
32
32
|
"""Refresh the server id.
|
|
33
33
|
|
|
34
|
-
This function is used to ensure the server id is read from the
|
|
34
|
+
This function is used to ensure the server id is read from the authoritative
|
|
35
35
|
source.
|
|
36
36
|
"""
|
|
37
37
|
global SERVER_ID
|
|
@@ -42,6 +42,8 @@ def refresh_server_id() -> None:
|
|
|
42
42
|
JOB_CONTROLLER_NAME = f'{JOB_CONTROLLER_PREFIX}{SERVER_ID}'
|
|
43
43
|
|
|
44
44
|
|
|
45
|
+
# TODO(kevin): Remove this side effect and have callers call
|
|
46
|
+
# refresh_server_id() explicitly as needed.
|
|
45
47
|
refresh_server_id()
|
|
46
48
|
|
|
47
49
|
|
sky/utils/common_utils.py
CHANGED
|
@@ -1,8 +1,10 @@
|
|
|
1
1
|
"""Utils shared between all of sky"""
|
|
2
2
|
|
|
3
|
+
import ctypes
|
|
3
4
|
import difflib
|
|
4
5
|
import enum
|
|
5
6
|
import functools
|
|
7
|
+
import gc
|
|
6
8
|
import getpass
|
|
7
9
|
import hashlib
|
|
8
10
|
import inspect
|
|
@@ -263,13 +265,16 @@ def get_global_job_id(job_timestamp: str,
|
|
|
263
265
|
|
|
264
266
|
class Backoff:
|
|
265
267
|
"""Exponential backoff with jittering."""
|
|
266
|
-
MULTIPLIER = 1.6
|
|
267
268
|
JITTER = 0.4
|
|
268
269
|
|
|
269
|
-
def __init__(self,
|
|
270
|
+
def __init__(self,
|
|
271
|
+
initial_backoff: float = 5,
|
|
272
|
+
max_backoff_factor: int = 5,
|
|
273
|
+
multiplier: float = 1.6):
|
|
270
274
|
self._initial = True
|
|
271
275
|
self._backoff = 0.0
|
|
272
276
|
self._initial_backoff = initial_backoff
|
|
277
|
+
self._multiplier = multiplier
|
|
273
278
|
self._max_backoff = max_backoff_factor * self._initial_backoff
|
|
274
279
|
|
|
275
280
|
# https://github.com/grpc/grpc/blob/2d4f3c56001cd1e1f85734b2f7c5ce5f2797c38a/doc/connection-backoff.md
|
|
@@ -281,7 +286,7 @@ class Backoff:
|
|
|
281
286
|
self._initial = False
|
|
282
287
|
self._backoff = min(self._initial_backoff, self._max_backoff)
|
|
283
288
|
else:
|
|
284
|
-
self._backoff = min(self._backoff * self.
|
|
289
|
+
self._backoff = min(self._backoff * self._multiplier,
|
|
285
290
|
self._max_backoff)
|
|
286
291
|
self._backoff += random.uniform(-self.JITTER * self._backoff,
|
|
287
292
|
self.JITTER * self._backoff)
|
|
@@ -994,7 +999,17 @@ def get_mem_size_gb() -> float:
|
|
|
994
999
|
except ValueError as e:
|
|
995
1000
|
with ux_utils.print_exception_no_traceback():
|
|
996
1001
|
raise ValueError(
|
|
997
|
-
f'Failed to parse the memory size from {mem_size}'
|
|
1002
|
+
f'Failed to parse the memory size from {mem_size} (GB)'
|
|
1003
|
+
) from e
|
|
1004
|
+
mem_size = os.getenv('SKYPILOT_POD_MEMORY_BYTES_LIMIT')
|
|
1005
|
+
if mem_size is not None:
|
|
1006
|
+
try:
|
|
1007
|
+
return float(mem_size) / (1024**3)
|
|
1008
|
+
except ValueError as e:
|
|
1009
|
+
with ux_utils.print_exception_no_traceback():
|
|
1010
|
+
raise ValueError(
|
|
1011
|
+
f'Failed to parse the memory size from {mem_size} (bytes)'
|
|
1012
|
+
) from e
|
|
998
1013
|
return _mem_size_gb()
|
|
999
1014
|
|
|
1000
1015
|
|
|
@@ -1090,3 +1105,21 @@ def removeprefix(string: str, prefix: str) -> str:
|
|
|
1090
1105
|
if string.startswith(prefix):
|
|
1091
1106
|
return string[len(prefix):]
|
|
1092
1107
|
return string
|
|
1108
|
+
|
|
1109
|
+
|
|
1110
|
+
def release_memory():
|
|
1111
|
+
"""Release the process memory"""
|
|
1112
|
+
# Do the best effort to release the python heap and let malloc_trim
|
|
1113
|
+
# be more efficient.
|
|
1114
|
+
try:
|
|
1115
|
+
gc.collect()
|
|
1116
|
+
if sys.platform.startswith('linux'):
|
|
1117
|
+
# Will fail on musl (alpine), but at least it works on our
|
|
1118
|
+
# official docker images.
|
|
1119
|
+
libc = ctypes.CDLL('libc.so.6')
|
|
1120
|
+
return libc.malloc_trim(0)
|
|
1121
|
+
return 0
|
|
1122
|
+
except Exception as e: # pylint: disable=broad-except
|
|
1123
|
+
logger.error(f'Failed to release memory: '
|
|
1124
|
+
f'{format_exception(e)}')
|
|
1125
|
+
return 0
|
sky/utils/config_utils.py
CHANGED
|
@@ -272,7 +272,7 @@ def get_cloud_config_value_from_dict(
|
|
|
272
272
|
"""
|
|
273
273
|
input_config = Config(dict_config)
|
|
274
274
|
region_key = None
|
|
275
|
-
if cloud
|
|
275
|
+
if cloud in ('kubernetes', 'ssh'):
|
|
276
276
|
region_key = 'context_configs'
|
|
277
277
|
elif cloud in _REGION_CONFIG_CLOUDS:
|
|
278
278
|
region_key = 'region_configs'
|
|
@@ -283,19 +283,6 @@ def get_cloud_config_value_from_dict(
|
|
|
283
283
|
keys=(cloud, region_key, region) + keys,
|
|
284
284
|
default_value=None,
|
|
285
285
|
override_configs=override_configs)
|
|
286
|
-
if not per_context_config and cloud in _REGION_CONFIG_CLOUDS:
|
|
287
|
-
# TODO (kyuds): Backward compatibility, remove after 0.11.0.
|
|
288
|
-
per_context_config = input_config.get_nested(
|
|
289
|
-
keys=(cloud, region) + keys,
|
|
290
|
-
default_value=None,
|
|
291
|
-
override_configs=override_configs)
|
|
292
|
-
if per_context_config is not None:
|
|
293
|
-
logger.info(
|
|
294
|
-
f'{cloud} configuration is using the legacy format. \n'
|
|
295
|
-
'This format will be deprecated after 0.11.0, refer to '
|
|
296
|
-
'`https://docs.skypilot.co/en/latest/reference/config.html` ' # pylint: disable=line-too-long
|
|
297
|
-
'for the new format. Please use `region_configs` to specify region specific configuration.'
|
|
298
|
-
)
|
|
299
286
|
# if no override found for specified region
|
|
300
287
|
general_config = input_config.get_nested(keys=(cloud,) + keys,
|
|
301
288
|
default_value=default_value,
|