skypilot-nightly 1.0.0.dev20250905__py3-none-any.whl → 1.0.0.dev20251210__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sky/__init__.py +12 -2
- sky/adaptors/aws.py +27 -22
- sky/adaptors/common.py +25 -2
- sky/adaptors/coreweave.py +278 -0
- sky/adaptors/do.py +8 -2
- sky/adaptors/gcp.py +11 -0
- sky/adaptors/ibm.py +5 -2
- sky/adaptors/kubernetes.py +64 -0
- sky/adaptors/nebius.py +3 -1
- sky/adaptors/primeintellect.py +1 -0
- sky/adaptors/seeweb.py +183 -0
- sky/adaptors/shadeform.py +89 -0
- sky/adaptors/slurm.py +478 -0
- sky/admin_policy.py +20 -0
- sky/authentication.py +157 -263
- sky/backends/__init__.py +3 -2
- sky/backends/backend.py +11 -3
- sky/backends/backend_utils.py +630 -185
- sky/backends/cloud_vm_ray_backend.py +1111 -928
- sky/backends/local_docker_backend.py +9 -5
- sky/backends/task_codegen.py +971 -0
- sky/backends/wheel_utils.py +18 -0
- sky/catalog/__init__.py +8 -3
- sky/catalog/aws_catalog.py +4 -0
- sky/catalog/common.py +19 -1
- sky/catalog/data_fetchers/fetch_aws.py +102 -80
- sky/catalog/data_fetchers/fetch_gcp.py +30 -3
- sky/catalog/data_fetchers/fetch_nebius.py +9 -6
- sky/catalog/data_fetchers/fetch_runpod.py +698 -0
- sky/catalog/data_fetchers/fetch_seeweb.py +329 -0
- sky/catalog/data_fetchers/fetch_shadeform.py +142 -0
- sky/catalog/kubernetes_catalog.py +36 -32
- sky/catalog/primeintellect_catalog.py +95 -0
- sky/catalog/runpod_catalog.py +5 -1
- sky/catalog/seeweb_catalog.py +184 -0
- sky/catalog/shadeform_catalog.py +165 -0
- sky/catalog/slurm_catalog.py +243 -0
- sky/check.py +87 -46
- sky/client/cli/command.py +1004 -434
- sky/client/cli/flags.py +4 -2
- sky/{volumes/utils.py → client/cli/table_utils.py} +111 -13
- sky/client/cli/utils.py +79 -0
- sky/client/common.py +12 -2
- sky/client/sdk.py +188 -65
- sky/client/sdk_async.py +34 -33
- sky/cloud_stores.py +82 -3
- sky/clouds/__init__.py +8 -0
- sky/clouds/aws.py +337 -129
- sky/clouds/azure.py +24 -18
- sky/clouds/cloud.py +47 -13
- sky/clouds/cudo.py +16 -13
- sky/clouds/do.py +9 -7
- sky/clouds/fluidstack.py +12 -5
- sky/clouds/gcp.py +14 -7
- sky/clouds/hyperbolic.py +12 -5
- sky/clouds/ibm.py +12 -5
- sky/clouds/kubernetes.py +80 -45
- sky/clouds/lambda_cloud.py +12 -5
- sky/clouds/nebius.py +23 -9
- sky/clouds/oci.py +19 -12
- sky/clouds/paperspace.py +4 -1
- sky/clouds/primeintellect.py +317 -0
- sky/clouds/runpod.py +85 -24
- sky/clouds/scp.py +12 -8
- sky/clouds/seeweb.py +477 -0
- sky/clouds/shadeform.py +400 -0
- sky/clouds/slurm.py +578 -0
- sky/clouds/ssh.py +6 -3
- sky/clouds/utils/scp_utils.py +61 -50
- sky/clouds/vast.py +43 -27
- sky/clouds/vsphere.py +14 -16
- sky/core.py +296 -195
- sky/dashboard/out/404.html +1 -1
- sky/dashboard/out/_next/static/KYAhEFa3FTfq4JyKVgo-s/_buildManifest.js +1 -0
- sky/dashboard/out/_next/static/chunks/1141-9c810f01ff4f398a.js +11 -0
- sky/dashboard/out/_next/static/chunks/1871-7e202677c42f43fe.js +6 -0
- sky/dashboard/out/_next/static/chunks/2260-7703229c33c5ebd5.js +1 -0
- sky/dashboard/out/_next/static/chunks/2369.fc20f0c2c8ed9fe7.js +15 -0
- sky/dashboard/out/_next/static/chunks/2755.edd818326d489a1d.js +26 -0
- sky/dashboard/out/_next/static/chunks/3294.ddda8c6c6f9f24dc.js +1 -0
- sky/dashboard/out/_next/static/chunks/3785.7e245f318f9d1121.js +1 -0
- sky/dashboard/out/_next/static/chunks/{6601-06114c982db410b6.js → 3800-b589397dc09c5b4e.js} +1 -1
- sky/dashboard/out/_next/static/chunks/3850-fd5696f3bbbaddae.js +1 -0
- sky/dashboard/out/_next/static/chunks/4725.172ede95d1b21022.js +1 -0
- sky/dashboard/out/_next/static/chunks/4937.a2baa2df5572a276.js +15 -0
- sky/dashboard/out/_next/static/chunks/6212-7bd06f60ba693125.js +13 -0
- sky/dashboard/out/_next/static/chunks/6856-da20c5fd999f319c.js +1 -0
- sky/dashboard/out/_next/static/chunks/6990-09cbf02d3cd518c3.js +1 -0
- sky/dashboard/out/_next/static/chunks/7359-c8d04e06886000b3.js +30 -0
- sky/dashboard/out/_next/static/chunks/7615-019513abc55b3b47.js +1 -0
- sky/dashboard/out/_next/static/chunks/8640.5b9475a2d18c5416.js +16 -0
- sky/dashboard/out/_next/static/chunks/8969-452f9d5cbdd2dc73.js +1 -0
- sky/dashboard/out/_next/static/chunks/9025.fa408f3242e9028d.js +6 -0
- sky/dashboard/out/_next/static/chunks/9353-8369df1cf105221c.js +1 -0
- sky/dashboard/out/_next/static/chunks/9360.a536cf6b1fa42355.js +31 -0
- sky/dashboard/out/_next/static/chunks/9847.3aaca6bb33455140.js +30 -0
- sky/dashboard/out/_next/static/chunks/pages/_app-68b647e26f9d2793.js +34 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-33f525539665fdfd.js +16 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-a7565f586ef86467.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters-9e5d47818b9bdadd.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/{config-dfb9bf07b13045f4.js → config-718cdc365de82689.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/infra/{[context]-6563820e094f68ca.js → [context]-12c559ec4d81fdbd.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/{infra-aabba60d57826e0f.js → infra-d187cd0413d72475.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-895847b6cf200b04.js +16 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs/pools/[pool]-8d0f4655400b4eb9.js +21 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs-e5a98f17f8513a96.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/plugins/[...slug]-4f46050ca065d8f8.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/users-2f7646eb77785a2c.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/volumes-ef19d49c6d0e8500.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/workspaces/{[name]-af76bb06dbb3954f.js → [name]-96e0f298308da7e2.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/{workspaces-7598c33a746cdc91.js → workspaces-cb4da3abe08ebf19.js} +1 -1
- sky/dashboard/out/_next/static/chunks/webpack-fba3de387ff6bb08.js +1 -0
- sky/dashboard/out/_next/static/css/c5a4cfd2600fc715.css +3 -0
- sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
- sky/dashboard/out/clusters/[cluster].html +1 -1
- sky/dashboard/out/clusters.html +1 -1
- sky/dashboard/out/config.html +1 -1
- sky/dashboard/out/index.html +1 -1
- sky/dashboard/out/infra/[context].html +1 -1
- sky/dashboard/out/infra.html +1 -1
- sky/dashboard/out/jobs/[job].html +1 -1
- sky/dashboard/out/jobs/pools/[pool].html +1 -1
- sky/dashboard/out/jobs.html +1 -1
- sky/dashboard/out/plugins/[...slug].html +1 -0
- sky/dashboard/out/users.html +1 -1
- sky/dashboard/out/volumes.html +1 -1
- sky/dashboard/out/workspace/new.html +1 -1
- sky/dashboard/out/workspaces/[name].html +1 -1
- sky/dashboard/out/workspaces.html +1 -1
- sky/data/data_utils.py +92 -1
- sky/data/mounting_utils.py +177 -30
- sky/data/storage.py +200 -19
- sky/data/storage_utils.py +10 -45
- sky/exceptions.py +18 -7
- sky/execution.py +74 -31
- sky/global_user_state.py +605 -191
- sky/jobs/__init__.py +2 -0
- sky/jobs/client/sdk.py +101 -4
- sky/jobs/client/sdk_async.py +31 -5
- sky/jobs/constants.py +15 -8
- sky/jobs/controller.py +726 -284
- sky/jobs/file_content_utils.py +128 -0
- sky/jobs/log_gc.py +193 -0
- sky/jobs/recovery_strategy.py +250 -100
- sky/jobs/scheduler.py +271 -173
- sky/jobs/server/core.py +367 -114
- sky/jobs/server/server.py +81 -35
- sky/jobs/server/utils.py +89 -35
- sky/jobs/state.py +1498 -620
- sky/jobs/utils.py +771 -306
- sky/logs/agent.py +40 -5
- sky/logs/aws.py +9 -19
- sky/metrics/utils.py +282 -39
- sky/models.py +2 -0
- sky/optimizer.py +7 -6
- sky/provision/__init__.py +38 -1
- sky/provision/aws/config.py +34 -13
- sky/provision/aws/instance.py +5 -2
- sky/provision/azure/instance.py +5 -3
- sky/provision/common.py +22 -0
- sky/provision/cudo/instance.py +4 -3
- sky/provision/do/instance.py +4 -3
- sky/provision/docker_utils.py +112 -28
- sky/provision/fluidstack/instance.py +6 -5
- sky/provision/gcp/config.py +6 -1
- sky/provision/gcp/instance.py +4 -2
- sky/provision/hyperbolic/instance.py +4 -2
- sky/provision/instance_setup.py +66 -20
- sky/provision/kubernetes/__init__.py +2 -0
- sky/provision/kubernetes/config.py +7 -44
- sky/provision/kubernetes/constants.py +0 -1
- sky/provision/kubernetes/instance.py +609 -213
- sky/provision/kubernetes/manifests/fusermount-server-daemonset.yaml +1 -2
- sky/provision/kubernetes/network.py +12 -8
- sky/provision/kubernetes/network_utils.py +8 -25
- sky/provision/kubernetes/utils.py +422 -422
- sky/provision/kubernetes/volume.py +150 -18
- sky/provision/lambda_cloud/instance.py +16 -13
- sky/provision/nebius/instance.py +6 -2
- sky/provision/nebius/utils.py +103 -86
- sky/provision/oci/instance.py +4 -2
- sky/provision/paperspace/instance.py +4 -3
- sky/provision/primeintellect/__init__.py +10 -0
- sky/provision/primeintellect/config.py +11 -0
- sky/provision/primeintellect/instance.py +454 -0
- sky/provision/primeintellect/utils.py +398 -0
- sky/provision/provisioner.py +45 -15
- sky/provision/runpod/__init__.py +2 -0
- sky/provision/runpod/instance.py +4 -3
- sky/provision/runpod/volume.py +69 -13
- sky/provision/scp/instance.py +307 -130
- sky/provision/seeweb/__init__.py +11 -0
- sky/provision/seeweb/config.py +13 -0
- sky/provision/seeweb/instance.py +812 -0
- sky/provision/shadeform/__init__.py +11 -0
- sky/provision/shadeform/config.py +12 -0
- sky/provision/shadeform/instance.py +351 -0
- sky/provision/shadeform/shadeform_utils.py +83 -0
- sky/provision/slurm/__init__.py +12 -0
- sky/provision/slurm/config.py +13 -0
- sky/provision/slurm/instance.py +572 -0
- sky/provision/slurm/utils.py +583 -0
- sky/provision/vast/instance.py +9 -4
- sky/provision/vast/utils.py +10 -6
- sky/provision/volume.py +164 -0
- sky/provision/vsphere/common/ssl_helper.py +1 -1
- sky/provision/vsphere/common/vapiconnect.py +2 -1
- sky/provision/vsphere/common/vim_utils.py +3 -2
- sky/provision/vsphere/instance.py +8 -6
- sky/provision/vsphere/vsphere_utils.py +8 -1
- sky/resources.py +11 -3
- sky/schemas/api/responses.py +107 -6
- sky/schemas/db/global_user_state/008_skylet_ssh_tunnel_metadata.py +34 -0
- sky/schemas/db/global_user_state/009_last_activity_and_launched_at.py +89 -0
- sky/schemas/db/global_user_state/010_save_ssh_key.py +66 -0
- sky/schemas/db/global_user_state/011_is_ephemeral.py +34 -0
- sky/schemas/db/kv_cache/001_initial_schema.py +29 -0
- sky/schemas/db/serve_state/002_yaml_content.py +34 -0
- sky/schemas/db/skypilot_config/001_initial_schema.py +30 -0
- sky/schemas/db/spot_jobs/002_cluster_pool.py +3 -3
- sky/schemas/db/spot_jobs/004_job_file_contents.py +42 -0
- sky/schemas/db/spot_jobs/005_logs_gc.py +38 -0
- sky/schemas/db/spot_jobs/006_controller_pid_started_at.py +34 -0
- sky/schemas/db/spot_jobs/007_config_file_content.py +34 -0
- sky/schemas/generated/jobsv1_pb2.py +86 -0
- sky/schemas/generated/jobsv1_pb2.pyi +254 -0
- sky/schemas/generated/jobsv1_pb2_grpc.py +542 -0
- sky/schemas/generated/managed_jobsv1_pb2.py +76 -0
- sky/schemas/generated/managed_jobsv1_pb2.pyi +278 -0
- sky/schemas/generated/managed_jobsv1_pb2_grpc.py +278 -0
- sky/schemas/generated/servev1_pb2.py +58 -0
- sky/schemas/generated/servev1_pb2.pyi +115 -0
- sky/schemas/generated/servev1_pb2_grpc.py +322 -0
- sky/serve/autoscalers.py +2 -0
- sky/serve/client/impl.py +55 -21
- sky/serve/constants.py +4 -3
- sky/serve/controller.py +17 -11
- sky/serve/load_balancing_policies.py +1 -1
- sky/serve/replica_managers.py +219 -142
- sky/serve/serve_rpc_utils.py +179 -0
- sky/serve/serve_state.py +63 -54
- sky/serve/serve_utils.py +145 -109
- sky/serve/server/core.py +46 -25
- sky/serve/server/impl.py +311 -162
- sky/serve/server/server.py +21 -19
- sky/serve/service.py +84 -68
- sky/serve/service_spec.py +45 -7
- sky/server/auth/loopback.py +38 -0
- sky/server/auth/oauth2_proxy.py +12 -7
- sky/server/common.py +47 -24
- sky/server/config.py +62 -28
- sky/server/constants.py +9 -1
- sky/server/daemons.py +109 -38
- sky/server/metrics.py +76 -96
- sky/server/middleware_utils.py +166 -0
- sky/server/plugins.py +222 -0
- sky/server/requests/executor.py +384 -145
- sky/server/requests/payloads.py +83 -19
- sky/server/requests/preconditions.py +15 -13
- sky/server/requests/request_names.py +123 -0
- sky/server/requests/requests.py +511 -157
- sky/server/requests/serializers/decoders.py +48 -17
- sky/server/requests/serializers/encoders.py +102 -20
- sky/server/requests/serializers/return_value_serializers.py +60 -0
- sky/server/requests/threads.py +117 -0
- sky/server/rest.py +116 -24
- sky/server/server.py +497 -179
- sky/server/server_utils.py +30 -0
- sky/server/stream_utils.py +219 -45
- sky/server/uvicorn.py +30 -19
- sky/setup_files/MANIFEST.in +6 -1
- sky/setup_files/alembic.ini +8 -0
- sky/setup_files/dependencies.py +64 -19
- sky/setup_files/setup.py +44 -44
- sky/sky_logging.py +13 -5
- sky/skylet/attempt_skylet.py +116 -24
- sky/skylet/configs.py +3 -1
- sky/skylet/constants.py +139 -29
- sky/skylet/events.py +74 -14
- sky/skylet/executor/__init__.py +1 -0
- sky/skylet/executor/slurm.py +189 -0
- sky/skylet/job_lib.py +143 -105
- sky/skylet/log_lib.py +252 -8
- sky/skylet/log_lib.pyi +47 -7
- sky/skylet/providers/ibm/node_provider.py +12 -8
- sky/skylet/providers/ibm/vpc_provider.py +13 -12
- sky/skylet/runtime_utils.py +21 -0
- sky/skylet/services.py +524 -0
- sky/skylet/skylet.py +27 -2
- sky/skylet/subprocess_daemon.py +104 -28
- sky/skypilot_config.py +99 -79
- sky/ssh_node_pools/constants.py +12 -0
- sky/ssh_node_pools/core.py +40 -3
- sky/ssh_node_pools/deploy/__init__.py +4 -0
- sky/ssh_node_pools/deploy/deploy.py +952 -0
- sky/ssh_node_pools/deploy/tunnel_utils.py +199 -0
- sky/ssh_node_pools/deploy/utils.py +173 -0
- sky/ssh_node_pools/server.py +20 -21
- sky/{utils/kubernetes/ssh_utils.py → ssh_node_pools/utils.py} +9 -6
- sky/task.py +221 -104
- sky/templates/aws-ray.yml.j2 +1 -0
- sky/templates/azure-ray.yml.j2 +1 -0
- sky/templates/cudo-ray.yml.j2 +1 -0
- sky/templates/do-ray.yml.j2 +1 -0
- sky/templates/fluidstack-ray.yml.j2 +1 -0
- sky/templates/gcp-ray.yml.j2 +1 -0
- sky/templates/hyperbolic-ray.yml.j2 +1 -0
- sky/templates/ibm-ray.yml.j2 +2 -1
- sky/templates/jobs-controller.yaml.j2 +3 -0
- sky/templates/kubernetes-ray.yml.j2 +204 -55
- sky/templates/lambda-ray.yml.j2 +1 -0
- sky/templates/nebius-ray.yml.j2 +3 -0
- sky/templates/oci-ray.yml.j2 +1 -0
- sky/templates/paperspace-ray.yml.j2 +1 -0
- sky/templates/primeintellect-ray.yml.j2 +72 -0
- sky/templates/runpod-ray.yml.j2 +1 -0
- sky/templates/scp-ray.yml.j2 +1 -0
- sky/templates/seeweb-ray.yml.j2 +171 -0
- sky/templates/shadeform-ray.yml.j2 +73 -0
- sky/templates/slurm-ray.yml.j2 +85 -0
- sky/templates/vast-ray.yml.j2 +2 -0
- sky/templates/vsphere-ray.yml.j2 +1 -0
- sky/templates/websocket_proxy.py +188 -43
- sky/usage/usage_lib.py +16 -4
- sky/users/model.conf +1 -1
- sky/users/permission.py +84 -44
- sky/users/rbac.py +31 -3
- sky/utils/accelerator_registry.py +6 -3
- sky/utils/admin_policy_utils.py +18 -5
- sky/utils/annotations.py +128 -6
- sky/utils/asyncio_utils.py +78 -0
- sky/utils/atomic.py +1 -1
- sky/utils/auth_utils.py +153 -0
- sky/utils/cli_utils/status_utils.py +12 -7
- sky/utils/cluster_utils.py +28 -6
- sky/utils/command_runner.py +283 -30
- sky/utils/command_runner.pyi +63 -7
- sky/utils/common.py +3 -1
- sky/utils/common_utils.py +55 -7
- sky/utils/config_utils.py +1 -14
- sky/utils/context.py +127 -40
- sky/utils/context_utils.py +73 -18
- sky/utils/controller_utils.py +229 -70
- sky/utils/db/db_utils.py +95 -18
- sky/utils/db/kv_cache.py +149 -0
- sky/utils/db/migration_utils.py +24 -7
- sky/utils/env_options.py +4 -0
- sky/utils/git.py +559 -1
- sky/utils/kubernetes/create_cluster.sh +15 -30
- sky/utils/kubernetes/delete_cluster.sh +10 -7
- sky/utils/kubernetes/generate_kind_config.py +6 -66
- sky/utils/kubernetes/gpu_labeler.py +13 -3
- sky/utils/kubernetes/k8s_gpu_labeler_job.yaml +2 -1
- sky/utils/kubernetes/k8s_gpu_labeler_setup.yaml +16 -16
- sky/utils/kubernetes/kubernetes_deploy_utils.py +187 -260
- sky/utils/kubernetes/rsync_helper.sh +11 -3
- sky/utils/kubernetes/ssh-tunnel.sh +7 -376
- sky/utils/kubernetes_enums.py +7 -15
- sky/utils/lock_events.py +4 -4
- sky/utils/locks.py +128 -31
- sky/utils/log_utils.py +0 -319
- sky/utils/resource_checker.py +13 -10
- sky/utils/resources_utils.py +53 -29
- sky/utils/rich_utils.py +8 -4
- sky/utils/schemas.py +138 -52
- sky/utils/subprocess_utils.py +17 -4
- sky/utils/thread_utils.py +91 -0
- sky/utils/timeline.py +2 -1
- sky/utils/ux_utils.py +35 -1
- sky/utils/volume.py +88 -4
- sky/utils/yaml_utils.py +9 -0
- sky/volumes/client/sdk.py +48 -10
- sky/volumes/server/core.py +59 -22
- sky/volumes/server/server.py +46 -17
- sky/volumes/volume.py +54 -42
- sky/workspaces/core.py +57 -21
- sky/workspaces/server.py +13 -12
- sky_templates/README.md +3 -0
- sky_templates/__init__.py +3 -0
- sky_templates/ray/__init__.py +0 -0
- sky_templates/ray/start_cluster +183 -0
- sky_templates/ray/stop_cluster +75 -0
- {skypilot_nightly-1.0.0.dev20250905.dist-info → skypilot_nightly-1.0.0.dev20251210.dist-info}/METADATA +343 -65
- skypilot_nightly-1.0.0.dev20251210.dist-info/RECORD +629 -0
- skypilot_nightly-1.0.0.dev20251210.dist-info/top_level.txt +2 -0
- sky/client/cli/git.py +0 -549
- sky/dashboard/out/_next/static/chunks/1121-408ed10b2f9fce17.js +0 -1
- sky/dashboard/out/_next/static/chunks/1141-943efc7aff0f0c06.js +0 -1
- sky/dashboard/out/_next/static/chunks/1836-37fede578e2da5f8.js +0 -40
- sky/dashboard/out/_next/static/chunks/3015-86cabed5d4669ad0.js +0 -1
- sky/dashboard/out/_next/static/chunks/3294.c80326aec9bfed40.js +0 -6
- sky/dashboard/out/_next/static/chunks/3785.4872a2f3aa489880.js +0 -1
- sky/dashboard/out/_next/static/chunks/3850-ff4a9a69d978632b.js +0 -1
- sky/dashboard/out/_next/static/chunks/4045.b30465273dc5e468.js +0 -21
- sky/dashboard/out/_next/static/chunks/4676-9da7fdbde90b5549.js +0 -10
- sky/dashboard/out/_next/static/chunks/4725.10f7a9a5d3ea8208.js +0 -1
- sky/dashboard/out/_next/static/chunks/5339.3fda4a4010ff4e06.js +0 -51
- sky/dashboard/out/_next/static/chunks/6135-4b4d5e824b7f9d3c.js +0 -1
- sky/dashboard/out/_next/static/chunks/649.b9d7f7d10c1b8c53.js +0 -45
- sky/dashboard/out/_next/static/chunks/6856-dca7962af4814e1b.js +0 -1
- sky/dashboard/out/_next/static/chunks/6990-08b2a1cae076a943.js +0 -1
- sky/dashboard/out/_next/static/chunks/7325.b4bc99ce0892dcd5.js +0 -6
- sky/dashboard/out/_next/static/chunks/754-d0da8ab45f9509e9.js +0 -18
- sky/dashboard/out/_next/static/chunks/7669.1f5d9a402bf5cc42.js +0 -36
- sky/dashboard/out/_next/static/chunks/8969-0be3036bf86f8256.js +0 -1
- sky/dashboard/out/_next/static/chunks/9025.c12318fb6a1a9093.js +0 -6
- sky/dashboard/out/_next/static/chunks/9037-fa1737818d0a0969.js +0 -6
- sky/dashboard/out/_next/static/chunks/pages/_app-ce361c6959bc2001.js +0 -34
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-1cbba24bd1bd35f8.js +0 -16
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-0b4b35dc1dfe046c.js +0 -16
- sky/dashboard/out/_next/static/chunks/pages/clusters-469814d711d63b1b.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-dd64309c3fe67ed2.js +0 -11
- sky/dashboard/out/_next/static/chunks/pages/jobs/pools/[pool]-07349868f7905d37.js +0 -16
- sky/dashboard/out/_next/static/chunks/pages/jobs-1f70d9faa564804f.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/users-018bf31cda52e11b.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/volumes-739726d6b823f532.js +0 -1
- sky/dashboard/out/_next/static/chunks/webpack-4fe903277b57b523.js +0 -1
- sky/dashboard/out/_next/static/css/4614e06482d7309e.css +0 -3
- sky/dashboard/out/_next/static/mS-4qZPSkRuA1u-g2wQhg/_buildManifest.js +0 -1
- sky/templates/kubernetes-ssh-jump.yml.j2 +0 -94
- sky/utils/kubernetes/cleanup-tunnel.sh +0 -62
- sky/utils/kubernetes/deploy_remote_cluster.py +0 -1299
- sky/utils/kubernetes/ssh_jump_lifecycle_manager.py +0 -191
- skypilot_nightly-1.0.0.dev20250905.dist-info/RECORD +0 -547
- skypilot_nightly-1.0.0.dev20250905.dist-info/top_level.txt +0 -1
- /sky/dashboard/out/_next/static/{mS-4qZPSkRuA1u-g2wQhg → KYAhEFa3FTfq4JyKVgo-s}/_ssgManifest.js +0 -0
- {skypilot_nightly-1.0.0.dev20250905.dist-info → skypilot_nightly-1.0.0.dev20251210.dist-info}/WHEEL +0 -0
- {skypilot_nightly-1.0.0.dev20250905.dist-info → skypilot_nightly-1.0.0.dev20251210.dist-info}/entry_points.txt +0 -0
- {skypilot_nightly-1.0.0.dev20250905.dist-info → skypilot_nightly-1.0.0.dev20251210.dist-info}/licenses/LICENSE +0 -0
sky/utils/command_runner.py
CHANGED
|
@@ -3,6 +3,7 @@ import enum
|
|
|
3
3
|
import hashlib
|
|
4
4
|
import os
|
|
5
5
|
import pathlib
|
|
6
|
+
import re
|
|
6
7
|
import shlex
|
|
7
8
|
import sys
|
|
8
9
|
import time
|
|
@@ -13,6 +14,7 @@ from sky import exceptions
|
|
|
13
14
|
from sky import sky_logging
|
|
14
15
|
from sky.skylet import constants
|
|
15
16
|
from sky.skylet import log_lib
|
|
17
|
+
from sky.utils import auth_utils
|
|
16
18
|
from sky.utils import common_utils
|
|
17
19
|
from sky.utils import context_utils
|
|
18
20
|
from sky.utils import control_master_utils
|
|
@@ -22,6 +24,9 @@ from sky.utils import timeline
|
|
|
22
24
|
|
|
23
25
|
logger = sky_logging.init_logger(__name__)
|
|
24
26
|
|
|
27
|
+
# Pattern to extract home directory from command output
|
|
28
|
+
_HOME_DIR_PATTERN = re.compile(r'SKYPILOT_HOME_DIR: ([^\s\n]+)')
|
|
29
|
+
|
|
25
30
|
# Rsync options
|
|
26
31
|
# TODO(zhwu): This will print a per-file progress bar (with -P),
|
|
27
32
|
# shooting a lot of messages to the output. --info=progress2 is used
|
|
@@ -58,6 +63,22 @@ def _ssh_control_path(ssh_control_filename: Optional[str]) -> Optional[str]:
|
|
|
58
63
|
return path
|
|
59
64
|
|
|
60
65
|
|
|
66
|
+
def _is_skypilot_managed_key(key_path: str) -> bool:
|
|
67
|
+
"""Check if SSH key follows SkyPilot's managed key format.
|
|
68
|
+
|
|
69
|
+
SkyPilot-managed keys follow the pattern: ~/.sky/clients/<hash>/ssh/sky-key
|
|
70
|
+
External keys (like ~/.ssh/id_rsa) do not follow this pattern.
|
|
71
|
+
|
|
72
|
+
Args:
|
|
73
|
+
key_path: Path to the SSH private key.
|
|
74
|
+
|
|
75
|
+
Returns:
|
|
76
|
+
True if the key follows SkyPilot's managed format, False otherwise.
|
|
77
|
+
"""
|
|
78
|
+
parts = os.path.normpath(key_path).split(os.path.sep)
|
|
79
|
+
return len(parts) >= 2 and parts[-1] == 'sky-key' and parts[-2] == 'ssh'
|
|
80
|
+
|
|
81
|
+
|
|
61
82
|
# Disable sudo for root user. This is useful when the command is running in a
|
|
62
83
|
# docker container, i.e. image_id is a docker image.
|
|
63
84
|
ALIAS_SUDO_TO_EMPTY_FOR_ROOT_CMD = (
|
|
@@ -183,17 +204,25 @@ class CommandRunner:
|
|
|
183
204
|
return '-'.join(str(x) for x in self.node)
|
|
184
205
|
|
|
185
206
|
def _get_remote_home_dir(self) -> str:
|
|
186
|
-
# Use
|
|
187
|
-
#
|
|
188
|
-
#
|
|
189
|
-
|
|
190
|
-
|
|
191
|
-
|
|
192
|
-
|
|
207
|
+
# Use pattern matching to extract home directory.
|
|
208
|
+
# Some container images print MOTD when login shells start, which can
|
|
209
|
+
# contaminate command output. We use a unique pattern to extract the
|
|
210
|
+
# actual home directory reliably.
|
|
211
|
+
rc, output, stderr = self.run('echo "SKYPILOT_HOME_DIR: $(echo ~)"',
|
|
212
|
+
require_outputs=True,
|
|
213
|
+
separate_stderr=True,
|
|
214
|
+
stream_logs=False)
|
|
193
215
|
if rc != 0:
|
|
194
216
|
raise ValueError('Failed to get remote home directory: '
|
|
195
|
-
f'{
|
|
196
|
-
|
|
217
|
+
f'{output + stderr}')
|
|
218
|
+
|
|
219
|
+
# Extract home directory using pattern matching
|
|
220
|
+
home_dir_match = _HOME_DIR_PATTERN.search(output)
|
|
221
|
+
if home_dir_match:
|
|
222
|
+
remote_home_dir = home_dir_match.group(1)
|
|
223
|
+
else:
|
|
224
|
+
raise ValueError('Failed to find remote home directory identifier: '
|
|
225
|
+
f'{output + stderr}')
|
|
197
226
|
return remote_home_dir
|
|
198
227
|
|
|
199
228
|
def _get_command_to_run(
|
|
@@ -414,7 +443,6 @@ class CommandRunner:
|
|
|
414
443
|
SkyPilot but we still want to get rid of some warning messages,
|
|
415
444
|
such as SSH warnings.
|
|
416
445
|
|
|
417
|
-
|
|
418
446
|
Returns:
|
|
419
447
|
returncode
|
|
420
448
|
or
|
|
@@ -469,15 +497,19 @@ class CommandRunner:
|
|
|
469
497
|
"""Close the cached connection to the remote machine."""
|
|
470
498
|
pass
|
|
471
499
|
|
|
472
|
-
def port_forward_command(
|
|
473
|
-
|
|
474
|
-
|
|
500
|
+
def port_forward_command(
|
|
501
|
+
self,
|
|
502
|
+
port_forward: List[Tuple[int, int]],
|
|
503
|
+
connect_timeout: int = 1,
|
|
504
|
+
ssh_mode: SshMode = SshMode.INTERACTIVE) -> List[str]:
|
|
475
505
|
"""Command for forwarding ports from localhost to the remote machine.
|
|
476
506
|
|
|
477
507
|
Args:
|
|
478
508
|
port_forward: A list of ports to forward from the localhost to the
|
|
479
509
|
remote host.
|
|
480
510
|
connect_timeout: The timeout for the connection.
|
|
511
|
+
ssh_mode: The mode to use for ssh.
|
|
512
|
+
See SSHMode for more details.
|
|
481
513
|
"""
|
|
482
514
|
raise NotImplementedError
|
|
483
515
|
|
|
@@ -587,16 +619,17 @@ class SSHCommandRunner(CommandRunner):
|
|
|
587
619
|
self,
|
|
588
620
|
node: Tuple[str, int],
|
|
589
621
|
ssh_user: str,
|
|
590
|
-
ssh_private_key: str,
|
|
622
|
+
ssh_private_key: Optional[str],
|
|
591
623
|
ssh_control_name: Optional[str] = '__default__',
|
|
592
624
|
ssh_proxy_command: Optional[str] = None,
|
|
593
625
|
docker_user: Optional[str] = None,
|
|
594
626
|
disable_control_master: Optional[bool] = False,
|
|
627
|
+
port_forward_execute_remote_command: Optional[bool] = False,
|
|
595
628
|
):
|
|
596
629
|
"""Initialize SSHCommandRunner.
|
|
597
630
|
|
|
598
631
|
Example Usage:
|
|
599
|
-
runner = SSHCommandRunner(ip, ssh_user, ssh_private_key)
|
|
632
|
+
runner = SSHCommandRunner((ip, port), ssh_user, ssh_private_key)
|
|
600
633
|
runner.run('ls -l', mode=SshMode.NON_INTERACTIVE)
|
|
601
634
|
runner.rsync(source, target, up=True)
|
|
602
635
|
|
|
@@ -618,6 +651,10 @@ class SSHCommandRunner(CommandRunner):
|
|
|
618
651
|
disable_control_master: bool; specifies either or not the ssh
|
|
619
652
|
command will utilize ControlMaster. We currently disable
|
|
620
653
|
it for k8s instance.
|
|
654
|
+
port_forward_execute_remote_command: bool; specifies whether to
|
|
655
|
+
add -N to the port forwarding command. This is useful if you
|
|
656
|
+
want to run a command on the remote machine to make sure the
|
|
657
|
+
SSH tunnel is established.
|
|
621
658
|
"""
|
|
622
659
|
super().__init__(node)
|
|
623
660
|
ip, port = node
|
|
@@ -629,39 +666,72 @@ class SSHCommandRunner(CommandRunner):
|
|
|
629
666
|
self.disable_control_master = (
|
|
630
667
|
disable_control_master or
|
|
631
668
|
control_master_utils.should_disable_control_master())
|
|
669
|
+
# Ensure SSH key is available. For SkyPilot-managed keys, create from
|
|
670
|
+
# database. For external keys (e.g., Slurm clusters), verify existence.
|
|
671
|
+
if ssh_private_key is not None and _is_skypilot_managed_key(
|
|
672
|
+
ssh_private_key):
|
|
673
|
+
auth_utils.create_ssh_key_files_from_db(ssh_private_key)
|
|
674
|
+
elif ssh_private_key is not None:
|
|
675
|
+
# Externally managed key - just verify it exists
|
|
676
|
+
expanded_key_path = os.path.expanduser(ssh_private_key)
|
|
677
|
+
if not os.path.exists(expanded_key_path):
|
|
678
|
+
raise FileNotFoundError(
|
|
679
|
+
f'SSH private key not found: {expanded_key_path}')
|
|
632
680
|
if docker_user is not None:
|
|
633
681
|
assert port is None or port == 22, (
|
|
634
682
|
f'port must be None or 22 for docker_user, got {port}.')
|
|
635
|
-
#
|
|
636
|
-
|
|
637
|
-
|
|
683
|
+
# When connecting via docker, the outer SSH hop points to the
|
|
684
|
+
# container's sshd (localhost). Preserve the user proxy for the
|
|
685
|
+
# inner hop that reaches the host VM, and clear the outer proxy to
|
|
686
|
+
# avoid forwarding localhost through the jump host.
|
|
687
|
+
inner_proxy_command = ssh_proxy_command
|
|
688
|
+
inner_proxy_port = port or 22
|
|
689
|
+
self._ssh_proxy_command = None
|
|
638
690
|
self.ip = 'localhost'
|
|
639
691
|
self.ssh_user = docker_user
|
|
640
692
|
self.port = constants.DEFAULT_DOCKER_PORT
|
|
693
|
+
if inner_proxy_command is not None:
|
|
694
|
+
# Replace %h/%p placeholders with actual host values, since the
|
|
695
|
+
# final destination from the perspective of the user proxy is
|
|
696
|
+
# the host VM (ip, inner_proxy_port).
|
|
697
|
+
inner_proxy_command = inner_proxy_command.replace('%h', ip)
|
|
698
|
+
inner_proxy_command = inner_proxy_command.replace(
|
|
699
|
+
'%p', str(inner_proxy_port))
|
|
641
700
|
self._docker_ssh_proxy_command = lambda ssh: ' '.join(
|
|
642
|
-
ssh + ssh_options_list(ssh_private_key,
|
|
643
|
-
|
|
701
|
+
ssh + ssh_options_list(ssh_private_key,
|
|
702
|
+
None,
|
|
703
|
+
ssh_proxy_command=inner_proxy_command,
|
|
704
|
+
port=inner_proxy_port,
|
|
705
|
+
disable_control_master=self.
|
|
706
|
+
disable_control_master) +
|
|
707
|
+
['-W', '%h:%p', f'{ssh_user}@{ip}'])
|
|
644
708
|
else:
|
|
645
709
|
self.ip = ip
|
|
646
710
|
self.ssh_user = ssh_user
|
|
647
711
|
self.port = port
|
|
648
712
|
self._docker_ssh_proxy_command = None
|
|
713
|
+
self.port_forward_execute_remote_command = (
|
|
714
|
+
port_forward_execute_remote_command)
|
|
649
715
|
|
|
650
|
-
def port_forward_command(
|
|
651
|
-
|
|
652
|
-
|
|
716
|
+
def port_forward_command(
|
|
717
|
+
self,
|
|
718
|
+
port_forward: List[Tuple[int, int]],
|
|
719
|
+
connect_timeout: int = 1,
|
|
720
|
+
ssh_mode: SshMode = SshMode.INTERACTIVE) -> List[str]:
|
|
653
721
|
"""Command for forwarding ports from localhost to the remote machine.
|
|
654
722
|
|
|
655
723
|
Args:
|
|
656
724
|
port_forward: A list of ports to forward from the local port to the
|
|
657
725
|
remote port.
|
|
658
726
|
connect_timeout: The timeout for the ssh connection.
|
|
727
|
+
ssh_mode: The mode to use for ssh.
|
|
728
|
+
See SSHMode for more details.
|
|
659
729
|
|
|
660
730
|
Returns:
|
|
661
731
|
The command for forwarding ports from localhost to the remote
|
|
662
732
|
machine.
|
|
663
733
|
"""
|
|
664
|
-
return self.ssh_base_command(ssh_mode=
|
|
734
|
+
return self.ssh_base_command(ssh_mode=ssh_mode,
|
|
665
735
|
port_forward=port_forward,
|
|
666
736
|
connect_timeout=connect_timeout)
|
|
667
737
|
|
|
@@ -680,7 +750,11 @@ class SSHCommandRunner(CommandRunner):
|
|
|
680
750
|
for local, remote in port_forward:
|
|
681
751
|
logger.debug(
|
|
682
752
|
f'Forwarding local port {local} to remote port {remote}.')
|
|
683
|
-
|
|
753
|
+
if self.port_forward_execute_remote_command:
|
|
754
|
+
ssh += ['-L']
|
|
755
|
+
else:
|
|
756
|
+
ssh += ['-NL']
|
|
757
|
+
ssh += [f'{local}:localhost:{remote}']
|
|
684
758
|
if self._docker_ssh_proxy_command is not None:
|
|
685
759
|
docker_ssh_proxy_command = self._docker_ssh_proxy_command(ssh)
|
|
686
760
|
else:
|
|
@@ -818,6 +892,7 @@ class SSHCommandRunner(CommandRunner):
|
|
|
818
892
|
log_path: str = os.devnull,
|
|
819
893
|
stream_logs: bool = True,
|
|
820
894
|
max_retry: int = 1,
|
|
895
|
+
get_remote_home_dir: Callable[[], str] = lambda: '~',
|
|
821
896
|
) -> None:
|
|
822
897
|
"""Uses 'rsync' to sync 'source' to 'target'.
|
|
823
898
|
|
|
@@ -830,6 +905,8 @@ class SSHCommandRunner(CommandRunner):
|
|
|
830
905
|
stream_logs: Stream logs to the stdout/stderr.
|
|
831
906
|
max_retry: The maximum number of retries for the rsync command.
|
|
832
907
|
This value should be non-negative.
|
|
908
|
+
get_remote_home_dir: A callable that returns the remote home
|
|
909
|
+
directory. Defaults to '~'.
|
|
833
910
|
|
|
834
911
|
Raises:
|
|
835
912
|
exceptions.CommandError: rsync command failed.
|
|
@@ -854,7 +931,8 @@ class SSHCommandRunner(CommandRunner):
|
|
|
854
931
|
rsh_option=rsh_option,
|
|
855
932
|
log_path=log_path,
|
|
856
933
|
stream_logs=stream_logs,
|
|
857
|
-
max_retry=max_retry
|
|
934
|
+
max_retry=max_retry,
|
|
935
|
+
get_remote_home_dir=get_remote_home_dir)
|
|
858
936
|
|
|
859
937
|
|
|
860
938
|
class KubernetesCommandRunner(CommandRunner):
|
|
@@ -894,9 +972,11 @@ class KubernetesCommandRunner(CommandRunner):
|
|
|
894
972
|
else:
|
|
895
973
|
return f'pod/{self.pod_name}'
|
|
896
974
|
|
|
897
|
-
def port_forward_command(
|
|
898
|
-
|
|
899
|
-
|
|
975
|
+
def port_forward_command(
|
|
976
|
+
self,
|
|
977
|
+
port_forward: List[Tuple[int, int]],
|
|
978
|
+
connect_timeout: int = 1,
|
|
979
|
+
ssh_mode: SshMode = SshMode.INTERACTIVE) -> List[str]:
|
|
900
980
|
"""Command for forwarding ports from localhost to the remote machine.
|
|
901
981
|
|
|
902
982
|
Args:
|
|
@@ -904,14 +984,25 @@ class KubernetesCommandRunner(CommandRunner):
|
|
|
904
984
|
remote port. Currently, only one port is supported, i.e. the
|
|
905
985
|
list should have only one element.
|
|
906
986
|
connect_timeout: The timeout for the ssh connection.
|
|
987
|
+
ssh_mode: The mode to use for ssh.
|
|
988
|
+
See SshMode for more details.
|
|
907
989
|
"""
|
|
990
|
+
del ssh_mode # unused
|
|
908
991
|
assert port_forward and len(port_forward) == 1, (
|
|
909
992
|
'Only one port is supported for Kubernetes port-forward.')
|
|
910
993
|
kubectl_args = [
|
|
911
994
|
'--pod-running-timeout', f'{connect_timeout}s', '-n', self.namespace
|
|
912
995
|
]
|
|
996
|
+
# The same logic to either set `--context` to the k8s context where
|
|
997
|
+
# the sky cluster is hosted, or `--kubeconfig` to /dev/null for
|
|
998
|
+
# in-cluster k8s is used below in the `run()` method.
|
|
913
999
|
if self.context:
|
|
914
1000
|
kubectl_args += ['--context', self.context]
|
|
1001
|
+
# If context is none, it means the cluster is hosted on in-cluster k8s.
|
|
1002
|
+
# In this case, we need to set KUBECONFIG to /dev/null to avoid looking
|
|
1003
|
+
# for the cluster in whatever active context is set in the kubeconfig.
|
|
1004
|
+
else:
|
|
1005
|
+
kubectl_args += ['--kubeconfig', '/dev/null']
|
|
915
1006
|
local_port, remote_port = port_forward[0]
|
|
916
1007
|
local_port_str = f'{local_port}' if local_port is not None else ''
|
|
917
1008
|
|
|
@@ -967,7 +1058,6 @@ class KubernetesCommandRunner(CommandRunner):
|
|
|
967
1058
|
SkyPilot but we still want to get rid of some warning messages,
|
|
968
1059
|
such as SSH warnings.
|
|
969
1060
|
|
|
970
|
-
|
|
971
1061
|
Returns:
|
|
972
1062
|
returncode
|
|
973
1063
|
or
|
|
@@ -1186,3 +1276,166 @@ class LocalProcessCommandRunner(CommandRunner):
|
|
|
1186
1276
|
log_path=log_path,
|
|
1187
1277
|
stream_logs=stream_logs,
|
|
1188
1278
|
max_retry=max_retry)
|
|
1279
|
+
|
|
1280
|
+
|
|
1281
|
+
class SlurmCommandRunner(SSHCommandRunner):
|
|
1282
|
+
"""Runner for Slurm commands.
|
|
1283
|
+
|
|
1284
|
+
SlurmCommandRunner sends commands over an SSH connection, proxied through
|
|
1285
|
+
the Slurm login node, to the compute node hosting the virtual instance.
|
|
1286
|
+
"""
|
|
1287
|
+
|
|
1288
|
+
def __init__(
|
|
1289
|
+
self,
|
|
1290
|
+
node: Tuple[str, int],
|
|
1291
|
+
ssh_user: str,
|
|
1292
|
+
ssh_private_key: Optional[str],
|
|
1293
|
+
*,
|
|
1294
|
+
sky_dir: str,
|
|
1295
|
+
skypilot_runtime_dir: str,
|
|
1296
|
+
job_id: str,
|
|
1297
|
+
slurm_node: str,
|
|
1298
|
+
**kwargs,
|
|
1299
|
+
):
|
|
1300
|
+
"""Initialize SlurmCommandRunner.
|
|
1301
|
+
|
|
1302
|
+
Example Usage:
|
|
1303
|
+
runner = SlurmCommandRunner(
|
|
1304
|
+
(ip, port),
|
|
1305
|
+
ssh_user,
|
|
1306
|
+
ssh_private_key,
|
|
1307
|
+
sky_dir=sky_dir,
|
|
1308
|
+
skypilot_runtime_dir=skypilot_runtime_dir,
|
|
1309
|
+
job_id=job_id,
|
|
1310
|
+
slurm_node=slurm_node)
|
|
1311
|
+
runner.run('ls -l', mode=SshMode.NON_INTERACTIVE)
|
|
1312
|
+
runner.rsync(source, target, up=True)
|
|
1313
|
+
|
|
1314
|
+
Args:
|
|
1315
|
+
node: (ip, port) The IP address and port of the remote machine
|
|
1316
|
+
(login node).
|
|
1317
|
+
ssh_user: SSH username.
|
|
1318
|
+
ssh_private_key: Path to SSH private key.
|
|
1319
|
+
sky_dir: The private directory for the SkyPilot cluster on the
|
|
1320
|
+
Slurm cluster.
|
|
1321
|
+
skypilot_runtime_dir: The directory for the SkyPilot runtime
|
|
1322
|
+
on the Slurm cluster.
|
|
1323
|
+
job_id: The Slurm job ID for this instance.
|
|
1324
|
+
slurm_node: The Slurm node hostname for this instance
|
|
1325
|
+
(compute node).
|
|
1326
|
+
**kwargs: Additional arguments forwarded to SSHCommandRunner
|
|
1327
|
+
(e.g., ssh_proxy_command).
|
|
1328
|
+
"""
|
|
1329
|
+
super().__init__(node, ssh_user, ssh_private_key, **kwargs)
|
|
1330
|
+
self.sky_dir = sky_dir
|
|
1331
|
+
self.skypilot_runtime_dir = skypilot_runtime_dir
|
|
1332
|
+
self.job_id = job_id
|
|
1333
|
+
self.slurm_node = slurm_node
|
|
1334
|
+
|
|
1335
|
+
# Build a chained ProxyCommand that goes through the login node to reach
|
|
1336
|
+
# the compute node where the job is running.
|
|
1337
|
+
|
|
1338
|
+
# First, build SSH options to reach the login node, using the user's
|
|
1339
|
+
# existing proxy command if provided.
|
|
1340
|
+
proxy_ssh_options = ' '.join(
|
|
1341
|
+
ssh_options_list(self.ssh_private_key,
|
|
1342
|
+
None,
|
|
1343
|
+
ssh_proxy_command=self._ssh_proxy_command,
|
|
1344
|
+
port=self.port,
|
|
1345
|
+
disable_control_master=True))
|
|
1346
|
+
login_node_proxy_command = (f'ssh {proxy_ssh_options} '
|
|
1347
|
+
f'-W %h:%p {self.ssh_user}@{self.ip}')
|
|
1348
|
+
|
|
1349
|
+
# Update the proxy command to be the login node proxy, which will
|
|
1350
|
+
# be used by super().run() to reach the compute node.
|
|
1351
|
+
self._ssh_proxy_command = login_node_proxy_command
|
|
1352
|
+
# Update self.ip to target the compute node.
|
|
1353
|
+
self.ip = slurm_node
|
|
1354
|
+
# Assume the compute node's SSH port is 22.
|
|
1355
|
+
# TODO(kevin): Make this configurable if needed.
|
|
1356
|
+
self.port = 22
|
|
1357
|
+
|
|
1358
|
+
def rsync(
|
|
1359
|
+
self,
|
|
1360
|
+
source: str,
|
|
1361
|
+
target: str,
|
|
1362
|
+
*,
|
|
1363
|
+
up: bool,
|
|
1364
|
+
log_path: str = os.devnull,
|
|
1365
|
+
stream_logs: bool = True,
|
|
1366
|
+
max_retry: int = 1,
|
|
1367
|
+
) -> None:
|
|
1368
|
+
"""Rsyncs files directly to the Slurm compute node,
|
|
1369
|
+
by proxying through the Slurm login node.
|
|
1370
|
+
|
|
1371
|
+
For Slurm, files need to be accessible by compute nodes where jobs
|
|
1372
|
+
execute via srun. This means either it has to be on the compute node's
|
|
1373
|
+
local filesystem, or on a shared filesystem.
|
|
1374
|
+
"""
|
|
1375
|
+
# TODO(kevin): We can probably optimize this to skip the proxying
|
|
1376
|
+
# if the target dir is in a shared filesystem, since it will
|
|
1377
|
+
# be accessible by the compute node.
|
|
1378
|
+
|
|
1379
|
+
# Build SSH options for rsync using the ProxyCommand set up in __init__
|
|
1380
|
+
# to reach the compute node through the login node.
|
|
1381
|
+
ssh_options = ' '.join(
|
|
1382
|
+
ssh_options_list(
|
|
1383
|
+
# Assume nothing and rely on default SSH behavior when -i is
|
|
1384
|
+
# not specified.
|
|
1385
|
+
None,
|
|
1386
|
+
None,
|
|
1387
|
+
ssh_proxy_command=self._ssh_proxy_command,
|
|
1388
|
+
disable_control_master=True))
|
|
1389
|
+
rsh_option = f'ssh {ssh_options}'
|
|
1390
|
+
|
|
1391
|
+
self._rsync(
|
|
1392
|
+
source,
|
|
1393
|
+
target,
|
|
1394
|
+
# Compute node
|
|
1395
|
+
node_destination=f'{self.ssh_user}@{self.slurm_node}',
|
|
1396
|
+
up=up,
|
|
1397
|
+
rsh_option=rsh_option,
|
|
1398
|
+
log_path=log_path,
|
|
1399
|
+
stream_logs=stream_logs,
|
|
1400
|
+
max_retry=max_retry,
|
|
1401
|
+
get_remote_home_dir=lambda: self.sky_dir)
|
|
1402
|
+
|
|
1403
|
+
@timeline.event
|
|
1404
|
+
@context_utils.cancellation_guard
|
|
1405
|
+
def run(self, cmd: Union[str, List[str]],
|
|
1406
|
+
**kwargs) -> Union[int, Tuple[int, str, str]]:
|
|
1407
|
+
"""Run Slurm-supported user commands over an SSH connection.
|
|
1408
|
+
|
|
1409
|
+
Args:
|
|
1410
|
+
cmd: The Slurm-supported user command to run.
|
|
1411
|
+
|
|
1412
|
+
Returns:
|
|
1413
|
+
returncode
|
|
1414
|
+
or
|
|
1415
|
+
A tuple of (returncode, stdout, stderr).
|
|
1416
|
+
"""
|
|
1417
|
+
# Override $HOME so that each SkyPilot cluster's state is isolated
|
|
1418
|
+
# from one another. We rely on the assumption that ~ is exclusively
|
|
1419
|
+
# used by a cluster, and in Slurm that is not the case, as $HOME
|
|
1420
|
+
# could be part of a shared filesystem.
|
|
1421
|
+
# And similarly for SKY_RUNTIME_DIR. See
|
|
1422
|
+
# constants.SKY_RUNTIME_DIR_ENV_VAR_KEY for more details.
|
|
1423
|
+
#
|
|
1424
|
+
# SSH directly to the compute node instead of using srun.
|
|
1425
|
+
# This avoids Slurm's proctrack/cgroup which kills all processes
|
|
1426
|
+
# when the job step ends (including child processes launched as
|
|
1427
|
+
# a separate process group), breaking background process spawning
|
|
1428
|
+
# (e.g., JobScheduler._run_job which uses launch_new_process_tree).
|
|
1429
|
+
# Note: proctrack/cgroup is enabled by default on Nebius'
|
|
1430
|
+
# Managed Soperator.
|
|
1431
|
+
cmd = (
|
|
1432
|
+
f'export {constants.SKY_RUNTIME_DIR_ENV_VAR_KEY}='
|
|
1433
|
+
f'"{self.skypilot_runtime_dir}" && '
|
|
1434
|
+
# Set the uv cache directory to /tmp/uv_cache_$(id -u) to speed up
|
|
1435
|
+
# package installation while avoiding permission conflicts when
|
|
1436
|
+
# multiple users share the same host. Otherwise it defaults to
|
|
1437
|
+
# ~/.cache/uv.
|
|
1438
|
+
f'export UV_CACHE_DIR=/tmp/uv_cache_$(id -u) && '
|
|
1439
|
+
f'cd {self.sky_dir} && export HOME=$(pwd) && {cmd}')
|
|
1440
|
+
|
|
1441
|
+
return super().run(cmd, **kwargs)
|
sky/utils/command_runner.pyi
CHANGED
|
@@ -6,7 +6,7 @@ determine the return type based on the value of require_outputs.
|
|
|
6
6
|
"""
|
|
7
7
|
import enum
|
|
8
8
|
import typing
|
|
9
|
-
from typing import Any, Iterable, List, Optional, Tuple, Union
|
|
9
|
+
from typing import Any, Callable, Iterable, List, Optional, Tuple, Union
|
|
10
10
|
|
|
11
11
|
from typing_extensions import Literal
|
|
12
12
|
|
|
@@ -36,9 +36,9 @@ def ssh_options_list(
|
|
|
36
36
|
|
|
37
37
|
|
|
38
38
|
class SshMode(enum.Enum):
|
|
39
|
-
NON_INTERACTIVE
|
|
40
|
-
INTERACTIVE
|
|
41
|
-
LOGIN
|
|
39
|
+
NON_INTERACTIVE = ...
|
|
40
|
+
INTERACTIVE = ...
|
|
41
|
+
LOGIN = ...
|
|
42
42
|
|
|
43
43
|
|
|
44
44
|
class CommandRunner:
|
|
@@ -106,6 +106,13 @@ class CommandRunner:
|
|
|
106
106
|
max_retry: int = ...) -> None:
|
|
107
107
|
...
|
|
108
108
|
|
|
109
|
+
def port_forward_command(
|
|
110
|
+
self,
|
|
111
|
+
port_forward: List[Tuple[int, int]],
|
|
112
|
+
connect_timeout: int = 1,
|
|
113
|
+
ssh_mode: SshMode = SshMode.INTERACTIVE) -> List[str]:
|
|
114
|
+
...
|
|
115
|
+
|
|
109
116
|
@classmethod
|
|
110
117
|
def make_runner_list(cls: typing.Type[CommandRunner],
|
|
111
118
|
node_list: Iterable[Tuple[Any, ...]],
|
|
@@ -123,19 +130,22 @@ class SSHCommandRunner(CommandRunner):
|
|
|
123
130
|
ip: str
|
|
124
131
|
port: int
|
|
125
132
|
ssh_user: str
|
|
126
|
-
ssh_private_key: str
|
|
133
|
+
ssh_private_key: Optional[str]
|
|
127
134
|
ssh_control_name: Optional[str]
|
|
128
135
|
docker_user: str
|
|
129
136
|
disable_control_master: Optional[bool]
|
|
137
|
+
port_forward_execute_remote_command: Optional[bool]
|
|
130
138
|
|
|
131
139
|
def __init__(
|
|
132
140
|
self,
|
|
133
141
|
node: Tuple[str, int],
|
|
134
142
|
ssh_user: str,
|
|
135
|
-
ssh_private_key: str,
|
|
143
|
+
ssh_private_key: Optional[str],
|
|
136
144
|
ssh_control_name: Optional[str] = ...,
|
|
145
|
+
ssh_proxy_command: Optional[str] = ...,
|
|
137
146
|
docker_user: Optional[str] = ...,
|
|
138
147
|
disable_control_master: Optional[bool] = ...,
|
|
148
|
+
port_forward_execute_remote_command: Optional[bool] = ...,
|
|
139
149
|
) -> None:
|
|
140
150
|
...
|
|
141
151
|
|
|
@@ -190,6 +200,15 @@ class SSHCommandRunner(CommandRunner):
|
|
|
190
200
|
**kwargs) -> Union[Tuple[int, str, str], int]:
|
|
191
201
|
...
|
|
192
202
|
|
|
203
|
+
def ssh_base_command(
|
|
204
|
+
self,
|
|
205
|
+
*,
|
|
206
|
+
ssh_mode: SshMode,
|
|
207
|
+
port_forward: Optional[List[Tuple[int, int]]],
|
|
208
|
+
connect_timeout: Optional[int],
|
|
209
|
+
) -> List[str]:
|
|
210
|
+
...
|
|
211
|
+
|
|
193
212
|
def rsync(self,
|
|
194
213
|
source: str,
|
|
195
214
|
target: str,
|
|
@@ -197,7 +216,15 @@ class SSHCommandRunner(CommandRunner):
|
|
|
197
216
|
up: bool,
|
|
198
217
|
log_path: str = ...,
|
|
199
218
|
stream_logs: bool = ...,
|
|
200
|
-
max_retry: int =
|
|
219
|
+
max_retry: int = ...,
|
|
220
|
+
get_remote_home_dir: Callable[[], str] = ...) -> None:
|
|
221
|
+
...
|
|
222
|
+
|
|
223
|
+
def port_forward_command(
|
|
224
|
+
self,
|
|
225
|
+
port_forward: List[Tuple[int, int]],
|
|
226
|
+
connect_timeout: int = 1,
|
|
227
|
+
ssh_mode: SshMode = SshMode.INTERACTIVE) -> List[str]:
|
|
201
228
|
...
|
|
202
229
|
|
|
203
230
|
|
|
@@ -272,6 +299,35 @@ class KubernetesCommandRunner(CommandRunner):
|
|
|
272
299
|
max_retry: int = ...) -> None:
|
|
273
300
|
...
|
|
274
301
|
|
|
302
|
+
def port_forward_command(
|
|
303
|
+
self,
|
|
304
|
+
port_forward: List[Tuple[int, int]],
|
|
305
|
+
connect_timeout: int = 1,
|
|
306
|
+
ssh_mode: SshMode = SshMode.INTERACTIVE) -> List[str]:
|
|
307
|
+
...
|
|
308
|
+
|
|
309
|
+
|
|
310
|
+
class SlurmCommandRunner(SSHCommandRunner):
|
|
311
|
+
"""Runner for Slurm commands."""
|
|
312
|
+
sky_dir: str
|
|
313
|
+
skypilot_runtime_dir: str
|
|
314
|
+
job_id: str
|
|
315
|
+
slurm_node: str
|
|
316
|
+
|
|
317
|
+
def __init__(
|
|
318
|
+
self,
|
|
319
|
+
node: Tuple[str, int],
|
|
320
|
+
ssh_user: str,
|
|
321
|
+
ssh_private_key: Optional[str],
|
|
322
|
+
*,
|
|
323
|
+
sky_dir: str,
|
|
324
|
+
skypilot_runtime_dir: str,
|
|
325
|
+
job_id: str,
|
|
326
|
+
slurm_node: str,
|
|
327
|
+
**kwargs,
|
|
328
|
+
) -> None:
|
|
329
|
+
...
|
|
330
|
+
|
|
275
331
|
|
|
276
332
|
class LocalProcessCommandRunner(CommandRunner):
|
|
277
333
|
|
sky/utils/common.py
CHANGED
|
@@ -31,7 +31,7 @@ JOB_CONTROLLER_NAME: str
|
|
|
31
31
|
def refresh_server_id() -> None:
|
|
32
32
|
"""Refresh the server id.
|
|
33
33
|
|
|
34
|
-
This function is used to ensure the server id is read from the
|
|
34
|
+
This function is used to ensure the server id is read from the authoritative
|
|
35
35
|
source.
|
|
36
36
|
"""
|
|
37
37
|
global SERVER_ID
|
|
@@ -42,6 +42,8 @@ def refresh_server_id() -> None:
|
|
|
42
42
|
JOB_CONTROLLER_NAME = f'{JOB_CONTROLLER_PREFIX}{SERVER_ID}'
|
|
43
43
|
|
|
44
44
|
|
|
45
|
+
# TODO(kevin): Remove this side effect and have callers call
|
|
46
|
+
# refresh_server_id() explicitly as needed.
|
|
45
47
|
refresh_server_id()
|
|
46
48
|
|
|
47
49
|
|