skypilot-nightly 1.0.0.dev20250905__py3-none-any.whl → 1.0.0.dev20251210__py3-none-any.whl
This diff compares the contents of two publicly released versions of this package as they appear in their respective public registries. It is provided for informational purposes only.
- sky/__init__.py +12 -2
- sky/adaptors/aws.py +27 -22
- sky/adaptors/common.py +25 -2
- sky/adaptors/coreweave.py +278 -0
- sky/adaptors/do.py +8 -2
- sky/adaptors/gcp.py +11 -0
- sky/adaptors/ibm.py +5 -2
- sky/adaptors/kubernetes.py +64 -0
- sky/adaptors/nebius.py +3 -1
- sky/adaptors/primeintellect.py +1 -0
- sky/adaptors/seeweb.py +183 -0
- sky/adaptors/shadeform.py +89 -0
- sky/adaptors/slurm.py +478 -0
- sky/admin_policy.py +20 -0
- sky/authentication.py +157 -263
- sky/backends/__init__.py +3 -2
- sky/backends/backend.py +11 -3
- sky/backends/backend_utils.py +630 -185
- sky/backends/cloud_vm_ray_backend.py +1111 -928
- sky/backends/local_docker_backend.py +9 -5
- sky/backends/task_codegen.py +971 -0
- sky/backends/wheel_utils.py +18 -0
- sky/catalog/__init__.py +8 -3
- sky/catalog/aws_catalog.py +4 -0
- sky/catalog/common.py +19 -1
- sky/catalog/data_fetchers/fetch_aws.py +102 -80
- sky/catalog/data_fetchers/fetch_gcp.py +30 -3
- sky/catalog/data_fetchers/fetch_nebius.py +9 -6
- sky/catalog/data_fetchers/fetch_runpod.py +698 -0
- sky/catalog/data_fetchers/fetch_seeweb.py +329 -0
- sky/catalog/data_fetchers/fetch_shadeform.py +142 -0
- sky/catalog/kubernetes_catalog.py +36 -32
- sky/catalog/primeintellect_catalog.py +95 -0
- sky/catalog/runpod_catalog.py +5 -1
- sky/catalog/seeweb_catalog.py +184 -0
- sky/catalog/shadeform_catalog.py +165 -0
- sky/catalog/slurm_catalog.py +243 -0
- sky/check.py +87 -46
- sky/client/cli/command.py +1004 -434
- sky/client/cli/flags.py +4 -2
- sky/{volumes/utils.py → client/cli/table_utils.py} +111 -13
- sky/client/cli/utils.py +79 -0
- sky/client/common.py +12 -2
- sky/client/sdk.py +188 -65
- sky/client/sdk_async.py +34 -33
- sky/cloud_stores.py +82 -3
- sky/clouds/__init__.py +8 -0
- sky/clouds/aws.py +337 -129
- sky/clouds/azure.py +24 -18
- sky/clouds/cloud.py +47 -13
- sky/clouds/cudo.py +16 -13
- sky/clouds/do.py +9 -7
- sky/clouds/fluidstack.py +12 -5
- sky/clouds/gcp.py +14 -7
- sky/clouds/hyperbolic.py +12 -5
- sky/clouds/ibm.py +12 -5
- sky/clouds/kubernetes.py +80 -45
- sky/clouds/lambda_cloud.py +12 -5
- sky/clouds/nebius.py +23 -9
- sky/clouds/oci.py +19 -12
- sky/clouds/paperspace.py +4 -1
- sky/clouds/primeintellect.py +317 -0
- sky/clouds/runpod.py +85 -24
- sky/clouds/scp.py +12 -8
- sky/clouds/seeweb.py +477 -0
- sky/clouds/shadeform.py +400 -0
- sky/clouds/slurm.py +578 -0
- sky/clouds/ssh.py +6 -3
- sky/clouds/utils/scp_utils.py +61 -50
- sky/clouds/vast.py +43 -27
- sky/clouds/vsphere.py +14 -16
- sky/core.py +296 -195
- sky/dashboard/out/404.html +1 -1
- sky/dashboard/out/_next/static/KYAhEFa3FTfq4JyKVgo-s/_buildManifest.js +1 -0
- sky/dashboard/out/_next/static/chunks/1141-9c810f01ff4f398a.js +11 -0
- sky/dashboard/out/_next/static/chunks/1871-7e202677c42f43fe.js +6 -0
- sky/dashboard/out/_next/static/chunks/2260-7703229c33c5ebd5.js +1 -0
- sky/dashboard/out/_next/static/chunks/2369.fc20f0c2c8ed9fe7.js +15 -0
- sky/dashboard/out/_next/static/chunks/2755.edd818326d489a1d.js +26 -0
- sky/dashboard/out/_next/static/chunks/3294.ddda8c6c6f9f24dc.js +1 -0
- sky/dashboard/out/_next/static/chunks/3785.7e245f318f9d1121.js +1 -0
- sky/dashboard/out/_next/static/chunks/{6601-06114c982db410b6.js → 3800-b589397dc09c5b4e.js} +1 -1
- sky/dashboard/out/_next/static/chunks/3850-fd5696f3bbbaddae.js +1 -0
- sky/dashboard/out/_next/static/chunks/4725.172ede95d1b21022.js +1 -0
- sky/dashboard/out/_next/static/chunks/4937.a2baa2df5572a276.js +15 -0
- sky/dashboard/out/_next/static/chunks/6212-7bd06f60ba693125.js +13 -0
- sky/dashboard/out/_next/static/chunks/6856-da20c5fd999f319c.js +1 -0
- sky/dashboard/out/_next/static/chunks/6990-09cbf02d3cd518c3.js +1 -0
- sky/dashboard/out/_next/static/chunks/7359-c8d04e06886000b3.js +30 -0
- sky/dashboard/out/_next/static/chunks/7615-019513abc55b3b47.js +1 -0
- sky/dashboard/out/_next/static/chunks/8640.5b9475a2d18c5416.js +16 -0
- sky/dashboard/out/_next/static/chunks/8969-452f9d5cbdd2dc73.js +1 -0
- sky/dashboard/out/_next/static/chunks/9025.fa408f3242e9028d.js +6 -0
- sky/dashboard/out/_next/static/chunks/9353-8369df1cf105221c.js +1 -0
- sky/dashboard/out/_next/static/chunks/9360.a536cf6b1fa42355.js +31 -0
- sky/dashboard/out/_next/static/chunks/9847.3aaca6bb33455140.js +30 -0
- sky/dashboard/out/_next/static/chunks/pages/_app-68b647e26f9d2793.js +34 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-33f525539665fdfd.js +16 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-a7565f586ef86467.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters-9e5d47818b9bdadd.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/{config-dfb9bf07b13045f4.js → config-718cdc365de82689.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/infra/{[context]-6563820e094f68ca.js → [context]-12c559ec4d81fdbd.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/{infra-aabba60d57826e0f.js → infra-d187cd0413d72475.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-895847b6cf200b04.js +16 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs/pools/[pool]-8d0f4655400b4eb9.js +21 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs-e5a98f17f8513a96.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/plugins/[...slug]-4f46050ca065d8f8.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/users-2f7646eb77785a2c.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/volumes-ef19d49c6d0e8500.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/workspaces/{[name]-af76bb06dbb3954f.js → [name]-96e0f298308da7e2.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/{workspaces-7598c33a746cdc91.js → workspaces-cb4da3abe08ebf19.js} +1 -1
- sky/dashboard/out/_next/static/chunks/webpack-fba3de387ff6bb08.js +1 -0
- sky/dashboard/out/_next/static/css/c5a4cfd2600fc715.css +3 -0
- sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
- sky/dashboard/out/clusters/[cluster].html +1 -1
- sky/dashboard/out/clusters.html +1 -1
- sky/dashboard/out/config.html +1 -1
- sky/dashboard/out/index.html +1 -1
- sky/dashboard/out/infra/[context].html +1 -1
- sky/dashboard/out/infra.html +1 -1
- sky/dashboard/out/jobs/[job].html +1 -1
- sky/dashboard/out/jobs/pools/[pool].html +1 -1
- sky/dashboard/out/jobs.html +1 -1
- sky/dashboard/out/plugins/[...slug].html +1 -0
- sky/dashboard/out/users.html +1 -1
- sky/dashboard/out/volumes.html +1 -1
- sky/dashboard/out/workspace/new.html +1 -1
- sky/dashboard/out/workspaces/[name].html +1 -1
- sky/dashboard/out/workspaces.html +1 -1
- sky/data/data_utils.py +92 -1
- sky/data/mounting_utils.py +177 -30
- sky/data/storage.py +200 -19
- sky/data/storage_utils.py +10 -45
- sky/exceptions.py +18 -7
- sky/execution.py +74 -31
- sky/global_user_state.py +605 -191
- sky/jobs/__init__.py +2 -0
- sky/jobs/client/sdk.py +101 -4
- sky/jobs/client/sdk_async.py +31 -5
- sky/jobs/constants.py +15 -8
- sky/jobs/controller.py +726 -284
- sky/jobs/file_content_utils.py +128 -0
- sky/jobs/log_gc.py +193 -0
- sky/jobs/recovery_strategy.py +250 -100
- sky/jobs/scheduler.py +271 -173
- sky/jobs/server/core.py +367 -114
- sky/jobs/server/server.py +81 -35
- sky/jobs/server/utils.py +89 -35
- sky/jobs/state.py +1498 -620
- sky/jobs/utils.py +771 -306
- sky/logs/agent.py +40 -5
- sky/logs/aws.py +9 -19
- sky/metrics/utils.py +282 -39
- sky/models.py +2 -0
- sky/optimizer.py +7 -6
- sky/provision/__init__.py +38 -1
- sky/provision/aws/config.py +34 -13
- sky/provision/aws/instance.py +5 -2
- sky/provision/azure/instance.py +5 -3
- sky/provision/common.py +22 -0
- sky/provision/cudo/instance.py +4 -3
- sky/provision/do/instance.py +4 -3
- sky/provision/docker_utils.py +112 -28
- sky/provision/fluidstack/instance.py +6 -5
- sky/provision/gcp/config.py +6 -1
- sky/provision/gcp/instance.py +4 -2
- sky/provision/hyperbolic/instance.py +4 -2
- sky/provision/instance_setup.py +66 -20
- sky/provision/kubernetes/__init__.py +2 -0
- sky/provision/kubernetes/config.py +7 -44
- sky/provision/kubernetes/constants.py +0 -1
- sky/provision/kubernetes/instance.py +609 -213
- sky/provision/kubernetes/manifests/fusermount-server-daemonset.yaml +1 -2
- sky/provision/kubernetes/network.py +12 -8
- sky/provision/kubernetes/network_utils.py +8 -25
- sky/provision/kubernetes/utils.py +422 -422
- sky/provision/kubernetes/volume.py +150 -18
- sky/provision/lambda_cloud/instance.py +16 -13
- sky/provision/nebius/instance.py +6 -2
- sky/provision/nebius/utils.py +103 -86
- sky/provision/oci/instance.py +4 -2
- sky/provision/paperspace/instance.py +4 -3
- sky/provision/primeintellect/__init__.py +10 -0
- sky/provision/primeintellect/config.py +11 -0
- sky/provision/primeintellect/instance.py +454 -0
- sky/provision/primeintellect/utils.py +398 -0
- sky/provision/provisioner.py +45 -15
- sky/provision/runpod/__init__.py +2 -0
- sky/provision/runpod/instance.py +4 -3
- sky/provision/runpod/volume.py +69 -13
- sky/provision/scp/instance.py +307 -130
- sky/provision/seeweb/__init__.py +11 -0
- sky/provision/seeweb/config.py +13 -0
- sky/provision/seeweb/instance.py +812 -0
- sky/provision/shadeform/__init__.py +11 -0
- sky/provision/shadeform/config.py +12 -0
- sky/provision/shadeform/instance.py +351 -0
- sky/provision/shadeform/shadeform_utils.py +83 -0
- sky/provision/slurm/__init__.py +12 -0
- sky/provision/slurm/config.py +13 -0
- sky/provision/slurm/instance.py +572 -0
- sky/provision/slurm/utils.py +583 -0
- sky/provision/vast/instance.py +9 -4
- sky/provision/vast/utils.py +10 -6
- sky/provision/volume.py +164 -0
- sky/provision/vsphere/common/ssl_helper.py +1 -1
- sky/provision/vsphere/common/vapiconnect.py +2 -1
- sky/provision/vsphere/common/vim_utils.py +3 -2
- sky/provision/vsphere/instance.py +8 -6
- sky/provision/vsphere/vsphere_utils.py +8 -1
- sky/resources.py +11 -3
- sky/schemas/api/responses.py +107 -6
- sky/schemas/db/global_user_state/008_skylet_ssh_tunnel_metadata.py +34 -0
- sky/schemas/db/global_user_state/009_last_activity_and_launched_at.py +89 -0
- sky/schemas/db/global_user_state/010_save_ssh_key.py +66 -0
- sky/schemas/db/global_user_state/011_is_ephemeral.py +34 -0
- sky/schemas/db/kv_cache/001_initial_schema.py +29 -0
- sky/schemas/db/serve_state/002_yaml_content.py +34 -0
- sky/schemas/db/skypilot_config/001_initial_schema.py +30 -0
- sky/schemas/db/spot_jobs/002_cluster_pool.py +3 -3
- sky/schemas/db/spot_jobs/004_job_file_contents.py +42 -0
- sky/schemas/db/spot_jobs/005_logs_gc.py +38 -0
- sky/schemas/db/spot_jobs/006_controller_pid_started_at.py +34 -0
- sky/schemas/db/spot_jobs/007_config_file_content.py +34 -0
- sky/schemas/generated/jobsv1_pb2.py +86 -0
- sky/schemas/generated/jobsv1_pb2.pyi +254 -0
- sky/schemas/generated/jobsv1_pb2_grpc.py +542 -0
- sky/schemas/generated/managed_jobsv1_pb2.py +76 -0
- sky/schemas/generated/managed_jobsv1_pb2.pyi +278 -0
- sky/schemas/generated/managed_jobsv1_pb2_grpc.py +278 -0
- sky/schemas/generated/servev1_pb2.py +58 -0
- sky/schemas/generated/servev1_pb2.pyi +115 -0
- sky/schemas/generated/servev1_pb2_grpc.py +322 -0
- sky/serve/autoscalers.py +2 -0
- sky/serve/client/impl.py +55 -21
- sky/serve/constants.py +4 -3
- sky/serve/controller.py +17 -11
- sky/serve/load_balancing_policies.py +1 -1
- sky/serve/replica_managers.py +219 -142
- sky/serve/serve_rpc_utils.py +179 -0
- sky/serve/serve_state.py +63 -54
- sky/serve/serve_utils.py +145 -109
- sky/serve/server/core.py +46 -25
- sky/serve/server/impl.py +311 -162
- sky/serve/server/server.py +21 -19
- sky/serve/service.py +84 -68
- sky/serve/service_spec.py +45 -7
- sky/server/auth/loopback.py +38 -0
- sky/server/auth/oauth2_proxy.py +12 -7
- sky/server/common.py +47 -24
- sky/server/config.py +62 -28
- sky/server/constants.py +9 -1
- sky/server/daemons.py +109 -38
- sky/server/metrics.py +76 -96
- sky/server/middleware_utils.py +166 -0
- sky/server/plugins.py +222 -0
- sky/server/requests/executor.py +384 -145
- sky/server/requests/payloads.py +83 -19
- sky/server/requests/preconditions.py +15 -13
- sky/server/requests/request_names.py +123 -0
- sky/server/requests/requests.py +511 -157
- sky/server/requests/serializers/decoders.py +48 -17
- sky/server/requests/serializers/encoders.py +102 -20
- sky/server/requests/serializers/return_value_serializers.py +60 -0
- sky/server/requests/threads.py +117 -0
- sky/server/rest.py +116 -24
- sky/server/server.py +497 -179
- sky/server/server_utils.py +30 -0
- sky/server/stream_utils.py +219 -45
- sky/server/uvicorn.py +30 -19
- sky/setup_files/MANIFEST.in +6 -1
- sky/setup_files/alembic.ini +8 -0
- sky/setup_files/dependencies.py +64 -19
- sky/setup_files/setup.py +44 -44
- sky/sky_logging.py +13 -5
- sky/skylet/attempt_skylet.py +116 -24
- sky/skylet/configs.py +3 -1
- sky/skylet/constants.py +139 -29
- sky/skylet/events.py +74 -14
- sky/skylet/executor/__init__.py +1 -0
- sky/skylet/executor/slurm.py +189 -0
- sky/skylet/job_lib.py +143 -105
- sky/skylet/log_lib.py +252 -8
- sky/skylet/log_lib.pyi +47 -7
- sky/skylet/providers/ibm/node_provider.py +12 -8
- sky/skylet/providers/ibm/vpc_provider.py +13 -12
- sky/skylet/runtime_utils.py +21 -0
- sky/skylet/services.py +524 -0
- sky/skylet/skylet.py +27 -2
- sky/skylet/subprocess_daemon.py +104 -28
- sky/skypilot_config.py +99 -79
- sky/ssh_node_pools/constants.py +12 -0
- sky/ssh_node_pools/core.py +40 -3
- sky/ssh_node_pools/deploy/__init__.py +4 -0
- sky/ssh_node_pools/deploy/deploy.py +952 -0
- sky/ssh_node_pools/deploy/tunnel_utils.py +199 -0
- sky/ssh_node_pools/deploy/utils.py +173 -0
- sky/ssh_node_pools/server.py +20 -21
- sky/{utils/kubernetes/ssh_utils.py → ssh_node_pools/utils.py} +9 -6
- sky/task.py +221 -104
- sky/templates/aws-ray.yml.j2 +1 -0
- sky/templates/azure-ray.yml.j2 +1 -0
- sky/templates/cudo-ray.yml.j2 +1 -0
- sky/templates/do-ray.yml.j2 +1 -0
- sky/templates/fluidstack-ray.yml.j2 +1 -0
- sky/templates/gcp-ray.yml.j2 +1 -0
- sky/templates/hyperbolic-ray.yml.j2 +1 -0
- sky/templates/ibm-ray.yml.j2 +2 -1
- sky/templates/jobs-controller.yaml.j2 +3 -0
- sky/templates/kubernetes-ray.yml.j2 +204 -55
- sky/templates/lambda-ray.yml.j2 +1 -0
- sky/templates/nebius-ray.yml.j2 +3 -0
- sky/templates/oci-ray.yml.j2 +1 -0
- sky/templates/paperspace-ray.yml.j2 +1 -0
- sky/templates/primeintellect-ray.yml.j2 +72 -0
- sky/templates/runpod-ray.yml.j2 +1 -0
- sky/templates/scp-ray.yml.j2 +1 -0
- sky/templates/seeweb-ray.yml.j2 +171 -0
- sky/templates/shadeform-ray.yml.j2 +73 -0
- sky/templates/slurm-ray.yml.j2 +85 -0
- sky/templates/vast-ray.yml.j2 +2 -0
- sky/templates/vsphere-ray.yml.j2 +1 -0
- sky/templates/websocket_proxy.py +188 -43
- sky/usage/usage_lib.py +16 -4
- sky/users/model.conf +1 -1
- sky/users/permission.py +84 -44
- sky/users/rbac.py +31 -3
- sky/utils/accelerator_registry.py +6 -3
- sky/utils/admin_policy_utils.py +18 -5
- sky/utils/annotations.py +128 -6
- sky/utils/asyncio_utils.py +78 -0
- sky/utils/atomic.py +1 -1
- sky/utils/auth_utils.py +153 -0
- sky/utils/cli_utils/status_utils.py +12 -7
- sky/utils/cluster_utils.py +28 -6
- sky/utils/command_runner.py +283 -30
- sky/utils/command_runner.pyi +63 -7
- sky/utils/common.py +3 -1
- sky/utils/common_utils.py +55 -7
- sky/utils/config_utils.py +1 -14
- sky/utils/context.py +127 -40
- sky/utils/context_utils.py +73 -18
- sky/utils/controller_utils.py +229 -70
- sky/utils/db/db_utils.py +95 -18
- sky/utils/db/kv_cache.py +149 -0
- sky/utils/db/migration_utils.py +24 -7
- sky/utils/env_options.py +4 -0
- sky/utils/git.py +559 -1
- sky/utils/kubernetes/create_cluster.sh +15 -30
- sky/utils/kubernetes/delete_cluster.sh +10 -7
- sky/utils/kubernetes/generate_kind_config.py +6 -66
- sky/utils/kubernetes/gpu_labeler.py +13 -3
- sky/utils/kubernetes/k8s_gpu_labeler_job.yaml +2 -1
- sky/utils/kubernetes/k8s_gpu_labeler_setup.yaml +16 -16
- sky/utils/kubernetes/kubernetes_deploy_utils.py +187 -260
- sky/utils/kubernetes/rsync_helper.sh +11 -3
- sky/utils/kubernetes/ssh-tunnel.sh +7 -376
- sky/utils/kubernetes_enums.py +7 -15
- sky/utils/lock_events.py +4 -4
- sky/utils/locks.py +128 -31
- sky/utils/log_utils.py +0 -319
- sky/utils/resource_checker.py +13 -10
- sky/utils/resources_utils.py +53 -29
- sky/utils/rich_utils.py +8 -4
- sky/utils/schemas.py +138 -52
- sky/utils/subprocess_utils.py +17 -4
- sky/utils/thread_utils.py +91 -0
- sky/utils/timeline.py +2 -1
- sky/utils/ux_utils.py +35 -1
- sky/utils/volume.py +88 -4
- sky/utils/yaml_utils.py +9 -0
- sky/volumes/client/sdk.py +48 -10
- sky/volumes/server/core.py +59 -22
- sky/volumes/server/server.py +46 -17
- sky/volumes/volume.py +54 -42
- sky/workspaces/core.py +57 -21
- sky/workspaces/server.py +13 -12
- sky_templates/README.md +3 -0
- sky_templates/__init__.py +3 -0
- sky_templates/ray/__init__.py +0 -0
- sky_templates/ray/start_cluster +183 -0
- sky_templates/ray/stop_cluster +75 -0
- {skypilot_nightly-1.0.0.dev20250905.dist-info → skypilot_nightly-1.0.0.dev20251210.dist-info}/METADATA +343 -65
- skypilot_nightly-1.0.0.dev20251210.dist-info/RECORD +629 -0
- skypilot_nightly-1.0.0.dev20251210.dist-info/top_level.txt +2 -0
- sky/client/cli/git.py +0 -549
- sky/dashboard/out/_next/static/chunks/1121-408ed10b2f9fce17.js +0 -1
- sky/dashboard/out/_next/static/chunks/1141-943efc7aff0f0c06.js +0 -1
- sky/dashboard/out/_next/static/chunks/1836-37fede578e2da5f8.js +0 -40
- sky/dashboard/out/_next/static/chunks/3015-86cabed5d4669ad0.js +0 -1
- sky/dashboard/out/_next/static/chunks/3294.c80326aec9bfed40.js +0 -6
- sky/dashboard/out/_next/static/chunks/3785.4872a2f3aa489880.js +0 -1
- sky/dashboard/out/_next/static/chunks/3850-ff4a9a69d978632b.js +0 -1
- sky/dashboard/out/_next/static/chunks/4045.b30465273dc5e468.js +0 -21
- sky/dashboard/out/_next/static/chunks/4676-9da7fdbde90b5549.js +0 -10
- sky/dashboard/out/_next/static/chunks/4725.10f7a9a5d3ea8208.js +0 -1
- sky/dashboard/out/_next/static/chunks/5339.3fda4a4010ff4e06.js +0 -51
- sky/dashboard/out/_next/static/chunks/6135-4b4d5e824b7f9d3c.js +0 -1
- sky/dashboard/out/_next/static/chunks/649.b9d7f7d10c1b8c53.js +0 -45
- sky/dashboard/out/_next/static/chunks/6856-dca7962af4814e1b.js +0 -1
- sky/dashboard/out/_next/static/chunks/6990-08b2a1cae076a943.js +0 -1
- sky/dashboard/out/_next/static/chunks/7325.b4bc99ce0892dcd5.js +0 -6
- sky/dashboard/out/_next/static/chunks/754-d0da8ab45f9509e9.js +0 -18
- sky/dashboard/out/_next/static/chunks/7669.1f5d9a402bf5cc42.js +0 -36
- sky/dashboard/out/_next/static/chunks/8969-0be3036bf86f8256.js +0 -1
- sky/dashboard/out/_next/static/chunks/9025.c12318fb6a1a9093.js +0 -6
- sky/dashboard/out/_next/static/chunks/9037-fa1737818d0a0969.js +0 -6
- sky/dashboard/out/_next/static/chunks/pages/_app-ce361c6959bc2001.js +0 -34
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-1cbba24bd1bd35f8.js +0 -16
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-0b4b35dc1dfe046c.js +0 -16
- sky/dashboard/out/_next/static/chunks/pages/clusters-469814d711d63b1b.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-dd64309c3fe67ed2.js +0 -11
- sky/dashboard/out/_next/static/chunks/pages/jobs/pools/[pool]-07349868f7905d37.js +0 -16
- sky/dashboard/out/_next/static/chunks/pages/jobs-1f70d9faa564804f.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/users-018bf31cda52e11b.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/volumes-739726d6b823f532.js +0 -1
- sky/dashboard/out/_next/static/chunks/webpack-4fe903277b57b523.js +0 -1
- sky/dashboard/out/_next/static/css/4614e06482d7309e.css +0 -3
- sky/dashboard/out/_next/static/mS-4qZPSkRuA1u-g2wQhg/_buildManifest.js +0 -1
- sky/templates/kubernetes-ssh-jump.yml.j2 +0 -94
- sky/utils/kubernetes/cleanup-tunnel.sh +0 -62
- sky/utils/kubernetes/deploy_remote_cluster.py +0 -1299
- sky/utils/kubernetes/ssh_jump_lifecycle_manager.py +0 -191
- skypilot_nightly-1.0.0.dev20250905.dist-info/RECORD +0 -547
- skypilot_nightly-1.0.0.dev20250905.dist-info/top_level.txt +0 -1
- /sky/dashboard/out/_next/static/{mS-4qZPSkRuA1u-g2wQhg → KYAhEFa3FTfq4JyKVgo-s}/_ssgManifest.js +0 -0
- {skypilot_nightly-1.0.0.dev20250905.dist-info → skypilot_nightly-1.0.0.dev20251210.dist-info}/WHEEL +0 -0
- {skypilot_nightly-1.0.0.dev20250905.dist-info → skypilot_nightly-1.0.0.dev20251210.dist-info}/entry_points.txt +0 -0
- {skypilot_nightly-1.0.0.dev20250905.dist-info → skypilot_nightly-1.0.0.dev20251210.dist-info}/licenses/LICENSE +0 -0
sky/templates/seeweb-ray.yml.j2
ADDED
@@ -0,0 +1,171 @@
+cluster_name: {{ cluster_name_on_cloud }}
+
+max_workers: {{ num_nodes - 1 }}
+upscaling_speed: {{ num_nodes - 1 }}
+idle_timeout_minutes: 5
+
+{%- if docker_image is not none %}
+docker:
+  image: {{docker_image}}
+  container_name: {{docker_container_name}}
+  run_options:
+    - --ulimit nofile=1048576:1048576
+    {%- for run_option in docker_run_options %}
+    - {{run_option}}
+    {%- endfor %}
+  {%- if docker_login_config is not none %}
+  docker_login_config:
+    username: |-
+      {{docker_login_config.username}}
+    password: |-
+      {{docker_login_config.password | indent(6) }}
+    server: |-
+      {{docker_login_config.server}}
+  {%- endif %}
+{%- endif %}
+
+provider:
+  type: external
+  module: sky.provision.seeweb
+  region: "{{ region }}"
+
+auth:
+  ssh_user: ecuser
+  ssh_private_key: {{ ssh_private_key }}
+
+available_node_types:
+  ray_head_default:
+    resources: {}
+    node_config:
+      plan: {{ instance_type }}
+      image: {{ image_id }}
+      location: {{ region }}
+      {% if seeweb_gpu_config is not none %}
+      gpu: {{ seeweb_gpu_config.gpu }}
+      gpu_label: "{{ seeweb_gpu_config.gpu_label }}"
+      {% endif %}
+      disk: {{ disk_size }}
+      {% if docker_image is not none %}
+      user_customize: |
+        #!/bin/bash
+        # Auto-generated Docker installation script for Seeweb
+        LOG_FILE=/var/log/user_customize.log
+        sudo mkdir -p "$(dirname "$LOG_FILE")"
+        {
+          echo "[$(date -Is)] Cloud script: start"
+          sudo apt-get update
+          sudo apt-get install -y \
+            apt-transport-https \
+            ca-certificates \
+            curl \
+            gnupg-agent \
+            lsb-release \
+            software-properties-common
+          sudo mkdir -p /usr/share/keyrings
+          curl -fsSL https://download.docker.com/linux/ubuntu/gpg | \
+            sudo gpg --dearmor -o /usr/share/keyrings/docker-archive-keyring.gpg
+          UBU_CODENAME="$(. /etc/os-release && echo "$VERSION_CODENAME")"
+          echo "deb [arch=amd64 signed-by=/usr/share/keyrings/docker-archive-keyring.gpg] https://download.docker.com/linux/ubuntu ${UBU_CODENAME} stable" | \
+            sudo tee /etc/apt/sources.list.d/docker.list > /dev/null
+          sudo apt-get update
+          sudo apt-get install -y docker-ce docker-ce-cli containerd.io
+          echo "[$(date -Is)] Cloud script: docker installed"
+          sudo usermod -aG docker ecuser || true
+          sudo systemctl enable docker || true
+          sudo systemctl start docker || true
+          command -v docker && docker --version || echo "[$(date -Is)] docker still missing"
+          echo "[$(date -Is)] Cloud script: complete"
+        } | sudo tee -a "$LOG_FILE"
+        sudo touch /var/log/docker_install_done
+      {% endif %}
+
+head_node_type: ray_head_default
+
+# Format: `REMOTE_PATH : LOCAL_PATH`
+file_mounts: {
+  "~/.seeweb_cloud/seeweb_keys": "~/.seeweb_cloud/seeweb_keys",
+  "{{sky_ray_yaml_remote_path}}": "{{sky_ray_yaml_local_path}}",
+  "{{sky_remote_path}}/{{sky_wheel_hash}}": "{{sky_local_path}}",
+  {%- for remote_path, local_path in credentials.items() %}
+  "{{remote_path}}": "{{local_path}}",
+  {%- endfor %}
+  "~/.ssh/sky-cluster-key": "{{ssh_private_key}}",
+}
+
+rsync_exclude: []
+
+setup_commands:
+  - |
+    {%- for initial_setup_command in initial_setup_commands %}
+    {{ initial_setup_command }}
+    {%- endfor %}
+    touch ~/.bashrc;
+    echo "127.0.0.1 $(hostname)" | sudo tee -a /etc/hosts || true;
+    echo "127.0.0.1 localhost" | sudo tee -a /etc/hosts || true;
+    sudo systemctl stop unattended-upgrades || true;
+    sudo systemctl disable unattended-upgrades || true;
+    sudo apt update && sudo apt install -y patch || sudo yum install -y patch || true;
+
+    {%- if docker_image is not none %}
+    # Docker installed via cloud-init; ensure service will be started by cloud-init
+    {%- endif %}
+
+    {{ conda_installation_commands }}
+    {{ ray_skypilot_installation_commands }}
+    {{ copy_skypilot_templates_commands }}
+
+head_start_ray_commands:
+  - |
+    retry_ray() {
+      local n=0; local max=30
+      until [ $n -ge $max ]; do
+        export SKYPILOT_NUM_GPUS=0
+        command -v nvidia-smi >/dev/null 2>&1 && \
+          SKYPILOT_NUM_GPUS=$(nvidia-smi --query-gpu=index --format=csv,noheader | wc -l)
+
+        ray stop || true
+        RAY_SCHEDULER_EVENTS=0 RAY_DEDUP_LOGS=0 \
+        ray start --disable-usage-stats --head \
+          --port={{ ray_port }} --dashboard-port={{ ray_dashboard_port }} \
+          --object-manager-port=8076 \
+          --autoscaling-config=~/ray_bootstrap_config.yaml \
+          --num-gpus=$SKYPILOT_NUM_GPUS --temp-dir {{ ray_temp_dir }} && break
+
+        echo "[head] Ray failed to start ($((++n))/$max), retrying in 5s..."
+        sleep 5
+      done
+      [ $n -eq $max ] && { echo "Ray head failed"; exit 1; }
+    }
+    retry_ray
+
+worker_start_ray_commands:
+  - |
+    retry_ray() {
+      local n=0; local max=30
+      until [ $n -ge $max ]; do
+        SKYPILOT_NUM_GPUS=0
+        command -v nvidia-smi >/dev/null 2>&1 && \
+          SKYPILOT_NUM_GPUS=$(nvidia-smi --query-gpu=index --format=csv,noheader | wc -l)
+
+        ray stop || true
+        RAY_SCHEDULER_EVENTS=0 RAY_DEDUP_LOGS=0 \
+        ray start --disable-usage-stats \
+          --address=$RAY_HEAD_IP:{{ ray_port }} \
+          --object-manager-port=8076 \
+          --num-gpus=$SKYPILOT_NUM_GPUS --temp-dir {{ ray_temp_dir }} && break
+
+        echo "[worker] Ray failed to start ($((++n))/$max), retrying in 5s..."
+        sleep 5
+      done
+      [ $n -eq $max ] && { echo "Ray worker failed"; exit 1; }
+    }
+    retry_ray
+
+head_node: {}
+worker_nodes: {}
+
+head_setup_commands: []
+worker_setup_commands: []
+
+cluster_synced_files: []
+file_mounts_sync_continuously: False
sky/templates/shadeform-ray.yml.j2
ADDED
@@ -0,0 +1,73 @@
+cluster_name: {{cluster_name_on_cloud}}
+
+# The maximum number of workers nodes to launch in addition to the head node.
+max_workers: {{num_nodes - 1}}
+upscaling_speed: {{num_nodes - 1}}
+idle_timeout_minutes: 60
+
+provider:
+  type: external
+  module: sky.provision.shadeform
+  region: "{{region}}"
+  disable_launch_config_check: true
+
+auth:
+  ssh_user: shadeform
+  ssh_private_key: {{ssh_private_key}}
+  ssh_key_id: {{ssh_key_id}}
+
+available_node_types:
+  ray_head_default:
+    {%- if custom_resources %}
+    resources: {{custom_resources}}
+    {%- else %}
+    resources: {}
+    {%- endif %}
+    node_config:
+      InstanceType: {{instance_type}}
+      PublicKey: |-
+        skypilot:ssh_public_key_content
+
+head_node_type: ray_head_default
+
+# Format: `REMOTE_PATH : LOCAL_PATH`
+file_mounts: {
+  "{{sky_ray_yaml_remote_path}}": "{{sky_ray_yaml_local_path}}",
+  "{{sky_remote_path}}/{{sky_wheel_hash}}": "{{sky_local_path}}",
+  {%- for remote_path, local_path in credentials.items() %}
+  "{{remote_path}}": "{{local_path}}",
+  {%- endfor %}
+}
+
+rsync_exclude: []
+
+initialization_commands: []
+
+# List of shell commands to run to set up nodes.
+# NOTE: these are very performance-sensitive. Each new item opens/closes an SSH
+# connection, which is expensive. Try your best to co-locate commands into fewer
+# items!
+#
+# Increment the following for catching performance bugs easier:
+# current num items (num SSH connections): 1
+setup_commands:
+  # Create ~/.ssh/config file in case the file does not exist in the image.
+  # Line 'rm ..': there is another installation of pip.
+  # Line 'sudo bash ..': set the ulimit as suggested by ray docs for performance. https://docs.ray.io/en/latest/cluster/vms/user-guides/large-cluster-best-practices.html#system-configuration
+  # Line 'sudo grep ..': set the number of threads per process to unlimited to avoid ray job submit stucking issue when the number of running ray jobs increase.
+  # Line 'mkdir -p ..': disable host key check
+  # Line 'python3 -c ..': patch the buggy ray files and enable `-o allow_other` option for `goofys`
+  - {%- for initial_setup_command in initial_setup_commands %}
+    {{ initial_setup_command }}
+    {%- endfor %}
+    mkdir -p ~/.ssh; touch ~/.ssh/config; which patch > /dev/null || sudo apt install -y patch;
+    {{ conda_installation_commands }}
+    {{ ray_skypilot_installation_commands }}
+    {{ copy_skypilot_templates_commands }}
+    sudo bash -c 'rm -rf /etc/security/limits.d; echo "* soft nofile 1048576" >> /etc/security/limits.conf; echo "* hard nofile 1048576" >> /etc/security/limits.conf';
+    sudo grep -e '^DefaultTasksMax' /etc/systemd/system.conf || (sudo bash -c 'echo "DefaultTasksMax=infinity" >> /etc/systemd/system.conf'); sudo systemctl set-property user-$(id -u $(whoami)).slice TasksMax=infinity; sudo systemctl daemon-reload;
+    (grep -Pzo -q "Host \*\n StrictHostKeyChecking no" ~/.ssh/config) || printf "Host *\n StrictHostKeyChecking no\n" >> ~/.ssh/config;
+    {{ ssh_max_sessions_config }}
+
+# Command to start ray clusters are now placed in `sky.provision.instance_setup`.
+# We do not need to list it here anymore.
sky/templates/slurm-ray.yml.j2
ADDED
@@ -0,0 +1,85 @@
+cluster_name: {{cluster_name_on_cloud}}
+
+# The maximum number of workers nodes to launch in addition to the head node.
+max_workers: {{num_nodes - 1}}
+upscaling_speed: {{num_nodes - 1}}
+idle_timeout_minutes: 60
+
+provider:
+  type: external
+  module: sky.provision.slurm
+
+  cluster: {{slurm_cluster}}
+  partition: {{slurm_partition}}
+
+  ssh:
+    hostname: {{ssh_hostname}}
+    port: {{ssh_port}}
+    user: {{ssh_user}}
+    private_key: {{slurm_private_key}}
+    {% if slurm_proxy_command is not none %}
+    proxycommand: {{slurm_proxy_command | tojson }}
+    {% endif %}
+
+auth:
+  ssh_user: {{ssh_user}}
+  # TODO(jwj): Modify this tmp workaround.
+  # ssh_private_key: {{ssh_private_key}}
+  ssh_private_key: {{slurm_private_key}}
+  ssh_proxy_command: {{slurm_proxy_command | tojson }}
+
+available_node_types:
+  ray_head_default:
+    resources: {}
+    node_config:
+      # From clouds/slurm.py::Slurm.make_deploy_resources_variables.
+      instance_type: {{instance_type}}
+      disk_size: {{disk_size}}
+      cpus: {{cpus}}
+      memory: {{memory}}
+      accelerator_type: {{accelerator_type}}
+      accelerator_count: {{accelerator_count}}
+
+      # TODO: more configs that is required by the provisioner to create new
+      # instances on the FluffyCloud:
+      # sky/provision/fluffycloud/instance.py::run_instances
+
+head_node_type: ray_head_default
+
+# Format: `REMOTE_PATH : LOCAL_PATH`
+file_mounts: {
+  "{{sky_ray_yaml_remote_path}}": "{{sky_ray_yaml_local_path}}",
+  "{{sky_remote_path}}/{{sky_wheel_hash}}": "{{sky_local_path}}",
+  {%- for remote_path, local_path in credentials.items() %}
+  "{{remote_path}}": "{{local_path}}",
+  {%- endfor %}
+}
+
+rsync_exclude: []
+
+initialization_commands: []
+
+# List of shell commands to run to set up nodes.
+# NOTE: these are very performance-sensitive. Each new item opens/closes an SSH
+# connection, which is expensive. Try your best to co-locate commands into fewer
+# items!
+#
+# Increment the following for catching performance bugs easier:
+# current num items (num SSH connections): 1
+setup_commands:
+  - {%- for initial_setup_command in initial_setup_commands %}
+    {{ initial_setup_command }}
+    {%- endfor %}
+    {{ setup_sky_dirs_commands }}
+    {{ conda_installation_commands }}
+    {{ skypilot_wheel_installation_commands }}
+    {{ copy_skypilot_templates_commands }}
+
+head_node: {}
+worker_nodes: {}
+
+# These fields are required for external cloud providers.
+head_setup_commands: []
+worker_setup_commands: []
+cluster_synced_files: []
+file_mounts_sync_continuously: False
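The three templates added above (seeweb, shadeform, slurm) are Jinja2 sources that SkyPilot renders into Ray cluster YAML at provision time. As a minimal sketch of how the conditional blocks behave when rendered (assuming only the `jinja2` package; the region and image values below are placeholders, not real provider identifiers):

```python
# Render a fragment of the Seeweb template above. The values passed to
# render() are made up for illustration.
import jinja2

fragment = '''\
provider:
  type: external
  module: sky.provision.seeweb
  region: "{{ region }}"
{%- if docker_image is not none %}
docker:
  image: {{docker_image}}
{%- endif %}
'''

template = jinja2.Template(fragment, undefined=jinja2.StrictUndefined)
# Without a Docker image the whole docker: block disappears.
print(template.render(region='region-1', docker_image=None))
# With an image, the block is emitted.
print(template.render(region='region-1', docker_image='ubuntu:22.04'))
```

The `{%- ... %}` markers strip the adjacent newline, so a skipped conditional block leaves no stray blank lines in the rendered YAML.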
sky/templates/vast-ray.yml.j2
CHANGED
@@ -10,6 +10,7 @@ provider:
   module: sky.provision.vast
   region: "{{region}}"
   disable_launch_config_check: true
+  secure_only: {{secure_only}}
 
 auth:
   ssh_user: root
@@ -61,6 +62,7 @@ setup_commands:
     mkdir -p ~/.ssh; touch ~/.ssh/config; which patch > /dev/null || sudo apt install -y patch;
     {{ conda_installation_commands }}
     {{ ray_skypilot_installation_commands }}
+    {{ copy_skypilot_templates_commands }}
     sudo bash -c 'rm -rf /etc/security/limits.d; echo "* soft nofile 1048576" >> /etc/security/limits.conf; echo "* hard nofile 1048576" >> /etc/security/limits.conf';
     sudo grep -e '^DefaultTasksMax' /etc/systemd/system.conf || (sudo bash -c 'echo "DefaultTasksMax=infinity" >> /etc/systemd/system.conf'); sudo systemctl set-property user-$(id -u $(whoami)).slice TasksMax=infinity; sudo systemctl daemon-reload;
     (grep -Pzo -q "Host \*\n StrictHostKeyChecking no" ~/.ssh/config) || printf "Host *\n StrictHostKeyChecking no\n" >> ~/.ssh/config;
sky/templates/vsphere-ray.yml.j2
CHANGED
@@ -67,6 +67,7 @@ setup_commands:
     pip3 --version > /dev/null 2>&1 || (curl -sSL https://bootstrap.pypa.io/get-pip.py -o get-pip.py && python3 get-pip.py && echo "PATH=$HOME/.local/bin:$PATH" >> ~/.bashrc);
     {{ conda_installation_commands }}
     {{ ray_skypilot_installation_commands }}
+    {{ copy_skypilot_templates_commands }}
     sudo bash -c 'rm -rf /etc/security/limits.d; echo "* soft nofile 1048576" >> /etc/security/limits.conf; echo "* hard nofile 1048576" >> /etc/security/limits.conf';
     sudo grep -e '^DefaultTasksMax' /etc/systemd/system.conf || (sudo bash -c 'echo "DefaultTasksMax=infinity" >> /etc/systemd/system.conf'); sudo systemctl set-property user-$(id -u $(whoami)).slice TasksMax=infinity; sudo systemctl daemon-reload;
     mkdir -p ~/.ssh; (grep -Pzo -q "Host \*\n StrictHostKeyChecking no\n IdentityFile ~/.ssh/sky-cluster-key\n IdentityFile ~/.ssh/id_rsa" ~/.ssh/config) || printf "Host *\n StrictHostKeyChecking no\n IdentityFile ~/.ssh/sky-cluster-key\n IdentityFile ~/.ssh/id_rsa\n" >> ~/.ssh/config;
sky/templates/websocket_proxy.py
CHANGED
@@ -11,15 +11,25 @@ This script is useful for users who do not have local Kubernetes credentials.
 import asyncio
 from http.cookiejar import MozillaCookieJar
 import os
+import struct
 import sys
-
+import time
+from typing import Dict, Optional
 from urllib.request import Request
 
+import requests
 import websockets
 from websockets.asyncio.client import ClientConnection
 from websockets.asyncio.client import connect
 
+from sky import exceptions
+from sky.client import service_account_auth
+from sky.server import constants
+from sky.server.server import KubernetesSSHMessageType
+from sky.skylet import constants as skylet_constants
+
 BUFFER_SIZE = 2**16  # 64KB
+HEARTBEAT_INTERVAL_SECONDS = 10
 
 # Environment variable for a file path to the API cookie file.
 # Keep in sync with server/constants.py
@@ -28,6 +38,8 @@ API_COOKIE_FILE_ENV_VAR = 'SKYPILOT_API_COOKIE_FILE'
 # Keep in sync with server/constants.py
 API_COOKIE_FILE_DEFAULT_LOCATION = '~/.sky/cookies.txt'
 
+MAX_UNANSWERED_PINGS = 100
+
 
 def _get_cookie_header(url: str) -> Dict[str, str]:
     """Extract Cookie header value from a cookie jar for a specific URL"""
@@ -49,85 +61,218 @@ def _get_cookie_header(url: str) -> Dict[str, str]:
     return {'Cookie': cookie_header}
 
 
-async def main(url: str) -> None:
-
-
-
-
-
-
-
-
-
-
+async def main(url: str, timestamps_supported: bool, login_url: str) -> None:
+    headers = {}
+    headers.update(_get_cookie_header(url))
+    headers.update(service_account_auth.get_service_account_headers())
+    try:
+        async with connect(url, ping_interval=None,
+                           additional_headers=headers) as websocket:
+            await run_websocket_proxy(websocket, timestamps_supported)
+    except websockets.exceptions.InvalidStatus as e:
+        if e.response.status_code == 403:
+            print(str(exceptions.ApiServerAuthenticationError(login_url)),
+                  file=sys.stderr)
         else:
-
+            print(f'Error ssh into cluster: {e}', file=sys.stderr)
+        sys.exit(1)
+
+
+async def run_websocket_proxy(websocket: ClientConnection,
+                              timestamps_supported: bool) -> None:
+    if os.isatty(sys.stdin.fileno()):
+        # pylint: disable=import-outside-toplevel
+        import termios
+        import tty
+        old_settings = termios.tcgetattr(sys.stdin.fileno())
+        tty.setraw(sys.stdin.fileno())
+    else:
+        old_settings = None
+
+    try:
+        loop = asyncio.get_running_loop()
+        # Use asyncio.Stream primitives to wrap stdin and stdout, this is to
+        # avoid creating a new thread for each read/write operation
+        # excessively.
+        stdin_reader = asyncio.StreamReader()
+        protocol = asyncio.StreamReaderProtocol(stdin_reader)
+        await loop.connect_read_pipe(lambda: protocol, sys.stdin)
+        transport, protocol = await loop.connect_write_pipe(
+            asyncio.streams.FlowControlMixin, sys.stdout)  # type: ignore
+        stdout_writer = asyncio.StreamWriter(transport, protocol, None, loop)
+        # Dictionary to store last ping time for latency measurement
+        last_ping_time_dict: Optional[Dict[int, float]] = None
+        if timestamps_supported:
+            last_ping_time_dict = {}
+
+        # Use an Event to signal when websocket is closed
+        websocket_closed_event = asyncio.Event()
+        websocket_lock = asyncio.Lock()
+
+        await asyncio.gather(
+            stdin_to_websocket(stdin_reader, websocket, timestamps_supported,
+                               websocket_closed_event, websocket_lock),
+            websocket_to_stdout(websocket, stdout_writer, timestamps_supported,
+                                last_ping_time_dict, websocket_closed_event,
+                                websocket_lock),
+            latency_monitor(websocket, last_ping_time_dict,
+                            websocket_closed_event, websocket_lock),
+            return_exceptions=True)
+    finally:
+        if old_settings:
+            termios.tcsetattr(sys.stdin.fileno(), termios.TCSADRAIN,
+                              old_settings)
+
 
+async def latency_monitor(websocket: ClientConnection,
+                          last_ping_time_dict: Optional[dict],
+                          websocket_closed_event: asyncio.Event,
+                          websocket_lock: asyncio.Lock):
+    """Periodically send PING messages (type 1) to measure latency."""
+    if last_ping_time_dict is None:
+        return
+    next_id = 0
+    while not websocket_closed_event.is_set():
         try:
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+            await asyncio.sleep(HEARTBEAT_INTERVAL_SECONDS)
+            if len(last_ping_time_dict) >= MAX_UNANSWERED_PINGS:
+                # We are not getting responses, clear the dictionary so
+                # as not to grow unbounded.
+                last_ping_time_dict.clear()
+            ping_time = time.time()
+            next_id += 1
+            last_ping_time_dict[next_id] = ping_time
+            message_header_bytes = struct.pack(
+                '!BI', KubernetesSSHMessageType.PINGPONG.value, next_id)
+            try:
+                async with websocket_lock:
+                    await websocket.send(message_header_bytes)
+            except websockets.exceptions.ConnectionClosed as e:
+                # Websocket is already closed.
+                print(f'Failed to send PING message: {e}', file=sys.stderr)
+                break
+        except Exception as e:
+            print(f'Error in latency_monitor: {e}', file=sys.stderr)
+            websocket_closed_event.set()
+            raise e
 
 
 async def stdin_to_websocket(reader: asyncio.StreamReader,
-                             websocket: ClientConnection
+                             websocket: ClientConnection,
+                             timestamps_supported: bool,
+                             websocket_closed_event: asyncio.Event,
+                             websocket_lock: asyncio.Lock):
     try:
-        while
+        while not websocket_closed_event.is_set():
             # Read at most BUFFER_SIZE bytes, this not affect
             # responsiveness since it will return as soon as
            # there is at least one byte.
             # The BUFFER_SIZE is chosen to be large enough to improve
             # throughput.
             data = await reader.read(BUFFER_SIZE)
+
             if not data:
                 break
-
+            if timestamps_supported:
+                # Send message with type 0 to indicate data.
+                message_type_bytes = struct.pack(
+                    '!B', KubernetesSSHMessageType.REGULAR_DATA.value)
+                data = message_type_bytes + data
+            async with websocket_lock:
+                await websocket.send(data)
+
     except Exception as e:  # pylint: disable=broad-except
         print(f'Error in stdin_to_websocket: {e}', file=sys.stderr)
     finally:
-
+        async with websocket_lock:
+            await websocket.close()
+        websocket_closed_event.set()
 
 
 async def websocket_to_stdout(websocket: ClientConnection,
-                              writer: asyncio.StreamWriter
+                              writer: asyncio.StreamWriter,
+                              timestamps_supported: bool,
+                              last_ping_time_dict: Optional[dict],
+                              websocket_closed_event: asyncio.Event,
+                              websocket_lock: asyncio.Lock):
     try:
-        while
+        while not websocket_closed_event.is_set():
             message = await websocket.recv()
+            if (timestamps_supported and len(message) > 0 and
+                    last_ping_time_dict is not None):
+                message_type = struct.unpack('!B', message[:1])[0]
+                if message_type == KubernetesSSHMessageType.REGULAR_DATA.value:
+                    # Regular data - strip type byte and write to stdout
+                    message = message[1:]
+                elif message_type == KubernetesSSHMessageType.PINGPONG.value:
+                    # PONG response - calculate latency and send measurement
+                    if not len(message) == struct.calcsize('!BI'):
+                        raise ValueError(
+                            f'Invalid PONG message length: {len(message)}')
+                    pong_id = struct.unpack('!I', message[1:5])[0]
+                    pong_time = time.time()
+
+                    ping_time = last_ping_time_dict.pop(pong_id, None)
+
+                    if ping_time is None:
+                        continue
+
+                    latency_seconds = pong_time - ping_time
+                    latency_ms = int(latency_seconds * 1000)
+
+                    # Send latency measurement (type 2)
+                    message_type_bytes = struct.pack(
+                        '!B',
+                        KubernetesSSHMessageType.LATENCY_MEASUREMENT.value)
+                    latency_bytes = struct.pack('!Q', latency_ms)
+                    message = message_type_bytes + latency_bytes
+                    # Send to server.
+                    async with websocket_lock:
+                        await websocket.send(message)
+                    continue
+            # No timestamps support, write directly
             writer.write(message)
             await writer.drain()
     except websockets.exceptions.ConnectionClosed:
         print('WebSocket connection closed', file=sys.stderr)
     except Exception as e:  # pylint: disable=broad-except
         print(f'Error in websocket_to_stdout: {e}', file=sys.stderr)
+        raise e
+    finally:
+        async with websocket_lock:
+            await websocket.close()
+        websocket_closed_event.set()
 
 
 if __name__ == '__main__':
     server_url = sys.argv[1].strip('/')
-    if '://' not in server_url:
-        # Keep backward compatibility for legacy server URLs without protocol
-        # TODO(aylei): Remove this after 0.10.0
-        server_url = f'http://{server_url}'
 
+    disable_latency_measurement = os.environ.get(
+        skylet_constants.SSH_DISABLE_LATENCY_MEASUREMENT_ENV_VAR, '0') == '1'
+    if disable_latency_measurement:
+        timestamps_are_supported = False
+    else:
+        # TODO(aylei): remove the separate /api/health call and use the header
+        # during websocket handshake to determine the server version.
+        health_url = f'{server_url}/api/health'
+        cookie_hdr = _get_cookie_header(health_url)
+        health_response = requests.get(health_url, headers=cookie_hdr)
+        health_data = health_response.json()
+        timestamps_are_supported = int(health_data.get('api_version', 0)) > 21
+
+    # Capture the original API server URL for login hint if authentication
+    # is required.
+    _login_url = server_url
     server_proto, server_fqdn = server_url.split('://')
     websocket_proto = 'ws'
     if server_proto == 'https':
         websocket_proto = 'wss'
     server_url = f'{websocket_proto}://{server_fqdn}'
+
+    client_version_str = (f'&client_version={constants.API_VERSION}'
+                          if timestamps_are_supported else '')
+
     websocket_url = (f'{server_url}/kubernetes-pod-ssh-proxy'
-                     f'?cluster_name={sys.argv[2]}'
-
+                     f'?cluster_name={sys.argv[2]}'
+                     f'{client_version_str}')
+    asyncio.run(main(websocket_url, timestamps_are_supported, _login_url))
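The rewritten proxy above multiplexes three frame kinds over a single websocket by prefixing each frame with a type byte: regular data, PING/PONG frames packed as `'!BI'` (type byte plus a 4-byte ping id), and latency reports packed as a type byte plus a `'!Q'` millisecond count. A self-contained sketch of that framing (the 0/1/2 type values are assumptions standing in for `sky.server.server.KubernetesSSHMessageType`, which is not shown in this diff):

```python
# Minimal sketch of the websocket_proxy framing. MessageType is a stand-in
# for KubernetesSSHMessageType; the values 0/1/2 are assumed for illustration.
import enum
import struct
import time


class MessageType(enum.Enum):
    REGULAR_DATA = 0
    PINGPONG = 1
    LATENCY_MEASUREMENT = 2


def encode_ping(ping_id: int) -> bytes:
    # 1-byte type + 4-byte big-endian id, the same '!BI' layout as the proxy.
    return struct.pack('!BI', MessageType.PINGPONG.value, ping_id)


def decode_pong(frame: bytes) -> int:
    # The server echoes the PING frame back unchanged as the PONG.
    if len(frame) != struct.calcsize('!BI'):
        raise ValueError(f'Invalid PONG message length: {len(frame)}')
    _msg_type, pong_id = struct.unpack('!BI', frame)
    return pong_id


def encode_latency(latency_ms: int) -> bytes:
    # 1-byte type + 8-byte big-endian latency in milliseconds ('!Q').
    return (struct.pack('!B', MessageType.LATENCY_MEASUREMENT.value) +
            struct.pack('!Q', latency_ms))


# Round trip: record the send time per id, match the echoed id on receipt,
# then report the measured latency back to the server.
last_ping_time = {1: time.time()}
pong_id = decode_pong(encode_ping(1))
latency_ms = int((time.time() - last_ping_time.pop(pong_id)) * 1000)
print(encode_latency(latency_ms).hex())
```

A fixed-width header keeps parsing trivial on both ends: the receiver dispatches on the first byte and never has to scan the payload.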