skypilot-nightly 1.0.0.dev20250509__py3-none-any.whl → 1.0.0.dev20251107__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of skypilot-nightly might be problematic.
- sky/__init__.py +22 -6
- sky/adaptors/aws.py +25 -7
- sky/adaptors/common.py +24 -1
- sky/adaptors/coreweave.py +278 -0
- sky/adaptors/do.py +8 -2
- sky/adaptors/hyperbolic.py +8 -0
- sky/adaptors/kubernetes.py +149 -18
- sky/adaptors/nebius.py +170 -17
- sky/adaptors/primeintellect.py +1 -0
- sky/adaptors/runpod.py +68 -0
- sky/adaptors/seeweb.py +167 -0
- sky/adaptors/shadeform.py +89 -0
- sky/admin_policy.py +187 -4
- sky/authentication.py +179 -225
- sky/backends/__init__.py +4 -2
- sky/backends/backend.py +22 -9
- sky/backends/backend_utils.py +1299 -380
- sky/backends/cloud_vm_ray_backend.py +1715 -518
- sky/backends/docker_utils.py +1 -1
- sky/backends/local_docker_backend.py +11 -6
- sky/backends/wheel_utils.py +37 -9
- sky/{clouds/service_catalog → catalog}/__init__.py +21 -19
- sky/{clouds/service_catalog → catalog}/aws_catalog.py +27 -8
- sky/{clouds/service_catalog → catalog}/azure_catalog.py +10 -7
- sky/{clouds/service_catalog → catalog}/common.py +89 -48
- sky/{clouds/service_catalog → catalog}/cudo_catalog.py +8 -5
- sky/{clouds/service_catalog → catalog}/data_fetchers/analyze.py +1 -1
- sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_aws.py +30 -40
- sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_cudo.py +38 -38
- sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_gcp.py +42 -15
- sky/catalog/data_fetchers/fetch_hyperbolic.py +136 -0
- sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_lambda_cloud.py +1 -0
- sky/catalog/data_fetchers/fetch_nebius.py +335 -0
- sky/catalog/data_fetchers/fetch_runpod.py +698 -0
- sky/catalog/data_fetchers/fetch_seeweb.py +329 -0
- sky/catalog/data_fetchers/fetch_shadeform.py +142 -0
- sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_vast.py +1 -1
- sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_vsphere.py +1 -1
- sky/{clouds/service_catalog → catalog}/do_catalog.py +5 -2
- sky/{clouds/service_catalog → catalog}/fluidstack_catalog.py +6 -3
- sky/{clouds/service_catalog → catalog}/gcp_catalog.py +41 -15
- sky/catalog/hyperbolic_catalog.py +136 -0
- sky/{clouds/service_catalog → catalog}/ibm_catalog.py +9 -6
- sky/{clouds/service_catalog → catalog}/kubernetes_catalog.py +36 -24
- sky/{clouds/service_catalog → catalog}/lambda_catalog.py +9 -6
- sky/{clouds/service_catalog → catalog}/nebius_catalog.py +9 -7
- sky/{clouds/service_catalog → catalog}/oci_catalog.py +9 -6
- sky/{clouds/service_catalog → catalog}/paperspace_catalog.py +5 -2
- sky/catalog/primeintellect_catalog.py +95 -0
- sky/{clouds/service_catalog → catalog}/runpod_catalog.py +11 -4
- sky/{clouds/service_catalog → catalog}/scp_catalog.py +9 -6
- sky/catalog/seeweb_catalog.py +184 -0
- sky/catalog/shadeform_catalog.py +165 -0
- sky/catalog/ssh_catalog.py +167 -0
- sky/{clouds/service_catalog → catalog}/vast_catalog.py +6 -3
- sky/{clouds/service_catalog → catalog}/vsphere_catalog.py +5 -2
- sky/check.py +491 -203
- sky/cli.py +5 -6005
- sky/client/{cli.py → cli/command.py} +2477 -1885
- sky/client/cli/deprecation_utils.py +99 -0
- sky/client/cli/flags.py +359 -0
- sky/client/cli/table_utils.py +320 -0
- sky/client/common.py +70 -32
- sky/client/oauth.py +82 -0
- sky/client/sdk.py +1203 -297
- sky/client/sdk_async.py +833 -0
- sky/client/service_account_auth.py +47 -0
- sky/cloud_stores.py +73 -0
- sky/clouds/__init__.py +13 -0
- sky/clouds/aws.py +358 -93
- sky/clouds/azure.py +105 -83
- sky/clouds/cloud.py +127 -36
- sky/clouds/cudo.py +68 -50
- sky/clouds/do.py +66 -48
- sky/clouds/fluidstack.py +63 -44
- sky/clouds/gcp.py +339 -110
- sky/clouds/hyperbolic.py +293 -0
- sky/clouds/ibm.py +70 -49
- sky/clouds/kubernetes.py +563 -162
- sky/clouds/lambda_cloud.py +74 -54
- sky/clouds/nebius.py +206 -80
- sky/clouds/oci.py +88 -66
- sky/clouds/paperspace.py +61 -44
- sky/clouds/primeintellect.py +317 -0
- sky/clouds/runpod.py +164 -74
- sky/clouds/scp.py +89 -83
- sky/clouds/seeweb.py +466 -0
- sky/clouds/shadeform.py +400 -0
- sky/clouds/ssh.py +263 -0
- sky/clouds/utils/aws_utils.py +10 -4
- sky/clouds/utils/gcp_utils.py +87 -11
- sky/clouds/utils/oci_utils.py +38 -14
- sky/clouds/utils/scp_utils.py +177 -124
- sky/clouds/vast.py +99 -77
- sky/clouds/vsphere.py +51 -40
- sky/core.py +349 -139
- sky/dag.py +15 -0
- sky/dashboard/out/404.html +1 -1
- sky/dashboard/out/_next/static/chunks/1141-e6aa9ab418717c59.js +11 -0
- sky/dashboard/out/_next/static/chunks/1272-1ef0bf0237faccdb.js +1 -0
- sky/dashboard/out/_next/static/chunks/1871-74503c8e80fd253b.js +6 -0
- sky/dashboard/out/_next/static/chunks/2260-7703229c33c5ebd5.js +1 -0
- sky/dashboard/out/_next/static/chunks/2350.fab69e61bac57b23.js +1 -0
- sky/dashboard/out/_next/static/chunks/2369.fc20f0c2c8ed9fe7.js +15 -0
- sky/dashboard/out/_next/static/chunks/2755.fff53c4a3fcae910.js +26 -0
- sky/dashboard/out/_next/static/chunks/3294.72362fa129305b19.js +1 -0
- sky/dashboard/out/_next/static/chunks/3785.ad6adaa2a0fa9768.js +1 -0
- sky/dashboard/out/_next/static/chunks/3850-ff4a9a69d978632b.js +1 -0
- sky/dashboard/out/_next/static/chunks/3937.210053269f121201.js +1 -0
- sky/dashboard/out/_next/static/chunks/4725.a830b5c9e7867c92.js +1 -0
- sky/dashboard/out/_next/static/chunks/4937.a2baa2df5572a276.js +15 -0
- sky/dashboard/out/_next/static/chunks/5739-d67458fcb1386c92.js +8 -0
- sky/dashboard/out/_next/static/chunks/6130-2be46d70a38f1e82.js +1 -0
- sky/dashboard/out/_next/static/chunks/616-3d59f75e2ccf9321.js +39 -0
- sky/dashboard/out/_next/static/chunks/6212-7bd06f60ba693125.js +13 -0
- sky/dashboard/out/_next/static/chunks/6601-06114c982db410b6.js +1 -0
- sky/dashboard/out/_next/static/chunks/6856-ef8ba11f96d8c4a3.js +1 -0
- sky/dashboard/out/_next/static/chunks/6989-01359c57e018caa4.js +1 -0
- sky/dashboard/out/_next/static/chunks/6990-32b6e2d3822301fa.js +1 -0
- sky/dashboard/out/_next/static/chunks/7359-c8d04e06886000b3.js +30 -0
- sky/dashboard/out/_next/static/chunks/7411-b15471acd2cba716.js +41 -0
- sky/dashboard/out/_next/static/chunks/7615-3301e838e5f25772.js +1 -0
- sky/dashboard/out/_next/static/chunks/8640.5b9475a2d18c5416.js +16 -0
- sky/dashboard/out/_next/static/chunks/8969-1e4613c651bf4051.js +1 -0
- sky/dashboard/out/_next/static/chunks/9025.fa408f3242e9028d.js +6 -0
- sky/dashboard/out/_next/static/chunks/9353-cff34f7e773b2e2b.js +1 -0
- sky/dashboard/out/_next/static/chunks/9360.7310982cf5a0dc79.js +31 -0
- sky/dashboard/out/_next/static/chunks/9847.3aaca6bb33455140.js +30 -0
- sky/dashboard/out/_next/static/chunks/fd9d1056-86323a29a8f7e46a.js +1 -0
- sky/dashboard/out/_next/static/chunks/framework-cf60a09ccd051a10.js +33 -0
- sky/dashboard/out/_next/static/chunks/main-app-587214043926b3cc.js +1 -0
- sky/dashboard/out/_next/static/chunks/main-f15ccb73239a3bf1.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/_app-bde01e4a2beec258.js +34 -0
- sky/dashboard/out/_next/static/chunks/pages/_error-c66a4e8afc46f17b.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-c736ead69c2d86ec.js +16 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-a37d2063af475a1c.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters-d44859594e6f8064.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/config-dfb9bf07b13045f4.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/index-444f1804401f04ea.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/infra/[context]-c0b5935149902e6f.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/infra-aed0ea19df7cf961.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-5796e8d6aea291a0.js +16 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs/pools/[pool]-6edeb7d06032adfc.js +21 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs-479dde13399cf270.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/users-5ab3b907622cf0fe.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/volumes-b84b948ff357c43e.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/workspace/new-3f88a1c7e86a3f86.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/workspaces/[name]-c5a3eeee1c218af1.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/workspaces-22b23febb3e89ce1.js +1 -0
- sky/dashboard/out/_next/static/chunks/webpack-2679be77fc08a2f8.js +1 -0
- sky/dashboard/out/_next/static/css/0748ce22df867032.css +3 -0
- sky/dashboard/out/_next/static/zB0ed6ge_W1MDszVHhijS/_buildManifest.js +1 -0
- sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
- sky/dashboard/out/clusters/[cluster].html +1 -1
- sky/dashboard/out/clusters.html +1 -1
- sky/dashboard/out/config.html +1 -0
- sky/dashboard/out/index.html +1 -1
- sky/dashboard/out/infra/[context].html +1 -0
- sky/dashboard/out/infra.html +1 -0
- sky/dashboard/out/jobs/[job].html +1 -1
- sky/dashboard/out/jobs/pools/[pool].html +1 -0
- sky/dashboard/out/jobs.html +1 -1
- sky/dashboard/out/users.html +1 -0
- sky/dashboard/out/volumes.html +1 -0
- sky/dashboard/out/workspace/new.html +1 -0
- sky/dashboard/out/workspaces/[name].html +1 -0
- sky/dashboard/out/workspaces.html +1 -0
- sky/data/data_utils.py +137 -1
- sky/data/mounting_utils.py +269 -84
- sky/data/storage.py +1451 -1807
- sky/data/storage_utils.py +43 -57
- sky/exceptions.py +132 -2
- sky/execution.py +206 -63
- sky/global_user_state.py +2374 -586
- sky/jobs/__init__.py +5 -0
- sky/jobs/client/sdk.py +242 -65
- sky/jobs/client/sdk_async.py +143 -0
- sky/jobs/constants.py +9 -8
- sky/jobs/controller.py +839 -277
- sky/jobs/file_content_utils.py +80 -0
- sky/jobs/log_gc.py +201 -0
- sky/jobs/recovery_strategy.py +398 -152
- sky/jobs/scheduler.py +315 -189
- sky/jobs/server/core.py +829 -255
- sky/jobs/server/server.py +156 -115
- sky/jobs/server/utils.py +136 -0
- sky/jobs/state.py +2092 -701
- sky/jobs/utils.py +1242 -160
- sky/logs/__init__.py +21 -0
- sky/logs/agent.py +108 -0
- sky/logs/aws.py +243 -0
- sky/logs/gcp.py +91 -0
- sky/metrics/__init__.py +0 -0
- sky/metrics/utils.py +443 -0
- sky/models.py +78 -1
- sky/optimizer.py +164 -70
- sky/provision/__init__.py +90 -4
- sky/provision/aws/config.py +147 -26
- sky/provision/aws/instance.py +135 -50
- sky/provision/azure/instance.py +10 -5
- sky/provision/common.py +13 -1
- sky/provision/cudo/cudo_machine_type.py +1 -1
- sky/provision/cudo/cudo_utils.py +14 -8
- sky/provision/cudo/cudo_wrapper.py +72 -71
- sky/provision/cudo/instance.py +10 -6
- sky/provision/do/instance.py +10 -6
- sky/provision/do/utils.py +4 -3
- sky/provision/docker_utils.py +114 -23
- sky/provision/fluidstack/instance.py +13 -8
- sky/provision/gcp/__init__.py +1 -0
- sky/provision/gcp/config.py +301 -19
- sky/provision/gcp/constants.py +218 -0
- sky/provision/gcp/instance.py +36 -8
- sky/provision/gcp/instance_utils.py +18 -4
- sky/provision/gcp/volume_utils.py +247 -0
- sky/provision/hyperbolic/__init__.py +12 -0
- sky/provision/hyperbolic/config.py +10 -0
- sky/provision/hyperbolic/instance.py +437 -0
- sky/provision/hyperbolic/utils.py +373 -0
- sky/provision/instance_setup.py +93 -14
- sky/provision/kubernetes/__init__.py +5 -0
- sky/provision/kubernetes/config.py +9 -52
- sky/provision/kubernetes/constants.py +17 -0
- sky/provision/kubernetes/instance.py +789 -247
- sky/provision/kubernetes/manifests/fusermount-server-daemonset.yaml +1 -2
- sky/provision/kubernetes/network.py +27 -17
- sky/provision/kubernetes/network_utils.py +40 -43
- sky/provision/kubernetes/utils.py +1192 -531
- sky/provision/kubernetes/volume.py +282 -0
- sky/provision/lambda_cloud/instance.py +22 -16
- sky/provision/nebius/constants.py +50 -0
- sky/provision/nebius/instance.py +19 -6
- sky/provision/nebius/utils.py +196 -91
- sky/provision/oci/instance.py +10 -5
- sky/provision/paperspace/instance.py +10 -7
- sky/provision/paperspace/utils.py +1 -1
- sky/provision/primeintellect/__init__.py +10 -0
- sky/provision/primeintellect/config.py +11 -0
- sky/provision/primeintellect/instance.py +454 -0
- sky/provision/primeintellect/utils.py +398 -0
- sky/provision/provisioner.py +110 -36
- sky/provision/runpod/__init__.py +5 -0
- sky/provision/runpod/instance.py +27 -6
- sky/provision/runpod/utils.py +51 -18
- sky/provision/runpod/volume.py +180 -0
- sky/provision/scp/__init__.py +15 -0
- sky/provision/scp/config.py +93 -0
- sky/provision/scp/instance.py +531 -0
- sky/provision/seeweb/__init__.py +11 -0
- sky/provision/seeweb/config.py +13 -0
- sky/provision/seeweb/instance.py +807 -0
- sky/provision/shadeform/__init__.py +11 -0
- sky/provision/shadeform/config.py +12 -0
- sky/provision/shadeform/instance.py +351 -0
- sky/provision/shadeform/shadeform_utils.py +83 -0
- sky/provision/ssh/__init__.py +18 -0
- sky/provision/vast/instance.py +13 -8
- sky/provision/vast/utils.py +10 -7
- sky/provision/vsphere/common/vim_utils.py +1 -2
- sky/provision/vsphere/instance.py +15 -10
- sky/provision/vsphere/vsphere_utils.py +9 -19
- sky/py.typed +0 -0
- sky/resources.py +844 -118
- sky/schemas/__init__.py +0 -0
- sky/schemas/api/__init__.py +0 -0
- sky/schemas/api/responses.py +225 -0
- sky/schemas/db/README +4 -0
- sky/schemas/db/env.py +90 -0
- sky/schemas/db/global_user_state/001_initial_schema.py +124 -0
- sky/schemas/db/global_user_state/002_add_workspace_to_cluster_history.py +35 -0
- sky/schemas/db/global_user_state/003_fix_initial_revision.py +61 -0
- sky/schemas/db/global_user_state/004_is_managed.py +34 -0
- sky/schemas/db/global_user_state/005_cluster_event.py +32 -0
- sky/schemas/db/global_user_state/006_provision_log.py +41 -0
- sky/schemas/db/global_user_state/007_cluster_event_request_id.py +34 -0
- sky/schemas/db/global_user_state/008_skylet_ssh_tunnel_metadata.py +34 -0
- sky/schemas/db/global_user_state/009_last_activity_and_launched_at.py +89 -0
- sky/schemas/db/global_user_state/010_save_ssh_key.py +66 -0
- sky/schemas/db/script.py.mako +28 -0
- sky/schemas/db/serve_state/001_initial_schema.py +67 -0
- sky/schemas/db/skypilot_config/001_initial_schema.py +30 -0
- sky/schemas/db/spot_jobs/001_initial_schema.py +97 -0
- sky/schemas/db/spot_jobs/002_cluster_pool.py +42 -0
- sky/schemas/db/spot_jobs/003_pool_hash.py +34 -0
- sky/schemas/db/spot_jobs/004_job_file_contents.py +42 -0
- sky/schemas/db/spot_jobs/005_logs_gc.py +38 -0
- sky/schemas/generated/__init__.py +0 -0
- sky/schemas/generated/autostopv1_pb2.py +36 -0
- sky/schemas/generated/autostopv1_pb2.pyi +43 -0
- sky/schemas/generated/autostopv1_pb2_grpc.py +146 -0
- sky/schemas/generated/jobsv1_pb2.py +86 -0
- sky/schemas/generated/jobsv1_pb2.pyi +254 -0
- sky/schemas/generated/jobsv1_pb2_grpc.py +542 -0
- sky/schemas/generated/managed_jobsv1_pb2.py +74 -0
- sky/schemas/generated/managed_jobsv1_pb2.pyi +278 -0
- sky/schemas/generated/managed_jobsv1_pb2_grpc.py +278 -0
- sky/schemas/generated/servev1_pb2.py +58 -0
- sky/schemas/generated/servev1_pb2.pyi +115 -0
- sky/schemas/generated/servev1_pb2_grpc.py +322 -0
- sky/serve/autoscalers.py +357 -5
- sky/serve/client/impl.py +310 -0
- sky/serve/client/sdk.py +47 -139
- sky/serve/client/sdk_async.py +130 -0
- sky/serve/constants.py +10 -8
- sky/serve/controller.py +64 -19
- sky/serve/load_balancer.py +106 -60
- sky/serve/load_balancing_policies.py +115 -1
- sky/serve/replica_managers.py +273 -162
- sky/serve/serve_rpc_utils.py +179 -0
- sky/serve/serve_state.py +554 -251
- sky/serve/serve_utils.py +733 -220
- sky/serve/server/core.py +66 -711
- sky/serve/server/impl.py +1093 -0
- sky/serve/server/server.py +21 -18
- sky/serve/service.py +133 -48
- sky/serve/service_spec.py +135 -16
- sky/serve/spot_placer.py +3 -0
- sky/server/auth/__init__.py +0 -0
- sky/server/auth/authn.py +50 -0
- sky/server/auth/loopback.py +38 -0
- sky/server/auth/oauth2_proxy.py +200 -0
- sky/server/common.py +475 -181
- sky/server/config.py +81 -23
- sky/server/constants.py +44 -6
- sky/server/daemons.py +229 -0
- sky/server/html/token_page.html +185 -0
- sky/server/metrics.py +160 -0
- sky/server/requests/executor.py +528 -138
- sky/server/requests/payloads.py +351 -17
- sky/server/requests/preconditions.py +21 -17
- sky/server/requests/process.py +112 -29
- sky/server/requests/request_names.py +120 -0
- sky/server/requests/requests.py +817 -224
- sky/server/requests/serializers/decoders.py +82 -31
- sky/server/requests/serializers/encoders.py +140 -22
- sky/server/requests/threads.py +106 -0
- sky/server/rest.py +417 -0
- sky/server/server.py +1290 -284
- sky/server/state.py +20 -0
- sky/server/stream_utils.py +345 -57
- sky/server/uvicorn.py +217 -3
- sky/server/versions.py +270 -0
- sky/setup_files/MANIFEST.in +5 -0
- sky/setup_files/alembic.ini +156 -0
- sky/setup_files/dependencies.py +136 -31
- sky/setup_files/setup.py +44 -42
- sky/sky_logging.py +102 -5
- sky/skylet/attempt_skylet.py +1 -0
- sky/skylet/autostop_lib.py +129 -8
- sky/skylet/configs.py +27 -20
- sky/skylet/constants.py +171 -19
- sky/skylet/events.py +105 -21
- sky/skylet/job_lib.py +335 -104
- sky/skylet/log_lib.py +297 -18
- sky/skylet/log_lib.pyi +44 -1
- sky/skylet/ray_patches/__init__.py +17 -3
- sky/skylet/ray_patches/autoscaler.py.diff +18 -0
- sky/skylet/ray_patches/cli.py.diff +19 -0
- sky/skylet/ray_patches/command_runner.py.diff +17 -0
- sky/skylet/ray_patches/log_monitor.py.diff +20 -0
- sky/skylet/ray_patches/resource_demand_scheduler.py.diff +32 -0
- sky/skylet/ray_patches/updater.py.diff +18 -0
- sky/skylet/ray_patches/worker.py.diff +41 -0
- sky/skylet/services.py +564 -0
- sky/skylet/skylet.py +63 -4
- sky/skylet/subprocess_daemon.py +103 -29
- sky/skypilot_config.py +506 -99
- sky/ssh_node_pools/__init__.py +1 -0
- sky/ssh_node_pools/core.py +135 -0
- sky/ssh_node_pools/server.py +233 -0
- sky/task.py +621 -137
- sky/templates/aws-ray.yml.j2 +10 -3
- sky/templates/azure-ray.yml.j2 +1 -1
- sky/templates/do-ray.yml.j2 +1 -1
- sky/templates/gcp-ray.yml.j2 +57 -0
- sky/templates/hyperbolic-ray.yml.j2 +67 -0
- sky/templates/jobs-controller.yaml.j2 +27 -24
- sky/templates/kubernetes-loadbalancer.yml.j2 +2 -0
- sky/templates/kubernetes-ray.yml.j2 +607 -51
- sky/templates/lambda-ray.yml.j2 +1 -1
- sky/templates/nebius-ray.yml.j2 +33 -12
- sky/templates/paperspace-ray.yml.j2 +1 -1
- sky/templates/primeintellect-ray.yml.j2 +71 -0
- sky/templates/runpod-ray.yml.j2 +9 -1
- sky/templates/scp-ray.yml.j2 +3 -50
- sky/templates/seeweb-ray.yml.j2 +108 -0
- sky/templates/shadeform-ray.yml.j2 +72 -0
- sky/templates/sky-serve-controller.yaml.j2 +22 -2
- sky/templates/websocket_proxy.py +178 -18
- sky/usage/usage_lib.py +18 -11
- sky/users/__init__.py +0 -0
- sky/users/model.conf +15 -0
- sky/users/permission.py +387 -0
- sky/users/rbac.py +121 -0
- sky/users/server.py +720 -0
- sky/users/token_service.py +218 -0
- sky/utils/accelerator_registry.py +34 -5
- sky/utils/admin_policy_utils.py +84 -38
- sky/utils/annotations.py +16 -5
- sky/utils/asyncio_utils.py +78 -0
- sky/utils/auth_utils.py +153 -0
- sky/utils/benchmark_utils.py +60 -0
- sky/utils/cli_utils/status_utils.py +159 -86
- sky/utils/cluster_utils.py +31 -9
- sky/utils/command_runner.py +354 -68
- sky/utils/command_runner.pyi +93 -3
- sky/utils/common.py +35 -8
- sky/utils/common_utils.py +310 -87
- sky/utils/config_utils.py +87 -5
- sky/utils/context.py +402 -0
- sky/utils/context_utils.py +222 -0
- sky/utils/controller_utils.py +264 -89
- sky/utils/dag_utils.py +31 -12
- sky/utils/db/__init__.py +0 -0
- sky/utils/db/db_utils.py +470 -0
- sky/utils/db/migration_utils.py +133 -0
- sky/utils/directory_utils.py +12 -0
- sky/utils/env_options.py +13 -0
- sky/utils/git.py +567 -0
- sky/utils/git_clone.sh +460 -0
- sky/utils/infra_utils.py +195 -0
- sky/utils/kubernetes/cleanup-tunnel.sh +62 -0
- sky/utils/kubernetes/config_map_utils.py +133 -0
- sky/utils/kubernetes/create_cluster.sh +13 -27
- sky/utils/kubernetes/delete_cluster.sh +10 -7
- sky/utils/kubernetes/deploy_remote_cluster.py +1299 -0
- sky/utils/kubernetes/exec_kubeconfig_converter.py +22 -31
- sky/utils/kubernetes/generate_kind_config.py +6 -66
- sky/utils/kubernetes/generate_kubeconfig.sh +4 -1
- sky/utils/kubernetes/gpu_labeler.py +5 -5
- sky/utils/kubernetes/kubernetes_deploy_utils.py +354 -47
- sky/utils/kubernetes/ssh-tunnel.sh +379 -0
- sky/utils/kubernetes/ssh_utils.py +221 -0
- sky/utils/kubernetes_enums.py +8 -15
- sky/utils/lock_events.py +94 -0
- sky/utils/locks.py +368 -0
- sky/utils/log_utils.py +300 -6
- sky/utils/perf_utils.py +22 -0
- sky/utils/resource_checker.py +298 -0
- sky/utils/resources_utils.py +249 -32
- sky/utils/rich_utils.py +213 -37
- sky/utils/schemas.py +905 -147
- sky/utils/serialize_utils.py +16 -0
- sky/utils/status_lib.py +10 -0
- sky/utils/subprocess_utils.py +38 -15
- sky/utils/tempstore.py +70 -0
- sky/utils/timeline.py +24 -52
- sky/utils/ux_utils.py +84 -15
- sky/utils/validator.py +11 -1
- sky/utils/volume.py +86 -0
- sky/utils/yaml_utils.py +111 -0
- sky/volumes/__init__.py +13 -0
- sky/volumes/client/__init__.py +0 -0
- sky/volumes/client/sdk.py +149 -0
- sky/volumes/server/__init__.py +0 -0
- sky/volumes/server/core.py +258 -0
- sky/volumes/server/server.py +122 -0
- sky/volumes/volume.py +212 -0
- sky/workspaces/__init__.py +0 -0
- sky/workspaces/core.py +655 -0
- sky/workspaces/server.py +101 -0
- sky/workspaces/utils.py +56 -0
- skypilot_nightly-1.0.0.dev20251107.dist-info/METADATA +675 -0
- skypilot_nightly-1.0.0.dev20251107.dist-info/RECORD +594 -0
- {skypilot_nightly-1.0.0.dev20250509.dist-info → skypilot_nightly-1.0.0.dev20251107.dist-info}/WHEEL +1 -1
- sky/benchmark/benchmark_state.py +0 -256
- sky/benchmark/benchmark_utils.py +0 -641
- sky/clouds/service_catalog/constants.py +0 -7
- sky/dashboard/out/_next/static/LksQgChY5izXjokL3LcEu/_buildManifest.js +0 -1
- sky/dashboard/out/_next/static/chunks/236-f49500b82ad5392d.js +0 -6
- sky/dashboard/out/_next/static/chunks/312-c3c8845990db8ffc.js +0 -15
- sky/dashboard/out/_next/static/chunks/37-0a572fe0dbb89c4d.js +0 -6
- sky/dashboard/out/_next/static/chunks/678-206dddca808e6d16.js +0 -59
- sky/dashboard/out/_next/static/chunks/845-0f8017370869e269.js +0 -1
- sky/dashboard/out/_next/static/chunks/979-7bf73a4c7cea0f5c.js +0 -1
- sky/dashboard/out/_next/static/chunks/fd9d1056-2821b0f0cabcd8bd.js +0 -1
- sky/dashboard/out/_next/static/chunks/framework-87d061ee6ed71b28.js +0 -33
- sky/dashboard/out/_next/static/chunks/main-app-241eb28595532291.js +0 -1
- sky/dashboard/out/_next/static/chunks/main-e0e2335212e72357.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/_app-e6b013bc3f77ad60.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/_error-1be831200e60c5c0.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-e15db85d0ea1fbe1.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-f383db7389368ea7.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/clusters-a93b93e10b8b074e.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/index-f9f039532ca8cbc4.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-03f279c6741fb48b.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/jobs-a75029b67aab6a2e.js +0 -1
- sky/dashboard/out/_next/static/chunks/webpack-830f59b8404e96b8.js +0 -1
- sky/dashboard/out/_next/static/css/c6933bbb2ce7f4dd.css +0 -3
- sky/jobs/dashboard/dashboard.py +0 -223
- sky/jobs/dashboard/static/favicon.ico +0 -0
- sky/jobs/dashboard/templates/index.html +0 -831
- sky/jobs/server/dashboard_utils.py +0 -69
- sky/skylet/providers/scp/__init__.py +0 -2
- sky/skylet/providers/scp/config.py +0 -149
- sky/skylet/providers/scp/node_provider.py +0 -578
- sky/templates/kubernetes-ssh-jump.yml.j2 +0 -94
- sky/utils/db_utils.py +0 -100
- sky/utils/kubernetes/deploy_remote_cluster.sh +0 -308
- sky/utils/kubernetes/ssh_jump_lifecycle_manager.py +0 -191
- skypilot_nightly-1.0.0.dev20250509.dist-info/METADATA +0 -361
- skypilot_nightly-1.0.0.dev20250509.dist-info/RECORD +0 -396
- /sky/{clouds/service_catalog → catalog}/config.py +0 -0
- /sky/{benchmark → catalog/data_fetchers}/__init__.py +0 -0
- /sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_azure.py +0 -0
- /sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_fluidstack.py +0 -0
- /sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_ibm.py +0 -0
- /sky/{clouds/service_catalog/data_fetchers → client/cli}/__init__.py +0 -0
- /sky/dashboard/out/_next/static/{LksQgChY5izXjokL3LcEu → zB0ed6ge_W1MDszVHhijS}/_ssgManifest.js +0 -0
- {skypilot_nightly-1.0.0.dev20250509.dist-info → skypilot_nightly-1.0.0.dev20251107.dist-info}/entry_points.txt +0 -0
- {skypilot_nightly-1.0.0.dev20250509.dist-info → skypilot_nightly-1.0.0.dev20251107.dist-info}/licenses/LICENSE +0 -0
- {skypilot_nightly-1.0.0.dev20250509.dist-info → skypilot_nightly-1.0.0.dev20251107.dist-info}/top_level.txt +0 -0
sky/utils/kubernetes/ssh-tunnel.sh
@@ -0,0 +1,379 @@
+#!/bin/bash
+# ssh-tunnel.sh - SSH tunnel script for Kubernetes API access
+# Used as kubectl exec credential plugin to establish SSH tunnel on demand.
+# Returns a valid credential format for kubectl with expiration. The expiration
+# is calculated based on the TTL argument and is required to force kubectl to
+# check the tunnel status frequently.
+
+# Usage: ssh-tunnel.sh --host HOST [--user USER] [--use-ssh-config] [--ssh-key KEY] [--context CONTEXT] [--port PORT] [--ttl SECONDS]
+
+# Default time-to-live for credential in seconds
+# This forces kubectl to check the tunnel status frequently
+TTL_SECONDS=30
+
+# Parse arguments
+USE_SSH_CONFIG=0
+SSH_KEY=""
+CONTEXT=""
+HOST=""
+USER=""
+PORT=6443 # Default port if not specified
+
+# Debug log to ~/.sky/ssh_node_pools_info/$CONTEXT-tunnel.log
+debug_log() {
+  local message="$(date): $1"
+  echo "$message" >> "$LOG_FILE"
+}
+
+# Generate expiration timestamp for credential
+generate_expiration_timestamp() {
+  # Try macOS date format first, fallback to Linux format
+  date -u -v+${TTL_SECONDS}S +"%Y-%m-%dT%H:%M:%SZ" 2>/dev/null || date -u -d "+${TTL_SECONDS} seconds" +"%Y-%m-%dT%H:%M:%SZ"
+}
+
+# Acquire the lock, return 0 if successful, 1 if another process is already holding the lock
+acquire_lock() {
+  # Check for flock command
+  if ! command -v flock >/dev/null 2>&1; then
+    debug_log "flock command not available, using alternative lock mechanism"
+    # Simple file-based locking
+    if [ -f "$LOCK_FILE" ]; then
+      lock_pid=$(cat "$LOCK_FILE" 2>/dev/null)
+      if [ -n "$lock_pid" ] && kill -0 "$lock_pid" 2>/dev/null; then
+        debug_log "Another process ($lock_pid) is starting the tunnel, waiting briefly"
+        return 1
+      else
+        # Stale lock file
+        debug_log "Removing stale lock file"
+        rm -f "$LOCK_FILE"
+      fi
+    fi
+    # Create our lock
+    echo $$ > "$LOCK_FILE"
+    return 0
+  else
+    # Use flock for better locking
+    exec 9>"$LOCK_FILE"
+    if ! flock -n 9; then
+      debug_log "Another process is starting the tunnel, waiting briefly"
+      return 1
+    fi
+    return 0
+  fi
+}
+
+# Release the lock
+release_lock() {
+  if command -v flock >/dev/null 2>&1; then
+    # Using flock
+    exec 9>&- # Close file descriptor to release lock
+  else
+    # Using simple lock
+    rm -f "$LOCK_FILE"
+  fi
+  debug_log "Lock released"
+}
+
+# Generate SSH command based on available tools and parameters
+generate_ssh_command() {
+  # Check for autossh
+  if ! command -v autossh >/dev/null 2>&1; then
+    debug_log "WARNING: autossh is not installed but recommended for reliable SSH tunnels"
+    debug_log "Install autossh: brew install autossh (macOS), apt-get install autossh (Ubuntu/Debian)"
+
+    # Fall back to regular ssh
+    if [[ $USE_SSH_CONFIG -eq 1 ]]; then
+      SSH_CMD=("ssh" "-o" "ServerAliveInterval=30" "-o" "ServerAliveCountMax=3" "-o" "ExitOnForwardFailure=yes" "-L" "$PORT:127.0.0.1:6443" "-N" "$HOST")
+    else
+      SSH_CMD=("ssh" "-o" "StrictHostKeyChecking=no" "-o" "IdentitiesOnly=yes" "-o" "ServerAliveInterval=30" "-o" "ServerAliveCountMax=3" "-o" "ExitOnForwardFailure=yes" "-L" "$PORT:127.0.0.1:6443" "-N")
+
+      # Add SSH key if provided
+      if [[ -n "$SSH_KEY" ]]; then
+        SSH_CMD+=("-i" "$SSH_KEY")
+      fi
+
+      # Add user@host
+      SSH_CMD+=("$USER@$HOST")
+    fi
+  else
+    # Configure autossh
+    if [[ $USE_SSH_CONFIG -eq 1 ]]; then
+      SSH_CMD=("autossh" "-M" "0" "-o" "ServerAliveInterval=30" "-o" "ServerAliveCountMax=3" "-o" "ExitOnForwardFailure=yes" "-L" "$PORT:127.0.0.1:6443" "-N" "$HOST")
+    else
+      SSH_CMD=("autossh" "-M" "0" "-o" "StrictHostKeyChecking=no" "-o" "IdentitiesOnly=yes" "-o" "ServerAliveInterval=30" "-o" "ServerAliveCountMax=3" "-o" "ExitOnForwardFailure=yes" "-L" "$PORT:127.0.0.1:6443" "-N")
+
+      # Add SSH key if provided
+      if [[ -n "$SSH_KEY" ]]; then
+        SSH_CMD+=("-i" "$SSH_KEY")
+      fi
+
+      # Add user@host
+      SSH_CMD+=("$USER@$HOST")
+    fi
+  fi
+}
+
+# Function to read certificate files if they exist
+read_certificate_data() {
+  local client_cert_file="$TUNNEL_DIR/$CONTEXT-cert.pem"
+  local client_key_file="$TUNNEL_DIR/$CONTEXT-key.pem"
+  local cert_data=""
+  local key_data=""
+
+  if [[ -f "$client_cert_file" ]]; then
+    # Read the certificate file as is - it's already in PEM format
+    cert_data=$(cat "$client_cert_file")
+    debug_log "Found client certificate data for context $CONTEXT"
+
+    # Log the first and last few characters to verify PEM format
+    local cert_start=$(head -1 "$client_cert_file")
+    local cert_end=$(tail -1 "$client_cert_file")
+    debug_log "Certificate starts with: $cert_start"
+    debug_log "Certificate ends with: $cert_end"
+
+    # Check if it has proper PEM format
+    if ! grep -q "BEGIN CERTIFICATE" "$client_cert_file" || ! grep -q "END CERTIFICATE" "$client_cert_file"; then
+      debug_log "WARNING: Certificate file may not be in proper PEM format"
+      # Try to fix it if needed
+      if ! grep -q "BEGIN CERTIFICATE" "$client_cert_file"; then
+        echo "-----BEGIN CERTIFICATE-----" > "$client_cert_file.fixed"
+        cat "$client_cert_file" >> "$client_cert_file.fixed"
+        echo "-----END CERTIFICATE-----" >> "$client_cert_file.fixed"
+        mv "$client_cert_file.fixed" "$client_cert_file"
+        cert_data=$(cat "$client_cert_file")
+        debug_log "Fixed certificate format by adding BEGIN/END markers"
+      fi
+    fi
+  fi
+
+  if [[ -f "$client_key_file" ]]; then
+    # Read the key file as is - it's already in PEM format
+    key_data=$(cat "$client_key_file")
+    debug_log "Found client key data for context $CONTEXT"
+
+    # Log the first and last few characters to verify PEM format
+    local key_start=$(head -1 "$client_key_file")
+    local key_end=$(tail -1 "$client_key_file")
+    debug_log "Key starts with: $key_start"
+    debug_log "Key ends with: $key_end"
+
+    # Check if it has proper PEM format
+    if ! grep -q "BEGIN" "$client_key_file" || ! grep -q "END" "$client_key_file"; then
+      debug_log "WARNING: Key file may not be in proper PEM format"
+      # Try to fix it if needed
+      if ! grep -q "BEGIN" "$client_key_file"; then
+        echo "-----BEGIN PRIVATE KEY-----" > "$client_key_file.fixed"
+        cat "$client_key_file" >> "$client_key_file.fixed"
+        echo "-----END PRIVATE KEY-----" >> "$client_key_file.fixed"
+        mv "$client_key_file.fixed" "$client_key_file"
+        key_data=$(cat "$client_key_file")
+        debug_log "Fixed key format by adding BEGIN/END markers"
+      fi
+    fi
+  fi
+
+  echo "$cert_data:$key_data"
+}
+
+# Function to generate credentials JSON
+generate_credentials_json() {
+  local expiration_time=$(generate_expiration_timestamp)
+  local cert_bundle=$(read_certificate_data)
+  local client_cert_data=${cert_bundle%:*}
+  local client_key_data=${cert_bundle#*:}
+
+  if [[ -n "$client_cert_data" && -n "$client_key_data" ]]; then
+    # Debug the certificate data
+    debug_log "Certificate data length: $(echo -n "$client_cert_data" | wc -c) bytes"
+    debug_log "Key data length: $(echo -n "$client_key_data" | wc -c) bytes"
+
+    # Check if we can create proper JSON with `jq`
+    if ! command -v jq &>/dev/null; then
+      echo "jq is not installed. Please install jq to use this script." >&2
+      exit 1
+    fi
+    debug_log "Using jq for JSON formatting"
+
+    # Create a temporary file for the JSON output to avoid shell escaping issues
+    local TEMP_JSON_FILE=$(mktemp)
+
+    # Write the JSON to the temporary file using jq for proper JSON formatting
+    cat > "$TEMP_JSON_FILE" << EOL
+{
+  "apiVersion": "client.authentication.k8s.io/v1beta1",
+  "kind": "ExecCredential",
+  "status": {
+    "clientCertificateData": $(printf '%s' "$client_cert_data" | jq -R -s .),
+    "clientKeyData": $(printf '%s' "$client_key_data" | jq -R -s .),
+    "expirationTimestamp": "$expiration_time"
+  }
+}
+EOL
+
+    # Read the JSON from the file
+    local json_response=$(cat "$TEMP_JSON_FILE")
+
+    # Clean up
+    rm -f "$TEMP_JSON_FILE"
+
+    # Output the JSON
+    echo "$json_response"
+  else
+    # Fallback to token-based credential for tunnel-only authentication
+    echo "{\"apiVersion\":\"client.authentication.k8s.io/v1beta1\",\"kind\":\"ExecCredential\",\"status\":{\"token\":\"k8s-ssh-tunnel-token\",\"expirationTimestamp\":\"$expiration_time\"}}"
+  fi
+}
+
+while [[ $# -gt 0 ]]; do
+  case $1 in
+    --use-ssh-config)
+      USE_SSH_CONFIG=1
+      shift
+      ;;
+    --ssh-key)
+      SSH_KEY="$2"
+      shift 2
+      ;;
+    --context)
+      CONTEXT="$2"
+      shift 2
+      ;;
+    --port)
+      PORT="$2"
+      shift 2
+      ;;
+    --host)
+      HOST="$2"
+      shift 2
+      ;;
+    --user)
+      USER="$2"
+      shift 2
+      ;;
+    --ttl)
+      TTL_SECONDS="$2"
+      shift 2
+      ;;
+    *)
+      echo "Unknown parameter: $1" >&2
+      exit 1
+      ;;
+  esac
+done
+
+# Validate required parameters
+if [[ -z "$HOST" ]]; then
+  echo "Error: --host parameter is required" >&2
+  exit 1
+fi
+
+# Setup directories
+TUNNEL_DIR="$HOME/.sky/ssh_node_pools_info"
+mkdir -p "$TUNNEL_DIR"
+
+# Get context name for PID file
+if [[ -z "$CONTEXT" ]]; then
+  CONTEXT="default"
+fi
+
+PID_FILE="$TUNNEL_DIR/$CONTEXT-tunnel.pid"
+LOG_FILE="$TUNNEL_DIR/$CONTEXT-tunnel.log"
+LOCK_FILE="$TUNNEL_DIR/$CONTEXT-tunnel.lock"
+
+debug_log "Starting ssh-tunnel.sh for context $CONTEXT, host $HOST, port $PORT"
+debug_log "SSH Config: $USE_SSH_CONFIG, User: $USER, TTL: ${TTL_SECONDS}s"
+
+# Check if specified port is already in use (tunnel may be running)
+if nc -z 127.0.0.1 "$PORT" 2>/dev/null; then
+  debug_log "Port $PORT already in use, checking if it's our tunnel"
+
+  # Check if there's a PID file and if that process is running
+  if [[ -f "$PID_FILE" ]]; then
+    OLD_PID=$(cat "$PID_FILE")
+    if kill -0 "$OLD_PID" 2>/dev/null; then
+      debug_log "Tunnel appears to be running with PID $OLD_PID"
+    else
+      debug_log "PID file exists but process $OLD_PID is not running"
+    fi
+  else
+    debug_log "Port $PORT is in use but no PID file exists"
+  fi
+
+  # Return valid credential format for kubectl with expiration
+  generate_credentials_json
+  exit 0
+fi
+
+# Try to acquire the lock
+if ! acquire_lock; then
+  # Wait briefly for the tunnel to be established
+  for i in {1..10}; do
+    if nc -z 127.0.0.1 "$PORT" 2>/dev/null; then
+      debug_log "Tunnel is now active"
+
+      # Return valid credential format for kubectl with expiration
+      generate_credentials_json
+      exit 0
+    fi
+    sleep 0.2
+  done
+  debug_log "Waited for tunnel but port $PORT still not available"
+fi
+
+# Check if we have a PID file with running process
+if [[ -f "$PID_FILE" ]]; then
+  OLD_PID=$(cat "$PID_FILE")
+  if kill -0 "$OLD_PID" 2>/dev/null; then
+    # Process exists but port isn't open - something's wrong, kill it
+    kill "$OLD_PID" 2>/dev/null
+    debug_log "Killed stale tunnel process $OLD_PID"
+  else
+    debug_log "PID file exists but process $OLD_PID is not running anymore"
+  fi
+  # Remove the stale PID file
+  rm -f "$PID_FILE"
+fi
+
+# Generate the SSH command
+generate_ssh_command
+
+debug_log "Starting SSH tunnel: ${SSH_CMD[*]}"
+
+# Start the tunnel in foreground and wait for it to establish
+"${SSH_CMD[@]}" >> "$LOG_FILE" 2>&1 &
+TUNNEL_PID=$!
+
+# Save PID
+echo $TUNNEL_PID > "$PID_FILE"
+debug_log "Tunnel started with PID $TUNNEL_PID"
+
+# Wait for tunnel to establish
+tunnel_up=0
+for i in {1..20}; do
+  if nc -z 127.0.0.1 "$PORT" 2>/dev/null; then
+    debug_log "Tunnel established successfully on port $PORT"
+    tunnel_up=1
+    break
+  fi
+  sleep 0.2
+done
+
+# Clean up lock file
+release_lock
+
+# Check if the tunnel process is still running
+if ! kill -0 $TUNNEL_PID 2>/dev/null; then
+  debug_log "ERROR: Tunnel process exited unexpectedly! Check logs for details"
+  if [[ -f "$PID_FILE" ]]; then
+    rm -f "$PID_FILE"
+  fi
+  # Return error in case of tunnel failure
+  echo "Failed to establish SSH tunnel. See $TUNNEL_DIR/$CONTEXT-tunnel.log for details." >&2
+  exit 1
+elif [[ $tunnel_up -eq 0 ]]; then
+  debug_log "WARNING: Tunnel process is running but port $PORT is not responding"
+fi
+
+# Return valid credential format with certificates if available
+generate_credentials_json
+exit 0
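The header comments above describe ssh-tunnel.sh as a kubectl exec credential plugin: kubectl invokes it, the script ensures the SSH tunnel to the cluster's API server is up, and it prints an ExecCredential JSON whose short expirationTimestamp forces kubectl to re-run it frequently. The sketch below is illustrative only and is not part of the diff; the script path, host, user, and context names are hypothetical placeholders. It drives the script the same way a credential-plugin invocation would and inspects the returned credential:

# Python sketch (assumptions: ssh-tunnel.sh is available at ./ssh-tunnel.sh and
# the SSH host is reachable; 'node-pool-head', 'ubuntu', and 'my-pool' are
# hypothetical values, not taken from the diff).
import json
import subprocess

cmd = [
    'bash', './ssh-tunnel.sh',
    '--host', 'node-pool-head',  # hypothetical SSH host
    '--user', 'ubuntu',          # hypothetical SSH user
    '--context', 'my-pool',      # names the PID/log/lock files under ~/.sky/ssh_node_pools_info/
    '--port', '6443',
    '--ttl', '30',
]
result = subprocess.run(cmd, capture_output=True, text=True, check=True)
cred = json.loads(result.stdout)
assert cred['kind'] == 'ExecCredential'
status = cred['status']
print(status['expirationTimestamp'])
print('certificate auth' if 'clientCertificateData' in status else 'token fallback')

When $CONTEXT-cert.pem and $CONTEXT-key.pem exist under ~/.sky/ssh_node_pools_info/, the status carries clientCertificateData and clientKeyData; otherwise the script falls back to the static k8s-ssh-tunnel-token entry shown in generate_credentials_json.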
sky/utils/kubernetes/ssh_utils.py
@@ -0,0 +1,221 @@
+"""Utility functions for managing SSH node pools."""
+import os
+import re
+import subprocess
+from typing import Any, Callable, Dict, List, Optional
+import uuid
+
+import yaml
+
+from sky.utils import ux_utils
+
+DEFAULT_SSH_NODE_POOLS_PATH = os.path.expanduser('~/.sky/ssh_node_pools.yaml')
+RED = '\033[0;31m'
+NC = '\033[0m'  # No color
+
+
+def check_host_in_ssh_config(hostname: str) -> bool:
+    """Return True iff *hostname* matches at least one `Host`/`Match` stanza
+    in the user's OpenSSH client configuration (including anything pulled in
+    via Include).
+
+    It calls: ssh -vvG <hostname> -o ConnectTimeout=0
+    which:
+      • -G expands the effective config without connecting
+      • -vv prints debug lines that show which stanzas are applied
+      • ConnectTimeout=0 avoids a DNS lookup if <hostname> is a FQDN/IP
+
+    No config files are opened or parsed manually.
+
+    Parameters
+    ----------
+    hostname : str
+        The alias/IP/FQDN you want to test.
+
+    Returns
+    -------
+    bool
+        True – a specific stanza matched the host
+        False – nothing but the global defaults (`Host *`) applied
+    """
+    # We direct stderr→stdout because debug output goes to stderr.
+    proc = subprocess.run(
+        ['ssh', '-vvG', hostname, '-o', 'ConnectTimeout=0'],
+        text=True,
+        stdout=subprocess.PIPE,
+        stderr=subprocess.STDOUT,
+        check=False,  # we only want the text, not to raise
+    )
+
+    # Look for lines like:
+    #   debug1: ~/.ssh/config line 42: Applying options for <hostname>
+    # Anything other than "*"
+    pattern = re.compile(r'^debug\d+: .*Applying options for ([^*].*)$',
+                         re.MULTILINE)
+
+    return bool(pattern.search(proc.stdout))
+
+
+class UniqueKeySafeLoader(yaml.SafeLoader):
+    """Custom YAML loader that raises an error if there are duplicate keys."""
+
+    def construct_mapping(self, node, deep=False):
+        mapping = set()
+        for key_node, _ in node.value:
+            key = self.construct_object(key_node, deep=deep)
+            if key in mapping:
+                raise yaml.constructor.ConstructorError(
+                    note=(f'Duplicate key found: {key!r}.\n'
+                          'Please remove one of them from the YAML file.'))
+            mapping.add(key)
+        return super().construct_mapping(node, deep)
+
+
+def load_ssh_targets(file_path: str) -> Dict[str, Any]:
+    """Load SSH targets from YAML file."""
+    if not os.path.exists(file_path):
+        with ux_utils.print_exception_no_traceback():
+            raise ValueError(f'SSH Node Pools file not found: {file_path}')
+
+    try:
+        with open(file_path, 'r', encoding='utf-8') as f:
+            targets = yaml.load(f, Loader=UniqueKeySafeLoader)
+            return targets
+    except yaml.constructor.ConstructorError as e:
+        with ux_utils.print_exception_no_traceback():
+            raise ValueError(e.note) from e
+    except (yaml.YAMLError, IOError, OSError) as e:
+        with ux_utils.print_exception_no_traceback():
+            raise ValueError(f'Error loading SSH Node Pools file: {e}') from e
+
+
+def get_cluster_config(
+        targets: Dict[str, Any],
+        cluster_name: Optional[str] = None,
+        file_path: str = DEFAULT_SSH_NODE_POOLS_PATH) -> Dict[str, Any]:
+    """Get configuration for specific clusters or all clusters."""
+    if not targets:
+        with ux_utils.print_exception_no_traceback():
+            raise ValueError(
+                f'No clusters defined in SSH Node Pools file {file_path}')
+
+    if cluster_name:
+        if cluster_name not in targets:
+            with ux_utils.print_exception_no_traceback():
+                raise ValueError(f'Cluster {cluster_name!r} not found in '
+                                 f'SSH Node Pools file {file_path}')
+        return {cluster_name: targets[cluster_name]}
+
+    # Return all clusters if no specific cluster is specified
+    return targets
+
+
+def prepare_hosts_info(
+    cluster_name: str,
+    cluster_config: Dict[str, Any],
+    upload_ssh_key_func: Optional[Callable[[str, str], str]] = None
+) -> List[Dict[str, str]]:
+    """Prepare list of hosts with resolved user, identity_file, and password.
+
+    Args:
+        cluster_name: The name of the cluster.
+        cluster_config: The configuration for the cluster.
+        upload_ssh_key_func: A function to upload the SSH key to the remote
+            server and wait for the key to be uploaded. This function will take
+            the key name and the local key file path as input, and return the
+            path for the remote SSH key file on the API server. This function
+            will only be set in `sky ssh up -f` mode, and if this function is
+            set, any ssh config will not be allowed as we don't support
+            uploading any ssh config to the API server.
+
+    Returns:
+        A list of hosts with resolved user, identity_file, and password.
+    """
+    if 'hosts' not in cluster_config or not cluster_config['hosts']:
+        with ux_utils.print_exception_no_traceback():
+            raise ValueError(
+                f'No hosts defined in cluster {cluster_name} configuration')
+
+    # Get cluster-level defaults
+    cluster_user = cluster_config.get('user', '')
+    cluster_identity_file = os.path.expanduser(
+        cluster_config.get('identity_file', ''))
+    cluster_password = cluster_config.get('password', '')
+
+    # Check if cluster identity file exists
+    if cluster_identity_file and not os.path.isfile(cluster_identity_file):
+        with ux_utils.print_exception_no_traceback():
+            raise ValueError(
+                f'SSH Identity File Missing: {cluster_identity_file}')
+
+    use_cluster_config_msg = (f'Cluster {cluster_name} uses SSH config '
+                              'for hostname {host}, which is not '
+                              'supported by the -f flag. Please use a '
+                              'dict with `ip` field instead.')
+
+    def _maybe_hardcode_identity_file(i: int, identity_file: str) -> str:
+        if upload_ssh_key_func is None:
+            return identity_file
+        if not os.path.exists(os.path.expanduser(identity_file)):
+            with ux_utils.print_exception_no_traceback():
+                raise ValueError(
+                    f'Identity file {identity_file} does not exist.')
+        key_name = f'{cluster_name}-{i}-{str(uuid.uuid4())[:4]}'
+        key_file_on_api_server = upload_ssh_key_func(key_name, identity_file)
+        return key_file_on_api_server
+
+    hosts_info = []
+    for i, host in enumerate(cluster_config['hosts']):
+        # Host can be a string (IP or SSH config hostname) or a dict
+        if isinstance(host, str):
+            # Check if this is an SSH config hostname
+            is_ssh_config_host = check_host_in_ssh_config(host)
+            if upload_ssh_key_func is not None and is_ssh_config_host:
+                with ux_utils.print_exception_no_traceback():
+                    raise ValueError(use_cluster_config_msg.format(host=host))
+
+            hosts_info.append({
+                'ip': host,
+                'user': '' if is_ssh_config_host else cluster_user,
+                'identity_file': '' if is_ssh_config_host else
+                                 _maybe_hardcode_identity_file(
+                                     i, cluster_identity_file),
+                'password': cluster_password,
+                'use_ssh_config': is_ssh_config_host
+            })
+        else:
+            # It's a dict with potential overrides
+            if 'ip' not in host:
+                print(f'{RED}Warning: Host missing \'ip\' field, '
+                      f'skipping: {host}{NC}')
+                continue
+
+            # Check if this is an SSH config hostname
+            is_ssh_config_host = check_host_in_ssh_config(host['ip'])
+            if upload_ssh_key_func is not None and is_ssh_config_host:
+                with ux_utils.print_exception_no_traceback():
+                    raise ValueError(use_cluster_config_msg.format(host=host))
+
+            # Use host-specific values or fall back to cluster defaults
+            host_user = '' if is_ssh_config_host else host.get(
+                'user', cluster_user)
+            host_identity_file = '' if is_ssh_config_host else (
+                _maybe_hardcode_identity_file(
+                    i, host.get('identity_file', cluster_identity_file)))
+            host_identity_file = os.path.expanduser(host_identity_file)
+            host_password = host.get('password', cluster_password)
+
+            if host_identity_file and not os.path.isfile(host_identity_file):
+                with ux_utils.print_exception_no_traceback():
+                    raise ValueError(
+                        f'SSH Identity File Missing: {host_identity_file}')
+
+            hosts_info.append({
+                'ip': host['ip'],
+                'user': host_user,
+                'identity_file': host_identity_file,
+                'password': host_password,
+                'use_ssh_config': is_ssh_config_host
+            })
+
+    return hosts_info
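The docstrings in ssh_utils.py above explain how SSH node pools are read from ~/.sky/ssh_node_pools.yaml and how each host entry is resolved to an ip/user/identity_file/password record, with hosts that match a stanza in ~/.ssh/config flagged via use_ssh_config. A minimal usage sketch, assuming a pools file already exists at the default path and defines a cluster named 'my-pool' (the cluster name is illustrative, not from the diff):

# Python sketch against the module added above (sky/utils/kubernetes/ssh_utils.py).
from sky.utils.kubernetes import ssh_utils

targets = ssh_utils.load_ssh_targets(ssh_utils.DEFAULT_SSH_NODE_POOLS_PATH)
cluster = ssh_utils.get_cluster_config(targets, cluster_name='my-pool')  # hypothetical name
hosts = ssh_utils.prepare_hosts_info('my-pool', cluster['my-pool'])
for host in hosts:
    # Each record carries ip, user, identity_file, password, use_ssh_config.
    print(host['ip'], host['use_ssh_config'])

Passing upload_ssh_key_func (as `sky ssh up -f` mode does, per the docstring) uploads each identity file to the API server and rejects hosts that rely on the local SSH config.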
sky/utils/kubernetes_enums.py
CHANGED
@@ -2,26 +2,13 @@
 import enum
 
 
+# TODO(kevin): Remove this enum in v0.13.0.
 class KubernetesNetworkingMode(enum.Enum):
-    """Enum for the different types of networking modes for accessing
-    jump pods.
+    """Enum for the different types of networking modes for accessing pods.
     """
     NODEPORT = 'nodeport'
     PORTFORWARD = 'portforward'
 
-    @classmethod
-    def from_str(cls, mode: str) -> 'KubernetesNetworkingMode':
-        """Returns the enum value for the given string."""
-        if mode.lower() == cls.NODEPORT.value:
-            return cls.NODEPORT
-        elif mode.lower() == cls.PORTFORWARD.value:
-            return cls.PORTFORWARD
-        else:
-            raise ValueError(f'Unsupported kubernetes networking mode: '
-                             f'{mode}. The mode must be either '
-                             f'\'{cls.PORTFORWARD.value}\' or '
-                             f'\'{cls.NODEPORT.value}\'. ')
-
 
 class KubernetesServiceType(enum.Enum):
     """Enum for the different types of services."""
@@ -42,4 +29,10 @@ class KubernetesAutoscalerType(enum.Enum):
     """Enum for the different types of cluster autoscalers for Kubernetes."""
     GKE = 'gke'
     KARPENTER = 'karpenter'
+    COREWEAVE = 'coreweave'
     GENERIC = 'generic'
+
+    def emits_autoscale_event(self) -> bool:
+        """Returns whether specific autoscaler emits the event reason
+        TriggeredScaleUp."""
+        return self not in {self.KARPENTER}