skypilot-nightly 1.0.0.dev20250509__py3-none-any.whl → 1.0.0.dev20251107__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of skypilot-nightly might be problematic.
- sky/__init__.py +22 -6
- sky/adaptors/aws.py +25 -7
- sky/adaptors/common.py +24 -1
- sky/adaptors/coreweave.py +278 -0
- sky/adaptors/do.py +8 -2
- sky/adaptors/hyperbolic.py +8 -0
- sky/adaptors/kubernetes.py +149 -18
- sky/adaptors/nebius.py +170 -17
- sky/adaptors/primeintellect.py +1 -0
- sky/adaptors/runpod.py +68 -0
- sky/adaptors/seeweb.py +167 -0
- sky/adaptors/shadeform.py +89 -0
- sky/admin_policy.py +187 -4
- sky/authentication.py +179 -225
- sky/backends/__init__.py +4 -2
- sky/backends/backend.py +22 -9
- sky/backends/backend_utils.py +1299 -380
- sky/backends/cloud_vm_ray_backend.py +1715 -518
- sky/backends/docker_utils.py +1 -1
- sky/backends/local_docker_backend.py +11 -6
- sky/backends/wheel_utils.py +37 -9
- sky/{clouds/service_catalog → catalog}/__init__.py +21 -19
- sky/{clouds/service_catalog → catalog}/aws_catalog.py +27 -8
- sky/{clouds/service_catalog → catalog}/azure_catalog.py +10 -7
- sky/{clouds/service_catalog → catalog}/common.py +89 -48
- sky/{clouds/service_catalog → catalog}/cudo_catalog.py +8 -5
- sky/{clouds/service_catalog → catalog}/data_fetchers/analyze.py +1 -1
- sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_aws.py +30 -40
- sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_cudo.py +38 -38
- sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_gcp.py +42 -15
- sky/catalog/data_fetchers/fetch_hyperbolic.py +136 -0
- sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_lambda_cloud.py +1 -0
- sky/catalog/data_fetchers/fetch_nebius.py +335 -0
- sky/catalog/data_fetchers/fetch_runpod.py +698 -0
- sky/catalog/data_fetchers/fetch_seeweb.py +329 -0
- sky/catalog/data_fetchers/fetch_shadeform.py +142 -0
- sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_vast.py +1 -1
- sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_vsphere.py +1 -1
- sky/{clouds/service_catalog → catalog}/do_catalog.py +5 -2
- sky/{clouds/service_catalog → catalog}/fluidstack_catalog.py +6 -3
- sky/{clouds/service_catalog → catalog}/gcp_catalog.py +41 -15
- sky/catalog/hyperbolic_catalog.py +136 -0
- sky/{clouds/service_catalog → catalog}/ibm_catalog.py +9 -6
- sky/{clouds/service_catalog → catalog}/kubernetes_catalog.py +36 -24
- sky/{clouds/service_catalog → catalog}/lambda_catalog.py +9 -6
- sky/{clouds/service_catalog → catalog}/nebius_catalog.py +9 -7
- sky/{clouds/service_catalog → catalog}/oci_catalog.py +9 -6
- sky/{clouds/service_catalog → catalog}/paperspace_catalog.py +5 -2
- sky/catalog/primeintellect_catalog.py +95 -0
- sky/{clouds/service_catalog → catalog}/runpod_catalog.py +11 -4
- sky/{clouds/service_catalog → catalog}/scp_catalog.py +9 -6
- sky/catalog/seeweb_catalog.py +184 -0
- sky/catalog/shadeform_catalog.py +165 -0
- sky/catalog/ssh_catalog.py +167 -0
- sky/{clouds/service_catalog → catalog}/vast_catalog.py +6 -3
- sky/{clouds/service_catalog → catalog}/vsphere_catalog.py +5 -2
- sky/check.py +491 -203
- sky/cli.py +5 -6005
- sky/client/{cli.py → cli/command.py} +2477 -1885
- sky/client/cli/deprecation_utils.py +99 -0
- sky/client/cli/flags.py +359 -0
- sky/client/cli/table_utils.py +320 -0
- sky/client/common.py +70 -32
- sky/client/oauth.py +82 -0
- sky/client/sdk.py +1203 -297
- sky/client/sdk_async.py +833 -0
- sky/client/service_account_auth.py +47 -0
- sky/cloud_stores.py +73 -0
- sky/clouds/__init__.py +13 -0
- sky/clouds/aws.py +358 -93
- sky/clouds/azure.py +105 -83
- sky/clouds/cloud.py +127 -36
- sky/clouds/cudo.py +68 -50
- sky/clouds/do.py +66 -48
- sky/clouds/fluidstack.py +63 -44
- sky/clouds/gcp.py +339 -110
- sky/clouds/hyperbolic.py +293 -0
- sky/clouds/ibm.py +70 -49
- sky/clouds/kubernetes.py +563 -162
- sky/clouds/lambda_cloud.py +74 -54
- sky/clouds/nebius.py +206 -80
- sky/clouds/oci.py +88 -66
- sky/clouds/paperspace.py +61 -44
- sky/clouds/primeintellect.py +317 -0
- sky/clouds/runpod.py +164 -74
- sky/clouds/scp.py +89 -83
- sky/clouds/seeweb.py +466 -0
- sky/clouds/shadeform.py +400 -0
- sky/clouds/ssh.py +263 -0
- sky/clouds/utils/aws_utils.py +10 -4
- sky/clouds/utils/gcp_utils.py +87 -11
- sky/clouds/utils/oci_utils.py +38 -14
- sky/clouds/utils/scp_utils.py +177 -124
- sky/clouds/vast.py +99 -77
- sky/clouds/vsphere.py +51 -40
- sky/core.py +349 -139
- sky/dag.py +15 -0
- sky/dashboard/out/404.html +1 -1
- sky/dashboard/out/_next/static/chunks/1141-e6aa9ab418717c59.js +11 -0
- sky/dashboard/out/_next/static/chunks/1272-1ef0bf0237faccdb.js +1 -0
- sky/dashboard/out/_next/static/chunks/1871-74503c8e80fd253b.js +6 -0
- sky/dashboard/out/_next/static/chunks/2260-7703229c33c5ebd5.js +1 -0
- sky/dashboard/out/_next/static/chunks/2350.fab69e61bac57b23.js +1 -0
- sky/dashboard/out/_next/static/chunks/2369.fc20f0c2c8ed9fe7.js +15 -0
- sky/dashboard/out/_next/static/chunks/2755.fff53c4a3fcae910.js +26 -0
- sky/dashboard/out/_next/static/chunks/3294.72362fa129305b19.js +1 -0
- sky/dashboard/out/_next/static/chunks/3785.ad6adaa2a0fa9768.js +1 -0
- sky/dashboard/out/_next/static/chunks/3850-ff4a9a69d978632b.js +1 -0
- sky/dashboard/out/_next/static/chunks/3937.210053269f121201.js +1 -0
- sky/dashboard/out/_next/static/chunks/4725.a830b5c9e7867c92.js +1 -0
- sky/dashboard/out/_next/static/chunks/4937.a2baa2df5572a276.js +15 -0
- sky/dashboard/out/_next/static/chunks/5739-d67458fcb1386c92.js +8 -0
- sky/dashboard/out/_next/static/chunks/6130-2be46d70a38f1e82.js +1 -0
- sky/dashboard/out/_next/static/chunks/616-3d59f75e2ccf9321.js +39 -0
- sky/dashboard/out/_next/static/chunks/6212-7bd06f60ba693125.js +13 -0
- sky/dashboard/out/_next/static/chunks/6601-06114c982db410b6.js +1 -0
- sky/dashboard/out/_next/static/chunks/6856-ef8ba11f96d8c4a3.js +1 -0
- sky/dashboard/out/_next/static/chunks/6989-01359c57e018caa4.js +1 -0
- sky/dashboard/out/_next/static/chunks/6990-32b6e2d3822301fa.js +1 -0
- sky/dashboard/out/_next/static/chunks/7359-c8d04e06886000b3.js +30 -0
- sky/dashboard/out/_next/static/chunks/7411-b15471acd2cba716.js +41 -0
- sky/dashboard/out/_next/static/chunks/7615-3301e838e5f25772.js +1 -0
- sky/dashboard/out/_next/static/chunks/8640.5b9475a2d18c5416.js +16 -0
- sky/dashboard/out/_next/static/chunks/8969-1e4613c651bf4051.js +1 -0
- sky/dashboard/out/_next/static/chunks/9025.fa408f3242e9028d.js +6 -0
- sky/dashboard/out/_next/static/chunks/9353-cff34f7e773b2e2b.js +1 -0
- sky/dashboard/out/_next/static/chunks/9360.7310982cf5a0dc79.js +31 -0
- sky/dashboard/out/_next/static/chunks/9847.3aaca6bb33455140.js +30 -0
- sky/dashboard/out/_next/static/chunks/fd9d1056-86323a29a8f7e46a.js +1 -0
- sky/dashboard/out/_next/static/chunks/framework-cf60a09ccd051a10.js +33 -0
- sky/dashboard/out/_next/static/chunks/main-app-587214043926b3cc.js +1 -0
- sky/dashboard/out/_next/static/chunks/main-f15ccb73239a3bf1.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/_app-bde01e4a2beec258.js +34 -0
- sky/dashboard/out/_next/static/chunks/pages/_error-c66a4e8afc46f17b.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-c736ead69c2d86ec.js +16 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-a37d2063af475a1c.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters-d44859594e6f8064.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/config-dfb9bf07b13045f4.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/index-444f1804401f04ea.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/infra/[context]-c0b5935149902e6f.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/infra-aed0ea19df7cf961.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-5796e8d6aea291a0.js +16 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs/pools/[pool]-6edeb7d06032adfc.js +21 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs-479dde13399cf270.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/users-5ab3b907622cf0fe.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/volumes-b84b948ff357c43e.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/workspace/new-3f88a1c7e86a3f86.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/workspaces/[name]-c5a3eeee1c218af1.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/workspaces-22b23febb3e89ce1.js +1 -0
- sky/dashboard/out/_next/static/chunks/webpack-2679be77fc08a2f8.js +1 -0
- sky/dashboard/out/_next/static/css/0748ce22df867032.css +3 -0
- sky/dashboard/out/_next/static/zB0ed6ge_W1MDszVHhijS/_buildManifest.js +1 -0
- sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
- sky/dashboard/out/clusters/[cluster].html +1 -1
- sky/dashboard/out/clusters.html +1 -1
- sky/dashboard/out/config.html +1 -0
- sky/dashboard/out/index.html +1 -1
- sky/dashboard/out/infra/[context].html +1 -0
- sky/dashboard/out/infra.html +1 -0
- sky/dashboard/out/jobs/[job].html +1 -1
- sky/dashboard/out/jobs/pools/[pool].html +1 -0
- sky/dashboard/out/jobs.html +1 -1
- sky/dashboard/out/users.html +1 -0
- sky/dashboard/out/volumes.html +1 -0
- sky/dashboard/out/workspace/new.html +1 -0
- sky/dashboard/out/workspaces/[name].html +1 -0
- sky/dashboard/out/workspaces.html +1 -0
- sky/data/data_utils.py +137 -1
- sky/data/mounting_utils.py +269 -84
- sky/data/storage.py +1451 -1807
- sky/data/storage_utils.py +43 -57
- sky/exceptions.py +132 -2
- sky/execution.py +206 -63
- sky/global_user_state.py +2374 -586
- sky/jobs/__init__.py +5 -0
- sky/jobs/client/sdk.py +242 -65
- sky/jobs/client/sdk_async.py +143 -0
- sky/jobs/constants.py +9 -8
- sky/jobs/controller.py +839 -277
- sky/jobs/file_content_utils.py +80 -0
- sky/jobs/log_gc.py +201 -0
- sky/jobs/recovery_strategy.py +398 -152
- sky/jobs/scheduler.py +315 -189
- sky/jobs/server/core.py +829 -255
- sky/jobs/server/server.py +156 -115
- sky/jobs/server/utils.py +136 -0
- sky/jobs/state.py +2092 -701
- sky/jobs/utils.py +1242 -160
- sky/logs/__init__.py +21 -0
- sky/logs/agent.py +108 -0
- sky/logs/aws.py +243 -0
- sky/logs/gcp.py +91 -0
- sky/metrics/__init__.py +0 -0
- sky/metrics/utils.py +443 -0
- sky/models.py +78 -1
- sky/optimizer.py +164 -70
- sky/provision/__init__.py +90 -4
- sky/provision/aws/config.py +147 -26
- sky/provision/aws/instance.py +135 -50
- sky/provision/azure/instance.py +10 -5
- sky/provision/common.py +13 -1
- sky/provision/cudo/cudo_machine_type.py +1 -1
- sky/provision/cudo/cudo_utils.py +14 -8
- sky/provision/cudo/cudo_wrapper.py +72 -71
- sky/provision/cudo/instance.py +10 -6
- sky/provision/do/instance.py +10 -6
- sky/provision/do/utils.py +4 -3
- sky/provision/docker_utils.py +114 -23
- sky/provision/fluidstack/instance.py +13 -8
- sky/provision/gcp/__init__.py +1 -0
- sky/provision/gcp/config.py +301 -19
- sky/provision/gcp/constants.py +218 -0
- sky/provision/gcp/instance.py +36 -8
- sky/provision/gcp/instance_utils.py +18 -4
- sky/provision/gcp/volume_utils.py +247 -0
- sky/provision/hyperbolic/__init__.py +12 -0
- sky/provision/hyperbolic/config.py +10 -0
- sky/provision/hyperbolic/instance.py +437 -0
- sky/provision/hyperbolic/utils.py +373 -0
- sky/provision/instance_setup.py +93 -14
- sky/provision/kubernetes/__init__.py +5 -0
- sky/provision/kubernetes/config.py +9 -52
- sky/provision/kubernetes/constants.py +17 -0
- sky/provision/kubernetes/instance.py +789 -247
- sky/provision/kubernetes/manifests/fusermount-server-daemonset.yaml +1 -2
- sky/provision/kubernetes/network.py +27 -17
- sky/provision/kubernetes/network_utils.py +40 -43
- sky/provision/kubernetes/utils.py +1192 -531
- sky/provision/kubernetes/volume.py +282 -0
- sky/provision/lambda_cloud/instance.py +22 -16
- sky/provision/nebius/constants.py +50 -0
- sky/provision/nebius/instance.py +19 -6
- sky/provision/nebius/utils.py +196 -91
- sky/provision/oci/instance.py +10 -5
- sky/provision/paperspace/instance.py +10 -7
- sky/provision/paperspace/utils.py +1 -1
- sky/provision/primeintellect/__init__.py +10 -0
- sky/provision/primeintellect/config.py +11 -0
- sky/provision/primeintellect/instance.py +454 -0
- sky/provision/primeintellect/utils.py +398 -0
- sky/provision/provisioner.py +110 -36
- sky/provision/runpod/__init__.py +5 -0
- sky/provision/runpod/instance.py +27 -6
- sky/provision/runpod/utils.py +51 -18
- sky/provision/runpod/volume.py +180 -0
- sky/provision/scp/__init__.py +15 -0
- sky/provision/scp/config.py +93 -0
- sky/provision/scp/instance.py +531 -0
- sky/provision/seeweb/__init__.py +11 -0
- sky/provision/seeweb/config.py +13 -0
- sky/provision/seeweb/instance.py +807 -0
- sky/provision/shadeform/__init__.py +11 -0
- sky/provision/shadeform/config.py +12 -0
- sky/provision/shadeform/instance.py +351 -0
- sky/provision/shadeform/shadeform_utils.py +83 -0
- sky/provision/ssh/__init__.py +18 -0
- sky/provision/vast/instance.py +13 -8
- sky/provision/vast/utils.py +10 -7
- sky/provision/vsphere/common/vim_utils.py +1 -2
- sky/provision/vsphere/instance.py +15 -10
- sky/provision/vsphere/vsphere_utils.py +9 -19
- sky/py.typed +0 -0
- sky/resources.py +844 -118
- sky/schemas/__init__.py +0 -0
- sky/schemas/api/__init__.py +0 -0
- sky/schemas/api/responses.py +225 -0
- sky/schemas/db/README +4 -0
- sky/schemas/db/env.py +90 -0
- sky/schemas/db/global_user_state/001_initial_schema.py +124 -0
- sky/schemas/db/global_user_state/002_add_workspace_to_cluster_history.py +35 -0
- sky/schemas/db/global_user_state/003_fix_initial_revision.py +61 -0
- sky/schemas/db/global_user_state/004_is_managed.py +34 -0
- sky/schemas/db/global_user_state/005_cluster_event.py +32 -0
- sky/schemas/db/global_user_state/006_provision_log.py +41 -0
- sky/schemas/db/global_user_state/007_cluster_event_request_id.py +34 -0
- sky/schemas/db/global_user_state/008_skylet_ssh_tunnel_metadata.py +34 -0
- sky/schemas/db/global_user_state/009_last_activity_and_launched_at.py +89 -0
- sky/schemas/db/global_user_state/010_save_ssh_key.py +66 -0
- sky/schemas/db/script.py.mako +28 -0
- sky/schemas/db/serve_state/001_initial_schema.py +67 -0
- sky/schemas/db/skypilot_config/001_initial_schema.py +30 -0
- sky/schemas/db/spot_jobs/001_initial_schema.py +97 -0
- sky/schemas/db/spot_jobs/002_cluster_pool.py +42 -0
- sky/schemas/db/spot_jobs/003_pool_hash.py +34 -0
- sky/schemas/db/spot_jobs/004_job_file_contents.py +42 -0
- sky/schemas/db/spot_jobs/005_logs_gc.py +38 -0
- sky/schemas/generated/__init__.py +0 -0
- sky/schemas/generated/autostopv1_pb2.py +36 -0
- sky/schemas/generated/autostopv1_pb2.pyi +43 -0
- sky/schemas/generated/autostopv1_pb2_grpc.py +146 -0
- sky/schemas/generated/jobsv1_pb2.py +86 -0
- sky/schemas/generated/jobsv1_pb2.pyi +254 -0
- sky/schemas/generated/jobsv1_pb2_grpc.py +542 -0
- sky/schemas/generated/managed_jobsv1_pb2.py +74 -0
- sky/schemas/generated/managed_jobsv1_pb2.pyi +278 -0
- sky/schemas/generated/managed_jobsv1_pb2_grpc.py +278 -0
- sky/schemas/generated/servev1_pb2.py +58 -0
- sky/schemas/generated/servev1_pb2.pyi +115 -0
- sky/schemas/generated/servev1_pb2_grpc.py +322 -0
- sky/serve/autoscalers.py +357 -5
- sky/serve/client/impl.py +310 -0
- sky/serve/client/sdk.py +47 -139
- sky/serve/client/sdk_async.py +130 -0
- sky/serve/constants.py +10 -8
- sky/serve/controller.py +64 -19
- sky/serve/load_balancer.py +106 -60
- sky/serve/load_balancing_policies.py +115 -1
- sky/serve/replica_managers.py +273 -162
- sky/serve/serve_rpc_utils.py +179 -0
- sky/serve/serve_state.py +554 -251
- sky/serve/serve_utils.py +733 -220
- sky/serve/server/core.py +66 -711
- sky/serve/server/impl.py +1093 -0
- sky/serve/server/server.py +21 -18
- sky/serve/service.py +133 -48
- sky/serve/service_spec.py +135 -16
- sky/serve/spot_placer.py +3 -0
- sky/server/auth/__init__.py +0 -0
- sky/server/auth/authn.py +50 -0
- sky/server/auth/loopback.py +38 -0
- sky/server/auth/oauth2_proxy.py +200 -0
- sky/server/common.py +475 -181
- sky/server/config.py +81 -23
- sky/server/constants.py +44 -6
- sky/server/daemons.py +229 -0
- sky/server/html/token_page.html +185 -0
- sky/server/metrics.py +160 -0
- sky/server/requests/executor.py +528 -138
- sky/server/requests/payloads.py +351 -17
- sky/server/requests/preconditions.py +21 -17
- sky/server/requests/process.py +112 -29
- sky/server/requests/request_names.py +120 -0
- sky/server/requests/requests.py +817 -224
- sky/server/requests/serializers/decoders.py +82 -31
- sky/server/requests/serializers/encoders.py +140 -22
- sky/server/requests/threads.py +106 -0
- sky/server/rest.py +417 -0
- sky/server/server.py +1290 -284
- sky/server/state.py +20 -0
- sky/server/stream_utils.py +345 -57
- sky/server/uvicorn.py +217 -3
- sky/server/versions.py +270 -0
- sky/setup_files/MANIFEST.in +5 -0
- sky/setup_files/alembic.ini +156 -0
- sky/setup_files/dependencies.py +136 -31
- sky/setup_files/setup.py +44 -42
- sky/sky_logging.py +102 -5
- sky/skylet/attempt_skylet.py +1 -0
- sky/skylet/autostop_lib.py +129 -8
- sky/skylet/configs.py +27 -20
- sky/skylet/constants.py +171 -19
- sky/skylet/events.py +105 -21
- sky/skylet/job_lib.py +335 -104
- sky/skylet/log_lib.py +297 -18
- sky/skylet/log_lib.pyi +44 -1
- sky/skylet/ray_patches/__init__.py +17 -3
- sky/skylet/ray_patches/autoscaler.py.diff +18 -0
- sky/skylet/ray_patches/cli.py.diff +19 -0
- sky/skylet/ray_patches/command_runner.py.diff +17 -0
- sky/skylet/ray_patches/log_monitor.py.diff +20 -0
- sky/skylet/ray_patches/resource_demand_scheduler.py.diff +32 -0
- sky/skylet/ray_patches/updater.py.diff +18 -0
- sky/skylet/ray_patches/worker.py.diff +41 -0
- sky/skylet/services.py +564 -0
- sky/skylet/skylet.py +63 -4
- sky/skylet/subprocess_daemon.py +103 -29
- sky/skypilot_config.py +506 -99
- sky/ssh_node_pools/__init__.py +1 -0
- sky/ssh_node_pools/core.py +135 -0
- sky/ssh_node_pools/server.py +233 -0
- sky/task.py +621 -137
- sky/templates/aws-ray.yml.j2 +10 -3
- sky/templates/azure-ray.yml.j2 +1 -1
- sky/templates/do-ray.yml.j2 +1 -1
- sky/templates/gcp-ray.yml.j2 +57 -0
- sky/templates/hyperbolic-ray.yml.j2 +67 -0
- sky/templates/jobs-controller.yaml.j2 +27 -24
- sky/templates/kubernetes-loadbalancer.yml.j2 +2 -0
- sky/templates/kubernetes-ray.yml.j2 +607 -51
- sky/templates/lambda-ray.yml.j2 +1 -1
- sky/templates/nebius-ray.yml.j2 +33 -12
- sky/templates/paperspace-ray.yml.j2 +1 -1
- sky/templates/primeintellect-ray.yml.j2 +71 -0
- sky/templates/runpod-ray.yml.j2 +9 -1
- sky/templates/scp-ray.yml.j2 +3 -50
- sky/templates/seeweb-ray.yml.j2 +108 -0
- sky/templates/shadeform-ray.yml.j2 +72 -0
- sky/templates/sky-serve-controller.yaml.j2 +22 -2
- sky/templates/websocket_proxy.py +178 -18
- sky/usage/usage_lib.py +18 -11
- sky/users/__init__.py +0 -0
- sky/users/model.conf +15 -0
- sky/users/permission.py +387 -0
- sky/users/rbac.py +121 -0
- sky/users/server.py +720 -0
- sky/users/token_service.py +218 -0
- sky/utils/accelerator_registry.py +34 -5
- sky/utils/admin_policy_utils.py +84 -38
- sky/utils/annotations.py +16 -5
- sky/utils/asyncio_utils.py +78 -0
- sky/utils/auth_utils.py +153 -0
- sky/utils/benchmark_utils.py +60 -0
- sky/utils/cli_utils/status_utils.py +159 -86
- sky/utils/cluster_utils.py +31 -9
- sky/utils/command_runner.py +354 -68
- sky/utils/command_runner.pyi +93 -3
- sky/utils/common.py +35 -8
- sky/utils/common_utils.py +310 -87
- sky/utils/config_utils.py +87 -5
- sky/utils/context.py +402 -0
- sky/utils/context_utils.py +222 -0
- sky/utils/controller_utils.py +264 -89
- sky/utils/dag_utils.py +31 -12
- sky/utils/db/__init__.py +0 -0
- sky/utils/db/db_utils.py +470 -0
- sky/utils/db/migration_utils.py +133 -0
- sky/utils/directory_utils.py +12 -0
- sky/utils/env_options.py +13 -0
- sky/utils/git.py +567 -0
- sky/utils/git_clone.sh +460 -0
- sky/utils/infra_utils.py +195 -0
- sky/utils/kubernetes/cleanup-tunnel.sh +62 -0
- sky/utils/kubernetes/config_map_utils.py +133 -0
- sky/utils/kubernetes/create_cluster.sh +13 -27
- sky/utils/kubernetes/delete_cluster.sh +10 -7
- sky/utils/kubernetes/deploy_remote_cluster.py +1299 -0
- sky/utils/kubernetes/exec_kubeconfig_converter.py +22 -31
- sky/utils/kubernetes/generate_kind_config.py +6 -66
- sky/utils/kubernetes/generate_kubeconfig.sh +4 -1
- sky/utils/kubernetes/gpu_labeler.py +5 -5
- sky/utils/kubernetes/kubernetes_deploy_utils.py +354 -47
- sky/utils/kubernetes/ssh-tunnel.sh +379 -0
- sky/utils/kubernetes/ssh_utils.py +221 -0
- sky/utils/kubernetes_enums.py +8 -15
- sky/utils/lock_events.py +94 -0
- sky/utils/locks.py +368 -0
- sky/utils/log_utils.py +300 -6
- sky/utils/perf_utils.py +22 -0
- sky/utils/resource_checker.py +298 -0
- sky/utils/resources_utils.py +249 -32
- sky/utils/rich_utils.py +213 -37
- sky/utils/schemas.py +905 -147
- sky/utils/serialize_utils.py +16 -0
- sky/utils/status_lib.py +10 -0
- sky/utils/subprocess_utils.py +38 -15
- sky/utils/tempstore.py +70 -0
- sky/utils/timeline.py +24 -52
- sky/utils/ux_utils.py +84 -15
- sky/utils/validator.py +11 -1
- sky/utils/volume.py +86 -0
- sky/utils/yaml_utils.py +111 -0
- sky/volumes/__init__.py +13 -0
- sky/volumes/client/__init__.py +0 -0
- sky/volumes/client/sdk.py +149 -0
- sky/volumes/server/__init__.py +0 -0
- sky/volumes/server/core.py +258 -0
- sky/volumes/server/server.py +122 -0
- sky/volumes/volume.py +212 -0
- sky/workspaces/__init__.py +0 -0
- sky/workspaces/core.py +655 -0
- sky/workspaces/server.py +101 -0
- sky/workspaces/utils.py +56 -0
- skypilot_nightly-1.0.0.dev20251107.dist-info/METADATA +675 -0
- skypilot_nightly-1.0.0.dev20251107.dist-info/RECORD +594 -0
- {skypilot_nightly-1.0.0.dev20250509.dist-info → skypilot_nightly-1.0.0.dev20251107.dist-info}/WHEEL +1 -1
- sky/benchmark/benchmark_state.py +0 -256
- sky/benchmark/benchmark_utils.py +0 -641
- sky/clouds/service_catalog/constants.py +0 -7
- sky/dashboard/out/_next/static/LksQgChY5izXjokL3LcEu/_buildManifest.js +0 -1
- sky/dashboard/out/_next/static/chunks/236-f49500b82ad5392d.js +0 -6
- sky/dashboard/out/_next/static/chunks/312-c3c8845990db8ffc.js +0 -15
- sky/dashboard/out/_next/static/chunks/37-0a572fe0dbb89c4d.js +0 -6
- sky/dashboard/out/_next/static/chunks/678-206dddca808e6d16.js +0 -59
- sky/dashboard/out/_next/static/chunks/845-0f8017370869e269.js +0 -1
- sky/dashboard/out/_next/static/chunks/979-7bf73a4c7cea0f5c.js +0 -1
- sky/dashboard/out/_next/static/chunks/fd9d1056-2821b0f0cabcd8bd.js +0 -1
- sky/dashboard/out/_next/static/chunks/framework-87d061ee6ed71b28.js +0 -33
- sky/dashboard/out/_next/static/chunks/main-app-241eb28595532291.js +0 -1
- sky/dashboard/out/_next/static/chunks/main-e0e2335212e72357.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/_app-e6b013bc3f77ad60.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/_error-1be831200e60c5c0.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-e15db85d0ea1fbe1.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-f383db7389368ea7.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/clusters-a93b93e10b8b074e.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/index-f9f039532ca8cbc4.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-03f279c6741fb48b.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/jobs-a75029b67aab6a2e.js +0 -1
- sky/dashboard/out/_next/static/chunks/webpack-830f59b8404e96b8.js +0 -1
- sky/dashboard/out/_next/static/css/c6933bbb2ce7f4dd.css +0 -3
- sky/jobs/dashboard/dashboard.py +0 -223
- sky/jobs/dashboard/static/favicon.ico +0 -0
- sky/jobs/dashboard/templates/index.html +0 -831
- sky/jobs/server/dashboard_utils.py +0 -69
- sky/skylet/providers/scp/__init__.py +0 -2
- sky/skylet/providers/scp/config.py +0 -149
- sky/skylet/providers/scp/node_provider.py +0 -578
- sky/templates/kubernetes-ssh-jump.yml.j2 +0 -94
- sky/utils/db_utils.py +0 -100
- sky/utils/kubernetes/deploy_remote_cluster.sh +0 -308
- sky/utils/kubernetes/ssh_jump_lifecycle_manager.py +0 -191
- skypilot_nightly-1.0.0.dev20250509.dist-info/METADATA +0 -361
- skypilot_nightly-1.0.0.dev20250509.dist-info/RECORD +0 -396
- /sky/{clouds/service_catalog → catalog}/config.py +0 -0
- /sky/{benchmark → catalog/data_fetchers}/__init__.py +0 -0
- /sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_azure.py +0 -0
- /sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_fluidstack.py +0 -0
- /sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_ibm.py +0 -0
- /sky/{clouds/service_catalog/data_fetchers → client/cli}/__init__.py +0 -0
- /sky/dashboard/out/_next/static/{LksQgChY5izXjokL3LcEu → zB0ed6ge_W1MDszVHhijS}/_ssgManifest.js +0 -0
- {skypilot_nightly-1.0.0.dev20250509.dist-info → skypilot_nightly-1.0.0.dev20251107.dist-info}/entry_points.txt +0 -0
- {skypilot_nightly-1.0.0.dev20250509.dist-info → skypilot_nightly-1.0.0.dev20251107.dist-info}/licenses/LICENSE +0 -0
- {skypilot_nightly-1.0.0.dev20250509.dist-info → skypilot_nightly-1.0.0.dev20251107.dist-info}/top_level.txt +0 -0
sky/provision/seeweb/instance.py
@@ -0,0 +1,807 @@
+"""Seeweb provisioner for SkyPilot / Ray autoscaler.
+
+Prerequisites:
+    pip install ecsapi
+"""
+
+import os
+import subprocess
+import time
+from typing import Any, Dict, List, Optional, Tuple
+
+from sky import sky_logging
+from sky.adaptors import seeweb as seeweb_adaptor
+from sky.provision import common
+from sky.provision.common import ClusterInfo
+from sky.provision.common import InstanceInfo
+from sky.provision.common import ProvisionConfig
+from sky.provision.common import ProvisionRecord
+from sky.utils import auth_utils
+from sky.utils import command_runner  # Unified SSH helper
+from sky.utils import common_utils
+from sky.utils import status_lib
+
+logger = sky_logging.init_logger(__name__)
+
+# Singleton Seeweb client reused across the module
+_seeweb_client = None
+
+
+def _get_seeweb_client():
+    """Return a singleton Seeweb ECS API client."""
+    global _seeweb_client
+    if _seeweb_client is None:
+        # Initialize via adaptor's cached client
+        _seeweb_client = seeweb_adaptor.client()
+    return _seeweb_client
+
+
+# --------------------------------------------------------------------------- #
+# Useful constants
+# --------------------------------------------------------------------------- #
+_POLL_INTERVAL = 5  # sec
+_MAX_BOOT_TIME = 1200  # sec
+_ACTION_WATCH_MAX_RETRY = 360  # number of polls before giving up
+_ACTION_WATCH_FETCH_EVERY = 5  # seconds between polls
+_API_RETRY_MAX_RETRIES = 5
+_API_RETRY_INITIAL_BACKOFF = 1
+
+
+# --------------------------------------------------------------------------- #
+# Class required by the Ray backend
+# --------------------------------------------------------------------------- #
+class SeewebNodeProvider:
+    """Minimalist provisioner for Seeweb ECS."""
+
+    def __init__(self, provider_config: ProvisionConfig, cluster_name: str):
+        """provider_config: dict populated by template (plan, image, location,
+        remote_key_name, optional gpu…)
+        cluster_name : SkyPilot name on cloud (used in notes)
+        """
+        self.config = provider_config
+        self.cluster_name = cluster_name
+        # Reuse a singleton Seeweb client to avoid repeated authentications/API
+        # object creations across different provider instances.
+        self.ecs = _get_seeweb_client()
+
+    def _get_ssh_user(self) -> str:
+        # Prefer auth config; fallback to template default for Seeweb
+        return (self.config.authentication_config.get('ssh_user') if self.config
+                and self.config.authentication_config else None) or 'ecuser'
+
+    def _get_private_key_path(self) -> str:
+        # Prefer explicit path from auth config; otherwise use SkyPilot key
+        key_path = None
+        if self.config and self.config.authentication_config:
+            key_path = self.config.authentication_config.get('ssh_private_key')
+        if not key_path:
+            key_path, _ = auth_utils.get_or_generate_keys()
+        return os.path.expanduser(key_path)
+
+    # ------------------------------------------------------------------ #
+    # Helper: run a command on the VM via SSH using CommandRunner
+    # ------------------------------------------------------------------ #
+    def _run_remote(self,
+                    server_ip: str,
+                    cmd: str,
+                    *,
+                    timeout: int = 30,
+                    stream_logs: bool = False) -> subprocess.CompletedProcess:
+        """Execute *cmd* on the remote host.
+
+        Uses sky.utils.command_runner.SSHCommandRunner for consistent SSH
+        options across all providers.
+        Returns a subprocess.CompletedProcess-like
+        object with returncode, stdout, stderr.
+        """
+        runner = command_runner.SSHCommandRunner(
+            node=(server_ip, 22),
+            ssh_user=self._get_ssh_user(),
+            ssh_private_key=self._get_private_key_path(),
+        )
+        rc, stdout, stderr = runner.run(cmd,
+                                        stream_logs=stream_logs,
+                                        require_outputs=True,
+                                        connect_timeout=timeout)
+        # Convert to simple namespace for compatibility
+        proc = subprocess.CompletedProcess(args=cmd,
+                                           returncode=rc,
+                                           stdout=stdout.encode(),
+                                           stderr=stderr.encode())
+        return proc
+
+    # --------------------------------------------------------------------- #
+    # 1. bootstrap_instances – no preprocessing needed here
+    # --------------------------------------------------------------------- #
+
+    # --------------------------------------------------------------------- #
+    # 2. run_instances: restart or create until we reach count
+    # --------------------------------------------------------------------- #
+    def run_instances(self, config: Dict, count: int) -> None:
+        existing = self._query_cluster_nodes()
+        del config  # unused
+        running = [
+            s for s in existing if s.status in ('Booted', 'Running', 'RUNNING',
+                                                'Booting', 'PoweringOn')
+        ]
+
+        # a) restart Off servers
+        for srv in (s for s in existing if s.status == 'Booted'):
+            specific_status = self.ecs.fetch_server_status(srv.name)
+            if specific_status == 'SHUTOFF':
+                logger.info(f'Powering on server {srv.name}')
+                self._power_on(srv.name)
+                running.append(srv)
+                if len(running) >= count:
+                    break
+
+        # b) create new VMs if missing
+        while len(running) < count:
+            self._create_server()
+            running.append({})  # placeholder
+
+    # --------------------------------------------------------------------- #
+    # 3. terminate_instances
+    # --------------------------------------------------------------------- #
+    def terminate_instances(self) -> None:
+        for srv in self._query_cluster_nodes():
+            logger.info('Deleting server %s …', srv.name)
+            self.ecs.delete_server(srv.name)  # DELETE /servers/{name}
+
+            # Retry deletion with exponential backoff
+            # to handle transient API errors
+            common_utils.retry(self.ecs.delete_server,
+                               max_retries=5,
+                               initial_backoff=1)(srv.name)
+
+    # --------------------------------------------------------------------- #
+    # 4. stop_instances
+    # --------------------------------------------------------------------- #
+    def stop_instances(self) -> None:
+        cluster_nodes = self._query_cluster_nodes()
+
+        for srv in cluster_nodes:
+            specific_status = self.ecs.fetch_server_status(srv.name)
+
+            if specific_status == 'SHUTOFF':
+                logger.info(f'\nServer {srv.name} is already stopped\n')
+                continue
+            elif srv.status in ('Booted', 'Running', 'RUNNING'):
+                # Get specific status to check if server is not already SHUTOFF
+                try:
+                    specific_status = self.ecs.fetch_server_status(srv.name)
+                    # Continue with power off only if
+                    # specific_status is not SHUTOFF
+                    # and general status is not STOPPED
+                    if specific_status != 'SHUTOFF' and srv.status != 'STOPPED':
+                        self._power_off(srv.name)
+                except Exception:  # pylint: disable=broad-except
+                    # Fallback: if we can't get specific
+                    # status, use general status check
+                    if srv.status != 'STOPPED':
+                        self._power_off(srv.name)
+            else:
+                logger.info(f'\nServer {srv.name} has status'
+                            f'{srv.status}, skipping\n')
+        # Wait for all servers to be actually stopped with forced refresh
+        self._wait_for_stop_with_forced_refresh()
+
+    # --------------------------------------------------------------------- #
+    # 5. query_instances
+    # --------------------------------------------------------------------- #
+    def query_instances(self) -> Dict[str, str]:
+        """Query instances status using both fetch_servers()
+        and fetch_server_status().
+
+        Seeweb has two different APIs:
+        - fetch_servers() returns states like 'Booted', 'Booting'
+        - fetch_server_status() returns states like 'SHUTOFF' (stopped)
+
+        We need to use fetch_server_status() to get the correct stopped state.
+        """
+        instances = {}
+        cluster_nodes = self._query_cluster_nodes()
+
+        for server in cluster_nodes:
+            # Always try to get the specific status first for more accuracy
+            try:
+                specific_status = self.ecs.fetch_server_status(server.name)
+                instances[server.name] = specific_status
+            except Exception:  # pylint: disable=broad-except
+                # Fallback to general status if fetch_server_status fails
+                general_status = server.status
+                instances[server.name] = general_status
+
+        return instances
+
+    # --------------------------------------------------------------------- #
+    # 6. wait_instances
+    # --------------------------------------------------------------------- #
+    def wait_instances(self, desired_state: str = 'Booted') -> None:
+        deadline = time.time() + _MAX_BOOT_TIME
+
+        while time.time() < deadline:
+            cluster_nodes = self._query_cluster_nodes()
+
+            # For SHUTOFF state, we need to use fetch_server_status()
+            # to get the real status
+            if desired_state == 'SHUTOFF':
+                all_shutoff = True
+                for server in cluster_nodes:
+                    try:
+                        specific_status = self.ecs.fetch_server_status(
+                            server.name)
+                        if specific_status != 'SHUTOFF':
+                            all_shutoff = False
+                    except Exception:  # pylint: disable=broad-except
+                        all_shutoff = False
+
+                if all_shutoff:
+                    return
+            else:
+                # For other states, use the general status
+                states = {srv.status for srv in cluster_nodes}
+
+                if states <= {desired_state}:
+                    # If all servers are Booted, wait
+                    # for them to be truly stable
+                    if desired_state == 'Booted':
+                        if self._wait_for_all_servers_stable():
+                            return
+                        else:
+                            time.sleep(_POLL_INTERVAL)
+                            continue
+                    return
+
+            time.sleep(_POLL_INTERVAL)
+
+        raise TimeoutError(
+            f'Nodes are not all in state {desired_state} within timeout')
+
+    def _wait_for_all_servers_stable(self, max_wait: int = 600) -> bool:
+        """Waits for all cluster servers to be stable."""
+        logger.info('Checking stability of all cluster servers...')
+
+        start_time = time.time()
+        while time.time() - start_time < max_wait:
+            cluster_nodes = self._query_cluster_nodes()
+            all_stable = True
+
+            for node in cluster_nodes:
+                if node.status == 'Booted':
+                    # Check that server is reachable via ping
+                    if not self._ping_server(node.ipv4):
+                        logger.warning(f'Server {node.name} ({node.ipv4})'
+                                       f'not reachable via ping')
+                        all_stable = False
+                        break
+
+                    # SSH readiness handled by provisioner.wait_for_ssh()
+
+                    logger.info(f'Server {node.name} ({node.ipv4}) is stable')
+
+            if all_stable:
+                logger.info('All servers are stable')
+                # Safety sleep to allow for late reboots
+                logger.info('Waiting 1 second to allow for late reboots...')
+                time.sleep(1)
+                return True
+
+            logger.info('Waiting for all servers to be stable...')
+            time.sleep(1)
+
+        logger.error('Timeout waiting for server stability')
+        return False
+
+    def _ping_server(self, server_ip: str) -> bool:
+        """Check that server is reachable via ping."""
+        try:
+            result = subprocess.run(['ping', '-c', '1', '-W', '5', server_ip],
+                                    capture_output=True,
+                                    timeout=10,
+                                    check=False)
+            return result.returncode == 0
+        except Exception as e:  # pylint: disable=broad-except
+            logger.debug(f'Error pinging {server_ip}: {e}')
+            return False
+
+    def _check_ssh_ready(self, server_ip: str) -> bool:
+        """Check that SSH is available on the server."""
+        try:
+            ssh_user = self._get_ssh_user()
+            private_key_path = self._get_private_key_path()
+            result = subprocess.run([
+                'ssh', '-o', 'ConnectTimeout=10', '-o',
+                'StrictHostKeyChecking=no', '-o',
+                f'UserKnownHostsFile={os.devnull}', '-o',
+                f'GlobalKnownHostsFile={os.devnull}', '-o',
+                'IdentitiesOnly=yes', '-i', private_key_path,
+                f'{ssh_user}@{server_ip}', 'echo "SSH ready"'
+            ],
+                                    capture_output=True,
+                                    timeout=15,
+                                    check=False)
+            return result.returncode == 0
+        except Exception as e:  # pylint: disable=broad-except
+            logger.debug(f'Error checking SSH on {server_ip}: {e}')
+            return False
+
+    # ------------------------------------------------------------------ #
+    # 7. open_ports / cleanup_ports – Seeweb has all ports open by default
+    # ------------------------------------------------------------------ #
+    def open_ports(
+        self,
+        cluster_name_on_cloud: str,
+        ports: List[str],
+        provider_config: Optional[Dict[str, Any]] = None,
+    ) -> None:
+        """See sky/provision/__init__.py"""
+        logger.debug(f'Skip opening ports {ports} for Seeweb instances, as all '
+                     'ports are open by default.')
+        del cluster_name_on_cloud, provider_config, ports
+
+    def cleanup_ports(
+        self,
+        cluster_name_on_cloud: str,
+        ports: List[str],
+        provider_config: Optional[Dict[str, Any]] = None,
+    ) -> None:
+        del cluster_name_on_cloud, ports, provider_config  # Unused.
+
+    # ====================== private helpers ========================= #
+    def _query_cluster_nodes(self):
+        """List servers with notes == cluster_name."""
+        servers = common_utils.retry(
+            self.ecs.fetch_servers,
+            max_retries=_API_RETRY_MAX_RETRIES,
+            initial_backoff=_API_RETRY_INITIAL_BACKOFF)()
+        return [
+            s for s in servers
+            if s.notes and s.notes.startswith(self.cluster_name)
+        ]
+
+    def query_cluster_nodes(self):
+        """Public wrapper for querying cluster nodes for this cluster."""
+        return common_utils.retry(self._query_cluster_nodes,
+                                  max_retries=_API_RETRY_MAX_RETRIES,
+                                  initial_backoff=_API_RETRY_INITIAL_BACKOFF)()
+
+    def _get_head_instance_id(self) -> Optional[str]:
+        """Return head instance id for this cluster.
+
+        Prefer notes == "{cluster}-head"; fallback to first node if none
+        matches (legacy naming).
+        """
+        nodes = common_utils.retry(self._query_cluster_nodes,
+                                   max_retries=_API_RETRY_MAX_RETRIES,
+                                   initial_backoff=_API_RETRY_INITIAL_BACKOFF)()
+        for node in nodes:
+            try:
+                if getattr(node, 'notes', None) == f'{self.cluster_name}-head':
+                    return node.name
+                if getattr(node, 'name', None) and node.name.endswith('-head'):
+                    return node.name
+            except Exception:  # pylint: disable=broad-except
+                continue
+        return nodes[0].name if nodes else None
+
+    def get_head_instance_id(self) -> Optional[str]:
+        """Public wrapper for getting head instance id."""
+        return common_utils.retry(self._get_head_instance_id,
+                                  max_retries=_API_RETRY_MAX_RETRIES,
+                                  initial_backoff=_API_RETRY_INITIAL_BACKOFF)()
+
+    def _create_server(self):
+        """POST /servers with complete payload."""
+        node_type = 'head'
+        payload = {
+            'plan': self.config.node_config.get('plan'),  # e.g. eCS4
+            'image': self.config.node_config.get('image'),  # e.g. ubuntu-2204
+            'location': self.config.node_config.get('location'),  # e.g. it-mi2
+            'notes': f'{self.cluster_name}-{node_type}',
+            'ssh_key': self.config.authentication_config.get('remote_key_name'
+                                                            ),  # remote key
+        }
+
+        # Optional GPU
+        if 'gpu' in self.config.node_config:
+            payload.update({
+                'gpu': self.config.node_config.get('gpu'),
+                'gpu_label': self.config.node_config.get('gpu_label', ''),
+            })
+
+        # Build the request object expected by ecsapi
+        server_create_request_cls = (
+            seeweb_adaptor.ecsapi.ServerCreateRequest  # type: ignore
+        )
+        create_request = server_create_request_cls(**payload)
+
+        logger.info('Creating Seeweb server %s', payload)
+
+        # POST /servers – returns (response, action_id)
+        _, action_id = common_utils.retry(
+            self.ecs.create_server,
+            max_retries=_API_RETRY_MAX_RETRIES,
+            initial_backoff=_API_RETRY_INITIAL_BACKOFF)(
+                create_request, check_if_can_create=False)
+        self.ecs.watch_action(action_id,
+                              max_retry=_ACTION_WATCH_MAX_RETRY,
+                              fetch_every=_ACTION_WATCH_FETCH_EVERY)
+
+    def _power_on(self, server_id: str):
+        try:
+            common_utils.retry(
+                self.ecs.turn_on_server,
+                max_retries=_API_RETRY_MAX_RETRIES,
+                initial_backoff=_API_RETRY_INITIAL_BACKOFF)(server_id)
+        except seeweb_adaptor.SeewebError as e:
+            logger.error(f'Error in _power_on for {server_id}: {e}')
+            raise
+
+    def _power_off(self, server_id: str):
+        try:
+            common_utils.retry(
+                self.ecs.turn_off_server,
+                max_retries=_API_RETRY_MAX_RETRIES,
+                initial_backoff=_API_RETRY_INITIAL_BACKOFF)(server_id)
+        except seeweb_adaptor.SeewebError as e:
+            logger.error(f'\n\nError in _power_off for {server_id}: {e}')
+            raise
+
+    def _wait_action(self, action_id: int):
+        """Poll action until it completes."""
+        while True:
+            action = common_utils.retry(
+                self.ecs.fetch_action,
+                max_retries=_API_RETRY_MAX_RETRIES,
+                initial_backoff=_API_RETRY_INITIAL_BACKOFF)(action_id)
+            if action['status'] in ('completed', 'ok', 'no_content'):
+                return
+            if action['status'] == 'error':
+                raise RuntimeError(f'Seeweb action {action_id} failed')
+            time.sleep(_POLL_INTERVAL)
+
+    def _wait_for_stop_with_forced_refresh(self, max_wait: int = 300) -> None:
+        """Wait for servers to be stopped with
+        aggressive polling and forced refresh."""
+        start_time = time.time()
+        poll_interval = 1  # 1 second for aggressive polling
+
+        while time.time() - start_time < max_wait:
+            # Force refresh by re-fetching cluster nodes
+            cluster_nodes = common_utils.retry(
+                self._query_cluster_nodes,
+                max_retries=_API_RETRY_MAX_RETRIES,
+                initial_backoff=_API_RETRY_INITIAL_BACKOFF)()
+
+            all_stopped = True
+            for server in cluster_nodes:
+                try:
+                    # Always use fetch_server_status() for accurate status
+                    specific_status = common_utils.retry(
+                        self.ecs.fetch_server_status,
+                        max_retries=_API_RETRY_MAX_RETRIES,
+                        initial_backoff=_API_RETRY_INITIAL_BACKOFF)(server.name)
+
+                    if specific_status != 'SHUTOFF':
+                        all_stopped = False
+
+                except Exception:  # pylint: disable=broad-except
+                    all_stopped = False
+
+            if all_stopped:
+                return
+
+            time.sleep(poll_interval)
+
+        raise TimeoutError(f'Servers not stopped within {max_wait} seconds')
+
+
+# =============================================================================
+# Standalone functions required by the provisioning interface
+# =============================================================================
+
+
+def run_instances(region: str, cluster_name: str, cluster_name_on_cloud: str,
+                  config: ProvisionConfig) -> ProvisionRecord:
+    """Run instances for Seeweb cluster."""
+    del cluster_name  # unused
+    provider = SeewebNodeProvider(config, cluster_name_on_cloud)
+    provider.run_instances(config.node_config, config.count)
+
+    # Find the head node using notes convention
+    cluster_nodes = provider.query_cluster_nodes()
+    if not cluster_nodes:
+        raise RuntimeError(
+            f'No nodes found for cluster {cluster_name_on_cloud}')
+    head_node_id = provider.get_head_instance_id()
+    assert head_node_id is not None, 'head_instance_id should not be None'
+
+    return ProvisionRecord(
+        provider_name='Seeweb',
+        region=region,
+        zone=None,  # Seeweb doesn't use zones
+        cluster_name=cluster_name_on_cloud,
+        head_instance_id=head_node_id,
+        resumed_instance_ids=[],  # Empty for now
+        created_instance_ids=[node.name for node in cluster_nodes],
+    )
+
+
+def stop_instances(
+    cluster_name_on_cloud: str,
+    provider_config: Optional[Dict[str, Any]] = None,
+    worker_only: bool = False,
+) -> None:
+    """Stop instances for Seeweb cluster."""
+    del worker_only  # unused - Seeweb doesn't distinguish between head/worker
+    assert provider_config is not None
+
+    # Convert Dict to ProvisionConfig for SeewebNodeProvider
+    config = common.ProvisionConfig(
+        provider_config=provider_config,
+        authentication_config={},
+        docker_config={},
+        node_config=provider_config,
+        count=1,  # Not used for stop operation
+        tags={},
+        resume_stopped_nodes=False,
+        ports_to_open_on_launch=None,
+    )
+    provider = SeewebNodeProvider(config, cluster_name_on_cloud)
+    provider.stop_instances()
+
+
+def terminate_instances(
+    cluster_name_on_cloud: str,
+    provider_config: Optional[Dict[str, Any]] = None,
+    worker_only: bool = False,
+) -> None:
+    """Terminate instances for Seeweb cluster."""
+    del worker_only  # unused - Seeweb doesn't distinguish between head/worker
+    assert provider_config is not None
+    # Convert Dict to ProvisionConfig for SeewebNodeProvider
+    config = common.ProvisionConfig(
+        provider_config=provider_config,
+        authentication_config={},
+        docker_config={},
+        node_config=provider_config,
+        count=1,  # Not used for terminate operation
+        tags={},
+        resume_stopped_nodes=False,
+        ports_to_open_on_launch=None,
+    )
+    provider = SeewebNodeProvider(config, cluster_name_on_cloud)
+    provider.terminate_instances()
+
+
+def wait_instances(
+    region: str,
+    cluster_name_on_cloud: str,
+    state: Optional[status_lib.ClusterStatus],
+) -> None:
+    del region  # unused
+    # Map ClusterStatus to Seeweb string
+    if state == status_lib.ClusterStatus.UP:
+        seeweb_state = 'Booted'
+    elif state == status_lib.ClusterStatus.STOPPED:
+        seeweb_state = 'SHUTOFF'
+    elif state is None:
+        seeweb_state = 'Terminated'  # For termination
+    else:
+        seeweb_state = 'Booted'  # Default fallback
+
+    # Create Seeweb client directly and wait
+    client = _get_seeweb_client()
+    deadline = time.time() + _MAX_BOOT_TIME
+    while time.time() < deadline:
+        cluster_nodes = [
+            s for s in client.fetch_servers()
+            if s.notes and s.notes.startswith(cluster_name_on_cloud)
+        ]
+        if not cluster_nodes:
+            time.sleep(_POLL_INTERVAL)
+            continue
+
+        states = {srv.status for srv in cluster_nodes}
+        if states <= {seeweb_state}:
+            # If all servers are Booted, wait for them to be truly stable
+            if seeweb_state == 'Booted':
+                if _wait_for_all_servers_stable_standalone(cluster_nodes):
+                    return
+                else:
+                    time.sleep(_POLL_INTERVAL)
+                    continue
+            return
+        time.sleep(_POLL_INTERVAL)
+
+    raise TimeoutError(
+        f'Nodes are not all in state {seeweb_state} within timeout')
+
+
+def _wait_for_all_servers_stable_standalone(cluster_nodes,
+                                            max_wait: int = 300) -> bool:
+    """Waits for all cluster servers to be stable (standalone version)."""
+    start_time = time.time()
+    while time.time() - start_time < max_wait:
+        all_stable = True
+
+        for node in cluster_nodes:
+            if node.status == 'Booted':
+                # Check that server is reachable via ping
+                if not _ping_server_standalone(node.ipv4):
+                    all_stable = False
+                    break
+
+                # Do not check SSH here; handled by provisioner.wait_for_ssh().
+
+        if all_stable:
+            # Safety sleep to allow for late reboots
+            time.sleep(1)
+            return True
+
+        time.sleep(1)
+
+    return False
+
+
+def _ping_server_standalone(server_ip: str) -> bool:
+    """Check that server is reachable via ping (standalone version)."""
+    try:
+        result = subprocess.run(['ping', '-c', '1', '-W', '5', server_ip],
+                                capture_output=True,
+                                timeout=10,
+                                check=False)
+        return result.returncode == 0
+    except Exception as e:  # pylint: disable=broad-except
+        logger.error(f'Error pinging {server_ip}: {e}')
+        return False
+
+
+def _check_ssh_ready_standalone(server_ip: str) -> bool:
+    """Check that SSH is available on the server (standalone version)."""
+    try:
+        private_key_path, _ = auth_utils.get_or_generate_keys()
+        private_key_path = os.path.expanduser(private_key_path)
+        ssh_user = 'ecuser'
+        result = subprocess.run([
+            'ssh', '-o', 'ConnectTimeout=10', '-o', 'StrictHostKeyChecking=no',
+            '-o', f'UserKnownHostsFile={os.devnull}', '-o',
+            f'GlobalKnownHostsFile={os.devnull}', '-o', 'IdentitiesOnly=yes',
+            '-i', private_key_path, f'{ssh_user}@{server_ip}',
+            'echo "SSH ready"'
+        ],
+                                capture_output=True,
+                                timeout=15,
+                                check=False)
+        return result.returncode == 0
+    except Exception:  # pylint: disable=broad-except
+        return False
+
+
+def query_instances(
+    cluster_name: str,
+    cluster_name_on_cloud: str,
+    provider_config: Optional[Dict[str, Any]] = None,
+    non_terminated_only: bool = True,
+) -> Dict[str, Tuple[Optional['status_lib.ClusterStatus'], Optional[str]]]:
+    """Query instances status for Seeweb cluster."""
+    del cluster_name  # unused
+    # Use the provided provider_config or default to empty dict
+    if provider_config is None:
+        provider_config = {}
+
+    # Convert Dict to ProvisionConfig for SeewebNodeProvider
+    config = common.ProvisionConfig(
+        provider_config=provider_config,
+        authentication_config={},
+        docker_config={},
+        node_config=provider_config,
+        count=1,  # Not used for query operation
+        tags={},
+        resume_stopped_nodes=False,
+        ports_to_open_on_launch=None,
+    )
+    provider = SeewebNodeProvider(config, cluster_name_on_cloud)
+    seeweb_instances = provider.query_instances()
+
+    # Map Seeweb status to SkyPilot status
+    status_map = {
+        'Booted':
+            status_lib.ClusterStatus.UP,  # Seeweb uses "Booted" for running
+        'RUNNING': status_lib.ClusterStatus.UP,  # All caps version
+        'Booting': status_lib.ClusterStatus.INIT,
+        'PoweringOn': status_lib.ClusterStatus.INIT,
+        'Off': status_lib.ClusterStatus.STOPPED,
+        'Stopped': status_lib.ClusterStatus.STOPPED,
+        'SHUTOFF':
+            status_lib.ClusterStatus.STOPPED,  # Add missing SHUTOFF status
+        'PoweringOff': status_lib.ClusterStatus.
+                       STOPPED,  # Fixed: should be STOPPED, not INIT
+    }
+
+    result: Dict[str, Tuple[Optional[status_lib.ClusterStatus],
+                            Optional[str]]] = {}
+    for name, seeweb_status in seeweb_instances.items():
+        if non_terminated_only and seeweb_status in ('Terminated', 'Deleted'):
+            continue
+        mapped_status = status_map.get(seeweb_status,
+                                       status_lib.ClusterStatus.INIT)
+        # Return tuple of (status, reason) where reason is None for Seeweb
+        result[name] = (mapped_status, None)
+
+    return result
+
+
+# Signature should not include provider_name; router strips it before calling
+def get_cluster_info(
+    region: str,
+    cluster_name_on_cloud: str,
+    provider_config: Optional[Dict[str, Any]] = None,
+) -> 'ClusterInfo':
+    del region  # unused
+    # Use Seeweb client to get cluster instances
+    client = _get_seeweb_client()
+    cluster_nodes = [
+        s for s in client.fetch_servers()
+        if s.notes and s.notes.startswith(cluster_name_on_cloud)
+    ]
+
+    if not cluster_nodes:
+        raise RuntimeError(
+            f'No instances found for cluster {cluster_name_on_cloud}')
+
+    instances = {}
+    head_instance = None
+    for node in cluster_nodes:
+        if getattr(node, 'notes', None) == f'{cluster_name_on_cloud}-head':
+            head_instance = node.name
+            break
+    if head_instance is None:
+        head_instance = cluster_nodes[0].name
+
+    for node in cluster_nodes:
+        # For Seeweb, we take the first node as head
+        if head_instance is None:
+            head_instance = node.name
+
+        # Get server IP (Seeweb uses 'ipv4' attribute)
+        external_ip = node.ipv4
+        internal_ip = external_ip  # For Seeweb, internal IP = external IP
+
+        instances[node.name] = [
+            InstanceInfo(
+                instance_id=node.name,
+                internal_ip=internal_ip,
+                external_ip=external_ip,
+                ssh_port=22,
+                tags={},
+            )
+        ]
+
+    return ClusterInfo(
+        instances=instances,
+        head_instance_id=head_instance,
+        provider_name='Seeweb',
+        provider_config=provider_config,
+    )
+
+
+def open_ports(
+    cluster_name_on_cloud: str,
+    ports: List[str],
+    provider_config: Optional[Dict[str, Any]] = None,
+) -> None:
+    del provider_config  # Unused
+    logger.debug(f'Seeweb: skipping open_ports for {cluster_name_on_cloud}'
+                 f'ports={ports} all ports are open by default')
+    return


def cleanup_ports(
    cluster_name_on_cloud: str,
    ports: List[str],
    provider_config: Optional[Dict[str, Any]] = None,
) -> None:
    del cluster_name_on_cloud, ports, provider_config  # Unused.
    return