skypilot-nightly 1.0.0.dev20250509__py3-none-any.whl → 1.0.0.dev20251107__py3-none-any.whl
This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of skypilot-nightly might be problematic. Click here for more details.
- sky/__init__.py +22 -6
- sky/adaptors/aws.py +25 -7
- sky/adaptors/common.py +24 -1
- sky/adaptors/coreweave.py +278 -0
- sky/adaptors/do.py +8 -2
- sky/adaptors/hyperbolic.py +8 -0
- sky/adaptors/kubernetes.py +149 -18
- sky/adaptors/nebius.py +170 -17
- sky/adaptors/primeintellect.py +1 -0
- sky/adaptors/runpod.py +68 -0
- sky/adaptors/seeweb.py +167 -0
- sky/adaptors/shadeform.py +89 -0
- sky/admin_policy.py +187 -4
- sky/authentication.py +179 -225
- sky/backends/__init__.py +4 -2
- sky/backends/backend.py +22 -9
- sky/backends/backend_utils.py +1299 -380
- sky/backends/cloud_vm_ray_backend.py +1715 -518
- sky/backends/docker_utils.py +1 -1
- sky/backends/local_docker_backend.py +11 -6
- sky/backends/wheel_utils.py +37 -9
- sky/{clouds/service_catalog → catalog}/__init__.py +21 -19
- sky/{clouds/service_catalog → catalog}/aws_catalog.py +27 -8
- sky/{clouds/service_catalog → catalog}/azure_catalog.py +10 -7
- sky/{clouds/service_catalog → catalog}/common.py +89 -48
- sky/{clouds/service_catalog → catalog}/cudo_catalog.py +8 -5
- sky/{clouds/service_catalog → catalog}/data_fetchers/analyze.py +1 -1
- sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_aws.py +30 -40
- sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_cudo.py +38 -38
- sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_gcp.py +42 -15
- sky/catalog/data_fetchers/fetch_hyperbolic.py +136 -0
- sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_lambda_cloud.py +1 -0
- sky/catalog/data_fetchers/fetch_nebius.py +335 -0
- sky/catalog/data_fetchers/fetch_runpod.py +698 -0
- sky/catalog/data_fetchers/fetch_seeweb.py +329 -0
- sky/catalog/data_fetchers/fetch_shadeform.py +142 -0
- sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_vast.py +1 -1
- sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_vsphere.py +1 -1
- sky/{clouds/service_catalog → catalog}/do_catalog.py +5 -2
- sky/{clouds/service_catalog → catalog}/fluidstack_catalog.py +6 -3
- sky/{clouds/service_catalog → catalog}/gcp_catalog.py +41 -15
- sky/catalog/hyperbolic_catalog.py +136 -0
- sky/{clouds/service_catalog → catalog}/ibm_catalog.py +9 -6
- sky/{clouds/service_catalog → catalog}/kubernetes_catalog.py +36 -24
- sky/{clouds/service_catalog → catalog}/lambda_catalog.py +9 -6
- sky/{clouds/service_catalog → catalog}/nebius_catalog.py +9 -7
- sky/{clouds/service_catalog → catalog}/oci_catalog.py +9 -6
- sky/{clouds/service_catalog → catalog}/paperspace_catalog.py +5 -2
- sky/catalog/primeintellect_catalog.py +95 -0
- sky/{clouds/service_catalog → catalog}/runpod_catalog.py +11 -4
- sky/{clouds/service_catalog → catalog}/scp_catalog.py +9 -6
- sky/catalog/seeweb_catalog.py +184 -0
- sky/catalog/shadeform_catalog.py +165 -0
- sky/catalog/ssh_catalog.py +167 -0
- sky/{clouds/service_catalog → catalog}/vast_catalog.py +6 -3
- sky/{clouds/service_catalog → catalog}/vsphere_catalog.py +5 -2
- sky/check.py +491 -203
- sky/cli.py +5 -6005
- sky/client/{cli.py → cli/command.py} +2477 -1885
- sky/client/cli/deprecation_utils.py +99 -0
- sky/client/cli/flags.py +359 -0
- sky/client/cli/table_utils.py +320 -0
- sky/client/common.py +70 -32
- sky/client/oauth.py +82 -0
- sky/client/sdk.py +1203 -297
- sky/client/sdk_async.py +833 -0
- sky/client/service_account_auth.py +47 -0
- sky/cloud_stores.py +73 -0
- sky/clouds/__init__.py +13 -0
- sky/clouds/aws.py +358 -93
- sky/clouds/azure.py +105 -83
- sky/clouds/cloud.py +127 -36
- sky/clouds/cudo.py +68 -50
- sky/clouds/do.py +66 -48
- sky/clouds/fluidstack.py +63 -44
- sky/clouds/gcp.py +339 -110
- sky/clouds/hyperbolic.py +293 -0
- sky/clouds/ibm.py +70 -49
- sky/clouds/kubernetes.py +563 -162
- sky/clouds/lambda_cloud.py +74 -54
- sky/clouds/nebius.py +206 -80
- sky/clouds/oci.py +88 -66
- sky/clouds/paperspace.py +61 -44
- sky/clouds/primeintellect.py +317 -0
- sky/clouds/runpod.py +164 -74
- sky/clouds/scp.py +89 -83
- sky/clouds/seeweb.py +466 -0
- sky/clouds/shadeform.py +400 -0
- sky/clouds/ssh.py +263 -0
- sky/clouds/utils/aws_utils.py +10 -4
- sky/clouds/utils/gcp_utils.py +87 -11
- sky/clouds/utils/oci_utils.py +38 -14
- sky/clouds/utils/scp_utils.py +177 -124
- sky/clouds/vast.py +99 -77
- sky/clouds/vsphere.py +51 -40
- sky/core.py +349 -139
- sky/dag.py +15 -0
- sky/dashboard/out/404.html +1 -1
- sky/dashboard/out/_next/static/chunks/1141-e6aa9ab418717c59.js +11 -0
- sky/dashboard/out/_next/static/chunks/1272-1ef0bf0237faccdb.js +1 -0
- sky/dashboard/out/_next/static/chunks/1871-74503c8e80fd253b.js +6 -0
- sky/dashboard/out/_next/static/chunks/2260-7703229c33c5ebd5.js +1 -0
- sky/dashboard/out/_next/static/chunks/2350.fab69e61bac57b23.js +1 -0
- sky/dashboard/out/_next/static/chunks/2369.fc20f0c2c8ed9fe7.js +15 -0
- sky/dashboard/out/_next/static/chunks/2755.fff53c4a3fcae910.js +26 -0
- sky/dashboard/out/_next/static/chunks/3294.72362fa129305b19.js +1 -0
- sky/dashboard/out/_next/static/chunks/3785.ad6adaa2a0fa9768.js +1 -0
- sky/dashboard/out/_next/static/chunks/3850-ff4a9a69d978632b.js +1 -0
- sky/dashboard/out/_next/static/chunks/3937.210053269f121201.js +1 -0
- sky/dashboard/out/_next/static/chunks/4725.a830b5c9e7867c92.js +1 -0
- sky/dashboard/out/_next/static/chunks/4937.a2baa2df5572a276.js +15 -0
- sky/dashboard/out/_next/static/chunks/5739-d67458fcb1386c92.js +8 -0
- sky/dashboard/out/_next/static/chunks/6130-2be46d70a38f1e82.js +1 -0
- sky/dashboard/out/_next/static/chunks/616-3d59f75e2ccf9321.js +39 -0
- sky/dashboard/out/_next/static/chunks/6212-7bd06f60ba693125.js +13 -0
- sky/dashboard/out/_next/static/chunks/6601-06114c982db410b6.js +1 -0
- sky/dashboard/out/_next/static/chunks/6856-ef8ba11f96d8c4a3.js +1 -0
- sky/dashboard/out/_next/static/chunks/6989-01359c57e018caa4.js +1 -0
- sky/dashboard/out/_next/static/chunks/6990-32b6e2d3822301fa.js +1 -0
- sky/dashboard/out/_next/static/chunks/7359-c8d04e06886000b3.js +30 -0
- sky/dashboard/out/_next/static/chunks/7411-b15471acd2cba716.js +41 -0
- sky/dashboard/out/_next/static/chunks/7615-3301e838e5f25772.js +1 -0
- sky/dashboard/out/_next/static/chunks/8640.5b9475a2d18c5416.js +16 -0
- sky/dashboard/out/_next/static/chunks/8969-1e4613c651bf4051.js +1 -0
- sky/dashboard/out/_next/static/chunks/9025.fa408f3242e9028d.js +6 -0
- sky/dashboard/out/_next/static/chunks/9353-cff34f7e773b2e2b.js +1 -0
- sky/dashboard/out/_next/static/chunks/9360.7310982cf5a0dc79.js +31 -0
- sky/dashboard/out/_next/static/chunks/9847.3aaca6bb33455140.js +30 -0
- sky/dashboard/out/_next/static/chunks/fd9d1056-86323a29a8f7e46a.js +1 -0
- sky/dashboard/out/_next/static/chunks/framework-cf60a09ccd051a10.js +33 -0
- sky/dashboard/out/_next/static/chunks/main-app-587214043926b3cc.js +1 -0
- sky/dashboard/out/_next/static/chunks/main-f15ccb73239a3bf1.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/_app-bde01e4a2beec258.js +34 -0
- sky/dashboard/out/_next/static/chunks/pages/_error-c66a4e8afc46f17b.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-c736ead69c2d86ec.js +16 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-a37d2063af475a1c.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters-d44859594e6f8064.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/config-dfb9bf07b13045f4.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/index-444f1804401f04ea.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/infra/[context]-c0b5935149902e6f.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/infra-aed0ea19df7cf961.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-5796e8d6aea291a0.js +16 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs/pools/[pool]-6edeb7d06032adfc.js +21 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs-479dde13399cf270.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/users-5ab3b907622cf0fe.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/volumes-b84b948ff357c43e.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/workspace/new-3f88a1c7e86a3f86.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/workspaces/[name]-c5a3eeee1c218af1.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/workspaces-22b23febb3e89ce1.js +1 -0
- sky/dashboard/out/_next/static/chunks/webpack-2679be77fc08a2f8.js +1 -0
- sky/dashboard/out/_next/static/css/0748ce22df867032.css +3 -0
- sky/dashboard/out/_next/static/zB0ed6ge_W1MDszVHhijS/_buildManifest.js +1 -0
- sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
- sky/dashboard/out/clusters/[cluster].html +1 -1
- sky/dashboard/out/clusters.html +1 -1
- sky/dashboard/out/config.html +1 -0
- sky/dashboard/out/index.html +1 -1
- sky/dashboard/out/infra/[context].html +1 -0
- sky/dashboard/out/infra.html +1 -0
- sky/dashboard/out/jobs/[job].html +1 -1
- sky/dashboard/out/jobs/pools/[pool].html +1 -0
- sky/dashboard/out/jobs.html +1 -1
- sky/dashboard/out/users.html +1 -0
- sky/dashboard/out/volumes.html +1 -0
- sky/dashboard/out/workspace/new.html +1 -0
- sky/dashboard/out/workspaces/[name].html +1 -0
- sky/dashboard/out/workspaces.html +1 -0
- sky/data/data_utils.py +137 -1
- sky/data/mounting_utils.py +269 -84
- sky/data/storage.py +1451 -1807
- sky/data/storage_utils.py +43 -57
- sky/exceptions.py +132 -2
- sky/execution.py +206 -63
- sky/global_user_state.py +2374 -586
- sky/jobs/__init__.py +5 -0
- sky/jobs/client/sdk.py +242 -65
- sky/jobs/client/sdk_async.py +143 -0
- sky/jobs/constants.py +9 -8
- sky/jobs/controller.py +839 -277
- sky/jobs/file_content_utils.py +80 -0
- sky/jobs/log_gc.py +201 -0
- sky/jobs/recovery_strategy.py +398 -152
- sky/jobs/scheduler.py +315 -189
- sky/jobs/server/core.py +829 -255
- sky/jobs/server/server.py +156 -115
- sky/jobs/server/utils.py +136 -0
- sky/jobs/state.py +2092 -701
- sky/jobs/utils.py +1242 -160
- sky/logs/__init__.py +21 -0
- sky/logs/agent.py +108 -0
- sky/logs/aws.py +243 -0
- sky/logs/gcp.py +91 -0
- sky/metrics/__init__.py +0 -0
- sky/metrics/utils.py +443 -0
- sky/models.py +78 -1
- sky/optimizer.py +164 -70
- sky/provision/__init__.py +90 -4
- sky/provision/aws/config.py +147 -26
- sky/provision/aws/instance.py +135 -50
- sky/provision/azure/instance.py +10 -5
- sky/provision/common.py +13 -1
- sky/provision/cudo/cudo_machine_type.py +1 -1
- sky/provision/cudo/cudo_utils.py +14 -8
- sky/provision/cudo/cudo_wrapper.py +72 -71
- sky/provision/cudo/instance.py +10 -6
- sky/provision/do/instance.py +10 -6
- sky/provision/do/utils.py +4 -3
- sky/provision/docker_utils.py +114 -23
- sky/provision/fluidstack/instance.py +13 -8
- sky/provision/gcp/__init__.py +1 -0
- sky/provision/gcp/config.py +301 -19
- sky/provision/gcp/constants.py +218 -0
- sky/provision/gcp/instance.py +36 -8
- sky/provision/gcp/instance_utils.py +18 -4
- sky/provision/gcp/volume_utils.py +247 -0
- sky/provision/hyperbolic/__init__.py +12 -0
- sky/provision/hyperbolic/config.py +10 -0
- sky/provision/hyperbolic/instance.py +437 -0
- sky/provision/hyperbolic/utils.py +373 -0
- sky/provision/instance_setup.py +93 -14
- sky/provision/kubernetes/__init__.py +5 -0
- sky/provision/kubernetes/config.py +9 -52
- sky/provision/kubernetes/constants.py +17 -0
- sky/provision/kubernetes/instance.py +789 -247
- sky/provision/kubernetes/manifests/fusermount-server-daemonset.yaml +1 -2
- sky/provision/kubernetes/network.py +27 -17
- sky/provision/kubernetes/network_utils.py +40 -43
- sky/provision/kubernetes/utils.py +1192 -531
- sky/provision/kubernetes/volume.py +282 -0
- sky/provision/lambda_cloud/instance.py +22 -16
- sky/provision/nebius/constants.py +50 -0
- sky/provision/nebius/instance.py +19 -6
- sky/provision/nebius/utils.py +196 -91
- sky/provision/oci/instance.py +10 -5
- sky/provision/paperspace/instance.py +10 -7
- sky/provision/paperspace/utils.py +1 -1
- sky/provision/primeintellect/__init__.py +10 -0
- sky/provision/primeintellect/config.py +11 -0
- sky/provision/primeintellect/instance.py +454 -0
- sky/provision/primeintellect/utils.py +398 -0
- sky/provision/provisioner.py +110 -36
- sky/provision/runpod/__init__.py +5 -0
- sky/provision/runpod/instance.py +27 -6
- sky/provision/runpod/utils.py +51 -18
- sky/provision/runpod/volume.py +180 -0
- sky/provision/scp/__init__.py +15 -0
- sky/provision/scp/config.py +93 -0
- sky/provision/scp/instance.py +531 -0
- sky/provision/seeweb/__init__.py +11 -0
- sky/provision/seeweb/config.py +13 -0
- sky/provision/seeweb/instance.py +807 -0
- sky/provision/shadeform/__init__.py +11 -0
- sky/provision/shadeform/config.py +12 -0
- sky/provision/shadeform/instance.py +351 -0
- sky/provision/shadeform/shadeform_utils.py +83 -0
- sky/provision/ssh/__init__.py +18 -0
- sky/provision/vast/instance.py +13 -8
- sky/provision/vast/utils.py +10 -7
- sky/provision/vsphere/common/vim_utils.py +1 -2
- sky/provision/vsphere/instance.py +15 -10
- sky/provision/vsphere/vsphere_utils.py +9 -19
- sky/py.typed +0 -0
- sky/resources.py +844 -118
- sky/schemas/__init__.py +0 -0
- sky/schemas/api/__init__.py +0 -0
- sky/schemas/api/responses.py +225 -0
- sky/schemas/db/README +4 -0
- sky/schemas/db/env.py +90 -0
- sky/schemas/db/global_user_state/001_initial_schema.py +124 -0
- sky/schemas/db/global_user_state/002_add_workspace_to_cluster_history.py +35 -0
- sky/schemas/db/global_user_state/003_fix_initial_revision.py +61 -0
- sky/schemas/db/global_user_state/004_is_managed.py +34 -0
- sky/schemas/db/global_user_state/005_cluster_event.py +32 -0
- sky/schemas/db/global_user_state/006_provision_log.py +41 -0
- sky/schemas/db/global_user_state/007_cluster_event_request_id.py +34 -0
- sky/schemas/db/global_user_state/008_skylet_ssh_tunnel_metadata.py +34 -0
- sky/schemas/db/global_user_state/009_last_activity_and_launched_at.py +89 -0
- sky/schemas/db/global_user_state/010_save_ssh_key.py +66 -0
- sky/schemas/db/script.py.mako +28 -0
- sky/schemas/db/serve_state/001_initial_schema.py +67 -0
- sky/schemas/db/skypilot_config/001_initial_schema.py +30 -0
- sky/schemas/db/spot_jobs/001_initial_schema.py +97 -0
- sky/schemas/db/spot_jobs/002_cluster_pool.py +42 -0
- sky/schemas/db/spot_jobs/003_pool_hash.py +34 -0
- sky/schemas/db/spot_jobs/004_job_file_contents.py +42 -0
- sky/schemas/db/spot_jobs/005_logs_gc.py +38 -0
- sky/schemas/generated/__init__.py +0 -0
- sky/schemas/generated/autostopv1_pb2.py +36 -0
- sky/schemas/generated/autostopv1_pb2.pyi +43 -0
- sky/schemas/generated/autostopv1_pb2_grpc.py +146 -0
- sky/schemas/generated/jobsv1_pb2.py +86 -0
- sky/schemas/generated/jobsv1_pb2.pyi +254 -0
- sky/schemas/generated/jobsv1_pb2_grpc.py +542 -0
- sky/schemas/generated/managed_jobsv1_pb2.py +74 -0
- sky/schemas/generated/managed_jobsv1_pb2.pyi +278 -0
- sky/schemas/generated/managed_jobsv1_pb2_grpc.py +278 -0
- sky/schemas/generated/servev1_pb2.py +58 -0
- sky/schemas/generated/servev1_pb2.pyi +115 -0
- sky/schemas/generated/servev1_pb2_grpc.py +322 -0
- sky/serve/autoscalers.py +357 -5
- sky/serve/client/impl.py +310 -0
- sky/serve/client/sdk.py +47 -139
- sky/serve/client/sdk_async.py +130 -0
- sky/serve/constants.py +10 -8
- sky/serve/controller.py +64 -19
- sky/serve/load_balancer.py +106 -60
- sky/serve/load_balancing_policies.py +115 -1
- sky/serve/replica_managers.py +273 -162
- sky/serve/serve_rpc_utils.py +179 -0
- sky/serve/serve_state.py +554 -251
- sky/serve/serve_utils.py +733 -220
- sky/serve/server/core.py +66 -711
- sky/serve/server/impl.py +1093 -0
- sky/serve/server/server.py +21 -18
- sky/serve/service.py +133 -48
- sky/serve/service_spec.py +135 -16
- sky/serve/spot_placer.py +3 -0
- sky/server/auth/__init__.py +0 -0
- sky/server/auth/authn.py +50 -0
- sky/server/auth/loopback.py +38 -0
- sky/server/auth/oauth2_proxy.py +200 -0
- sky/server/common.py +475 -181
- sky/server/config.py +81 -23
- sky/server/constants.py +44 -6
- sky/server/daemons.py +229 -0
- sky/server/html/token_page.html +185 -0
- sky/server/metrics.py +160 -0
- sky/server/requests/executor.py +528 -138
- sky/server/requests/payloads.py +351 -17
- sky/server/requests/preconditions.py +21 -17
- sky/server/requests/process.py +112 -29
- sky/server/requests/request_names.py +120 -0
- sky/server/requests/requests.py +817 -224
- sky/server/requests/serializers/decoders.py +82 -31
- sky/server/requests/serializers/encoders.py +140 -22
- sky/server/requests/threads.py +106 -0
- sky/server/rest.py +417 -0
- sky/server/server.py +1290 -284
- sky/server/state.py +20 -0
- sky/server/stream_utils.py +345 -57
- sky/server/uvicorn.py +217 -3
- sky/server/versions.py +270 -0
- sky/setup_files/MANIFEST.in +5 -0
- sky/setup_files/alembic.ini +156 -0
- sky/setup_files/dependencies.py +136 -31
- sky/setup_files/setup.py +44 -42
- sky/sky_logging.py +102 -5
- sky/skylet/attempt_skylet.py +1 -0
- sky/skylet/autostop_lib.py +129 -8
- sky/skylet/configs.py +27 -20
- sky/skylet/constants.py +171 -19
- sky/skylet/events.py +105 -21
- sky/skylet/job_lib.py +335 -104
- sky/skylet/log_lib.py +297 -18
- sky/skylet/log_lib.pyi +44 -1
- sky/skylet/ray_patches/__init__.py +17 -3
- sky/skylet/ray_patches/autoscaler.py.diff +18 -0
- sky/skylet/ray_patches/cli.py.diff +19 -0
- sky/skylet/ray_patches/command_runner.py.diff +17 -0
- sky/skylet/ray_patches/log_monitor.py.diff +20 -0
- sky/skylet/ray_patches/resource_demand_scheduler.py.diff +32 -0
- sky/skylet/ray_patches/updater.py.diff +18 -0
- sky/skylet/ray_patches/worker.py.diff +41 -0
- sky/skylet/services.py +564 -0
- sky/skylet/skylet.py +63 -4
- sky/skylet/subprocess_daemon.py +103 -29
- sky/skypilot_config.py +506 -99
- sky/ssh_node_pools/__init__.py +1 -0
- sky/ssh_node_pools/core.py +135 -0
- sky/ssh_node_pools/server.py +233 -0
- sky/task.py +621 -137
- sky/templates/aws-ray.yml.j2 +10 -3
- sky/templates/azure-ray.yml.j2 +1 -1
- sky/templates/do-ray.yml.j2 +1 -1
- sky/templates/gcp-ray.yml.j2 +57 -0
- sky/templates/hyperbolic-ray.yml.j2 +67 -0
- sky/templates/jobs-controller.yaml.j2 +27 -24
- sky/templates/kubernetes-loadbalancer.yml.j2 +2 -0
- sky/templates/kubernetes-ray.yml.j2 +607 -51
- sky/templates/lambda-ray.yml.j2 +1 -1
- sky/templates/nebius-ray.yml.j2 +33 -12
- sky/templates/paperspace-ray.yml.j2 +1 -1
- sky/templates/primeintellect-ray.yml.j2 +71 -0
- sky/templates/runpod-ray.yml.j2 +9 -1
- sky/templates/scp-ray.yml.j2 +3 -50
- sky/templates/seeweb-ray.yml.j2 +108 -0
- sky/templates/shadeform-ray.yml.j2 +72 -0
- sky/templates/sky-serve-controller.yaml.j2 +22 -2
- sky/templates/websocket_proxy.py +178 -18
- sky/usage/usage_lib.py +18 -11
- sky/users/__init__.py +0 -0
- sky/users/model.conf +15 -0
- sky/users/permission.py +387 -0
- sky/users/rbac.py +121 -0
- sky/users/server.py +720 -0
- sky/users/token_service.py +218 -0
- sky/utils/accelerator_registry.py +34 -5
- sky/utils/admin_policy_utils.py +84 -38
- sky/utils/annotations.py +16 -5
- sky/utils/asyncio_utils.py +78 -0
- sky/utils/auth_utils.py +153 -0
- sky/utils/benchmark_utils.py +60 -0
- sky/utils/cli_utils/status_utils.py +159 -86
- sky/utils/cluster_utils.py +31 -9
- sky/utils/command_runner.py +354 -68
- sky/utils/command_runner.pyi +93 -3
- sky/utils/common.py +35 -8
- sky/utils/common_utils.py +310 -87
- sky/utils/config_utils.py +87 -5
- sky/utils/context.py +402 -0
- sky/utils/context_utils.py +222 -0
- sky/utils/controller_utils.py +264 -89
- sky/utils/dag_utils.py +31 -12
- sky/utils/db/__init__.py +0 -0
- sky/utils/db/db_utils.py +470 -0
- sky/utils/db/migration_utils.py +133 -0
- sky/utils/directory_utils.py +12 -0
- sky/utils/env_options.py +13 -0
- sky/utils/git.py +567 -0
- sky/utils/git_clone.sh +460 -0
- sky/utils/infra_utils.py +195 -0
- sky/utils/kubernetes/cleanup-tunnel.sh +62 -0
- sky/utils/kubernetes/config_map_utils.py +133 -0
- sky/utils/kubernetes/create_cluster.sh +13 -27
- sky/utils/kubernetes/delete_cluster.sh +10 -7
- sky/utils/kubernetes/deploy_remote_cluster.py +1299 -0
- sky/utils/kubernetes/exec_kubeconfig_converter.py +22 -31
- sky/utils/kubernetes/generate_kind_config.py +6 -66
- sky/utils/kubernetes/generate_kubeconfig.sh +4 -1
- sky/utils/kubernetes/gpu_labeler.py +5 -5
- sky/utils/kubernetes/kubernetes_deploy_utils.py +354 -47
- sky/utils/kubernetes/ssh-tunnel.sh +379 -0
- sky/utils/kubernetes/ssh_utils.py +221 -0
- sky/utils/kubernetes_enums.py +8 -15
- sky/utils/lock_events.py +94 -0
- sky/utils/locks.py +368 -0
- sky/utils/log_utils.py +300 -6
- sky/utils/perf_utils.py +22 -0
- sky/utils/resource_checker.py +298 -0
- sky/utils/resources_utils.py +249 -32
- sky/utils/rich_utils.py +213 -37
- sky/utils/schemas.py +905 -147
- sky/utils/serialize_utils.py +16 -0
- sky/utils/status_lib.py +10 -0
- sky/utils/subprocess_utils.py +38 -15
- sky/utils/tempstore.py +70 -0
- sky/utils/timeline.py +24 -52
- sky/utils/ux_utils.py +84 -15
- sky/utils/validator.py +11 -1
- sky/utils/volume.py +86 -0
- sky/utils/yaml_utils.py +111 -0
- sky/volumes/__init__.py +13 -0
- sky/volumes/client/__init__.py +0 -0
- sky/volumes/client/sdk.py +149 -0
- sky/volumes/server/__init__.py +0 -0
- sky/volumes/server/core.py +258 -0
- sky/volumes/server/server.py +122 -0
- sky/volumes/volume.py +212 -0
- sky/workspaces/__init__.py +0 -0
- sky/workspaces/core.py +655 -0
- sky/workspaces/server.py +101 -0
- sky/workspaces/utils.py +56 -0
- skypilot_nightly-1.0.0.dev20251107.dist-info/METADATA +675 -0
- skypilot_nightly-1.0.0.dev20251107.dist-info/RECORD +594 -0
- {skypilot_nightly-1.0.0.dev20250509.dist-info → skypilot_nightly-1.0.0.dev20251107.dist-info}/WHEEL +1 -1
- sky/benchmark/benchmark_state.py +0 -256
- sky/benchmark/benchmark_utils.py +0 -641
- sky/clouds/service_catalog/constants.py +0 -7
- sky/dashboard/out/_next/static/LksQgChY5izXjokL3LcEu/_buildManifest.js +0 -1
- sky/dashboard/out/_next/static/chunks/236-f49500b82ad5392d.js +0 -6
- sky/dashboard/out/_next/static/chunks/312-c3c8845990db8ffc.js +0 -15
- sky/dashboard/out/_next/static/chunks/37-0a572fe0dbb89c4d.js +0 -6
- sky/dashboard/out/_next/static/chunks/678-206dddca808e6d16.js +0 -59
- sky/dashboard/out/_next/static/chunks/845-0f8017370869e269.js +0 -1
- sky/dashboard/out/_next/static/chunks/979-7bf73a4c7cea0f5c.js +0 -1
- sky/dashboard/out/_next/static/chunks/fd9d1056-2821b0f0cabcd8bd.js +0 -1
- sky/dashboard/out/_next/static/chunks/framework-87d061ee6ed71b28.js +0 -33
- sky/dashboard/out/_next/static/chunks/main-app-241eb28595532291.js +0 -1
- sky/dashboard/out/_next/static/chunks/main-e0e2335212e72357.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/_app-e6b013bc3f77ad60.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/_error-1be831200e60c5c0.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-e15db85d0ea1fbe1.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-f383db7389368ea7.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/clusters-a93b93e10b8b074e.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/index-f9f039532ca8cbc4.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-03f279c6741fb48b.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/jobs-a75029b67aab6a2e.js +0 -1
- sky/dashboard/out/_next/static/chunks/webpack-830f59b8404e96b8.js +0 -1
- sky/dashboard/out/_next/static/css/c6933bbb2ce7f4dd.css +0 -3
- sky/jobs/dashboard/dashboard.py +0 -223
- sky/jobs/dashboard/static/favicon.ico +0 -0
- sky/jobs/dashboard/templates/index.html +0 -831
- sky/jobs/server/dashboard_utils.py +0 -69
- sky/skylet/providers/scp/__init__.py +0 -2
- sky/skylet/providers/scp/config.py +0 -149
- sky/skylet/providers/scp/node_provider.py +0 -578
- sky/templates/kubernetes-ssh-jump.yml.j2 +0 -94
- sky/utils/db_utils.py +0 -100
- sky/utils/kubernetes/deploy_remote_cluster.sh +0 -308
- sky/utils/kubernetes/ssh_jump_lifecycle_manager.py +0 -191
- skypilot_nightly-1.0.0.dev20250509.dist-info/METADATA +0 -361
- skypilot_nightly-1.0.0.dev20250509.dist-info/RECORD +0 -396
- /sky/{clouds/service_catalog → catalog}/config.py +0 -0
- /sky/{benchmark → catalog/data_fetchers}/__init__.py +0 -0
- /sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_azure.py +0 -0
- /sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_fluidstack.py +0 -0
- /sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_ibm.py +0 -0
- /sky/{clouds/service_catalog/data_fetchers → client/cli}/__init__.py +0 -0
- /sky/dashboard/out/_next/static/{LksQgChY5izXjokL3LcEu → zB0ed6ge_W1MDszVHhijS}/_ssgManifest.js +0 -0
- {skypilot_nightly-1.0.0.dev20250509.dist-info → skypilot_nightly-1.0.0.dev20251107.dist-info}/entry_points.txt +0 -0
- {skypilot_nightly-1.0.0.dev20250509.dist-info → skypilot_nightly-1.0.0.dev20251107.dist-info}/licenses/LICENSE +0 -0
- {skypilot_nightly-1.0.0.dev20250509.dist-info → skypilot_nightly-1.0.0.dev20251107.dist-info}/top_level.txt +0 -0
sky/core.py
CHANGED
|
@@ -1,6 +1,4 @@
|
|
|
1
1
|
"""SDK functions for cluster/job management."""
|
|
2
|
-
import os
|
|
3
|
-
import shlex
|
|
4
2
|
import typing
|
|
5
3
|
from typing import Any, Dict, List, Optional, Tuple, Union
|
|
6
4
|
|
|
@@ -8,7 +6,7 @@ import colorama
|
|
|
8
6
|
|
|
9
7
|
from sky import admin_policy
|
|
10
8
|
from sky import backends
|
|
11
|
-
from sky import
|
|
9
|
+
from sky import catalog
|
|
12
10
|
from sky import clouds
|
|
13
11
|
from sky import dag as dag_lib
|
|
14
12
|
from sky import data
|
|
@@ -17,21 +15,26 @@ from sky import global_user_state
|
|
|
17
15
|
from sky import models
|
|
18
16
|
from sky import optimizer
|
|
19
17
|
from sky import sky_logging
|
|
18
|
+
from sky import skypilot_config
|
|
20
19
|
from sky import task as task_lib
|
|
20
|
+
from sky.adaptors import common as adaptors_common
|
|
21
21
|
from sky.backends import backend_utils
|
|
22
|
+
from sky.backends import cloud_vm_ray_backend
|
|
22
23
|
from sky.clouds import cloud as sky_cloud
|
|
23
|
-
from sky.clouds import service_catalog
|
|
24
24
|
from sky.jobs.server import core as managed_jobs_core
|
|
25
25
|
from sky.provision.kubernetes import constants as kubernetes_constants
|
|
26
26
|
from sky.provision.kubernetes import utils as kubernetes_utils
|
|
27
|
+
from sky.schemas.api import responses
|
|
28
|
+
from sky.server.requests import request_names
|
|
29
|
+
from sky.skylet import autostop_lib
|
|
27
30
|
from sky.skylet import constants
|
|
28
31
|
from sky.skylet import job_lib
|
|
29
|
-
from sky.skylet import log_lib
|
|
30
32
|
from sky.usage import usage_lib
|
|
31
33
|
from sky.utils import admin_policy_utils
|
|
32
34
|
from sky.utils import common
|
|
33
35
|
from sky.utils import common_utils
|
|
34
36
|
from sky.utils import controller_utils
|
|
37
|
+
from sky.utils import resources_utils
|
|
35
38
|
from sky.utils import rich_utils
|
|
36
39
|
from sky.utils import status_lib
|
|
37
40
|
from sky.utils import subprocess_utils
|
|
@@ -40,6 +43,9 @@ from sky.utils.kubernetes import kubernetes_deploy_utils
|
|
|
40
43
|
|
|
41
44
|
if typing.TYPE_CHECKING:
|
|
42
45
|
from sky import resources as resources_lib
|
|
46
|
+
from sky.schemas.generated import jobsv1_pb2
|
|
47
|
+
else:
|
|
48
|
+
jobsv1_pb2 = adaptors_common.LazyImport('sky.schemas.generated.jobsv1_pb2')
|
|
43
49
|
|
|
44
50
|
logger = sky_logging.init_logger(__name__)
|
|
45
51
|
|
|
@@ -78,14 +84,15 @@ def optimize(
|
|
|
78
84
|
# is shown on `sky launch`. The optimizer is also invoked during failover,
|
|
79
85
|
# but we do not apply the admin policy there. We should apply the admin
|
|
80
86
|
# policy in the optimizer, but that will require some refactoring.
|
|
81
|
-
|
|
82
|
-
|
|
83
|
-
|
|
84
|
-
|
|
85
|
-
|
|
86
|
-
|
|
87
|
-
|
|
88
|
-
|
|
87
|
+
with admin_policy_utils.apply_and_use_config_in_current_request(
|
|
88
|
+
dag,
|
|
89
|
+
request_name=request_names.AdminPolicyRequestName.OPTIMIZE,
|
|
90
|
+
request_options=request_options) as dag:
|
|
91
|
+
dag.resolve_and_validate_volumes()
|
|
92
|
+
return optimizer.Optimizer.optimize(dag=dag,
|
|
93
|
+
minimize=minimize,
|
|
94
|
+
blocked_resources=blocked_resources,
|
|
95
|
+
quiet=quiet)
|
|
89
96
|
|
|
90
97
|
|
|
91
98
|
@usage_lib.entrypoint
|
|
@@ -93,7 +100,10 @@ def status(
|
|
|
93
100
|
cluster_names: Optional[Union[str, List[str]]] = None,
|
|
94
101
|
refresh: common.StatusRefreshMode = common.StatusRefreshMode.NONE,
|
|
95
102
|
all_users: bool = False,
|
|
96
|
-
|
|
103
|
+
include_credentials: bool = False,
|
|
104
|
+
summary_response: bool = False,
|
|
105
|
+
include_handle: bool = True,
|
|
106
|
+
) -> List[responses.StatusResponse]:
|
|
97
107
|
# NOTE(dev): Keep the docstring consistent between the Python API and CLI.
|
|
98
108
|
"""Gets cluster statuses.
|
|
99
109
|
|
|
@@ -160,22 +170,37 @@ def status(
|
|
|
160
170
|
provided, all clusters will be queried.
|
|
161
171
|
refresh: whether to query the latest cluster statuses from the cloud
|
|
162
172
|
provider(s).
|
|
173
|
+
include_credentials: whether to fetch ssh credentials for cluster
|
|
174
|
+
(credentials field in responses.StatusResponse)
|
|
163
175
|
|
|
164
176
|
Returns:
|
|
165
177
|
A list of dicts, with each dict containing the information of a
|
|
166
178
|
cluster. If a cluster is found to be terminated or not found, it will
|
|
167
179
|
be omitted from the returned list.
|
|
168
180
|
"""
|
|
169
|
-
clusters = backend_utils.get_clusters(
|
|
170
|
-
|
|
171
|
-
|
|
172
|
-
|
|
181
|
+
clusters = backend_utils.get_clusters(
|
|
182
|
+
refresh=refresh,
|
|
183
|
+
cluster_names=cluster_names,
|
|
184
|
+
all_users=all_users,
|
|
185
|
+
include_credentials=include_credentials,
|
|
186
|
+
summary_response=summary_response,
|
|
187
|
+
include_handle=include_handle)
|
|
188
|
+
|
|
189
|
+
status_responses = []
|
|
190
|
+
for cluster in clusters:
|
|
191
|
+
try:
|
|
192
|
+
status_responses.append(
|
|
193
|
+
responses.StatusResponse.model_validate(cluster))
|
|
194
|
+
except Exception as e: # pylint: disable=broad-except
|
|
195
|
+
logger.warning('Failed to validate status responses for cluster '
|
|
196
|
+
f'{cluster.get("name")}: {e}')
|
|
197
|
+
return status_responses
|
|
173
198
|
|
|
174
199
|
|
|
175
200
|
def status_kubernetes(
|
|
176
201
|
) -> Tuple[List['kubernetes_utils.KubernetesSkyPilotClusterInfoPayload'],
|
|
177
202
|
List['kubernetes_utils.KubernetesSkyPilotClusterInfoPayload'],
|
|
178
|
-
List[
|
|
203
|
+
List[responses.ManagedJobRecord], Optional[str]]:
|
|
179
204
|
"""Gets all SkyPilot clusters and jobs in the Kubernetes cluster.
|
|
180
205
|
|
|
181
206
|
Managed jobs and services are also included in the clusters returned.
|
|
@@ -250,6 +275,7 @@ all_clusters, unmanaged_clusters, all_jobs, context
|
|
|
250
275
|
kubernetes_utils.KubernetesSkyPilotClusterInfoPayload.from_cluster(c)
|
|
251
276
|
for c in unmanaged_clusters
|
|
252
277
|
]
|
|
278
|
+
all_jobs = [responses.ManagedJobRecord(**job) for job in all_jobs]
|
|
253
279
|
return all_clusters, unmanaged_clusters, all_jobs, context
|
|
254
280
|
|
|
255
281
|
|
|
@@ -262,22 +288,26 @@ def endpoints(cluster: str,
|
|
|
262
288
|
port: The port number to get the endpoint for. If None, endpoints
|
|
263
289
|
for all ports are returned..
|
|
264
290
|
|
|
265
|
-
Returns: A dictionary of port numbers to endpoints. If
|
|
291
|
+
Returns: A dictionary of port numbers to endpoints. If port is None,
|
|
266
292
|
the dictionary will contain all ports:endpoints exposed on the cluster.
|
|
267
293
|
|
|
268
294
|
Raises:
|
|
269
|
-
|
|
295
|
+
ValueError: if the cluster is not UP or the endpoint is not exposed.
|
|
270
296
|
RuntimeError: if the cluster has no ports to be exposed or no endpoints
|
|
271
297
|
are exposed yet.
|
|
272
298
|
"""
|
|
273
299
|
with rich_utils.safe_status(
|
|
274
300
|
ux_utils.spinner_message(
|
|
275
301
|
f'Fetching endpoints for cluster {cluster}')):
|
|
276
|
-
|
|
302
|
+
result = backend_utils.get_endpoints(cluster=cluster, port=port)
|
|
303
|
+
return result
|
|
277
304
|
|
|
278
305
|
|
|
279
306
|
@usage_lib.entrypoint
|
|
280
|
-
def cost_report(
|
|
307
|
+
def cost_report(
|
|
308
|
+
days: Optional[int] = None,
|
|
309
|
+
dashboard_summary_response: bool = False,
|
|
310
|
+
cluster_hashes: Optional[List[str]] = None) -> List[Dict[str, Any]]:
|
|
281
311
|
# NOTE(dev): Keep the docstring consistent between the Python API and CLI.
|
|
282
312
|
"""Get all cluster cost reports, including those that have been downed.
|
|
283
313
|
|
|
@@ -295,6 +325,13 @@ def cost_report() -> List[Dict[str, Any]]:
|
|
|
295
325
|
'cluster_hash': (str) unique hash identifying cluster,
|
|
296
326
|
'usage_intervals': (List[Tuple[int, int]]) cluster usage times,
|
|
297
327
|
'total_cost': (float) cost given resources and usage intervals,
|
|
328
|
+
'cloud': (str) cloud of the cluster,
|
|
329
|
+
'region': (str) region of the cluster,
|
|
330
|
+
'cpus': (str) number of vCPUs of the cluster,
|
|
331
|
+
'memory': (str) memory of the cluster,
|
|
332
|
+
'accelerators': (str) accelerators of the cluster,
|
|
333
|
+
'resources_str': (str) resources string of the cluster,
|
|
334
|
+
'resources_str_full': (str) full resources string of the cluster,
|
|
298
335
|
}
|
|
299
336
|
|
|
300
337
|
The estimated cost column indicates price for the cluster based on the type
|
|
@@ -304,25 +341,103 @@ def cost_report() -> List[Dict[str, Any]]:
|
|
|
304
341
|
cache of the cluster status, and may not be accurate for the cluster with
|
|
305
342
|
autostop/use_spot set or terminated/stopped on the cloud console.
|
|
306
343
|
|
|
344
|
+
Args:
|
|
345
|
+
days: Number of days to look back from now. Active clusters are always
|
|
346
|
+
included. Historical clusters are only included if they were last
|
|
347
|
+
used within the past 'days' days. Defaults to 30 days.
|
|
348
|
+
|
|
307
349
|
Returns:
|
|
308
350
|
A list of dicts, with each dict containing the cost information of a
|
|
309
351
|
cluster.
|
|
310
352
|
"""
|
|
311
|
-
|
|
353
|
+
if days is None:
|
|
354
|
+
days = constants.COST_REPORT_DEFAULT_DAYS
|
|
312
355
|
|
|
313
|
-
|
|
314
|
-
duration = cluster_report['duration']
|
|
315
|
-
launched_nodes = cluster_report['num_nodes']
|
|
316
|
-
launched_resources = cluster_report['resources']
|
|
356
|
+
abbreviate_response = dashboard_summary_response and cluster_hashes is None
|
|
317
357
|
|
|
318
|
-
|
|
319
|
-
|
|
358
|
+
cluster_reports = global_user_state.get_clusters_from_history(
|
|
359
|
+
days=days,
|
|
360
|
+
abbreviate_response=abbreviate_response,
|
|
361
|
+
cluster_hashes=cluster_hashes)
|
|
362
|
+
logger.debug(
|
|
363
|
+
f'{len(cluster_reports)} clusters found from history with {days} days.')
|
|
320
364
|
|
|
321
|
-
|
|
322
|
-
|
|
323
|
-
|
|
324
|
-
|
|
325
|
-
|
|
365
|
+
def _process_cluster_report(
|
|
366
|
+
cluster_report: Dict[str, Any]) -> Dict[str, Any]:
|
|
367
|
+
"""Process cluster report by calculating cost and adding fields."""
|
|
368
|
+
# Make a copy to avoid modifying the original
|
|
369
|
+
report = cluster_report.copy()
|
|
370
|
+
|
|
371
|
+
def get_total_cost(cluster_report: dict) -> float:
|
|
372
|
+
duration = cluster_report['duration']
|
|
373
|
+
launched_nodes = cluster_report['num_nodes']
|
|
374
|
+
launched_resources = cluster_report['resources']
|
|
375
|
+
|
|
376
|
+
cost = (launched_resources.get_cost(duration) * launched_nodes)
|
|
377
|
+
return cost
|
|
378
|
+
|
|
379
|
+
try:
|
|
380
|
+
report['total_cost'] = get_total_cost(report)
|
|
381
|
+
except Exception as e: # pylint: disable=broad-except
|
|
382
|
+
# Ok to skip the total cost as this is just for display purposes.
|
|
383
|
+
logger.warning(f'Failed to get total cost for cluster '
|
|
384
|
+
f'{report["name"]}: {str(e)}')
|
|
385
|
+
report['total_cost'] = 0.0
|
|
386
|
+
|
|
387
|
+
return report
|
|
388
|
+
|
|
389
|
+
# Process clusters in parallel
|
|
390
|
+
if not cluster_reports:
|
|
391
|
+
return []
|
|
392
|
+
|
|
393
|
+
if not abbreviate_response:
|
|
394
|
+
cluster_reports = subprocess_utils.run_in_parallel(
|
|
395
|
+
_process_cluster_report, cluster_reports)
|
|
396
|
+
|
|
397
|
+
def _update_record_with_resources(record: Dict[str, Any]) -> None:
|
|
398
|
+
"""Add resource fields for dashboard compatibility."""
|
|
399
|
+
if record is None:
|
|
400
|
+
return
|
|
401
|
+
resources = record.get('resources')
|
|
402
|
+
if resources is None:
|
|
403
|
+
return
|
|
404
|
+
if not dashboard_summary_response:
|
|
405
|
+
fields = ['cloud', 'region', 'cpus', 'memory', 'accelerators']
|
|
406
|
+
else:
|
|
407
|
+
fields = ['cloud']
|
|
408
|
+
for field in fields:
|
|
409
|
+
try:
|
|
410
|
+
record[field] = str(getattr(resources, field))
|
|
411
|
+
except Exception as e: # pylint: disable=broad-except
|
|
412
|
+
# Ok to skip the fields as this is just for display
|
|
413
|
+
# purposes.
|
|
414
|
+
logger.debug(f'Failed to get resources.{field} for cluster '
|
|
415
|
+
f'{record["name"]}: {str(e)}')
|
|
416
|
+
record[field] = None
|
|
417
|
+
|
|
418
|
+
# Add resources_str and resources_str_full for dashboard
|
|
419
|
+
# compatibility
|
|
420
|
+
num_nodes = record.get('num_nodes', 1)
|
|
421
|
+
try:
|
|
422
|
+
resource_str_simple, resource_str_full = (
|
|
423
|
+
resources_utils.format_resource(resources,
|
|
424
|
+
simplified_only=False))
|
|
425
|
+
record['resources_str'] = f'{num_nodes}x{resource_str_simple}'
|
|
426
|
+
record['resources_str_full'] = f'{num_nodes}x{resource_str_full}'
|
|
427
|
+
except Exception as e: # pylint: disable=broad-except
|
|
428
|
+
logger.debug(f'Failed to get resources_str for cluster '
|
|
429
|
+
f'{record["name"]}: {str(e)}')
|
|
430
|
+
for field in fields:
|
|
431
|
+
record[field] = None
|
|
432
|
+
record['resources_str'] = '-'
|
|
433
|
+
record['resources_str_full'] = '-'
|
|
434
|
+
|
|
435
|
+
for report in cluster_reports:
|
|
436
|
+
_update_record_with_resources(report)
|
|
437
|
+
if dashboard_summary_response:
|
|
438
|
+
report.pop('usage_intervals')
|
|
439
|
+
report.pop('user_hash')
|
|
440
|
+
report.pop('resources')
|
|
326
441
|
|
|
327
442
|
return cluster_reports
|
|
328
443
|
|
|
@@ -330,6 +445,8 @@ def cost_report() -> List[Dict[str, Any]]:
|
|
|
330
445
|
def _start(
|
|
331
446
|
cluster_name: str,
|
|
332
447
|
idle_minutes_to_autostop: Optional[int] = None,
|
|
448
|
+
wait_for: Optional[autostop_lib.AutostopWaitFor] = (
|
|
449
|
+
autostop_lib.DEFAULT_AUTOSTOP_WAIT_FOR),
|
|
333
450
|
retry_until_up: bool = False,
|
|
334
451
|
down: bool = False, # pylint: disable=redefined-outer-name
|
|
335
452
|
force: bool = False,
|
|
@@ -369,9 +486,18 @@ def _start(
|
|
|
369
486
|
'supported when starting SkyPilot controllers. To '
|
|
370
487
|
f'fix: omit the {arguments_str} to use the '
|
|
371
488
|
f'default autostop settings from config.')
|
|
372
|
-
|
|
373
|
-
|
|
374
|
-
|
|
489
|
+
|
|
490
|
+
# Get the autostop resources, from which we extract the correct autostop
|
|
491
|
+
# config.
|
|
492
|
+
controller_resources = controller_utils.get_controller_resources(
|
|
493
|
+
controller, [])
|
|
494
|
+
# All resources should have the same autostop config.
|
|
495
|
+
controller_autostop_config = list(
|
|
496
|
+
controller_resources)[0].autostop_config
|
|
497
|
+
if (controller_autostop_config is not None and
|
|
498
|
+
controller_autostop_config.enabled):
|
|
499
|
+
idle_minutes_to_autostop = controller_autostop_config.idle_minutes
|
|
500
|
+
down = controller_autostop_config.down
|
|
375
501
|
|
|
376
502
|
usage_lib.record_cluster_name_for_current_operation(cluster_name)
|
|
377
503
|
|
|
@@ -391,7 +517,7 @@ def _start(
|
|
|
391
517
|
all_file_mounts=None,
|
|
392
518
|
storage_mounts=storage_mounts)
|
|
393
519
|
if idle_minutes_to_autostop is not None:
|
|
394
|
-
backend.set_autostop(handle, idle_minutes_to_autostop, down
|
|
520
|
+
backend.set_autostop(handle, idle_minutes_to_autostop, wait_for, down)
|
|
395
521
|
return handle
|
|
396
522
|
|
|
397
523
|
|
|
@@ -399,6 +525,8 @@ def _start(
|
|
|
399
525
|
def start(
|
|
400
526
|
cluster_name: str,
|
|
401
527
|
idle_minutes_to_autostop: Optional[int] = None,
|
|
528
|
+
wait_for: Optional[autostop_lib.AutostopWaitFor] = (
|
|
529
|
+
autostop_lib.DEFAULT_AUTOSTOP_WAIT_FOR),
|
|
402
530
|
retry_until_up: bool = False,
|
|
403
531
|
down: bool = False, # pylint: disable=redefined-outer-name
|
|
404
532
|
force: bool = False,
|
|
@@ -453,6 +581,7 @@ def start(
|
|
|
453
581
|
'`idle_minutes_to_autostop` must be set if `down` is True.')
|
|
454
582
|
return _start(cluster_name,
|
|
455
583
|
idle_minutes_to_autostop,
|
|
584
|
+
wait_for,
|
|
456
585
|
retry_until_up,
|
|
457
586
|
down,
|
|
458
587
|
force=force)
|
|
@@ -463,7 +592,10 @@ def _stop_not_supported_message(resources: 'resources_lib.Resources') -> str:
|
|
|
463
592
|
message = ('Stopping spot instances is currently not supported on '
|
|
464
593
|
f'{resources.cloud}')
|
|
465
594
|
else:
|
|
466
|
-
|
|
595
|
+
cloud_name = resources.cloud.display_name(
|
|
596
|
+
) if resources.cloud else resources.cloud
|
|
597
|
+
message = ('Stopping is currently not supported for '
|
|
598
|
+
f'{cloud_name}')
|
|
467
599
|
return message
|
|
468
600
|
|
|
469
601
|
|
|
@@ -539,6 +671,11 @@ def stop(cluster_name: str, purge: bool = False) -> None:
|
|
|
539
671
|
raise exceptions.ClusterDoesNotExist(
|
|
540
672
|
f'Cluster {cluster_name!r} does not exist.')
|
|
541
673
|
|
|
674
|
+
global_user_state.add_cluster_event(
|
|
675
|
+
cluster_name, status_lib.ClusterStatus.STOPPED,
|
|
676
|
+
'Cluster was stopped by user.',
|
|
677
|
+
global_user_state.ClusterEventType.STATUS_CHANGE)
|
|
678
|
+
|
|
542
679
|
backend = backend_utils.get_backend_from_handle(handle)
|
|
543
680
|
|
|
544
681
|
if isinstance(backend, backends.CloudVmRayBackend):
|
|
@@ -566,6 +703,8 @@ def stop(cluster_name: str, purge: bool = False) -> None:
|
|
|
566
703
|
def autostop(
|
|
567
704
|
cluster_name: str,
|
|
568
705
|
idle_minutes: int,
|
|
706
|
+
wait_for: Optional[autostop_lib.AutostopWaitFor] = autostop_lib.
|
|
707
|
+
DEFAULT_AUTOSTOP_WAIT_FOR,
|
|
569
708
|
down: bool = False, # pylint: disable=redefined-outer-name
|
|
570
709
|
) -> None:
|
|
571
710
|
# NOTE(dev): Keep the docstring consistent between the Python API and CLI.
|
|
@@ -627,29 +766,26 @@ def autostop(
|
|
|
627
766
|
)
|
|
628
767
|
backend = backend_utils.get_backend_from_handle(handle)
|
|
629
768
|
|
|
769
|
+
resources = handle.launched_resources.assert_launchable()
|
|
630
770
|
# Check cloud supports stopping spot instances
|
|
631
|
-
cloud =
|
|
632
|
-
assert cloud is not None, handle
|
|
771
|
+
cloud = resources.cloud
|
|
633
772
|
|
|
634
773
|
if not isinstance(backend, backends.CloudVmRayBackend):
|
|
635
774
|
raise exceptions.NotSupportedError(
|
|
636
775
|
f'{operation} cluster {cluster_name!r} with backend '
|
|
637
776
|
f'{backend.__class__.__name__!r} is not supported.')
|
|
638
|
-
|
|
777
|
+
|
|
639
778
|
# Check if autostop/autodown is required and supported
|
|
640
779
|
if not is_cancel:
|
|
641
780
|
try:
|
|
642
781
|
if down:
|
|
643
782
|
cloud.check_features_are_supported(
|
|
644
|
-
|
|
645
|
-
{clouds.CloudImplementationFeatures.AUTODOWN})
|
|
783
|
+
resources, {clouds.CloudImplementationFeatures.AUTODOWN})
|
|
646
784
|
else:
|
|
647
785
|
cloud.check_features_are_supported(
|
|
648
|
-
|
|
649
|
-
{clouds.CloudImplementationFeatures.STOP})
|
|
786
|
+
resources, {clouds.CloudImplementationFeatures.STOP})
|
|
650
787
|
cloud.check_features_are_supported(
|
|
651
|
-
|
|
652
|
-
{clouds.CloudImplementationFeatures.AUTOSTOP})
|
|
788
|
+
resources, {clouds.CloudImplementationFeatures.AUTOSTOP})
|
|
653
789
|
except exceptions.NotSupportedError as e:
|
|
654
790
|
raise exceptions.NotSupportedError(
|
|
655
791
|
f'{colorama.Fore.YELLOW}{operation} on cluster '
|
|
@@ -658,7 +794,7 @@ def autostop(
|
|
|
658
794
|
f'see reason above.') from e
|
|
659
795
|
|
|
660
796
|
usage_lib.record_cluster_name_for_current_operation(cluster_name)
|
|
661
|
-
backend.set_autostop(handle, idle_minutes, down)
|
|
797
|
+
backend.set_autostop(handle, idle_minutes, wait_for, down)
|
|
662
798
|
|
|
663
799
|
|
|
664
800
|
# ==================
|
|
@@ -669,7 +805,7 @@ def autostop(
|
|
|
669
805
|
@usage_lib.entrypoint
|
|
670
806
|
def queue(cluster_name: str,
|
|
671
807
|
skip_finished: bool = False,
|
|
672
|
-
all_users: bool = False) -> List[
|
|
808
|
+
all_users: bool = False) -> List[responses.ClusterJobRecord]:
|
|
673
809
|
# NOTE(dev): Keep the docstring consistent between the Python API and CLI.
|
|
674
810
|
"""Gets the job queue of a cluster.
|
|
675
811
|
|
|
@@ -703,10 +839,10 @@ def queue(cluster_name: str,
|
|
|
703
839
|
exceptions.CommandError: if failed to get the job queue with ssh.
|
|
704
840
|
"""
|
|
705
841
|
all_jobs = not skip_finished
|
|
706
|
-
user_hash: Optional[str] = common_utils.get_user_hash()
|
|
707
842
|
if all_users:
|
|
708
843
|
user_hash = None
|
|
709
|
-
|
|
844
|
+
else:
|
|
845
|
+
user_hash = common_utils.get_current_user().id
|
|
710
846
|
|
|
711
847
|
handle = backend_utils.check_cluster_available(
|
|
712
848
|
cluster_name,
|
|
@@ -714,18 +850,49 @@ def queue(cluster_name: str,
|
|
|
714
850
|
)
|
|
715
851
|
backend = backend_utils.get_backend_from_handle(handle)
|
|
716
852
|
|
|
717
|
-
|
|
718
|
-
|
|
719
|
-
|
|
720
|
-
|
|
721
|
-
|
|
722
|
-
|
|
723
|
-
|
|
724
|
-
|
|
725
|
-
|
|
726
|
-
|
|
727
|
-
|
|
728
|
-
|
|
853
|
+
use_legacy = not handle.is_grpc_enabled_with_flag
|
|
854
|
+
|
|
855
|
+
if not use_legacy:
|
|
856
|
+
try:
|
|
857
|
+
request = jobsv1_pb2.GetJobQueueRequest(user_hash=user_hash,
|
|
858
|
+
all_jobs=all_jobs)
|
|
859
|
+
response = backend_utils.invoke_skylet_with_retries(
|
|
860
|
+
lambda: cloud_vm_ray_backend.SkyletClient(
|
|
861
|
+
handle.get_grpc_channel()).get_job_queue(request))
|
|
862
|
+
jobs = []
|
|
863
|
+
for job_info in response.jobs:
|
|
864
|
+
job_dict = {
|
|
865
|
+
'job_id': job_info.job_id,
|
|
866
|
+
'job_name': job_info.job_name,
|
|
867
|
+
'submitted_at': job_info.submitted_at,
|
|
868
|
+
'status': job_lib.JobStatus.from_protobuf(job_info.status),
|
|
869
|
+
'run_timestamp': job_info.run_timestamp,
|
|
870
|
+
'start_at': job_info.start_at
|
|
871
|
+
if job_info.HasField('start_at') else None,
|
|
872
|
+
'end_at': job_info.end_at
|
|
873
|
+
if job_info.HasField('end_at') else None,
|
|
874
|
+
'resources': job_info.resources,
|
|
875
|
+
'log_path': job_info.log_path,
|
|
876
|
+
'user_hash': job_info.username,
|
|
877
|
+
}
|
|
878
|
+
# Copied from job_lib.load_job_queue.
|
|
879
|
+
user = global_user_state.get_user(job_dict['user_hash'])
|
|
880
|
+
job_dict['username'] = user.name if user is not None else None
|
|
881
|
+
jobs.append(job_dict)
|
|
882
|
+
except exceptions.SkyletMethodNotImplementedError:
|
|
883
|
+
use_legacy = True
|
|
884
|
+
if use_legacy:
|
|
885
|
+
code = job_lib.JobLibCodeGen.get_job_queue(user_hash, all_jobs)
|
|
886
|
+
returncode, jobs_payload, stderr = backend.run_on_head(
|
|
887
|
+
handle, code, require_outputs=True, separate_stderr=True)
|
|
888
|
+
subprocess_utils.handle_returncode(
|
|
889
|
+
returncode,
|
|
890
|
+
command=code,
|
|
891
|
+
error_msg=f'Failed to get job queue on cluster {cluster_name}.',
|
|
892
|
+
stderr=f'{jobs_payload + stderr}',
|
|
893
|
+
stream_logs=True)
|
|
894
|
+
jobs = job_lib.load_job_queue(jobs_payload)
|
|
895
|
+
return [responses.ClusterJobRecord.model_validate(job) for job in jobs]
|
|
729
896
|
|
|
730
897
|
|
|
731
898
|
@usage_lib.entrypoint
|
|
@@ -795,8 +962,10 @@ def cancel(
|
|
|
795
962
|
f'handle for cluster {cluster_name!r} should not be None')
|
|
796
963
|
|
|
797
964
|
backend = backend_utils.get_backend_from_handle(handle)
|
|
965
|
+
user_hash: Optional[str] = common_utils.get_current_user().id
|
|
798
966
|
|
|
799
967
|
if all_users:
|
|
968
|
+
user_hash = None
|
|
800
969
|
sky_logging.print(
|
|
801
970
|
f'{colorama.Fore.YELLOW}'
|
|
802
971
|
f'Cancelling all users\' jobs on cluster {cluster_name!r}...'
|
|
@@ -821,7 +990,7 @@ def cancel(
|
|
|
821
990
|
backend.cancel_jobs(handle,
|
|
822
991
|
job_ids,
|
|
823
992
|
cancel_all=all or all_users,
|
|
824
|
-
user_hash=
|
|
993
|
+
user_hash=user_hash)
|
|
825
994
|
|
|
826
995
|
|
|
827
996
|
@usage_lib.entrypoint
|
|
@@ -859,7 +1028,12 @@ def tail_logs(cluster_name: str,
|
|
|
859
1028
|
backend = backend_utils.get_backend_from_handle(handle)
|
|
860
1029
|
|
|
861
1030
|
usage_lib.record_cluster_name_for_current_operation(cluster_name)
|
|
862
|
-
|
|
1031
|
+
# Although tail_logs returns an int when require_outputs=False (default),
|
|
1032
|
+
# we need to check returnval as an int to avoid type checking errors.
|
|
1033
|
+
returnval = backend.tail_logs(handle, job_id, follow=follow, tail=tail)
|
|
1034
|
+
assert isinstance(returnval,
|
|
1035
|
+
int), (f'returnval must be an int, but got {returnval}')
|
|
1036
|
+
return returnval
|
|
863
1037
|
|
|
864
1038
|
|
|
865
1039
|
@usage_lib.entrypoint
|
|
@@ -958,25 +1132,25 @@ def job_status(cluster_name: str,
|
|
|
958
1132
|
# = Storage Management =
|
|
959
1133
|
# ======================
|
|
960
1134
|
@usage_lib.entrypoint
def storage_ls() -> List[responses.StorageRecord]:
    # NOTE(dev): Keep the docstring consistent between the Python API and CLI.
    """Gets the storages.

    Returns:
        List[responses.StorageRecord]: A list of storage records.
    """
    # Each entry from the user-state table is turned into one typed record.
    # The 'handle' value is popped from the entry (so the entry no longer
    # carries it afterwards) and only the keys of its `sky_stores` mapping
    # are exposed through the record's `store` field.
    return [
        responses.StorageRecord(
            name=entry['name'],
            launched_at=entry['launched_at'],
            store=list(entry.pop('handle').sky_stores.keys()),
            last_use=entry['last_use'],
            status=entry['status'],
        ) for entry in global_user_state.get_storage()
    ]
|
|
980
1154
|
|
|
981
1155
|
|
|
982
1156
|
@usage_lib.entrypoint
|
|
@@ -992,9 +1166,7 @@ def storage_delete(name: str) -> None:
|
|
|
992
1166
|
if handle is None:
|
|
993
1167
|
raise ValueError(f'Storage name {name!r} not found.')
|
|
994
1168
|
else:
|
|
995
|
-
storage_object = data.Storage(
|
|
996
|
-
source=handle.source,
|
|
997
|
-
sync_on_reconstruction=False)
|
|
1169
|
+
storage_object = data.Storage.from_handle(handle)
|
|
998
1170
|
storage_object.delete()
|
|
999
1171
|
|
|
1000
1172
|
|
|
@@ -1002,20 +1174,49 @@ def storage_delete(name: str) -> None:
|
|
|
1002
1174
|
# = Catalog Observe =
|
|
1003
1175
|
# ===================
|
|
1004
1176
|
@usage_lib.entrypoint
def enabled_clouds(workspace: Optional[str] = None,
                   expand: bool = False) -> List[str]:
    """Lists the compute-enabled clouds cached for a workspace.

    Args:
        workspace: Workspace to look up. If None, the currently active
            workspace is used.
        expand: If False, return one canonical name per cached cloud. If
            True, return the infras produced by each cloud's
            ``expand_infras()`` instead.

    Returns:
        A list of strings. When ``expand`` is True the result is ordered as
        SSH infras, then Kubernetes infras, then all other clouds' infras,
        with each group sorted independently.
    """
    if workspace is None:
        workspace = skypilot_config.get_active_workspace()
    cached = global_user_state.get_cached_enabled_clouds(
        sky_cloud.CloudCapability.COMPUTE, workspace=workspace)

    with skypilot_config.local_active_workspace_ctx(workspace):
        if not expand:
            return [cloud.canonical_name() for cloud in cached]

        # Three buckets so that each group can be sorted on its own before
        # being concatenated in a fixed group order.
        ssh_infras = []
        k8s_infras = []
        other_infras = []
        for cloud in cached:
            infras = cloud.expand_infras()
            if isinstance(cloud, clouds.SSH):
                bucket = ssh_infras
            elif isinstance(cloud, clouds.Kubernetes):
                bucket = k8s_infras
            else:
                bucket = other_infras
            bucket.extend(infras)

        return sorted(ssh_infras) + sorted(k8s_infras) + sorted(other_infras)
|
|
1008
1200
|
|
|
1009
1201
|
|
|
1010
1202
|
@usage_lib.entrypoint
|
|
1011
1203
|
def realtime_kubernetes_gpu_availability(
|
|
1012
1204
|
context: Optional[str] = None,
|
|
1013
1205
|
name_filter: Optional[str] = None,
|
|
1014
|
-
quantity_filter: Optional[int] = None
|
|
1206
|
+
quantity_filter: Optional[int] = None,
|
|
1207
|
+
is_ssh: Optional[bool] = None
|
|
1015
1208
|
) -> List[Tuple[str, List[models.RealtimeGpuAvailability]]]:
|
|
1016
1209
|
|
|
1017
1210
|
if context is None:
|
|
1018
|
-
|
|
1211
|
+
# Include contexts from both Kubernetes and SSH clouds
|
|
1212
|
+
kubernetes_contexts = clouds.Kubernetes.existing_allowed_contexts()
|
|
1213
|
+
ssh_contexts = clouds.SSH.existing_allowed_contexts()
|
|
1214
|
+
if is_ssh is None:
|
|
1215
|
+
context_list = kubernetes_contexts + ssh_contexts
|
|
1216
|
+
elif is_ssh:
|
|
1217
|
+
context_list = ssh_contexts
|
|
1218
|
+
else:
|
|
1219
|
+
context_list = kubernetes_contexts
|
|
1019
1220
|
else:
|
|
1020
1221
|
context_list = [context]
|
|
1021
1222
|
|
|
@@ -1024,9 +1225,9 @@ def realtime_kubernetes_gpu_availability(
|
|
|
1024
1225
|
name_filter: Optional[str] = None,
|
|
1025
1226
|
quantity_filter: Optional[int] = None
|
|
1026
1227
|
) -> List[models.RealtimeGpuAvailability]:
|
|
1027
|
-
counts, capacity, available =
|
|
1228
|
+
counts, capacity, available = catalog.list_accelerator_realtime(
|
|
1028
1229
|
gpus_only=True,
|
|
1029
|
-
clouds='kubernetes',
|
|
1230
|
+
clouds='ssh' if is_ssh else 'kubernetes',
|
|
1030
1231
|
name_filter=name_filter,
|
|
1031
1232
|
region_filter=context,
|
|
1032
1233
|
quantity_filter=quantity_filter,
|
|
@@ -1058,16 +1259,19 @@ def realtime_kubernetes_gpu_availability(
|
|
|
1058
1259
|
name_filter=name_filter,
|
|
1059
1260
|
quantity_filter=quantity_filter), context_list)
|
|
1060
1261
|
|
|
1262
|
+
cloud_identity = 'ssh' if is_ssh else 'kubernetes'
|
|
1263
|
+
cloud_identity_capital = 'SSH' if is_ssh else 'Kubernetes'
|
|
1264
|
+
|
|
1061
1265
|
for ctx, queried in zip(context_list, parallel_queried):
|
|
1062
1266
|
cumulative_count += len(queried)
|
|
1063
1267
|
if len(queried) == 0:
|
|
1064
1268
|
# don't add gpu results for clusters that don't have any
|
|
1065
|
-
logger.debug(f'No gpus found in
|
|
1269
|
+
logger.debug(f'No gpus found in {cloud_identity} cluster {ctx}')
|
|
1066
1270
|
continue
|
|
1067
1271
|
availability_lists.append((ctx, queried))
|
|
1068
1272
|
|
|
1069
1273
|
if cumulative_count == 0:
|
|
1070
|
-
err_msg = 'No GPUs found in any
|
|
1274
|
+
err_msg = f'No GPUs found in any {cloud_identity_capital} clusters. '
|
|
1071
1275
|
debug_msg = 'To further debug, run: sky check '
|
|
1072
1276
|
if name_filter is not None:
|
|
1073
1277
|
gpu_info_msg = f' {name_filter!r}'
|
|
@@ -1075,9 +1279,9 @@ def realtime_kubernetes_gpu_availability(
|
|
|
1075
1279
|
gpu_info_msg += (' with requested quantity'
|
|
1076
1280
|
f' {quantity_filter}')
|
|
1077
1281
|
err_msg = (f'Resources{gpu_info_msg} not found '
|
|
1078
|
-
'in
|
|
1079
|
-
debug_msg = ('To show available accelerators on
|
|
1080
|
-
' run: sky show-gpus --cloud
|
|
1282
|
+
f'in {cloud_identity_capital} clusters. ')
|
|
1283
|
+
debug_msg = (f'To show available accelerators on {cloud_identity}, '
|
|
1284
|
+
f' run: sky show-gpus --cloud {cloud_identity} ')
|
|
1081
1285
|
full_err_msg = (err_msg + kubernetes_constants.NO_GPU_HELP_MESSAGE +
|
|
1082
1286
|
debug_msg)
|
|
1083
1287
|
raise ValueError(full_err_msg)
|
|
@@ -1094,7 +1298,9 @@ def local_up(gpus: bool,
|
|
|
1094
1298
|
ssh_key: Optional[str],
|
|
1095
1299
|
cleanup: bool,
|
|
1096
1300
|
context_name: Optional[str] = None,
|
|
1097
|
-
password: Optional[str] = None
|
|
1301
|
+
password: Optional[str] = None,
|
|
1302
|
+
name: Optional[str] = None,
|
|
1303
|
+
port_start: Optional[int] = None) -> None:
|
|
1098
1304
|
"""Creates a local or remote cluster."""
|
|
1099
1305
|
|
|
1100
1306
|
def _validate_args(ips, ssh_user, ssh_key, cleanup):
|
|
@@ -1124,54 +1330,58 @@ def local_up(gpus: bool,
|
|
|
1124
1330
|
password)
|
|
1125
1331
|
else:
|
|
1126
1332
|
# Run local deployment (kind) if no remote args are specified
|
|
1127
|
-
kubernetes_deploy_utils.deploy_local_cluster(gpus)
|
|
1333
|
+
kubernetes_deploy_utils.deploy_local_cluster(name, port_start, gpus)
|
|
1128
1334
|
|
|
1129
1335
|
|
|
1130
|
-
def local_down() -> None:
|
|
1336
|
+
def local_down(name: Optional[str] = None) -> None:
    """Tears down the Kubernetes cluster started by local_up.

    Args:
        name: Forwarded unchanged to
            ``kubernetes_deploy_utils.teardown_local_cluster``; defaults to
            None.
    """
    # All teardown work is delegated to the deploy utilities.
    kubernetes_deploy_utils.teardown_local_cluster(name)
|
|
1133
1339
|
|
|
1134
|
-
path_to_package = os.path.dirname(__file__)
|
|
1135
|
-
down_script_path = os.path.join(path_to_package, 'utils/kubernetes',
|
|
1136
|
-
'delete_cluster.sh')
|
|
1137
1340
|
|
|
1138
|
-
|
|
1139
|
-
|
|
1341
|
+
@usage_lib.entrypoint
def ssh_up(infra: Optional[str] = None, cleanup: bool = False) -> None:
    """Deploys or tears down a Kubernetes cluster on SSH targets.

    Args:
        infra: Name of the cluster configuration in ssh_node_pools.yaml.
            If None, the first cluster in the file is used.
        cleanup: If True, clean up the cluster instead of deploying.
    """
    # Both arguments are passed through by keyword to the deploy helper.
    kubernetes_deploy_utils.deploy_ssh_cluster(infra=infra, cleanup=cleanup)
|
|
1145
1354
|
|
|
1146
|
-
|
|
1147
|
-
|
|
1148
|
-
|
|
1149
|
-
|
|
1150
|
-
|
|
1151
|
-
|
|
1152
|
-
|
|
1153
|
-
|
|
1154
|
-
|
|
1155
|
-
|
|
1156
|
-
|
|
1157
|
-
|
|
1158
|
-
|
|
1159
|
-
|
|
1160
|
-
|
|
1161
|
-
|
|
1162
|
-
|
|
1163
|
-
|
|
1164
|
-
|
|
1165
|
-
|
|
1166
|
-
|
|
1167
|
-
|
|
1168
|
-
|
|
1169
|
-
|
|
1170
|
-
|
|
1171
|
-
|
|
1172
|
-
|
|
1173
|
-
|
|
1174
|
-
|
|
1175
|
-
|
|
1176
|
-
|
|
1177
|
-
|
|
1355
|
+
|
|
1356
|
+
@usage_lib.entrypoint
def ssh_status(context_name: str) -> Tuple[bool, str]:
    """Check the status of an SSH Node Pool context.

    Args:
        context_name: The SSH context name (e.g., 'ssh-my-cluster')

    Returns:
        Tuple[bool, str]: (is_ready, reason)
            - is_ready: True if the SSH Node Pool is ready, False otherwise
            - reason: Explanation of the status

        Any exception raised while checking the context is not propagated;
        it is reported as ``(False, <formatted error message>)``.
    """
    try:
        # Unpack inside the try so that a malformed result is also reported
        # as a failed check rather than raised to the caller.
        ready, detail = clouds.SSH.check_single_context(context_name)
    except Exception as e:  # pylint: disable=broad-except
        failure = ('Failed to check SSH context: '
                   f'{common_utils.format_exception(e)}')
        return False, failure
    return ready, detail
|
|
1374
|
+
|
|
1375
|
+
|
|
1376
|
+
def get_all_contexts() -> List[str]:
    """Get all available contexts from Kubernetes and SSH clouds.

    Returns:
        List[str]: The de-duplicated context names from both sources, in
        sorted order.
    """
    k8s_names = clouds.Kubernetes.existing_allowed_contexts()
    ssh_names = clouds.SSH.get_ssh_node_pool_contexts()
    # Ensure ssh_contexts are prefixed appropriately if not already
    # For now, assuming get_ssh_node_pool_contexts already returns them
    # in the desired format (e.g., 'ssh-my-cluster')
    unique_names = set(k8s_names + ssh_names)
    return sorted(unique_names)
|