skypilot-nightly 1.0.0.dev20250509__py3-none-any.whl → 1.0.0.dev20251107__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of skypilot-nightly might be problematic. (The original page links to further details here.)
- sky/__init__.py +22 -6
- sky/adaptors/aws.py +25 -7
- sky/adaptors/common.py +24 -1
- sky/adaptors/coreweave.py +278 -0
- sky/adaptors/do.py +8 -2
- sky/adaptors/hyperbolic.py +8 -0
- sky/adaptors/kubernetes.py +149 -18
- sky/adaptors/nebius.py +170 -17
- sky/adaptors/primeintellect.py +1 -0
- sky/adaptors/runpod.py +68 -0
- sky/adaptors/seeweb.py +167 -0
- sky/adaptors/shadeform.py +89 -0
- sky/admin_policy.py +187 -4
- sky/authentication.py +179 -225
- sky/backends/__init__.py +4 -2
- sky/backends/backend.py +22 -9
- sky/backends/backend_utils.py +1299 -380
- sky/backends/cloud_vm_ray_backend.py +1715 -518
- sky/backends/docker_utils.py +1 -1
- sky/backends/local_docker_backend.py +11 -6
- sky/backends/wheel_utils.py +37 -9
- sky/{clouds/service_catalog → catalog}/__init__.py +21 -19
- sky/{clouds/service_catalog → catalog}/aws_catalog.py +27 -8
- sky/{clouds/service_catalog → catalog}/azure_catalog.py +10 -7
- sky/{clouds/service_catalog → catalog}/common.py +89 -48
- sky/{clouds/service_catalog → catalog}/cudo_catalog.py +8 -5
- sky/{clouds/service_catalog → catalog}/data_fetchers/analyze.py +1 -1
- sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_aws.py +30 -40
- sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_cudo.py +38 -38
- sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_gcp.py +42 -15
- sky/catalog/data_fetchers/fetch_hyperbolic.py +136 -0
- sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_lambda_cloud.py +1 -0
- sky/catalog/data_fetchers/fetch_nebius.py +335 -0
- sky/catalog/data_fetchers/fetch_runpod.py +698 -0
- sky/catalog/data_fetchers/fetch_seeweb.py +329 -0
- sky/catalog/data_fetchers/fetch_shadeform.py +142 -0
- sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_vast.py +1 -1
- sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_vsphere.py +1 -1
- sky/{clouds/service_catalog → catalog}/do_catalog.py +5 -2
- sky/{clouds/service_catalog → catalog}/fluidstack_catalog.py +6 -3
- sky/{clouds/service_catalog → catalog}/gcp_catalog.py +41 -15
- sky/catalog/hyperbolic_catalog.py +136 -0
- sky/{clouds/service_catalog → catalog}/ibm_catalog.py +9 -6
- sky/{clouds/service_catalog → catalog}/kubernetes_catalog.py +36 -24
- sky/{clouds/service_catalog → catalog}/lambda_catalog.py +9 -6
- sky/{clouds/service_catalog → catalog}/nebius_catalog.py +9 -7
- sky/{clouds/service_catalog → catalog}/oci_catalog.py +9 -6
- sky/{clouds/service_catalog → catalog}/paperspace_catalog.py +5 -2
- sky/catalog/primeintellect_catalog.py +95 -0
- sky/{clouds/service_catalog → catalog}/runpod_catalog.py +11 -4
- sky/{clouds/service_catalog → catalog}/scp_catalog.py +9 -6
- sky/catalog/seeweb_catalog.py +184 -0
- sky/catalog/shadeform_catalog.py +165 -0
- sky/catalog/ssh_catalog.py +167 -0
- sky/{clouds/service_catalog → catalog}/vast_catalog.py +6 -3
- sky/{clouds/service_catalog → catalog}/vsphere_catalog.py +5 -2
- sky/check.py +491 -203
- sky/cli.py +5 -6005
- sky/client/{cli.py → cli/command.py} +2477 -1885
- sky/client/cli/deprecation_utils.py +99 -0
- sky/client/cli/flags.py +359 -0
- sky/client/cli/table_utils.py +320 -0
- sky/client/common.py +70 -32
- sky/client/oauth.py +82 -0
- sky/client/sdk.py +1203 -297
- sky/client/sdk_async.py +833 -0
- sky/client/service_account_auth.py +47 -0
- sky/cloud_stores.py +73 -0
- sky/clouds/__init__.py +13 -0
- sky/clouds/aws.py +358 -93
- sky/clouds/azure.py +105 -83
- sky/clouds/cloud.py +127 -36
- sky/clouds/cudo.py +68 -50
- sky/clouds/do.py +66 -48
- sky/clouds/fluidstack.py +63 -44
- sky/clouds/gcp.py +339 -110
- sky/clouds/hyperbolic.py +293 -0
- sky/clouds/ibm.py +70 -49
- sky/clouds/kubernetes.py +563 -162
- sky/clouds/lambda_cloud.py +74 -54
- sky/clouds/nebius.py +206 -80
- sky/clouds/oci.py +88 -66
- sky/clouds/paperspace.py +61 -44
- sky/clouds/primeintellect.py +317 -0
- sky/clouds/runpod.py +164 -74
- sky/clouds/scp.py +89 -83
- sky/clouds/seeweb.py +466 -0
- sky/clouds/shadeform.py +400 -0
- sky/clouds/ssh.py +263 -0
- sky/clouds/utils/aws_utils.py +10 -4
- sky/clouds/utils/gcp_utils.py +87 -11
- sky/clouds/utils/oci_utils.py +38 -14
- sky/clouds/utils/scp_utils.py +177 -124
- sky/clouds/vast.py +99 -77
- sky/clouds/vsphere.py +51 -40
- sky/core.py +349 -139
- sky/dag.py +15 -0
- sky/dashboard/out/404.html +1 -1
- sky/dashboard/out/_next/static/chunks/1141-e6aa9ab418717c59.js +11 -0
- sky/dashboard/out/_next/static/chunks/1272-1ef0bf0237faccdb.js +1 -0
- sky/dashboard/out/_next/static/chunks/1871-74503c8e80fd253b.js +6 -0
- sky/dashboard/out/_next/static/chunks/2260-7703229c33c5ebd5.js +1 -0
- sky/dashboard/out/_next/static/chunks/2350.fab69e61bac57b23.js +1 -0
- sky/dashboard/out/_next/static/chunks/2369.fc20f0c2c8ed9fe7.js +15 -0
- sky/dashboard/out/_next/static/chunks/2755.fff53c4a3fcae910.js +26 -0
- sky/dashboard/out/_next/static/chunks/3294.72362fa129305b19.js +1 -0
- sky/dashboard/out/_next/static/chunks/3785.ad6adaa2a0fa9768.js +1 -0
- sky/dashboard/out/_next/static/chunks/3850-ff4a9a69d978632b.js +1 -0
- sky/dashboard/out/_next/static/chunks/3937.210053269f121201.js +1 -0
- sky/dashboard/out/_next/static/chunks/4725.a830b5c9e7867c92.js +1 -0
- sky/dashboard/out/_next/static/chunks/4937.a2baa2df5572a276.js +15 -0
- sky/dashboard/out/_next/static/chunks/5739-d67458fcb1386c92.js +8 -0
- sky/dashboard/out/_next/static/chunks/6130-2be46d70a38f1e82.js +1 -0
- sky/dashboard/out/_next/static/chunks/616-3d59f75e2ccf9321.js +39 -0
- sky/dashboard/out/_next/static/chunks/6212-7bd06f60ba693125.js +13 -0
- sky/dashboard/out/_next/static/chunks/6601-06114c982db410b6.js +1 -0
- sky/dashboard/out/_next/static/chunks/6856-ef8ba11f96d8c4a3.js +1 -0
- sky/dashboard/out/_next/static/chunks/6989-01359c57e018caa4.js +1 -0
- sky/dashboard/out/_next/static/chunks/6990-32b6e2d3822301fa.js +1 -0
- sky/dashboard/out/_next/static/chunks/7359-c8d04e06886000b3.js +30 -0
- sky/dashboard/out/_next/static/chunks/7411-b15471acd2cba716.js +41 -0
- sky/dashboard/out/_next/static/chunks/7615-3301e838e5f25772.js +1 -0
- sky/dashboard/out/_next/static/chunks/8640.5b9475a2d18c5416.js +16 -0
- sky/dashboard/out/_next/static/chunks/8969-1e4613c651bf4051.js +1 -0
- sky/dashboard/out/_next/static/chunks/9025.fa408f3242e9028d.js +6 -0
- sky/dashboard/out/_next/static/chunks/9353-cff34f7e773b2e2b.js +1 -0
- sky/dashboard/out/_next/static/chunks/9360.7310982cf5a0dc79.js +31 -0
- sky/dashboard/out/_next/static/chunks/9847.3aaca6bb33455140.js +30 -0
- sky/dashboard/out/_next/static/chunks/fd9d1056-86323a29a8f7e46a.js +1 -0
- sky/dashboard/out/_next/static/chunks/framework-cf60a09ccd051a10.js +33 -0
- sky/dashboard/out/_next/static/chunks/main-app-587214043926b3cc.js +1 -0
- sky/dashboard/out/_next/static/chunks/main-f15ccb73239a3bf1.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/_app-bde01e4a2beec258.js +34 -0
- sky/dashboard/out/_next/static/chunks/pages/_error-c66a4e8afc46f17b.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-c736ead69c2d86ec.js +16 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-a37d2063af475a1c.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters-d44859594e6f8064.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/config-dfb9bf07b13045f4.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/index-444f1804401f04ea.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/infra/[context]-c0b5935149902e6f.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/infra-aed0ea19df7cf961.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-5796e8d6aea291a0.js +16 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs/pools/[pool]-6edeb7d06032adfc.js +21 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs-479dde13399cf270.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/users-5ab3b907622cf0fe.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/volumes-b84b948ff357c43e.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/workspace/new-3f88a1c7e86a3f86.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/workspaces/[name]-c5a3eeee1c218af1.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/workspaces-22b23febb3e89ce1.js +1 -0
- sky/dashboard/out/_next/static/chunks/webpack-2679be77fc08a2f8.js +1 -0
- sky/dashboard/out/_next/static/css/0748ce22df867032.css +3 -0
- sky/dashboard/out/_next/static/zB0ed6ge_W1MDszVHhijS/_buildManifest.js +1 -0
- sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
- sky/dashboard/out/clusters/[cluster].html +1 -1
- sky/dashboard/out/clusters.html +1 -1
- sky/dashboard/out/config.html +1 -0
- sky/dashboard/out/index.html +1 -1
- sky/dashboard/out/infra/[context].html +1 -0
- sky/dashboard/out/infra.html +1 -0
- sky/dashboard/out/jobs/[job].html +1 -1
- sky/dashboard/out/jobs/pools/[pool].html +1 -0
- sky/dashboard/out/jobs.html +1 -1
- sky/dashboard/out/users.html +1 -0
- sky/dashboard/out/volumes.html +1 -0
- sky/dashboard/out/workspace/new.html +1 -0
- sky/dashboard/out/workspaces/[name].html +1 -0
- sky/dashboard/out/workspaces.html +1 -0
- sky/data/data_utils.py +137 -1
- sky/data/mounting_utils.py +269 -84
- sky/data/storage.py +1451 -1807
- sky/data/storage_utils.py +43 -57
- sky/exceptions.py +132 -2
- sky/execution.py +206 -63
- sky/global_user_state.py +2374 -586
- sky/jobs/__init__.py +5 -0
- sky/jobs/client/sdk.py +242 -65
- sky/jobs/client/sdk_async.py +143 -0
- sky/jobs/constants.py +9 -8
- sky/jobs/controller.py +839 -277
- sky/jobs/file_content_utils.py +80 -0
- sky/jobs/log_gc.py +201 -0
- sky/jobs/recovery_strategy.py +398 -152
- sky/jobs/scheduler.py +315 -189
- sky/jobs/server/core.py +829 -255
- sky/jobs/server/server.py +156 -115
- sky/jobs/server/utils.py +136 -0
- sky/jobs/state.py +2092 -701
- sky/jobs/utils.py +1242 -160
- sky/logs/__init__.py +21 -0
- sky/logs/agent.py +108 -0
- sky/logs/aws.py +243 -0
- sky/logs/gcp.py +91 -0
- sky/metrics/__init__.py +0 -0
- sky/metrics/utils.py +443 -0
- sky/models.py +78 -1
- sky/optimizer.py +164 -70
- sky/provision/__init__.py +90 -4
- sky/provision/aws/config.py +147 -26
- sky/provision/aws/instance.py +135 -50
- sky/provision/azure/instance.py +10 -5
- sky/provision/common.py +13 -1
- sky/provision/cudo/cudo_machine_type.py +1 -1
- sky/provision/cudo/cudo_utils.py +14 -8
- sky/provision/cudo/cudo_wrapper.py +72 -71
- sky/provision/cudo/instance.py +10 -6
- sky/provision/do/instance.py +10 -6
- sky/provision/do/utils.py +4 -3
- sky/provision/docker_utils.py +114 -23
- sky/provision/fluidstack/instance.py +13 -8
- sky/provision/gcp/__init__.py +1 -0
- sky/provision/gcp/config.py +301 -19
- sky/provision/gcp/constants.py +218 -0
- sky/provision/gcp/instance.py +36 -8
- sky/provision/gcp/instance_utils.py +18 -4
- sky/provision/gcp/volume_utils.py +247 -0
- sky/provision/hyperbolic/__init__.py +12 -0
- sky/provision/hyperbolic/config.py +10 -0
- sky/provision/hyperbolic/instance.py +437 -0
- sky/provision/hyperbolic/utils.py +373 -0
- sky/provision/instance_setup.py +93 -14
- sky/provision/kubernetes/__init__.py +5 -0
- sky/provision/kubernetes/config.py +9 -52
- sky/provision/kubernetes/constants.py +17 -0
- sky/provision/kubernetes/instance.py +789 -247
- sky/provision/kubernetes/manifests/fusermount-server-daemonset.yaml +1 -2
- sky/provision/kubernetes/network.py +27 -17
- sky/provision/kubernetes/network_utils.py +40 -43
- sky/provision/kubernetes/utils.py +1192 -531
- sky/provision/kubernetes/volume.py +282 -0
- sky/provision/lambda_cloud/instance.py +22 -16
- sky/provision/nebius/constants.py +50 -0
- sky/provision/nebius/instance.py +19 -6
- sky/provision/nebius/utils.py +196 -91
- sky/provision/oci/instance.py +10 -5
- sky/provision/paperspace/instance.py +10 -7
- sky/provision/paperspace/utils.py +1 -1
- sky/provision/primeintellect/__init__.py +10 -0
- sky/provision/primeintellect/config.py +11 -0
- sky/provision/primeintellect/instance.py +454 -0
- sky/provision/primeintellect/utils.py +398 -0
- sky/provision/provisioner.py +110 -36
- sky/provision/runpod/__init__.py +5 -0
- sky/provision/runpod/instance.py +27 -6
- sky/provision/runpod/utils.py +51 -18
- sky/provision/runpod/volume.py +180 -0
- sky/provision/scp/__init__.py +15 -0
- sky/provision/scp/config.py +93 -0
- sky/provision/scp/instance.py +531 -0
- sky/provision/seeweb/__init__.py +11 -0
- sky/provision/seeweb/config.py +13 -0
- sky/provision/seeweb/instance.py +807 -0
- sky/provision/shadeform/__init__.py +11 -0
- sky/provision/shadeform/config.py +12 -0
- sky/provision/shadeform/instance.py +351 -0
- sky/provision/shadeform/shadeform_utils.py +83 -0
- sky/provision/ssh/__init__.py +18 -0
- sky/provision/vast/instance.py +13 -8
- sky/provision/vast/utils.py +10 -7
- sky/provision/vsphere/common/vim_utils.py +1 -2
- sky/provision/vsphere/instance.py +15 -10
- sky/provision/vsphere/vsphere_utils.py +9 -19
- sky/py.typed +0 -0
- sky/resources.py +844 -118
- sky/schemas/__init__.py +0 -0
- sky/schemas/api/__init__.py +0 -0
- sky/schemas/api/responses.py +225 -0
- sky/schemas/db/README +4 -0
- sky/schemas/db/env.py +90 -0
- sky/schemas/db/global_user_state/001_initial_schema.py +124 -0
- sky/schemas/db/global_user_state/002_add_workspace_to_cluster_history.py +35 -0
- sky/schemas/db/global_user_state/003_fix_initial_revision.py +61 -0
- sky/schemas/db/global_user_state/004_is_managed.py +34 -0
- sky/schemas/db/global_user_state/005_cluster_event.py +32 -0
- sky/schemas/db/global_user_state/006_provision_log.py +41 -0
- sky/schemas/db/global_user_state/007_cluster_event_request_id.py +34 -0
- sky/schemas/db/global_user_state/008_skylet_ssh_tunnel_metadata.py +34 -0
- sky/schemas/db/global_user_state/009_last_activity_and_launched_at.py +89 -0
- sky/schemas/db/global_user_state/010_save_ssh_key.py +66 -0
- sky/schemas/db/script.py.mako +28 -0
- sky/schemas/db/serve_state/001_initial_schema.py +67 -0
- sky/schemas/db/skypilot_config/001_initial_schema.py +30 -0
- sky/schemas/db/spot_jobs/001_initial_schema.py +97 -0
- sky/schemas/db/spot_jobs/002_cluster_pool.py +42 -0
- sky/schemas/db/spot_jobs/003_pool_hash.py +34 -0
- sky/schemas/db/spot_jobs/004_job_file_contents.py +42 -0
- sky/schemas/db/spot_jobs/005_logs_gc.py +38 -0
- sky/schemas/generated/__init__.py +0 -0
- sky/schemas/generated/autostopv1_pb2.py +36 -0
- sky/schemas/generated/autostopv1_pb2.pyi +43 -0
- sky/schemas/generated/autostopv1_pb2_grpc.py +146 -0
- sky/schemas/generated/jobsv1_pb2.py +86 -0
- sky/schemas/generated/jobsv1_pb2.pyi +254 -0
- sky/schemas/generated/jobsv1_pb2_grpc.py +542 -0
- sky/schemas/generated/managed_jobsv1_pb2.py +74 -0
- sky/schemas/generated/managed_jobsv1_pb2.pyi +278 -0
- sky/schemas/generated/managed_jobsv1_pb2_grpc.py +278 -0
- sky/schemas/generated/servev1_pb2.py +58 -0
- sky/schemas/generated/servev1_pb2.pyi +115 -0
- sky/schemas/generated/servev1_pb2_grpc.py +322 -0
- sky/serve/autoscalers.py +357 -5
- sky/serve/client/impl.py +310 -0
- sky/serve/client/sdk.py +47 -139
- sky/serve/client/sdk_async.py +130 -0
- sky/serve/constants.py +10 -8
- sky/serve/controller.py +64 -19
- sky/serve/load_balancer.py +106 -60
- sky/serve/load_balancing_policies.py +115 -1
- sky/serve/replica_managers.py +273 -162
- sky/serve/serve_rpc_utils.py +179 -0
- sky/serve/serve_state.py +554 -251
- sky/serve/serve_utils.py +733 -220
- sky/serve/server/core.py +66 -711
- sky/serve/server/impl.py +1093 -0
- sky/serve/server/server.py +21 -18
- sky/serve/service.py +133 -48
- sky/serve/service_spec.py +135 -16
- sky/serve/spot_placer.py +3 -0
- sky/server/auth/__init__.py +0 -0
- sky/server/auth/authn.py +50 -0
- sky/server/auth/loopback.py +38 -0
- sky/server/auth/oauth2_proxy.py +200 -0
- sky/server/common.py +475 -181
- sky/server/config.py +81 -23
- sky/server/constants.py +44 -6
- sky/server/daemons.py +229 -0
- sky/server/html/token_page.html +185 -0
- sky/server/metrics.py +160 -0
- sky/server/requests/executor.py +528 -138
- sky/server/requests/payloads.py +351 -17
- sky/server/requests/preconditions.py +21 -17
- sky/server/requests/process.py +112 -29
- sky/server/requests/request_names.py +120 -0
- sky/server/requests/requests.py +817 -224
- sky/server/requests/serializers/decoders.py +82 -31
- sky/server/requests/serializers/encoders.py +140 -22
- sky/server/requests/threads.py +106 -0
- sky/server/rest.py +417 -0
- sky/server/server.py +1290 -284
- sky/server/state.py +20 -0
- sky/server/stream_utils.py +345 -57
- sky/server/uvicorn.py +217 -3
- sky/server/versions.py +270 -0
- sky/setup_files/MANIFEST.in +5 -0
- sky/setup_files/alembic.ini +156 -0
- sky/setup_files/dependencies.py +136 -31
- sky/setup_files/setup.py +44 -42
- sky/sky_logging.py +102 -5
- sky/skylet/attempt_skylet.py +1 -0
- sky/skylet/autostop_lib.py +129 -8
- sky/skylet/configs.py +27 -20
- sky/skylet/constants.py +171 -19
- sky/skylet/events.py +105 -21
- sky/skylet/job_lib.py +335 -104
- sky/skylet/log_lib.py +297 -18
- sky/skylet/log_lib.pyi +44 -1
- sky/skylet/ray_patches/__init__.py +17 -3
- sky/skylet/ray_patches/autoscaler.py.diff +18 -0
- sky/skylet/ray_patches/cli.py.diff +19 -0
- sky/skylet/ray_patches/command_runner.py.diff +17 -0
- sky/skylet/ray_patches/log_monitor.py.diff +20 -0
- sky/skylet/ray_patches/resource_demand_scheduler.py.diff +32 -0
- sky/skylet/ray_patches/updater.py.diff +18 -0
- sky/skylet/ray_patches/worker.py.diff +41 -0
- sky/skylet/services.py +564 -0
- sky/skylet/skylet.py +63 -4
- sky/skylet/subprocess_daemon.py +103 -29
- sky/skypilot_config.py +506 -99
- sky/ssh_node_pools/__init__.py +1 -0
- sky/ssh_node_pools/core.py +135 -0
- sky/ssh_node_pools/server.py +233 -0
- sky/task.py +621 -137
- sky/templates/aws-ray.yml.j2 +10 -3
- sky/templates/azure-ray.yml.j2 +1 -1
- sky/templates/do-ray.yml.j2 +1 -1
- sky/templates/gcp-ray.yml.j2 +57 -0
- sky/templates/hyperbolic-ray.yml.j2 +67 -0
- sky/templates/jobs-controller.yaml.j2 +27 -24
- sky/templates/kubernetes-loadbalancer.yml.j2 +2 -0
- sky/templates/kubernetes-ray.yml.j2 +607 -51
- sky/templates/lambda-ray.yml.j2 +1 -1
- sky/templates/nebius-ray.yml.j2 +33 -12
- sky/templates/paperspace-ray.yml.j2 +1 -1
- sky/templates/primeintellect-ray.yml.j2 +71 -0
- sky/templates/runpod-ray.yml.j2 +9 -1
- sky/templates/scp-ray.yml.j2 +3 -50
- sky/templates/seeweb-ray.yml.j2 +108 -0
- sky/templates/shadeform-ray.yml.j2 +72 -0
- sky/templates/sky-serve-controller.yaml.j2 +22 -2
- sky/templates/websocket_proxy.py +178 -18
- sky/usage/usage_lib.py +18 -11
- sky/users/__init__.py +0 -0
- sky/users/model.conf +15 -0
- sky/users/permission.py +387 -0
- sky/users/rbac.py +121 -0
- sky/users/server.py +720 -0
- sky/users/token_service.py +218 -0
- sky/utils/accelerator_registry.py +34 -5
- sky/utils/admin_policy_utils.py +84 -38
- sky/utils/annotations.py +16 -5
- sky/utils/asyncio_utils.py +78 -0
- sky/utils/auth_utils.py +153 -0
- sky/utils/benchmark_utils.py +60 -0
- sky/utils/cli_utils/status_utils.py +159 -86
- sky/utils/cluster_utils.py +31 -9
- sky/utils/command_runner.py +354 -68
- sky/utils/command_runner.pyi +93 -3
- sky/utils/common.py +35 -8
- sky/utils/common_utils.py +310 -87
- sky/utils/config_utils.py +87 -5
- sky/utils/context.py +402 -0
- sky/utils/context_utils.py +222 -0
- sky/utils/controller_utils.py +264 -89
- sky/utils/dag_utils.py +31 -12
- sky/utils/db/__init__.py +0 -0
- sky/utils/db/db_utils.py +470 -0
- sky/utils/db/migration_utils.py +133 -0
- sky/utils/directory_utils.py +12 -0
- sky/utils/env_options.py +13 -0
- sky/utils/git.py +567 -0
- sky/utils/git_clone.sh +460 -0
- sky/utils/infra_utils.py +195 -0
- sky/utils/kubernetes/cleanup-tunnel.sh +62 -0
- sky/utils/kubernetes/config_map_utils.py +133 -0
- sky/utils/kubernetes/create_cluster.sh +13 -27
- sky/utils/kubernetes/delete_cluster.sh +10 -7
- sky/utils/kubernetes/deploy_remote_cluster.py +1299 -0
- sky/utils/kubernetes/exec_kubeconfig_converter.py +22 -31
- sky/utils/kubernetes/generate_kind_config.py +6 -66
- sky/utils/kubernetes/generate_kubeconfig.sh +4 -1
- sky/utils/kubernetes/gpu_labeler.py +5 -5
- sky/utils/kubernetes/kubernetes_deploy_utils.py +354 -47
- sky/utils/kubernetes/ssh-tunnel.sh +379 -0
- sky/utils/kubernetes/ssh_utils.py +221 -0
- sky/utils/kubernetes_enums.py +8 -15
- sky/utils/lock_events.py +94 -0
- sky/utils/locks.py +368 -0
- sky/utils/log_utils.py +300 -6
- sky/utils/perf_utils.py +22 -0
- sky/utils/resource_checker.py +298 -0
- sky/utils/resources_utils.py +249 -32
- sky/utils/rich_utils.py +213 -37
- sky/utils/schemas.py +905 -147
- sky/utils/serialize_utils.py +16 -0
- sky/utils/status_lib.py +10 -0
- sky/utils/subprocess_utils.py +38 -15
- sky/utils/tempstore.py +70 -0
- sky/utils/timeline.py +24 -52
- sky/utils/ux_utils.py +84 -15
- sky/utils/validator.py +11 -1
- sky/utils/volume.py +86 -0
- sky/utils/yaml_utils.py +111 -0
- sky/volumes/__init__.py +13 -0
- sky/volumes/client/__init__.py +0 -0
- sky/volumes/client/sdk.py +149 -0
- sky/volumes/server/__init__.py +0 -0
- sky/volumes/server/core.py +258 -0
- sky/volumes/server/server.py +122 -0
- sky/volumes/volume.py +212 -0
- sky/workspaces/__init__.py +0 -0
- sky/workspaces/core.py +655 -0
- sky/workspaces/server.py +101 -0
- sky/workspaces/utils.py +56 -0
- skypilot_nightly-1.0.0.dev20251107.dist-info/METADATA +675 -0
- skypilot_nightly-1.0.0.dev20251107.dist-info/RECORD +594 -0
- {skypilot_nightly-1.0.0.dev20250509.dist-info → skypilot_nightly-1.0.0.dev20251107.dist-info}/WHEEL +1 -1
- sky/benchmark/benchmark_state.py +0 -256
- sky/benchmark/benchmark_utils.py +0 -641
- sky/clouds/service_catalog/constants.py +0 -7
- sky/dashboard/out/_next/static/LksQgChY5izXjokL3LcEu/_buildManifest.js +0 -1
- sky/dashboard/out/_next/static/chunks/236-f49500b82ad5392d.js +0 -6
- sky/dashboard/out/_next/static/chunks/312-c3c8845990db8ffc.js +0 -15
- sky/dashboard/out/_next/static/chunks/37-0a572fe0dbb89c4d.js +0 -6
- sky/dashboard/out/_next/static/chunks/678-206dddca808e6d16.js +0 -59
- sky/dashboard/out/_next/static/chunks/845-0f8017370869e269.js +0 -1
- sky/dashboard/out/_next/static/chunks/979-7bf73a4c7cea0f5c.js +0 -1
- sky/dashboard/out/_next/static/chunks/fd9d1056-2821b0f0cabcd8bd.js +0 -1
- sky/dashboard/out/_next/static/chunks/framework-87d061ee6ed71b28.js +0 -33
- sky/dashboard/out/_next/static/chunks/main-app-241eb28595532291.js +0 -1
- sky/dashboard/out/_next/static/chunks/main-e0e2335212e72357.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/_app-e6b013bc3f77ad60.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/_error-1be831200e60c5c0.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-e15db85d0ea1fbe1.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-f383db7389368ea7.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/clusters-a93b93e10b8b074e.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/index-f9f039532ca8cbc4.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-03f279c6741fb48b.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/jobs-a75029b67aab6a2e.js +0 -1
- sky/dashboard/out/_next/static/chunks/webpack-830f59b8404e96b8.js +0 -1
- sky/dashboard/out/_next/static/css/c6933bbb2ce7f4dd.css +0 -3
- sky/jobs/dashboard/dashboard.py +0 -223
- sky/jobs/dashboard/static/favicon.ico +0 -0
- sky/jobs/dashboard/templates/index.html +0 -831
- sky/jobs/server/dashboard_utils.py +0 -69
- sky/skylet/providers/scp/__init__.py +0 -2
- sky/skylet/providers/scp/config.py +0 -149
- sky/skylet/providers/scp/node_provider.py +0 -578
- sky/templates/kubernetes-ssh-jump.yml.j2 +0 -94
- sky/utils/db_utils.py +0 -100
- sky/utils/kubernetes/deploy_remote_cluster.sh +0 -308
- sky/utils/kubernetes/ssh_jump_lifecycle_manager.py +0 -191
- skypilot_nightly-1.0.0.dev20250509.dist-info/METADATA +0 -361
- skypilot_nightly-1.0.0.dev20250509.dist-info/RECORD +0 -396
- /sky/{clouds/service_catalog → catalog}/config.py +0 -0
- /sky/{benchmark → catalog/data_fetchers}/__init__.py +0 -0
- /sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_azure.py +0 -0
- /sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_fluidstack.py +0 -0
- /sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_ibm.py +0 -0
- /sky/{clouds/service_catalog/data_fetchers → client/cli}/__init__.py +0 -0
- /sky/dashboard/out/_next/static/{LksQgChY5izXjokL3LcEu → zB0ed6ge_W1MDszVHhijS}/_ssgManifest.js +0 -0
- {skypilot_nightly-1.0.0.dev20250509.dist-info → skypilot_nightly-1.0.0.dev20251107.dist-info}/entry_points.txt +0 -0
- {skypilot_nightly-1.0.0.dev20250509.dist-info → skypilot_nightly-1.0.0.dev20251107.dist-info}/licenses/LICENSE +0 -0
- {skypilot_nightly-1.0.0.dev20250509.dist-info → skypilot_nightly-1.0.0.dev20251107.dist-info}/top_level.txt +0 -0
sky/resources.py
CHANGED
|
@@ -1,38 +1,120 @@
|
|
|
1
1
|
"""Resources: compute requirements of Tasks."""
|
|
2
|
+
import collections
|
|
2
3
|
import dataclasses
|
|
4
|
+
import re
|
|
3
5
|
import textwrap
|
|
4
|
-
|
|
6
|
+
import typing
|
|
7
|
+
from typing import Any, Dict, List, Literal, Optional, Set, Tuple, Union
|
|
5
8
|
|
|
6
9
|
import colorama
|
|
7
10
|
|
|
11
|
+
from sky import catalog
|
|
8
12
|
from sky import check as sky_check
|
|
9
13
|
from sky import clouds
|
|
10
14
|
from sky import exceptions
|
|
11
15
|
from sky import sky_logging
|
|
12
16
|
from sky import skypilot_config
|
|
13
17
|
from sky.clouds import cloud as sky_cloud
|
|
14
|
-
from sky.clouds import service_catalog
|
|
15
18
|
from sky.provision import docker_utils
|
|
19
|
+
from sky.provision.gcp import constants as gcp_constants
|
|
16
20
|
from sky.provision.kubernetes import utils as kubernetes_utils
|
|
21
|
+
from sky.provision.nebius import constants as nebius_constants
|
|
22
|
+
from sky.skylet import autostop_lib
|
|
17
23
|
from sky.skylet import constants
|
|
18
24
|
from sky.utils import accelerator_registry
|
|
19
25
|
from sky.utils import annotations
|
|
20
26
|
from sky.utils import common_utils
|
|
21
27
|
from sky.utils import config_utils
|
|
28
|
+
from sky.utils import infra_utils
|
|
22
29
|
from sky.utils import log_utils
|
|
23
30
|
from sky.utils import registry
|
|
24
31
|
from sky.utils import resources_utils
|
|
25
32
|
from sky.utils import schemas
|
|
26
33
|
from sky.utils import ux_utils
|
|
27
34
|
|
|
35
|
+
if typing.TYPE_CHECKING:
|
|
36
|
+
from sky.utils import volume as volume_lib
|
|
37
|
+
|
|
28
38
|
logger = sky_logging.init_logger(__name__)
|
|
29
39
|
|
|
30
|
-
|
|
40
|
+
DEFAULT_DISK_SIZE_GB = 256
|
|
31
41
|
|
|
32
42
|
RESOURCE_CONFIG_ALIASES = {
|
|
33
43
|
'gpus': 'accelerators',
|
|
34
44
|
}
|
|
35
45
|
|
|
46
|
+
MEMORY_SIZE_UNITS = {
|
|
47
|
+
'b': 1,
|
|
48
|
+
'k': 2**10,
|
|
49
|
+
'kb': 2**10,
|
|
50
|
+
'm': 2**20,
|
|
51
|
+
'mb': 2**20,
|
|
52
|
+
'g': 2**30,
|
|
53
|
+
'gb': 2**30,
|
|
54
|
+
't': 2**40,
|
|
55
|
+
'tb': 2**40,
|
|
56
|
+
'p': 2**50,
|
|
57
|
+
'pb': 2**50,
|
|
58
|
+
}
|
|
59
|
+
|
|
60
|
+
|
|
61
|
+
@dataclasses.dataclass
|
|
62
|
+
class AutostopConfig:
|
|
63
|
+
"""Configuration for autostop."""
|
|
64
|
+
# enabled isn't present in the yaml config, but it's needed for this class
|
|
65
|
+
# to be complete.
|
|
66
|
+
enabled: bool
|
|
67
|
+
# If enabled is False, these values are ignored.
|
|
68
|
+
# Keep the default value to 0 to make the behavior consistent with the CLI
|
|
69
|
+
# flags.
|
|
70
|
+
idle_minutes: int = 0
|
|
71
|
+
down: bool = False
|
|
72
|
+
wait_for: Optional[autostop_lib.AutostopWaitFor] = None
|
|
73
|
+
|
|
74
|
+
def to_yaml_config(self) -> Union[Literal[False], Dict[str, Any]]:
|
|
75
|
+
if not self.enabled:
|
|
76
|
+
return False
|
|
77
|
+
config: Dict[str, Any] = {
|
|
78
|
+
'idle_minutes': self.idle_minutes,
|
|
79
|
+
'down': self.down,
|
|
80
|
+
}
|
|
81
|
+
if self.wait_for is not None:
|
|
82
|
+
config['wait_for'] = self.wait_for.value
|
|
83
|
+
return config
|
|
84
|
+
|
|
85
|
+
@classmethod
|
|
86
|
+
def from_yaml_config(
|
|
87
|
+
cls, config: Union[bool, int, str, Dict[str, Any], None]
|
|
88
|
+
) -> Optional['AutostopConfig']:
|
|
89
|
+
if isinstance(config, bool):
|
|
90
|
+
if config:
|
|
91
|
+
return cls(enabled=True)
|
|
92
|
+
else:
|
|
93
|
+
return cls(enabled=False)
|
|
94
|
+
|
|
95
|
+
if isinstance(config, int):
|
|
96
|
+
return cls(idle_minutes=config, down=False, enabled=True)
|
|
97
|
+
|
|
98
|
+
if isinstance(config, str):
|
|
99
|
+
return cls(idle_minutes=resources_utils.parse_time_minutes(config),
|
|
100
|
+
down=False,
|
|
101
|
+
enabled=True)
|
|
102
|
+
|
|
103
|
+
if isinstance(config, dict):
|
|
104
|
+
# If we have a dict, autostop is enabled. (Only way to disable is
|
|
105
|
+
# with `false`, a bool.)
|
|
106
|
+
autostop_config = cls(enabled=True)
|
|
107
|
+
if 'idle_minutes' in config:
|
|
108
|
+
autostop_config.idle_minutes = config['idle_minutes']
|
|
109
|
+
if 'down' in config:
|
|
110
|
+
autostop_config.down = config['down']
|
|
111
|
+
if 'wait_for' in config:
|
|
112
|
+
autostop_config.wait_for = (
|
|
113
|
+
autostop_lib.AutostopWaitFor.from_str(config['wait_for']))
|
|
114
|
+
return autostop_config
|
|
115
|
+
|
|
116
|
+
return None
|
|
117
|
+
|
|
36
118
|
|
|
37
119
|
class Resources:
|
|
38
120
|
"""Resources: compute requirements of Tasks.
|
|
@@ -51,7 +133,7 @@ class Resources:
|
|
|
51
133
|
"""
|
|
52
134
|
# If any fields changed, increment the version. For backward compatibility,
|
|
53
135
|
# modify the __setstate__ method to handle the old version.
|
|
54
|
-
_VERSION =
|
|
136
|
+
_VERSION = 28
|
|
55
137
|
|
|
56
138
|
def __init__(
|
|
57
139
|
self,
|
|
@@ -59,17 +141,23 @@ class Resources:
|
|
|
59
141
|
instance_type: Optional[str] = None,
|
|
60
142
|
cpus: Union[None, int, float, str] = None,
|
|
61
143
|
memory: Union[None, int, float, str] = None,
|
|
62
|
-
accelerators: Union[None, str, Dict[str, int]] = None,
|
|
144
|
+
accelerators: Union[None, str, Dict[str, Union[int, float]]] = None,
|
|
63
145
|
accelerator_args: Optional[Dict[str, str]] = None,
|
|
146
|
+
infra: Optional[str] = None,
|
|
64
147
|
use_spot: Optional[bool] = None,
|
|
65
|
-
job_recovery: Optional[Union[Dict[str, Union[str, int]], str]] = None,
|
|
148
|
+
job_recovery: Optional[Union[Dict[str, Optional[Union[str, int]]],
|
|
149
|
+
str]] = None,
|
|
66
150
|
region: Optional[str] = None,
|
|
67
151
|
zone: Optional[str] = None,
|
|
68
|
-
image_id: Union[Dict[str, str], str, None] = None,
|
|
69
|
-
disk_size: Optional[int] = None,
|
|
152
|
+
image_id: Union[Dict[Optional[str], str], str, None] = None,
|
|
153
|
+
disk_size: Optional[Union[str, int]] = None,
|
|
70
154
|
disk_tier: Optional[Union[str, resources_utils.DiskTier]] = None,
|
|
155
|
+
network_tier: Optional[Union[str, resources_utils.NetworkTier]] = None,
|
|
71
156
|
ports: Optional[Union[int, str, List[str], Tuple[str]]] = None,
|
|
72
157
|
labels: Optional[Dict[str, str]] = None,
|
|
158
|
+
autostop: Union[bool, int, str, Dict[str, Any], None] = None,
|
|
159
|
+
priority: Optional[int] = None,
|
|
160
|
+
volumes: Optional[List[Dict[str, Any]]] = None,
|
|
73
161
|
# Internal use only.
|
|
74
162
|
# pylint: disable=invalid-name
|
|
75
163
|
_docker_login_config: Optional[docker_utils.DockerLoginConfig] = None,
|
|
@@ -77,6 +165,7 @@ class Resources:
|
|
|
77
165
|
_is_image_managed: Optional[bool] = None,
|
|
78
166
|
_requires_fuse: Optional[bool] = None,
|
|
79
167
|
_cluster_config_overrides: Optional[Dict[str, Any]] = None,
|
|
168
|
+
_no_missing_accel_warnings: Optional[bool] = None,
|
|
80
169
|
):
|
|
81
170
|
"""Initialize a Resources object.
|
|
82
171
|
|
|
@@ -87,9 +176,9 @@ class Resources:
|
|
|
87
176
|
.. code-block:: python
|
|
88
177
|
|
|
89
178
|
# Fully specified cloud and instance type (is_launchable() is True).
|
|
90
|
-
sky.Resources(
|
|
91
|
-
sky.Resources(
|
|
92
|
-
sky.Resources(
|
|
179
|
+
sky.Resources(infra='aws', instance_type='p3.2xlarge')
|
|
180
|
+
sky.Resources(infra='k8s/my-cluster-ctx', accelerators='V100')
|
|
181
|
+
sky.Resources(infra='gcp/us-central1', accelerators='V100')
|
|
93
182
|
|
|
94
183
|
# Specifying required resources; the system decides the
|
|
95
184
|
# cloud/instance type. The below are equivalent:
|
|
@@ -98,8 +187,9 @@ class Resources:
|
|
|
98
187
|
sky.Resources(accelerators={'V100': 1})
|
|
99
188
|
sky.Resources(cpus='2+', memory='16+', accelerators='V100')
|
|
100
189
|
|
|
190
|
+
|
|
101
191
|
Args:
|
|
102
|
-
cloud: the cloud to use.
|
|
192
|
+
cloud: the cloud to use. Deprecated. Use `infra` instead.
|
|
103
193
|
instance_type: the instance type to use.
|
|
104
194
|
cpus: the number of CPUs required for the task.
|
|
105
195
|
If a str, must be a string of the form ``'2'`` or ``'2+'``, where
|
|
@@ -113,6 +203,11 @@ class Resources:
|
|
|
113
203
|
dict of the form ``{'V100': 2}`` or ``{'tpu-v2-8': 1}``.
|
|
114
204
|
accelerator_args: accelerator-specific arguments. For example,
|
|
115
205
|
``{'tpu_vm': True, 'runtime_version': 'tpu-vm-base'}`` for TPUs.
|
|
206
|
+
infra: a string specifying the infrastructure to use, in the format
|
|
207
|
+
of "cloud/region" or "cloud/region/zone". For example,
|
|
208
|
+
`aws/us-east-1` or `k8s/my-cluster-ctx`. This is an alternative to
|
|
209
|
+
specifying cloud, region, and zone separately. If provided, it
|
|
210
|
+
takes precedence over cloud, region, and zone parameters.
|
|
116
211
|
use_spot: whether to use spot instances. If None, defaults to
|
|
117
212
|
False.
|
|
118
213
|
job_recovery: the job recovery strategy to use for the managed
|
|
@@ -125,8 +220,8 @@ class Resources:
|
|
|
125
220
|
- max_restarts_on_errors: the max number of restarts on user code
|
|
126
221
|
errors.
|
|
127
222
|
|
|
128
|
-
region: the region to use.
|
|
129
|
-
zone: the zone to use.
|
|
223
|
+
region: the region to use. Deprecated. Use `infra` instead.
|
|
224
|
+
zone: the zone to use. Deprecated. Use `infra` instead.
|
|
130
225
|
image_id: the image ID to use. If a str, must be a string
|
|
131
226
|
of the image id from the cloud, such as AWS:
|
|
132
227
|
``'ami-1234567890abcdef0'``, GCP:
|
|
@@ -145,6 +240,8 @@ class Resources:
|
|
|
145
240
|
disk_size: the size of the OS disk in GiB.
|
|
146
241
|
disk_tier: the disk performance tier to use. If None, defaults to
|
|
147
242
|
``'medium'``.
|
|
243
|
+
network_tier: the network performance tier to use. If None, defaults to
|
|
244
|
+
``'standard'``.
|
|
148
245
|
ports: the ports to open on the instance.
|
|
149
246
|
labels: the labels to apply to the instance. These are useful for
|
|
150
247
|
assigning metadata that may be used by external tools.
|
|
@@ -152,6 +249,12 @@ class Resources:
|
|
|
152
249
|
instance tags. On GCP, labels map to instance labels. On
|
|
153
250
|
Kubernetes, labels map to pod labels. On other clouds, labels are
|
|
154
251
|
not supported and will be ignored.
|
|
252
|
+
autostop: the autostop configuration to use. For launched resources,
|
|
253
|
+
may or may not correspond to the actual current autostop config.
|
|
254
|
+
priority: the priority for this resource configuration. Must be an
|
|
255
|
+
integer from -1000 to 1000, where higher values indicate higher priority.
|
|
256
|
+
If None, no priority is set.
|
|
257
|
+
volumes: the volumes to mount on the instance.
|
|
155
258
|
_docker_login_config: the docker configuration to use. This includes
|
|
156
259
|
the docker username, password, and registry server. If None, skip
|
|
157
260
|
docker login.
|
|
@@ -169,6 +272,25 @@ class Resources:
|
|
|
169
272
|
exceptions.NoCloudAccessError: if no public cloud is enabled.
|
|
170
273
|
"""
|
|
171
274
|
self._version = self._VERSION
|
|
275
|
+
|
|
276
|
+
if infra is not None and (cloud is not None or region is not None or
|
|
277
|
+
zone is not None):
|
|
278
|
+
with ux_utils.print_exception_no_traceback():
|
|
279
|
+
raise ValueError('Cannot specify both `infra` and `cloud`, '
|
|
280
|
+
'`region`, or `zone` parameters. '
|
|
281
|
+
f'Got: infra={infra}, cloud={cloud}, '
|
|
282
|
+
f'region={region}, zone={zone}')
|
|
283
|
+
|
|
284
|
+
# Infra is user facing, and cloud, region, zone in parameters are for
|
|
285
|
+
# backward compatibility. Internally, we keep using cloud, region, zone
|
|
286
|
+
# for simplicity.
|
|
287
|
+
if infra is not None:
|
|
288
|
+
infra_info = infra_utils.InfraInfo.from_str(infra)
|
|
289
|
+
# Infra takes precedence over individually specified parameters
|
|
290
|
+
cloud = registry.CLOUD_REGISTRY.from_str(infra_info.cloud)
|
|
291
|
+
region = infra_info.region
|
|
292
|
+
zone = infra_info.zone
|
|
293
|
+
|
|
172
294
|
self._cloud = cloud
|
|
173
295
|
self._region: Optional[str] = region
|
|
174
296
|
self._zone: Optional[str] = zone
|
|
@@ -177,7 +299,8 @@ class Resources:
|
|
|
177
299
|
|
|
178
300
|
self._use_spot_specified = use_spot is not None
|
|
179
301
|
self._use_spot = use_spot if use_spot is not None else False
|
|
180
|
-
self._job_recovery: Optional[Dict[str, Union[str,
|
|
302
|
+
self._job_recovery: Optional[Dict[str, Optional[Union[str,
|
|
303
|
+
int]]]] = None
|
|
181
304
|
if job_recovery is not None:
|
|
182
305
|
if isinstance(job_recovery, str):
|
|
183
306
|
job_recovery = {'strategy': job_recovery}
|
|
@@ -188,20 +311,17 @@ class Resources:
|
|
|
188
311
|
if strategy_name == 'none':
|
|
189
312
|
self._job_recovery = None
|
|
190
313
|
else:
|
|
191
|
-
if strategy_name
|
|
314
|
+
if isinstance(strategy_name, str):
|
|
192
315
|
job_recovery['strategy'] = strategy_name.upper()
|
|
193
316
|
self._job_recovery = job_recovery
|
|
194
317
|
|
|
195
318
|
if disk_size is not None:
|
|
196
|
-
|
|
197
|
-
|
|
198
|
-
raise ValueError(
|
|
199
|
-
f'OS disk size must be an integer. Got: {disk_size}.')
|
|
200
|
-
self._disk_size = int(disk_size)
|
|
319
|
+
self._disk_size = int(
|
|
320
|
+
resources_utils.parse_memory_resource(disk_size, 'disk_size'))
|
|
201
321
|
else:
|
|
202
|
-
self._disk_size =
|
|
322
|
+
self._disk_size = DEFAULT_DISK_SIZE_GB
|
|
203
323
|
|
|
204
|
-
self._image_id =
|
|
324
|
+
self._image_id: Optional[Dict[Optional[str], str]] = None
|
|
205
325
|
if isinstance(image_id, str):
|
|
206
326
|
self._image_id = {self._region: image_id.strip()}
|
|
207
327
|
elif isinstance(image_id, dict):
|
|
@@ -209,8 +329,13 @@ class Resources:
|
|
|
209
329
|
self._image_id = {self._region: image_id[None].strip()}
|
|
210
330
|
else:
|
|
211
331
|
self._image_id = {
|
|
212
|
-
k.strip(): v.strip()
|
|
332
|
+
typing.cast(str, k).strip(): v.strip()
|
|
333
|
+
for k, v in image_id.items()
|
|
213
334
|
}
|
|
335
|
+
else:
|
|
336
|
+
self._image_id = image_id
|
|
337
|
+
if isinstance(self._cloud, clouds.Kubernetes):
|
|
338
|
+
_maybe_add_docker_prefix_to_image_id(self._image_id)
|
|
214
339
|
self._is_image_managed = _is_image_managed
|
|
215
340
|
|
|
216
341
|
if isinstance(disk_tier, str):
|
|
@@ -224,11 +349,25 @@ class Resources:
|
|
|
224
349
|
disk_tier = resources_utils.DiskTier(disk_tier_str)
|
|
225
350
|
self._disk_tier = disk_tier
|
|
226
351
|
|
|
352
|
+
if isinstance(network_tier, str):
|
|
353
|
+
network_tier_str = str(network_tier).lower()
|
|
354
|
+
supported_tiers = [
|
|
355
|
+
tier.value for tier in resources_utils.NetworkTier
|
|
356
|
+
]
|
|
357
|
+
if network_tier_str not in supported_tiers:
|
|
358
|
+
with ux_utils.print_exception_no_traceback():
|
|
359
|
+
raise ValueError(
|
|
360
|
+
f'Invalid network_tier {network_tier_str!r}. '
|
|
361
|
+
f'Network tier must be one of '
|
|
362
|
+
f'{", ".join(supported_tiers)}.')
|
|
363
|
+
network_tier = resources_utils.NetworkTier(network_tier_str)
|
|
364
|
+
self._network_tier = network_tier
|
|
365
|
+
|
|
227
366
|
if ports is not None:
|
|
228
367
|
if isinstance(ports, tuple):
|
|
229
368
|
ports = list(ports)
|
|
230
369
|
if not isinstance(ports, list):
|
|
231
|
-
ports = [ports]
|
|
370
|
+
ports = [str(ports)]
|
|
232
371
|
ports = resources_utils.simplify_ports(
|
|
233
372
|
[str(port) for port in ports])
|
|
234
373
|
if not ports:
|
|
@@ -250,11 +389,18 @@ class Resources:
|
|
|
250
389
|
self._requires_fuse = _requires_fuse
|
|
251
390
|
|
|
252
391
|
self._cluster_config_overrides = _cluster_config_overrides
|
|
253
|
-
self._cached_repr = None
|
|
392
|
+
self._cached_repr: Optional[str] = None
|
|
393
|
+
self._no_missing_accel_warnings = _no_missing_accel_warnings
|
|
394
|
+
|
|
395
|
+
# Initialize _priority before calling the setter
|
|
396
|
+
self._priority: Optional[int] = None
|
|
254
397
|
|
|
255
398
|
self._set_cpus(cpus)
|
|
256
399
|
self._set_memory(memory)
|
|
257
400
|
self._set_accelerators(accelerators, accelerator_args)
|
|
401
|
+
self._set_autostop_config(autostop)
|
|
402
|
+
self._set_priority(priority)
|
|
403
|
+
self._set_volumes(volumes)
|
|
258
404
|
|
|
259
405
|
def validate(self):
|
|
260
406
|
"""Validate the resources and infer the missing fields if possible."""
|
|
@@ -265,6 +411,7 @@ class Resources:
|
|
|
265
411
|
self._try_validate_managed_job_attributes()
|
|
266
412
|
self._try_validate_image_id()
|
|
267
413
|
self._try_validate_disk_tier()
|
|
414
|
+
self._try_validate_volumes()
|
|
268
415
|
self._try_validate_ports()
|
|
269
416
|
self._try_validate_labels()
|
|
270
417
|
|
|
@@ -273,7 +420,7 @@ class Resources:
|
|
|
273
420
|
# if it fails to fetch some account specific catalog information (e.g., AWS
|
|
274
421
|
# zone mapping). It is fine to use the default catalog as this function is
|
|
275
422
|
# only for display purposes.
|
|
276
|
-
@
|
|
423
|
+
@catalog.fallback_to_default_catalog
|
|
277
424
|
def __repr__(self) -> str:
|
|
278
425
|
"""Returns a string representation for display.
|
|
279
426
|
|
|
@@ -330,8 +477,12 @@ class Resources:
|
|
|
330
477
|
if self.disk_tier is not None:
|
|
331
478
|
disk_tier = f', disk_tier={self.disk_tier.value}'
|
|
332
479
|
|
|
480
|
+
network_tier = ''
|
|
481
|
+
if self.network_tier is not None:
|
|
482
|
+
network_tier = f', network_tier={self.network_tier.value}'
|
|
483
|
+
|
|
333
484
|
disk_size = ''
|
|
334
|
-
if self.disk_size !=
|
|
485
|
+
if self.disk_size != DEFAULT_DISK_SIZE_GB:
|
|
335
486
|
disk_size = f', disk_size={self.disk_size}'
|
|
336
487
|
|
|
337
488
|
ports = ''
|
|
@@ -349,7 +500,7 @@ class Resources:
|
|
|
349
500
|
hardware_str = (
|
|
350
501
|
f'{instance_type}{use_spot}'
|
|
351
502
|
f'{cpus}{memory}{accelerators}{accelerator_args}{image_id}'
|
|
352
|
-
f'{disk_tier}{disk_size}{ports}')
|
|
503
|
+
f'{disk_tier}{network_tier}{disk_size}{ports}')
|
|
353
504
|
# It may have leading ',' (for example, instance_type not set) or empty
|
|
354
505
|
# spaces. Remove them.
|
|
355
506
|
while hardware_str and hardware_str[0] in (',', ' '):
|
|
@@ -366,7 +517,10 @@ class Resources:
|
|
|
366
517
|
def repr_with_region_zone(self) -> str:
|
|
367
518
|
region_str = ''
|
|
368
519
|
if self.region is not None:
|
|
369
|
-
|
|
520
|
+
region_name = self.region
|
|
521
|
+
if self.region.startswith('ssh-'):
|
|
522
|
+
region_name = common_utils.removeprefix(self.region, 'ssh-')
|
|
523
|
+
region_str = f', region={region_name}'
|
|
370
524
|
zone_str = ''
|
|
371
525
|
if self.zone is not None:
|
|
372
526
|
zone_str = f', zone={self.zone}'
|
|
@@ -378,19 +532,24 @@ class Resources:
|
|
|
378
532
|
return repr_str
|
|
379
533
|
|
|
380
534
|
@property
|
|
381
|
-
def
|
|
535
|
+
def infra(self) -> infra_utils.InfraInfo:
|
|
536
|
+
cloud = str(self.cloud) if self.cloud is not None else None
|
|
537
|
+
return infra_utils.InfraInfo(cloud, self.region, self.zone)
|
|
538
|
+
|
|
539
|
+
@property
|
|
540
|
+
def cloud(self) -> Optional[clouds.Cloud]:
|
|
382
541
|
return self._cloud
|
|
383
542
|
|
|
384
543
|
@property
|
|
385
|
-
def region(self):
|
|
544
|
+
def region(self) -> Optional[str]:
|
|
386
545
|
return self._region
|
|
387
546
|
|
|
388
547
|
@property
|
|
389
|
-
def zone(self):
|
|
548
|
+
def zone(self) -> Optional[str]:
|
|
390
549
|
return self._zone
|
|
391
550
|
|
|
392
551
|
@property
|
|
393
|
-
def instance_type(self):
|
|
552
|
+
def instance_type(self) -> Optional[str]:
|
|
394
553
|
return self._instance_type
|
|
395
554
|
|
|
396
555
|
@property
|
|
@@ -432,9 +591,9 @@ class Resources:
|
|
|
432
591
|
def accelerators(self) -> Optional[Dict[str, Union[int, float]]]:
|
|
433
592
|
"""Returns the accelerators field directly or by inferring.
|
|
434
593
|
|
|
435
|
-
For example, Resources(
|
|
436
|
-
set to None, but this function will infer {'V100': 1}
|
|
437
|
-
type.
|
|
594
|
+
For example, Resources(infra='aws', instance_type='p3.2xlarge') has its
|
|
595
|
+
accelerators field set to None, but this function will infer {'V100': 1}
|
|
596
|
+
from the instance type.
|
|
438
597
|
"""
|
|
439
598
|
if self._accelerators is not None:
|
|
440
599
|
return self._accelerators
|
|
@@ -444,7 +603,7 @@ class Resources:
|
|
|
444
603
|
return None
|
|
445
604
|
|
|
446
605
|
@property
|
|
447
|
-
def accelerator_args(self) -> Optional[Dict[str,
|
|
606
|
+
def accelerator_args(self) -> Optional[Dict[str, Any]]:
|
|
448
607
|
return self._accelerator_args
|
|
449
608
|
|
|
450
609
|
@property
|
|
@@ -456,7 +615,7 @@ class Resources:
|
|
|
456
615
|
return self._use_spot_specified
|
|
457
616
|
|
|
458
617
|
@property
|
|
459
|
-
def job_recovery(self) -> Optional[Dict[str, Union[str, int]]]:
|
|
618
|
+
def job_recovery(self) -> Optional[Dict[str, Optional[Union[str, int]]]]:
|
|
460
619
|
return self._job_recovery
|
|
461
620
|
|
|
462
621
|
@property
|
|
@@ -464,13 +623,17 @@ class Resources:
|
|
|
464
623
|
return self._disk_size
|
|
465
624
|
|
|
466
625
|
@property
|
|
467
|
-
def image_id(self) -> Optional[Dict[str, str]]:
|
|
626
|
+
def image_id(self) -> Optional[Dict[Optional[str], str]]:
|
|
468
627
|
return self._image_id
|
|
469
628
|
|
|
470
629
|
@property
|
|
471
|
-
def disk_tier(self) -> resources_utils.DiskTier:
|
|
630
|
+
def disk_tier(self) -> Optional[resources_utils.DiskTier]:
|
|
472
631
|
return self._disk_tier
|
|
473
632
|
|
|
633
|
+
@property
|
|
634
|
+
def network_tier(self) -> Optional[resources_utils.NetworkTier]:
|
|
635
|
+
return self._network_tier
|
|
636
|
+
|
|
474
637
|
@property
|
|
475
638
|
def ports(self) -> Optional[List[str]]:
|
|
476
639
|
return self._ports
|
|
@@ -479,6 +642,28 @@ class Resources:
|
|
|
479
642
|
def labels(self) -> Optional[Dict[str, str]]:
|
|
480
643
|
return self._labels
|
|
481
644
|
|
|
645
|
+
@property
|
|
646
|
+
def volumes(self) -> Optional[List[Dict[str, Any]]]:
|
|
647
|
+
return self._volumes
|
|
648
|
+
|
|
649
|
+
@property
|
|
650
|
+
def autostop_config(self) -> Optional[AutostopConfig]:
|
|
651
|
+
"""The requested autostop config.
|
|
652
|
+
|
|
653
|
+
Warning: This is the autostop config that was originally used to
|
|
654
|
+
launch the resources. It may not correspond to the actual current
|
|
655
|
+
autostop config.
|
|
656
|
+
"""
|
|
657
|
+
return self._autostop_config
|
|
658
|
+
|
|
659
|
+
@property
|
|
660
|
+
def priority(self) -> Optional[int]:
|
|
661
|
+
"""The priority for this resource configuration.
|
|
662
|
+
|
|
663
|
+
Higher values indicate higher priority. Valid range is -1000 to 1000.
|
|
664
|
+
"""
|
|
665
|
+
return self._priority
|
|
666
|
+
|
|
482
667
|
@property
|
|
483
668
|
def is_image_managed(self) -> Optional[bool]:
|
|
484
669
|
return self._is_image_managed
|
|
@@ -489,15 +674,32 @@ class Resources:
|
|
|
489
674
|
return False
|
|
490
675
|
return self._requires_fuse
|
|
491
676
|
|
|
677
|
+
@property
|
|
678
|
+
def no_missing_accel_warnings(self) -> bool:
|
|
679
|
+
"""Returns whether to force quiet mode for this resource."""
|
|
680
|
+
if self._no_missing_accel_warnings is None:
|
|
681
|
+
return False
|
|
682
|
+
return self._no_missing_accel_warnings
|
|
683
|
+
|
|
684
|
+
def set_requires_fuse(self, value: bool) -> None:
|
|
685
|
+
"""Sets whether this resource requires FUSE mounting support.
|
|
686
|
+
|
|
687
|
+
Args:
|
|
688
|
+
value: Whether the resource requires FUSE mounting support.
|
|
689
|
+
"""
|
|
690
|
+
# TODO(zeping): This violates the immutability of Resources.
|
|
691
|
+
# Refactor to use Resources.copy instead.
|
|
692
|
+
self._requires_fuse = value
|
|
693
|
+
|
|
492
694
|
@property
|
|
493
695
|
def cluster_config_overrides(self) -> Dict[str, Any]:
|
|
494
696
|
if self._cluster_config_overrides is None:
|
|
495
697
|
return {}
|
|
496
698
|
return self._cluster_config_overrides
|
|
497
699
|
|
|
498
|
-
@
|
|
499
|
-
def
|
|
500
|
-
self.
|
|
700
|
+
@property
|
|
701
|
+
def docker_login_config(self) -> Optional[docker_utils.DockerLoginConfig]:
|
|
702
|
+
return self._docker_login_config
|
|
501
703
|
|
|
502
704
|
@property
|
|
503
705
|
def docker_username_for_runpod(self) -> Optional[str]:
|
|
@@ -541,25 +743,27 @@ class Resources:
|
|
|
541
743
|
self._memory = None
|
|
542
744
|
return
|
|
543
745
|
|
|
544
|
-
|
|
545
|
-
|
|
546
|
-
|
|
547
|
-
|
|
548
|
-
|
|
549
|
-
|
|
550
|
-
|
|
551
|
-
|
|
552
|
-
|
|
553
|
-
|
|
554
|
-
|
|
555
|
-
memory_gb = float(num_memory_gb)
|
|
556
|
-
except ValueError:
|
|
557
|
-
with ux_utils.print_exception_no_traceback():
|
|
558
|
-
raise ValueError(
|
|
559
|
-
f'The "memory" field should be either a number or '
|
|
560
|
-
f'a string "<number>+". Found: {memory!r}') from None
|
|
746
|
+
memory = resources_utils.parse_memory_resource(str(memory),
|
|
747
|
+
'memory',
|
|
748
|
+
ret_type=float,
|
|
749
|
+
allow_plus=True,
|
|
750
|
+
allow_x=True)
|
|
751
|
+
self._memory = memory
|
|
752
|
+
if memory.endswith(('+', 'x')):
|
|
753
|
+
# 'x' is used internally for make sure our resources used by
|
|
754
|
+
# jobs controller (memory: 3x) to have enough memory based on
|
|
755
|
+
# the vCPUs.
|
|
756
|
+
num_memory_gb = memory[:-1]
|
|
561
757
|
else:
|
|
562
|
-
|
|
758
|
+
num_memory_gb = memory
|
|
759
|
+
|
|
760
|
+
try:
|
|
761
|
+
memory_gb = float(num_memory_gb)
|
|
762
|
+
except ValueError:
|
|
763
|
+
with ux_utils.print_exception_no_traceback():
|
|
764
|
+
raise ValueError(
|
|
765
|
+
f'The "memory" field should be either a number or '
|
|
766
|
+
f'a string "<number>+". Found: {memory!r}') from None
|
|
563
767
|
|
|
564
768
|
if memory_gb <= 0:
|
|
565
769
|
with ux_utils.print_exception_no_traceback():
|
|
@@ -568,8 +772,8 @@ class Resources:
|
|
|
568
772
|
|
|
569
773
|
def _set_accelerators(
|
|
570
774
|
self,
|
|
571
|
-
accelerators: Union[None, str, Dict[str, int]],
|
|
572
|
-
accelerator_args: Optional[Dict[str,
|
|
775
|
+
accelerators: Union[None, str, Dict[str, Union[int, float]]],
|
|
776
|
+
accelerator_args: Optional[Dict[str, Any]],
|
|
573
777
|
) -> None:
|
|
574
778
|
"""Sets accelerators.
|
|
575
779
|
|
|
@@ -582,6 +786,8 @@ class Resources:
|
|
|
582
786
|
if ':' not in accelerators:
|
|
583
787
|
accelerators = {accelerators: 1}
|
|
584
788
|
else:
|
|
789
|
+
assert isinstance(accelerators,
|
|
790
|
+
str), (type(accelerators), accelerators)
|
|
585
791
|
splits = accelerators.split(':')
|
|
586
792
|
parse_error = ('The "accelerators" field as a str '
|
|
587
793
|
'should be <name> or <name>:<cnt>. '
|
|
@@ -599,22 +805,29 @@ class Resources:
|
|
|
599
805
|
|
|
600
806
|
acc, _ = list(accelerators.items())[0]
|
|
601
807
|
if 'tpu' in acc.lower():
|
|
808
|
+
# TODO(syang): GCP TPU names are supported on both GCP and
|
|
809
|
+
# kubernetes (GKE), but this logic automatically assumes
|
|
810
|
+
# GCP TPUs can only be used on GCP.
|
|
811
|
+
# Fix the logic such that GCP TPU names can failover between
|
|
812
|
+
# GCP and kubernetes.
|
|
602
813
|
if self.cloud is None:
|
|
603
|
-
if kubernetes_utils.is_tpu_on_gke(acc):
|
|
814
|
+
if kubernetes_utils.is_tpu_on_gke(acc, normalize=False):
|
|
604
815
|
self._cloud = clouds.Kubernetes()
|
|
605
816
|
else:
|
|
606
817
|
self._cloud = clouds.GCP()
|
|
607
|
-
assert
|
|
608
|
-
|
|
609
|
-
|
|
610
|
-
|
|
818
|
+
assert self.cloud is not None and (
|
|
819
|
+
self.cloud.is_same_cloud(clouds.GCP()) or
|
|
820
|
+
self.cloud.is_same_cloud(clouds.Kubernetes())), (
|
|
821
|
+
'Cloud must be GCP or Kubernetes for TPU '
|
|
822
|
+
'accelerators.')
|
|
611
823
|
|
|
612
824
|
if accelerator_args is None:
|
|
613
825
|
accelerator_args = {}
|
|
614
826
|
|
|
615
827
|
use_tpu_vm = accelerator_args.get('tpu_vm', True)
|
|
616
828
|
if (self.cloud.is_same_cloud(clouds.GCP()) and
|
|
617
|
-
not kubernetes_utils.is_tpu_on_gke(acc
|
|
829
|
+
not kubernetes_utils.is_tpu_on_gke(acc,
|
|
830
|
+
normalize=False)):
|
|
618
831
|
if 'runtime_version' not in accelerator_args:
|
|
619
832
|
|
|
620
833
|
def _get_default_runtime_version() -> str:
|
|
@@ -641,15 +854,159 @@ class Resources:
|
|
|
641
854
|
'Cannot specify instance type (got '
|
|
642
855
|
f'{self.instance_type!r}) for TPU VM.')
|
|
643
856
|
|
|
644
|
-
self._accelerators
|
|
645
|
-
|
|
857
|
+
self._accelerators: Optional[Dict[str, Union[int,
|
|
858
|
+
float]]] = accelerators
|
|
859
|
+
self._accelerator_args: Optional[Dict[str, Any]] = accelerator_args
|
|
860
|
+
|
|
861
|
+
def _set_autostop_config(
|
|
862
|
+
self,
|
|
863
|
+
autostop: Union[bool, int, str, Dict[str, Any], None],
|
|
864
|
+
) -> None:
|
|
865
|
+
self._autostop_config = AutostopConfig.from_yaml_config(autostop)
|
|
866
|
+
|
|
867
|
+
def _set_priority(self, priority: Optional[int]) -> None:
|
|
868
|
+
"""Sets the priority for this resource configuration.
|
|
869
|
+
|
|
870
|
+
Args:
|
|
871
|
+
priority: Priority value from -1000 to 1000, where higher values
|
|
872
|
+
indicate higher priority. If None, no priority is set.
|
|
873
|
+
"""
|
|
874
|
+
if priority is not None:
|
|
875
|
+
if not constants.MIN_PRIORITY <= priority <= constants.MAX_PRIORITY:
|
|
876
|
+
with ux_utils.print_exception_no_traceback():
|
|
877
|
+
raise ValueError(
|
|
878
|
+
f'Priority must be between {constants.MIN_PRIORITY} and'
|
|
879
|
+
f' {constants.MAX_PRIORITY}. Found: {priority}')
|
|
880
|
+
self._priority = priority
|
|
881
|
+
|
|
882
|
+
def _set_volumes(
|
|
883
|
+
self,
|
|
884
|
+
volumes: Optional[List[Dict[str, Any]]],
|
|
885
|
+
) -> None:
|
|
886
|
+
if not volumes:
|
|
887
|
+
self._volumes = None
|
|
888
|
+
return
|
|
889
|
+
valid_volumes = []
|
|
890
|
+
supported_tiers = [tier.value for tier in resources_utils.DiskTier]
|
|
891
|
+
supported_storage_types = [
|
|
892
|
+
storage_type.value for storage_type in resources_utils.StorageType
|
|
893
|
+
]
|
|
894
|
+
supported_attach_modes = [
|
|
895
|
+
attach_mode.value for attach_mode in resources_utils.DiskAttachMode
|
|
896
|
+
]
|
|
897
|
+
network_type = resources_utils.StorageType.NETWORK
|
|
898
|
+
read_write_mode = resources_utils.DiskAttachMode.READ_WRITE
|
|
899
|
+
for volume in volumes:
|
|
900
|
+
if 'path' not in volume:
|
|
901
|
+
with ux_utils.print_exception_no_traceback():
|
|
902
|
+
raise ValueError(f'Invalid volume {volume!r}. '
|
|
903
|
+
f'Volume must have a "path" field.')
|
|
904
|
+
if 'storage_type' not in volume:
|
|
905
|
+
volume['storage_type'] = network_type
|
|
906
|
+
else:
|
|
907
|
+
if isinstance(volume['storage_type'], str):
|
|
908
|
+
storage_type_str = str(volume['storage_type']).lower()
|
|
909
|
+
if storage_type_str not in supported_storage_types:
|
|
910
|
+
logger.warning(
|
|
911
|
+
f'Invalid storage_type {storage_type_str!r}. '
|
|
912
|
+
f'Set it to '
|
|
913
|
+
f'{network_type.value}.')
|
|
914
|
+
volume['storage_type'] = network_type
|
|
915
|
+
else:
|
|
916
|
+
volume['storage_type'] = resources_utils.StorageType(
|
|
917
|
+
storage_type_str)
|
|
918
|
+
if 'auto_delete' not in volume:
|
|
919
|
+
volume['auto_delete'] = False
|
|
920
|
+
if 'attach_mode' in volume:
|
|
921
|
+
if isinstance(volume['attach_mode'], str):
|
|
922
|
+
attach_mode_str = str(volume['attach_mode']).lower()
|
|
923
|
+
if attach_mode_str not in supported_attach_modes:
|
|
924
|
+
logger.warning(
|
|
925
|
+
f'Invalid attach_mode {attach_mode_str!r}. '
|
|
926
|
+
f'Set it to {read_write_mode.value}.')
|
|
927
|
+
volume['attach_mode'] = read_write_mode
|
|
928
|
+
else:
|
|
929
|
+
volume['attach_mode'] = resources_utils.DiskAttachMode(
|
|
930
|
+
attach_mode_str)
|
|
931
|
+
else:
|
|
932
|
+
volume['attach_mode'] = read_write_mode
|
|
933
|
+
if volume['storage_type'] == network_type:
|
|
934
|
+
# TODO(luca): add units to this disk_size as well
|
|
935
|
+
if ('disk_size' in volume and
|
|
936
|
+
round(volume['disk_size']) != volume['disk_size']):
|
|
937
|
+
with ux_utils.print_exception_no_traceback():
|
|
938
|
+
raise ValueError(f'Volume size must be an integer. '
|
|
939
|
+
f'Got: {volume["size"]}.')
|
|
940
|
+
if 'name' not in volume:
|
|
941
|
+
with ux_utils.print_exception_no_traceback():
|
|
942
|
+
raise ValueError(f'Network volume {volume["path"]} '
|
|
943
|
+
f'must have "name" field.')
|
|
944
|
+
elif 'name' in volume:
|
|
945
|
+
logger.info(f'Volume {volume["path"]} is a local disk. '
|
|
946
|
+
f'The "name" field will be ignored.')
|
|
947
|
+
del volume['name']
|
|
948
|
+
if 'disk_tier' in volume:
|
|
949
|
+
if isinstance(volume['disk_tier'], str):
|
|
950
|
+
disk_tier_str = str(volume['disk_tier']).lower()
|
|
951
|
+
if disk_tier_str not in supported_tiers:
|
|
952
|
+
logger.warning(
|
|
953
|
+
f'Invalid disk_tier {disk_tier_str!r}. '
|
|
954
|
+
f'Set it to {resources_utils.DiskTier.BEST.value}.')
|
|
955
|
+
volume['disk_tier'] = resources_utils.DiskTier.BEST
|
|
956
|
+
else:
|
|
957
|
+
volume['disk_tier'] = resources_utils.DiskTier(
|
|
958
|
+
disk_tier_str)
|
|
959
|
+
elif volume['storage_type'] == network_type:
|
|
960
|
+
logger.debug(
|
|
961
|
+
f'No disk_tier specified for volume {volume["path"]}. '
|
|
962
|
+
f'Set it to {resources_utils.DiskTier.BEST.value}.')
|
|
963
|
+
volume['disk_tier'] = resources_utils.DiskTier.BEST
|
|
964
|
+
|
|
965
|
+
valid_volumes.append(volume)
|
|
966
|
+
self._volumes = valid_volumes
|
|
967
|
+
|
|
968
|
+
def override_autostop_config(
|
|
969
|
+
self,
|
|
970
|
+
down: bool = False,
|
|
971
|
+
idle_minutes: Optional[int] = None,
|
|
972
|
+
wait_for: Optional[autostop_lib.AutostopWaitFor] = None) -> None:
|
|
973
|
+
"""Override autostop config to the resource.
|
|
974
|
+
|
|
975
|
+
Args:
|
|
976
|
+
down: If true, override the autostop config to use autodown.
|
|
977
|
+
idle_minutes: If not None, override the idle minutes to autostop or
|
|
978
|
+
autodown.
|
|
979
|
+
wait_for: If not None, override the wait mode.
|
|
980
|
+
"""
|
|
981
|
+
if not down and idle_minutes is None:
|
|
982
|
+
return
|
|
983
|
+
if self._autostop_config is None:
|
|
984
|
+
self._autostop_config = AutostopConfig(enabled=True,)
|
|
985
|
+
if down:
|
|
986
|
+
self._autostop_config.down = down
|
|
987
|
+
if idle_minutes is not None:
|
|
988
|
+
self._autostop_config.idle_minutes = idle_minutes
|
|
989
|
+
if wait_for is not None:
|
|
990
|
+
self._autostop_config.wait_for = wait_for
|
|
646
991
|
|
|
647
992
|
def is_launchable(self) -> bool:
|
|
993
|
+
"""Returns whether the resource is launchable."""
|
|
648
994
|
return self.cloud is not None and self._instance_type is not None
|
|
649
995
|
|
|
996
|
+
def assert_launchable(self) -> 'LaunchableResources':
|
|
997
|
+
"""A workaround to make mypy understand that is_launchable() is true.
|
|
998
|
+
|
|
999
|
+
Note: The `cast` to `LaunchableResources` is only for static type
|
|
1000
|
+
checking with MyPy. At runtime, the Python interpreter does not enforce
|
|
1001
|
+
types, and the returned object will still be an instance of `Resources`.
|
|
1002
|
+
"""
|
|
1003
|
+
assert self.is_launchable(), self
|
|
1004
|
+
return typing.cast(LaunchableResources, self)
|
|
1005
|
+
|
|
650
1006
|
def need_cleanup_after_preemption_or_failure(self) -> bool:
|
|
651
1007
|
"""Whether a resource needs cleanup after preemption or failure."""
|
|
652
1008
|
assert self.is_launchable(), self
|
|
1009
|
+
assert self.cloud is not None, 'Cloud must be specified'
|
|
653
1010
|
return self.cloud.need_cleanup_after_preemption_or_failure(self)
|
|
654
1011
|
|
|
655
1012
|
def _try_canonicalize_accelerators(self) -> None:
|
|
@@ -706,10 +1063,10 @@ class Resources:
|
|
|
706
1063
|
else:
|
|
707
1064
|
table = log_utils.create_table(['Cloud', 'Hint'])
|
|
708
1065
|
table.add_row(['-----', '----'])
|
|
709
|
-
for
|
|
1066
|
+
for cloud_msg, error in cloud_to_errors.items():
|
|
710
1067
|
reason_str = '\n'.join(textwrap.wrap(
|
|
711
1068
|
str(error), 80))
|
|
712
|
-
table.add_row([
|
|
1069
|
+
table.add_row([cloud_msg, reason_str])
|
|
713
1070
|
hint = table.get_string()
|
|
714
1071
|
raise ValueError(
|
|
715
1072
|
f'Invalid (region {self._region!r}, zone '
|
|
@@ -741,17 +1098,22 @@ class Resources:
|
|
|
741
1098
|
ssh_proxy_command dict with region names as keys).
|
|
742
1099
|
"""
|
|
743
1100
|
assert self.is_launchable(), self
|
|
744
|
-
|
|
745
|
-
|
|
746
|
-
|
|
747
|
-
|
|
748
|
-
|
|
1101
|
+
assert self.cloud is not None, 'Cloud must be specified'
|
|
1102
|
+
assert self._instance_type is not None, (
|
|
1103
|
+
'Instance type must be specified')
|
|
1104
|
+
regions = self.cloud.regions_with_offering(self._instance_type,
|
|
1105
|
+
self.accelerators,
|
|
1106
|
+
self._use_spot, self._region,
|
|
1107
|
+
self._zone, self)
|
|
749
1108
|
if self._image_id is not None and None not in self._image_id:
|
|
750
1109
|
regions = [r for r in regions if r.name in self._image_id]
|
|
751
1110
|
|
|
752
1111
|
# Filter the regions by the skypilot_config
|
|
753
|
-
ssh_proxy_command_config = skypilot_config.
|
|
754
|
-
|
|
1112
|
+
ssh_proxy_command_config = skypilot_config.get_effective_region_config(
|
|
1113
|
+
cloud=str(self._cloud).lower(),
|
|
1114
|
+
region=None,
|
|
1115
|
+
keys=('ssh_proxy_command',),
|
|
1116
|
+
default_value=None)
|
|
755
1117
|
if (isinstance(ssh_proxy_command_config, str) or
|
|
756
1118
|
ssh_proxy_command_config is None):
|
|
757
1119
|
# All regions are valid as the regions are not specified for the
|
|
@@ -845,6 +1207,10 @@ class Resources:
|
|
|
845
1207
|
cpus, mem = self.cloud.get_vcpus_mem_from_instance_type(
|
|
846
1208
|
self._instance_type)
|
|
847
1209
|
if self._cpus is not None:
|
|
1210
|
+
assert cpus is not None, (
|
|
1211
|
+
f'Can\'t get vCPUs from instance type: '
|
|
1212
|
+
f'{self._instance_type}, check catalog or '
|
|
1213
|
+
f'specify cpus directly.')
|
|
848
1214
|
if self._cpus.endswith('+'):
|
|
849
1215
|
if cpus < float(self._cpus[:-1]):
|
|
850
1216
|
with ux_utils.print_exception_no_traceback():
|
|
@@ -859,6 +1225,10 @@ class Resources:
|
|
|
859
1225
|
f'number of vCPUs. {self.instance_type} has {cpus} '
|
|
860
1226
|
f'vCPUs, but {self._cpus} is requested.')
|
|
861
1227
|
if self.memory is not None:
|
|
1228
|
+
assert mem is not None, (
|
|
1229
|
+
f'Can\'t get memory from instance type: '
|
|
1230
|
+
f'{self._instance_type}, check catalog or '
|
|
1231
|
+
f'specify memory directly.')
|
|
862
1232
|
if self.memory.endswith(('+', 'x')):
|
|
863
1233
|
if mem < float(self.memory[:-1]):
|
|
864
1234
|
with ux_utils.print_exception_no_traceback():
|
|
@@ -882,16 +1252,22 @@ class Resources:
|
|
|
882
1252
|
if self._job_recovery is None or self._job_recovery['strategy'] is None:
|
|
883
1253
|
return
|
|
884
1254
|
# Validate the job recovery strategy
|
|
1255
|
+
assert isinstance(self._job_recovery['strategy'],
|
|
1256
|
+
str), 'Job recovery strategy must be a string'
|
|
885
1257
|
registry.JOBS_RECOVERY_STRATEGY_REGISTRY.from_str(
|
|
886
1258
|
self._job_recovery['strategy'])
|
|
887
1259
|
|
|
888
1260
|
def extract_docker_image(self) -> Optional[str]:
    """Return the docker image name carried by ``image_id``, if any.

    A docker image is only reported when ``image_id`` holds exactly one
    entry, that entry's key is either ``None`` (any region) or equal to
    ``self.region``, and its value starts with ``docker:``.

    Returns:
        The image name with the ``docker:`` prefix removed, or ``None``
        when no docker image applies.
    """
    if self.image_id is None:
        return None
    # Only a single-entry mapping can unambiguously name one image.
    if len(self.image_id) != 1:
        return None
    sole_key = next(iter(self.image_id))
    # The entry must be region-agnostic or match the selected region.
    if sole_key != self.region and sole_key is not None:
        return None
    candidate = self.image_id[sole_key]
    if not candidate.startswith('docker:'):
        return None
    return candidate[len('docker:'):]
|
|
896
1272
|
|
|
897
1273
|
def _try_validate_image_id(self) -> None:
|
|
@@ -900,6 +1276,39 @@ class Resources:
|
|
|
900
1276
|
Raises:
|
|
901
1277
|
ValueError: if the attribute is invalid.
|
|
902
1278
|
"""
|
|
1279
|
+
|
|
1280
|
+
if self._network_tier == resources_utils.NetworkTier.BEST:
|
|
1281
|
+
if isinstance(self._cloud, clouds.GCP):
|
|
1282
|
+
# Handle GPU Direct TCPX requirement for docker images
|
|
1283
|
+
if self._image_id is None:
|
|
1284
|
+
self._image_id = {
|
|
1285
|
+
self._region: gcp_constants.GCP_GPU_DIRECT_IMAGE_ID
|
|
1286
|
+
}
|
|
1287
|
+
elif isinstance(self._cloud, clouds.Nebius):
|
|
1288
|
+
if self._image_id is None:
|
|
1289
|
+
self._image_id = {
|
|
1290
|
+
self._region: nebius_constants.INFINIBAND_IMAGE_ID
|
|
1291
|
+
}
|
|
1292
|
+
elif self._image_id:
|
|
1293
|
+
# Custom image specified - validate it's a docker image
|
|
1294
|
+
# Check if any of the specified images are not docker images
|
|
1295
|
+
non_docker_images = []
|
|
1296
|
+
for region, image_id in self._image_id.items():
|
|
1297
|
+
if not image_id.startswith('docker:'):
|
|
1298
|
+
non_docker_images.append(
|
|
1299
|
+
f'{image_id} (region: {region})')
|
|
1300
|
+
|
|
1301
|
+
if non_docker_images:
|
|
1302
|
+
with ux_utils.print_exception_no_traceback():
|
|
1303
|
+
raise ValueError(
|
|
1304
|
+
f'When using network_tier=BEST, image_id '
|
|
1305
|
+
f'must be a docker image. '
|
|
1306
|
+
f'Found non-docker images: '
|
|
1307
|
+
f'{", ".join(non_docker_images)}. '
|
|
1308
|
+
f'Please either: (1) use a docker image '
|
|
1309
|
+
f'(prefix with "docker:"), or '
|
|
1310
|
+
f'(2) leave image_id empty to use the default')
|
|
1311
|
+
|
|
903
1312
|
if self._image_id is None:
|
|
904
1313
|
return
|
|
905
1314
|
|
|
@@ -916,37 +1325,51 @@ class Resources:
|
|
|
916
1325
|
'Cloud must be specified when image_id is provided.')
|
|
917
1326
|
|
|
918
1327
|
try:
|
|
919
|
-
self.
|
|
1328
|
+
self.cloud.check_features_are_supported(
|
|
920
1329
|
self,
|
|
921
1330
|
requested_features={
|
|
922
1331
|
clouds.CloudImplementationFeatures.IMAGE_ID
|
|
923
1332
|
})
|
|
924
1333
|
except exceptions.NotSupportedError as e:
|
|
1334
|
+
# Provide a more helpful error message for Lambda cloud
|
|
1335
|
+
if self.cloud.is_same_cloud(clouds.Lambda()):
|
|
1336
|
+
with ux_utils.print_exception_no_traceback():
|
|
1337
|
+
raise ValueError(
|
|
1338
|
+
'Lambda cloud only supports Docker images. '
|
|
1339
|
+
'Please prefix your image with "docker:" '
|
|
1340
|
+
'(e.g., image_id: docker:your-image-name).') from e
|
|
925
1341
|
with ux_utils.print_exception_no_traceback():
|
|
926
1342
|
raise ValueError(
|
|
927
1343
|
'image_id is only supported for AWS/GCP/Azure/IBM/OCI/'
|
|
928
|
-
'Kubernetes,
|
|
1344
|
+
'Kubernetes. For Lambda cloud, use "docker:" prefix for '
|
|
1345
|
+
'Docker images.') from e
|
|
929
1346
|
|
|
930
1347
|
if self._region is not None:
|
|
931
|
-
|
|
1348
|
+
# If the image_id has None as key (region-agnostic),
|
|
1349
|
+
# use it for any region
|
|
1350
|
+
if None in self._image_id:
|
|
1351
|
+
# Replace None key with the actual region
|
|
1352
|
+
self._image_id = {self._region: self._image_id[None]}
|
|
1353
|
+
elif self._region not in self._image_id:
|
|
932
1354
|
with ux_utils.print_exception_no_traceback():
|
|
933
1355
|
raise ValueError(
|
|
934
1356
|
f'image_id {self._image_id} should contain the image '
|
|
935
1357
|
f'for the specified region {self._region}.')
|
|
936
|
-
|
|
937
|
-
|
|
1358
|
+
else:
|
|
1359
|
+
# Narrow down the image_id to the specified region.
|
|
1360
|
+
self._image_id = {self._region: self._image_id[self._region]}
|
|
938
1361
|
|
|
939
1362
|
# Check the image_id's are valid.
|
|
940
1363
|
for region, image_id in self._image_id.items():
|
|
941
1364
|
if (image_id.startswith('skypilot:') and
|
|
942
|
-
not self.
|
|
1365
|
+
not self.cloud.is_image_tag_valid(image_id, region)):
|
|
943
1366
|
region_str = f' ({region})' if region else ''
|
|
944
1367
|
with ux_utils.print_exception_no_traceback():
|
|
945
1368
|
raise ValueError(
|
|
946
1369
|
f'Image tag {image_id!r} is not valid, please make sure'
|
|
947
1370
|
f' the tag exists in {self._cloud}{region_str}.')
|
|
948
1371
|
|
|
949
|
-
if (self.
|
|
1372
|
+
if (self.cloud.is_same_cloud(clouds.AWS()) and
|
|
950
1373
|
not image_id.startswith('skypilot:') and region is None):
|
|
951
1374
|
with ux_utils.print_exception_no_traceback():
|
|
952
1375
|
raise ValueError(
|
|
@@ -984,6 +1407,47 @@ class Resources:
|
|
|
984
1407
|
f'Disk tier {self.disk_tier.value} is not supported '
|
|
985
1408
|
f'for instance type {self.instance_type}.') from None
|
|
986
1409
|
|
|
1410
|
+
def _try_validate_volumes(self) -> None:
    """Validate the ``volumes`` attribute against the selected cloud.

    Nothing is checked when no volumes are set. Otherwise a cloud must be
    specified and must be GCP, every volume that declares a ``disk_tier``
    is checked with the cloud's ``check_disk_tier_enabled``, and a named
    volume whose storage type is NETWORK requires a region or a zone.

    Raises:
        ValueError: if the attribute is invalid.
    """
    if self.volumes is None:
        return

    if self.cloud is None:
        with ux_utils.print_exception_no_traceback():
            raise ValueError('Cloud must be specified when '
                             'volumes are provided.')

    on_gcp = self.cloud.is_same_cloud(clouds.GCP())
    if not on_gcp:
        with ux_utils.print_exception_no_traceback():
            raise ValueError(f'Volumes are only supported for GCP'
                             f' not for {self.cloud}.')

    has_named_network_volume = False
    try:
        for vol in self.volumes:
            if 'name' in vol:
                storage_type = vol['storage_type']
                if storage_type == resources_utils.StorageType.NETWORK:
                    has_named_network_volume = True
            if 'disk_tier' in vol:
                # TODO(hailong): check instance local SSD
                # support for instance_type.
                # Refer to https://cloud.google.com/compute/docs/disks/local-ssd#machine-series-lssd # pylint: disable=line-too-long
                self.cloud.check_disk_tier_enabled(self.instance_type,
                                                   vol['disk_tier'])
        # The location check runs only after every disk tier was accepted.
        location_unset = self._region is None and self._zone is None
        if has_named_network_volume and location_unset:
            with ux_utils.print_exception_no_traceback():
                raise ValueError('When specifying the volume name, please'
                                 ' also specify the region or zone.')
    except exceptions.NotSupportedError:
        # `vol` is the volume whose disk tier was being checked.
        with ux_utils.print_exception_no_traceback():
            raise ValueError(
                f'Disk tier {vol["disk_tier"].value} is not '
                f'supported for instance type {self.instance_type}.'
            ) from None
|
|
1450
|
+
|
|
987
1451
|
def _try_validate_ports(self) -> None:
|
|
988
1452
|
"""Try to validate the ports attribute.
|
|
989
1453
|
|
|
@@ -1051,6 +1515,9 @@ class Resources:
|
|
|
1051
1515
|
"""Returns cost in USD for the runtime in seconds."""
|
|
1052
1516
|
hours = seconds / 3600
|
|
1053
1517
|
# Instance.
|
|
1518
|
+
assert self.cloud is not None, 'Cloud must be specified'
|
|
1519
|
+
assert self._instance_type is not None, (
|
|
1520
|
+
'Instance type must be specified')
|
|
1054
1521
|
hourly_cost = self.cloud.instance_type_to_hourly_cost(
|
|
1055
1522
|
self._instance_type, self.use_spot, self._region, self._zone)
|
|
1056
1523
|
# Accelerators (if any).
|
|
@@ -1071,11 +1538,15 @@ class Resources:
|
|
|
1071
1538
|
def get_spot_str(self) -> str:
    """Return the ``'[Spot]'`` marker when ``use_spot`` is truthy, else ``''``."""
    if self.use_spot:
        return '[Spot]'
    return ''
|
|
1073
1540
|
|
|
1074
|
-
def make_deploy_variables(
|
|
1075
|
-
|
|
1076
|
-
|
|
1077
|
-
|
|
1078
|
-
|
|
1541
|
+
def make_deploy_variables(
|
|
1542
|
+
self,
|
|
1543
|
+
cluster_name: resources_utils.ClusterName,
|
|
1544
|
+
region: clouds.Region,
|
|
1545
|
+
zones: Optional[List[clouds.Zone]],
|
|
1546
|
+
num_nodes: int,
|
|
1547
|
+
dryrun: bool,
|
|
1548
|
+
volume_mounts: Optional[List['volume_lib.VolumeMount']] = None,
|
|
1549
|
+
) -> Dict[str, Optional[str]]:
|
|
1079
1550
|
"""Converts planned sky.Resources to resource variables.
|
|
1080
1551
|
|
|
1081
1552
|
These variables are divided into two categories: cloud-specific and
|
|
@@ -1095,8 +1566,9 @@ class Resources:
|
|
|
1095
1566
|
docker_image = self.extract_docker_image()
|
|
1096
1567
|
|
|
1097
1568
|
# Cloud specific variables
|
|
1569
|
+
assert self.cloud is not None, 'Cloud must be specified'
|
|
1098
1570
|
cloud_specific_variables = self.cloud.make_deploy_resources_variables(
|
|
1099
|
-
self, cluster_name, region, zones, num_nodes, dryrun)
|
|
1571
|
+
self, cluster_name, region, zones, num_nodes, dryrun, volume_mounts)
|
|
1100
1572
|
|
|
1101
1573
|
# TODO(andyl): Should we print some warnings if users' envs share
|
|
1102
1574
|
# same names with the cloud specific variables, but not enabled
|
|
@@ -1147,11 +1619,26 @@ class Resources:
|
|
|
1147
1619
|
# to each cloud if any cloud supports reservations for spot.
|
|
1148
1620
|
return {}
|
|
1149
1621
|
specific_reservations = set(
|
|
1150
|
-
skypilot_config.
|
|
1151
|
-
|
|
1622
|
+
skypilot_config.get_effective_region_config(
|
|
1623
|
+
cloud=str(self.cloud).lower(),
|
|
1624
|
+
region=self.region,
|
|
1625
|
+
keys=('specific_reservations',),
|
|
1626
|
+
default_value=set()))
|
|
1627
|
+
|
|
1628
|
+
if isinstance(self.cloud, clouds.DummyCloud):
|
|
1629
|
+
return self.cloud.get_reservations_available_resources(
|
|
1630
|
+
instance_type='',
|
|
1631
|
+
region='',
|
|
1632
|
+
zone=None,
|
|
1633
|
+
specific_reservations=specific_reservations)
|
|
1634
|
+
|
|
1635
|
+
assert (self.cloud is not None and self.instance_type is not None and
|
|
1636
|
+
self.region is not None), (
|
|
1637
|
+
f'Cloud, instance type, region must be specified. '
|
|
1638
|
+
f'Resources={self}, cloud={self.cloud}, '
|
|
1639
|
+
f'instance_type={self.instance_type}, region={self.region}')
|
|
1152
1640
|
return self.cloud.get_reservations_available_resources(
|
|
1153
|
-
self.
|
|
1154
|
-
specific_reservations)
|
|
1641
|
+
self.instance_type, self.region, self.zone, specific_reservations)
|
|
1155
1642
|
|
|
1156
1643
|
def less_demanding_than(
|
|
1157
1644
|
self,
|
|
@@ -1171,6 +1658,9 @@ class Resources:
|
|
|
1171
1658
|
if isinstance(other, list):
|
|
1172
1659
|
resources_list = [self.less_demanding_than(o) for o in other]
|
|
1173
1660
|
return requested_num_nodes <= sum(resources_list)
|
|
1661
|
+
|
|
1662
|
+
assert other.cloud is not None, 'Other cloud must be specified'
|
|
1663
|
+
|
|
1174
1664
|
if self.cloud is not None and not self.cloud.is_same_cloud(other.cloud):
|
|
1175
1665
|
return False
|
|
1176
1666
|
# self.cloud <= other.cloud
|
|
@@ -1234,6 +1724,12 @@ class Resources:
|
|
|
1234
1724
|
if not (self.disk_tier <= other.disk_tier): # pylint: disable=superfluous-parens
|
|
1235
1725
|
return False
|
|
1236
1726
|
|
|
1727
|
+
if self.network_tier is not None:
|
|
1728
|
+
if other.network_tier is None:
|
|
1729
|
+
return False
|
|
1730
|
+
if not self.network_tier <= other.network_tier:
|
|
1731
|
+
return False
|
|
1732
|
+
|
|
1237
1733
|
if check_ports:
|
|
1238
1734
|
if self.ports is not None:
|
|
1239
1735
|
if other.ports is None:
|
|
@@ -1259,6 +1755,7 @@ class Resources:
|
|
|
1259
1755
|
If a field in `blocked` is None, it should be considered as a wildcard
|
|
1260
1756
|
for that field.
|
|
1261
1757
|
"""
|
|
1758
|
+
assert self.cloud is not None, 'Cloud must be specified'
|
|
1262
1759
|
is_matched = True
|
|
1263
1760
|
if (blocked.cloud is not None and
|
|
1264
1761
|
not self.cloud.is_same_cloud(blocked.cloud)):
|
|
@@ -1273,6 +1770,8 @@ class Resources:
|
|
|
1273
1770
|
if (blocked.accelerators is not None and
|
|
1274
1771
|
self.accelerators != blocked.accelerators):
|
|
1275
1772
|
is_matched = False
|
|
1773
|
+
if blocked.use_spot is not None and self.use_spot != blocked.use_spot:
|
|
1774
|
+
is_matched = False
|
|
1276
1775
|
return is_matched
|
|
1277
1776
|
|
|
1278
1777
|
def is_empty(self) -> bool:
|
|
@@ -1285,8 +1784,9 @@ class Resources:
|
|
|
1285
1784
|
self._accelerators is None,
|
|
1286
1785
|
self._accelerator_args is None,
|
|
1287
1786
|
not self._use_spot_specified,
|
|
1288
|
-
self._disk_size ==
|
|
1787
|
+
self._disk_size == DEFAULT_DISK_SIZE_GB,
|
|
1289
1788
|
self._disk_tier is None,
|
|
1789
|
+
self._network_tier is None,
|
|
1290
1790
|
self._image_id is None,
|
|
1291
1791
|
self._ports is None,
|
|
1292
1792
|
self._docker_login_config is None,
|
|
@@ -1297,7 +1797,7 @@ class Resources:
|
|
|
1297
1797
|
use_spot = self.use_spot if self._use_spot_specified else None
|
|
1298
1798
|
|
|
1299
1799
|
current_override_configs = self._cluster_config_overrides
|
|
1300
|
-
if
|
|
1800
|
+
if current_override_configs is None:
|
|
1301
1801
|
current_override_configs = {}
|
|
1302
1802
|
new_override_configs = override.pop('_cluster_config_overrides', {})
|
|
1303
1803
|
overlaid_configs = skypilot_config.overlay_skypilot_config(
|
|
@@ -1310,6 +1810,10 @@ class Resources:
|
|
|
1310
1810
|
if elem is not None:
|
|
1311
1811
|
override_configs.set_nested(key, elem)
|
|
1312
1812
|
|
|
1813
|
+
current_autostop_config = None
|
|
1814
|
+
if self.autostop_config is not None:
|
|
1815
|
+
current_autostop_config = self.autostop_config.to_yaml_config()
|
|
1816
|
+
|
|
1313
1817
|
override_configs = dict(override_configs) if override_configs else None
|
|
1314
1818
|
resources = Resources(
|
|
1315
1819
|
cloud=override.pop('cloud', self.cloud),
|
|
@@ -1326,8 +1830,13 @@ class Resources:
|
|
|
1326
1830
|
zone=override.pop('zone', self.zone),
|
|
1327
1831
|
image_id=override.pop('image_id', self.image_id),
|
|
1328
1832
|
disk_tier=override.pop('disk_tier', self.disk_tier),
|
|
1833
|
+
network_tier=override.pop('network_tier', self.network_tier),
|
|
1329
1834
|
ports=override.pop('ports', self.ports),
|
|
1330
1835
|
labels=override.pop('labels', self.labels),
|
|
1836
|
+
autostop=override.pop('autostop', current_autostop_config),
|
|
1837
|
+
priority=override.pop('priority', self.priority),
|
|
1838
|
+
volumes=override.pop('volumes', self.volumes),
|
|
1839
|
+
infra=override.pop('infra', None),
|
|
1331
1840
|
_docker_login_config=override.pop('_docker_login_config',
|
|
1332
1841
|
self._docker_login_config),
|
|
1333
1842
|
_docker_username_for_runpod=override.pop(
|
|
@@ -1337,6 +1846,8 @@ class Resources:
|
|
|
1337
1846
|
self._is_image_managed),
|
|
1338
1847
|
_requires_fuse=override.pop('_requires_fuse', self._requires_fuse),
|
|
1339
1848
|
_cluster_config_overrides=override_configs,
|
|
1849
|
+
_no_missing_accel_warnings=override.pop(
|
|
1850
|
+
'no_missing_accel_warnings', self._no_missing_accel_warnings),
|
|
1340
1851
|
)
|
|
1341
1852
|
assert not override
|
|
1342
1853
|
return resources
|
|
@@ -1361,12 +1872,21 @@ class Resources:
|
|
|
1361
1872
|
if (self.disk_tier is not None and
|
|
1362
1873
|
self.disk_tier != resources_utils.DiskTier.BEST):
|
|
1363
1874
|
features.add(clouds.CloudImplementationFeatures.CUSTOM_DISK_TIER)
|
|
1875
|
+
if (self.network_tier is not None and
|
|
1876
|
+
self.network_tier == resources_utils.NetworkTier.BEST):
|
|
1877
|
+
features.add(clouds.CloudImplementationFeatures.CUSTOM_NETWORK_TIER)
|
|
1364
1878
|
if self.extract_docker_image() is not None:
|
|
1365
1879
|
features.add(clouds.CloudImplementationFeatures.DOCKER_IMAGE)
|
|
1366
1880
|
elif self.image_id is not None:
|
|
1367
1881
|
features.add(clouds.CloudImplementationFeatures.IMAGE_ID)
|
|
1368
1882
|
if self.ports is not None:
|
|
1369
1883
|
features.add(clouds.CloudImplementationFeatures.OPEN_PORTS)
|
|
1884
|
+
if self.volumes is not None:
|
|
1885
|
+
for volume in self.volumes:
|
|
1886
|
+
if 'disk_tier' in volume and volume[
|
|
1887
|
+
'disk_tier'] != resources_utils.DiskTier.BEST:
|
|
1888
|
+
features.add(
|
|
1889
|
+
clouds.CloudImplementationFeatures.CUSTOM_DISK_TIER)
|
|
1370
1890
|
return features
|
|
1371
1891
|
|
|
1372
1892
|
@staticmethod
|
|
@@ -1393,10 +1913,75 @@ class Resources:
|
|
|
1393
1913
|
config[canonical] = config[alias]
|
|
1394
1914
|
del config[alias]
|
|
1395
1915
|
|
|
1916
|
+
@classmethod
def _parse_accelerators_from_str(
        cls, accelerators: str) -> List[Tuple[str, bool]]:
    """Parse accelerators string into a list of possible accelerators.

    Accepted memory-based forms, split on ``:``, are
    ``<manufacturer>:<memory>:<count>``, ``<memory>:<count>``,
    ``<manufacturer>:<memory>`` and ``<memory>``, where ``<memory>`` looks
    like ``80GB`` or ``80GB+``. Any other string is treated as a plain
    accelerator name and returned unchanged.

    Args:
        accelerators: The accelerator specification string.

    Returns:
        A list of possible accelerators. Each element is a tuple of
        (accelerator_name, was_user_specified). was_user_specified is True
        if the accelerator was directly named by the user (for example
        "H100:2" would be True, but "80GB+" would be False since it doesn't
        mention the name of the accelerator).

    Raises:
        AssertionError: if ``accelerators`` is not a string, or if a
            three-part specification has a middle part that is not a
            memory size.
        ValueError: if the count part is not an integer.
    """
    # sanity check
    assert isinstance(accelerators, str), accelerators

    # A memory size such as "80GB" or "16gb+"; compiled once for all
    # branches below.
    memory_pattern = re.compile(r'^[0-9]+[GgMmTt][Bb]\+?$')

    manufacturer = None
    memory = None
    count = 1

    split = accelerators.split(':')
    if len(split) == 3:
        manufacturer, memory, count_str = split
        count = int(count_str)
        assert memory_pattern.match(memory), (
            'If specifying a GPU manufacturer, you must also '
            'specify the memory size')
    elif len(split) == 2 and memory_pattern.match(split[0]):
        memory = split[0]
        count = int(split[1])
    elif len(split) == 2 and memory_pattern.match(split[1]):
        manufacturer, memory = split
    elif len(split) == 1 and memory_pattern.match(split[0]):
        memory = split[0]
    else:
        # it is just an accelerator name, not a memory size
        return [(accelerators, True)]

    # we know we have some case of manufacturer, memory, count, now we
    # need to convert that to a list of possible accelerators
    # NOTE(review): presumably parse_memory_resource normalizes the size to
    # a GB string that may keep a trailing '+' - confirm against the helper.
    memory_parsed = resources_utils.parse_memory_resource(memory,
                                                          'accelerators',
                                                          allow_plus=True)
    plus = memory_parsed[-1] == '+'
    if plus:
        memory_parsed = memory_parsed[:-1]
    memory_gb = int(memory_parsed)

    # Devices found by memory size were not named by the user, hence the
    # False flag on every entry.
    return [
        (f'{device}:{count}', False)
        for device in accelerator_registry.get_devices_by_memory(
            memory_gb, plus, manufacturer=manufacturer)
    ]
|
|
1970
|
+
|
|
1396
1971
|
@classmethod
|
|
1397
1972
|
def from_yaml_config(
|
|
1398
1973
|
cls, config: Optional[Dict[str, Any]]
|
|
1399
1974
|
) -> Union[Set['Resources'], List['Resources']]:
|
|
1975
|
+
"""Creates Resources objects from a YAML config.
|
|
1976
|
+
|
|
1977
|
+
Args:
|
|
1978
|
+
config: A dict of resource config.
|
|
1979
|
+
|
|
1980
|
+
Returns:
|
|
1981
|
+
A set of Resources objects if any_of is specified, otherwise a list
|
|
1982
|
+
of Resources objects if ordered is specified, otherwise a set with
|
|
1983
|
+
a single Resources object.
|
|
1984
|
+
"""
|
|
1400
1985
|
if config is None:
|
|
1401
1986
|
return {Resources()}
|
|
1402
1987
|
|
|
@@ -1453,13 +2038,48 @@ class Resources:
|
|
|
1453
2038
|
accelerators = config.get('accelerators')
|
|
1454
2039
|
if config and accelerators is not None:
|
|
1455
2040
|
if isinstance(accelerators, str):
|
|
1456
|
-
|
|
2041
|
+
accelerators_list = cls._parse_accelerators_from_str(
|
|
2042
|
+
accelerators)
|
|
1457
2043
|
elif isinstance(accelerators, dict):
|
|
1458
|
-
|
|
2044
|
+
accelerator_names = [
|
|
1459
2045
|
f'{k}:{v}' if v is not None else f'{k}'
|
|
1460
2046
|
for k, v in accelerators.items()
|
|
1461
2047
|
]
|
|
1462
|
-
|
|
2048
|
+
accelerators_list = []
|
|
2049
|
+
for accel_name in accelerator_names:
|
|
2050
|
+
parsed_accels = cls._parse_accelerators_from_str(accel_name)
|
|
2051
|
+
accelerators_list.extend(parsed_accels)
|
|
2052
|
+
elif isinstance(accelerators, list) or isinstance(
|
|
2053
|
+
accelerators, set):
|
|
2054
|
+
accelerators_list = []
|
|
2055
|
+
for accel_name in accelerators:
|
|
2056
|
+
parsed_accels = cls._parse_accelerators_from_str(accel_name)
|
|
2057
|
+
accelerators_list.extend(parsed_accels)
|
|
2058
|
+
else:
|
|
2059
|
+
assert False, ('Invalid accelerators type:'
|
|
2060
|
+
f'{type(accelerators)}')
|
|
2061
|
+
# now that accelerators is a list, we need to decide which to
|
|
2062
|
+
# include in the final set, however, there may be multiple copies
|
|
2063
|
+
# of the same accelerator, some given by name by the user and the
|
|
2064
|
+
# other copy being given by memory size. In this case, we only care
|
|
2065
|
+
# about the user specified ones (so we can give a warning if it
|
|
2066
|
+
# doesn't exist).
|
|
2067
|
+
accel_to_user_specified: Dict[str, bool] = collections.OrderedDict()
|
|
2068
|
+
for accel, user_specified in accelerators_list:
|
|
2069
|
+
# If this accelerator is not in dict yet, or if current one is
|
|
2070
|
+
# user specified and existing one is not, update the entry
|
|
2071
|
+
accel_to_user_specified[accel] = (user_specified or
|
|
2072
|
+
accel_to_user_specified.get(
|
|
2073
|
+
accel, False))
|
|
2074
|
+
|
|
2075
|
+
# only time we care about ordered is when we are given a list,
|
|
2076
|
+
# otherwise we default to a set
|
|
2077
|
+
accelerators_type = list if isinstance(accelerators, list) else set
|
|
2078
|
+
accelerators = accelerators_type([
|
|
2079
|
+
(accel, user_specified)
|
|
2080
|
+
for accel, user_specified in accel_to_user_specified.items()
|
|
2081
|
+
])
|
|
2082
|
+
|
|
1463
2083
|
if len(accelerators) > 1 and ordered_configs:
|
|
1464
2084
|
with ux_utils.print_exception_no_traceback():
|
|
1465
2085
|
raise ValueError(
|
|
@@ -1469,7 +2089,7 @@ class Resources:
|
|
|
1469
2089
|
not isinstance(accelerators, set)):
|
|
1470
2090
|
with ux_utils.print_exception_no_traceback():
|
|
1471
2091
|
raise ValueError(
|
|
1472
|
-
'Cannot specify multiple "accelerators" with
|
|
2092
|
+
'Cannot specify multiple "accelerators" with preferred '
|
|
1473
2093
|
'order (i.e., list of accelerators) with "any_of" '
|
|
1474
2094
|
'in resources.')
|
|
1475
2095
|
|
|
@@ -1485,23 +2105,35 @@ class Resources:
|
|
|
1485
2105
|
# In Task, we store a list of resources, each with 1 accelerator.
|
|
1486
2106
|
# This for loop is for format conversion.
|
|
1487
2107
|
tmp_resources_list = []
|
|
1488
|
-
for acc in accelerators:
|
|
2108
|
+
for acc, user_specified in accelerators:
|
|
1489
2109
|
tmp_resource = config.copy()
|
|
1490
2110
|
tmp_resource['accelerators'] = acc
|
|
2111
|
+
if not user_specified:
|
|
2112
|
+
tmp_resource['_no_missing_accel_warnings'] = True
|
|
1491
2113
|
tmp_resources_list.append(
|
|
1492
2114
|
Resources._from_yaml_config_single(tmp_resource))
|
|
1493
2115
|
|
|
1494
2116
|
assert isinstance(accelerators, (list, set)), accelerators
|
|
1495
2117
|
return type(accelerators)(tmp_resources_list)
|
|
1496
|
-
|
|
1497
2118
|
return {Resources._from_yaml_config_single(config)}
|
|
1498
2119
|
|
|
1499
2120
|
@classmethod
|
|
1500
2121
|
def _from_yaml_config_single(cls, config: Dict[str, str]) -> 'Resources':
|
|
2122
|
+
resources_fields: Dict[str, Any] = {}
|
|
2123
|
+
|
|
2124
|
+
# Extract infra field if present
|
|
2125
|
+
infra = config.pop('infra', None)
|
|
2126
|
+
resources_fields['infra'] = infra
|
|
1501
2127
|
|
|
1502
|
-
|
|
2128
|
+
# Keep backward compatibility with cloud, region, zone
|
|
2129
|
+
# Note: if both `infra` and any of `cloud`, `region`, `zone` are
|
|
2130
|
+
# specified, it will raise an error during the Resources.__init__
|
|
2131
|
+
# validation.
|
|
1503
2132
|
resources_fields['cloud'] = registry.CLOUD_REGISTRY.from_str(
|
|
1504
2133
|
config.pop('cloud', None))
|
|
2134
|
+
resources_fields['region'] = config.pop('region', None)
|
|
2135
|
+
resources_fields['zone'] = config.pop('zone', None)
|
|
2136
|
+
|
|
1505
2137
|
resources_fields['instance_type'] = config.pop('instance_type', None)
|
|
1506
2138
|
resources_fields['cpus'] = config.pop('cpus', None)
|
|
1507
2139
|
resources_fields['memory'] = config.pop('memory', None)
|
|
@@ -1519,12 +2151,14 @@ class Resources:
|
|
|
1519
2151
|
# exclusive by the schema validation.
|
|
1520
2152
|
resources_fields['job_recovery'] = config.pop('job_recovery', None)
|
|
1521
2153
|
resources_fields['disk_size'] = config.pop('disk_size', None)
|
|
1522
|
-
resources_fields['region'] = config.pop('region', None)
|
|
1523
|
-
resources_fields['zone'] = config.pop('zone', None)
|
|
1524
2154
|
resources_fields['image_id'] = config.pop('image_id', None)
|
|
1525
2155
|
resources_fields['disk_tier'] = config.pop('disk_tier', None)
|
|
2156
|
+
resources_fields['network_tier'] = config.pop('network_tier', None)
|
|
1526
2157
|
resources_fields['ports'] = config.pop('ports', None)
|
|
1527
2158
|
resources_fields['labels'] = config.pop('labels', None)
|
|
2159
|
+
resources_fields['autostop'] = config.pop('autostop', None)
|
|
2160
|
+
resources_fields['priority'] = config.pop('priority', None)
|
|
2161
|
+
resources_fields['volumes'] = config.pop('volumes', None)
|
|
1528
2162
|
resources_fields['_docker_login_config'] = config.pop(
|
|
1529
2163
|
'_docker_login_config', None)
|
|
1530
2164
|
resources_fields['_docker_username_for_runpod'] = config.pop(
|
|
@@ -1543,7 +2177,11 @@ class Resources:
|
|
|
1543
2177
|
resources_fields['accelerator_args'] = dict(
|
|
1544
2178
|
resources_fields['accelerator_args'])
|
|
1545
2179
|
if resources_fields['disk_size'] is not None:
|
|
1546
|
-
|
|
2180
|
+
# although it will end up being an int, we don't know at this point
|
|
2181
|
+
# if it has units or not, so we store it as a string
|
|
2182
|
+
resources_fields['disk_size'] = str(resources_fields['disk_size'])
|
|
2183
|
+
resources_fields['_no_missing_accel_warnings'] = config.pop(
|
|
2184
|
+
'_no_missing_accel_warnings', None)
|
|
1547
2185
|
|
|
1548
2186
|
assert not config, f'Invalid resource args: {config.keys()}'
|
|
1549
2187
|
return Resources(**resources_fields)
|
|
@@ -1556,7 +2194,10 @@ class Resources:
|
|
|
1556
2194
|
if value is not None and value != 'None':
|
|
1557
2195
|
config[key] = value
|
|
1558
2196
|
|
|
1559
|
-
|
|
2197
|
+
# Construct infra field if cloud is set
|
|
2198
|
+
infra = self.infra.to_str()
|
|
2199
|
+
add_if_not_none('infra', infra)
|
|
2200
|
+
|
|
1560
2201
|
add_if_not_none('instance_type', self.instance_type)
|
|
1561
2202
|
add_if_not_none('cpus', self._cpus)
|
|
1562
2203
|
add_if_not_none('memory', self.memory)
|
|
@@ -1567,13 +2208,34 @@ class Resources:
|
|
|
1567
2208
|
add_if_not_none('use_spot', self.use_spot)
|
|
1568
2209
|
add_if_not_none('job_recovery', self.job_recovery)
|
|
1569
2210
|
add_if_not_none('disk_size', self.disk_size)
|
|
1570
|
-
add_if_not_none('region', self.region)
|
|
1571
|
-
add_if_not_none('zone', self.zone)
|
|
1572
2211
|
add_if_not_none('image_id', self.image_id)
|
|
1573
2212
|
if self.disk_tier is not None:
|
|
1574
2213
|
config['disk_tier'] = self.disk_tier.value
|
|
2214
|
+
if self.network_tier is not None:
|
|
2215
|
+
config['network_tier'] = self.network_tier.value
|
|
1575
2216
|
add_if_not_none('ports', self.ports)
|
|
1576
2217
|
add_if_not_none('labels', self.labels)
|
|
2218
|
+
if self.volumes is not None:
|
|
2219
|
+
# Convert DiskTier/StorageType enum to string value for each volume
|
|
2220
|
+
volumes = []
|
|
2221
|
+
for volume in self.volumes:
|
|
2222
|
+
volume_copy = volume.copy()
|
|
2223
|
+
if 'disk_tier' in volume_copy:
|
|
2224
|
+
volume_copy['disk_tier'] = volume_copy['disk_tier'].value
|
|
2225
|
+
if 'storage_type' in volume_copy:
|
|
2226
|
+
volume_copy['storage_type'] = volume_copy[
|
|
2227
|
+
'storage_type'].value
|
|
2228
|
+
if 'attach_mode' in volume_copy:
|
|
2229
|
+
volume_copy['attach_mode'] = volume_copy[
|
|
2230
|
+
'attach_mode'].value
|
|
2231
|
+
volumes.append(volume_copy)
|
|
2232
|
+
config['volumes'] = volumes
|
|
2233
|
+
if self._autostop_config is not None:
|
|
2234
|
+
config['autostop'] = self._autostop_config.to_yaml_config()
|
|
2235
|
+
|
|
2236
|
+
add_if_not_none('_no_missing_accel_warnings',
|
|
2237
|
+
self._no_missing_accel_warnings)
|
|
2238
|
+
add_if_not_none('priority', self.priority)
|
|
1577
2239
|
if self._docker_login_config is not None:
|
|
1578
2240
|
config['_docker_login_config'] = dataclasses.asdict(
|
|
1579
2241
|
self._docker_login_config)
|
|
@@ -1611,7 +2273,7 @@ class Resources:
|
|
|
1611
2273
|
accelerator_args = state.pop('accelerator_args', None)
|
|
1612
2274
|
state['_accelerator_args'] = accelerator_args
|
|
1613
2275
|
|
|
1614
|
-
disk_size = state.pop('disk_size',
|
|
2276
|
+
disk_size = state.pop('disk_size', DEFAULT_DISK_SIZE_GB)
|
|
1615
2277
|
state['_disk_size'] = disk_size
|
|
1616
2278
|
|
|
1617
2279
|
if version < 2:
|
|
@@ -1729,4 +2391,68 @@ class Resources:
|
|
|
1729
2391
|
self._docker_username_for_runpod = state.pop(
|
|
1730
2392
|
'_docker_username_for_runpod', None)
|
|
1731
2393
|
|
|
2394
|
+
if version < 23:
|
|
2395
|
+
self._autostop_config = None
|
|
2396
|
+
|
|
2397
|
+
if version < 24:
|
|
2398
|
+
self._volumes = None
|
|
2399
|
+
|
|
2400
|
+
if version < 25:
|
|
2401
|
+
if isinstance(state.get('_cloud', None), clouds.Kubernetes):
|
|
2402
|
+
_maybe_add_docker_prefix_to_image_id(state['_image_id'])
|
|
2403
|
+
|
|
2404
|
+
if version < 26:
|
|
2405
|
+
self._network_tier = state.get('_network_tier', None)
|
|
2406
|
+
|
|
2407
|
+
if version < 27:
|
|
2408
|
+
self._priority = None
|
|
2409
|
+
|
|
2410
|
+
if version < 28:
|
|
2411
|
+
self._no_missing_accel_warnings = state.get(
|
|
2412
|
+
'_no_missing_accel_warnings', None)
|
|
2413
|
+
|
|
1732
2414
|
self.__dict__.update(state)
|
|
2415
|
+
|
|
2416
|
+
|
|
2417
|
+
class LaunchableResources(Resources):
    """A class representing resources that can be launched on a cloud provider.

    This class is primarily a type hint for MyPy to indicate that an instance
    of `Resources` is launchable (i.e., `cloud` and `instance_type` are not
    None). It should not be instantiated directly.
    """

    def __init__(self, *args, **kwargs) -> None:  # pylint: disable=super-init-not-called,unused-argument
        """Always fail: this class exists only for static typing.

        Raises:
            AssertionError: unconditionally.
        """
        # Raise explicitly rather than `assert False`: assert statements
        # are removed under `python -O`, which would let construction
        # silently succeed without running the parent initializer.
        raise AssertionError(
            'LaunchableResources should not be instantiated directly. '
            'It is only used for type checking by MyPy.')

    @property
    def cloud(self) -> clouds.Cloud:
        """The cloud, narrowed to non-None for type checkers."""
        assert self._cloud is not None, 'Cloud must be specified'
        return self._cloud

    @property
    def instance_type(self) -> str:
        """The instance type, narrowed to non-None for type checkers."""
        assert self._instance_type is not None, (
            'Instance type must be specified')
        return self._instance_type

    def copy(self, **override) -> 'LaunchableResources':
        """Ensure MyPy understands the return type is LaunchableResources.

        This method is not expected to be called at runtime, as
        LaunchableResources should not be directly instantiated. It primarily
        serves as a type hint for static analysis.
        """
        self.assert_launchable()
        return typing.cast(LaunchableResources, super().copy(**override))
|
|
2450
|
+
|
|
2451
|
+
|
|
2452
|
+
def _maybe_add_docker_prefix_to_image_id(
        image_id_dict: Optional[Dict[Optional[str], str]]) -> None:
    """Prefix image ids with ``docker:`` in place where it is missing.

    Args:
        image_id_dict: Mapping whose values are image ids, or ``None``, in
            which case nothing happens. Values already starting with
            ``docker:`` are left untouched.
    """
    if image_id_dict is None:
        return
    # Collect the rewritten entries first, then write them back in one go;
    # only existing keys are reassigned, so the mapping keeps its order.
    prefixed = {
        key: f'docker:{image}'
        for key, image in image_id_dict.items()
        if not image.startswith('docker:')
    }
    image_id_dict.update(prefixed)
|