skypilot-nightly 1.0.0.dev20250502__py3-none-any.whl → 1.0.0.dev20251203__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sky/__init__.py +22 -6
- sky/adaptors/aws.py +81 -16
- sky/adaptors/common.py +25 -2
- sky/adaptors/coreweave.py +278 -0
- sky/adaptors/do.py +8 -2
- sky/adaptors/gcp.py +11 -0
- sky/adaptors/hyperbolic.py +8 -0
- sky/adaptors/ibm.py +5 -2
- sky/adaptors/kubernetes.py +149 -18
- sky/adaptors/nebius.py +173 -30
- sky/adaptors/primeintellect.py +1 -0
- sky/adaptors/runpod.py +68 -0
- sky/adaptors/seeweb.py +183 -0
- sky/adaptors/shadeform.py +89 -0
- sky/admin_policy.py +187 -4
- sky/authentication.py +179 -225
- sky/backends/__init__.py +4 -2
- sky/backends/backend.py +22 -9
- sky/backends/backend_utils.py +1323 -397
- sky/backends/cloud_vm_ray_backend.py +1749 -1029
- sky/backends/docker_utils.py +1 -1
- sky/backends/local_docker_backend.py +11 -6
- sky/backends/task_codegen.py +633 -0
- sky/backends/wheel_utils.py +55 -9
- sky/{clouds/service_catalog → catalog}/__init__.py +21 -19
- sky/{clouds/service_catalog → catalog}/aws_catalog.py +27 -8
- sky/{clouds/service_catalog → catalog}/azure_catalog.py +10 -7
- sky/{clouds/service_catalog → catalog}/common.py +90 -49
- sky/{clouds/service_catalog → catalog}/cudo_catalog.py +8 -5
- sky/{clouds/service_catalog → catalog}/data_fetchers/analyze.py +1 -1
- sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_aws.py +116 -80
- sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_cudo.py +38 -38
- sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_gcp.py +70 -16
- sky/catalog/data_fetchers/fetch_hyperbolic.py +136 -0
- sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_lambda_cloud.py +1 -0
- sky/catalog/data_fetchers/fetch_nebius.py +338 -0
- sky/catalog/data_fetchers/fetch_runpod.py +698 -0
- sky/catalog/data_fetchers/fetch_seeweb.py +329 -0
- sky/catalog/data_fetchers/fetch_shadeform.py +142 -0
- sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_vast.py +1 -1
- sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_vsphere.py +1 -1
- sky/{clouds/service_catalog → catalog}/do_catalog.py +5 -2
- sky/{clouds/service_catalog → catalog}/fluidstack_catalog.py +6 -3
- sky/{clouds/service_catalog → catalog}/gcp_catalog.py +41 -15
- sky/catalog/hyperbolic_catalog.py +136 -0
- sky/{clouds/service_catalog → catalog}/ibm_catalog.py +9 -6
- sky/{clouds/service_catalog → catalog}/kubernetes_catalog.py +36 -24
- sky/{clouds/service_catalog → catalog}/lambda_catalog.py +9 -6
- sky/{clouds/service_catalog → catalog}/nebius_catalog.py +9 -7
- sky/{clouds/service_catalog → catalog}/oci_catalog.py +9 -6
- sky/{clouds/service_catalog → catalog}/paperspace_catalog.py +5 -2
- sky/catalog/primeintellect_catalog.py +95 -0
- sky/{clouds/service_catalog → catalog}/runpod_catalog.py +11 -4
- sky/{clouds/service_catalog → catalog}/scp_catalog.py +9 -6
- sky/catalog/seeweb_catalog.py +184 -0
- sky/catalog/shadeform_catalog.py +165 -0
- sky/catalog/ssh_catalog.py +167 -0
- sky/{clouds/service_catalog → catalog}/vast_catalog.py +6 -3
- sky/{clouds/service_catalog → catalog}/vsphere_catalog.py +5 -2
- sky/check.py +533 -185
- sky/cli.py +5 -5975
- sky/client/{cli.py → cli/command.py} +2591 -1956
- sky/client/cli/deprecation_utils.py +99 -0
- sky/client/cli/flags.py +359 -0
- sky/client/cli/table_utils.py +322 -0
- sky/client/cli/utils.py +79 -0
- sky/client/common.py +78 -32
- sky/client/oauth.py +82 -0
- sky/client/sdk.py +1219 -319
- sky/client/sdk_async.py +827 -0
- sky/client/service_account_auth.py +47 -0
- sky/cloud_stores.py +82 -3
- sky/clouds/__init__.py +13 -0
- sky/clouds/aws.py +564 -164
- sky/clouds/azure.py +105 -83
- sky/clouds/cloud.py +140 -40
- sky/clouds/cudo.py +68 -50
- sky/clouds/do.py +66 -48
- sky/clouds/fluidstack.py +63 -44
- sky/clouds/gcp.py +339 -110
- sky/clouds/hyperbolic.py +293 -0
- sky/clouds/ibm.py +70 -49
- sky/clouds/kubernetes.py +570 -162
- sky/clouds/lambda_cloud.py +74 -54
- sky/clouds/nebius.py +210 -81
- sky/clouds/oci.py +88 -66
- sky/clouds/paperspace.py +61 -44
- sky/clouds/primeintellect.py +317 -0
- sky/clouds/runpod.py +164 -74
- sky/clouds/scp.py +89 -86
- sky/clouds/seeweb.py +477 -0
- sky/clouds/shadeform.py +400 -0
- sky/clouds/ssh.py +263 -0
- sky/clouds/utils/aws_utils.py +10 -4
- sky/clouds/utils/gcp_utils.py +87 -11
- sky/clouds/utils/oci_utils.py +38 -14
- sky/clouds/utils/scp_utils.py +231 -167
- sky/clouds/vast.py +99 -77
- sky/clouds/vsphere.py +51 -40
- sky/core.py +375 -173
- sky/dag.py +15 -0
- sky/dashboard/out/404.html +1 -1
- sky/dashboard/out/_next/static/96_E2yl3QAiIJGOYCkSpB/_buildManifest.js +1 -0
- sky/dashboard/out/_next/static/chunks/1141-e6aa9ab418717c59.js +11 -0
- sky/dashboard/out/_next/static/chunks/1272-1ef0bf0237faccdb.js +1 -0
- sky/dashboard/out/_next/static/chunks/1871-7e202677c42f43fe.js +6 -0
- sky/dashboard/out/_next/static/chunks/2260-7703229c33c5ebd5.js +1 -0
- sky/dashboard/out/_next/static/chunks/2350.fab69e61bac57b23.js +1 -0
- sky/dashboard/out/_next/static/chunks/2369.fc20f0c2c8ed9fe7.js +15 -0
- sky/dashboard/out/_next/static/chunks/2755.edd818326d489a1d.js +26 -0
- sky/dashboard/out/_next/static/chunks/3294.20a8540fe697d5ee.js +1 -0
- sky/dashboard/out/_next/static/chunks/3785.7e245f318f9d1121.js +1 -0
- sky/dashboard/out/_next/static/chunks/3800-7b45f9fbb6308557.js +1 -0
- sky/dashboard/out/_next/static/chunks/3850-ff4a9a69d978632b.js +1 -0
- sky/dashboard/out/_next/static/chunks/3937.210053269f121201.js +1 -0
- sky/dashboard/out/_next/static/chunks/4725.172ede95d1b21022.js +1 -0
- sky/dashboard/out/_next/static/chunks/4937.a2baa2df5572a276.js +15 -0
- sky/dashboard/out/_next/static/chunks/5739-d67458fcb1386c92.js +8 -0
- sky/dashboard/out/_next/static/chunks/6130-2be46d70a38f1e82.js +1 -0
- sky/dashboard/out/_next/static/chunks/616-3d59f75e2ccf9321.js +39 -0
- sky/dashboard/out/_next/static/chunks/6212-7bd06f60ba693125.js +13 -0
- sky/dashboard/out/_next/static/chunks/6856-8f27d1c10c98def8.js +1 -0
- sky/dashboard/out/_next/static/chunks/6989-01359c57e018caa4.js +1 -0
- sky/dashboard/out/_next/static/chunks/6990-9146207c4567fdfd.js +1 -0
- sky/dashboard/out/_next/static/chunks/7359-c8d04e06886000b3.js +30 -0
- sky/dashboard/out/_next/static/chunks/7411-b15471acd2cba716.js +41 -0
- sky/dashboard/out/_next/static/chunks/7615-019513abc55b3b47.js +1 -0
- sky/dashboard/out/_next/static/chunks/8640.5b9475a2d18c5416.js +16 -0
- sky/dashboard/out/_next/static/chunks/8969-452f9d5cbdd2dc73.js +1 -0
- sky/dashboard/out/_next/static/chunks/9025.fa408f3242e9028d.js +6 -0
- sky/dashboard/out/_next/static/chunks/9353-cff34f7e773b2e2b.js +1 -0
- sky/dashboard/out/_next/static/chunks/9360.a536cf6b1fa42355.js +31 -0
- sky/dashboard/out/_next/static/chunks/9847.3aaca6bb33455140.js +30 -0
- sky/dashboard/out/_next/static/chunks/fd9d1056-86323a29a8f7e46a.js +1 -0
- sky/dashboard/out/_next/static/chunks/framework-cf60a09ccd051a10.js +33 -0
- sky/dashboard/out/_next/static/chunks/main-app-587214043926b3cc.js +1 -0
- sky/dashboard/out/_next/static/chunks/main-f15ccb73239a3bf1.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/_app-bde01e4a2beec258.js +34 -0
- sky/dashboard/out/_next/static/chunks/pages/_error-c66a4e8afc46f17b.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-792db96d918c98c9.js +16 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-abfcac9c137aa543.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters-ee39056f9851a3ff.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/config-dfb9bf07b13045f4.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/index-444f1804401f04ea.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/infra/[context]-c0b5935149902e6f.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/infra-aed0ea19df7cf961.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-d66997e2bfc837cf.js +16 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs/pools/[pool]-9faf940b253e3e06.js +21 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs-2072b48b617989c9.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/users-f42674164aa73423.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/volumes-b84b948ff357c43e.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/workspace/new-3f88a1c7e86a3f86.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/workspaces/[name]-84a40f8c7c627fe4.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/workspaces-531b2f8c4bf89f82.js +1 -0
- sky/dashboard/out/_next/static/chunks/webpack-64e05f17bf2cf8ce.js +1 -0
- sky/dashboard/out/_next/static/css/0748ce22df867032.css +3 -0
- sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
- sky/dashboard/out/clusters/[cluster].html +1 -1
- sky/dashboard/out/clusters.html +1 -1
- sky/dashboard/out/config.html +1 -0
- sky/dashboard/out/index.html +1 -1
- sky/dashboard/out/infra/[context].html +1 -0
- sky/dashboard/out/infra.html +1 -0
- sky/dashboard/out/jobs/[job].html +1 -1
- sky/dashboard/out/jobs/pools/[pool].html +1 -0
- sky/dashboard/out/jobs.html +1 -1
- sky/dashboard/out/users.html +1 -0
- sky/dashboard/out/volumes.html +1 -0
- sky/dashboard/out/workspace/new.html +1 -0
- sky/dashboard/out/workspaces/[name].html +1 -0
- sky/dashboard/out/workspaces.html +1 -0
- sky/data/data_utils.py +137 -1
- sky/data/mounting_utils.py +269 -84
- sky/data/storage.py +1460 -1807
- sky/data/storage_utils.py +43 -57
- sky/exceptions.py +126 -2
- sky/execution.py +216 -63
- sky/global_user_state.py +2390 -586
- sky/jobs/__init__.py +7 -0
- sky/jobs/client/sdk.py +300 -58
- sky/jobs/client/sdk_async.py +161 -0
- sky/jobs/constants.py +15 -8
- sky/jobs/controller.py +848 -275
- sky/jobs/file_content_utils.py +128 -0
- sky/jobs/log_gc.py +193 -0
- sky/jobs/recovery_strategy.py +402 -152
- sky/jobs/scheduler.py +314 -189
- sky/jobs/server/core.py +836 -255
- sky/jobs/server/server.py +156 -115
- sky/jobs/server/utils.py +136 -0
- sky/jobs/state.py +2109 -706
- sky/jobs/utils.py +1306 -215
- sky/logs/__init__.py +21 -0
- sky/logs/agent.py +108 -0
- sky/logs/aws.py +243 -0
- sky/logs/gcp.py +91 -0
- sky/metrics/__init__.py +0 -0
- sky/metrics/utils.py +453 -0
- sky/models.py +78 -1
- sky/optimizer.py +164 -70
- sky/provision/__init__.py +90 -4
- sky/provision/aws/config.py +147 -26
- sky/provision/aws/instance.py +136 -50
- sky/provision/azure/instance.py +11 -6
- sky/provision/common.py +13 -1
- sky/provision/cudo/cudo_machine_type.py +1 -1
- sky/provision/cudo/cudo_utils.py +14 -8
- sky/provision/cudo/cudo_wrapper.py +72 -71
- sky/provision/cudo/instance.py +10 -6
- sky/provision/do/instance.py +10 -6
- sky/provision/do/utils.py +4 -3
- sky/provision/docker_utils.py +140 -33
- sky/provision/fluidstack/instance.py +13 -8
- sky/provision/gcp/__init__.py +1 -0
- sky/provision/gcp/config.py +301 -19
- sky/provision/gcp/constants.py +218 -0
- sky/provision/gcp/instance.py +36 -8
- sky/provision/gcp/instance_utils.py +18 -4
- sky/provision/gcp/volume_utils.py +247 -0
- sky/provision/hyperbolic/__init__.py +12 -0
- sky/provision/hyperbolic/config.py +10 -0
- sky/provision/hyperbolic/instance.py +437 -0
- sky/provision/hyperbolic/utils.py +373 -0
- sky/provision/instance_setup.py +101 -20
- sky/provision/kubernetes/__init__.py +5 -0
- sky/provision/kubernetes/config.py +9 -52
- sky/provision/kubernetes/constants.py +17 -0
- sky/provision/kubernetes/instance.py +919 -280
- sky/provision/kubernetes/manifests/fusermount-server-daemonset.yaml +1 -2
- sky/provision/kubernetes/network.py +27 -17
- sky/provision/kubernetes/network_utils.py +44 -43
- sky/provision/kubernetes/utils.py +1221 -534
- sky/provision/kubernetes/volume.py +343 -0
- sky/provision/lambda_cloud/instance.py +22 -16
- sky/provision/nebius/constants.py +50 -0
- sky/provision/nebius/instance.py +19 -6
- sky/provision/nebius/utils.py +237 -137
- sky/provision/oci/instance.py +10 -5
- sky/provision/paperspace/instance.py +10 -7
- sky/provision/paperspace/utils.py +1 -1
- sky/provision/primeintellect/__init__.py +10 -0
- sky/provision/primeintellect/config.py +11 -0
- sky/provision/primeintellect/instance.py +454 -0
- sky/provision/primeintellect/utils.py +398 -0
- sky/provision/provisioner.py +117 -36
- sky/provision/runpod/__init__.py +5 -0
- sky/provision/runpod/instance.py +27 -6
- sky/provision/runpod/utils.py +51 -18
- sky/provision/runpod/volume.py +214 -0
- sky/provision/scp/__init__.py +15 -0
- sky/provision/scp/config.py +93 -0
- sky/provision/scp/instance.py +707 -0
- sky/provision/seeweb/__init__.py +11 -0
- sky/provision/seeweb/config.py +13 -0
- sky/provision/seeweb/instance.py +812 -0
- sky/provision/shadeform/__init__.py +11 -0
- sky/provision/shadeform/config.py +12 -0
- sky/provision/shadeform/instance.py +351 -0
- sky/provision/shadeform/shadeform_utils.py +83 -0
- sky/provision/ssh/__init__.py +18 -0
- sky/provision/vast/instance.py +13 -8
- sky/provision/vast/utils.py +10 -7
- sky/provision/volume.py +164 -0
- sky/provision/vsphere/common/ssl_helper.py +1 -1
- sky/provision/vsphere/common/vapiconnect.py +2 -1
- sky/provision/vsphere/common/vim_utils.py +4 -4
- sky/provision/vsphere/instance.py +15 -10
- sky/provision/vsphere/vsphere_utils.py +17 -20
- sky/py.typed +0 -0
- sky/resources.py +845 -119
- sky/schemas/__init__.py +0 -0
- sky/schemas/api/__init__.py +0 -0
- sky/schemas/api/responses.py +227 -0
- sky/schemas/db/README +4 -0
- sky/schemas/db/env.py +90 -0
- sky/schemas/db/global_user_state/001_initial_schema.py +124 -0
- sky/schemas/db/global_user_state/002_add_workspace_to_cluster_history.py +35 -0
- sky/schemas/db/global_user_state/003_fix_initial_revision.py +61 -0
- sky/schemas/db/global_user_state/004_is_managed.py +34 -0
- sky/schemas/db/global_user_state/005_cluster_event.py +32 -0
- sky/schemas/db/global_user_state/006_provision_log.py +41 -0
- sky/schemas/db/global_user_state/007_cluster_event_request_id.py +34 -0
- sky/schemas/db/global_user_state/008_skylet_ssh_tunnel_metadata.py +34 -0
- sky/schemas/db/global_user_state/009_last_activity_and_launched_at.py +89 -0
- sky/schemas/db/global_user_state/010_save_ssh_key.py +66 -0
- sky/schemas/db/global_user_state/011_is_ephemeral.py +34 -0
- sky/schemas/db/kv_cache/001_initial_schema.py +29 -0
- sky/schemas/db/script.py.mako +28 -0
- sky/schemas/db/serve_state/001_initial_schema.py +67 -0
- sky/schemas/db/serve_state/002_yaml_content.py +34 -0
- sky/schemas/db/skypilot_config/001_initial_schema.py +30 -0
- sky/schemas/db/spot_jobs/001_initial_schema.py +97 -0
- sky/schemas/db/spot_jobs/002_cluster_pool.py +42 -0
- sky/schemas/db/spot_jobs/003_pool_hash.py +34 -0
- sky/schemas/db/spot_jobs/004_job_file_contents.py +42 -0
- sky/schemas/db/spot_jobs/005_logs_gc.py +38 -0
- sky/schemas/db/spot_jobs/006_controller_pid_started_at.py +34 -0
- sky/schemas/db/spot_jobs/007_config_file_content.py +34 -0
- sky/schemas/generated/__init__.py +0 -0
- sky/schemas/generated/autostopv1_pb2.py +36 -0
- sky/schemas/generated/autostopv1_pb2.pyi +43 -0
- sky/schemas/generated/autostopv1_pb2_grpc.py +146 -0
- sky/schemas/generated/jobsv1_pb2.py +86 -0
- sky/schemas/generated/jobsv1_pb2.pyi +254 -0
- sky/schemas/generated/jobsv1_pb2_grpc.py +542 -0
- sky/schemas/generated/managed_jobsv1_pb2.py +76 -0
- sky/schemas/generated/managed_jobsv1_pb2.pyi +278 -0
- sky/schemas/generated/managed_jobsv1_pb2_grpc.py +278 -0
- sky/schemas/generated/servev1_pb2.py +58 -0
- sky/schemas/generated/servev1_pb2.pyi +115 -0
- sky/schemas/generated/servev1_pb2_grpc.py +322 -0
- sky/serve/autoscalers.py +357 -5
- sky/serve/client/impl.py +310 -0
- sky/serve/client/sdk.py +47 -139
- sky/serve/client/sdk_async.py +130 -0
- sky/serve/constants.py +12 -9
- sky/serve/controller.py +68 -17
- sky/serve/load_balancer.py +106 -60
- sky/serve/load_balancing_policies.py +116 -2
- sky/serve/replica_managers.py +434 -249
- sky/serve/serve_rpc_utils.py +179 -0
- sky/serve/serve_state.py +569 -257
- sky/serve/serve_utils.py +775 -265
- sky/serve/server/core.py +66 -711
- sky/serve/server/impl.py +1093 -0
- sky/serve/server/server.py +21 -18
- sky/serve/service.py +192 -89
- sky/serve/service_spec.py +144 -20
- sky/serve/spot_placer.py +3 -0
- sky/server/auth/__init__.py +0 -0
- sky/server/auth/authn.py +50 -0
- sky/server/auth/loopback.py +38 -0
- sky/server/auth/oauth2_proxy.py +202 -0
- sky/server/common.py +478 -182
- sky/server/config.py +85 -23
- sky/server/constants.py +44 -6
- sky/server/daemons.py +295 -0
- sky/server/html/token_page.html +185 -0
- sky/server/metrics.py +160 -0
- sky/server/middleware_utils.py +166 -0
- sky/server/requests/executor.py +558 -138
- sky/server/requests/payloads.py +364 -24
- sky/server/requests/preconditions.py +21 -17
- sky/server/requests/process.py +112 -29
- sky/server/requests/request_names.py +121 -0
- sky/server/requests/requests.py +822 -226
- sky/server/requests/serializers/decoders.py +82 -31
- sky/server/requests/serializers/encoders.py +140 -22
- sky/server/requests/threads.py +117 -0
- sky/server/rest.py +455 -0
- sky/server/server.py +1309 -285
- sky/server/state.py +20 -0
- sky/server/stream_utils.py +327 -61
- sky/server/uvicorn.py +217 -3
- sky/server/versions.py +270 -0
- sky/setup_files/MANIFEST.in +11 -1
- sky/setup_files/alembic.ini +160 -0
- sky/setup_files/dependencies.py +139 -31
- sky/setup_files/setup.py +44 -42
- sky/sky_logging.py +114 -7
- sky/skylet/attempt_skylet.py +106 -24
- sky/skylet/autostop_lib.py +129 -8
- sky/skylet/configs.py +29 -20
- sky/skylet/constants.py +216 -25
- sky/skylet/events.py +101 -21
- sky/skylet/job_lib.py +345 -164
- sky/skylet/log_lib.py +297 -18
- sky/skylet/log_lib.pyi +44 -1
- sky/skylet/providers/ibm/node_provider.py +12 -8
- sky/skylet/providers/ibm/vpc_provider.py +13 -12
- sky/skylet/ray_patches/__init__.py +17 -3
- sky/skylet/ray_patches/autoscaler.py.diff +18 -0
- sky/skylet/ray_patches/cli.py.diff +19 -0
- sky/skylet/ray_patches/command_runner.py.diff +17 -0
- sky/skylet/ray_patches/log_monitor.py.diff +20 -0
- sky/skylet/ray_patches/resource_demand_scheduler.py.diff +32 -0
- sky/skylet/ray_patches/updater.py.diff +18 -0
- sky/skylet/ray_patches/worker.py.diff +41 -0
- sky/skylet/runtime_utils.py +21 -0
- sky/skylet/services.py +568 -0
- sky/skylet/skylet.py +72 -4
- sky/skylet/subprocess_daemon.py +104 -29
- sky/skypilot_config.py +506 -99
- sky/ssh_node_pools/__init__.py +1 -0
- sky/ssh_node_pools/core.py +135 -0
- sky/ssh_node_pools/server.py +233 -0
- sky/task.py +685 -163
- sky/templates/aws-ray.yml.j2 +11 -3
- sky/templates/azure-ray.yml.j2 +2 -1
- sky/templates/cudo-ray.yml.j2 +1 -0
- sky/templates/do-ray.yml.j2 +2 -1
- sky/templates/fluidstack-ray.yml.j2 +1 -0
- sky/templates/gcp-ray.yml.j2 +62 -1
- sky/templates/hyperbolic-ray.yml.j2 +68 -0
- sky/templates/ibm-ray.yml.j2 +2 -1
- sky/templates/jobs-controller.yaml.j2 +27 -24
- sky/templates/kubernetes-loadbalancer.yml.j2 +2 -0
- sky/templates/kubernetes-ray.yml.j2 +611 -50
- sky/templates/lambda-ray.yml.j2 +2 -1
- sky/templates/nebius-ray.yml.j2 +34 -12
- sky/templates/oci-ray.yml.j2 +1 -0
- sky/templates/paperspace-ray.yml.j2 +2 -1
- sky/templates/primeintellect-ray.yml.j2 +72 -0
- sky/templates/runpod-ray.yml.j2 +10 -1
- sky/templates/scp-ray.yml.j2 +4 -50
- sky/templates/seeweb-ray.yml.j2 +171 -0
- sky/templates/shadeform-ray.yml.j2 +73 -0
- sky/templates/sky-serve-controller.yaml.j2 +22 -2
- sky/templates/vast-ray.yml.j2 +1 -0
- sky/templates/vsphere-ray.yml.j2 +1 -0
- sky/templates/websocket_proxy.py +212 -37
- sky/usage/usage_lib.py +31 -15
- sky/users/__init__.py +0 -0
- sky/users/model.conf +15 -0
- sky/users/permission.py +397 -0
- sky/users/rbac.py +121 -0
- sky/users/server.py +720 -0
- sky/users/token_service.py +218 -0
- sky/utils/accelerator_registry.py +35 -5
- sky/utils/admin_policy_utils.py +84 -38
- sky/utils/annotations.py +38 -5
- sky/utils/asyncio_utils.py +78 -0
- sky/utils/atomic.py +1 -1
- sky/utils/auth_utils.py +153 -0
- sky/utils/benchmark_utils.py +60 -0
- sky/utils/cli_utils/status_utils.py +159 -86
- sky/utils/cluster_utils.py +31 -9
- sky/utils/command_runner.py +354 -68
- sky/utils/command_runner.pyi +93 -3
- sky/utils/common.py +35 -8
- sky/utils/common_utils.py +314 -91
- sky/utils/config_utils.py +74 -5
- sky/utils/context.py +403 -0
- sky/utils/context_utils.py +242 -0
- sky/utils/controller_utils.py +383 -89
- sky/utils/dag_utils.py +31 -12
- sky/utils/db/__init__.py +0 -0
- sky/utils/db/db_utils.py +485 -0
- sky/utils/db/kv_cache.py +149 -0
- sky/utils/db/migration_utils.py +137 -0
- sky/utils/directory_utils.py +12 -0
- sky/utils/env_options.py +13 -0
- sky/utils/git.py +567 -0
- sky/utils/git_clone.sh +460 -0
- sky/utils/infra_utils.py +195 -0
- sky/utils/kubernetes/cleanup-tunnel.sh +62 -0
- sky/utils/kubernetes/config_map_utils.py +133 -0
- sky/utils/kubernetes/create_cluster.sh +15 -29
- sky/utils/kubernetes/delete_cluster.sh +10 -7
- sky/utils/kubernetes/deploy_ssh_node_pools.py +1177 -0
- sky/utils/kubernetes/exec_kubeconfig_converter.py +22 -31
- sky/utils/kubernetes/generate_kind_config.py +6 -66
- sky/utils/kubernetes/generate_kubeconfig.sh +4 -1
- sky/utils/kubernetes/gpu_labeler.py +18 -8
- sky/utils/kubernetes/k8s_gpu_labeler_job.yaml +2 -1
- sky/utils/kubernetes/k8s_gpu_labeler_setup.yaml +16 -16
- sky/utils/kubernetes/kubernetes_deploy_utils.py +284 -114
- sky/utils/kubernetes/rsync_helper.sh +11 -3
- sky/utils/kubernetes/ssh-tunnel.sh +379 -0
- sky/utils/kubernetes/ssh_utils.py +221 -0
- sky/utils/kubernetes_enums.py +8 -15
- sky/utils/lock_events.py +94 -0
- sky/utils/locks.py +416 -0
- sky/utils/log_utils.py +82 -107
- sky/utils/perf_utils.py +22 -0
- sky/utils/resource_checker.py +298 -0
- sky/utils/resources_utils.py +249 -32
- sky/utils/rich_utils.py +217 -39
- sky/utils/schemas.py +955 -160
- sky/utils/serialize_utils.py +16 -0
- sky/utils/status_lib.py +10 -0
- sky/utils/subprocess_utils.py +29 -15
- sky/utils/tempstore.py +70 -0
- sky/utils/thread_utils.py +91 -0
- sky/utils/timeline.py +26 -53
- sky/utils/ux_utils.py +84 -15
- sky/utils/validator.py +11 -1
- sky/utils/volume.py +165 -0
- sky/utils/yaml_utils.py +111 -0
- sky/volumes/__init__.py +13 -0
- sky/volumes/client/__init__.py +0 -0
- sky/volumes/client/sdk.py +150 -0
- sky/volumes/server/__init__.py +0 -0
- sky/volumes/server/core.py +270 -0
- sky/volumes/server/server.py +124 -0
- sky/volumes/volume.py +215 -0
- sky/workspaces/__init__.py +0 -0
- sky/workspaces/core.py +655 -0
- sky/workspaces/server.py +101 -0
- sky/workspaces/utils.py +56 -0
- sky_templates/README.md +3 -0
- sky_templates/__init__.py +3 -0
- sky_templates/ray/__init__.py +0 -0
- sky_templates/ray/start_cluster +183 -0
- sky_templates/ray/stop_cluster +75 -0
- skypilot_nightly-1.0.0.dev20251203.dist-info/METADATA +676 -0
- skypilot_nightly-1.0.0.dev20251203.dist-info/RECORD +611 -0
- {skypilot_nightly-1.0.0.dev20250502.dist-info → skypilot_nightly-1.0.0.dev20251203.dist-info}/WHEEL +1 -1
- skypilot_nightly-1.0.0.dev20251203.dist-info/top_level.txt +2 -0
- sky/benchmark/benchmark_state.py +0 -256
- sky/benchmark/benchmark_utils.py +0 -641
- sky/clouds/service_catalog/constants.py +0 -7
- sky/dashboard/out/_next/static/GWvVBSCS7FmUiVmjaL1a7/_buildManifest.js +0 -1
- sky/dashboard/out/_next/static/chunks/236-2db3ee3fba33dd9e.js +0 -6
- sky/dashboard/out/_next/static/chunks/312-c3c8845990db8ffc.js +0 -15
- sky/dashboard/out/_next/static/chunks/37-0a572fe0dbb89c4d.js +0 -6
- sky/dashboard/out/_next/static/chunks/678-206dddca808e6d16.js +0 -59
- sky/dashboard/out/_next/static/chunks/845-9e60713e0c441abc.js +0 -1
- sky/dashboard/out/_next/static/chunks/979-7bf73a4c7cea0f5c.js +0 -1
- sky/dashboard/out/_next/static/chunks/fd9d1056-2821b0f0cabcd8bd.js +0 -1
- sky/dashboard/out/_next/static/chunks/framework-87d061ee6ed71b28.js +0 -33
- sky/dashboard/out/_next/static/chunks/main-app-241eb28595532291.js +0 -1
- sky/dashboard/out/_next/static/chunks/main-e0e2335212e72357.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/_app-e6b013bc3f77ad60.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/_error-1be831200e60c5c0.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-6ac338bc2239cb45.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-f383db7389368ea7.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/clusters-a93b93e10b8b074e.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/index-f9f039532ca8cbc4.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-1c519e1afc523dc9.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/jobs-a75029b67aab6a2e.js +0 -1
- sky/dashboard/out/_next/static/chunks/webpack-830f59b8404e96b8.js +0 -1
- sky/dashboard/out/_next/static/css/c6933bbb2ce7f4dd.css +0 -3
- sky/jobs/dashboard/dashboard.py +0 -223
- sky/jobs/dashboard/static/favicon.ico +0 -0
- sky/jobs/dashboard/templates/index.html +0 -831
- sky/jobs/server/dashboard_utils.py +0 -69
- sky/skylet/providers/scp/__init__.py +0 -2
- sky/skylet/providers/scp/config.py +0 -149
- sky/skylet/providers/scp/node_provider.py +0 -578
- sky/templates/kubernetes-ssh-jump.yml.j2 +0 -94
- sky/utils/db_utils.py +0 -100
- sky/utils/kubernetes/deploy_remote_cluster.sh +0 -308
- sky/utils/kubernetes/ssh_jump_lifecycle_manager.py +0 -191
- skypilot_nightly-1.0.0.dev20250502.dist-info/METADATA +0 -361
- skypilot_nightly-1.0.0.dev20250502.dist-info/RECORD +0 -396
- skypilot_nightly-1.0.0.dev20250502.dist-info/top_level.txt +0 -1
- /sky/{clouds/service_catalog → catalog}/config.py +0 -0
- /sky/{benchmark → catalog/data_fetchers}/__init__.py +0 -0
- /sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_azure.py +0 -0
- /sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_fluidstack.py +0 -0
- /sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_ibm.py +0 -0
- /sky/{clouds/service_catalog/data_fetchers → client/cli}/__init__.py +0 -0
- /sky/dashboard/out/_next/static/{GWvVBSCS7FmUiVmjaL1a7 → 96_E2yl3QAiIJGOYCkSpB}/_ssgManifest.js +0 -0
- {skypilot_nightly-1.0.0.dev20250502.dist-info → skypilot_nightly-1.0.0.dev20251203.dist-info}/entry_points.txt +0 -0
- {skypilot_nightly-1.0.0.dev20250502.dist-info → skypilot_nightly-1.0.0.dev20251203.dist-info}/licenses/LICENSE +0 -0
|
@@ -0,0 +1,136 @@
|
|
|
1
|
+
"""Hyperbolic Cloud service catalog.
|
|
2
|
+
|
|
3
|
+
This module loads and queries the service catalog for Hyperbolic Cloud.
|
|
4
|
+
"""
|
|
5
|
+
from typing import Dict, List, Optional, Tuple, Union
|
|
6
|
+
|
|
7
|
+
from sky.catalog import common
|
|
8
|
+
from sky.clouds import cloud # Import cloud here for Region
|
|
9
|
+
from sky.utils import ux_utils
|
|
10
|
+
|
|
11
|
+
# Initialize cloud variable at module level
|
|
12
|
+
CLOUD = 'hyperbolic'
|
|
13
|
+
|
|
14
|
+
_df = common.read_catalog('hyperbolic/vms.csv')
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
def instance_type_exists(instance_type: str) -> bool:
|
|
18
|
+
return common.instance_type_exists_impl(_df, instance_type)
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
def validate_region_zone(
|
|
22
|
+
region: Optional[str],
|
|
23
|
+
zone: Optional[str]) -> Tuple[Optional[str], Optional[str]]:
|
|
24
|
+
if zone is not None:
|
|
25
|
+
with ux_utils.print_exception_no_traceback():
|
|
26
|
+
raise ValueError('Hyperbolic Cloud does not support zones.')
|
|
27
|
+
return common.validate_region_zone_impl('hyperbolic', _df, region, zone)
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
def get_hourly_cost(
|
|
31
|
+
instance_type: str,
|
|
32
|
+
use_spot: bool = False,
|
|
33
|
+
region: Optional[str] = None,
|
|
34
|
+
zone: Optional[str] = None,
|
|
35
|
+
) -> float:
|
|
36
|
+
if zone is not None:
|
|
37
|
+
with ux_utils.print_exception_no_traceback():
|
|
38
|
+
raise ValueError('Hyperbolic Cloud does not support zones.')
|
|
39
|
+
return common.get_hourly_cost_impl(_df, instance_type, use_spot, region,
|
|
40
|
+
zone)
|
|
41
|
+
|
|
42
|
+
|
|
43
|
+
def get_vcpus_mem_from_instance_type(
|
|
44
|
+
instance_type: str,) -> Tuple[Optional[float], Optional[float]]:
|
|
45
|
+
return common.get_vcpus_mem_from_instance_type_impl(_df, instance_type)
|
|
46
|
+
|
|
47
|
+
|
|
48
|
+
def get_accelerators_from_instance_type(
|
|
49
|
+
instance_type: str) -> Optional[Dict[str, Union[int, float]]]:
|
|
50
|
+
return common.get_accelerators_from_instance_type_impl(_df, instance_type)
|
|
51
|
+
|
|
52
|
+
|
|
53
|
+
def get_vcpus_from_instance_type(instance_type: str) -> Optional[float]:
|
|
54
|
+
vcpus, _ = get_vcpus_mem_from_instance_type(instance_type)
|
|
55
|
+
return vcpus
|
|
56
|
+
|
|
57
|
+
|
|
58
|
+
def get_memory_from_instance_type(instance_type: str) -> Optional[float]:
|
|
59
|
+
_, mem = get_vcpus_mem_from_instance_type(instance_type)
|
|
60
|
+
return mem
|
|
61
|
+
|
|
62
|
+
|
|
63
|
+
def get_zone_shell_cmd() -> Optional[str]:
|
|
64
|
+
"""Returns the shell command to obtain the zone."""
|
|
65
|
+
return None
|
|
66
|
+
|
|
67
|
+
|
|
68
|
+
def get_default_instance_type(cpus: Optional[str] = None,
|
|
69
|
+
memory: Optional[str] = None,
|
|
70
|
+
disk_tier: Optional[str] = None,
|
|
71
|
+
region: Optional[str] = None,
|
|
72
|
+
zone: Optional[str] = None) -> Optional[str]:
|
|
73
|
+
del disk_tier # Unused
|
|
74
|
+
return common.get_instance_type_for_cpus_mem_impl(_df, cpus, memory, region,
|
|
75
|
+
zone)
|
|
76
|
+
|
|
77
|
+
|
|
78
|
+
def get_instance_type_for_accelerator(
|
|
79
|
+
acc_name: str,
|
|
80
|
+
acc_count: int,
|
|
81
|
+
cpus: Optional[str] = None,
|
|
82
|
+
memory: Optional[str] = None,
|
|
83
|
+
use_spot: bool = False,
|
|
84
|
+
region: Optional[str] = None,
|
|
85
|
+
zone: Optional[str] = None,
|
|
86
|
+
) -> Tuple[Optional[List[str]], List[str]]:
|
|
87
|
+
if zone is not None:
|
|
88
|
+
with ux_utils.print_exception_no_traceback():
|
|
89
|
+
raise ValueError('Hyperbolic Cloud does not support zones.')
|
|
90
|
+
return common.get_instance_type_for_accelerator_impl(df=_df,
|
|
91
|
+
acc_name=acc_name,
|
|
92
|
+
acc_count=acc_count,
|
|
93
|
+
cpus=cpus,
|
|
94
|
+
memory=memory,
|
|
95
|
+
use_spot=use_spot,
|
|
96
|
+
region=region,
|
|
97
|
+
zone=zone)
|
|
98
|
+
|
|
99
|
+
|
|
100
|
+
def get_region_zones_for_instance_type(instance_type: str,
|
|
101
|
+
use_spot: bool) -> List[cloud.Region]:
|
|
102
|
+
df = _df[_df['InstanceType'] == instance_type]
|
|
103
|
+
return common.get_region_zones(df, use_spot)
|
|
104
|
+
|
|
105
|
+
|
|
106
|
+
def get_gen_version(instance_type: str) -> Optional[str]:
|
|
107
|
+
"""Returns the generation version of the instance type."""
|
|
108
|
+
del instance_type # Unused
|
|
109
|
+
# TODO: Implement generation version detection
|
|
110
|
+
return None
|
|
111
|
+
|
|
112
|
+
|
|
113
|
+
def list_accelerators(
|
|
114
|
+
gpus_only: bool = True,
|
|
115
|
+
name_filter: Optional[str] = None,
|
|
116
|
+
region_filter: Optional[str] = None,
|
|
117
|
+
quantity_filter: Optional[int] = None,
|
|
118
|
+
case_sensitive: bool = True,
|
|
119
|
+
all_regions: bool = False,
|
|
120
|
+
require_price: bool = True,
|
|
121
|
+
) -> Dict[str, List[common.InstanceTypeInfo]]:
|
|
122
|
+
"""Returns all instance types in Hyperbolic Cloud offering accelerators."""
|
|
123
|
+
del require_price # Unused
|
|
124
|
+
return common.list_accelerators_impl('Hyperbolic', _df, gpus_only,
|
|
125
|
+
name_filter, region_filter,
|
|
126
|
+
quantity_filter, case_sensitive,
|
|
127
|
+
all_regions)
|
|
128
|
+
|
|
129
|
+
|
|
130
|
+
def get_instance_type_from_catalog() -> dict:
|
|
131
|
+
# TODO: Implement this function
|
|
132
|
+
return {}
|
|
133
|
+
|
|
134
|
+
|
|
135
|
+
def regions() -> List[cloud.Region]:
|
|
136
|
+
return [cloud.Region('default')]
|
|
@@ -8,8 +8,8 @@ from typing import Dict, List, Optional, Tuple, Union
|
|
|
8
8
|
|
|
9
9
|
from sky import sky_logging
|
|
10
10
|
from sky.adaptors import ibm
|
|
11
|
+
from sky.catalog import common
|
|
11
12
|
from sky.clouds import cloud
|
|
12
|
-
from sky.clouds.service_catalog import common
|
|
13
13
|
from sky.utils import resources_utils
|
|
14
14
|
|
|
15
15
|
logger = sky_logging.init_logger(__name__)
|
|
@@ -92,10 +92,12 @@ def list_accelerators(
|
|
|
92
92
|
case_sensitive, all_regions)
|
|
93
93
|
|
|
94
94
|
|
|
95
|
-
def get_default_instance_type(
|
|
96
|
-
|
|
97
|
-
|
|
98
|
-
|
|
95
|
+
def get_default_instance_type(cpus: Optional[str] = None,
|
|
96
|
+
memory: Optional[str] = None,
|
|
97
|
+
disk_tier: Optional[
|
|
98
|
+
resources_utils.DiskTier] = None,
|
|
99
|
+
region: Optional[str] = None,
|
|
100
|
+
zone: Optional[str] = None) -> Optional[str]:
|
|
99
101
|
del disk_tier # unused
|
|
100
102
|
if cpus is None and memory is None:
|
|
101
103
|
cpus = f'{_DEFAULT_NUM_VCPUS}+'
|
|
@@ -107,7 +109,8 @@ def get_default_instance_type(
|
|
|
107
109
|
instance_type_prefix = f'{_DEFAULT_INSTANCE_FAMILY}-'
|
|
108
110
|
df = _df[_df['InstanceType'].str.startswith(instance_type_prefix)]
|
|
109
111
|
return common.get_instance_type_for_cpus_mem_impl(df, cpus,
|
|
110
|
-
memory_gb_or_ratio
|
|
112
|
+
memory_gb_or_ratio,
|
|
113
|
+
region, zone)
|
|
111
114
|
|
|
112
115
|
|
|
113
116
|
def is_image_tag_valid(tag: str, region: Optional[str]) -> bool:
|
|
@@ -3,6 +3,7 @@
|
|
|
3
3
|
Kubernetes does not require a catalog of instances, but we need an image catalog
|
|
4
4
|
mapping SkyPilot image tags to corresponding container image tags.
|
|
5
5
|
"""
|
|
6
|
+
import collections
|
|
6
7
|
import re
|
|
7
8
|
import typing
|
|
8
9
|
from typing import Dict, List, Optional, Set, Tuple
|
|
@@ -12,9 +13,9 @@ from sky import clouds as sky_clouds
|
|
|
12
13
|
from sky import sky_logging
|
|
13
14
|
from sky.adaptors import common as adaptors_common
|
|
14
15
|
from sky.adaptors import kubernetes
|
|
16
|
+
from sky.catalog import CloudFilter
|
|
17
|
+
from sky.catalog import common
|
|
15
18
|
from sky.clouds import cloud
|
|
16
|
-
from sky.clouds.service_catalog import CloudFilter
|
|
17
|
-
from sky.clouds.service_catalog import common
|
|
18
19
|
from sky.provision.kubernetes import utils as kubernetes_utils
|
|
19
20
|
|
|
20
21
|
logger = sky_logging.init_logger(__name__)
|
|
@@ -167,12 +168,25 @@ def _list_accelerators(
|
|
|
167
168
|
accelerators_qtys: Set[Tuple[str, int]] = set()
|
|
168
169
|
keys = lf.get_label_keys()
|
|
169
170
|
nodes = kubernetes_utils.get_kubernetes_nodes(context=context)
|
|
170
|
-
|
|
171
|
-
if
|
|
172
|
-
|
|
171
|
+
|
|
172
|
+
# Check if any nodes have accelerators before fetching pods
|
|
173
|
+
has_accelerator_nodes = False
|
|
174
|
+
for node in nodes:
|
|
175
|
+
for key in keys:
|
|
176
|
+
if key in node.metadata.labels:
|
|
177
|
+
has_accelerator_nodes = True
|
|
178
|
+
break
|
|
179
|
+
if has_accelerator_nodes:
|
|
180
|
+
break
|
|
181
|
+
|
|
182
|
+
# Only fetch pods if we have accelerator nodes and realtime is requested
|
|
183
|
+
allocated_qty_by_node: Dict[str, int] = collections.defaultdict(int)
|
|
184
|
+
error_on_get_allocated_gpu_qty_by_node = False
|
|
185
|
+
if realtime and has_accelerator_nodes:
|
|
186
|
+
# Get the allocated GPU quantity by each node
|
|
173
187
|
try:
|
|
174
|
-
|
|
175
|
-
context=context)
|
|
188
|
+
allocated_qty_by_node = (
|
|
189
|
+
kubernetes_utils.get_allocated_gpu_qty_by_node(context=context))
|
|
176
190
|
except kubernetes.api_exception() as e:
|
|
177
191
|
if e.status == 403:
|
|
178
192
|
logger.warning(
|
|
@@ -180,6 +194,7 @@ def _list_accelerators(
|
|
|
180
194
|
'(forbidden). Please check if your account has '
|
|
181
195
|
'necessary permissions to list pods. Realtime GPU '
|
|
182
196
|
'availability information may be incorrect.')
|
|
197
|
+
error_on_get_allocated_gpu_qty_by_node = True
|
|
183
198
|
else:
|
|
184
199
|
raise
|
|
185
200
|
# Total number of GPUs in the cluster
|
|
@@ -191,10 +206,13 @@ def _list_accelerators(
|
|
|
191
206
|
for node in nodes:
|
|
192
207
|
for key in keys:
|
|
193
208
|
if key in node.metadata.labels:
|
|
194
|
-
allocated_qty = 0
|
|
195
209
|
accelerator_name = lf.get_accelerator_from_label_value(
|
|
196
210
|
node.metadata.labels.get(key))
|
|
197
211
|
|
|
212
|
+
# Heterogenous cluster may have some nodes with empty labels.
|
|
213
|
+
if not accelerator_name:
|
|
214
|
+
continue
|
|
215
|
+
|
|
198
216
|
# Exclude multi-host TPUs from being processed.
|
|
199
217
|
# TODO(Doyoung): Remove the logic when adding support for
|
|
200
218
|
# multi-host TPUs.
|
|
@@ -210,9 +228,9 @@ def _list_accelerators(
|
|
|
210
228
|
# Generate the accelerator quantities
|
|
211
229
|
accelerator_count = (
|
|
212
230
|
kubernetes_utils.get_node_accelerator_count(
|
|
213
|
-
node.status.allocatable))
|
|
231
|
+
context, node.status.allocatable))
|
|
214
232
|
|
|
215
|
-
if
|
|
233
|
+
if accelerator_count > 0:
|
|
216
234
|
# TPUs are counted in a different way compared to GPUs.
|
|
217
235
|
# Multi-node GPUs can be split into smaller units and be
|
|
218
236
|
# provisioned, but TPUs are considered as an atomic unit.
|
|
@@ -242,24 +260,18 @@ def _list_accelerators(
|
|
|
242
260
|
total_accelerators_capacity[
|
|
243
261
|
accelerator_name] += quantized_count
|
|
244
262
|
|
|
245
|
-
if
|
|
246
|
-
# If we can't get the
|
|
263
|
+
if error_on_get_allocated_gpu_qty_by_node:
|
|
264
|
+
# If we can't get the allocated GPU quantity by each node,
|
|
265
|
+
# we can't get the GPU usage.
|
|
247
266
|
total_accelerators_available[accelerator_name] = -1
|
|
248
267
|
continue
|
|
249
268
|
|
|
250
|
-
|
|
251
|
-
# Get all the pods running on the node
|
|
252
|
-
if (pod.spec.node_name == node.metadata.name and
|
|
253
|
-
pod.status.phase in ['Running', 'Pending']):
|
|
254
|
-
# Iterate over all the containers in the pod and sum
|
|
255
|
-
# the GPU requests
|
|
256
|
-
for container in pod.spec.containers:
|
|
257
|
-
if container.resources.requests:
|
|
258
|
-
allocated_qty += (
|
|
259
|
-
kubernetes_utils.get_node_accelerator_count(
|
|
260
|
-
container.resources.requests))
|
|
261
|
-
|
|
269
|
+
allocated_qty = allocated_qty_by_node[node.metadata.name]
|
|
262
270
|
accelerators_available = accelerator_count - allocated_qty
|
|
271
|
+
# Initialize the total_accelerators_available to make sure the
|
|
272
|
+
# key exists in the dictionary.
|
|
273
|
+
total_accelerators_available[accelerator_name] = (
|
|
274
|
+
total_accelerators_available.get(accelerator_name, 0))
|
|
263
275
|
|
|
264
276
|
if accelerators_available >= min_quantity_filter:
|
|
265
277
|
quantized_availability = min_quantity_filter * (
|
|
@@ -6,7 +6,7 @@ instance types and pricing information for Lambda.
|
|
|
6
6
|
import typing
|
|
7
7
|
from typing import Dict, List, Optional, Tuple, Union
|
|
8
8
|
|
|
9
|
-
from sky.
|
|
9
|
+
from sky.catalog import common
|
|
10
10
|
from sky.utils import resources_utils
|
|
11
11
|
from sky.utils import ux_utils
|
|
12
12
|
|
|
@@ -56,10 +56,12 @@ def get_vcpus_mem_from_instance_type(
|
|
|
56
56
|
return common.get_vcpus_mem_from_instance_type_impl(_df, instance_type)
|
|
57
57
|
|
|
58
58
|
|
|
59
|
-
def get_default_instance_type(
|
|
60
|
-
|
|
61
|
-
|
|
62
|
-
|
|
59
|
+
def get_default_instance_type(cpus: Optional[str] = None,
|
|
60
|
+
memory: Optional[str] = None,
|
|
61
|
+
disk_tier: Optional[
|
|
62
|
+
resources_utils.DiskTier] = None,
|
|
63
|
+
region: Optional[str] = None,
|
|
64
|
+
zone: Optional[str] = None) -> Optional[str]:
|
|
63
65
|
del disk_tier # unused
|
|
64
66
|
if cpus is None and memory is None:
|
|
65
67
|
cpus = f'{_DEFAULT_NUM_VCPUS}+'
|
|
@@ -68,7 +70,8 @@ def get_default_instance_type(
|
|
|
68
70
|
else:
|
|
69
71
|
memory_gb_or_ratio = memory
|
|
70
72
|
return common.get_instance_type_for_cpus_mem_impl(_df, cpus,
|
|
71
|
-
memory_gb_or_ratio
|
|
73
|
+
memory_gb_or_ratio,
|
|
74
|
+
region, zone)
|
|
72
75
|
|
|
73
76
|
|
|
74
77
|
def get_accelerators_from_instance_type(
|
|
@@ -6,7 +6,7 @@ instance types and pricing information for Nebius.
|
|
|
6
6
|
import typing
|
|
7
7
|
from typing import Dict, List, Optional, Tuple, Union
|
|
8
8
|
|
|
9
|
-
from sky.
|
|
9
|
+
from sky.catalog import common
|
|
10
10
|
from sky.utils import resources_utils
|
|
11
11
|
from sky.utils import ux_utils
|
|
12
12
|
|
|
@@ -38,7 +38,6 @@ def get_hourly_cost(instance_type: str,
|
|
|
38
38
|
region: Optional[str] = None,
|
|
39
39
|
zone: Optional[str] = None) -> float:
|
|
40
40
|
"""Returns the cost, or the cheapest cost among all zones for spot."""
|
|
41
|
-
assert not use_spot, 'Nebius does not support spot.'
|
|
42
41
|
if zone is not None:
|
|
43
42
|
with ux_utils.print_exception_no_traceback():
|
|
44
43
|
raise ValueError('Nebius does not support zones.')
|
|
@@ -51,12 +50,15 @@ def get_vcpus_mem_from_instance_type(
|
|
|
51
50
|
return common.get_vcpus_mem_from_instance_type_impl(_df, instance_type)
|
|
52
51
|
|
|
53
52
|
|
|
54
|
-
def get_default_instance_type(
|
|
55
|
-
|
|
56
|
-
|
|
57
|
-
|
|
53
|
+
def get_default_instance_type(cpus: Optional[str] = None,
|
|
54
|
+
memory: Optional[str] = None,
|
|
55
|
+
disk_tier: Optional[
|
|
56
|
+
resources_utils.DiskTier] = None,
|
|
57
|
+
region: Optional[str] = None,
|
|
58
|
+
zone: Optional[str] = None) -> Optional[str]:
|
|
58
59
|
del disk_tier # unused
|
|
59
|
-
return common.get_instance_type_for_cpus_mem_impl(_df, cpus, memory
|
|
60
|
+
return common.get_instance_type_for_cpus_mem_impl(_df, cpus, memory, region,
|
|
61
|
+
zone)
|
|
60
62
|
|
|
61
63
|
|
|
62
64
|
def get_accelerators_from_instance_type(
|
|
@@ -17,8 +17,8 @@ import typing
|
|
|
17
17
|
from typing import Dict, List, Optional, Tuple, Union
|
|
18
18
|
|
|
19
19
|
from sky.adaptors import oci as oci_adaptor
|
|
20
|
+
from sky.catalog import common
|
|
20
21
|
from sky.clouds import OCI
|
|
21
|
-
from sky.clouds.service_catalog import common
|
|
22
22
|
from sky.clouds.utils import oci_utils
|
|
23
23
|
from sky.utils import resources_utils
|
|
24
24
|
|
|
@@ -101,10 +101,12 @@ def get_hourly_cost(instance_type: str,
|
|
|
101
101
|
region, zone)
|
|
102
102
|
|
|
103
103
|
|
|
104
|
-
def get_default_instance_type(
|
|
105
|
-
|
|
106
|
-
|
|
107
|
-
|
|
104
|
+
def get_default_instance_type(cpus: Optional[str] = None,
|
|
105
|
+
memory: Optional[str] = None,
|
|
106
|
+
disk_tier: Optional[
|
|
107
|
+
resources_utils.DiskTier] = None,
|
|
108
|
+
region: Optional[str] = None,
|
|
109
|
+
zone: Optional[str] = None) -> Optional[str]:
|
|
108
110
|
if cpus is None:
|
|
109
111
|
cpus = f'{oci_utils.oci_config.DEFAULT_NUM_VCPUS}+'
|
|
110
112
|
|
|
@@ -127,7 +129,8 @@ def get_default_instance_type(
|
|
|
127
129
|
|
|
128
130
|
logger.debug(f'# get_default_instance_type: {df}')
|
|
129
131
|
return common.get_instance_type_for_cpus_mem_impl(df, cpus,
|
|
130
|
-
memory_gb_or_ratio
|
|
132
|
+
memory_gb_or_ratio,
|
|
133
|
+
region, zone)
|
|
131
134
|
|
|
132
135
|
|
|
133
136
|
def get_accelerators_from_instance_type(
|
|
@@ -7,7 +7,7 @@ query instance types and pricing information for Paperspace.
|
|
|
7
7
|
import typing
|
|
8
8
|
from typing import Dict, List, Optional, Tuple, Union
|
|
9
9
|
|
|
10
|
-
from sky.
|
|
10
|
+
from sky.catalog import common
|
|
11
11
|
from sky.utils import ux_utils
|
|
12
12
|
|
|
13
13
|
if typing.TYPE_CHECKING:
|
|
@@ -52,11 +52,14 @@ def get_default_instance_type(
|
|
|
52
52
|
cpus: Optional[str] = None,
|
|
53
53
|
memory: Optional[str] = None,
|
|
54
54
|
disk_tier: Optional[str] = None,
|
|
55
|
+
region: Optional[str] = None,
|
|
56
|
+
zone: Optional[str] = None,
|
|
55
57
|
) -> Optional[str]:
|
|
56
58
|
# NOTE: After expanding catalog to multiple entries, you may
|
|
57
59
|
# want to specify a default instance type or family.
|
|
58
60
|
del disk_tier # unused
|
|
59
|
-
return common.get_instance_type_for_cpus_mem_impl(_df, cpus, memory
|
|
61
|
+
return common.get_instance_type_for_cpus_mem_impl(_df, cpus, memory, region,
|
|
62
|
+
zone)
|
|
60
63
|
|
|
61
64
|
|
|
62
65
|
def get_accelerators_from_instance_type(
|
|
@@ -0,0 +1,95 @@
|
|
|
1
|
+
"""PrimeIntellect service catalog.
|
|
2
|
+
|
|
3
|
+
This module loads the service catalog file and can be used to
|
|
4
|
+
query instance types and pricing information for PrimeIntellect.
|
|
5
|
+
"""
|
|
6
|
+
|
|
7
|
+
import typing
|
|
8
|
+
from typing import Dict, List, Optional, Tuple, Union
|
|
9
|
+
|
|
10
|
+
from sky.catalog import common
|
|
11
|
+
|
|
12
|
+
if typing.TYPE_CHECKING:
|
|
13
|
+
from sky.clouds import cloud
|
|
14
|
+
|
|
15
|
+
_df = common.read_catalog('primeintellect/vms.csv')
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
def instance_type_exists(instance_type: str) -> bool:
|
|
19
|
+
return common.instance_type_exists_impl(_df, instance_type)
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
def validate_region_zone(
|
|
23
|
+
region: Optional[str],
|
|
24
|
+
zone: Optional[str]) -> Tuple[Optional[str], Optional[str]]:
|
|
25
|
+
return common.validate_region_zone_impl('primeintellect', _df, region, zone)
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
def get_hourly_cost(instance_type: str,
|
|
29
|
+
use_spot: bool = False,
|
|
30
|
+
region: Optional[str] = None,
|
|
31
|
+
zone: Optional[str] = None) -> float:
|
|
32
|
+
"""Returns the cost, or the cheapest cost among all zones for spot."""
|
|
33
|
+
return common.get_hourly_cost_impl(_df, instance_type, use_spot, region,
|
|
34
|
+
zone)
|
|
35
|
+
|
|
36
|
+
|
|
37
|
+
def get_vcpus_mem_from_instance_type(
|
|
38
|
+
instance_type: str) -> Tuple[Optional[float], Optional[float]]:
|
|
39
|
+
return common.get_vcpus_mem_from_instance_type_impl(_df, instance_type)
|
|
40
|
+
|
|
41
|
+
|
|
42
|
+
def get_default_instance_type(cpus: Optional[str] = None,
|
|
43
|
+
memory: Optional[str] = None,
|
|
44
|
+
disk_tier: Optional[str] = None,
|
|
45
|
+
region: Optional[str] = None,
|
|
46
|
+
zone: Optional[str] = None) -> Optional[str]:
|
|
47
|
+
del disk_tier # no disk tiers
|
|
48
|
+
return common.get_instance_type_for_cpus_mem_impl(_df, cpus, memory, region,
|
|
49
|
+
zone)
|
|
50
|
+
|
|
51
|
+
|
|
52
|
+
def get_accelerators_from_instance_type(
|
|
53
|
+
instance_type: str) -> Optional[Dict[str, Union[int, float]]]:
|
|
54
|
+
return common.get_accelerators_from_instance_type_impl(_df, instance_type)
|
|
55
|
+
|
|
56
|
+
|
|
57
|
+
def get_instance_type_for_accelerator(
|
|
58
|
+
acc_name: str,
|
|
59
|
+
acc_count: int,
|
|
60
|
+
cpus: Optional[str] = None,
|
|
61
|
+
memory: Optional[str] = None,
|
|
62
|
+
use_spot: bool = False,
|
|
63
|
+
region: Optional[str] = None,
|
|
64
|
+
zone: Optional[str] = None) -> Tuple[Optional[List[str]], List[str]]:
|
|
65
|
+
"""Returns a list of instance types that have the given accelerator."""
|
|
66
|
+
return common.get_instance_type_for_accelerator_impl(df=_df,
|
|
67
|
+
acc_name=acc_name,
|
|
68
|
+
acc_count=acc_count,
|
|
69
|
+
cpus=cpus,
|
|
70
|
+
memory=memory,
|
|
71
|
+
use_spot=use_spot,
|
|
72
|
+
region=region,
|
|
73
|
+
zone=zone)
|
|
74
|
+
|
|
75
|
+
|
|
76
|
+
def get_region_zones_for_instance_type(instance_type: str,
|
|
77
|
+
use_spot: bool) -> List['cloud.Region']:
|
|
78
|
+
df = _df[_df['InstanceType'] == instance_type]
|
|
79
|
+
return common.get_region_zones(df, use_spot)
|
|
80
|
+
|
|
81
|
+
|
|
82
|
+
def list_accelerators(
|
|
83
|
+
gpus_only: bool,
|
|
84
|
+
name_filter: Optional[str],
|
|
85
|
+
region_filter: Optional[str],
|
|
86
|
+
quantity_filter: Optional[int],
|
|
87
|
+
case_sensitive: bool = True,
|
|
88
|
+
all_regions: bool = False,
|
|
89
|
+
require_price: bool = True) -> Dict[str, List[common.InstanceTypeInfo]]:
|
|
90
|
+
"""Returns all instance types in Prime Intellect offering GPUs."""
|
|
91
|
+
del require_price
|
|
92
|
+
return common.list_accelerators_impl('PrimeIntellect', _df, gpus_only,
|
|
93
|
+
name_filter, region_filter,
|
|
94
|
+
quantity_filter, case_sensitive,
|
|
95
|
+
all_regions)
|
|
@@ -7,12 +7,16 @@ query instance types and pricing information for RunPod.
|
|
|
7
7
|
import typing
|
|
8
8
|
from typing import Dict, List, Optional, Tuple, Union
|
|
9
9
|
|
|
10
|
-
from sky.
|
|
10
|
+
from sky.catalog import common
|
|
11
11
|
|
|
12
12
|
if typing.TYPE_CHECKING:
|
|
13
13
|
from sky.clouds import cloud
|
|
14
14
|
|
|
15
|
-
|
|
15
|
+
# Runpod has no set updated schedule for their catalog. We pull the catalog
|
|
16
|
+
# every 7 hours to make sure we have the latest information.
|
|
17
|
+
_PULL_FREQUENCY_HOURS = 7
|
|
18
|
+
_df = common.read_catalog('runpod/vms.csv',
|
|
19
|
+
pull_frequency_hours=_PULL_FREQUENCY_HOURS)
|
|
16
20
|
|
|
17
21
|
|
|
18
22
|
def instance_type_exists(instance_type: str) -> bool:
|
|
@@ -41,11 +45,14 @@ def get_vcpus_mem_from_instance_type(
|
|
|
41
45
|
|
|
42
46
|
def get_default_instance_type(cpus: Optional[str] = None,
|
|
43
47
|
memory: Optional[str] = None,
|
|
44
|
-
disk_tier: Optional[str] = None
|
|
48
|
+
disk_tier: Optional[str] = None,
|
|
49
|
+
region: Optional[str] = None,
|
|
50
|
+
zone: Optional[str] = None) -> Optional[str]:
|
|
45
51
|
del disk_tier # RunPod does not support disk tiers.
|
|
46
52
|
# NOTE: After expanding catalog to multiple entries, you may
|
|
47
53
|
# want to specify a default instance type or family.
|
|
48
|
-
return common.get_instance_type_for_cpus_mem_impl(_df, cpus, memory
|
|
54
|
+
return common.get_instance_type_for_cpus_mem_impl(_df, cpus, memory, region,
|
|
55
|
+
zone)
|
|
49
56
|
|
|
50
57
|
|
|
51
58
|
def get_accelerators_from_instance_type(
|
|
@@ -7,7 +7,7 @@ instance types and pricing information for SCP.
|
|
|
7
7
|
import typing
|
|
8
8
|
from typing import Dict, List, Optional, Tuple, Union
|
|
9
9
|
|
|
10
|
-
from sky.
|
|
10
|
+
from sky.catalog import common
|
|
11
11
|
from sky.utils import resources_utils
|
|
12
12
|
from sky.utils import ux_utils
|
|
13
13
|
|
|
@@ -51,10 +51,12 @@ def get_vcpus_mem_from_instance_type(
|
|
|
51
51
|
return common.get_vcpus_mem_from_instance_type_impl(_df, instance_type)
|
|
52
52
|
|
|
53
53
|
|
|
54
|
-
def get_default_instance_type(
|
|
55
|
-
|
|
56
|
-
|
|
57
|
-
|
|
54
|
+
def get_default_instance_type(cpus: Optional[str] = None,
|
|
55
|
+
memory: Optional[str] = None,
|
|
56
|
+
disk_tier: Optional[
|
|
57
|
+
resources_utils.DiskTier] = None,
|
|
58
|
+
region: Optional[str] = None,
|
|
59
|
+
zone: Optional[str] = None) -> Optional[str]:
|
|
58
60
|
del disk_tier # unused
|
|
59
61
|
if cpus is None and memory is None:
|
|
60
62
|
cpus = str(_DEFAULT_NUM_VCPUS)
|
|
@@ -63,7 +65,8 @@ def get_default_instance_type(
|
|
|
63
65
|
else:
|
|
64
66
|
memory_gb_or_ratio = memory
|
|
65
67
|
return common.get_instance_type_for_cpus_mem_impl(_df, cpus,
|
|
66
|
-
memory_gb_or_ratio
|
|
68
|
+
memory_gb_or_ratio,
|
|
69
|
+
region, zone)
|
|
67
70
|
|
|
68
71
|
|
|
69
72
|
def get_accelerators_from_instance_type(
|