skypilot-nightly 1.0.0.dev20250509__py3-none-any.whl → 1.0.0.dev20251107__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of skypilot-nightly might be problematic. Click here for more details.
- sky/__init__.py +22 -6
- sky/adaptors/aws.py +25 -7
- sky/adaptors/common.py +24 -1
- sky/adaptors/coreweave.py +278 -0
- sky/adaptors/do.py +8 -2
- sky/adaptors/hyperbolic.py +8 -0
- sky/adaptors/kubernetes.py +149 -18
- sky/adaptors/nebius.py +170 -17
- sky/adaptors/primeintellect.py +1 -0
- sky/adaptors/runpod.py +68 -0
- sky/adaptors/seeweb.py +167 -0
- sky/adaptors/shadeform.py +89 -0
- sky/admin_policy.py +187 -4
- sky/authentication.py +179 -225
- sky/backends/__init__.py +4 -2
- sky/backends/backend.py +22 -9
- sky/backends/backend_utils.py +1299 -380
- sky/backends/cloud_vm_ray_backend.py +1715 -518
- sky/backends/docker_utils.py +1 -1
- sky/backends/local_docker_backend.py +11 -6
- sky/backends/wheel_utils.py +37 -9
- sky/{clouds/service_catalog → catalog}/__init__.py +21 -19
- sky/{clouds/service_catalog → catalog}/aws_catalog.py +27 -8
- sky/{clouds/service_catalog → catalog}/azure_catalog.py +10 -7
- sky/{clouds/service_catalog → catalog}/common.py +89 -48
- sky/{clouds/service_catalog → catalog}/cudo_catalog.py +8 -5
- sky/{clouds/service_catalog → catalog}/data_fetchers/analyze.py +1 -1
- sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_aws.py +30 -40
- sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_cudo.py +38 -38
- sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_gcp.py +42 -15
- sky/catalog/data_fetchers/fetch_hyperbolic.py +136 -0
- sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_lambda_cloud.py +1 -0
- sky/catalog/data_fetchers/fetch_nebius.py +335 -0
- sky/catalog/data_fetchers/fetch_runpod.py +698 -0
- sky/catalog/data_fetchers/fetch_seeweb.py +329 -0
- sky/catalog/data_fetchers/fetch_shadeform.py +142 -0
- sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_vast.py +1 -1
- sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_vsphere.py +1 -1
- sky/{clouds/service_catalog → catalog}/do_catalog.py +5 -2
- sky/{clouds/service_catalog → catalog}/fluidstack_catalog.py +6 -3
- sky/{clouds/service_catalog → catalog}/gcp_catalog.py +41 -15
- sky/catalog/hyperbolic_catalog.py +136 -0
- sky/{clouds/service_catalog → catalog}/ibm_catalog.py +9 -6
- sky/{clouds/service_catalog → catalog}/kubernetes_catalog.py +36 -24
- sky/{clouds/service_catalog → catalog}/lambda_catalog.py +9 -6
- sky/{clouds/service_catalog → catalog}/nebius_catalog.py +9 -7
- sky/{clouds/service_catalog → catalog}/oci_catalog.py +9 -6
- sky/{clouds/service_catalog → catalog}/paperspace_catalog.py +5 -2
- sky/catalog/primeintellect_catalog.py +95 -0
- sky/{clouds/service_catalog → catalog}/runpod_catalog.py +11 -4
- sky/{clouds/service_catalog → catalog}/scp_catalog.py +9 -6
- sky/catalog/seeweb_catalog.py +184 -0
- sky/catalog/shadeform_catalog.py +165 -0
- sky/catalog/ssh_catalog.py +167 -0
- sky/{clouds/service_catalog → catalog}/vast_catalog.py +6 -3
- sky/{clouds/service_catalog → catalog}/vsphere_catalog.py +5 -2
- sky/check.py +491 -203
- sky/cli.py +5 -6005
- sky/client/{cli.py → cli/command.py} +2477 -1885
- sky/client/cli/deprecation_utils.py +99 -0
- sky/client/cli/flags.py +359 -0
- sky/client/cli/table_utils.py +320 -0
- sky/client/common.py +70 -32
- sky/client/oauth.py +82 -0
- sky/client/sdk.py +1203 -297
- sky/client/sdk_async.py +833 -0
- sky/client/service_account_auth.py +47 -0
- sky/cloud_stores.py +73 -0
- sky/clouds/__init__.py +13 -0
- sky/clouds/aws.py +358 -93
- sky/clouds/azure.py +105 -83
- sky/clouds/cloud.py +127 -36
- sky/clouds/cudo.py +68 -50
- sky/clouds/do.py +66 -48
- sky/clouds/fluidstack.py +63 -44
- sky/clouds/gcp.py +339 -110
- sky/clouds/hyperbolic.py +293 -0
- sky/clouds/ibm.py +70 -49
- sky/clouds/kubernetes.py +563 -162
- sky/clouds/lambda_cloud.py +74 -54
- sky/clouds/nebius.py +206 -80
- sky/clouds/oci.py +88 -66
- sky/clouds/paperspace.py +61 -44
- sky/clouds/primeintellect.py +317 -0
- sky/clouds/runpod.py +164 -74
- sky/clouds/scp.py +89 -83
- sky/clouds/seeweb.py +466 -0
- sky/clouds/shadeform.py +400 -0
- sky/clouds/ssh.py +263 -0
- sky/clouds/utils/aws_utils.py +10 -4
- sky/clouds/utils/gcp_utils.py +87 -11
- sky/clouds/utils/oci_utils.py +38 -14
- sky/clouds/utils/scp_utils.py +177 -124
- sky/clouds/vast.py +99 -77
- sky/clouds/vsphere.py +51 -40
- sky/core.py +349 -139
- sky/dag.py +15 -0
- sky/dashboard/out/404.html +1 -1
- sky/dashboard/out/_next/static/chunks/1141-e6aa9ab418717c59.js +11 -0
- sky/dashboard/out/_next/static/chunks/1272-1ef0bf0237faccdb.js +1 -0
- sky/dashboard/out/_next/static/chunks/1871-74503c8e80fd253b.js +6 -0
- sky/dashboard/out/_next/static/chunks/2260-7703229c33c5ebd5.js +1 -0
- sky/dashboard/out/_next/static/chunks/2350.fab69e61bac57b23.js +1 -0
- sky/dashboard/out/_next/static/chunks/2369.fc20f0c2c8ed9fe7.js +15 -0
- sky/dashboard/out/_next/static/chunks/2755.fff53c4a3fcae910.js +26 -0
- sky/dashboard/out/_next/static/chunks/3294.72362fa129305b19.js +1 -0
- sky/dashboard/out/_next/static/chunks/3785.ad6adaa2a0fa9768.js +1 -0
- sky/dashboard/out/_next/static/chunks/3850-ff4a9a69d978632b.js +1 -0
- sky/dashboard/out/_next/static/chunks/3937.210053269f121201.js +1 -0
- sky/dashboard/out/_next/static/chunks/4725.a830b5c9e7867c92.js +1 -0
- sky/dashboard/out/_next/static/chunks/4937.a2baa2df5572a276.js +15 -0
- sky/dashboard/out/_next/static/chunks/5739-d67458fcb1386c92.js +8 -0
- sky/dashboard/out/_next/static/chunks/6130-2be46d70a38f1e82.js +1 -0
- sky/dashboard/out/_next/static/chunks/616-3d59f75e2ccf9321.js +39 -0
- sky/dashboard/out/_next/static/chunks/6212-7bd06f60ba693125.js +13 -0
- sky/dashboard/out/_next/static/chunks/6601-06114c982db410b6.js +1 -0
- sky/dashboard/out/_next/static/chunks/6856-ef8ba11f96d8c4a3.js +1 -0
- sky/dashboard/out/_next/static/chunks/6989-01359c57e018caa4.js +1 -0
- sky/dashboard/out/_next/static/chunks/6990-32b6e2d3822301fa.js +1 -0
- sky/dashboard/out/_next/static/chunks/7359-c8d04e06886000b3.js +30 -0
- sky/dashboard/out/_next/static/chunks/7411-b15471acd2cba716.js +41 -0
- sky/dashboard/out/_next/static/chunks/7615-3301e838e5f25772.js +1 -0
- sky/dashboard/out/_next/static/chunks/8640.5b9475a2d18c5416.js +16 -0
- sky/dashboard/out/_next/static/chunks/8969-1e4613c651bf4051.js +1 -0
- sky/dashboard/out/_next/static/chunks/9025.fa408f3242e9028d.js +6 -0
- sky/dashboard/out/_next/static/chunks/9353-cff34f7e773b2e2b.js +1 -0
- sky/dashboard/out/_next/static/chunks/9360.7310982cf5a0dc79.js +31 -0
- sky/dashboard/out/_next/static/chunks/9847.3aaca6bb33455140.js +30 -0
- sky/dashboard/out/_next/static/chunks/fd9d1056-86323a29a8f7e46a.js +1 -0
- sky/dashboard/out/_next/static/chunks/framework-cf60a09ccd051a10.js +33 -0
- sky/dashboard/out/_next/static/chunks/main-app-587214043926b3cc.js +1 -0
- sky/dashboard/out/_next/static/chunks/main-f15ccb73239a3bf1.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/_app-bde01e4a2beec258.js +34 -0
- sky/dashboard/out/_next/static/chunks/pages/_error-c66a4e8afc46f17b.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-c736ead69c2d86ec.js +16 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-a37d2063af475a1c.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters-d44859594e6f8064.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/config-dfb9bf07b13045f4.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/index-444f1804401f04ea.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/infra/[context]-c0b5935149902e6f.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/infra-aed0ea19df7cf961.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-5796e8d6aea291a0.js +16 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs/pools/[pool]-6edeb7d06032adfc.js +21 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs-479dde13399cf270.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/users-5ab3b907622cf0fe.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/volumes-b84b948ff357c43e.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/workspace/new-3f88a1c7e86a3f86.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/workspaces/[name]-c5a3eeee1c218af1.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/workspaces-22b23febb3e89ce1.js +1 -0
- sky/dashboard/out/_next/static/chunks/webpack-2679be77fc08a2f8.js +1 -0
- sky/dashboard/out/_next/static/css/0748ce22df867032.css +3 -0
- sky/dashboard/out/_next/static/zB0ed6ge_W1MDszVHhijS/_buildManifest.js +1 -0
- sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
- sky/dashboard/out/clusters/[cluster].html +1 -1
- sky/dashboard/out/clusters.html +1 -1
- sky/dashboard/out/config.html +1 -0
- sky/dashboard/out/index.html +1 -1
- sky/dashboard/out/infra/[context].html +1 -0
- sky/dashboard/out/infra.html +1 -0
- sky/dashboard/out/jobs/[job].html +1 -1
- sky/dashboard/out/jobs/pools/[pool].html +1 -0
- sky/dashboard/out/jobs.html +1 -1
- sky/dashboard/out/users.html +1 -0
- sky/dashboard/out/volumes.html +1 -0
- sky/dashboard/out/workspace/new.html +1 -0
- sky/dashboard/out/workspaces/[name].html +1 -0
- sky/dashboard/out/workspaces.html +1 -0
- sky/data/data_utils.py +137 -1
- sky/data/mounting_utils.py +269 -84
- sky/data/storage.py +1451 -1807
- sky/data/storage_utils.py +43 -57
- sky/exceptions.py +132 -2
- sky/execution.py +206 -63
- sky/global_user_state.py +2374 -586
- sky/jobs/__init__.py +5 -0
- sky/jobs/client/sdk.py +242 -65
- sky/jobs/client/sdk_async.py +143 -0
- sky/jobs/constants.py +9 -8
- sky/jobs/controller.py +839 -277
- sky/jobs/file_content_utils.py +80 -0
- sky/jobs/log_gc.py +201 -0
- sky/jobs/recovery_strategy.py +398 -152
- sky/jobs/scheduler.py +315 -189
- sky/jobs/server/core.py +829 -255
- sky/jobs/server/server.py +156 -115
- sky/jobs/server/utils.py +136 -0
- sky/jobs/state.py +2092 -701
- sky/jobs/utils.py +1242 -160
- sky/logs/__init__.py +21 -0
- sky/logs/agent.py +108 -0
- sky/logs/aws.py +243 -0
- sky/logs/gcp.py +91 -0
- sky/metrics/__init__.py +0 -0
- sky/metrics/utils.py +443 -0
- sky/models.py +78 -1
- sky/optimizer.py +164 -70
- sky/provision/__init__.py +90 -4
- sky/provision/aws/config.py +147 -26
- sky/provision/aws/instance.py +135 -50
- sky/provision/azure/instance.py +10 -5
- sky/provision/common.py +13 -1
- sky/provision/cudo/cudo_machine_type.py +1 -1
- sky/provision/cudo/cudo_utils.py +14 -8
- sky/provision/cudo/cudo_wrapper.py +72 -71
- sky/provision/cudo/instance.py +10 -6
- sky/provision/do/instance.py +10 -6
- sky/provision/do/utils.py +4 -3
- sky/provision/docker_utils.py +114 -23
- sky/provision/fluidstack/instance.py +13 -8
- sky/provision/gcp/__init__.py +1 -0
- sky/provision/gcp/config.py +301 -19
- sky/provision/gcp/constants.py +218 -0
- sky/provision/gcp/instance.py +36 -8
- sky/provision/gcp/instance_utils.py +18 -4
- sky/provision/gcp/volume_utils.py +247 -0
- sky/provision/hyperbolic/__init__.py +12 -0
- sky/provision/hyperbolic/config.py +10 -0
- sky/provision/hyperbolic/instance.py +437 -0
- sky/provision/hyperbolic/utils.py +373 -0
- sky/provision/instance_setup.py +93 -14
- sky/provision/kubernetes/__init__.py +5 -0
- sky/provision/kubernetes/config.py +9 -52
- sky/provision/kubernetes/constants.py +17 -0
- sky/provision/kubernetes/instance.py +789 -247
- sky/provision/kubernetes/manifests/fusermount-server-daemonset.yaml +1 -2
- sky/provision/kubernetes/network.py +27 -17
- sky/provision/kubernetes/network_utils.py +40 -43
- sky/provision/kubernetes/utils.py +1192 -531
- sky/provision/kubernetes/volume.py +282 -0
- sky/provision/lambda_cloud/instance.py +22 -16
- sky/provision/nebius/constants.py +50 -0
- sky/provision/nebius/instance.py +19 -6
- sky/provision/nebius/utils.py +196 -91
- sky/provision/oci/instance.py +10 -5
- sky/provision/paperspace/instance.py +10 -7
- sky/provision/paperspace/utils.py +1 -1
- sky/provision/primeintellect/__init__.py +10 -0
- sky/provision/primeintellect/config.py +11 -0
- sky/provision/primeintellect/instance.py +454 -0
- sky/provision/primeintellect/utils.py +398 -0
- sky/provision/provisioner.py +110 -36
- sky/provision/runpod/__init__.py +5 -0
- sky/provision/runpod/instance.py +27 -6
- sky/provision/runpod/utils.py +51 -18
- sky/provision/runpod/volume.py +180 -0
- sky/provision/scp/__init__.py +15 -0
- sky/provision/scp/config.py +93 -0
- sky/provision/scp/instance.py +531 -0
- sky/provision/seeweb/__init__.py +11 -0
- sky/provision/seeweb/config.py +13 -0
- sky/provision/seeweb/instance.py +807 -0
- sky/provision/shadeform/__init__.py +11 -0
- sky/provision/shadeform/config.py +12 -0
- sky/provision/shadeform/instance.py +351 -0
- sky/provision/shadeform/shadeform_utils.py +83 -0
- sky/provision/ssh/__init__.py +18 -0
- sky/provision/vast/instance.py +13 -8
- sky/provision/vast/utils.py +10 -7
- sky/provision/vsphere/common/vim_utils.py +1 -2
- sky/provision/vsphere/instance.py +15 -10
- sky/provision/vsphere/vsphere_utils.py +9 -19
- sky/py.typed +0 -0
- sky/resources.py +844 -118
- sky/schemas/__init__.py +0 -0
- sky/schemas/api/__init__.py +0 -0
- sky/schemas/api/responses.py +225 -0
- sky/schemas/db/README +4 -0
- sky/schemas/db/env.py +90 -0
- sky/schemas/db/global_user_state/001_initial_schema.py +124 -0
- sky/schemas/db/global_user_state/002_add_workspace_to_cluster_history.py +35 -0
- sky/schemas/db/global_user_state/003_fix_initial_revision.py +61 -0
- sky/schemas/db/global_user_state/004_is_managed.py +34 -0
- sky/schemas/db/global_user_state/005_cluster_event.py +32 -0
- sky/schemas/db/global_user_state/006_provision_log.py +41 -0
- sky/schemas/db/global_user_state/007_cluster_event_request_id.py +34 -0
- sky/schemas/db/global_user_state/008_skylet_ssh_tunnel_metadata.py +34 -0
- sky/schemas/db/global_user_state/009_last_activity_and_launched_at.py +89 -0
- sky/schemas/db/global_user_state/010_save_ssh_key.py +66 -0
- sky/schemas/db/script.py.mako +28 -0
- sky/schemas/db/serve_state/001_initial_schema.py +67 -0
- sky/schemas/db/skypilot_config/001_initial_schema.py +30 -0
- sky/schemas/db/spot_jobs/001_initial_schema.py +97 -0
- sky/schemas/db/spot_jobs/002_cluster_pool.py +42 -0
- sky/schemas/db/spot_jobs/003_pool_hash.py +34 -0
- sky/schemas/db/spot_jobs/004_job_file_contents.py +42 -0
- sky/schemas/db/spot_jobs/005_logs_gc.py +38 -0
- sky/schemas/generated/__init__.py +0 -0
- sky/schemas/generated/autostopv1_pb2.py +36 -0
- sky/schemas/generated/autostopv1_pb2.pyi +43 -0
- sky/schemas/generated/autostopv1_pb2_grpc.py +146 -0
- sky/schemas/generated/jobsv1_pb2.py +86 -0
- sky/schemas/generated/jobsv1_pb2.pyi +254 -0
- sky/schemas/generated/jobsv1_pb2_grpc.py +542 -0
- sky/schemas/generated/managed_jobsv1_pb2.py +74 -0
- sky/schemas/generated/managed_jobsv1_pb2.pyi +278 -0
- sky/schemas/generated/managed_jobsv1_pb2_grpc.py +278 -0
- sky/schemas/generated/servev1_pb2.py +58 -0
- sky/schemas/generated/servev1_pb2.pyi +115 -0
- sky/schemas/generated/servev1_pb2_grpc.py +322 -0
- sky/serve/autoscalers.py +357 -5
- sky/serve/client/impl.py +310 -0
- sky/serve/client/sdk.py +47 -139
- sky/serve/client/sdk_async.py +130 -0
- sky/serve/constants.py +10 -8
- sky/serve/controller.py +64 -19
- sky/serve/load_balancer.py +106 -60
- sky/serve/load_balancing_policies.py +115 -1
- sky/serve/replica_managers.py +273 -162
- sky/serve/serve_rpc_utils.py +179 -0
- sky/serve/serve_state.py +554 -251
- sky/serve/serve_utils.py +733 -220
- sky/serve/server/core.py +66 -711
- sky/serve/server/impl.py +1093 -0
- sky/serve/server/server.py +21 -18
- sky/serve/service.py +133 -48
- sky/serve/service_spec.py +135 -16
- sky/serve/spot_placer.py +3 -0
- sky/server/auth/__init__.py +0 -0
- sky/server/auth/authn.py +50 -0
- sky/server/auth/loopback.py +38 -0
- sky/server/auth/oauth2_proxy.py +200 -0
- sky/server/common.py +475 -181
- sky/server/config.py +81 -23
- sky/server/constants.py +44 -6
- sky/server/daemons.py +229 -0
- sky/server/html/token_page.html +185 -0
- sky/server/metrics.py +160 -0
- sky/server/requests/executor.py +528 -138
- sky/server/requests/payloads.py +351 -17
- sky/server/requests/preconditions.py +21 -17
- sky/server/requests/process.py +112 -29
- sky/server/requests/request_names.py +120 -0
- sky/server/requests/requests.py +817 -224
- sky/server/requests/serializers/decoders.py +82 -31
- sky/server/requests/serializers/encoders.py +140 -22
- sky/server/requests/threads.py +106 -0
- sky/server/rest.py +417 -0
- sky/server/server.py +1290 -284
- sky/server/state.py +20 -0
- sky/server/stream_utils.py +345 -57
- sky/server/uvicorn.py +217 -3
- sky/server/versions.py +270 -0
- sky/setup_files/MANIFEST.in +5 -0
- sky/setup_files/alembic.ini +156 -0
- sky/setup_files/dependencies.py +136 -31
- sky/setup_files/setup.py +44 -42
- sky/sky_logging.py +102 -5
- sky/skylet/attempt_skylet.py +1 -0
- sky/skylet/autostop_lib.py +129 -8
- sky/skylet/configs.py +27 -20
- sky/skylet/constants.py +171 -19
- sky/skylet/events.py +105 -21
- sky/skylet/job_lib.py +335 -104
- sky/skylet/log_lib.py +297 -18
- sky/skylet/log_lib.pyi +44 -1
- sky/skylet/ray_patches/__init__.py +17 -3
- sky/skylet/ray_patches/autoscaler.py.diff +18 -0
- sky/skylet/ray_patches/cli.py.diff +19 -0
- sky/skylet/ray_patches/command_runner.py.diff +17 -0
- sky/skylet/ray_patches/log_monitor.py.diff +20 -0
- sky/skylet/ray_patches/resource_demand_scheduler.py.diff +32 -0
- sky/skylet/ray_patches/updater.py.diff +18 -0
- sky/skylet/ray_patches/worker.py.diff +41 -0
- sky/skylet/services.py +564 -0
- sky/skylet/skylet.py +63 -4
- sky/skylet/subprocess_daemon.py +103 -29
- sky/skypilot_config.py +506 -99
- sky/ssh_node_pools/__init__.py +1 -0
- sky/ssh_node_pools/core.py +135 -0
- sky/ssh_node_pools/server.py +233 -0
- sky/task.py +621 -137
- sky/templates/aws-ray.yml.j2 +10 -3
- sky/templates/azure-ray.yml.j2 +1 -1
- sky/templates/do-ray.yml.j2 +1 -1
- sky/templates/gcp-ray.yml.j2 +57 -0
- sky/templates/hyperbolic-ray.yml.j2 +67 -0
- sky/templates/jobs-controller.yaml.j2 +27 -24
- sky/templates/kubernetes-loadbalancer.yml.j2 +2 -0
- sky/templates/kubernetes-ray.yml.j2 +607 -51
- sky/templates/lambda-ray.yml.j2 +1 -1
- sky/templates/nebius-ray.yml.j2 +33 -12
- sky/templates/paperspace-ray.yml.j2 +1 -1
- sky/templates/primeintellect-ray.yml.j2 +71 -0
- sky/templates/runpod-ray.yml.j2 +9 -1
- sky/templates/scp-ray.yml.j2 +3 -50
- sky/templates/seeweb-ray.yml.j2 +108 -0
- sky/templates/shadeform-ray.yml.j2 +72 -0
- sky/templates/sky-serve-controller.yaml.j2 +22 -2
- sky/templates/websocket_proxy.py +178 -18
- sky/usage/usage_lib.py +18 -11
- sky/users/__init__.py +0 -0
- sky/users/model.conf +15 -0
- sky/users/permission.py +387 -0
- sky/users/rbac.py +121 -0
- sky/users/server.py +720 -0
- sky/users/token_service.py +218 -0
- sky/utils/accelerator_registry.py +34 -5
- sky/utils/admin_policy_utils.py +84 -38
- sky/utils/annotations.py +16 -5
- sky/utils/asyncio_utils.py +78 -0
- sky/utils/auth_utils.py +153 -0
- sky/utils/benchmark_utils.py +60 -0
- sky/utils/cli_utils/status_utils.py +159 -86
- sky/utils/cluster_utils.py +31 -9
- sky/utils/command_runner.py +354 -68
- sky/utils/command_runner.pyi +93 -3
- sky/utils/common.py +35 -8
- sky/utils/common_utils.py +310 -87
- sky/utils/config_utils.py +87 -5
- sky/utils/context.py +402 -0
- sky/utils/context_utils.py +222 -0
- sky/utils/controller_utils.py +264 -89
- sky/utils/dag_utils.py +31 -12
- sky/utils/db/__init__.py +0 -0
- sky/utils/db/db_utils.py +470 -0
- sky/utils/db/migration_utils.py +133 -0
- sky/utils/directory_utils.py +12 -0
- sky/utils/env_options.py +13 -0
- sky/utils/git.py +567 -0
- sky/utils/git_clone.sh +460 -0
- sky/utils/infra_utils.py +195 -0
- sky/utils/kubernetes/cleanup-tunnel.sh +62 -0
- sky/utils/kubernetes/config_map_utils.py +133 -0
- sky/utils/kubernetes/create_cluster.sh +13 -27
- sky/utils/kubernetes/delete_cluster.sh +10 -7
- sky/utils/kubernetes/deploy_remote_cluster.py +1299 -0
- sky/utils/kubernetes/exec_kubeconfig_converter.py +22 -31
- sky/utils/kubernetes/generate_kind_config.py +6 -66
- sky/utils/kubernetes/generate_kubeconfig.sh +4 -1
- sky/utils/kubernetes/gpu_labeler.py +5 -5
- sky/utils/kubernetes/kubernetes_deploy_utils.py +354 -47
- sky/utils/kubernetes/ssh-tunnel.sh +379 -0
- sky/utils/kubernetes/ssh_utils.py +221 -0
- sky/utils/kubernetes_enums.py +8 -15
- sky/utils/lock_events.py +94 -0
- sky/utils/locks.py +368 -0
- sky/utils/log_utils.py +300 -6
- sky/utils/perf_utils.py +22 -0
- sky/utils/resource_checker.py +298 -0
- sky/utils/resources_utils.py +249 -32
- sky/utils/rich_utils.py +213 -37
- sky/utils/schemas.py +905 -147
- sky/utils/serialize_utils.py +16 -0
- sky/utils/status_lib.py +10 -0
- sky/utils/subprocess_utils.py +38 -15
- sky/utils/tempstore.py +70 -0
- sky/utils/timeline.py +24 -52
- sky/utils/ux_utils.py +84 -15
- sky/utils/validator.py +11 -1
- sky/utils/volume.py +86 -0
- sky/utils/yaml_utils.py +111 -0
- sky/volumes/__init__.py +13 -0
- sky/volumes/client/__init__.py +0 -0
- sky/volumes/client/sdk.py +149 -0
- sky/volumes/server/__init__.py +0 -0
- sky/volumes/server/core.py +258 -0
- sky/volumes/server/server.py +122 -0
- sky/volumes/volume.py +212 -0
- sky/workspaces/__init__.py +0 -0
- sky/workspaces/core.py +655 -0
- sky/workspaces/server.py +101 -0
- sky/workspaces/utils.py +56 -0
- skypilot_nightly-1.0.0.dev20251107.dist-info/METADATA +675 -0
- skypilot_nightly-1.0.0.dev20251107.dist-info/RECORD +594 -0
- {skypilot_nightly-1.0.0.dev20250509.dist-info → skypilot_nightly-1.0.0.dev20251107.dist-info}/WHEEL +1 -1
- sky/benchmark/benchmark_state.py +0 -256
- sky/benchmark/benchmark_utils.py +0 -641
- sky/clouds/service_catalog/constants.py +0 -7
- sky/dashboard/out/_next/static/LksQgChY5izXjokL3LcEu/_buildManifest.js +0 -1
- sky/dashboard/out/_next/static/chunks/236-f49500b82ad5392d.js +0 -6
- sky/dashboard/out/_next/static/chunks/312-c3c8845990db8ffc.js +0 -15
- sky/dashboard/out/_next/static/chunks/37-0a572fe0dbb89c4d.js +0 -6
- sky/dashboard/out/_next/static/chunks/678-206dddca808e6d16.js +0 -59
- sky/dashboard/out/_next/static/chunks/845-0f8017370869e269.js +0 -1
- sky/dashboard/out/_next/static/chunks/979-7bf73a4c7cea0f5c.js +0 -1
- sky/dashboard/out/_next/static/chunks/fd9d1056-2821b0f0cabcd8bd.js +0 -1
- sky/dashboard/out/_next/static/chunks/framework-87d061ee6ed71b28.js +0 -33
- sky/dashboard/out/_next/static/chunks/main-app-241eb28595532291.js +0 -1
- sky/dashboard/out/_next/static/chunks/main-e0e2335212e72357.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/_app-e6b013bc3f77ad60.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/_error-1be831200e60c5c0.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-e15db85d0ea1fbe1.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-f383db7389368ea7.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/clusters-a93b93e10b8b074e.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/index-f9f039532ca8cbc4.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-03f279c6741fb48b.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/jobs-a75029b67aab6a2e.js +0 -1
- sky/dashboard/out/_next/static/chunks/webpack-830f59b8404e96b8.js +0 -1
- sky/dashboard/out/_next/static/css/c6933bbb2ce7f4dd.css +0 -3
- sky/jobs/dashboard/dashboard.py +0 -223
- sky/jobs/dashboard/static/favicon.ico +0 -0
- sky/jobs/dashboard/templates/index.html +0 -831
- sky/jobs/server/dashboard_utils.py +0 -69
- sky/skylet/providers/scp/__init__.py +0 -2
- sky/skylet/providers/scp/config.py +0 -149
- sky/skylet/providers/scp/node_provider.py +0 -578
- sky/templates/kubernetes-ssh-jump.yml.j2 +0 -94
- sky/utils/db_utils.py +0 -100
- sky/utils/kubernetes/deploy_remote_cluster.sh +0 -308
- sky/utils/kubernetes/ssh_jump_lifecycle_manager.py +0 -191
- skypilot_nightly-1.0.0.dev20250509.dist-info/METADATA +0 -361
- skypilot_nightly-1.0.0.dev20250509.dist-info/RECORD +0 -396
- /sky/{clouds/service_catalog → catalog}/config.py +0 -0
- /sky/{benchmark → catalog/data_fetchers}/__init__.py +0 -0
- /sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_azure.py +0 -0
- /sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_fluidstack.py +0 -0
- /sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_ibm.py +0 -0
- /sky/{clouds/service_catalog/data_fetchers → client/cli}/__init__.py +0 -0
- /sky/dashboard/out/_next/static/{LksQgChY5izXjokL3LcEu → zB0ed6ge_W1MDszVHhijS}/_ssgManifest.js +0 -0
- {skypilot_nightly-1.0.0.dev20250509.dist-info → skypilot_nightly-1.0.0.dev20251107.dist-info}/entry_points.txt +0 -0
- {skypilot_nightly-1.0.0.dev20250509.dist-info → skypilot_nightly-1.0.0.dev20251107.dist-info}/licenses/LICENSE +0 -0
- {skypilot_nightly-1.0.0.dev20250509.dist-info → skypilot_nightly-1.0.0.dev20251107.dist-info}/top_level.txt +0 -0
|
@@ -46,7 +46,7 @@ ALL_REGIONS = [
|
|
|
46
46
|
'eu-west-1',
|
|
47
47
|
'eu-west-2',
|
|
48
48
|
'eu-south-1',
|
|
49
|
-
|
|
49
|
+
'eu-south-2',
|
|
50
50
|
'eu-west-3',
|
|
51
51
|
'eu-north-1',
|
|
52
52
|
'me-south-1',
|
|
@@ -67,17 +67,17 @@ US_REGIONS = ['us-east-1', 'us-east-2', 'us-west-1', 'us-west-2']
|
|
|
67
67
|
# The following columns will be included in the final catalog.
|
|
68
68
|
USEFUL_COLUMNS = [
|
|
69
69
|
'InstanceType', 'AcceleratorName', 'AcceleratorCount', 'vCPUs', 'MemoryGiB',
|
|
70
|
-
'GpuInfo', 'Price', 'SpotPrice', 'Region', 'AvailabilityZone'
|
|
70
|
+
'GpuInfo', 'Price', 'SpotPrice', 'Region', 'AvailabilityZone', 'Arch'
|
|
71
71
|
]
|
|
72
72
|
|
|
73
73
|
# NOTE: the hard-coded us-east-1 URL is not a typo. AWS pricing endpoint is
|
|
74
74
|
# only available in this region, but it serves pricing information for all
|
|
75
75
|
# regions.
|
|
76
76
|
PRICING_TABLE_URL_FMT = 'https://pricing.us-east-1.amazonaws.com/offers/v1.0/aws/AmazonEC2/current/{region}/index.csv' # pylint: disable=line-too-long
|
|
77
|
-
#
|
|
78
|
-
# the
|
|
79
|
-
#
|
|
80
|
-
|
|
77
|
+
# g6f instances have fractional GPUs, but the API returns Count: 1 under
|
|
78
|
+
# GpuInfo. However, the GPU memory is properly scaled. Taking the instance GPU
|
|
79
|
+
# divided by the total memory of an L4 will give us the fraction of the GPU.
|
|
80
|
+
L4_GPU_MEMORY = 22888
|
|
81
81
|
|
|
82
82
|
regions_enabled: Optional[Set[str]] = None
|
|
83
83
|
|
|
@@ -210,35 +210,6 @@ def _get_spot_pricing_table(region: str) -> 'pd.DataFrame':
|
|
|
210
210
|
return df
|
|
211
211
|
|
|
212
212
|
|
|
213
|
-
def _patch_p4de(region: str, df: 'pd.DataFrame',
|
|
214
|
-
pricing_df: 'pd.DataFrame') -> 'pd.DataFrame':
|
|
215
|
-
# Hardcoded patch for p4de.24xlarge, as our credentials doesn't have access
|
|
216
|
-
# to the instance type.
|
|
217
|
-
# Columns:
|
|
218
|
-
# InstanceType,AcceleratorName,AcceleratorCount,vCPUs,MemoryGiB,GpuInfo,
|
|
219
|
-
# Price,SpotPrice,Region,AvailabilityZone
|
|
220
|
-
records = []
|
|
221
|
-
for zone in df[df['Region'] == region]['AvailabilityZone'].unique():
|
|
222
|
-
records.append({
|
|
223
|
-
'InstanceType': 'p4de.24xlarge',
|
|
224
|
-
'AcceleratorName': 'A100-80GB',
|
|
225
|
-
'AcceleratorCount': 8,
|
|
226
|
-
'vCPUs': 96,
|
|
227
|
-
'MemoryGiB': 1152,
|
|
228
|
-
'GpuInfo':
|
|
229
|
-
('{\'Gpus\': [{\'Name\': \'A100-80GB\', \'Manufacturer\': '
|
|
230
|
-
'\'NVIDIA\', \'Count\': 8, \'MemoryInfo\': {\'SizeInMiB\': '
|
|
231
|
-
'81920}}], \'TotalGpuMemoryInMiB\': 655360}'),
|
|
232
|
-
'AvailabilityZone': zone,
|
|
233
|
-
'Region': region,
|
|
234
|
-
'Price': pricing_df[pricing_df['InstanceType'] == 'p4de.24xlarge']
|
|
235
|
-
['Price'].values[0],
|
|
236
|
-
'SpotPrice': np.nan,
|
|
237
|
-
})
|
|
238
|
-
df = pd.concat([df, pd.DataFrame.from_records(records)])
|
|
239
|
-
return df
|
|
240
|
-
|
|
241
|
-
|
|
242
213
|
def _get_instance_types_df(region: str) -> Union[str, 'pd.DataFrame']:
|
|
243
214
|
try:
|
|
244
215
|
# Fetch the zone info first to make sure the account has access to the
|
|
@@ -271,13 +242,24 @@ def _get_instance_types_df(region: str) -> Union[str, 'pd.DataFrame']:
|
|
|
271
242
|
return None, np.nan
|
|
272
243
|
return accelerator['Name'], accelerator['Count']
|
|
273
244
|
|
|
245
|
+
def get_arch(row) -> Optional[str]:
|
|
246
|
+
if 'ProcessorInfo' in row:
|
|
247
|
+
processor = row['ProcessorInfo']
|
|
248
|
+
if 'SupportedArchitectures' in processor:
|
|
249
|
+
archs = processor['SupportedArchitectures']
|
|
250
|
+
if isinstance(archs, list):
|
|
251
|
+
return archs[0]
|
|
252
|
+
elif isinstance(archs, str):
|
|
253
|
+
return archs
|
|
254
|
+
return None
|
|
255
|
+
|
|
274
256
|
def get_vcpus(row) -> float:
|
|
275
257
|
if not np.isnan(row['vCPU']):
|
|
276
258
|
return float(row['vCPU'])
|
|
277
259
|
try:
|
|
278
260
|
return float(row['VCpuInfo']['DefaultVCpus'])
|
|
279
261
|
except Exception as e: # pylint: disable=broad-except
|
|
280
|
-
print('Error
|
|
262
|
+
print('Error occurred for row:', row)
|
|
281
263
|
print('Error:', e)
|
|
282
264
|
raise
|
|
283
265
|
|
|
@@ -313,11 +295,22 @@ def _get_instance_types_df(region: str) -> Union[str, 'pd.DataFrame']:
|
|
|
313
295
|
# AWS API is 'NVIDIA', which is incorrect. See #4652.
|
|
314
296
|
acc_name = 'H200'
|
|
315
297
|
acc_count = 8
|
|
298
|
+
if (row['InstanceType'].startswith('g6f') or
|
|
299
|
+
row['InstanceType'].startswith('gr6f')):
|
|
300
|
+
# These instance actually have only fractional GPUs, but the API
|
|
301
|
+
# returns Count: 1 under GpuInfo. We need to check the GPU
|
|
302
|
+
# memory to get the actual fraction of the GPU.
|
|
303
|
+
# See also Standard_NV{vcpu}ads_A10_v5 support on Azure.
|
|
304
|
+
fraction = row['GpuInfo']['TotalGpuMemoryInMiB'] / L4_GPU_MEMORY
|
|
305
|
+
acc_count = round(fraction, 3)
|
|
306
|
+
if row['InstanceType'] == 'p5.4xlarge':
|
|
307
|
+
acc_count = 1
|
|
316
308
|
return pd.Series({
|
|
317
309
|
'AcceleratorName': acc_name,
|
|
318
310
|
'AcceleratorCount': acc_count,
|
|
319
311
|
'vCPUs': get_vcpus(row),
|
|
320
312
|
'MemoryGiB': get_memory_gib(row),
|
|
313
|
+
'Arch': get_arch(row),
|
|
321
314
|
})
|
|
322
315
|
|
|
323
316
|
# The AWS API may not have all the instance types in the pricing table,
|
|
@@ -341,9 +334,6 @@ def _get_instance_types_df(region: str) -> Union[str, 'pd.DataFrame']:
|
|
|
341
334
|
df = pd.concat(
|
|
342
335
|
[df, df.apply(get_additional_columns, axis='columns')],
|
|
343
336
|
axis='columns')
|
|
344
|
-
# patch the df for p4de.24xlarge
|
|
345
|
-
if region in P4DE_REGIONS:
|
|
346
|
-
df = _patch_p4de(region, df, pricing_df)
|
|
347
337
|
if 'GpuInfo' not in df.columns:
|
|
348
338
|
df['GpuInfo'] = np.nan
|
|
349
339
|
df = df[USEFUL_COLUMNS]
|
|
@@ -546,7 +536,7 @@ if __name__ == '__main__':
|
|
|
546
536
|
print('AWS Service Catalog saved to aws/vms.csv')
|
|
547
537
|
|
|
548
538
|
# Disable refreshing images.csv as we are using skypilot custom AMIs
|
|
549
|
-
# See sky/clouds/
|
|
539
|
+
# See sky/clouds/catalog/images/README.md for more details.
|
|
550
540
|
# image_df = get_all_regions_images_df(user_regions)
|
|
551
541
|
# _check_regions_integrity(image_df, 'images')
|
|
552
542
|
|
|
@@ -9,18 +9,11 @@ import os
|
|
|
9
9
|
|
|
10
10
|
import cudo_compute
|
|
11
11
|
|
|
12
|
-
|
|
12
|
+
from sky.provision.cudo import cudo_utils as utils
|
|
13
13
|
|
|
14
14
|
VMS_CSV = 'cudo/vms.csv'
|
|
15
15
|
|
|
16
16
|
|
|
17
|
-
def cudo_api():
|
|
18
|
-
configuration = cudo_compute.Configuration()
|
|
19
|
-
configuration.host = 'https://rest.compute.cudo.org'
|
|
20
|
-
client = cudo_compute.ApiClient(configuration)
|
|
21
|
-
return cudo_compute.VirtualMachinesApi(client)
|
|
22
|
-
|
|
23
|
-
|
|
24
17
|
def get_gpu_info(count, model):
|
|
25
18
|
mem = utils.cudo_gpu_mem[model]
|
|
26
19
|
# pylint: disable=line-too-long
|
|
@@ -45,39 +38,46 @@ def get_instance_type(machine_type, vcpu, mem, gpu):
|
|
|
45
38
|
mem) + 'gb'
|
|
46
39
|
|
|
47
40
|
|
|
48
|
-
def machine_types(gpu_model, mem_gib, vcpu_count, gpu_count):
|
|
49
|
-
try:
|
|
50
|
-
api = cudo_api()
|
|
51
|
-
types = api.list_vm_machine_types(mem_gib,
|
|
52
|
-
vcpu_count,
|
|
53
|
-
gpu=gpu_count,
|
|
54
|
-
gpu_model=gpu_model)
|
|
55
|
-
return types.to_dict()
|
|
56
|
-
except cudo_compute.rest.ApiException as e:
|
|
57
|
-
raise e
|
|
58
|
-
|
|
59
|
-
|
|
60
41
|
def update_prices():
|
|
61
42
|
rows = []
|
|
43
|
+
|
|
44
|
+
api = cudo_compute.cudo_api.virtual_machines()
|
|
45
|
+
all_types = api.list_vm_machine_types2()
|
|
46
|
+
all_machine_types = all_types.to_dict()['machine_types']
|
|
47
|
+
|
|
62
48
|
for spec in utils.machine_specs:
|
|
63
|
-
|
|
64
|
-
|
|
65
|
-
|
|
66
|
-
|
|
67
|
-
|
|
68
|
-
|
|
69
|
-
|
|
70
|
-
|
|
71
|
-
|
|
72
|
-
|
|
73
|
-
|
|
74
|
-
|
|
75
|
-
|
|
76
|
-
|
|
77
|
-
|
|
78
|
-
|
|
79
|
-
|
|
80
|
-
|
|
49
|
+
for machine_type in all_machine_types:
|
|
50
|
+
if (machine_type['min_vcpu'] <= spec['vcpu'] and
|
|
51
|
+
machine_type['min_memory_gib'] <= spec['mem'] and
|
|
52
|
+
utils.gpu_exists(machine_type['gpu_model'])):
|
|
53
|
+
|
|
54
|
+
accelerator_name = utils.cudo_gpu_to_skypilot_gpu(
|
|
55
|
+
machine_type['gpu_model'])
|
|
56
|
+
|
|
57
|
+
# Calculate total price per hour based on the given spec
|
|
58
|
+
vcpu_price = float(
|
|
59
|
+
machine_type['vcpu_price_hr']['value']) * spec['vcpu']
|
|
60
|
+
memory_price = float(
|
|
61
|
+
machine_type['memory_gib_price_hr']['value']) * spec['mem']
|
|
62
|
+
gpu_price = float(
|
|
63
|
+
machine_type['gpu_price_hr']['value']) * spec['gpu']
|
|
64
|
+
# Note: Not including storage and IPv4 prices
|
|
65
|
+
# for now as they may be optional
|
|
66
|
+
total_price = vcpu_price + memory_price + gpu_price
|
|
67
|
+
|
|
68
|
+
row = {
|
|
69
|
+
'instance_type': get_instance_type(
|
|
70
|
+
machine_type['machine_type'], spec['vcpu'], spec['mem'],
|
|
71
|
+
spec['gpu']),
|
|
72
|
+
'accelerator_name': accelerator_name,
|
|
73
|
+
'accelerator_count': str(spec['gpu']) + '.0',
|
|
74
|
+
'vcpus': str(spec['vcpu']),
|
|
75
|
+
'memory_gib': str(spec['mem']),
|
|
76
|
+
'price': str(total_price),
|
|
77
|
+
'region': machine_type['data_center_id'],
|
|
78
|
+
'gpu_info': get_gpu_info(spec['gpu'], accelerator_name),
|
|
79
|
+
}
|
|
80
|
+
rows.append(row)
|
|
81
81
|
path = VMS_CSV
|
|
82
82
|
with open(path, 'w', encoding='utf-8') as file:
|
|
83
83
|
file.write(
|
|
@@ -179,9 +179,12 @@ TPU_V4_HOST_DF = pd.read_csv(
|
|
|
179
179
|
# TODO(woosuk): Make this more robust.
|
|
180
180
|
# Refer to: https://github.com/skypilot-org/skypilot/issues/1006
|
|
181
181
|
# Unsupported Series: 'f1', 'm2'
|
|
182
|
-
|
|
182
|
+
SERIES_TO_DESCRIPTION = {
|
|
183
183
|
'a2': 'A2 Instance',
|
|
184
184
|
'a3': 'A3 Instance',
|
|
185
|
+
# TODO(zhwu): GCP does not have A4 instance in SKUs API yet. We keep it here
|
|
186
|
+
# for completeness.
|
|
187
|
+
'a4': 'A4 Instance',
|
|
185
188
|
'c2': 'Compute optimized',
|
|
186
189
|
'c2d': 'C2D AMD Instance',
|
|
187
190
|
'c3': 'C3 Instance',
|
|
@@ -195,9 +198,11 @@ SERIES_TO_DISCRIPTION = {
|
|
|
195
198
|
'n1': 'N1 Predefined Instance',
|
|
196
199
|
'n2': 'N2 Instance',
|
|
197
200
|
'n2d': 'N2D AMD Instance',
|
|
201
|
+
'n4': 'N4 Instance',
|
|
198
202
|
't2a': 'T2A Arm Instance',
|
|
199
203
|
't2d': 'T2D AMD Instance',
|
|
200
204
|
}
|
|
205
|
+
|
|
201
206
|
creds, project_id = google.auth.default()
|
|
202
207
|
gcp_client = discovery.build('compute', 'v1')
|
|
203
208
|
tpu_client = discovery.build('tpu', 'v1')
|
|
@@ -334,7 +339,7 @@ def get_vm_df(skus: List[Dict[str, Any]], region_prefix: str) -> 'pd.DataFrame':
|
|
|
334
339
|
|
|
335
340
|
# Drop the unsupported series.
|
|
336
341
|
df = df[df['InstanceType'].str.startswith(
|
|
337
|
-
tuple(f'{series}-' for series in SERIES_TO_DISCRIPTION))]
|
|
342
|
+
tuple(f'{series}-' for series in SERIES_TO_DESCRIPTION))]
|
|
338
343
|
df = df[~df['AvailabilityZone'].str.startswith(tuple(TPU_V4_ZONES))]
|
|
339
344
|
|
|
340
345
|
# TODO(woosuk): Make this more efficient.
|
|
@@ -352,7 +357,7 @@ def get_vm_df(skus: List[Dict[str, Any]], region_prefix: str) -> 'pd.DataFrame':
|
|
|
352
357
|
|
|
353
358
|
# Check if the SKU is for the correct series.
|
|
354
359
|
description = sku['description']
|
|
355
|
-
if SERIES_TO_DISCRIPTION[series].lower() not in description.lower():
|
|
360
|
+
if SERIES_TO_DESCRIPTION[series].lower() not in description.lower():
|
|
356
361
|
continue
|
|
357
362
|
# Special check for M1 instances.
|
|
358
363
|
if series == 'm1' and 'M3' in description:
|
|
@@ -434,10 +439,18 @@ def _get_gpus_for_zone(zone: str) -> 'pd.DataFrame':
|
|
|
434
439
|
gpu_name = gpu_name.upper()
|
|
435
440
|
if 'H100-80GB' in gpu_name:
|
|
436
441
|
gpu_name = 'H100'
|
|
437
|
-
|
|
442
|
+
|
|
443
|
+
if 'H100-MEGA' in gpu_name:
|
|
438
444
|
gpu_name = 'H100-MEGA'
|
|
439
445
|
if count != 8:
|
|
440
|
-
|
|
446
|
+
continue
|
|
447
|
+
elif 'H200' in gpu_name:
|
|
448
|
+
gpu_name = 'H200'
|
|
449
|
+
if count != 8:
|
|
450
|
+
continue
|
|
451
|
+
elif 'B200' in gpu_name:
|
|
452
|
+
gpu_name = 'B200'
|
|
453
|
+
if count != 8:
|
|
441
454
|
continue
|
|
442
455
|
if 'VWS' in gpu_name:
|
|
443
456
|
continue
|
|
@@ -468,6 +481,8 @@ def _gpu_info_from_name(name: str) -> Optional[Dict[str, List[Dict[str, Any]]]]:
|
|
|
468
481
|
'A100': 40 * 1024,
|
|
469
482
|
'H100': 80 * 1024,
|
|
470
483
|
'H100-MEGA': 80 * 1024,
|
|
484
|
+
'H200': 141 * 1024,
|
|
485
|
+
'B200': 180 * 1024,
|
|
471
486
|
'P4': 8 * 1024,
|
|
472
487
|
'T4': 16 * 1024,
|
|
473
488
|
'V100': 16 * 1024,
|
|
@@ -507,22 +522,30 @@ def get_gpu_df(skus: List[Dict[str, Any]],
|
|
|
507
522
|
ondemand_or_spot = 'OnDemand' if not spot else 'Preemptible'
|
|
508
523
|
gpu_price = None
|
|
509
524
|
for sku in gpu_skus:
|
|
525
|
+
row_gpu_name = row['AcceleratorName']
|
|
510
526
|
if row['Region'] not in sku['serviceRegions']:
|
|
511
527
|
continue
|
|
512
528
|
if sku['category']['usageType'] != ondemand_or_spot:
|
|
513
529
|
continue
|
|
514
530
|
|
|
515
|
-
gpu_names = [
|
|
516
|
-
if
|
|
517
|
-
gpu_names = ['A100 80GB']
|
|
518
|
-
|
|
519
|
-
gpu_names = ['H100 80GB']
|
|
520
|
-
|
|
531
|
+
gpu_names = [f'{row_gpu_name} GPU']
|
|
532
|
+
if row_gpu_name == 'A100-80GB':
|
|
533
|
+
gpu_names = ['A100 80GB GPU']
|
|
534
|
+
elif row_gpu_name == 'H100':
|
|
535
|
+
gpu_names = ['H100 80GB GPU']
|
|
536
|
+
elif row_gpu_name == 'H100-MEGA':
|
|
521
537
|
# Seems that H100-MEGA has two different descriptions in SKUs in
|
|
522
538
|
# different regions: 'H100 80GB Mega' and 'H100 80GB Plus'.
|
|
523
|
-
gpu_names = [
|
|
524
|
-
|
|
525
|
-
|
|
539
|
+
gpu_names = [
|
|
540
|
+
'H100 80GB Mega GPU', 'H100 Mega 80GB GPU',
|
|
541
|
+
'H100 80GB Plus GPU'
|
|
542
|
+
]
|
|
543
|
+
elif row_gpu_name == 'H200':
|
|
544
|
+
gpu_names = ['H200 141GB GPU']
|
|
545
|
+
elif row_gpu_name == 'B200':
|
|
546
|
+
gpu_names = ['Nvidia B200 (1 gpu slice)']
|
|
547
|
+
if not any(
|
|
548
|
+
gpu_name in sku['description'] for gpu_name in gpu_names):
|
|
526
549
|
continue
|
|
527
550
|
|
|
528
551
|
unit_price = _get_unit_price(sku)
|
|
@@ -554,7 +577,7 @@ def _get_tpu_response_for_zone(zone: str) -> list:
|
|
|
554
577
|
# Sometimes the response is empty ({}) even for enabled zones. Here we
|
|
555
578
|
# retry the request for a few times.
|
|
556
579
|
backoff = common_utils.Backoff(initial_backoff=1)
|
|
557
|
-
for
|
|
580
|
+
for retry_cnt in range(TPU_RETRY_CNT):
|
|
558
581
|
tpus_request = (
|
|
559
582
|
tpu_client.projects().locations().acceleratorTypes().list(
|
|
560
583
|
parent=parent))
|
|
@@ -570,6 +593,10 @@ def _get_tpu_response_for_zone(zone: str) -> list:
|
|
|
570
593
|
print(f' An error occurred: {error}')
|
|
571
594
|
# If error happens, fail early.
|
|
572
595
|
return []
|
|
596
|
+
except TimeoutError:
|
|
597
|
+
print(f' TimeoutError: Failed to fetch TPUs for zone {zone!r}, '
|
|
598
|
+
f'retry {retry_cnt + 1} of {TPU_RETRY_CNT}')
|
|
599
|
+
|
|
573
600
|
time_to_sleep = backoff.current_backoff()
|
|
574
601
|
print(f' Retry zone {zone!r} in {time_to_sleep} seconds...')
|
|
575
602
|
time.sleep(time_to_sleep)
|
|
@@ -0,0 +1,136 @@
|
|
|
1
|
+
"""Script to fetch Hyperbolic instance data and generate catalog."""
|
|
2
|
+
import argparse
|
|
3
|
+
import csv
|
|
4
|
+
import json
|
|
5
|
+
import os
|
|
6
|
+
import sys
|
|
7
|
+
from typing import Any, Dict
|
|
8
|
+
|
|
9
|
+
import requests
|
|
10
|
+
|
|
11
|
+
ENDPOINT = 'https://api.hyperbolic.xyz/v2/skypilot/catalog'
|
|
12
|
+
API_KEY_PATH = os.path.expanduser('~/.hyperbolic/api_key')
|
|
13
|
+
|
|
14
|
+
REQUIRED_FIELDS = [
|
|
15
|
+
'InstanceType', 'AcceleratorName', 'AcceleratorCount', 'vCPUs', 'MemoryGiB',
|
|
16
|
+
'StorageGiB', 'Price', 'Region', 'GpuInfo', 'SpotPrice'
|
|
17
|
+
]
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
class HyperbolicCatalogError(Exception):
|
|
21
|
+
"""Base exception for Hyperbolic catalog errors."""
|
|
22
|
+
pass
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
def get_api_key(api_key=None) -> str:
|
|
26
|
+
"""Get API key from arg, env var, or file."""
|
|
27
|
+
if api_key:
|
|
28
|
+
return api_key
|
|
29
|
+
if api_key := os.environ.get('HYPERBOLIC_API_KEY'):
|
|
30
|
+
return api_key
|
|
31
|
+
try:
|
|
32
|
+
with open(API_KEY_PATH, 'r', encoding='utf-8') as f:
|
|
33
|
+
return f.read().strip()
|
|
34
|
+
except FileNotFoundError as exc:
|
|
35
|
+
raise HyperbolicCatalogError(
|
|
36
|
+
'No API key found. Please either:\n'
|
|
37
|
+
'1. Pass --api-key\n'
|
|
38
|
+
'2. Set HYPERBOLIC_API_KEY environment variable\n'
|
|
39
|
+
'3. Create ~/.hyperbolic/api_key file') from exc
|
|
40
|
+
|
|
41
|
+
|
|
42
|
+
def get_output_path() -> str:
|
|
43
|
+
"""Get output path for catalog file."""
|
|
44
|
+
current_dir = os.getcwd()
|
|
45
|
+
if os.path.basename(current_dir) == 'hyperbolic':
|
|
46
|
+
return 'vms.csv'
|
|
47
|
+
hyperbolic_dir = os.path.join(current_dir, 'hyperbolic')
|
|
48
|
+
os.makedirs(hyperbolic_dir, exist_ok=True)
|
|
49
|
+
return os.path.join(hyperbolic_dir, 'vms.csv')
|
|
50
|
+
|
|
51
|
+
|
|
52
|
+
def validate_instance_data(instance: Dict[str, Any]) -> None:
|
|
53
|
+
"""Validate instance data has all required fields."""
|
|
54
|
+
missing_fields = [
|
|
55
|
+
field for field in REQUIRED_FIELDS if field not in instance
|
|
56
|
+
]
|
|
57
|
+
if missing_fields:
|
|
58
|
+
raise HyperbolicCatalogError(
|
|
59
|
+
f'Instance data missing required fields: {missing_fields}')
|
|
60
|
+
|
|
61
|
+
|
|
62
|
+
def create_catalog(api_key=None) -> None:
|
|
63
|
+
"""Generate Hyperbolic catalog CSV file."""
|
|
64
|
+
try:
|
|
65
|
+
response = requests.get(
|
|
66
|
+
ENDPOINT,
|
|
67
|
+
headers={'Authorization': f'Bearer {get_api_key(api_key)}'},
|
|
68
|
+
timeout=30)
|
|
69
|
+
response.raise_for_status()
|
|
70
|
+
|
|
71
|
+
try:
|
|
72
|
+
data = response.json()
|
|
73
|
+
except json.JSONDecodeError as e:
|
|
74
|
+
raise HyperbolicCatalogError(
|
|
75
|
+
f'Invalid JSON response from API: {response.text}') from e
|
|
76
|
+
|
|
77
|
+
if 'vms' not in data:
|
|
78
|
+
raise HyperbolicCatalogError(
|
|
79
|
+
f'Missing "vms" field in API response: {data}')
|
|
80
|
+
|
|
81
|
+
instances = data['vms']
|
|
82
|
+
if not isinstance(instances, list):
|
|
83
|
+
raise HyperbolicCatalogError(
|
|
84
|
+
f'Expected list of instances, got {type(instances)}')
|
|
85
|
+
|
|
86
|
+
if not instances:
|
|
87
|
+
raise HyperbolicCatalogError('No instances found in API response')
|
|
88
|
+
|
|
89
|
+
# Validate each instance
|
|
90
|
+
for instance in instances:
|
|
91
|
+
validate_instance_data(instance)
|
|
92
|
+
|
|
93
|
+
except requests.exceptions.RequestException as e:
|
|
94
|
+
raise HyperbolicCatalogError(
|
|
95
|
+
f'Failed to fetch instance data: {e}') from e
|
|
96
|
+
|
|
97
|
+
output_path = get_output_path()
|
|
98
|
+
try:
|
|
99
|
+
with open(output_path, 'w', newline='', encoding='utf-8') as f:
|
|
100
|
+
writer = csv.DictWriter(f, fieldnames=REQUIRED_FIELDS)
|
|
101
|
+
writer.writeheader()
|
|
102
|
+
|
|
103
|
+
for instance in instances:
|
|
104
|
+
entry = instance.copy()
|
|
105
|
+
# Convert GpuInfo to string format
|
|
106
|
+
entry['GpuInfo'] = json.dumps(entry['GpuInfo'],
|
|
107
|
+
ensure_ascii=False).replace(
|
|
108
|
+
'"', "'") # pylint: disable=invalid-string-quote
|
|
109
|
+
writer.writerow(entry)
|
|
110
|
+
except (IOError, OSError) as e:
|
|
111
|
+
raise HyperbolicCatalogError(
|
|
112
|
+
f'Failed to write catalog file to {output_path}: {e}') from e
|
|
113
|
+
|
|
114
|
+
|
|
115
|
+
def main() -> int:
|
|
116
|
+
"""Main entry point."""
|
|
117
|
+
parser = argparse.ArgumentParser(
|
|
118
|
+
description='Fetch Hyperbolic instance data')
|
|
119
|
+
parser.add_argument('--api-key', help='Hyperbolic API key')
|
|
120
|
+
args = parser.parse_args()
|
|
121
|
+
|
|
122
|
+
try:
|
|
123
|
+
create_catalog(args.api_key)
|
|
124
|
+
print(f'Hyperbolic Service Catalog saved to {get_output_path()}')
|
|
125
|
+
return 0
|
|
126
|
+
except HyperbolicCatalogError as e:
|
|
127
|
+
print(f'Error: {e}', file=sys.stderr)
|
|
128
|
+
return 1
|
|
129
|
+
except (requests.exceptions.RequestException, json.JSONDecodeError, IOError,
|
|
130
|
+
OSError) as e:
|
|
131
|
+
print(f'Unexpected error: {e}', file=sys.stderr)
|
|
132
|
+
return 1
|
|
133
|
+
|
|
134
|
+
|
|
135
|
+
if __name__ == '__main__':
|
|
136
|
+
sys.exit(main())
|