skypilot-nightly 1.0.0.dev20250502__py3-none-any.whl → 1.0.0.dev20251203__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sky/__init__.py +22 -6
- sky/adaptors/aws.py +81 -16
- sky/adaptors/common.py +25 -2
- sky/adaptors/coreweave.py +278 -0
- sky/adaptors/do.py +8 -2
- sky/adaptors/gcp.py +11 -0
- sky/adaptors/hyperbolic.py +8 -0
- sky/adaptors/ibm.py +5 -2
- sky/adaptors/kubernetes.py +149 -18
- sky/adaptors/nebius.py +173 -30
- sky/adaptors/primeintellect.py +1 -0
- sky/adaptors/runpod.py +68 -0
- sky/adaptors/seeweb.py +183 -0
- sky/adaptors/shadeform.py +89 -0
- sky/admin_policy.py +187 -4
- sky/authentication.py +179 -225
- sky/backends/__init__.py +4 -2
- sky/backends/backend.py +22 -9
- sky/backends/backend_utils.py +1323 -397
- sky/backends/cloud_vm_ray_backend.py +1749 -1029
- sky/backends/docker_utils.py +1 -1
- sky/backends/local_docker_backend.py +11 -6
- sky/backends/task_codegen.py +633 -0
- sky/backends/wheel_utils.py +55 -9
- sky/{clouds/service_catalog → catalog}/__init__.py +21 -19
- sky/{clouds/service_catalog → catalog}/aws_catalog.py +27 -8
- sky/{clouds/service_catalog → catalog}/azure_catalog.py +10 -7
- sky/{clouds/service_catalog → catalog}/common.py +90 -49
- sky/{clouds/service_catalog → catalog}/cudo_catalog.py +8 -5
- sky/{clouds/service_catalog → catalog}/data_fetchers/analyze.py +1 -1
- sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_aws.py +116 -80
- sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_cudo.py +38 -38
- sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_gcp.py +70 -16
- sky/catalog/data_fetchers/fetch_hyperbolic.py +136 -0
- sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_lambda_cloud.py +1 -0
- sky/catalog/data_fetchers/fetch_nebius.py +338 -0
- sky/catalog/data_fetchers/fetch_runpod.py +698 -0
- sky/catalog/data_fetchers/fetch_seeweb.py +329 -0
- sky/catalog/data_fetchers/fetch_shadeform.py +142 -0
- sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_vast.py +1 -1
- sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_vsphere.py +1 -1
- sky/{clouds/service_catalog → catalog}/do_catalog.py +5 -2
- sky/{clouds/service_catalog → catalog}/fluidstack_catalog.py +6 -3
- sky/{clouds/service_catalog → catalog}/gcp_catalog.py +41 -15
- sky/catalog/hyperbolic_catalog.py +136 -0
- sky/{clouds/service_catalog → catalog}/ibm_catalog.py +9 -6
- sky/{clouds/service_catalog → catalog}/kubernetes_catalog.py +36 -24
- sky/{clouds/service_catalog → catalog}/lambda_catalog.py +9 -6
- sky/{clouds/service_catalog → catalog}/nebius_catalog.py +9 -7
- sky/{clouds/service_catalog → catalog}/oci_catalog.py +9 -6
- sky/{clouds/service_catalog → catalog}/paperspace_catalog.py +5 -2
- sky/catalog/primeintellect_catalog.py +95 -0
- sky/{clouds/service_catalog → catalog}/runpod_catalog.py +11 -4
- sky/{clouds/service_catalog → catalog}/scp_catalog.py +9 -6
- sky/catalog/seeweb_catalog.py +184 -0
- sky/catalog/shadeform_catalog.py +165 -0
- sky/catalog/ssh_catalog.py +167 -0
- sky/{clouds/service_catalog → catalog}/vast_catalog.py +6 -3
- sky/{clouds/service_catalog → catalog}/vsphere_catalog.py +5 -2
- sky/check.py +533 -185
- sky/cli.py +5 -5975
- sky/client/{cli.py → cli/command.py} +2591 -1956
- sky/client/cli/deprecation_utils.py +99 -0
- sky/client/cli/flags.py +359 -0
- sky/client/cli/table_utils.py +322 -0
- sky/client/cli/utils.py +79 -0
- sky/client/common.py +78 -32
- sky/client/oauth.py +82 -0
- sky/client/sdk.py +1219 -319
- sky/client/sdk_async.py +827 -0
- sky/client/service_account_auth.py +47 -0
- sky/cloud_stores.py +82 -3
- sky/clouds/__init__.py +13 -0
- sky/clouds/aws.py +564 -164
- sky/clouds/azure.py +105 -83
- sky/clouds/cloud.py +140 -40
- sky/clouds/cudo.py +68 -50
- sky/clouds/do.py +66 -48
- sky/clouds/fluidstack.py +63 -44
- sky/clouds/gcp.py +339 -110
- sky/clouds/hyperbolic.py +293 -0
- sky/clouds/ibm.py +70 -49
- sky/clouds/kubernetes.py +570 -162
- sky/clouds/lambda_cloud.py +74 -54
- sky/clouds/nebius.py +210 -81
- sky/clouds/oci.py +88 -66
- sky/clouds/paperspace.py +61 -44
- sky/clouds/primeintellect.py +317 -0
- sky/clouds/runpod.py +164 -74
- sky/clouds/scp.py +89 -86
- sky/clouds/seeweb.py +477 -0
- sky/clouds/shadeform.py +400 -0
- sky/clouds/ssh.py +263 -0
- sky/clouds/utils/aws_utils.py +10 -4
- sky/clouds/utils/gcp_utils.py +87 -11
- sky/clouds/utils/oci_utils.py +38 -14
- sky/clouds/utils/scp_utils.py +231 -167
- sky/clouds/vast.py +99 -77
- sky/clouds/vsphere.py +51 -40
- sky/core.py +375 -173
- sky/dag.py +15 -0
- sky/dashboard/out/404.html +1 -1
- sky/dashboard/out/_next/static/96_E2yl3QAiIJGOYCkSpB/_buildManifest.js +1 -0
- sky/dashboard/out/_next/static/chunks/1141-e6aa9ab418717c59.js +11 -0
- sky/dashboard/out/_next/static/chunks/1272-1ef0bf0237faccdb.js +1 -0
- sky/dashboard/out/_next/static/chunks/1871-7e202677c42f43fe.js +6 -0
- sky/dashboard/out/_next/static/chunks/2260-7703229c33c5ebd5.js +1 -0
- sky/dashboard/out/_next/static/chunks/2350.fab69e61bac57b23.js +1 -0
- sky/dashboard/out/_next/static/chunks/2369.fc20f0c2c8ed9fe7.js +15 -0
- sky/dashboard/out/_next/static/chunks/2755.edd818326d489a1d.js +26 -0
- sky/dashboard/out/_next/static/chunks/3294.20a8540fe697d5ee.js +1 -0
- sky/dashboard/out/_next/static/chunks/3785.7e245f318f9d1121.js +1 -0
- sky/dashboard/out/_next/static/chunks/3800-7b45f9fbb6308557.js +1 -0
- sky/dashboard/out/_next/static/chunks/3850-ff4a9a69d978632b.js +1 -0
- sky/dashboard/out/_next/static/chunks/3937.210053269f121201.js +1 -0
- sky/dashboard/out/_next/static/chunks/4725.172ede95d1b21022.js +1 -0
- sky/dashboard/out/_next/static/chunks/4937.a2baa2df5572a276.js +15 -0
- sky/dashboard/out/_next/static/chunks/5739-d67458fcb1386c92.js +8 -0
- sky/dashboard/out/_next/static/chunks/6130-2be46d70a38f1e82.js +1 -0
- sky/dashboard/out/_next/static/chunks/616-3d59f75e2ccf9321.js +39 -0
- sky/dashboard/out/_next/static/chunks/6212-7bd06f60ba693125.js +13 -0
- sky/dashboard/out/_next/static/chunks/6856-8f27d1c10c98def8.js +1 -0
- sky/dashboard/out/_next/static/chunks/6989-01359c57e018caa4.js +1 -0
- sky/dashboard/out/_next/static/chunks/6990-9146207c4567fdfd.js +1 -0
- sky/dashboard/out/_next/static/chunks/7359-c8d04e06886000b3.js +30 -0
- sky/dashboard/out/_next/static/chunks/7411-b15471acd2cba716.js +41 -0
- sky/dashboard/out/_next/static/chunks/7615-019513abc55b3b47.js +1 -0
- sky/dashboard/out/_next/static/chunks/8640.5b9475a2d18c5416.js +16 -0
- sky/dashboard/out/_next/static/chunks/8969-452f9d5cbdd2dc73.js +1 -0
- sky/dashboard/out/_next/static/chunks/9025.fa408f3242e9028d.js +6 -0
- sky/dashboard/out/_next/static/chunks/9353-cff34f7e773b2e2b.js +1 -0
- sky/dashboard/out/_next/static/chunks/9360.a536cf6b1fa42355.js +31 -0
- sky/dashboard/out/_next/static/chunks/9847.3aaca6bb33455140.js +30 -0
- sky/dashboard/out/_next/static/chunks/fd9d1056-86323a29a8f7e46a.js +1 -0
- sky/dashboard/out/_next/static/chunks/framework-cf60a09ccd051a10.js +33 -0
- sky/dashboard/out/_next/static/chunks/main-app-587214043926b3cc.js +1 -0
- sky/dashboard/out/_next/static/chunks/main-f15ccb73239a3bf1.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/_app-bde01e4a2beec258.js +34 -0
- sky/dashboard/out/_next/static/chunks/pages/_error-c66a4e8afc46f17b.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-792db96d918c98c9.js +16 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-abfcac9c137aa543.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters-ee39056f9851a3ff.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/config-dfb9bf07b13045f4.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/index-444f1804401f04ea.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/infra/[context]-c0b5935149902e6f.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/infra-aed0ea19df7cf961.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-d66997e2bfc837cf.js +16 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs/pools/[pool]-9faf940b253e3e06.js +21 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs-2072b48b617989c9.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/users-f42674164aa73423.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/volumes-b84b948ff357c43e.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/workspace/new-3f88a1c7e86a3f86.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/workspaces/[name]-84a40f8c7c627fe4.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/workspaces-531b2f8c4bf89f82.js +1 -0
- sky/dashboard/out/_next/static/chunks/webpack-64e05f17bf2cf8ce.js +1 -0
- sky/dashboard/out/_next/static/css/0748ce22df867032.css +3 -0
- sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
- sky/dashboard/out/clusters/[cluster].html +1 -1
- sky/dashboard/out/clusters.html +1 -1
- sky/dashboard/out/config.html +1 -0
- sky/dashboard/out/index.html +1 -1
- sky/dashboard/out/infra/[context].html +1 -0
- sky/dashboard/out/infra.html +1 -0
- sky/dashboard/out/jobs/[job].html +1 -1
- sky/dashboard/out/jobs/pools/[pool].html +1 -0
- sky/dashboard/out/jobs.html +1 -1
- sky/dashboard/out/users.html +1 -0
- sky/dashboard/out/volumes.html +1 -0
- sky/dashboard/out/workspace/new.html +1 -0
- sky/dashboard/out/workspaces/[name].html +1 -0
- sky/dashboard/out/workspaces.html +1 -0
- sky/data/data_utils.py +137 -1
- sky/data/mounting_utils.py +269 -84
- sky/data/storage.py +1460 -1807
- sky/data/storage_utils.py +43 -57
- sky/exceptions.py +126 -2
- sky/execution.py +216 -63
- sky/global_user_state.py +2390 -586
- sky/jobs/__init__.py +7 -0
- sky/jobs/client/sdk.py +300 -58
- sky/jobs/client/sdk_async.py +161 -0
- sky/jobs/constants.py +15 -8
- sky/jobs/controller.py +848 -275
- sky/jobs/file_content_utils.py +128 -0
- sky/jobs/log_gc.py +193 -0
- sky/jobs/recovery_strategy.py +402 -152
- sky/jobs/scheduler.py +314 -189
- sky/jobs/server/core.py +836 -255
- sky/jobs/server/server.py +156 -115
- sky/jobs/server/utils.py +136 -0
- sky/jobs/state.py +2109 -706
- sky/jobs/utils.py +1306 -215
- sky/logs/__init__.py +21 -0
- sky/logs/agent.py +108 -0
- sky/logs/aws.py +243 -0
- sky/logs/gcp.py +91 -0
- sky/metrics/__init__.py +0 -0
- sky/metrics/utils.py +453 -0
- sky/models.py +78 -1
- sky/optimizer.py +164 -70
- sky/provision/__init__.py +90 -4
- sky/provision/aws/config.py +147 -26
- sky/provision/aws/instance.py +136 -50
- sky/provision/azure/instance.py +11 -6
- sky/provision/common.py +13 -1
- sky/provision/cudo/cudo_machine_type.py +1 -1
- sky/provision/cudo/cudo_utils.py +14 -8
- sky/provision/cudo/cudo_wrapper.py +72 -71
- sky/provision/cudo/instance.py +10 -6
- sky/provision/do/instance.py +10 -6
- sky/provision/do/utils.py +4 -3
- sky/provision/docker_utils.py +140 -33
- sky/provision/fluidstack/instance.py +13 -8
- sky/provision/gcp/__init__.py +1 -0
- sky/provision/gcp/config.py +301 -19
- sky/provision/gcp/constants.py +218 -0
- sky/provision/gcp/instance.py +36 -8
- sky/provision/gcp/instance_utils.py +18 -4
- sky/provision/gcp/volume_utils.py +247 -0
- sky/provision/hyperbolic/__init__.py +12 -0
- sky/provision/hyperbolic/config.py +10 -0
- sky/provision/hyperbolic/instance.py +437 -0
- sky/provision/hyperbolic/utils.py +373 -0
- sky/provision/instance_setup.py +101 -20
- sky/provision/kubernetes/__init__.py +5 -0
- sky/provision/kubernetes/config.py +9 -52
- sky/provision/kubernetes/constants.py +17 -0
- sky/provision/kubernetes/instance.py +919 -280
- sky/provision/kubernetes/manifests/fusermount-server-daemonset.yaml +1 -2
- sky/provision/kubernetes/network.py +27 -17
- sky/provision/kubernetes/network_utils.py +44 -43
- sky/provision/kubernetes/utils.py +1221 -534
- sky/provision/kubernetes/volume.py +343 -0
- sky/provision/lambda_cloud/instance.py +22 -16
- sky/provision/nebius/constants.py +50 -0
- sky/provision/nebius/instance.py +19 -6
- sky/provision/nebius/utils.py +237 -137
- sky/provision/oci/instance.py +10 -5
- sky/provision/paperspace/instance.py +10 -7
- sky/provision/paperspace/utils.py +1 -1
- sky/provision/primeintellect/__init__.py +10 -0
- sky/provision/primeintellect/config.py +11 -0
- sky/provision/primeintellect/instance.py +454 -0
- sky/provision/primeintellect/utils.py +398 -0
- sky/provision/provisioner.py +117 -36
- sky/provision/runpod/__init__.py +5 -0
- sky/provision/runpod/instance.py +27 -6
- sky/provision/runpod/utils.py +51 -18
- sky/provision/runpod/volume.py +214 -0
- sky/provision/scp/__init__.py +15 -0
- sky/provision/scp/config.py +93 -0
- sky/provision/scp/instance.py +707 -0
- sky/provision/seeweb/__init__.py +11 -0
- sky/provision/seeweb/config.py +13 -0
- sky/provision/seeweb/instance.py +812 -0
- sky/provision/shadeform/__init__.py +11 -0
- sky/provision/shadeform/config.py +12 -0
- sky/provision/shadeform/instance.py +351 -0
- sky/provision/shadeform/shadeform_utils.py +83 -0
- sky/provision/ssh/__init__.py +18 -0
- sky/provision/vast/instance.py +13 -8
- sky/provision/vast/utils.py +10 -7
- sky/provision/volume.py +164 -0
- sky/provision/vsphere/common/ssl_helper.py +1 -1
- sky/provision/vsphere/common/vapiconnect.py +2 -1
- sky/provision/vsphere/common/vim_utils.py +4 -4
- sky/provision/vsphere/instance.py +15 -10
- sky/provision/vsphere/vsphere_utils.py +17 -20
- sky/py.typed +0 -0
- sky/resources.py +845 -119
- sky/schemas/__init__.py +0 -0
- sky/schemas/api/__init__.py +0 -0
- sky/schemas/api/responses.py +227 -0
- sky/schemas/db/README +4 -0
- sky/schemas/db/env.py +90 -0
- sky/schemas/db/global_user_state/001_initial_schema.py +124 -0
- sky/schemas/db/global_user_state/002_add_workspace_to_cluster_history.py +35 -0
- sky/schemas/db/global_user_state/003_fix_initial_revision.py +61 -0
- sky/schemas/db/global_user_state/004_is_managed.py +34 -0
- sky/schemas/db/global_user_state/005_cluster_event.py +32 -0
- sky/schemas/db/global_user_state/006_provision_log.py +41 -0
- sky/schemas/db/global_user_state/007_cluster_event_request_id.py +34 -0
- sky/schemas/db/global_user_state/008_skylet_ssh_tunnel_metadata.py +34 -0
- sky/schemas/db/global_user_state/009_last_activity_and_launched_at.py +89 -0
- sky/schemas/db/global_user_state/010_save_ssh_key.py +66 -0
- sky/schemas/db/global_user_state/011_is_ephemeral.py +34 -0
- sky/schemas/db/kv_cache/001_initial_schema.py +29 -0
- sky/schemas/db/script.py.mako +28 -0
- sky/schemas/db/serve_state/001_initial_schema.py +67 -0
- sky/schemas/db/serve_state/002_yaml_content.py +34 -0
- sky/schemas/db/skypilot_config/001_initial_schema.py +30 -0
- sky/schemas/db/spot_jobs/001_initial_schema.py +97 -0
- sky/schemas/db/spot_jobs/002_cluster_pool.py +42 -0
- sky/schemas/db/spot_jobs/003_pool_hash.py +34 -0
- sky/schemas/db/spot_jobs/004_job_file_contents.py +42 -0
- sky/schemas/db/spot_jobs/005_logs_gc.py +38 -0
- sky/schemas/db/spot_jobs/006_controller_pid_started_at.py +34 -0
- sky/schemas/db/spot_jobs/007_config_file_content.py +34 -0
- sky/schemas/generated/__init__.py +0 -0
- sky/schemas/generated/autostopv1_pb2.py +36 -0
- sky/schemas/generated/autostopv1_pb2.pyi +43 -0
- sky/schemas/generated/autostopv1_pb2_grpc.py +146 -0
- sky/schemas/generated/jobsv1_pb2.py +86 -0
- sky/schemas/generated/jobsv1_pb2.pyi +254 -0
- sky/schemas/generated/jobsv1_pb2_grpc.py +542 -0
- sky/schemas/generated/managed_jobsv1_pb2.py +76 -0
- sky/schemas/generated/managed_jobsv1_pb2.pyi +278 -0
- sky/schemas/generated/managed_jobsv1_pb2_grpc.py +278 -0
- sky/schemas/generated/servev1_pb2.py +58 -0
- sky/schemas/generated/servev1_pb2.pyi +115 -0
- sky/schemas/generated/servev1_pb2_grpc.py +322 -0
- sky/serve/autoscalers.py +357 -5
- sky/serve/client/impl.py +310 -0
- sky/serve/client/sdk.py +47 -139
- sky/serve/client/sdk_async.py +130 -0
- sky/serve/constants.py +12 -9
- sky/serve/controller.py +68 -17
- sky/serve/load_balancer.py +106 -60
- sky/serve/load_balancing_policies.py +116 -2
- sky/serve/replica_managers.py +434 -249
- sky/serve/serve_rpc_utils.py +179 -0
- sky/serve/serve_state.py +569 -257
- sky/serve/serve_utils.py +775 -265
- sky/serve/server/core.py +66 -711
- sky/serve/server/impl.py +1093 -0
- sky/serve/server/server.py +21 -18
- sky/serve/service.py +192 -89
- sky/serve/service_spec.py +144 -20
- sky/serve/spot_placer.py +3 -0
- sky/server/auth/__init__.py +0 -0
- sky/server/auth/authn.py +50 -0
- sky/server/auth/loopback.py +38 -0
- sky/server/auth/oauth2_proxy.py +202 -0
- sky/server/common.py +478 -182
- sky/server/config.py +85 -23
- sky/server/constants.py +44 -6
- sky/server/daemons.py +295 -0
- sky/server/html/token_page.html +185 -0
- sky/server/metrics.py +160 -0
- sky/server/middleware_utils.py +166 -0
- sky/server/requests/executor.py +558 -138
- sky/server/requests/payloads.py +364 -24
- sky/server/requests/preconditions.py +21 -17
- sky/server/requests/process.py +112 -29
- sky/server/requests/request_names.py +121 -0
- sky/server/requests/requests.py +822 -226
- sky/server/requests/serializers/decoders.py +82 -31
- sky/server/requests/serializers/encoders.py +140 -22
- sky/server/requests/threads.py +117 -0
- sky/server/rest.py +455 -0
- sky/server/server.py +1309 -285
- sky/server/state.py +20 -0
- sky/server/stream_utils.py +327 -61
- sky/server/uvicorn.py +217 -3
- sky/server/versions.py +270 -0
- sky/setup_files/MANIFEST.in +11 -1
- sky/setup_files/alembic.ini +160 -0
- sky/setup_files/dependencies.py +139 -31
- sky/setup_files/setup.py +44 -42
- sky/sky_logging.py +114 -7
- sky/skylet/attempt_skylet.py +106 -24
- sky/skylet/autostop_lib.py +129 -8
- sky/skylet/configs.py +29 -20
- sky/skylet/constants.py +216 -25
- sky/skylet/events.py +101 -21
- sky/skylet/job_lib.py +345 -164
- sky/skylet/log_lib.py +297 -18
- sky/skylet/log_lib.pyi +44 -1
- sky/skylet/providers/ibm/node_provider.py +12 -8
- sky/skylet/providers/ibm/vpc_provider.py +13 -12
- sky/skylet/ray_patches/__init__.py +17 -3
- sky/skylet/ray_patches/autoscaler.py.diff +18 -0
- sky/skylet/ray_patches/cli.py.diff +19 -0
- sky/skylet/ray_patches/command_runner.py.diff +17 -0
- sky/skylet/ray_patches/log_monitor.py.diff +20 -0
- sky/skylet/ray_patches/resource_demand_scheduler.py.diff +32 -0
- sky/skylet/ray_patches/updater.py.diff +18 -0
- sky/skylet/ray_patches/worker.py.diff +41 -0
- sky/skylet/runtime_utils.py +21 -0
- sky/skylet/services.py +568 -0
- sky/skylet/skylet.py +72 -4
- sky/skylet/subprocess_daemon.py +104 -29
- sky/skypilot_config.py +506 -99
- sky/ssh_node_pools/__init__.py +1 -0
- sky/ssh_node_pools/core.py +135 -0
- sky/ssh_node_pools/server.py +233 -0
- sky/task.py +685 -163
- sky/templates/aws-ray.yml.j2 +11 -3
- sky/templates/azure-ray.yml.j2 +2 -1
- sky/templates/cudo-ray.yml.j2 +1 -0
- sky/templates/do-ray.yml.j2 +2 -1
- sky/templates/fluidstack-ray.yml.j2 +1 -0
- sky/templates/gcp-ray.yml.j2 +62 -1
- sky/templates/hyperbolic-ray.yml.j2 +68 -0
- sky/templates/ibm-ray.yml.j2 +2 -1
- sky/templates/jobs-controller.yaml.j2 +27 -24
- sky/templates/kubernetes-loadbalancer.yml.j2 +2 -0
- sky/templates/kubernetes-ray.yml.j2 +611 -50
- sky/templates/lambda-ray.yml.j2 +2 -1
- sky/templates/nebius-ray.yml.j2 +34 -12
- sky/templates/oci-ray.yml.j2 +1 -0
- sky/templates/paperspace-ray.yml.j2 +2 -1
- sky/templates/primeintellect-ray.yml.j2 +72 -0
- sky/templates/runpod-ray.yml.j2 +10 -1
- sky/templates/scp-ray.yml.j2 +4 -50
- sky/templates/seeweb-ray.yml.j2 +171 -0
- sky/templates/shadeform-ray.yml.j2 +73 -0
- sky/templates/sky-serve-controller.yaml.j2 +22 -2
- sky/templates/vast-ray.yml.j2 +1 -0
- sky/templates/vsphere-ray.yml.j2 +1 -0
- sky/templates/websocket_proxy.py +212 -37
- sky/usage/usage_lib.py +31 -15
- sky/users/__init__.py +0 -0
- sky/users/model.conf +15 -0
- sky/users/permission.py +397 -0
- sky/users/rbac.py +121 -0
- sky/users/server.py +720 -0
- sky/users/token_service.py +218 -0
- sky/utils/accelerator_registry.py +35 -5
- sky/utils/admin_policy_utils.py +84 -38
- sky/utils/annotations.py +38 -5
- sky/utils/asyncio_utils.py +78 -0
- sky/utils/atomic.py +1 -1
- sky/utils/auth_utils.py +153 -0
- sky/utils/benchmark_utils.py +60 -0
- sky/utils/cli_utils/status_utils.py +159 -86
- sky/utils/cluster_utils.py +31 -9
- sky/utils/command_runner.py +354 -68
- sky/utils/command_runner.pyi +93 -3
- sky/utils/common.py +35 -8
- sky/utils/common_utils.py +314 -91
- sky/utils/config_utils.py +74 -5
- sky/utils/context.py +403 -0
- sky/utils/context_utils.py +242 -0
- sky/utils/controller_utils.py +383 -89
- sky/utils/dag_utils.py +31 -12
- sky/utils/db/__init__.py +0 -0
- sky/utils/db/db_utils.py +485 -0
- sky/utils/db/kv_cache.py +149 -0
- sky/utils/db/migration_utils.py +137 -0
- sky/utils/directory_utils.py +12 -0
- sky/utils/env_options.py +13 -0
- sky/utils/git.py +567 -0
- sky/utils/git_clone.sh +460 -0
- sky/utils/infra_utils.py +195 -0
- sky/utils/kubernetes/cleanup-tunnel.sh +62 -0
- sky/utils/kubernetes/config_map_utils.py +133 -0
- sky/utils/kubernetes/create_cluster.sh +15 -29
- sky/utils/kubernetes/delete_cluster.sh +10 -7
- sky/utils/kubernetes/deploy_ssh_node_pools.py +1177 -0
- sky/utils/kubernetes/exec_kubeconfig_converter.py +22 -31
- sky/utils/kubernetes/generate_kind_config.py +6 -66
- sky/utils/kubernetes/generate_kubeconfig.sh +4 -1
- sky/utils/kubernetes/gpu_labeler.py +18 -8
- sky/utils/kubernetes/k8s_gpu_labeler_job.yaml +2 -1
- sky/utils/kubernetes/k8s_gpu_labeler_setup.yaml +16 -16
- sky/utils/kubernetes/kubernetes_deploy_utils.py +284 -114
- sky/utils/kubernetes/rsync_helper.sh +11 -3
- sky/utils/kubernetes/ssh-tunnel.sh +379 -0
- sky/utils/kubernetes/ssh_utils.py +221 -0
- sky/utils/kubernetes_enums.py +8 -15
- sky/utils/lock_events.py +94 -0
- sky/utils/locks.py +416 -0
- sky/utils/log_utils.py +82 -107
- sky/utils/perf_utils.py +22 -0
- sky/utils/resource_checker.py +298 -0
- sky/utils/resources_utils.py +249 -32
- sky/utils/rich_utils.py +217 -39
- sky/utils/schemas.py +955 -160
- sky/utils/serialize_utils.py +16 -0
- sky/utils/status_lib.py +10 -0
- sky/utils/subprocess_utils.py +29 -15
- sky/utils/tempstore.py +70 -0
- sky/utils/thread_utils.py +91 -0
- sky/utils/timeline.py +26 -53
- sky/utils/ux_utils.py +84 -15
- sky/utils/validator.py +11 -1
- sky/utils/volume.py +165 -0
- sky/utils/yaml_utils.py +111 -0
- sky/volumes/__init__.py +13 -0
- sky/volumes/client/__init__.py +0 -0
- sky/volumes/client/sdk.py +150 -0
- sky/volumes/server/__init__.py +0 -0
- sky/volumes/server/core.py +270 -0
- sky/volumes/server/server.py +124 -0
- sky/volumes/volume.py +215 -0
- sky/workspaces/__init__.py +0 -0
- sky/workspaces/core.py +655 -0
- sky/workspaces/server.py +101 -0
- sky/workspaces/utils.py +56 -0
- sky_templates/README.md +3 -0
- sky_templates/__init__.py +3 -0
- sky_templates/ray/__init__.py +0 -0
- sky_templates/ray/start_cluster +183 -0
- sky_templates/ray/stop_cluster +75 -0
- skypilot_nightly-1.0.0.dev20251203.dist-info/METADATA +676 -0
- skypilot_nightly-1.0.0.dev20251203.dist-info/RECORD +611 -0
- {skypilot_nightly-1.0.0.dev20250502.dist-info → skypilot_nightly-1.0.0.dev20251203.dist-info}/WHEEL +1 -1
- skypilot_nightly-1.0.0.dev20251203.dist-info/top_level.txt +2 -0
- sky/benchmark/benchmark_state.py +0 -256
- sky/benchmark/benchmark_utils.py +0 -641
- sky/clouds/service_catalog/constants.py +0 -7
- sky/dashboard/out/_next/static/GWvVBSCS7FmUiVmjaL1a7/_buildManifest.js +0 -1
- sky/dashboard/out/_next/static/chunks/236-2db3ee3fba33dd9e.js +0 -6
- sky/dashboard/out/_next/static/chunks/312-c3c8845990db8ffc.js +0 -15
- sky/dashboard/out/_next/static/chunks/37-0a572fe0dbb89c4d.js +0 -6
- sky/dashboard/out/_next/static/chunks/678-206dddca808e6d16.js +0 -59
- sky/dashboard/out/_next/static/chunks/845-9e60713e0c441abc.js +0 -1
- sky/dashboard/out/_next/static/chunks/979-7bf73a4c7cea0f5c.js +0 -1
- sky/dashboard/out/_next/static/chunks/fd9d1056-2821b0f0cabcd8bd.js +0 -1
- sky/dashboard/out/_next/static/chunks/framework-87d061ee6ed71b28.js +0 -33
- sky/dashboard/out/_next/static/chunks/main-app-241eb28595532291.js +0 -1
- sky/dashboard/out/_next/static/chunks/main-e0e2335212e72357.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/_app-e6b013bc3f77ad60.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/_error-1be831200e60c5c0.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-6ac338bc2239cb45.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-f383db7389368ea7.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/clusters-a93b93e10b8b074e.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/index-f9f039532ca8cbc4.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-1c519e1afc523dc9.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/jobs-a75029b67aab6a2e.js +0 -1
- sky/dashboard/out/_next/static/chunks/webpack-830f59b8404e96b8.js +0 -1
- sky/dashboard/out/_next/static/css/c6933bbb2ce7f4dd.css +0 -3
- sky/jobs/dashboard/dashboard.py +0 -223
- sky/jobs/dashboard/static/favicon.ico +0 -0
- sky/jobs/dashboard/templates/index.html +0 -831
- sky/jobs/server/dashboard_utils.py +0 -69
- sky/skylet/providers/scp/__init__.py +0 -2
- sky/skylet/providers/scp/config.py +0 -149
- sky/skylet/providers/scp/node_provider.py +0 -578
- sky/templates/kubernetes-ssh-jump.yml.j2 +0 -94
- sky/utils/db_utils.py +0 -100
- sky/utils/kubernetes/deploy_remote_cluster.sh +0 -308
- sky/utils/kubernetes/ssh_jump_lifecycle_manager.py +0 -191
- skypilot_nightly-1.0.0.dev20250502.dist-info/METADATA +0 -361
- skypilot_nightly-1.0.0.dev20250502.dist-info/RECORD +0 -396
- skypilot_nightly-1.0.0.dev20250502.dist-info/top_level.txt +0 -1
- /sky/{clouds/service_catalog → catalog}/config.py +0 -0
- /sky/{benchmark → catalog/data_fetchers}/__init__.py +0 -0
- /sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_azure.py +0 -0
- /sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_fluidstack.py +0 -0
- /sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_ibm.py +0 -0
- /sky/{clouds/service_catalog/data_fetchers → client/cli}/__init__.py +0 -0
- /sky/dashboard/out/_next/static/{GWvVBSCS7FmUiVmjaL1a7 → 96_E2yl3QAiIJGOYCkSpB}/_ssgManifest.js +0 -0
- {skypilot_nightly-1.0.0.dev20250502.dist-info → skypilot_nightly-1.0.0.dev20251203.dist-info}/entry_points.txt +0 -0
- {skypilot_nightly-1.0.0.dev20250502.dist-info → skypilot_nightly-1.0.0.dev20251203.dist-info}/licenses/LICENSE +0 -0
|
@@ -46,7 +46,7 @@ ALL_REGIONS = [
|
|
|
46
46
|
'eu-west-1',
|
|
47
47
|
'eu-west-2',
|
|
48
48
|
'eu-south-1',
|
|
49
|
-
|
|
49
|
+
'eu-south-2',
|
|
50
50
|
'eu-west-3',
|
|
51
51
|
'eu-north-1',
|
|
52
52
|
'me-south-1',
|
|
@@ -60,6 +60,7 @@ ALL_REGIONS = [
|
|
|
60
60
|
'ap-northeast-2',
|
|
61
61
|
'ap-southeast-1',
|
|
62
62
|
'ap-southeast-2',
|
|
63
|
+
'ap-southeast-4',
|
|
63
64
|
'ap-northeast-1',
|
|
64
65
|
]
|
|
65
66
|
US_REGIONS = ['us-east-1', 'us-east-2', 'us-west-1', 'us-west-2']
|
|
@@ -67,17 +68,17 @@ US_REGIONS = ['us-east-1', 'us-east-2', 'us-west-1', 'us-west-2']
|
|
|
67
68
|
# The following columns will be included in the final catalog.
|
|
68
69
|
USEFUL_COLUMNS = [
|
|
69
70
|
'InstanceType', 'AcceleratorName', 'AcceleratorCount', 'vCPUs', 'MemoryGiB',
|
|
70
|
-
'GpuInfo', 'Price', 'SpotPrice', 'Region', 'AvailabilityZone'
|
|
71
|
+
'GpuInfo', 'Price', 'SpotPrice', 'Region', 'AvailabilityZone', 'Arch'
|
|
71
72
|
]
|
|
72
73
|
|
|
73
74
|
# NOTE: the hard-coded us-east-1 URL is not a typo. AWS pricing endpoint is
|
|
74
75
|
# only available in this region, but it serves pricing information for all
|
|
75
76
|
# regions.
|
|
76
77
|
PRICING_TABLE_URL_FMT = 'https://pricing.us-east-1.amazonaws.com/offers/v1.0/aws/AmazonEC2/current/{region}/index.csv' # pylint: disable=line-too-long
|
|
77
|
-
#
|
|
78
|
-
# the
|
|
79
|
-
#
|
|
80
|
-
|
|
78
|
+
# g6f instances have fractional GPUs, but the API returns Count: 1 under
|
|
79
|
+
# GpuInfo. However, the GPU memory is properly scaled. Taking the instance GPU
|
|
80
|
+
# divided by the total memory of an L4 will give us the fraction of the GPU.
|
|
81
|
+
L4_GPU_MEMORY = 22888
|
|
81
82
|
|
|
82
83
|
regions_enabled: Optional[Set[str]] = None
|
|
83
84
|
|
|
@@ -210,35 +211,6 @@ def _get_spot_pricing_table(region: str) -> 'pd.DataFrame':
|
|
|
210
211
|
return df
|
|
211
212
|
|
|
212
213
|
|
|
213
|
-
def _patch_p4de(region: str, df: 'pd.DataFrame',
|
|
214
|
-
pricing_df: 'pd.DataFrame') -> 'pd.DataFrame':
|
|
215
|
-
# Hardcoded patch for p4de.24xlarge, as our credentials doesn't have access
|
|
216
|
-
# to the instance type.
|
|
217
|
-
# Columns:
|
|
218
|
-
# InstanceType,AcceleratorName,AcceleratorCount,vCPUs,MemoryGiB,GpuInfo,
|
|
219
|
-
# Price,SpotPrice,Region,AvailabilityZone
|
|
220
|
-
records = []
|
|
221
|
-
for zone in df[df['Region'] == region]['AvailabilityZone'].unique():
|
|
222
|
-
records.append({
|
|
223
|
-
'InstanceType': 'p4de.24xlarge',
|
|
224
|
-
'AcceleratorName': 'A100-80GB',
|
|
225
|
-
'AcceleratorCount': 8,
|
|
226
|
-
'vCPUs': 96,
|
|
227
|
-
'MemoryGiB': 1152,
|
|
228
|
-
'GpuInfo':
|
|
229
|
-
('{\'Gpus\': [{\'Name\': \'A100-80GB\', \'Manufacturer\': '
|
|
230
|
-
'\'NVIDIA\', \'Count\': 8, \'MemoryInfo\': {\'SizeInMiB\': '
|
|
231
|
-
'81920}}], \'TotalGpuMemoryInMiB\': 655360}'),
|
|
232
|
-
'AvailabilityZone': zone,
|
|
233
|
-
'Region': region,
|
|
234
|
-
'Price': pricing_df[pricing_df['InstanceType'] == 'p4de.24xlarge']
|
|
235
|
-
['Price'].values[0],
|
|
236
|
-
'SpotPrice': np.nan,
|
|
237
|
-
})
|
|
238
|
-
df = pd.concat([df, pd.DataFrame.from_records(records)])
|
|
239
|
-
return df
|
|
240
|
-
|
|
241
|
-
|
|
242
214
|
def _get_instance_types_df(region: str) -> Union[str, 'pd.DataFrame']:
|
|
243
215
|
try:
|
|
244
216
|
# Fetch the zone info first to make sure the account has access to the
|
|
@@ -262,7 +234,7 @@ def _get_instance_types_df(region: str) -> Union[str, 'pd.DataFrame']:
|
|
|
262
234
|
def get_acc_info(row) -> Tuple[Optional[str], float]:
|
|
263
235
|
accelerator = None
|
|
264
236
|
for col, info_key in [('GpuInfo', 'Gpus'),
|
|
265
|
-
('
|
|
237
|
+
('NeuronInfo', 'NeuronDevices'),
|
|
266
238
|
('FpgaInfo', 'Fpgas')]:
|
|
267
239
|
info = row.get(col)
|
|
268
240
|
if isinstance(info, dict):
|
|
@@ -271,13 +243,24 @@ def _get_instance_types_df(region: str) -> Union[str, 'pd.DataFrame']:
|
|
|
271
243
|
return None, np.nan
|
|
272
244
|
return accelerator['Name'], accelerator['Count']
|
|
273
245
|
|
|
246
|
+
def get_arch(row) -> Optional[str]:
|
|
247
|
+
if 'ProcessorInfo' in row:
|
|
248
|
+
processor = row['ProcessorInfo']
|
|
249
|
+
if 'SupportedArchitectures' in processor:
|
|
250
|
+
archs = processor['SupportedArchitectures']
|
|
251
|
+
if isinstance(archs, list):
|
|
252
|
+
return archs[0]
|
|
253
|
+
elif isinstance(archs, str):
|
|
254
|
+
return archs
|
|
255
|
+
return None
|
|
256
|
+
|
|
274
257
|
def get_vcpus(row) -> float:
|
|
275
258
|
if not np.isnan(row['vCPU']):
|
|
276
259
|
return float(row['vCPU'])
|
|
277
260
|
try:
|
|
278
261
|
return float(row['VCpuInfo']['DefaultVCpus'])
|
|
279
262
|
except Exception as e: # pylint: disable=broad-except
|
|
280
|
-
print('Error
|
|
263
|
+
print('Error occurred for row:', row)
|
|
281
264
|
print('Error:', e)
|
|
282
265
|
raise
|
|
283
266
|
|
|
@@ -295,29 +278,33 @@ def _get_instance_types_df(region: str) -> Union[str, 'pd.DataFrame']:
|
|
|
295
278
|
if row['InstanceType'] == 'p4de.24xlarge':
|
|
296
279
|
acc_name = 'A100-80GB'
|
|
297
280
|
acc_count = 8
|
|
298
|
-
if row['InstanceType'].startswith('trn1'):
|
|
299
|
-
# Trainium instances does not have a field for information of
|
|
300
|
-
# the accelerators. We need to infer the accelerator info from
|
|
301
|
-
# the instance type name.
|
|
302
|
-
# aws ec2 describe-instance-types --region us-east-1
|
|
303
|
-
# https://aws.amazon.com/ec2/instance-types/trn1/
|
|
304
|
-
acc_name = 'Trainium'
|
|
305
|
-
find_num_in_name = re.search(r'(\d+)xlarge',
|
|
306
|
-
row['InstanceType'])
|
|
307
|
-
assert find_num_in_name is not None, row['InstanceType']
|
|
308
|
-
num_in_name = find_num_in_name.group(1)
|
|
309
|
-
acc_count = int(num_in_name) // 2
|
|
310
281
|
if row['InstanceType'] == 'p5en.48xlarge':
|
|
311
282
|
# TODO(andyl): Check if this workaround still needed after
|
|
312
283
|
# v0.10.0 released. Currently, the acc_name returned by the
|
|
313
284
|
# AWS API is 'NVIDIA', which is incorrect. See #4652.
|
|
314
285
|
acc_name = 'H200'
|
|
315
286
|
acc_count = 8
|
|
287
|
+
if (row['InstanceType'].startswith('g6f') or
|
|
288
|
+
row['InstanceType'].startswith('gr6f')):
|
|
289
|
+
# These instance actually have only fractional GPUs, but the API
|
|
290
|
+
# returns Count: 1 or Count: 0 under GpuInfo. We need to
|
|
291
|
+
# directly check the GPU memory to get the actual fraction of
|
|
292
|
+
# the GPU. Note that TotalGpuMemoryInMiB seems unreliable here -
|
|
293
|
+
# sometimes it is unexpectedly 0.
|
|
294
|
+
# See also Standard_NV{vcpu}ads_A10_v5 support on Azure.
|
|
295
|
+
assert len(row['GpuInfo']['Gpus']) == 1
|
|
296
|
+
assert row['GpuInfo']['Gpus'][0]['Name'] == 'L4'
|
|
297
|
+
fraction = row['GpuInfo']['Gpus'][0]['MemoryInfo'][
|
|
298
|
+
'SizeInMiB'] / L4_GPU_MEMORY
|
|
299
|
+
acc_count = round(fraction, 3)
|
|
300
|
+
if row['InstanceType'] == 'p5.4xlarge':
|
|
301
|
+
acc_count = 1
|
|
316
302
|
return pd.Series({
|
|
317
303
|
'AcceleratorName': acc_name,
|
|
318
304
|
'AcceleratorCount': acc_count,
|
|
319
305
|
'vCPUs': get_vcpus(row),
|
|
320
306
|
'MemoryGiB': get_memory_gib(row),
|
|
307
|
+
'Arch': get_arch(row),
|
|
321
308
|
})
|
|
322
309
|
|
|
323
310
|
# The AWS API may not have all the instance types in the pricing table,
|
|
@@ -341,11 +328,21 @@ def _get_instance_types_df(region: str) -> Union[str, 'pd.DataFrame']:
|
|
|
341
328
|
df = pd.concat(
|
|
342
329
|
[df, df.apply(get_additional_columns, axis='columns')],
|
|
343
330
|
axis='columns')
|
|
344
|
-
# patch the df for p4de.24xlarge
|
|
345
|
-
if region in P4DE_REGIONS:
|
|
346
|
-
df = _patch_p4de(region, df, pricing_df)
|
|
347
331
|
if 'GpuInfo' not in df.columns:
|
|
348
332
|
df['GpuInfo'] = np.nan
|
|
333
|
+
if 'NeuronInfo' in df.columns:
|
|
334
|
+
# The AWS Neuron API uses 'NeuronDevices' instead of 'Gpus'
|
|
335
|
+
# in its dict; for consistency with GPU handling, rename key.
|
|
336
|
+
def map_neuroninfo(neuroninfo):
|
|
337
|
+
if isinstance(neuroninfo,
|
|
338
|
+
dict) and 'NeuronDevices' in neuroninfo:
|
|
339
|
+
# Rename 'NeuronDevices' to 'Gpus'
|
|
340
|
+
neuroninfo = neuroninfo.copy()
|
|
341
|
+
neuroninfo['Gpus'] = neuroninfo.pop('NeuronDevices')
|
|
342
|
+
return neuroninfo
|
|
343
|
+
|
|
344
|
+
df['NeuronInfo'] = df['NeuronInfo'].apply(map_neuroninfo)
|
|
345
|
+
df['GpuInfo'] = df['GpuInfo'].fillna(df['NeuronInfo'])
|
|
349
346
|
df = df[USEFUL_COLUMNS]
|
|
350
347
|
except Exception as e: # pylint: disable=broad-except
|
|
351
348
|
print(traceback.format_exc())
|
|
@@ -393,44 +390,70 @@ def get_all_regions_instance_types_df(regions: Set[str]) -> 'pd.DataFrame':
|
|
|
393
390
|
# TODO(tian): find out the driver version.
|
|
394
391
|
# Neuron driver:
|
|
395
392
|
_GPU_DESC_UBUNTU_DATE = [
|
|
396
|
-
('
|
|
397
|
-
('gpu', 'AMI GPU PyTorch 1.10.0', '18.04', '20221114'),
|
|
398
|
-
('k80', 'AMI GPU PyTorch 1.10.0', '20.04', '20211208'),
|
|
399
|
-
('k80', 'AMI GPU PyTorch 1.10.0', '18.04', '20211208'),
|
|
400
|
-
('neuron', 'Base Neuron AMI', '22.04', '20240923'),
|
|
393
|
+
('neuron', '/aws/service/neuron/dlami/multi-framework', '22.04'),
|
|
401
394
|
]
|
|
402
395
|
|
|
403
396
|
|
|
404
|
-
def
|
|
405
|
-
|
|
397
|
+
def _fetch_image_creation_date(region: str,
|
|
398
|
+
image_id: Optional[str]) -> Optional[str]:
|
|
399
|
+
if image_id is None:
|
|
400
|
+
return None
|
|
406
401
|
try:
|
|
407
402
|
image = subprocess.check_output(f"""\
|
|
408
|
-
aws ec2 describe-images --region {region} --
|
|
409
|
-
--
|
|
410
|
-
'Name=state,Values=available' --query 'Images[:1].ImageId' --output text
|
|
403
|
+
aws ec2 describe-images --region {region} --image-ids {image_id} \\
|
|
404
|
+
--query 'Images[0].Name' --output text
|
|
411
405
|
""",
|
|
412
406
|
shell=True)
|
|
413
407
|
except subprocess.CalledProcessError as e:
|
|
414
|
-
print(f'Failed {region}, {
|
|
415
|
-
f'{creation_date}. Trying next date.')
|
|
408
|
+
print(f'Failed to fetch image creation date for {region}, {image_id}')
|
|
416
409
|
print(f'{type(e)}: {e}')
|
|
417
410
|
image_id = None
|
|
411
|
+
else:
|
|
412
|
+
assert image is not None
|
|
413
|
+
image_name = image.decode('utf-8').strip()
|
|
414
|
+
match = re.search(r'(\d+)$', image_name)
|
|
415
|
+
if match:
|
|
416
|
+
return match.group(1)
|
|
417
|
+
return None
|
|
418
|
+
|
|
419
|
+
|
|
420
|
+
def _fetch_image_id_from_ssm_param(
|
|
421
|
+
region: str,
|
|
422
|
+
ssm_prefix: str,
|
|
423
|
+
ubuntu_version: str = '22.04') -> Optional[str]:
|
|
424
|
+
try:
|
|
425
|
+
image = subprocess.check_output(f"""\
|
|
426
|
+
aws ssm get-parameter --region {region} --name "{ssm_prefix}/ubuntu-{ubuntu_version}/latest/image_id" \\
|
|
427
|
+
--query 'Parameter.Value' --output text
|
|
428
|
+
""",
|
|
429
|
+
shell=True)
|
|
430
|
+
except subprocess.CalledProcessError as e:
|
|
431
|
+
print(
|
|
432
|
+
f'Failed to fetch image ID from SSM parameter for {region}, {ssm_prefix}, {ubuntu_version}'
|
|
433
|
+
)
|
|
434
|
+
print(f'{type(e)}: {e}')
|
|
435
|
+
return None
|
|
418
436
|
else:
|
|
419
437
|
assert image is not None
|
|
420
438
|
image_id = image.decode('utf-8').strip()
|
|
421
439
|
return image_id
|
|
422
440
|
|
|
423
441
|
|
|
424
|
-
def _get_image_row(
|
|
425
|
-
|
|
426
|
-
|
|
427
|
-
|
|
428
|
-
|
|
429
|
-
|
|
430
|
-
|
|
431
|
-
|
|
442
|
+
def _get_image_row(
|
|
443
|
+
region: str,
|
|
444
|
+
gpu: str,
|
|
445
|
+
ssm_prefix: str,
|
|
446
|
+
ubuntu_version: str = '22.04'
|
|
447
|
+
) -> Tuple[str, str, str, str, Optional[str], Optional[str]]:
|
|
448
|
+
print(f'Getting image for {region}, {ssm_prefix}, {ubuntu_version}, {gpu}')
|
|
449
|
+
image_id = _fetch_image_id_from_ssm_param(region, ssm_prefix,
|
|
450
|
+
ubuntu_version)
|
|
451
|
+
if image_id is not None:
|
|
452
|
+
creation_date = _fetch_image_creation_date(region, image_id)
|
|
453
|
+
else:
|
|
454
|
+
creation_date = None
|
|
432
455
|
tag = f'skypilot:{gpu}-ubuntu-{ubuntu_version.replace(".", "")}'
|
|
433
|
-
return tag, region, 'ubuntu', ubuntu_version, image_id,
|
|
456
|
+
return tag, region, 'ubuntu', ubuntu_version, image_id, creation_date
|
|
434
457
|
|
|
435
458
|
|
|
436
459
|
def get_all_regions_images_df(regions: Set[str]) -> 'pd.DataFrame':
|
|
@@ -545,13 +568,26 @@ if __name__ == '__main__':
|
|
|
545
568
|
instance_df.to_csv('aws/vms.csv', index=False)
|
|
546
569
|
print('AWS Service Catalog saved to aws/vms.csv')
|
|
547
570
|
|
|
548
|
-
# Disable refreshing images.csv
|
|
549
|
-
#
|
|
550
|
-
#
|
|
551
|
-
|
|
552
|
-
|
|
553
|
-
#
|
|
554
|
-
|
|
571
|
+
# Disable refreshing images.csv for skypilot custom AMIs
|
|
572
|
+
# refresh only the neuron based images
|
|
573
|
+
# See sky/clouds/catalog/images/README.md for more details.
|
|
574
|
+
image_df = get_all_regions_images_df(user_regions)
|
|
575
|
+
_check_regions_integrity(image_df, 'images')
|
|
576
|
+
# filter out rows where ImageId is None
|
|
577
|
+
image_df = image_df[image_df['ImageId'].notna()]
|
|
578
|
+
|
|
579
|
+
# check if aws/images.csv exists
|
|
580
|
+
if os.path.exists('aws/images.csv'):
|
|
581
|
+
# load the data from aws/images.csv
|
|
582
|
+
existing_image_df = pd.read_csv('aws/images.csv')
|
|
583
|
+
# filter out the neuron based images
|
|
584
|
+
existing_image_df = existing_image_df[~existing_image_df['Tag'].
|
|
585
|
+
eq('skypilot:neuron-ubuntu-2204')]
|
|
586
|
+
# concat the new neuron based images with the existing images
|
|
587
|
+
image_df = pd.concat([existing_image_df, image_df])
|
|
588
|
+
|
|
589
|
+
image_df.to_csv('aws/images.csv', index=False)
|
|
590
|
+
print('AWS Images saved to aws/images.csv')
|
|
555
591
|
|
|
556
592
|
if args.az_mappings:
|
|
557
593
|
az_mappings_df = fetch_availability_zone_mappings()
|
|
@@ -9,18 +9,11 @@ import os
|
|
|
9
9
|
|
|
10
10
|
import cudo_compute
|
|
11
11
|
|
|
12
|
-
|
|
12
|
+
from sky.provision.cudo import cudo_utils as utils
|
|
13
13
|
|
|
14
14
|
VMS_CSV = 'cudo/vms.csv'
|
|
15
15
|
|
|
16
16
|
|
|
17
|
-
def cudo_api():
|
|
18
|
-
configuration = cudo_compute.Configuration()
|
|
19
|
-
configuration.host = 'https://rest.compute.cudo.org'
|
|
20
|
-
client = cudo_compute.ApiClient(configuration)
|
|
21
|
-
return cudo_compute.VirtualMachinesApi(client)
|
|
22
|
-
|
|
23
|
-
|
|
24
17
|
def get_gpu_info(count, model):
|
|
25
18
|
mem = utils.cudo_gpu_mem[model]
|
|
26
19
|
# pylint: disable=line-too-long
|
|
@@ -45,39 +38,46 @@ def get_instance_type(machine_type, vcpu, mem, gpu):
|
|
|
45
38
|
mem) + 'gb'
|
|
46
39
|
|
|
47
40
|
|
|
48
|
-
def machine_types(gpu_model, mem_gib, vcpu_count, gpu_count):
|
|
49
|
-
try:
|
|
50
|
-
api = cudo_api()
|
|
51
|
-
types = api.list_vm_machine_types(mem_gib,
|
|
52
|
-
vcpu_count,
|
|
53
|
-
gpu=gpu_count,
|
|
54
|
-
gpu_model=gpu_model)
|
|
55
|
-
return types.to_dict()
|
|
56
|
-
except cudo_compute.rest.ApiException as e:
|
|
57
|
-
raise e
|
|
58
|
-
|
|
59
|
-
|
|
60
41
|
def update_prices():
|
|
61
42
|
rows = []
|
|
43
|
+
|
|
44
|
+
api = cudo_compute.cudo_api.virtual_machines()
|
|
45
|
+
all_types = api.list_vm_machine_types2()
|
|
46
|
+
all_machine_types = all_types.to_dict()['machine_types']
|
|
47
|
+
|
|
62
48
|
for spec in utils.machine_specs:
|
|
63
|
-
|
|
64
|
-
|
|
65
|
-
|
|
66
|
-
|
|
67
|
-
|
|
68
|
-
|
|
69
|
-
|
|
70
|
-
|
|
71
|
-
|
|
72
|
-
|
|
73
|
-
|
|
74
|
-
|
|
75
|
-
|
|
76
|
-
|
|
77
|
-
|
|
78
|
-
|
|
79
|
-
|
|
80
|
-
|
|
49
|
+
for machine_type in all_machine_types:
|
|
50
|
+
if (machine_type['min_vcpu'] <= spec['vcpu'] and
|
|
51
|
+
machine_type['min_memory_gib'] <= spec['mem'] and
|
|
52
|
+
utils.gpu_exists(machine_type['gpu_model'])):
|
|
53
|
+
|
|
54
|
+
accelerator_name = utils.cudo_gpu_to_skypilot_gpu(
|
|
55
|
+
machine_type['gpu_model'])
|
|
56
|
+
|
|
57
|
+
# Calculate total price per hour based on the given spec
|
|
58
|
+
vcpu_price = float(
|
|
59
|
+
machine_type['vcpu_price_hr']['value']) * spec['vcpu']
|
|
60
|
+
memory_price = float(
|
|
61
|
+
machine_type['memory_gib_price_hr']['value']) * spec['mem']
|
|
62
|
+
gpu_price = float(
|
|
63
|
+
machine_type['gpu_price_hr']['value']) * spec['gpu']
|
|
64
|
+
# Note: Not including storage and IPv4 prices
|
|
65
|
+
# for now as they may be optional
|
|
66
|
+
total_price = vcpu_price + memory_price + gpu_price
|
|
67
|
+
|
|
68
|
+
row = {
|
|
69
|
+
'instance_type': get_instance_type(
|
|
70
|
+
machine_type['machine_type'], spec['vcpu'], spec['mem'],
|
|
71
|
+
spec['gpu']),
|
|
72
|
+
'accelerator_name': accelerator_name,
|
|
73
|
+
'accelerator_count': str(spec['gpu']) + '.0',
|
|
74
|
+
'vcpus': str(spec['vcpu']),
|
|
75
|
+
'memory_gib': str(spec['mem']),
|
|
76
|
+
'price': str(total_price),
|
|
77
|
+
'region': machine_type['data_center_id'],
|
|
78
|
+
'gpu_info': get_gpu_info(spec['gpu'], accelerator_name),
|
|
79
|
+
}
|
|
80
|
+
rows.append(row)
|
|
81
81
|
path = VMS_CSV
|
|
82
82
|
with open(path, 'w', encoding='utf-8') as file:
|
|
83
83
|
file.write(
|
|
@@ -179,9 +179,13 @@ TPU_V4_HOST_DF = pd.read_csv(
|
|
|
179
179
|
# TODO(woosuk): Make this more robust.
|
|
180
180
|
# Refer to: https://github.com/skypilot-org/skypilot/issues/1006
|
|
181
181
|
# Unsupported Series: 'f1', 'm2'
|
|
182
|
-
|
|
182
|
+
SERIES_TO_DESCRIPTION = {
|
|
183
183
|
'a2': 'A2 Instance',
|
|
184
184
|
'a3': 'A3 Instance',
|
|
185
|
+
# NOTE: GCP does not provide separate CPU/RAM pricing for A4 instances.
|
|
186
|
+
# The B200 GPU pricing includes the full VM cost. See special handling in
|
|
187
|
+
# get_vm_price() which sets A4 VM price to 0.
|
|
188
|
+
'a4': 'A4 Instance',
|
|
185
189
|
'c2': 'Compute optimized',
|
|
186
190
|
'c2d': 'C2D AMD Instance',
|
|
187
191
|
'c3': 'C3 Instance',
|
|
@@ -195,9 +199,11 @@ SERIES_TO_DISCRIPTION = {
|
|
|
195
199
|
'n1': 'N1 Predefined Instance',
|
|
196
200
|
'n2': 'N2 Instance',
|
|
197
201
|
'n2d': 'N2D AMD Instance',
|
|
202
|
+
'n4': 'N4 Instance',
|
|
198
203
|
't2a': 'T2A Arm Instance',
|
|
199
204
|
't2d': 'T2D AMD Instance',
|
|
200
205
|
}
|
|
206
|
+
|
|
201
207
|
creds, project_id = google.auth.default()
|
|
202
208
|
gcp_client = discovery.build('compute', 'v1')
|
|
203
209
|
tpu_client = discovery.build('tpu', 'v1')
|
|
@@ -334,7 +340,7 @@ def get_vm_df(skus: List[Dict[str, Any]], region_prefix: str) -> 'pd.DataFrame':
|
|
|
334
340
|
|
|
335
341
|
# Drop the unsupported series.
|
|
336
342
|
df = df[df['InstanceType'].str.startswith(
|
|
337
|
-
tuple(f'{series}-' for series in
|
|
343
|
+
tuple(f'{series}-' for series in SERIES_TO_DESCRIPTION))]
|
|
338
344
|
df = df[~df['AvailabilityZone'].str.startswith(tuple(TPU_V4_ZONES))]
|
|
339
345
|
|
|
340
346
|
# TODO(woosuk): Make this more efficient.
|
|
@@ -352,7 +358,7 @@ def get_vm_df(skus: List[Dict[str, Any]], region_prefix: str) -> 'pd.DataFrame':
|
|
|
352
358
|
|
|
353
359
|
# Check if the SKU is for the correct series.
|
|
354
360
|
description = sku['description']
|
|
355
|
-
if
|
|
361
|
+
if SERIES_TO_DESCRIPTION[series].lower() not in description.lower():
|
|
356
362
|
continue
|
|
357
363
|
# Special check for M1 instances.
|
|
358
364
|
if series == 'm1' and 'M3' in description:
|
|
@@ -389,6 +395,15 @@ def get_vm_df(skus: List[Dict[str, Any]], region_prefix: str) -> 'pd.DataFrame':
|
|
|
389
395
|
if series in ['f1', 'g1']:
|
|
390
396
|
memory_price = 0.0
|
|
391
397
|
|
|
398
|
+
# Special case for A4 instances.
|
|
399
|
+
# GCP does not provide separate CPU/RAM pricing for A4 instances in the
|
|
400
|
+
# SKUs API. The GPU pricing (B200) includes the full VM cost.
|
|
401
|
+
# We set the VM price to 0 so the entry is not dropped, and the GPU
|
|
402
|
+
# pricing will provide the total cost.
|
|
403
|
+
if series == 'a4':
|
|
404
|
+
cpu_price = 0.0
|
|
405
|
+
memory_price = 0.0
|
|
406
|
+
|
|
392
407
|
# TODO(tian): (2024/11/10) Some SKUs are missing in the SKUs API. We
|
|
393
408
|
# skip them in the catalog for now. We should investigate why they are
|
|
394
409
|
# missing and add them back.
|
|
@@ -434,10 +449,18 @@ def _get_gpus_for_zone(zone: str) -> 'pd.DataFrame':
|
|
|
434
449
|
gpu_name = gpu_name.upper()
|
|
435
450
|
if 'H100-80GB' in gpu_name:
|
|
436
451
|
gpu_name = 'H100'
|
|
437
|
-
|
|
452
|
+
|
|
453
|
+
if 'H100-MEGA' in gpu_name:
|
|
438
454
|
gpu_name = 'H100-MEGA'
|
|
439
455
|
if count != 8:
|
|
440
|
-
|
|
456
|
+
continue
|
|
457
|
+
elif 'H200' in gpu_name:
|
|
458
|
+
gpu_name = 'H200'
|
|
459
|
+
if count != 8:
|
|
460
|
+
continue
|
|
461
|
+
elif 'B200' in gpu_name:
|
|
462
|
+
gpu_name = 'B200'
|
|
463
|
+
if count != 8:
|
|
441
464
|
continue
|
|
442
465
|
if 'VWS' in gpu_name:
|
|
443
466
|
continue
|
|
@@ -468,6 +491,8 @@ def _gpu_info_from_name(name: str) -> Optional[Dict[str, List[Dict[str, Any]]]]:
|
|
|
468
491
|
'A100': 40 * 1024,
|
|
469
492
|
'H100': 80 * 1024,
|
|
470
493
|
'H100-MEGA': 80 * 1024,
|
|
494
|
+
'H200': 141 * 1024,
|
|
495
|
+
'B200': 180 * 1024,
|
|
471
496
|
'P4': 8 * 1024,
|
|
472
497
|
'T4': 16 * 1024,
|
|
473
498
|
'V100': 16 * 1024,
|
|
@@ -507,22 +532,47 @@ def get_gpu_df(skus: List[Dict[str, Any]],
|
|
|
507
532
|
ondemand_or_spot = 'OnDemand' if not spot else 'Preemptible'
|
|
508
533
|
gpu_price = None
|
|
509
534
|
for sku in gpu_skus:
|
|
535
|
+
row_gpu_name = row['AcceleratorName']
|
|
510
536
|
if row['Region'] not in sku['serviceRegions']:
|
|
511
537
|
continue
|
|
512
|
-
|
|
538
|
+
|
|
539
|
+
# Check usageType matches, with special handling for B200 spot.
|
|
540
|
+
# GCP has a bug where some B200 spot SKUs have usageType='OnDemand'
|
|
541
|
+
# but the description contains 'Spot Preemptible'.
|
|
542
|
+
usage_type = sku['category']['usageType']
|
|
543
|
+
description = sku['description']
|
|
544
|
+
is_spot_description = 'spot preemptible' in description.lower()
|
|
545
|
+
|
|
546
|
+
if usage_type != ondemand_or_spot:
|
|
547
|
+
# For B200 spot pricing, also accept SKUs where description
|
|
548
|
+
# says "Spot Preemptible" even if usageType is wrong.
|
|
549
|
+
if not (spot and row_gpu_name == 'B200' and
|
|
550
|
+
is_spot_description):
|
|
551
|
+
continue
|
|
552
|
+
|
|
553
|
+
# For B200 on-demand, skip SKUs that are actually spot (description
|
|
554
|
+
# says "Spot Preemptible" but usageType is incorrectly 'OnDemand').
|
|
555
|
+
if not spot and row_gpu_name == 'B200' and is_spot_description:
|
|
513
556
|
continue
|
|
514
557
|
|
|
515
|
-
gpu_names = [
|
|
516
|
-
if
|
|
517
|
-
gpu_names = ['A100 80GB']
|
|
518
|
-
|
|
519
|
-
gpu_names = ['H100 80GB']
|
|
520
|
-
|
|
558
|
+
gpu_names = [f'{row_gpu_name} GPU']
|
|
559
|
+
if row_gpu_name == 'A100-80GB':
|
|
560
|
+
gpu_names = ['A100 80GB GPU']
|
|
561
|
+
elif row_gpu_name == 'H100':
|
|
562
|
+
gpu_names = ['H100 80GB GPU']
|
|
563
|
+
elif row_gpu_name == 'H100-MEGA':
|
|
521
564
|
# Seems that H100-MEGA has two different descriptions in SKUs in
|
|
522
565
|
# different regions: 'H100 80GB Mega' and 'H100 80GB Plus'.
|
|
523
|
-
gpu_names = [
|
|
524
|
-
|
|
525
|
-
|
|
566
|
+
gpu_names = [
|
|
567
|
+
'H100 80GB Mega GPU', 'H100 Mega 80GB GPU',
|
|
568
|
+
'H100 80GB Plus GPU'
|
|
569
|
+
]
|
|
570
|
+
elif row_gpu_name == 'H200':
|
|
571
|
+
gpu_names = ['H200 141GB GPU']
|
|
572
|
+
elif row_gpu_name == 'B200':
|
|
573
|
+
gpu_names = ['Nvidia B200 (1 gpu slice)']
|
|
574
|
+
if not any(
|
|
575
|
+
gpu_name in sku['description'] for gpu_name in gpu_names):
|
|
526
576
|
continue
|
|
527
577
|
|
|
528
578
|
unit_price = _get_unit_price(sku)
|
|
@@ -554,7 +604,7 @@ def _get_tpu_response_for_zone(zone: str) -> list:
|
|
|
554
604
|
# Sometimes the response is empty ({}) even for enabled zones. Here we
|
|
555
605
|
# retry the request for a few times.
|
|
556
606
|
backoff = common_utils.Backoff(initial_backoff=1)
|
|
557
|
-
for
|
|
607
|
+
for retry_cnt in range(TPU_RETRY_CNT):
|
|
558
608
|
tpus_request = (
|
|
559
609
|
tpu_client.projects().locations().acceleratorTypes().list(
|
|
560
610
|
parent=parent))
|
|
@@ -570,6 +620,10 @@ def _get_tpu_response_for_zone(zone: str) -> list:
|
|
|
570
620
|
print(f' An error occurred: {error}')
|
|
571
621
|
# If error happens, fail early.
|
|
572
622
|
return []
|
|
623
|
+
except TimeoutError:
|
|
624
|
+
print(f' TimeoutError: Failed to fetch TPUs for zone {zone!r}, '
|
|
625
|
+
f'retry {retry_cnt + 1} of {TPU_RETRY_CNT}')
|
|
626
|
+
|
|
573
627
|
time_to_sleep = backoff.current_backoff()
|
|
574
628
|
print(f' Retry zone {zone!r} in {time_to_sleep} seconds...')
|
|
575
629
|
time.sleep(time_to_sleep)
|