skypilot-nightly 1.0.0.dev20250905__py3-none-any.whl → 1.0.0.dev20251203__py3-none-any.whl
This diff compares the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
- sky/__init__.py +10 -2
- sky/adaptors/aws.py +81 -16
- sky/adaptors/common.py +25 -2
- sky/adaptors/coreweave.py +278 -0
- sky/adaptors/do.py +8 -2
- sky/adaptors/gcp.py +11 -0
- sky/adaptors/ibm.py +5 -2
- sky/adaptors/kubernetes.py +64 -0
- sky/adaptors/nebius.py +3 -1
- sky/adaptors/primeintellect.py +1 -0
- sky/adaptors/seeweb.py +183 -0
- sky/adaptors/shadeform.py +89 -0
- sky/admin_policy.py +20 -0
- sky/authentication.py +157 -263
- sky/backends/__init__.py +3 -2
- sky/backends/backend.py +11 -3
- sky/backends/backend_utils.py +588 -184
- sky/backends/cloud_vm_ray_backend.py +1088 -904
- sky/backends/local_docker_backend.py +9 -5
- sky/backends/task_codegen.py +633 -0
- sky/backends/wheel_utils.py +18 -0
- sky/catalog/__init__.py +8 -0
- sky/catalog/aws_catalog.py +4 -0
- sky/catalog/common.py +19 -1
- sky/catalog/data_fetchers/fetch_aws.py +102 -80
- sky/catalog/data_fetchers/fetch_gcp.py +30 -3
- sky/catalog/data_fetchers/fetch_nebius.py +9 -6
- sky/catalog/data_fetchers/fetch_runpod.py +698 -0
- sky/catalog/data_fetchers/fetch_seeweb.py +329 -0
- sky/catalog/data_fetchers/fetch_shadeform.py +142 -0
- sky/catalog/kubernetes_catalog.py +24 -28
- sky/catalog/primeintellect_catalog.py +95 -0
- sky/catalog/runpod_catalog.py +5 -1
- sky/catalog/seeweb_catalog.py +184 -0
- sky/catalog/shadeform_catalog.py +165 -0
- sky/check.py +73 -43
- sky/client/cli/command.py +675 -412
- sky/client/cli/flags.py +4 -2
- sky/{volumes/utils.py → client/cli/table_utils.py} +111 -13
- sky/client/cli/utils.py +79 -0
- sky/client/common.py +12 -2
- sky/client/sdk.py +132 -63
- sky/client/sdk_async.py +34 -33
- sky/cloud_stores.py +82 -3
- sky/clouds/__init__.py +6 -0
- sky/clouds/aws.py +337 -129
- sky/clouds/azure.py +24 -18
- sky/clouds/cloud.py +40 -13
- sky/clouds/cudo.py +16 -13
- sky/clouds/do.py +9 -7
- sky/clouds/fluidstack.py +12 -5
- sky/clouds/gcp.py +14 -7
- sky/clouds/hyperbolic.py +12 -5
- sky/clouds/ibm.py +12 -5
- sky/clouds/kubernetes.py +80 -45
- sky/clouds/lambda_cloud.py +12 -5
- sky/clouds/nebius.py +23 -9
- sky/clouds/oci.py +19 -12
- sky/clouds/paperspace.py +4 -1
- sky/clouds/primeintellect.py +317 -0
- sky/clouds/runpod.py +85 -24
- sky/clouds/scp.py +12 -8
- sky/clouds/seeweb.py +477 -0
- sky/clouds/shadeform.py +400 -0
- sky/clouds/ssh.py +4 -2
- sky/clouds/utils/scp_utils.py +61 -50
- sky/clouds/vast.py +33 -27
- sky/clouds/vsphere.py +14 -16
- sky/core.py +174 -165
- sky/dashboard/out/404.html +1 -1
- sky/dashboard/out/_next/static/96_E2yl3QAiIJGOYCkSpB/_buildManifest.js +1 -0
- sky/dashboard/out/_next/static/chunks/1141-e6aa9ab418717c59.js +11 -0
- sky/dashboard/out/_next/static/chunks/1871-7e202677c42f43fe.js +6 -0
- sky/dashboard/out/_next/static/chunks/2260-7703229c33c5ebd5.js +1 -0
- sky/dashboard/out/_next/static/chunks/2369.fc20f0c2c8ed9fe7.js +15 -0
- sky/dashboard/out/_next/static/chunks/2755.edd818326d489a1d.js +26 -0
- sky/dashboard/out/_next/static/chunks/3294.20a8540fe697d5ee.js +1 -0
- sky/dashboard/out/_next/static/chunks/3785.7e245f318f9d1121.js +1 -0
- sky/dashboard/out/_next/static/chunks/{6601-06114c982db410b6.js → 3800-7b45f9fbb6308557.js} +1 -1
- sky/dashboard/out/_next/static/chunks/4725.172ede95d1b21022.js +1 -0
- sky/dashboard/out/_next/static/chunks/4937.a2baa2df5572a276.js +15 -0
- sky/dashboard/out/_next/static/chunks/6212-7bd06f60ba693125.js +13 -0
- sky/dashboard/out/_next/static/chunks/6856-8f27d1c10c98def8.js +1 -0
- sky/dashboard/out/_next/static/chunks/6990-9146207c4567fdfd.js +1 -0
- sky/dashboard/out/_next/static/chunks/7359-c8d04e06886000b3.js +30 -0
- sky/dashboard/out/_next/static/chunks/7615-019513abc55b3b47.js +1 -0
- sky/dashboard/out/_next/static/chunks/8640.5b9475a2d18c5416.js +16 -0
- sky/dashboard/out/_next/static/chunks/8969-452f9d5cbdd2dc73.js +1 -0
- sky/dashboard/out/_next/static/chunks/9025.fa408f3242e9028d.js +6 -0
- sky/dashboard/out/_next/static/chunks/9353-cff34f7e773b2e2b.js +1 -0
- sky/dashboard/out/_next/static/chunks/9360.a536cf6b1fa42355.js +31 -0
- sky/dashboard/out/_next/static/chunks/9847.3aaca6bb33455140.js +30 -0
- sky/dashboard/out/_next/static/chunks/pages/{_app-ce361c6959bc2001.js → _app-bde01e4a2beec258.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-792db96d918c98c9.js +16 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-abfcac9c137aa543.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters-ee39056f9851a3ff.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/infra/{[context]-6563820e094f68ca.js → [context]-c0b5935149902e6f.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/{infra-aabba60d57826e0f.js → infra-aed0ea19df7cf961.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-d66997e2bfc837cf.js +16 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs/pools/[pool]-9faf940b253e3e06.js +21 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs-2072b48b617989c9.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/users-f42674164aa73423.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/volumes-b84b948ff357c43e.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/workspaces/{[name]-af76bb06dbb3954f.js → [name]-84a40f8c7c627fe4.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/{workspaces-7598c33a746cdc91.js → workspaces-531b2f8c4bf89f82.js} +1 -1
- sky/dashboard/out/_next/static/chunks/webpack-64e05f17bf2cf8ce.js +1 -0
- sky/dashboard/out/_next/static/css/0748ce22df867032.css +3 -0
- sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
- sky/dashboard/out/clusters/[cluster].html +1 -1
- sky/dashboard/out/clusters.html +1 -1
- sky/dashboard/out/config.html +1 -1
- sky/dashboard/out/index.html +1 -1
- sky/dashboard/out/infra/[context].html +1 -1
- sky/dashboard/out/infra.html +1 -1
- sky/dashboard/out/jobs/[job].html +1 -1
- sky/dashboard/out/jobs/pools/[pool].html +1 -1
- sky/dashboard/out/jobs.html +1 -1
- sky/dashboard/out/users.html +1 -1
- sky/dashboard/out/volumes.html +1 -1
- sky/dashboard/out/workspace/new.html +1 -1
- sky/dashboard/out/workspaces/[name].html +1 -1
- sky/dashboard/out/workspaces.html +1 -1
- sky/data/data_utils.py +92 -1
- sky/data/mounting_utils.py +162 -29
- sky/data/storage.py +200 -19
- sky/data/storage_utils.py +10 -45
- sky/exceptions.py +18 -7
- sky/execution.py +74 -31
- sky/global_user_state.py +605 -191
- sky/jobs/__init__.py +2 -0
- sky/jobs/client/sdk.py +101 -4
- sky/jobs/client/sdk_async.py +31 -5
- sky/jobs/constants.py +15 -8
- sky/jobs/controller.py +726 -284
- sky/jobs/file_content_utils.py +128 -0
- sky/jobs/log_gc.py +193 -0
- sky/jobs/recovery_strategy.py +250 -100
- sky/jobs/scheduler.py +271 -173
- sky/jobs/server/core.py +367 -114
- sky/jobs/server/server.py +81 -35
- sky/jobs/server/utils.py +89 -35
- sky/jobs/state.py +1498 -620
- sky/jobs/utils.py +771 -306
- sky/logs/agent.py +40 -5
- sky/logs/aws.py +9 -19
- sky/metrics/utils.py +282 -39
- sky/optimizer.py +1 -1
- sky/provision/__init__.py +37 -1
- sky/provision/aws/config.py +34 -13
- sky/provision/aws/instance.py +5 -2
- sky/provision/azure/instance.py +5 -3
- sky/provision/common.py +2 -0
- sky/provision/cudo/instance.py +4 -3
- sky/provision/do/instance.py +4 -3
- sky/provision/docker_utils.py +97 -26
- sky/provision/fluidstack/instance.py +6 -5
- sky/provision/gcp/config.py +6 -1
- sky/provision/gcp/instance.py +4 -2
- sky/provision/hyperbolic/instance.py +4 -2
- sky/provision/instance_setup.py +66 -20
- sky/provision/kubernetes/__init__.py +2 -0
- sky/provision/kubernetes/config.py +7 -44
- sky/provision/kubernetes/constants.py +0 -1
- sky/provision/kubernetes/instance.py +609 -213
- sky/provision/kubernetes/manifests/fusermount-server-daemonset.yaml +1 -2
- sky/provision/kubernetes/network.py +12 -8
- sky/provision/kubernetes/network_utils.py +8 -25
- sky/provision/kubernetes/utils.py +382 -418
- sky/provision/kubernetes/volume.py +150 -18
- sky/provision/lambda_cloud/instance.py +16 -13
- sky/provision/nebius/instance.py +6 -2
- sky/provision/nebius/utils.py +103 -86
- sky/provision/oci/instance.py +4 -2
- sky/provision/paperspace/instance.py +4 -3
- sky/provision/primeintellect/__init__.py +10 -0
- sky/provision/primeintellect/config.py +11 -0
- sky/provision/primeintellect/instance.py +454 -0
- sky/provision/primeintellect/utils.py +398 -0
- sky/provision/provisioner.py +30 -9
- sky/provision/runpod/__init__.py +2 -0
- sky/provision/runpod/instance.py +4 -3
- sky/provision/runpod/volume.py +69 -13
- sky/provision/scp/instance.py +307 -130
- sky/provision/seeweb/__init__.py +11 -0
- sky/provision/seeweb/config.py +13 -0
- sky/provision/seeweb/instance.py +812 -0
- sky/provision/shadeform/__init__.py +11 -0
- sky/provision/shadeform/config.py +12 -0
- sky/provision/shadeform/instance.py +351 -0
- sky/provision/shadeform/shadeform_utils.py +83 -0
- sky/provision/vast/instance.py +5 -3
- sky/provision/volume.py +164 -0
- sky/provision/vsphere/common/ssl_helper.py +1 -1
- sky/provision/vsphere/common/vapiconnect.py +2 -1
- sky/provision/vsphere/common/vim_utils.py +3 -2
- sky/provision/vsphere/instance.py +8 -6
- sky/provision/vsphere/vsphere_utils.py +8 -1
- sky/resources.py +11 -3
- sky/schemas/api/responses.py +107 -6
- sky/schemas/db/global_user_state/008_skylet_ssh_tunnel_metadata.py +34 -0
- sky/schemas/db/global_user_state/009_last_activity_and_launched_at.py +89 -0
- sky/schemas/db/global_user_state/010_save_ssh_key.py +66 -0
- sky/schemas/db/global_user_state/011_is_ephemeral.py +34 -0
- sky/schemas/db/kv_cache/001_initial_schema.py +29 -0
- sky/schemas/db/serve_state/002_yaml_content.py +34 -0
- sky/schemas/db/skypilot_config/001_initial_schema.py +30 -0
- sky/schemas/db/spot_jobs/002_cluster_pool.py +3 -3
- sky/schemas/db/spot_jobs/004_job_file_contents.py +42 -0
- sky/schemas/db/spot_jobs/005_logs_gc.py +38 -0
- sky/schemas/db/spot_jobs/006_controller_pid_started_at.py +34 -0
- sky/schemas/db/spot_jobs/007_config_file_content.py +34 -0
- sky/schemas/generated/jobsv1_pb2.py +86 -0
- sky/schemas/generated/jobsv1_pb2.pyi +254 -0
- sky/schemas/generated/jobsv1_pb2_grpc.py +542 -0
- sky/schemas/generated/managed_jobsv1_pb2.py +76 -0
- sky/schemas/generated/managed_jobsv1_pb2.pyi +278 -0
- sky/schemas/generated/managed_jobsv1_pb2_grpc.py +278 -0
- sky/schemas/generated/servev1_pb2.py +58 -0
- sky/schemas/generated/servev1_pb2.pyi +115 -0
- sky/schemas/generated/servev1_pb2_grpc.py +322 -0
- sky/serve/autoscalers.py +2 -0
- sky/serve/client/impl.py +55 -21
- sky/serve/constants.py +4 -3
- sky/serve/controller.py +17 -11
- sky/serve/load_balancing_policies.py +1 -1
- sky/serve/replica_managers.py +219 -142
- sky/serve/serve_rpc_utils.py +179 -0
- sky/serve/serve_state.py +63 -54
- sky/serve/serve_utils.py +145 -109
- sky/serve/server/core.py +46 -25
- sky/serve/server/impl.py +311 -162
- sky/serve/server/server.py +21 -19
- sky/serve/service.py +84 -68
- sky/serve/service_spec.py +45 -7
- sky/server/auth/loopback.py +38 -0
- sky/server/auth/oauth2_proxy.py +12 -7
- sky/server/common.py +47 -24
- sky/server/config.py +62 -28
- sky/server/constants.py +9 -1
- sky/server/daemons.py +109 -38
- sky/server/metrics.py +76 -96
- sky/server/middleware_utils.py +166 -0
- sky/server/requests/executor.py +381 -145
- sky/server/requests/payloads.py +71 -18
- sky/server/requests/preconditions.py +15 -13
- sky/server/requests/request_names.py +121 -0
- sky/server/requests/requests.py +507 -157
- sky/server/requests/serializers/decoders.py +48 -17
- sky/server/requests/serializers/encoders.py +85 -20
- sky/server/requests/threads.py +117 -0
- sky/server/rest.py +116 -24
- sky/server/server.py +420 -172
- sky/server/stream_utils.py +219 -45
- sky/server/uvicorn.py +30 -19
- sky/setup_files/MANIFEST.in +6 -1
- sky/setup_files/alembic.ini +8 -0
- sky/setup_files/dependencies.py +62 -19
- sky/setup_files/setup.py +44 -44
- sky/sky_logging.py +13 -5
- sky/skylet/attempt_skylet.py +106 -24
- sky/skylet/configs.py +3 -1
- sky/skylet/constants.py +111 -26
- sky/skylet/events.py +64 -10
- sky/skylet/job_lib.py +141 -104
- sky/skylet/log_lib.py +233 -5
- sky/skylet/log_lib.pyi +40 -2
- sky/skylet/providers/ibm/node_provider.py +12 -8
- sky/skylet/providers/ibm/vpc_provider.py +13 -12
- sky/skylet/runtime_utils.py +21 -0
- sky/skylet/services.py +524 -0
- sky/skylet/skylet.py +22 -1
- sky/skylet/subprocess_daemon.py +104 -29
- sky/skypilot_config.py +99 -79
- sky/ssh_node_pools/server.py +9 -8
- sky/task.py +221 -104
- sky/templates/aws-ray.yml.j2 +1 -0
- sky/templates/azure-ray.yml.j2 +1 -0
- sky/templates/cudo-ray.yml.j2 +1 -0
- sky/templates/do-ray.yml.j2 +1 -0
- sky/templates/fluidstack-ray.yml.j2 +1 -0
- sky/templates/gcp-ray.yml.j2 +1 -0
- sky/templates/hyperbolic-ray.yml.j2 +1 -0
- sky/templates/ibm-ray.yml.j2 +2 -1
- sky/templates/jobs-controller.yaml.j2 +3 -0
- sky/templates/kubernetes-ray.yml.j2 +196 -55
- sky/templates/lambda-ray.yml.j2 +1 -0
- sky/templates/nebius-ray.yml.j2 +3 -0
- sky/templates/oci-ray.yml.j2 +1 -0
- sky/templates/paperspace-ray.yml.j2 +1 -0
- sky/templates/primeintellect-ray.yml.j2 +72 -0
- sky/templates/runpod-ray.yml.j2 +1 -0
- sky/templates/scp-ray.yml.j2 +1 -0
- sky/templates/seeweb-ray.yml.j2 +171 -0
- sky/templates/shadeform-ray.yml.j2 +73 -0
- sky/templates/vast-ray.yml.j2 +1 -0
- sky/templates/vsphere-ray.yml.j2 +1 -0
- sky/templates/websocket_proxy.py +188 -43
- sky/usage/usage_lib.py +16 -4
- sky/users/permission.py +60 -43
- sky/utils/accelerator_registry.py +6 -3
- sky/utils/admin_policy_utils.py +18 -5
- sky/utils/annotations.py +22 -0
- sky/utils/asyncio_utils.py +78 -0
- sky/utils/atomic.py +1 -1
- sky/utils/auth_utils.py +153 -0
- sky/utils/cli_utils/status_utils.py +12 -7
- sky/utils/cluster_utils.py +28 -6
- sky/utils/command_runner.py +88 -27
- sky/utils/command_runner.pyi +36 -3
- sky/utils/common.py +3 -1
- sky/utils/common_utils.py +37 -4
- sky/utils/config_utils.py +1 -14
- sky/utils/context.py +127 -40
- sky/utils/context_utils.py +73 -18
- sky/utils/controller_utils.py +229 -70
- sky/utils/db/db_utils.py +95 -18
- sky/utils/db/kv_cache.py +149 -0
- sky/utils/db/migration_utils.py +24 -7
- sky/utils/env_options.py +4 -0
- sky/utils/git.py +559 -1
- sky/utils/kubernetes/create_cluster.sh +15 -30
- sky/utils/kubernetes/delete_cluster.sh +10 -7
- sky/utils/kubernetes/{deploy_remote_cluster.py → deploy_ssh_node_pools.py} +258 -380
- sky/utils/kubernetes/generate_kind_config.py +6 -66
- sky/utils/kubernetes/gpu_labeler.py +13 -3
- sky/utils/kubernetes/k8s_gpu_labeler_job.yaml +2 -1
- sky/utils/kubernetes/k8s_gpu_labeler_setup.yaml +16 -16
- sky/utils/kubernetes/kubernetes_deploy_utils.py +213 -194
- sky/utils/kubernetes/rsync_helper.sh +11 -3
- sky/utils/kubernetes_enums.py +7 -15
- sky/utils/lock_events.py +4 -4
- sky/utils/locks.py +128 -31
- sky/utils/log_utils.py +0 -319
- sky/utils/resource_checker.py +13 -10
- sky/utils/resources_utils.py +53 -29
- sky/utils/rich_utils.py +8 -4
- sky/utils/schemas.py +107 -52
- sky/utils/subprocess_utils.py +17 -4
- sky/utils/thread_utils.py +91 -0
- sky/utils/timeline.py +2 -1
- sky/utils/ux_utils.py +35 -1
- sky/utils/volume.py +88 -4
- sky/utils/yaml_utils.py +9 -0
- sky/volumes/client/sdk.py +48 -10
- sky/volumes/server/core.py +59 -22
- sky/volumes/server/server.py +46 -17
- sky/volumes/volume.py +54 -42
- sky/workspaces/core.py +57 -21
- sky/workspaces/server.py +13 -12
- sky_templates/README.md +3 -0
- sky_templates/__init__.py +3 -0
- sky_templates/ray/__init__.py +0 -0
- sky_templates/ray/start_cluster +183 -0
- sky_templates/ray/stop_cluster +75 -0
- {skypilot_nightly-1.0.0.dev20250905.dist-info → skypilot_nightly-1.0.0.dev20251203.dist-info}/METADATA +331 -65
- skypilot_nightly-1.0.0.dev20251203.dist-info/RECORD +611 -0
- skypilot_nightly-1.0.0.dev20251203.dist-info/top_level.txt +2 -0
- sky/client/cli/git.py +0 -549
- sky/dashboard/out/_next/static/chunks/1121-408ed10b2f9fce17.js +0 -1
- sky/dashboard/out/_next/static/chunks/1141-943efc7aff0f0c06.js +0 -1
- sky/dashboard/out/_next/static/chunks/1836-37fede578e2da5f8.js +0 -40
- sky/dashboard/out/_next/static/chunks/3015-86cabed5d4669ad0.js +0 -1
- sky/dashboard/out/_next/static/chunks/3294.c80326aec9bfed40.js +0 -6
- sky/dashboard/out/_next/static/chunks/3785.4872a2f3aa489880.js +0 -1
- sky/dashboard/out/_next/static/chunks/4045.b30465273dc5e468.js +0 -21
- sky/dashboard/out/_next/static/chunks/4676-9da7fdbde90b5549.js +0 -10
- sky/dashboard/out/_next/static/chunks/4725.10f7a9a5d3ea8208.js +0 -1
- sky/dashboard/out/_next/static/chunks/5339.3fda4a4010ff4e06.js +0 -51
- sky/dashboard/out/_next/static/chunks/6135-4b4d5e824b7f9d3c.js +0 -1
- sky/dashboard/out/_next/static/chunks/649.b9d7f7d10c1b8c53.js +0 -45
- sky/dashboard/out/_next/static/chunks/6856-dca7962af4814e1b.js +0 -1
- sky/dashboard/out/_next/static/chunks/6990-08b2a1cae076a943.js +0 -1
- sky/dashboard/out/_next/static/chunks/7325.b4bc99ce0892dcd5.js +0 -6
- sky/dashboard/out/_next/static/chunks/754-d0da8ab45f9509e9.js +0 -18
- sky/dashboard/out/_next/static/chunks/7669.1f5d9a402bf5cc42.js +0 -36
- sky/dashboard/out/_next/static/chunks/8969-0be3036bf86f8256.js +0 -1
- sky/dashboard/out/_next/static/chunks/9025.c12318fb6a1a9093.js +0 -6
- sky/dashboard/out/_next/static/chunks/9037-fa1737818d0a0969.js +0 -6
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-1cbba24bd1bd35f8.js +0 -16
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-0b4b35dc1dfe046c.js +0 -16
- sky/dashboard/out/_next/static/chunks/pages/clusters-469814d711d63b1b.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-dd64309c3fe67ed2.js +0 -11
- sky/dashboard/out/_next/static/chunks/pages/jobs/pools/[pool]-07349868f7905d37.js +0 -16
- sky/dashboard/out/_next/static/chunks/pages/jobs-1f70d9faa564804f.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/users-018bf31cda52e11b.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/volumes-739726d6b823f532.js +0 -1
- sky/dashboard/out/_next/static/chunks/webpack-4fe903277b57b523.js +0 -1
- sky/dashboard/out/_next/static/css/4614e06482d7309e.css +0 -3
- sky/dashboard/out/_next/static/mS-4qZPSkRuA1u-g2wQhg/_buildManifest.js +0 -1
- sky/templates/kubernetes-ssh-jump.yml.j2 +0 -94
- sky/utils/kubernetes/ssh_jump_lifecycle_manager.py +0 -191
- skypilot_nightly-1.0.0.dev20250905.dist-info/RECORD +0 -547
- skypilot_nightly-1.0.0.dev20250905.dist-info/top_level.txt +0 -1
- /sky/dashboard/out/_next/static/{mS-4qZPSkRuA1u-g2wQhg → 96_E2yl3QAiIJGOYCkSpB}/_ssgManifest.js +0 -0
- {skypilot_nightly-1.0.0.dev20250905.dist-info → skypilot_nightly-1.0.0.dev20251203.dist-info}/WHEEL +0 -0
- {skypilot_nightly-1.0.0.dev20250905.dist-info → skypilot_nightly-1.0.0.dev20251203.dist-info}/entry_points.txt +0 -0
- {skypilot_nightly-1.0.0.dev20250905.dist-info → skypilot_nightly-1.0.0.dev20251203.dist-info}/licenses/LICENSE +0 -0
sky/backends/wheel_utils.py
CHANGED
|
@@ -100,6 +100,13 @@ def _build_sky_wheel() -> pathlib.Path:
|
|
|
100
100
|
# modify the commit hash in the file later.
|
|
101
101
|
# Symlink other files/folders.
|
|
102
102
|
target.symlink_to(item, target_is_directory=item.is_dir())
|
|
103
|
+
|
|
104
|
+
# Symlink sky_templates directory from repo root
|
|
105
|
+
sky_templates_src = SKY_PACKAGE_PATH.parent / 'sky_templates'
|
|
106
|
+
if sky_templates_src.exists():
|
|
107
|
+
sky_templates_target = tmp_dir / 'sky_templates'
|
|
108
|
+
sky_templates_target.symlink_to(sky_templates_src,
|
|
109
|
+
target_is_directory=True)
|
|
103
110
|
setup_files_dir = SKY_PACKAGE_PATH / 'setup_files'
|
|
104
111
|
|
|
105
112
|
setup_content = (setup_files_dir / 'setup.py').read_text()
|
|
@@ -244,6 +251,17 @@ def build_sky_wheel() -> Tuple[pathlib.Path, str]:
|
|
|
244
251
|
# protocol. "compare, update and clone" has to be atomic to avoid
|
|
245
252
|
# race conditions.
|
|
246
253
|
last_modification_time = _get_latest_modification_time(SKY_PACKAGE_PATH)
|
|
254
|
+
# Also check sky_templates directory modification time
|
|
255
|
+
sky_templates_path = SKY_PACKAGE_PATH.parent / 'sky_templates'
|
|
256
|
+
if sky_templates_path.exists():
|
|
257
|
+
sky_templates_mtime = _get_latest_modification_time(
|
|
258
|
+
sky_templates_path)
|
|
259
|
+
if (last_modification_time is not None and
|
|
260
|
+
sky_templates_mtime is not None):
|
|
261
|
+
last_modification_time = max(last_modification_time,
|
|
262
|
+
sky_templates_mtime)
|
|
263
|
+
elif last_modification_time is None:
|
|
264
|
+
last_modification_time = sky_templates_mtime
|
|
247
265
|
last_wheel_modification_time = _get_latest_modification_time(WHEEL_DIR)
|
|
248
266
|
|
|
249
267
|
# Only build wheels if the wheel is outdated, wheel does not exist
|
sky/catalog/__init__.py
CHANGED
|
@@ -247,6 +247,13 @@ def get_accelerators_from_instance_type(
|
|
|
247
247
|
instance_type)
|
|
248
248
|
|
|
249
249
|
|
|
250
|
+
def get_arch_from_instance_type(instance_type: str,
|
|
251
|
+
clouds: CloudFilter = None) -> Optional[str]:
|
|
252
|
+
"""Returns the arch from a instance type."""
|
|
253
|
+
return _map_clouds_catalog(clouds, 'get_arch_from_instance_type',
|
|
254
|
+
instance_type)
|
|
255
|
+
|
|
256
|
+
|
|
250
257
|
def get_instance_type_for_accelerator(
|
|
251
258
|
acc_name: str,
|
|
252
259
|
acc_count: Union[int, float],
|
|
@@ -326,6 +333,7 @@ def get_common_gpus() -> List[str]:
|
|
|
326
333
|
'A10G',
|
|
327
334
|
'A100',
|
|
328
335
|
'A100-80GB',
|
|
336
|
+
'B200',
|
|
329
337
|
'H100',
|
|
330
338
|
'H200',
|
|
331
339
|
'L4',
|
sky/catalog/aws_catalog.py
CHANGED
|
@@ -271,6 +271,10 @@ def get_accelerators_from_instance_type(
|
|
|
271
271
|
_get_df(), instance_type)
|
|
272
272
|
|
|
273
273
|
|
|
274
|
+
def get_arch_from_instance_type(instance_type: str) -> Optional[str]:
|
|
275
|
+
return common.get_arch_from_instance_type_impl(_get_df(), instance_type)
|
|
276
|
+
|
|
277
|
+
|
|
274
278
|
def get_instance_type_for_accelerator(
|
|
275
279
|
acc_name: str,
|
|
276
280
|
acc_count: int,
|
sky/catalog/common.py
CHANGED
|
@@ -385,7 +385,7 @@ def get_hourly_cost_impl(
|
|
|
385
385
|
f'{instance_type!r}.')
|
|
386
386
|
cheapest_idx = df[price_str].idxmin()
|
|
387
387
|
cheapest = df.loc[cheapest_idx]
|
|
388
|
-
return cheapest[price_str]
|
|
388
|
+
return float(cheapest[price_str])
|
|
389
389
|
|
|
390
390
|
|
|
391
391
|
def _get_value(value):
|
|
@@ -527,6 +527,24 @@ def get_accelerators_from_instance_type_impl(
|
|
|
527
527
|
return {acc_name: _convert(acc_count)}
|
|
528
528
|
|
|
529
529
|
|
|
530
|
+
def get_arch_from_instance_type_impl(
|
|
531
|
+
df: 'pd.DataFrame',
|
|
532
|
+
instance_type: str,
|
|
533
|
+
) -> Optional[str]:
|
|
534
|
+
df = _get_instance_type(df, instance_type, None)
|
|
535
|
+
if df.empty:
|
|
536
|
+
with ux_utils.print_exception_no_traceback():
|
|
537
|
+
raise ValueError(f'No instance type {instance_type} found.')
|
|
538
|
+
row = df.iloc[0]
|
|
539
|
+
if 'Arch' not in row:
|
|
540
|
+
return None
|
|
541
|
+
arch = row['Arch']
|
|
542
|
+
if pd.isnull(arch):
|
|
543
|
+
return None
|
|
544
|
+
|
|
545
|
+
return arch
|
|
546
|
+
|
|
547
|
+
|
|
530
548
|
def get_instance_type_for_accelerator_impl(
|
|
531
549
|
df: 'pd.DataFrame',
|
|
532
550
|
acc_name: str,
|
|
@@ -60,6 +60,7 @@ ALL_REGIONS = [
|
|
|
60
60
|
'ap-northeast-2',
|
|
61
61
|
'ap-southeast-1',
|
|
62
62
|
'ap-southeast-2',
|
|
63
|
+
'ap-southeast-4',
|
|
63
64
|
'ap-northeast-1',
|
|
64
65
|
]
|
|
65
66
|
US_REGIONS = ['us-east-1', 'us-east-2', 'us-west-1', 'us-west-2']
|
|
@@ -67,17 +68,13 @@ US_REGIONS = ['us-east-1', 'us-east-2', 'us-west-1', 'us-west-2']
|
|
|
67
68
|
# The following columns will be included in the final catalog.
|
|
68
69
|
USEFUL_COLUMNS = [
|
|
69
70
|
'InstanceType', 'AcceleratorName', 'AcceleratorCount', 'vCPUs', 'MemoryGiB',
|
|
70
|
-
'GpuInfo', 'Price', 'SpotPrice', 'Region', 'AvailabilityZone'
|
|
71
|
+
'GpuInfo', 'Price', 'SpotPrice', 'Region', 'AvailabilityZone', 'Arch'
|
|
71
72
|
]
|
|
72
73
|
|
|
73
74
|
# NOTE: the hard-coded us-east-1 URL is not a typo. AWS pricing endpoint is
|
|
74
75
|
# only available in this region, but it serves pricing information for all
|
|
75
76
|
# regions.
|
|
76
77
|
PRICING_TABLE_URL_FMT = 'https://pricing.us-east-1.amazonaws.com/offers/v1.0/aws/AmazonEC2/current/{region}/index.csv' # pylint: disable=line-too-long
|
|
77
|
-
# Hardcode the regions that offer p4de.24xlarge as our credential does not have
|
|
78
|
-
# the permission to query the offerings of the instance.
|
|
79
|
-
# Ref: https://aws.amazon.com/ec2/instance-types/p4/
|
|
80
|
-
P4DE_REGIONS = ['us-east-1', 'us-west-2']
|
|
81
78
|
# g6f instances have fractional GPUs, but the API returns Count: 1 under
|
|
82
79
|
# GpuInfo. However, the GPU memory is properly scaled. Taking the instance GPU
|
|
83
80
|
# divided by the total memory of an L4 will give us the fraction of the GPU.
|
|
@@ -214,35 +211,6 @@ def _get_spot_pricing_table(region: str) -> 'pd.DataFrame':
|
|
|
214
211
|
return df
|
|
215
212
|
|
|
216
213
|
|
|
217
|
-
def _patch_p4de(region: str, df: 'pd.DataFrame',
|
|
218
|
-
pricing_df: 'pd.DataFrame') -> 'pd.DataFrame':
|
|
219
|
-
# Hardcoded patch for p4de.24xlarge, as our credentials doesn't have access
|
|
220
|
-
# to the instance type.
|
|
221
|
-
# Columns:
|
|
222
|
-
# InstanceType,AcceleratorName,AcceleratorCount,vCPUs,MemoryGiB,GpuInfo,
|
|
223
|
-
# Price,SpotPrice,Region,AvailabilityZone
|
|
224
|
-
records = []
|
|
225
|
-
for zone in df[df['Region'] == region]['AvailabilityZone'].unique():
|
|
226
|
-
records.append({
|
|
227
|
-
'InstanceType': 'p4de.24xlarge',
|
|
228
|
-
'AcceleratorName': 'A100-80GB',
|
|
229
|
-
'AcceleratorCount': 8,
|
|
230
|
-
'vCPUs': 96,
|
|
231
|
-
'MemoryGiB': 1152,
|
|
232
|
-
'GpuInfo':
|
|
233
|
-
('{\'Gpus\': [{\'Name\': \'A100-80GB\', \'Manufacturer\': '
|
|
234
|
-
'\'NVIDIA\', \'Count\': 8, \'MemoryInfo\': {\'SizeInMiB\': '
|
|
235
|
-
'81920}}], \'TotalGpuMemoryInMiB\': 655360}'),
|
|
236
|
-
'AvailabilityZone': zone,
|
|
237
|
-
'Region': region,
|
|
238
|
-
'Price': pricing_df[pricing_df['InstanceType'] == 'p4de.24xlarge']
|
|
239
|
-
['Price'].values[0],
|
|
240
|
-
'SpotPrice': np.nan,
|
|
241
|
-
})
|
|
242
|
-
df = pd.concat([df, pd.DataFrame.from_records(records)])
|
|
243
|
-
return df
|
|
244
|
-
|
|
245
|
-
|
|
246
214
|
def _get_instance_types_df(region: str) -> Union[str, 'pd.DataFrame']:
|
|
247
215
|
try:
|
|
248
216
|
# Fetch the zone info first to make sure the account has access to the
|
|
@@ -266,7 +234,7 @@ def _get_instance_types_df(region: str) -> Union[str, 'pd.DataFrame']:
|
|
|
266
234
|
def get_acc_info(row) -> Tuple[Optional[str], float]:
|
|
267
235
|
accelerator = None
|
|
268
236
|
for col, info_key in [('GpuInfo', 'Gpus'),
|
|
269
|
-
('
|
|
237
|
+
('NeuronInfo', 'NeuronDevices'),
|
|
270
238
|
('FpgaInfo', 'Fpgas')]:
|
|
271
239
|
info = row.get(col)
|
|
272
240
|
if isinstance(info, dict):
|
|
@@ -275,6 +243,17 @@ def _get_instance_types_df(region: str) -> Union[str, 'pd.DataFrame']:
|
|
|
275
243
|
return None, np.nan
|
|
276
244
|
return accelerator['Name'], accelerator['Count']
|
|
277
245
|
|
|
246
|
+
def get_arch(row) -> Optional[str]:
|
|
247
|
+
if 'ProcessorInfo' in row:
|
|
248
|
+
processor = row['ProcessorInfo']
|
|
249
|
+
if 'SupportedArchitectures' in processor:
|
|
250
|
+
archs = processor['SupportedArchitectures']
|
|
251
|
+
if isinstance(archs, list):
|
|
252
|
+
return archs[0]
|
|
253
|
+
elif isinstance(archs, str):
|
|
254
|
+
return archs
|
|
255
|
+
return None
|
|
256
|
+
|
|
278
257
|
def get_vcpus(row) -> float:
|
|
279
258
|
if not np.isnan(row['vCPU']):
|
|
280
259
|
return float(row['vCPU'])
|
|
@@ -299,18 +278,6 @@ def _get_instance_types_df(region: str) -> Union[str, 'pd.DataFrame']:
|
|
|
299
278
|
if row['InstanceType'] == 'p4de.24xlarge':
|
|
300
279
|
acc_name = 'A100-80GB'
|
|
301
280
|
acc_count = 8
|
|
302
|
-
if row['InstanceType'].startswith('trn1'):
|
|
303
|
-
# Trainium instances does not have a field for information of
|
|
304
|
-
# the accelerators. We need to infer the accelerator info from
|
|
305
|
-
# the instance type name.
|
|
306
|
-
# aws ec2 describe-instance-types --region us-east-1
|
|
307
|
-
# https://aws.amazon.com/ec2/instance-types/trn1/
|
|
308
|
-
acc_name = 'Trainium'
|
|
309
|
-
find_num_in_name = re.search(r'(\d+)xlarge',
|
|
310
|
-
row['InstanceType'])
|
|
311
|
-
assert find_num_in_name is not None, row['InstanceType']
|
|
312
|
-
num_in_name = find_num_in_name.group(1)
|
|
313
|
-
acc_count = int(num_in_name) // 2
|
|
314
281
|
if row['InstanceType'] == 'p5en.48xlarge':
|
|
315
282
|
# TODO(andyl): Check if this workaround still needed after
|
|
316
283
|
# v0.10.0 released. Currently, the acc_name returned by the
|
|
@@ -320,10 +287,15 @@ def _get_instance_types_df(region: str) -> Union[str, 'pd.DataFrame']:
|
|
|
320
287
|
if (row['InstanceType'].startswith('g6f') or
|
|
321
288
|
row['InstanceType'].startswith('gr6f')):
|
|
322
289
|
# These instance actually have only fractional GPUs, but the API
|
|
323
|
-
# returns Count: 1 under GpuInfo. We need to
|
|
324
|
-
# memory to get the actual fraction of
|
|
290
|
+
# returns Count: 1 or Count: 0 under GpuInfo. We need to
|
|
291
|
+
# directly check the GPU memory to get the actual fraction of
|
|
292
|
+
# the GPU. Note that TotalGpuMemoryInMiB seems unreliable here -
|
|
293
|
+
# sometimes it is unexpectedly 0.
|
|
325
294
|
# See also Standard_NV{vcpu}ads_A10_v5 support on Azure.
|
|
326
|
-
|
|
295
|
+
assert len(row['GpuInfo']['Gpus']) == 1
|
|
296
|
+
assert row['GpuInfo']['Gpus'][0]['Name'] == 'L4'
|
|
297
|
+
fraction = row['GpuInfo']['Gpus'][0]['MemoryInfo'][
|
|
298
|
+
'SizeInMiB'] / L4_GPU_MEMORY
|
|
327
299
|
acc_count = round(fraction, 3)
|
|
328
300
|
if row['InstanceType'] == 'p5.4xlarge':
|
|
329
301
|
acc_count = 1
|
|
@@ -332,6 +304,7 @@ def _get_instance_types_df(region: str) -> Union[str, 'pd.DataFrame']:
|
|
|
332
304
|
'AcceleratorCount': acc_count,
|
|
333
305
|
'vCPUs': get_vcpus(row),
|
|
334
306
|
'MemoryGiB': get_memory_gib(row),
|
|
307
|
+
'Arch': get_arch(row),
|
|
335
308
|
})
|
|
336
309
|
|
|
337
310
|
# The AWS API may not have all the instance types in the pricing table,
|
|
@@ -355,11 +328,21 @@ def _get_instance_types_df(region: str) -> Union[str, 'pd.DataFrame']:
|
|
|
355
328
|
df = pd.concat(
|
|
356
329
|
[df, df.apply(get_additional_columns, axis='columns')],
|
|
357
330
|
axis='columns')
|
|
358
|
-
# patch the df for p4de.24xlarge
|
|
359
|
-
if region in P4DE_REGIONS:
|
|
360
|
-
df = _patch_p4de(region, df, pricing_df)
|
|
361
331
|
if 'GpuInfo' not in df.columns:
|
|
362
332
|
df['GpuInfo'] = np.nan
|
|
333
|
+
if 'NeuronInfo' in df.columns:
|
|
334
|
+
# The AWS Neuron API uses 'NeuronDevices' instead of 'Gpus'
|
|
335
|
+
# in its dict; for consistency with GPU handling, rename key.
|
|
336
|
+
def map_neuroninfo(neuroninfo):
|
|
337
|
+
if isinstance(neuroninfo,
|
|
338
|
+
dict) and 'NeuronDevices' in neuroninfo:
|
|
339
|
+
# Rename 'NeuronDevices' to 'Gpus'
|
|
340
|
+
neuroninfo = neuroninfo.copy()
|
|
341
|
+
neuroninfo['Gpus'] = neuroninfo.pop('NeuronDevices')
|
|
342
|
+
return neuroninfo
|
|
343
|
+
|
|
344
|
+
df['NeuronInfo'] = df['NeuronInfo'].apply(map_neuroninfo)
|
|
345
|
+
df['GpuInfo'] = df['GpuInfo'].fillna(df['NeuronInfo'])
|
|
363
346
|
df = df[USEFUL_COLUMNS]
|
|
364
347
|
except Exception as e: # pylint: disable=broad-except
|
|
365
348
|
print(traceback.format_exc())
|
|
@@ -407,44 +390,70 @@ def get_all_regions_instance_types_df(regions: Set[str]) -> 'pd.DataFrame':
|
|
|
407
390
|
# TODO(tian): find out the driver version.
|
|
408
391
|
# Neuron driver:
|
|
409
392
|
_GPU_DESC_UBUNTU_DATE = [
|
|
410
|
-
('
|
|
411
|
-
('gpu', 'AMI GPU PyTorch 1.10.0', '18.04', '20221114'),
|
|
412
|
-
('k80', 'AMI GPU PyTorch 1.10.0', '20.04', '20211208'),
|
|
413
|
-
('k80', 'AMI GPU PyTorch 1.10.0', '18.04', '20211208'),
|
|
414
|
-
('neuron', 'Base Neuron AMI', '22.04', '20240923'),
|
|
393
|
+
('neuron', '/aws/service/neuron/dlami/multi-framework', '22.04'),
|
|
415
394
|
]
|
|
416
395
|
|
|
417
396
|
|
|
418
|
-
def
|
|
419
|
-
|
|
397
|
+
def _fetch_image_creation_date(region: str,
|
|
398
|
+
image_id: Optional[str]) -> Optional[str]:
|
|
399
|
+
if image_id is None:
|
|
400
|
+
return None
|
|
420
401
|
try:
|
|
421
402
|
image = subprocess.check_output(f"""\
|
|
422
|
-
aws ec2 describe-images --region {region} --
|
|
423
|
-
--
|
|
424
|
-
'Name=state,Values=available' --query 'Images[:1].ImageId' --output text
|
|
403
|
+
aws ec2 describe-images --region {region} --image-ids {image_id} \\
|
|
404
|
+
--query 'Images[0].Name' --output text
|
|
425
405
|
""",
|
|
426
406
|
shell=True)
|
|
427
407
|
except subprocess.CalledProcessError as e:
|
|
428
|
-
print(f'Failed {region}, {
|
|
429
|
-
f'{creation_date}. Trying next date.')
|
|
408
|
+
print(f'Failed to fetch image creation date for {region}, {image_id}')
|
|
430
409
|
print(f'{type(e)}: {e}')
|
|
431
410
|
image_id = None
|
|
411
|
+
else:
|
|
412
|
+
assert image is not None
|
|
413
|
+
image_name = image.decode('utf-8').strip()
|
|
414
|
+
match = re.search(r'(\d+)$', image_name)
|
|
415
|
+
if match:
|
|
416
|
+
return match.group(1)
|
|
417
|
+
return None
|
|
418
|
+
|
|
419
|
+
|
|
420
|
+
def _fetch_image_id_from_ssm_param(
        region: str,
        ssm_prefix: str,
        ubuntu_version: str = '22.04') -> Optional[str]:
    """Returns the image ID stored in an SSM parameter, or None on failure.

    Reads the parameter
    `{ssm_prefix}/ubuntu-{ubuntu_version}/latest/image_id` in `region`
    through `aws ssm get-parameter`.

    Args:
        region: AWS region to query.
        ssm_prefix: Leading path of the SSM parameter name.
        ubuntu_version: Ubuntu release used to build the parameter name.

    Returns:
        The parameter value with surrounding whitespace stripped, or None
        when the CLI call fails or prints nothing.
    """
    param_name = f'{ssm_prefix}/ubuntu-{ubuntu_version}/latest/image_id'
    # Use an argv list instead of an interpolated shell string so that the
    # parameter name is passed verbatim, with no shell quoting involved.
    cmd = [
        'aws', 'ssm', 'get-parameter', '--region', region, '--name',
        param_name, '--query', 'Parameter.Value', '--output', 'text'
    ]
    try:
        output = subprocess.check_output(cmd)
    except (subprocess.CalledProcessError, OSError) as e:
        # OSError covers a missing/unrunnable `aws` executable, which the
        # shell-based invocation used to report as a non-zero exit status.
        print(
            f'Failed to fetch image ID from SSM parameter for {region}, {ssm_prefix}, {ubuntu_version}'
        )
        print(f'{type(e)}: {e}')
        return None

    image_id = output.decode('utf-8').strip()
    # An empty string is not a usable image ID; report it as missing.
    return image_id or None
|
|
436
440
|
|
|
437
441
|
|
|
438
|
-
def _get_image_row(
|
|
439
|
-
|
|
440
|
-
|
|
441
|
-
|
|
442
|
-
|
|
443
|
-
|
|
444
|
-
|
|
445
|
-
|
|
442
|
+
def _get_image_row(
    region: str,
    gpu: str,
    ssm_prefix: str,
    ubuntu_version: str = '22.04'
) -> Tuple[str, str, str, str, Optional[str], Optional[str]]:
    """Builds one image row for `region` from an SSM-published image.

    Args:
        region: AWS region to look the image up in.
        gpu: Accelerator label embedded in the returned tag.
        ssm_prefix: Leading path of the SSM parameter holding the image ID.
        ubuntu_version: Ubuntu release, e.g. '22.04'.

    Returns:
        A tuple (tag, region, 'ubuntu', ubuntu_version, image_id,
        creation_date). The last two entries are None when the image ID
        could not be fetched; creation_date may also be None on its own.
    """
    print(f'Getting image for {region}, {ssm_prefix}, {ubuntu_version}, {gpu}')
    image_id = _fetch_image_id_from_ssm_param(region, ssm_prefix,
                                              ubuntu_version)
    # Only query the image name when an image ID was actually resolved.
    creation_date = (None if image_id is None else
                     _fetch_image_creation_date(region, image_id))
    tag = f'skypilot:{gpu}-ubuntu-{ubuntu_version.replace(".", "")}'
    row = (tag, region, 'ubuntu', ubuntu_version, image_id, creation_date)
    return row
|
|
448
457
|
|
|
449
458
|
|
|
450
459
|
def get_all_regions_images_df(regions: Set[str]) -> 'pd.DataFrame':
|
|
@@ -559,13 +568,26 @@ if __name__ == '__main__':
|
|
|
559
568
|
instance_df.to_csv('aws/vms.csv', index=False)
|
|
560
569
|
print('AWS Service Catalog saved to aws/vms.csv')
|
|
561
570
|
|
|
562
|
-
# Disable refreshing images.csv
|
|
571
|
+
# Disable refreshing images.csv for skypilot custom AMIs
|
|
572
|
+
# refresh only the neuron based images
|
|
563
573
|
# See sky/clouds/catalog/images/README.md for more details.
|
|
564
|
-
|
|
565
|
-
|
|
566
|
-
|
|
567
|
-
|
|
568
|
-
|
|
574
|
+
image_df = get_all_regions_images_df(user_regions)
|
|
575
|
+
_check_regions_integrity(image_df, 'images')
|
|
576
|
+
# filter out rows where ImageId is None
|
|
577
|
+
image_df = image_df[image_df['ImageId'].notna()]
|
|
578
|
+
|
|
579
|
+
# check if aws/images.csv exists
|
|
580
|
+
if os.path.exists('aws/images.csv'):
|
|
581
|
+
# load the data from aws/images.csv
|
|
582
|
+
existing_image_df = pd.read_csv('aws/images.csv')
|
|
583
|
+
# filter out the neuron based images
|
|
584
|
+
existing_image_df = existing_image_df[~existing_image_df['Tag'].
|
|
585
|
+
eq('skypilot:neuron-ubuntu-2204')]
|
|
586
|
+
# concat the new neuron based images with the existing images
|
|
587
|
+
image_df = pd.concat([existing_image_df, image_df])
|
|
588
|
+
|
|
589
|
+
image_df.to_csv('aws/images.csv', index=False)
|
|
590
|
+
print('AWS Images saved to aws/images.csv')
|
|
569
591
|
|
|
570
592
|
if args.az_mappings:
|
|
571
593
|
az_mappings_df = fetch_availability_zone_mappings()
|
|
@@ -182,8 +182,9 @@ TPU_V4_HOST_DF = pd.read_csv(
|
|
|
182
182
|
SERIES_TO_DESCRIPTION = {
|
|
183
183
|
'a2': 'A2 Instance',
|
|
184
184
|
'a3': 'A3 Instance',
|
|
185
|
-
#
|
|
186
|
-
#
|
|
185
|
+
# NOTE: GCP does not provide separate CPU/RAM pricing for A4 instances.
|
|
186
|
+
# The B200 GPU pricing includes the full VM cost. See special handling in
|
|
187
|
+
# get_vm_price() which sets A4 VM price to 0.
|
|
187
188
|
'a4': 'A4 Instance',
|
|
188
189
|
'c2': 'Compute optimized',
|
|
189
190
|
'c2d': 'C2D AMD Instance',
|
|
@@ -394,6 +395,15 @@ def get_vm_df(skus: List[Dict[str, Any]], region_prefix: str) -> 'pd.DataFrame':
|
|
|
394
395
|
if series in ['f1', 'g1']:
|
|
395
396
|
memory_price = 0.0
|
|
396
397
|
|
|
398
|
+
# Special case for A4 instances.
|
|
399
|
+
# GCP does not provide separate CPU/RAM pricing for A4 instances in the
|
|
400
|
+
# SKUs API. The GPU pricing (B200) includes the full VM cost.
|
|
401
|
+
# We set the VM price to 0 so the entry is not dropped, and the GPU
|
|
402
|
+
# pricing will provide the total cost.
|
|
403
|
+
if series == 'a4':
|
|
404
|
+
cpu_price = 0.0
|
|
405
|
+
memory_price = 0.0
|
|
406
|
+
|
|
397
407
|
# TODO(tian): (2024/11/10) Some SKUs are missing in the SKUs API. We
|
|
398
408
|
# skip them in the catalog for now. We should investigate why they are
|
|
399
409
|
# missing and add them back.
|
|
@@ -525,7 +535,24 @@ def get_gpu_df(skus: List[Dict[str, Any]],
|
|
|
525
535
|
row_gpu_name = row['AcceleratorName']
|
|
526
536
|
if row['Region'] not in sku['serviceRegions']:
|
|
527
537
|
continue
|
|
528
|
-
|
|
538
|
+
|
|
539
|
+
# Check usageType matches, with special handling for B200 spot.
|
|
540
|
+
# GCP has a bug where some B200 spot SKUs have usageType='OnDemand'
|
|
541
|
+
# but the description contains 'Spot Preemptible'.
|
|
542
|
+
usage_type = sku['category']['usageType']
|
|
543
|
+
description = sku['description']
|
|
544
|
+
is_spot_description = 'spot preemptible' in description.lower()
|
|
545
|
+
|
|
546
|
+
if usage_type != ondemand_or_spot:
|
|
547
|
+
# For B200 spot pricing, also accept SKUs where description
|
|
548
|
+
# says "Spot Preemptible" even if usageType is wrong.
|
|
549
|
+
if not (spot and row_gpu_name == 'B200' and
|
|
550
|
+
is_spot_description):
|
|
551
|
+
continue
|
|
552
|
+
|
|
553
|
+
# For B200 on-demand, skip SKUs that are actually spot (description
|
|
554
|
+
# says "Spot Preemptible" but usageType is incorrectly 'OnDemand').
|
|
555
|
+
if not spot and row_gpu_name == 'B200' and is_spot_description:
|
|
529
556
|
continue
|
|
530
557
|
|
|
531
558
|
gpu_names = [f'{row_gpu_name} GPU']
|
|
@@ -7,6 +7,7 @@ from dataclasses import dataclass
|
|
|
7
7
|
import decimal
|
|
8
8
|
import json
|
|
9
9
|
import logging
|
|
10
|
+
import os
|
|
10
11
|
import re
|
|
11
12
|
from typing import Any, Dict, List, Optional
|
|
12
13
|
|
|
@@ -22,8 +23,6 @@ TIMEOUT = 10
|
|
|
22
23
|
PARENT_ID_TEMPLATE = 'project-{}public-images'
|
|
23
24
|
ACCELERATOR_MANUFACTURER = 'NVIDIA'
|
|
24
25
|
|
|
25
|
-
VRAM = {'L40S': 49152, 'H100': 81920, 'H200': 144384, 'B200': 184320}
|
|
26
|
-
|
|
27
26
|
|
|
28
27
|
@dataclass
|
|
29
28
|
class PresetInfo:
|
|
@@ -38,6 +37,7 @@ class PresetInfo:
|
|
|
38
37
|
platform_name (str): The name of the platform the preset belongs to.
|
|
39
38
|
gpu (int): The number of GPUs in the preset.
|
|
40
39
|
vcpu (int): The number of virtual CPUs in the preset.
|
|
40
|
+
gpu_memory_gibibytes (int): The memory size of a single GPU in GiB.
|
|
41
41
|
memory_gib (int): The amount of memory in GiB in the preset.
|
|
42
42
|
accelerator_manufacturer (str | None): The manufacturer of the
|
|
43
43
|
accelerator (e.g., "NVIDIA"), or None if no accelerator.
|
|
@@ -54,6 +54,7 @@ class PresetInfo:
|
|
|
54
54
|
platform_name: str
|
|
55
55
|
gpu: int
|
|
56
56
|
vcpu: int
|
|
57
|
+
gpu_memory_gibibytes: int
|
|
57
58
|
memory_gib: int
|
|
58
59
|
accelerator_manufacturer: Optional[str]
|
|
59
60
|
accelerator_name: Optional[str]
|
|
@@ -157,6 +158,7 @@ def _estimate_platforms(platforms: List[Any], parent_id: str,
|
|
|
157
158
|
platform_name=platform_name,
|
|
158
159
|
gpu=preset.resources.gpu_count or 0,
|
|
159
160
|
vcpu=preset.resources.vcpu_count,
|
|
161
|
+
gpu_memory_gibibytes=platform.spec.gpu_memory_gibibytes,
|
|
160
162
|
memory_gib=preset.resources.memory_gibibytes,
|
|
161
163
|
accelerator_manufacturer=ACCELERATOR_MANUFACTURER
|
|
162
164
|
if platform_name.startswith('gpu-') else '',
|
|
@@ -178,6 +180,7 @@ def _write_preset_prices(presets: List[PresetInfo], output_file: str) -> None:
|
|
|
178
180
|
presets (List[PresetInfo]): A list of PresetInfo objects to write.
|
|
179
181
|
output_file (str): The path to the output CSV file.
|
|
180
182
|
"""
|
|
183
|
+
os.makedirs(os.path.dirname(output_file))
|
|
181
184
|
# Set up the CSV writer to write to output_file
|
|
182
185
|
with open(output_file, 'w', encoding='utf-8') as out:
|
|
183
186
|
header = [
|
|
@@ -193,23 +196,23 @@ def _write_preset_prices(presets: List[PresetInfo], output_file: str) -> None:
|
|
|
193
196
|
]
|
|
194
197
|
writer = csv.DictWriter(out, fieldnames=header)
|
|
195
198
|
writer.writeheader()
|
|
196
|
-
|
|
199
|
+
# logger.info(presets)
|
|
197
200
|
for preset in sorted(presets,
|
|
198
201
|
key=lambda x:
|
|
199
202
|
(bool(x.gpu), x.region, x.platform_name, x.vcpu)):
|
|
200
203
|
gpu_info = ''
|
|
201
204
|
if preset.gpu > 0 and preset.accelerator_name:
|
|
205
|
+
vram = preset.gpu_memory_gibibytes * 1024
|
|
202
206
|
gpu_info_dict = {
|
|
203
207
|
'Gpus': [{
|
|
204
208
|
'Name': preset.accelerator_name,
|
|
205
209
|
'Manufacturer': preset.accelerator_manufacturer,
|
|
206
210
|
'Count': preset.gpu,
|
|
207
211
|
'MemoryInfo': {
|
|
208
|
-
'SizeInMiB':
|
|
212
|
+
'SizeInMiB': vram
|
|
209
213
|
},
|
|
210
214
|
}],
|
|
211
|
-
'TotalGpuMemoryInMiB':
|
|
212
|
-
* preset.gpu,
|
|
215
|
+
'TotalGpuMemoryInMiB': vram * preset.gpu,
|
|
213
216
|
}
|
|
214
217
|
gpu_info = json.dumps(gpu_info_dict).replace('"', '\'')
|
|
215
218
|
|