skypilot-nightly 1.0.0.dev20250502__py3-none-any.whl → 1.0.0.dev20251203__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sky/__init__.py +22 -6
- sky/adaptors/aws.py +81 -16
- sky/adaptors/common.py +25 -2
- sky/adaptors/coreweave.py +278 -0
- sky/adaptors/do.py +8 -2
- sky/adaptors/gcp.py +11 -0
- sky/adaptors/hyperbolic.py +8 -0
- sky/adaptors/ibm.py +5 -2
- sky/adaptors/kubernetes.py +149 -18
- sky/adaptors/nebius.py +173 -30
- sky/adaptors/primeintellect.py +1 -0
- sky/adaptors/runpod.py +68 -0
- sky/adaptors/seeweb.py +183 -0
- sky/adaptors/shadeform.py +89 -0
- sky/admin_policy.py +187 -4
- sky/authentication.py +179 -225
- sky/backends/__init__.py +4 -2
- sky/backends/backend.py +22 -9
- sky/backends/backend_utils.py +1323 -397
- sky/backends/cloud_vm_ray_backend.py +1749 -1029
- sky/backends/docker_utils.py +1 -1
- sky/backends/local_docker_backend.py +11 -6
- sky/backends/task_codegen.py +633 -0
- sky/backends/wheel_utils.py +55 -9
- sky/{clouds/service_catalog → catalog}/__init__.py +21 -19
- sky/{clouds/service_catalog → catalog}/aws_catalog.py +27 -8
- sky/{clouds/service_catalog → catalog}/azure_catalog.py +10 -7
- sky/{clouds/service_catalog → catalog}/common.py +90 -49
- sky/{clouds/service_catalog → catalog}/cudo_catalog.py +8 -5
- sky/{clouds/service_catalog → catalog}/data_fetchers/analyze.py +1 -1
- sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_aws.py +116 -80
- sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_cudo.py +38 -38
- sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_gcp.py +70 -16
- sky/catalog/data_fetchers/fetch_hyperbolic.py +136 -0
- sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_lambda_cloud.py +1 -0
- sky/catalog/data_fetchers/fetch_nebius.py +338 -0
- sky/catalog/data_fetchers/fetch_runpod.py +698 -0
- sky/catalog/data_fetchers/fetch_seeweb.py +329 -0
- sky/catalog/data_fetchers/fetch_shadeform.py +142 -0
- sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_vast.py +1 -1
- sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_vsphere.py +1 -1
- sky/{clouds/service_catalog → catalog}/do_catalog.py +5 -2
- sky/{clouds/service_catalog → catalog}/fluidstack_catalog.py +6 -3
- sky/{clouds/service_catalog → catalog}/gcp_catalog.py +41 -15
- sky/catalog/hyperbolic_catalog.py +136 -0
- sky/{clouds/service_catalog → catalog}/ibm_catalog.py +9 -6
- sky/{clouds/service_catalog → catalog}/kubernetes_catalog.py +36 -24
- sky/{clouds/service_catalog → catalog}/lambda_catalog.py +9 -6
- sky/{clouds/service_catalog → catalog}/nebius_catalog.py +9 -7
- sky/{clouds/service_catalog → catalog}/oci_catalog.py +9 -6
- sky/{clouds/service_catalog → catalog}/paperspace_catalog.py +5 -2
- sky/catalog/primeintellect_catalog.py +95 -0
- sky/{clouds/service_catalog → catalog}/runpod_catalog.py +11 -4
- sky/{clouds/service_catalog → catalog}/scp_catalog.py +9 -6
- sky/catalog/seeweb_catalog.py +184 -0
- sky/catalog/shadeform_catalog.py +165 -0
- sky/catalog/ssh_catalog.py +167 -0
- sky/{clouds/service_catalog → catalog}/vast_catalog.py +6 -3
- sky/{clouds/service_catalog → catalog}/vsphere_catalog.py +5 -2
- sky/check.py +533 -185
- sky/cli.py +5 -5975
- sky/client/{cli.py → cli/command.py} +2591 -1956
- sky/client/cli/deprecation_utils.py +99 -0
- sky/client/cli/flags.py +359 -0
- sky/client/cli/table_utils.py +322 -0
- sky/client/cli/utils.py +79 -0
- sky/client/common.py +78 -32
- sky/client/oauth.py +82 -0
- sky/client/sdk.py +1219 -319
- sky/client/sdk_async.py +827 -0
- sky/client/service_account_auth.py +47 -0
- sky/cloud_stores.py +82 -3
- sky/clouds/__init__.py +13 -0
- sky/clouds/aws.py +564 -164
- sky/clouds/azure.py +105 -83
- sky/clouds/cloud.py +140 -40
- sky/clouds/cudo.py +68 -50
- sky/clouds/do.py +66 -48
- sky/clouds/fluidstack.py +63 -44
- sky/clouds/gcp.py +339 -110
- sky/clouds/hyperbolic.py +293 -0
- sky/clouds/ibm.py +70 -49
- sky/clouds/kubernetes.py +570 -162
- sky/clouds/lambda_cloud.py +74 -54
- sky/clouds/nebius.py +210 -81
- sky/clouds/oci.py +88 -66
- sky/clouds/paperspace.py +61 -44
- sky/clouds/primeintellect.py +317 -0
- sky/clouds/runpod.py +164 -74
- sky/clouds/scp.py +89 -86
- sky/clouds/seeweb.py +477 -0
- sky/clouds/shadeform.py +400 -0
- sky/clouds/ssh.py +263 -0
- sky/clouds/utils/aws_utils.py +10 -4
- sky/clouds/utils/gcp_utils.py +87 -11
- sky/clouds/utils/oci_utils.py +38 -14
- sky/clouds/utils/scp_utils.py +231 -167
- sky/clouds/vast.py +99 -77
- sky/clouds/vsphere.py +51 -40
- sky/core.py +375 -173
- sky/dag.py +15 -0
- sky/dashboard/out/404.html +1 -1
- sky/dashboard/out/_next/static/96_E2yl3QAiIJGOYCkSpB/_buildManifest.js +1 -0
- sky/dashboard/out/_next/static/chunks/1141-e6aa9ab418717c59.js +11 -0
- sky/dashboard/out/_next/static/chunks/1272-1ef0bf0237faccdb.js +1 -0
- sky/dashboard/out/_next/static/chunks/1871-7e202677c42f43fe.js +6 -0
- sky/dashboard/out/_next/static/chunks/2260-7703229c33c5ebd5.js +1 -0
- sky/dashboard/out/_next/static/chunks/2350.fab69e61bac57b23.js +1 -0
- sky/dashboard/out/_next/static/chunks/2369.fc20f0c2c8ed9fe7.js +15 -0
- sky/dashboard/out/_next/static/chunks/2755.edd818326d489a1d.js +26 -0
- sky/dashboard/out/_next/static/chunks/3294.20a8540fe697d5ee.js +1 -0
- sky/dashboard/out/_next/static/chunks/3785.7e245f318f9d1121.js +1 -0
- sky/dashboard/out/_next/static/chunks/3800-7b45f9fbb6308557.js +1 -0
- sky/dashboard/out/_next/static/chunks/3850-ff4a9a69d978632b.js +1 -0
- sky/dashboard/out/_next/static/chunks/3937.210053269f121201.js +1 -0
- sky/dashboard/out/_next/static/chunks/4725.172ede95d1b21022.js +1 -0
- sky/dashboard/out/_next/static/chunks/4937.a2baa2df5572a276.js +15 -0
- sky/dashboard/out/_next/static/chunks/5739-d67458fcb1386c92.js +8 -0
- sky/dashboard/out/_next/static/chunks/6130-2be46d70a38f1e82.js +1 -0
- sky/dashboard/out/_next/static/chunks/616-3d59f75e2ccf9321.js +39 -0
- sky/dashboard/out/_next/static/chunks/6212-7bd06f60ba693125.js +13 -0
- sky/dashboard/out/_next/static/chunks/6856-8f27d1c10c98def8.js +1 -0
- sky/dashboard/out/_next/static/chunks/6989-01359c57e018caa4.js +1 -0
- sky/dashboard/out/_next/static/chunks/6990-9146207c4567fdfd.js +1 -0
- sky/dashboard/out/_next/static/chunks/7359-c8d04e06886000b3.js +30 -0
- sky/dashboard/out/_next/static/chunks/7411-b15471acd2cba716.js +41 -0
- sky/dashboard/out/_next/static/chunks/7615-019513abc55b3b47.js +1 -0
- sky/dashboard/out/_next/static/chunks/8640.5b9475a2d18c5416.js +16 -0
- sky/dashboard/out/_next/static/chunks/8969-452f9d5cbdd2dc73.js +1 -0
- sky/dashboard/out/_next/static/chunks/9025.fa408f3242e9028d.js +6 -0
- sky/dashboard/out/_next/static/chunks/9353-cff34f7e773b2e2b.js +1 -0
- sky/dashboard/out/_next/static/chunks/9360.a536cf6b1fa42355.js +31 -0
- sky/dashboard/out/_next/static/chunks/9847.3aaca6bb33455140.js +30 -0
- sky/dashboard/out/_next/static/chunks/fd9d1056-86323a29a8f7e46a.js +1 -0
- sky/dashboard/out/_next/static/chunks/framework-cf60a09ccd051a10.js +33 -0
- sky/dashboard/out/_next/static/chunks/main-app-587214043926b3cc.js +1 -0
- sky/dashboard/out/_next/static/chunks/main-f15ccb73239a3bf1.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/_app-bde01e4a2beec258.js +34 -0
- sky/dashboard/out/_next/static/chunks/pages/_error-c66a4e8afc46f17b.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-792db96d918c98c9.js +16 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-abfcac9c137aa543.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters-ee39056f9851a3ff.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/config-dfb9bf07b13045f4.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/index-444f1804401f04ea.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/infra/[context]-c0b5935149902e6f.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/infra-aed0ea19df7cf961.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-d66997e2bfc837cf.js +16 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs/pools/[pool]-9faf940b253e3e06.js +21 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs-2072b48b617989c9.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/users-f42674164aa73423.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/volumes-b84b948ff357c43e.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/workspace/new-3f88a1c7e86a3f86.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/workspaces/[name]-84a40f8c7c627fe4.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/workspaces-531b2f8c4bf89f82.js +1 -0
- sky/dashboard/out/_next/static/chunks/webpack-64e05f17bf2cf8ce.js +1 -0
- sky/dashboard/out/_next/static/css/0748ce22df867032.css +3 -0
- sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
- sky/dashboard/out/clusters/[cluster].html +1 -1
- sky/dashboard/out/clusters.html +1 -1
- sky/dashboard/out/config.html +1 -0
- sky/dashboard/out/index.html +1 -1
- sky/dashboard/out/infra/[context].html +1 -0
- sky/dashboard/out/infra.html +1 -0
- sky/dashboard/out/jobs/[job].html +1 -1
- sky/dashboard/out/jobs/pools/[pool].html +1 -0
- sky/dashboard/out/jobs.html +1 -1
- sky/dashboard/out/users.html +1 -0
- sky/dashboard/out/volumes.html +1 -0
- sky/dashboard/out/workspace/new.html +1 -0
- sky/dashboard/out/workspaces/[name].html +1 -0
- sky/dashboard/out/workspaces.html +1 -0
- sky/data/data_utils.py +137 -1
- sky/data/mounting_utils.py +269 -84
- sky/data/storage.py +1460 -1807
- sky/data/storage_utils.py +43 -57
- sky/exceptions.py +126 -2
- sky/execution.py +216 -63
- sky/global_user_state.py +2390 -586
- sky/jobs/__init__.py +7 -0
- sky/jobs/client/sdk.py +300 -58
- sky/jobs/client/sdk_async.py +161 -0
- sky/jobs/constants.py +15 -8
- sky/jobs/controller.py +848 -275
- sky/jobs/file_content_utils.py +128 -0
- sky/jobs/log_gc.py +193 -0
- sky/jobs/recovery_strategy.py +402 -152
- sky/jobs/scheduler.py +314 -189
- sky/jobs/server/core.py +836 -255
- sky/jobs/server/server.py +156 -115
- sky/jobs/server/utils.py +136 -0
- sky/jobs/state.py +2109 -706
- sky/jobs/utils.py +1306 -215
- sky/logs/__init__.py +21 -0
- sky/logs/agent.py +108 -0
- sky/logs/aws.py +243 -0
- sky/logs/gcp.py +91 -0
- sky/metrics/__init__.py +0 -0
- sky/metrics/utils.py +453 -0
- sky/models.py +78 -1
- sky/optimizer.py +164 -70
- sky/provision/__init__.py +90 -4
- sky/provision/aws/config.py +147 -26
- sky/provision/aws/instance.py +136 -50
- sky/provision/azure/instance.py +11 -6
- sky/provision/common.py +13 -1
- sky/provision/cudo/cudo_machine_type.py +1 -1
- sky/provision/cudo/cudo_utils.py +14 -8
- sky/provision/cudo/cudo_wrapper.py +72 -71
- sky/provision/cudo/instance.py +10 -6
- sky/provision/do/instance.py +10 -6
- sky/provision/do/utils.py +4 -3
- sky/provision/docker_utils.py +140 -33
- sky/provision/fluidstack/instance.py +13 -8
- sky/provision/gcp/__init__.py +1 -0
- sky/provision/gcp/config.py +301 -19
- sky/provision/gcp/constants.py +218 -0
- sky/provision/gcp/instance.py +36 -8
- sky/provision/gcp/instance_utils.py +18 -4
- sky/provision/gcp/volume_utils.py +247 -0
- sky/provision/hyperbolic/__init__.py +12 -0
- sky/provision/hyperbolic/config.py +10 -0
- sky/provision/hyperbolic/instance.py +437 -0
- sky/provision/hyperbolic/utils.py +373 -0
- sky/provision/instance_setup.py +101 -20
- sky/provision/kubernetes/__init__.py +5 -0
- sky/provision/kubernetes/config.py +9 -52
- sky/provision/kubernetes/constants.py +17 -0
- sky/provision/kubernetes/instance.py +919 -280
- sky/provision/kubernetes/manifests/fusermount-server-daemonset.yaml +1 -2
- sky/provision/kubernetes/network.py +27 -17
- sky/provision/kubernetes/network_utils.py +44 -43
- sky/provision/kubernetes/utils.py +1221 -534
- sky/provision/kubernetes/volume.py +343 -0
- sky/provision/lambda_cloud/instance.py +22 -16
- sky/provision/nebius/constants.py +50 -0
- sky/provision/nebius/instance.py +19 -6
- sky/provision/nebius/utils.py +237 -137
- sky/provision/oci/instance.py +10 -5
- sky/provision/paperspace/instance.py +10 -7
- sky/provision/paperspace/utils.py +1 -1
- sky/provision/primeintellect/__init__.py +10 -0
- sky/provision/primeintellect/config.py +11 -0
- sky/provision/primeintellect/instance.py +454 -0
- sky/provision/primeintellect/utils.py +398 -0
- sky/provision/provisioner.py +117 -36
- sky/provision/runpod/__init__.py +5 -0
- sky/provision/runpod/instance.py +27 -6
- sky/provision/runpod/utils.py +51 -18
- sky/provision/runpod/volume.py +214 -0
- sky/provision/scp/__init__.py +15 -0
- sky/provision/scp/config.py +93 -0
- sky/provision/scp/instance.py +707 -0
- sky/provision/seeweb/__init__.py +11 -0
- sky/provision/seeweb/config.py +13 -0
- sky/provision/seeweb/instance.py +812 -0
- sky/provision/shadeform/__init__.py +11 -0
- sky/provision/shadeform/config.py +12 -0
- sky/provision/shadeform/instance.py +351 -0
- sky/provision/shadeform/shadeform_utils.py +83 -0
- sky/provision/ssh/__init__.py +18 -0
- sky/provision/vast/instance.py +13 -8
- sky/provision/vast/utils.py +10 -7
- sky/provision/volume.py +164 -0
- sky/provision/vsphere/common/ssl_helper.py +1 -1
- sky/provision/vsphere/common/vapiconnect.py +2 -1
- sky/provision/vsphere/common/vim_utils.py +4 -4
- sky/provision/vsphere/instance.py +15 -10
- sky/provision/vsphere/vsphere_utils.py +17 -20
- sky/py.typed +0 -0
- sky/resources.py +845 -119
- sky/schemas/__init__.py +0 -0
- sky/schemas/api/__init__.py +0 -0
- sky/schemas/api/responses.py +227 -0
- sky/schemas/db/README +4 -0
- sky/schemas/db/env.py +90 -0
- sky/schemas/db/global_user_state/001_initial_schema.py +124 -0
- sky/schemas/db/global_user_state/002_add_workspace_to_cluster_history.py +35 -0
- sky/schemas/db/global_user_state/003_fix_initial_revision.py +61 -0
- sky/schemas/db/global_user_state/004_is_managed.py +34 -0
- sky/schemas/db/global_user_state/005_cluster_event.py +32 -0
- sky/schemas/db/global_user_state/006_provision_log.py +41 -0
- sky/schemas/db/global_user_state/007_cluster_event_request_id.py +34 -0
- sky/schemas/db/global_user_state/008_skylet_ssh_tunnel_metadata.py +34 -0
- sky/schemas/db/global_user_state/009_last_activity_and_launched_at.py +89 -0
- sky/schemas/db/global_user_state/010_save_ssh_key.py +66 -0
- sky/schemas/db/global_user_state/011_is_ephemeral.py +34 -0
- sky/schemas/db/kv_cache/001_initial_schema.py +29 -0
- sky/schemas/db/script.py.mako +28 -0
- sky/schemas/db/serve_state/001_initial_schema.py +67 -0
- sky/schemas/db/serve_state/002_yaml_content.py +34 -0
- sky/schemas/db/skypilot_config/001_initial_schema.py +30 -0
- sky/schemas/db/spot_jobs/001_initial_schema.py +97 -0
- sky/schemas/db/spot_jobs/002_cluster_pool.py +42 -0
- sky/schemas/db/spot_jobs/003_pool_hash.py +34 -0
- sky/schemas/db/spot_jobs/004_job_file_contents.py +42 -0
- sky/schemas/db/spot_jobs/005_logs_gc.py +38 -0
- sky/schemas/db/spot_jobs/006_controller_pid_started_at.py +34 -0
- sky/schemas/db/spot_jobs/007_config_file_content.py +34 -0
- sky/schemas/generated/__init__.py +0 -0
- sky/schemas/generated/autostopv1_pb2.py +36 -0
- sky/schemas/generated/autostopv1_pb2.pyi +43 -0
- sky/schemas/generated/autostopv1_pb2_grpc.py +146 -0
- sky/schemas/generated/jobsv1_pb2.py +86 -0
- sky/schemas/generated/jobsv1_pb2.pyi +254 -0
- sky/schemas/generated/jobsv1_pb2_grpc.py +542 -0
- sky/schemas/generated/managed_jobsv1_pb2.py +76 -0
- sky/schemas/generated/managed_jobsv1_pb2.pyi +278 -0
- sky/schemas/generated/managed_jobsv1_pb2_grpc.py +278 -0
- sky/schemas/generated/servev1_pb2.py +58 -0
- sky/schemas/generated/servev1_pb2.pyi +115 -0
- sky/schemas/generated/servev1_pb2_grpc.py +322 -0
- sky/serve/autoscalers.py +357 -5
- sky/serve/client/impl.py +310 -0
- sky/serve/client/sdk.py +47 -139
- sky/serve/client/sdk_async.py +130 -0
- sky/serve/constants.py +12 -9
- sky/serve/controller.py +68 -17
- sky/serve/load_balancer.py +106 -60
- sky/serve/load_balancing_policies.py +116 -2
- sky/serve/replica_managers.py +434 -249
- sky/serve/serve_rpc_utils.py +179 -0
- sky/serve/serve_state.py +569 -257
- sky/serve/serve_utils.py +775 -265
- sky/serve/server/core.py +66 -711
- sky/serve/server/impl.py +1093 -0
- sky/serve/server/server.py +21 -18
- sky/serve/service.py +192 -89
- sky/serve/service_spec.py +144 -20
- sky/serve/spot_placer.py +3 -0
- sky/server/auth/__init__.py +0 -0
- sky/server/auth/authn.py +50 -0
- sky/server/auth/loopback.py +38 -0
- sky/server/auth/oauth2_proxy.py +202 -0
- sky/server/common.py +478 -182
- sky/server/config.py +85 -23
- sky/server/constants.py +44 -6
- sky/server/daemons.py +295 -0
- sky/server/html/token_page.html +185 -0
- sky/server/metrics.py +160 -0
- sky/server/middleware_utils.py +166 -0
- sky/server/requests/executor.py +558 -138
- sky/server/requests/payloads.py +364 -24
- sky/server/requests/preconditions.py +21 -17
- sky/server/requests/process.py +112 -29
- sky/server/requests/request_names.py +121 -0
- sky/server/requests/requests.py +822 -226
- sky/server/requests/serializers/decoders.py +82 -31
- sky/server/requests/serializers/encoders.py +140 -22
- sky/server/requests/threads.py +117 -0
- sky/server/rest.py +455 -0
- sky/server/server.py +1309 -285
- sky/server/state.py +20 -0
- sky/server/stream_utils.py +327 -61
- sky/server/uvicorn.py +217 -3
- sky/server/versions.py +270 -0
- sky/setup_files/MANIFEST.in +11 -1
- sky/setup_files/alembic.ini +160 -0
- sky/setup_files/dependencies.py +139 -31
- sky/setup_files/setup.py +44 -42
- sky/sky_logging.py +114 -7
- sky/skylet/attempt_skylet.py +106 -24
- sky/skylet/autostop_lib.py +129 -8
- sky/skylet/configs.py +29 -20
- sky/skylet/constants.py +216 -25
- sky/skylet/events.py +101 -21
- sky/skylet/job_lib.py +345 -164
- sky/skylet/log_lib.py +297 -18
- sky/skylet/log_lib.pyi +44 -1
- sky/skylet/providers/ibm/node_provider.py +12 -8
- sky/skylet/providers/ibm/vpc_provider.py +13 -12
- sky/skylet/ray_patches/__init__.py +17 -3
- sky/skylet/ray_patches/autoscaler.py.diff +18 -0
- sky/skylet/ray_patches/cli.py.diff +19 -0
- sky/skylet/ray_patches/command_runner.py.diff +17 -0
- sky/skylet/ray_patches/log_monitor.py.diff +20 -0
- sky/skylet/ray_patches/resource_demand_scheduler.py.diff +32 -0
- sky/skylet/ray_patches/updater.py.diff +18 -0
- sky/skylet/ray_patches/worker.py.diff +41 -0
- sky/skylet/runtime_utils.py +21 -0
- sky/skylet/services.py +568 -0
- sky/skylet/skylet.py +72 -4
- sky/skylet/subprocess_daemon.py +104 -29
- sky/skypilot_config.py +506 -99
- sky/ssh_node_pools/__init__.py +1 -0
- sky/ssh_node_pools/core.py +135 -0
- sky/ssh_node_pools/server.py +233 -0
- sky/task.py +685 -163
- sky/templates/aws-ray.yml.j2 +11 -3
- sky/templates/azure-ray.yml.j2 +2 -1
- sky/templates/cudo-ray.yml.j2 +1 -0
- sky/templates/do-ray.yml.j2 +2 -1
- sky/templates/fluidstack-ray.yml.j2 +1 -0
- sky/templates/gcp-ray.yml.j2 +62 -1
- sky/templates/hyperbolic-ray.yml.j2 +68 -0
- sky/templates/ibm-ray.yml.j2 +2 -1
- sky/templates/jobs-controller.yaml.j2 +27 -24
- sky/templates/kubernetes-loadbalancer.yml.j2 +2 -0
- sky/templates/kubernetes-ray.yml.j2 +611 -50
- sky/templates/lambda-ray.yml.j2 +2 -1
- sky/templates/nebius-ray.yml.j2 +34 -12
- sky/templates/oci-ray.yml.j2 +1 -0
- sky/templates/paperspace-ray.yml.j2 +2 -1
- sky/templates/primeintellect-ray.yml.j2 +72 -0
- sky/templates/runpod-ray.yml.j2 +10 -1
- sky/templates/scp-ray.yml.j2 +4 -50
- sky/templates/seeweb-ray.yml.j2 +171 -0
- sky/templates/shadeform-ray.yml.j2 +73 -0
- sky/templates/sky-serve-controller.yaml.j2 +22 -2
- sky/templates/vast-ray.yml.j2 +1 -0
- sky/templates/vsphere-ray.yml.j2 +1 -0
- sky/templates/websocket_proxy.py +212 -37
- sky/usage/usage_lib.py +31 -15
- sky/users/__init__.py +0 -0
- sky/users/model.conf +15 -0
- sky/users/permission.py +397 -0
- sky/users/rbac.py +121 -0
- sky/users/server.py +720 -0
- sky/users/token_service.py +218 -0
- sky/utils/accelerator_registry.py +35 -5
- sky/utils/admin_policy_utils.py +84 -38
- sky/utils/annotations.py +38 -5
- sky/utils/asyncio_utils.py +78 -0
- sky/utils/atomic.py +1 -1
- sky/utils/auth_utils.py +153 -0
- sky/utils/benchmark_utils.py +60 -0
- sky/utils/cli_utils/status_utils.py +159 -86
- sky/utils/cluster_utils.py +31 -9
- sky/utils/command_runner.py +354 -68
- sky/utils/command_runner.pyi +93 -3
- sky/utils/common.py +35 -8
- sky/utils/common_utils.py +314 -91
- sky/utils/config_utils.py +74 -5
- sky/utils/context.py +403 -0
- sky/utils/context_utils.py +242 -0
- sky/utils/controller_utils.py +383 -89
- sky/utils/dag_utils.py +31 -12
- sky/utils/db/__init__.py +0 -0
- sky/utils/db/db_utils.py +485 -0
- sky/utils/db/kv_cache.py +149 -0
- sky/utils/db/migration_utils.py +137 -0
- sky/utils/directory_utils.py +12 -0
- sky/utils/env_options.py +13 -0
- sky/utils/git.py +567 -0
- sky/utils/git_clone.sh +460 -0
- sky/utils/infra_utils.py +195 -0
- sky/utils/kubernetes/cleanup-tunnel.sh +62 -0
- sky/utils/kubernetes/config_map_utils.py +133 -0
- sky/utils/kubernetes/create_cluster.sh +15 -29
- sky/utils/kubernetes/delete_cluster.sh +10 -7
- sky/utils/kubernetes/deploy_ssh_node_pools.py +1177 -0
- sky/utils/kubernetes/exec_kubeconfig_converter.py +22 -31
- sky/utils/kubernetes/generate_kind_config.py +6 -66
- sky/utils/kubernetes/generate_kubeconfig.sh +4 -1
- sky/utils/kubernetes/gpu_labeler.py +18 -8
- sky/utils/kubernetes/k8s_gpu_labeler_job.yaml +2 -1
- sky/utils/kubernetes/k8s_gpu_labeler_setup.yaml +16 -16
- sky/utils/kubernetes/kubernetes_deploy_utils.py +284 -114
- sky/utils/kubernetes/rsync_helper.sh +11 -3
- sky/utils/kubernetes/ssh-tunnel.sh +379 -0
- sky/utils/kubernetes/ssh_utils.py +221 -0
- sky/utils/kubernetes_enums.py +8 -15
- sky/utils/lock_events.py +94 -0
- sky/utils/locks.py +416 -0
- sky/utils/log_utils.py +82 -107
- sky/utils/perf_utils.py +22 -0
- sky/utils/resource_checker.py +298 -0
- sky/utils/resources_utils.py +249 -32
- sky/utils/rich_utils.py +217 -39
- sky/utils/schemas.py +955 -160
- sky/utils/serialize_utils.py +16 -0
- sky/utils/status_lib.py +10 -0
- sky/utils/subprocess_utils.py +29 -15
- sky/utils/tempstore.py +70 -0
- sky/utils/thread_utils.py +91 -0
- sky/utils/timeline.py +26 -53
- sky/utils/ux_utils.py +84 -15
- sky/utils/validator.py +11 -1
- sky/utils/volume.py +165 -0
- sky/utils/yaml_utils.py +111 -0
- sky/volumes/__init__.py +13 -0
- sky/volumes/client/__init__.py +0 -0
- sky/volumes/client/sdk.py +150 -0
- sky/volumes/server/__init__.py +0 -0
- sky/volumes/server/core.py +270 -0
- sky/volumes/server/server.py +124 -0
- sky/volumes/volume.py +215 -0
- sky/workspaces/__init__.py +0 -0
- sky/workspaces/core.py +655 -0
- sky/workspaces/server.py +101 -0
- sky/workspaces/utils.py +56 -0
- sky_templates/README.md +3 -0
- sky_templates/__init__.py +3 -0
- sky_templates/ray/__init__.py +0 -0
- sky_templates/ray/start_cluster +183 -0
- sky_templates/ray/stop_cluster +75 -0
- skypilot_nightly-1.0.0.dev20251203.dist-info/METADATA +676 -0
- skypilot_nightly-1.0.0.dev20251203.dist-info/RECORD +611 -0
- {skypilot_nightly-1.0.0.dev20250502.dist-info → skypilot_nightly-1.0.0.dev20251203.dist-info}/WHEEL +1 -1
- skypilot_nightly-1.0.0.dev20251203.dist-info/top_level.txt +2 -0
- sky/benchmark/benchmark_state.py +0 -256
- sky/benchmark/benchmark_utils.py +0 -641
- sky/clouds/service_catalog/constants.py +0 -7
- sky/dashboard/out/_next/static/GWvVBSCS7FmUiVmjaL1a7/_buildManifest.js +0 -1
- sky/dashboard/out/_next/static/chunks/236-2db3ee3fba33dd9e.js +0 -6
- sky/dashboard/out/_next/static/chunks/312-c3c8845990db8ffc.js +0 -15
- sky/dashboard/out/_next/static/chunks/37-0a572fe0dbb89c4d.js +0 -6
- sky/dashboard/out/_next/static/chunks/678-206dddca808e6d16.js +0 -59
- sky/dashboard/out/_next/static/chunks/845-9e60713e0c441abc.js +0 -1
- sky/dashboard/out/_next/static/chunks/979-7bf73a4c7cea0f5c.js +0 -1
- sky/dashboard/out/_next/static/chunks/fd9d1056-2821b0f0cabcd8bd.js +0 -1
- sky/dashboard/out/_next/static/chunks/framework-87d061ee6ed71b28.js +0 -33
- sky/dashboard/out/_next/static/chunks/main-app-241eb28595532291.js +0 -1
- sky/dashboard/out/_next/static/chunks/main-e0e2335212e72357.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/_app-e6b013bc3f77ad60.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/_error-1be831200e60c5c0.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-6ac338bc2239cb45.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-f383db7389368ea7.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/clusters-a93b93e10b8b074e.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/index-f9f039532ca8cbc4.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-1c519e1afc523dc9.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/jobs-a75029b67aab6a2e.js +0 -1
- sky/dashboard/out/_next/static/chunks/webpack-830f59b8404e96b8.js +0 -1
- sky/dashboard/out/_next/static/css/c6933bbb2ce7f4dd.css +0 -3
- sky/jobs/dashboard/dashboard.py +0 -223
- sky/jobs/dashboard/static/favicon.ico +0 -0
- sky/jobs/dashboard/templates/index.html +0 -831
- sky/jobs/server/dashboard_utils.py +0 -69
- sky/skylet/providers/scp/__init__.py +0 -2
- sky/skylet/providers/scp/config.py +0 -149
- sky/skylet/providers/scp/node_provider.py +0 -578
- sky/templates/kubernetes-ssh-jump.yml.j2 +0 -94
- sky/utils/db_utils.py +0 -100
- sky/utils/kubernetes/deploy_remote_cluster.sh +0 -308
- sky/utils/kubernetes/ssh_jump_lifecycle_manager.py +0 -191
- skypilot_nightly-1.0.0.dev20250502.dist-info/METADATA +0 -361
- skypilot_nightly-1.0.0.dev20250502.dist-info/RECORD +0 -396
- skypilot_nightly-1.0.0.dev20250502.dist-info/top_level.txt +0 -1
- /sky/{clouds/service_catalog → catalog}/config.py +0 -0
- /sky/{benchmark → catalog/data_fetchers}/__init__.py +0 -0
- /sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_azure.py +0 -0
- /sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_fluidstack.py +0 -0
- /sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_ibm.py +0 -0
- /sky/{clouds/service_catalog/data_fetchers → client/cli}/__init__.py +0 -0
- /sky/dashboard/out/_next/static/{GWvVBSCS7FmUiVmjaL1a7 → 96_E2yl3QAiIJGOYCkSpB}/_ssgManifest.js +0 -0
- {skypilot_nightly-1.0.0.dev20250502.dist-info → skypilot_nightly-1.0.0.dev20251203.dist-info}/entry_points.txt +0 -0
- {skypilot_nightly-1.0.0.dev20250502.dist-info → skypilot_nightly-1.0.0.dev20251203.dist-info}/licenses/LICENSE +0 -0
|
@@ -5,10 +5,19 @@ the 'command' field in the exec configuration, leaving only the executable name.
|
|
|
5
5
|
This is useful when moving between different environments where auth plugin
|
|
6
6
|
executables might be installed in different locations.
|
|
7
7
|
|
|
8
|
+
For Nebius kubeconfigs, it also changes the --profile argument to 'sky' to
|
|
9
|
+
ensure compatibility with SkyPilot's expected profile configuration.
|
|
10
|
+
|
|
8
11
|
It assumes the target environment has the auth executable available in PATH.
|
|
9
12
|
If not, you'll need to update your environment container to include the auth
|
|
10
13
|
executable in PATH.
|
|
11
14
|
|
|
15
|
+
When using LOCAL_CREDENTIALS (aka exec auth) with Kubernetes, though, SkyPilot
|
|
16
|
+
will automatically inject a wrapper script for common exec auth providers like
|
|
17
|
+
GKE and EKS. This wrapper script helps to resolve path issues that may arise
|
|
18
|
+
from executables installed on non system-default paths. Thus, the kubeconfig
|
|
19
|
+
file may look different on the sky jobs controller.
|
|
20
|
+
|
|
12
21
|
Usage:
|
|
13
22
|
python -m sky.utils.kubernetes.exec_kubeconfig_converter
|
|
14
23
|
"""
|
|
@@ -17,36 +26,7 @@ import os
|
|
|
17
26
|
|
|
18
27
|
import yaml
|
|
19
28
|
|
|
20
|
-
|
|
21
|
-
def strip_auth_plugin_paths(kubeconfig_path: str, output_path: str):
|
|
22
|
-
"""Strip path information from exec plugin commands in a kubeconfig file.
|
|
23
|
-
|
|
24
|
-
Args:
|
|
25
|
-
kubeconfig_path (str): Path to the input kubeconfig file
|
|
26
|
-
output_path (str): Path where the modified kubeconfig will be saved
|
|
27
|
-
"""
|
|
28
|
-
with open(kubeconfig_path, 'r', encoding='utf-8') as file:
|
|
29
|
-
config = yaml.safe_load(file)
|
|
30
|
-
|
|
31
|
-
updated = False
|
|
32
|
-
for user in config.get('users', []):
|
|
33
|
-
exec_info = user.get('user', {}).get('exec', {})
|
|
34
|
-
current_command = exec_info.get('command', '')
|
|
35
|
-
|
|
36
|
-
if current_command:
|
|
37
|
-
# Strip the path and keep only the executable name
|
|
38
|
-
executable = os.path.basename(current_command)
|
|
39
|
-
if executable != current_command:
|
|
40
|
-
exec_info['command'] = executable
|
|
41
|
-
updated = True
|
|
42
|
-
|
|
43
|
-
if updated:
|
|
44
|
-
with open(output_path, 'w', encoding='utf-8') as file:
|
|
45
|
-
yaml.safe_dump(config, file)
|
|
46
|
-
print('Kubeconfig updated with path-less exec auth. '
|
|
47
|
-
f'Saved to {output_path}')
|
|
48
|
-
else:
|
|
49
|
-
print('No updates made. No exec-based auth commands paths found.')
|
|
29
|
+
from sky.provision.kubernetes import utils as kubernetes_utils
|
|
50
30
|
|
|
51
31
|
|
|
52
32
|
def main():
|
|
@@ -66,7 +46,18 @@ def main():
|
|
|
66
46
|
help='Output kubeconfig file path (default: %(default)s)')
|
|
67
47
|
|
|
68
48
|
args = parser.parse_args()
|
|
69
|
-
|
|
49
|
+
|
|
50
|
+
with open(args.input, 'r', encoding='utf-8') as file:
|
|
51
|
+
config = yaml.safe_load(file)
|
|
52
|
+
|
|
53
|
+
updated = kubernetes_utils.format_kubeconfig_exec_auth(
|
|
54
|
+
config, args.output, False)
|
|
55
|
+
|
|
56
|
+
if updated:
|
|
57
|
+
print('Kubeconfig updated with path-less exec auth. '
|
|
58
|
+
f'Saved to {args.output}')
|
|
59
|
+
else:
|
|
60
|
+
print('No updates made.')
|
|
70
61
|
|
|
71
62
|
|
|
72
63
|
if __name__ == '__main__':
|
|
@@ -3,67 +3,8 @@
|
|
|
3
3
|
Maps specified ports from host to cluster container.
|
|
4
4
|
"""
|
|
5
5
|
import argparse
|
|
6
|
-
import textwrap
|
|
7
|
-
|
|
8
|
-
|
|
9
|
-
def generate_kind_config(path: str,
|
|
10
|
-
port_start: int = 30000,
|
|
11
|
-
port_end: int = 32768,
|
|
12
|
-
num_nodes: int = 1,
|
|
13
|
-
gpus: bool = False) -> None:
|
|
14
|
-
"""Generate a kind cluster config with ports mapped from host to container
|
|
15
|
-
|
|
16
|
-
Args:
|
|
17
|
-
path: Path to generate the config file at
|
|
18
|
-
port_start: Port range start
|
|
19
|
-
port_end: Port range end
|
|
20
|
-
num_nodes: Number of nodes in the cluster
|
|
21
|
-
gpus: If true, initialize kind cluster with GPU support
|
|
22
|
-
"""
|
|
23
|
-
|
|
24
|
-
preamble = textwrap.dedent(f"""
|
|
25
|
-
apiVersion: kind.x-k8s.io/v1alpha4
|
|
26
|
-
kind: Cluster
|
|
27
|
-
kubeadmConfigPatches:
|
|
28
|
-
- |
|
|
29
|
-
kind: ClusterConfiguration
|
|
30
|
-
apiServer:
|
|
31
|
-
extraArgs:
|
|
32
|
-
"service-node-port-range": {port_start}-{port_end}
|
|
33
|
-
nodes:
|
|
34
|
-
- role: control-plane
|
|
35
|
-
kubeadmConfigPatches:
|
|
36
|
-
- |
|
|
37
|
-
kind: InitConfiguration
|
|
38
|
-
nodeRegistration:
|
|
39
|
-
kubeletExtraArgs:
|
|
40
|
-
node-labels: "ingress-ready=true"
|
|
41
|
-
""")
|
|
42
|
-
if gpus:
|
|
43
|
-
preamble += textwrap.indent(
|
|
44
|
-
textwrap.dedent("""
|
|
45
|
-
extraMounts:
|
|
46
|
-
- hostPath: /dev/null
|
|
47
|
-
containerPath: /var/run/nvidia-container-devices/all"""), ' ' * 2)
|
|
48
|
-
preamble += textwrap.indent(
|
|
49
|
-
textwrap.dedent("""
|
|
50
|
-
extraPortMappings:"""), ' ' * 2)
|
|
51
|
-
suffix = ''
|
|
52
|
-
if num_nodes > 1:
|
|
53
|
-
for _ in range(1, num_nodes):
|
|
54
|
-
suffix += """- role: worker\n"""
|
|
55
|
-
with open(path, 'w', encoding='utf-8') as f:
|
|
56
|
-
f.write(preamble)
|
|
57
|
-
for port in range(port_start, port_end + 1):
|
|
58
|
-
f.write(f"""
|
|
59
|
-
- containerPort: {port}
|
|
60
|
-
hostPort: {port}
|
|
61
|
-
listenAddress: "0.0.0.0"
|
|
62
|
-
protocol: tcp""")
|
|
63
|
-
f.write('\n')
|
|
64
|
-
if suffix:
|
|
65
|
-
f.write(suffix)
|
|
66
6
|
|
|
7
|
+
from sky.utils.kubernetes import kubernetes_deploy_utils
|
|
67
8
|
|
|
68
9
|
if __name__ == '__main__':
|
|
69
10
|
parser = argparse.ArgumentParser(description='Generate a kind cluster '
|
|
@@ -77,10 +18,6 @@ if __name__ == '__main__':
|
|
|
77
18
|
type=int,
|
|
78
19
|
default=30000,
|
|
79
20
|
help='Port range start')
|
|
80
|
-
parser.add_argument('--port-end',
|
|
81
|
-
type=int,
|
|
82
|
-
default=32768,
|
|
83
|
-
help='Port range end')
|
|
84
21
|
parser.add_argument('--num-nodes',
|
|
85
22
|
type=int,
|
|
86
23
|
default=1,
|
|
@@ -90,5 +27,8 @@ if __name__ == '__main__':
|
|
|
90
27
|
action='store_true',
|
|
91
28
|
help='Initialize kind cluster with GPU support')
|
|
92
29
|
args = parser.parse_args()
|
|
93
|
-
|
|
94
|
-
|
|
30
|
+
|
|
31
|
+
with open(args.path, 'w', encoding='utf-8') as f:
|
|
32
|
+
f.write(
|
|
33
|
+
kubernetes_deploy_utils.generate_kind_config(
|
|
34
|
+
args.port_start, args.num_nodes, args.gpus))
|
|
@@ -156,6 +156,9 @@ rules:
|
|
|
156
156
|
- apiGroups: [""] # Required for sky show-gpus command
|
|
157
157
|
resources: ["pods"]
|
|
158
158
|
verbs: ["get", "list"]
|
|
159
|
+
- apiGroups: ["rbac.authorization.k8s.io"] # Required for SkyPilot to inspect its own permissions
|
|
160
|
+
resources: ["clusterroles", "clusterrolebindings", "roles", "rolebindings"]
|
|
161
|
+
verbs: ["get", "list", "watch"]
|
|
159
162
|
---
|
|
160
163
|
# ClusterRoleBinding for the service account
|
|
161
164
|
apiVersion: rbac.authorization.k8s.io/v1
|
|
@@ -316,7 +319,7 @@ EOF
|
|
|
316
319
|
echo "---
|
|
317
320
|
Done!
|
|
318
321
|
|
|
319
|
-
Kubeconfig using service
|
|
322
|
+
Kubeconfig using service account '${SKYPILOT_SA}' in namespace '${NAMESPACE}' written at $(pwd)/kubeconfig
|
|
320
323
|
|
|
321
324
|
Copy the generated kubeconfig file to your ~/.kube/ directory to use it with
|
|
322
325
|
kubectl and skypilot:
|
|
@@ -8,9 +8,9 @@ from typing import Dict, Optional, Tuple
|
|
|
8
8
|
import colorama
|
|
9
9
|
import yaml
|
|
10
10
|
|
|
11
|
-
import sky
|
|
12
11
|
from sky.adaptors import kubernetes
|
|
13
12
|
from sky.provision.kubernetes import utils as kubernetes_utils
|
|
13
|
+
from sky.utils import directory_utils
|
|
14
14
|
from sky.utils import rich_utils
|
|
15
15
|
|
|
16
16
|
|
|
@@ -40,7 +40,9 @@ def cleanup(context: Optional[str] = None) -> Tuple[bool, str]:
|
|
|
40
40
|
success = True
|
|
41
41
|
except subprocess.CalledProcessError as e:
|
|
42
42
|
output = e.output.decode('utf-8')
|
|
43
|
-
|
|
43
|
+
stderr = e.stderr.decode('utf-8')
|
|
44
|
+
reason = ('Error deleting existing GPU labeler resources: ' +
|
|
45
|
+
output + stderr)
|
|
44
46
|
return success, reason
|
|
45
47
|
|
|
46
48
|
|
|
@@ -62,8 +64,8 @@ def label(context: Optional[str] = None, wait_for_completion: bool = True):
|
|
|
62
64
|
if not unlabeled_gpu_nodes:
|
|
63
65
|
print('No unlabeled GPU nodes found in the cluster. If you have '
|
|
64
66
|
'unlabeled GPU nodes, please ensure that they have the resource '
|
|
65
|
-
f'`{kubernetes_utils.get_gpu_resource_key()}: <number of GPUs>` '
|
|
66
|
-
'in their capacity.')
|
|
67
|
+
f'`{kubernetes_utils.get_gpu_resource_key(context)}: '
|
|
68
|
+
'<number of GPUs>` in their capacity.')
|
|
67
69
|
return
|
|
68
70
|
|
|
69
71
|
print(
|
|
@@ -71,8 +73,8 @@ def label(context: Optional[str] = None, wait_for_completion: bool = True):
|
|
|
71
73
|
f'Found {len(unlabeled_gpu_nodes)} '
|
|
72
74
|
'unlabeled GPU nodes in the cluster', colorama.Fore.YELLOW))
|
|
73
75
|
|
|
74
|
-
|
|
75
|
-
|
|
76
|
+
manifest_dir = os.path.join(directory_utils.get_sky_dir(),
|
|
77
|
+
'utils/kubernetes')
|
|
76
78
|
|
|
77
79
|
# Apply the RBAC manifest using kubectl since it contains multiple resources
|
|
78
80
|
with rich_utils.client_status('Setting up GPU labeling'):
|
|
@@ -183,9 +185,17 @@ def wait_for_jobs_completion(jobs_to_node_names: Dict[str, str],
|
|
|
183
185
|
batch_v1 = kubernetes.batch_api(context=context)
|
|
184
186
|
w = kubernetes.watch()
|
|
185
187
|
completed_jobs = []
|
|
188
|
+
# Use resource_version="0" to start from the oldest available version.
|
|
189
|
+
# In multi-replica API server environments, replicas may be at different
|
|
190
|
+
# resource versions due to replication lag. Without specifying this, the
|
|
191
|
+
# watch may get version X from one replica but connect to another replica
|
|
192
|
+
# that only has up to version Y < X, causing "Too large resource version"
|
|
193
|
+
# errors. Using "0" ensures all replicas can serve the request from their
|
|
194
|
+
# oldest available version, avoiding version mismatches.
|
|
186
195
|
for event in w.stream(func=batch_v1.list_namespaced_job,
|
|
187
196
|
namespace=namespace,
|
|
188
|
-
timeout_seconds=timeout):
|
|
197
|
+
timeout_seconds=timeout,
|
|
198
|
+
resource_version='0'):
|
|
189
199
|
job = event['object']
|
|
190
200
|
job_name = job.metadata.name
|
|
191
201
|
if job_name in jobs_to_node_names:
|
|
@@ -212,7 +222,7 @@ def wait_for_jobs_completion(jobs_to_node_names: Dict[str, str],
|
|
|
212
222
|
_format_string(
|
|
213
223
|
f'Timed out after waiting {timeout} seconds '
|
|
214
224
|
'for job to complete', colorama.Style.DIM))
|
|
215
|
-
return False #Timed out
|
|
225
|
+
return False # Timed out
|
|
216
226
|
|
|
217
227
|
|
|
218
228
|
def main():
|
|
@@ -13,10 +13,11 @@ spec:
|
|
|
13
13
|
serviceAccountName: gpu-labeler-sa
|
|
14
14
|
containers:
|
|
15
15
|
- name: gpu-labeler
|
|
16
|
-
image: us-
|
|
16
|
+
image: us-docker.pkg.dev/sky-dev-465/skypilotk8s/skypilot-gpu:latest # Using this image also serves as a way to "pre-pull" the image onto nodes
|
|
17
17
|
command: ["/bin/bash", "-i", "-c"]
|
|
18
18
|
args:
|
|
19
19
|
- |
|
|
20
|
+
source ~/skypilot-runtime/bin/activate
|
|
20
21
|
python /label_gpus.py
|
|
21
22
|
env:
|
|
22
23
|
- name: MY_NODE_NAME
|
|
@@ -53,51 +53,51 @@ data:
|
|
|
53
53
|
import os
|
|
54
54
|
import subprocess
|
|
55
55
|
from typing import Optional
|
|
56
|
-
|
|
56
|
+
|
|
57
57
|
from kubernetes import client
|
|
58
58
|
from kubernetes import config
|
|
59
|
-
|
|
59
|
+
|
|
60
60
|
canonical_gpu_names = [
|
|
61
|
-
'A100-80GB', 'A100', 'A10G', 'H100', 'K80', 'M60', 'T4g', 'T4', 'V100',
|
|
61
|
+
'A100-80GB', 'A100', 'A10G', 'H100', 'K80', 'M60', 'T4g', 'T4', 'V100',
|
|
62
62
|
'A10', 'P100', 'P40', 'P4', 'L4'
|
|
63
63
|
]
|
|
64
|
-
|
|
65
|
-
|
|
64
|
+
|
|
65
|
+
|
|
66
66
|
def get_gpu_name() -> Optional[str]:
|
|
67
67
|
try:
|
|
68
68
|
result = subprocess.run(
|
|
69
69
|
['nvidia-smi', '--query-gpu=name', '--format=csv,noheader,nounits'],
|
|
70
70
|
stdout=subprocess.PIPE)
|
|
71
71
|
gpu_name = result.stdout.decode('utf-8').strip()
|
|
72
|
-
# In the case of multi-gpu nodes, we assume the node is homogenous and
|
|
72
|
+
# In the case of multi-gpu nodes, we assume the node is homogenous and
|
|
73
73
|
# just use the first GPU name.
|
|
74
74
|
gpu_name = gpu_name.split('\n')[0]
|
|
75
75
|
return gpu_name.lower()
|
|
76
76
|
except Exception as e:
|
|
77
77
|
print(f'Error getting GPU name: {e}')
|
|
78
78
|
return None
|
|
79
|
-
|
|
80
|
-
|
|
79
|
+
|
|
80
|
+
|
|
81
81
|
def label_node(gpu_name: str) -> None:
|
|
82
82
|
try:
|
|
83
83
|
config.load_incluster_config() # Load in-cluster configuration
|
|
84
84
|
v1 = client.CoreV1Api()
|
|
85
|
-
|
|
85
|
+
|
|
86
86
|
# Fetch the current node's name from the environment variable
|
|
87
87
|
node_name = os.environ.get('MY_NODE_NAME')
|
|
88
88
|
if not node_name:
|
|
89
89
|
raise ValueError('Failed to get node name from environment')
|
|
90
|
-
|
|
90
|
+
|
|
91
91
|
# Label the node with the GPU name
|
|
92
92
|
body = {'metadata': {'labels': {'skypilot.co/accelerator': gpu_name}}}
|
|
93
93
|
v1.patch_node(node_name, body)
|
|
94
|
-
|
|
94
|
+
|
|
95
95
|
print(f'Labeled node {node_name} with GPU {gpu_name}')
|
|
96
|
-
|
|
96
|
+
|
|
97
97
|
except Exception as e:
|
|
98
98
|
print(f'Error labeling node: {e}')
|
|
99
|
-
|
|
100
|
-
|
|
99
|
+
|
|
100
|
+
|
|
101
101
|
def main():
|
|
102
102
|
gpu_name = get_gpu_name()
|
|
103
103
|
if gpu_name is not None:
|
|
@@ -119,7 +119,7 @@ data:
|
|
|
119
119
|
labelled = True
|
|
120
120
|
else:
|
|
121
121
|
print('No GPU detected. Try running nvidia-smi in the container.')
|
|
122
|
-
|
|
123
|
-
|
|
122
|
+
|
|
123
|
+
|
|
124
124
|
if __name__ == '__main__':
|
|
125
125
|
main()
|