skypilot-nightly 1.0.0.dev20250502__py3-none-any.whl → 1.0.0.dev20251203__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sky/__init__.py +22 -6
- sky/adaptors/aws.py +81 -16
- sky/adaptors/common.py +25 -2
- sky/adaptors/coreweave.py +278 -0
- sky/adaptors/do.py +8 -2
- sky/adaptors/gcp.py +11 -0
- sky/adaptors/hyperbolic.py +8 -0
- sky/adaptors/ibm.py +5 -2
- sky/adaptors/kubernetes.py +149 -18
- sky/adaptors/nebius.py +173 -30
- sky/adaptors/primeintellect.py +1 -0
- sky/adaptors/runpod.py +68 -0
- sky/adaptors/seeweb.py +183 -0
- sky/adaptors/shadeform.py +89 -0
- sky/admin_policy.py +187 -4
- sky/authentication.py +179 -225
- sky/backends/__init__.py +4 -2
- sky/backends/backend.py +22 -9
- sky/backends/backend_utils.py +1323 -397
- sky/backends/cloud_vm_ray_backend.py +1749 -1029
- sky/backends/docker_utils.py +1 -1
- sky/backends/local_docker_backend.py +11 -6
- sky/backends/task_codegen.py +633 -0
- sky/backends/wheel_utils.py +55 -9
- sky/{clouds/service_catalog → catalog}/__init__.py +21 -19
- sky/{clouds/service_catalog → catalog}/aws_catalog.py +27 -8
- sky/{clouds/service_catalog → catalog}/azure_catalog.py +10 -7
- sky/{clouds/service_catalog → catalog}/common.py +90 -49
- sky/{clouds/service_catalog → catalog}/cudo_catalog.py +8 -5
- sky/{clouds/service_catalog → catalog}/data_fetchers/analyze.py +1 -1
- sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_aws.py +116 -80
- sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_cudo.py +38 -38
- sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_gcp.py +70 -16
- sky/catalog/data_fetchers/fetch_hyperbolic.py +136 -0
- sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_lambda_cloud.py +1 -0
- sky/catalog/data_fetchers/fetch_nebius.py +338 -0
- sky/catalog/data_fetchers/fetch_runpod.py +698 -0
- sky/catalog/data_fetchers/fetch_seeweb.py +329 -0
- sky/catalog/data_fetchers/fetch_shadeform.py +142 -0
- sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_vast.py +1 -1
- sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_vsphere.py +1 -1
- sky/{clouds/service_catalog → catalog}/do_catalog.py +5 -2
- sky/{clouds/service_catalog → catalog}/fluidstack_catalog.py +6 -3
- sky/{clouds/service_catalog → catalog}/gcp_catalog.py +41 -15
- sky/catalog/hyperbolic_catalog.py +136 -0
- sky/{clouds/service_catalog → catalog}/ibm_catalog.py +9 -6
- sky/{clouds/service_catalog → catalog}/kubernetes_catalog.py +36 -24
- sky/{clouds/service_catalog → catalog}/lambda_catalog.py +9 -6
- sky/{clouds/service_catalog → catalog}/nebius_catalog.py +9 -7
- sky/{clouds/service_catalog → catalog}/oci_catalog.py +9 -6
- sky/{clouds/service_catalog → catalog}/paperspace_catalog.py +5 -2
- sky/catalog/primeintellect_catalog.py +95 -0
- sky/{clouds/service_catalog → catalog}/runpod_catalog.py +11 -4
- sky/{clouds/service_catalog → catalog}/scp_catalog.py +9 -6
- sky/catalog/seeweb_catalog.py +184 -0
- sky/catalog/shadeform_catalog.py +165 -0
- sky/catalog/ssh_catalog.py +167 -0
- sky/{clouds/service_catalog → catalog}/vast_catalog.py +6 -3
- sky/{clouds/service_catalog → catalog}/vsphere_catalog.py +5 -2
- sky/check.py +533 -185
- sky/cli.py +5 -5975
- sky/client/{cli.py → cli/command.py} +2591 -1956
- sky/client/cli/deprecation_utils.py +99 -0
- sky/client/cli/flags.py +359 -0
- sky/client/cli/table_utils.py +322 -0
- sky/client/cli/utils.py +79 -0
- sky/client/common.py +78 -32
- sky/client/oauth.py +82 -0
- sky/client/sdk.py +1219 -319
- sky/client/sdk_async.py +827 -0
- sky/client/service_account_auth.py +47 -0
- sky/cloud_stores.py +82 -3
- sky/clouds/__init__.py +13 -0
- sky/clouds/aws.py +564 -164
- sky/clouds/azure.py +105 -83
- sky/clouds/cloud.py +140 -40
- sky/clouds/cudo.py +68 -50
- sky/clouds/do.py +66 -48
- sky/clouds/fluidstack.py +63 -44
- sky/clouds/gcp.py +339 -110
- sky/clouds/hyperbolic.py +293 -0
- sky/clouds/ibm.py +70 -49
- sky/clouds/kubernetes.py +570 -162
- sky/clouds/lambda_cloud.py +74 -54
- sky/clouds/nebius.py +210 -81
- sky/clouds/oci.py +88 -66
- sky/clouds/paperspace.py +61 -44
- sky/clouds/primeintellect.py +317 -0
- sky/clouds/runpod.py +164 -74
- sky/clouds/scp.py +89 -86
- sky/clouds/seeweb.py +477 -0
- sky/clouds/shadeform.py +400 -0
- sky/clouds/ssh.py +263 -0
- sky/clouds/utils/aws_utils.py +10 -4
- sky/clouds/utils/gcp_utils.py +87 -11
- sky/clouds/utils/oci_utils.py +38 -14
- sky/clouds/utils/scp_utils.py +231 -167
- sky/clouds/vast.py +99 -77
- sky/clouds/vsphere.py +51 -40
- sky/core.py +375 -173
- sky/dag.py +15 -0
- sky/dashboard/out/404.html +1 -1
- sky/dashboard/out/_next/static/96_E2yl3QAiIJGOYCkSpB/_buildManifest.js +1 -0
- sky/dashboard/out/_next/static/chunks/1141-e6aa9ab418717c59.js +11 -0
- sky/dashboard/out/_next/static/chunks/1272-1ef0bf0237faccdb.js +1 -0
- sky/dashboard/out/_next/static/chunks/1871-7e202677c42f43fe.js +6 -0
- sky/dashboard/out/_next/static/chunks/2260-7703229c33c5ebd5.js +1 -0
- sky/dashboard/out/_next/static/chunks/2350.fab69e61bac57b23.js +1 -0
- sky/dashboard/out/_next/static/chunks/2369.fc20f0c2c8ed9fe7.js +15 -0
- sky/dashboard/out/_next/static/chunks/2755.edd818326d489a1d.js +26 -0
- sky/dashboard/out/_next/static/chunks/3294.20a8540fe697d5ee.js +1 -0
- sky/dashboard/out/_next/static/chunks/3785.7e245f318f9d1121.js +1 -0
- sky/dashboard/out/_next/static/chunks/3800-7b45f9fbb6308557.js +1 -0
- sky/dashboard/out/_next/static/chunks/3850-ff4a9a69d978632b.js +1 -0
- sky/dashboard/out/_next/static/chunks/3937.210053269f121201.js +1 -0
- sky/dashboard/out/_next/static/chunks/4725.172ede95d1b21022.js +1 -0
- sky/dashboard/out/_next/static/chunks/4937.a2baa2df5572a276.js +15 -0
- sky/dashboard/out/_next/static/chunks/5739-d67458fcb1386c92.js +8 -0
- sky/dashboard/out/_next/static/chunks/6130-2be46d70a38f1e82.js +1 -0
- sky/dashboard/out/_next/static/chunks/616-3d59f75e2ccf9321.js +39 -0
- sky/dashboard/out/_next/static/chunks/6212-7bd06f60ba693125.js +13 -0
- sky/dashboard/out/_next/static/chunks/6856-8f27d1c10c98def8.js +1 -0
- sky/dashboard/out/_next/static/chunks/6989-01359c57e018caa4.js +1 -0
- sky/dashboard/out/_next/static/chunks/6990-9146207c4567fdfd.js +1 -0
- sky/dashboard/out/_next/static/chunks/7359-c8d04e06886000b3.js +30 -0
- sky/dashboard/out/_next/static/chunks/7411-b15471acd2cba716.js +41 -0
- sky/dashboard/out/_next/static/chunks/7615-019513abc55b3b47.js +1 -0
- sky/dashboard/out/_next/static/chunks/8640.5b9475a2d18c5416.js +16 -0
- sky/dashboard/out/_next/static/chunks/8969-452f9d5cbdd2dc73.js +1 -0
- sky/dashboard/out/_next/static/chunks/9025.fa408f3242e9028d.js +6 -0
- sky/dashboard/out/_next/static/chunks/9353-cff34f7e773b2e2b.js +1 -0
- sky/dashboard/out/_next/static/chunks/9360.a536cf6b1fa42355.js +31 -0
- sky/dashboard/out/_next/static/chunks/9847.3aaca6bb33455140.js +30 -0
- sky/dashboard/out/_next/static/chunks/fd9d1056-86323a29a8f7e46a.js +1 -0
- sky/dashboard/out/_next/static/chunks/framework-cf60a09ccd051a10.js +33 -0
- sky/dashboard/out/_next/static/chunks/main-app-587214043926b3cc.js +1 -0
- sky/dashboard/out/_next/static/chunks/main-f15ccb73239a3bf1.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/_app-bde01e4a2beec258.js +34 -0
- sky/dashboard/out/_next/static/chunks/pages/_error-c66a4e8afc46f17b.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-792db96d918c98c9.js +16 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-abfcac9c137aa543.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters-ee39056f9851a3ff.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/config-dfb9bf07b13045f4.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/index-444f1804401f04ea.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/infra/[context]-c0b5935149902e6f.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/infra-aed0ea19df7cf961.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-d66997e2bfc837cf.js +16 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs/pools/[pool]-9faf940b253e3e06.js +21 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs-2072b48b617989c9.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/users-f42674164aa73423.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/volumes-b84b948ff357c43e.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/workspace/new-3f88a1c7e86a3f86.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/workspaces/[name]-84a40f8c7c627fe4.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/workspaces-531b2f8c4bf89f82.js +1 -0
- sky/dashboard/out/_next/static/chunks/webpack-64e05f17bf2cf8ce.js +1 -0
- sky/dashboard/out/_next/static/css/0748ce22df867032.css +3 -0
- sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
- sky/dashboard/out/clusters/[cluster].html +1 -1
- sky/dashboard/out/clusters.html +1 -1
- sky/dashboard/out/config.html +1 -0
- sky/dashboard/out/index.html +1 -1
- sky/dashboard/out/infra/[context].html +1 -0
- sky/dashboard/out/infra.html +1 -0
- sky/dashboard/out/jobs/[job].html +1 -1
- sky/dashboard/out/jobs/pools/[pool].html +1 -0
- sky/dashboard/out/jobs.html +1 -1
- sky/dashboard/out/users.html +1 -0
- sky/dashboard/out/volumes.html +1 -0
- sky/dashboard/out/workspace/new.html +1 -0
- sky/dashboard/out/workspaces/[name].html +1 -0
- sky/dashboard/out/workspaces.html +1 -0
- sky/data/data_utils.py +137 -1
- sky/data/mounting_utils.py +269 -84
- sky/data/storage.py +1460 -1807
- sky/data/storage_utils.py +43 -57
- sky/exceptions.py +126 -2
- sky/execution.py +216 -63
- sky/global_user_state.py +2390 -586
- sky/jobs/__init__.py +7 -0
- sky/jobs/client/sdk.py +300 -58
- sky/jobs/client/sdk_async.py +161 -0
- sky/jobs/constants.py +15 -8
- sky/jobs/controller.py +848 -275
- sky/jobs/file_content_utils.py +128 -0
- sky/jobs/log_gc.py +193 -0
- sky/jobs/recovery_strategy.py +402 -152
- sky/jobs/scheduler.py +314 -189
- sky/jobs/server/core.py +836 -255
- sky/jobs/server/server.py +156 -115
- sky/jobs/server/utils.py +136 -0
- sky/jobs/state.py +2109 -706
- sky/jobs/utils.py +1306 -215
- sky/logs/__init__.py +21 -0
- sky/logs/agent.py +108 -0
- sky/logs/aws.py +243 -0
- sky/logs/gcp.py +91 -0
- sky/metrics/__init__.py +0 -0
- sky/metrics/utils.py +453 -0
- sky/models.py +78 -1
- sky/optimizer.py +164 -70
- sky/provision/__init__.py +90 -4
- sky/provision/aws/config.py +147 -26
- sky/provision/aws/instance.py +136 -50
- sky/provision/azure/instance.py +11 -6
- sky/provision/common.py +13 -1
- sky/provision/cudo/cudo_machine_type.py +1 -1
- sky/provision/cudo/cudo_utils.py +14 -8
- sky/provision/cudo/cudo_wrapper.py +72 -71
- sky/provision/cudo/instance.py +10 -6
- sky/provision/do/instance.py +10 -6
- sky/provision/do/utils.py +4 -3
- sky/provision/docker_utils.py +140 -33
- sky/provision/fluidstack/instance.py +13 -8
- sky/provision/gcp/__init__.py +1 -0
- sky/provision/gcp/config.py +301 -19
- sky/provision/gcp/constants.py +218 -0
- sky/provision/gcp/instance.py +36 -8
- sky/provision/gcp/instance_utils.py +18 -4
- sky/provision/gcp/volume_utils.py +247 -0
- sky/provision/hyperbolic/__init__.py +12 -0
- sky/provision/hyperbolic/config.py +10 -0
- sky/provision/hyperbolic/instance.py +437 -0
- sky/provision/hyperbolic/utils.py +373 -0
- sky/provision/instance_setup.py +101 -20
- sky/provision/kubernetes/__init__.py +5 -0
- sky/provision/kubernetes/config.py +9 -52
- sky/provision/kubernetes/constants.py +17 -0
- sky/provision/kubernetes/instance.py +919 -280
- sky/provision/kubernetes/manifests/fusermount-server-daemonset.yaml +1 -2
- sky/provision/kubernetes/network.py +27 -17
- sky/provision/kubernetes/network_utils.py +44 -43
- sky/provision/kubernetes/utils.py +1221 -534
- sky/provision/kubernetes/volume.py +343 -0
- sky/provision/lambda_cloud/instance.py +22 -16
- sky/provision/nebius/constants.py +50 -0
- sky/provision/nebius/instance.py +19 -6
- sky/provision/nebius/utils.py +237 -137
- sky/provision/oci/instance.py +10 -5
- sky/provision/paperspace/instance.py +10 -7
- sky/provision/paperspace/utils.py +1 -1
- sky/provision/primeintellect/__init__.py +10 -0
- sky/provision/primeintellect/config.py +11 -0
- sky/provision/primeintellect/instance.py +454 -0
- sky/provision/primeintellect/utils.py +398 -0
- sky/provision/provisioner.py +117 -36
- sky/provision/runpod/__init__.py +5 -0
- sky/provision/runpod/instance.py +27 -6
- sky/provision/runpod/utils.py +51 -18
- sky/provision/runpod/volume.py +214 -0
- sky/provision/scp/__init__.py +15 -0
- sky/provision/scp/config.py +93 -0
- sky/provision/scp/instance.py +707 -0
- sky/provision/seeweb/__init__.py +11 -0
- sky/provision/seeweb/config.py +13 -0
- sky/provision/seeweb/instance.py +812 -0
- sky/provision/shadeform/__init__.py +11 -0
- sky/provision/shadeform/config.py +12 -0
- sky/provision/shadeform/instance.py +351 -0
- sky/provision/shadeform/shadeform_utils.py +83 -0
- sky/provision/ssh/__init__.py +18 -0
- sky/provision/vast/instance.py +13 -8
- sky/provision/vast/utils.py +10 -7
- sky/provision/volume.py +164 -0
- sky/provision/vsphere/common/ssl_helper.py +1 -1
- sky/provision/vsphere/common/vapiconnect.py +2 -1
- sky/provision/vsphere/common/vim_utils.py +4 -4
- sky/provision/vsphere/instance.py +15 -10
- sky/provision/vsphere/vsphere_utils.py +17 -20
- sky/py.typed +0 -0
- sky/resources.py +845 -119
- sky/schemas/__init__.py +0 -0
- sky/schemas/api/__init__.py +0 -0
- sky/schemas/api/responses.py +227 -0
- sky/schemas/db/README +4 -0
- sky/schemas/db/env.py +90 -0
- sky/schemas/db/global_user_state/001_initial_schema.py +124 -0
- sky/schemas/db/global_user_state/002_add_workspace_to_cluster_history.py +35 -0
- sky/schemas/db/global_user_state/003_fix_initial_revision.py +61 -0
- sky/schemas/db/global_user_state/004_is_managed.py +34 -0
- sky/schemas/db/global_user_state/005_cluster_event.py +32 -0
- sky/schemas/db/global_user_state/006_provision_log.py +41 -0
- sky/schemas/db/global_user_state/007_cluster_event_request_id.py +34 -0
- sky/schemas/db/global_user_state/008_skylet_ssh_tunnel_metadata.py +34 -0
- sky/schemas/db/global_user_state/009_last_activity_and_launched_at.py +89 -0
- sky/schemas/db/global_user_state/010_save_ssh_key.py +66 -0
- sky/schemas/db/global_user_state/011_is_ephemeral.py +34 -0
- sky/schemas/db/kv_cache/001_initial_schema.py +29 -0
- sky/schemas/db/script.py.mako +28 -0
- sky/schemas/db/serve_state/001_initial_schema.py +67 -0
- sky/schemas/db/serve_state/002_yaml_content.py +34 -0
- sky/schemas/db/skypilot_config/001_initial_schema.py +30 -0
- sky/schemas/db/spot_jobs/001_initial_schema.py +97 -0
- sky/schemas/db/spot_jobs/002_cluster_pool.py +42 -0
- sky/schemas/db/spot_jobs/003_pool_hash.py +34 -0
- sky/schemas/db/spot_jobs/004_job_file_contents.py +42 -0
- sky/schemas/db/spot_jobs/005_logs_gc.py +38 -0
- sky/schemas/db/spot_jobs/006_controller_pid_started_at.py +34 -0
- sky/schemas/db/spot_jobs/007_config_file_content.py +34 -0
- sky/schemas/generated/__init__.py +0 -0
- sky/schemas/generated/autostopv1_pb2.py +36 -0
- sky/schemas/generated/autostopv1_pb2.pyi +43 -0
- sky/schemas/generated/autostopv1_pb2_grpc.py +146 -0
- sky/schemas/generated/jobsv1_pb2.py +86 -0
- sky/schemas/generated/jobsv1_pb2.pyi +254 -0
- sky/schemas/generated/jobsv1_pb2_grpc.py +542 -0
- sky/schemas/generated/managed_jobsv1_pb2.py +76 -0
- sky/schemas/generated/managed_jobsv1_pb2.pyi +278 -0
- sky/schemas/generated/managed_jobsv1_pb2_grpc.py +278 -0
- sky/schemas/generated/servev1_pb2.py +58 -0
- sky/schemas/generated/servev1_pb2.pyi +115 -0
- sky/schemas/generated/servev1_pb2_grpc.py +322 -0
- sky/serve/autoscalers.py +357 -5
- sky/serve/client/impl.py +310 -0
- sky/serve/client/sdk.py +47 -139
- sky/serve/client/sdk_async.py +130 -0
- sky/serve/constants.py +12 -9
- sky/serve/controller.py +68 -17
- sky/serve/load_balancer.py +106 -60
- sky/serve/load_balancing_policies.py +116 -2
- sky/serve/replica_managers.py +434 -249
- sky/serve/serve_rpc_utils.py +179 -0
- sky/serve/serve_state.py +569 -257
- sky/serve/serve_utils.py +775 -265
- sky/serve/server/core.py +66 -711
- sky/serve/server/impl.py +1093 -0
- sky/serve/server/server.py +21 -18
- sky/serve/service.py +192 -89
- sky/serve/service_spec.py +144 -20
- sky/serve/spot_placer.py +3 -0
- sky/server/auth/__init__.py +0 -0
- sky/server/auth/authn.py +50 -0
- sky/server/auth/loopback.py +38 -0
- sky/server/auth/oauth2_proxy.py +202 -0
- sky/server/common.py +478 -182
- sky/server/config.py +85 -23
- sky/server/constants.py +44 -6
- sky/server/daemons.py +295 -0
- sky/server/html/token_page.html +185 -0
- sky/server/metrics.py +160 -0
- sky/server/middleware_utils.py +166 -0
- sky/server/requests/executor.py +558 -138
- sky/server/requests/payloads.py +364 -24
- sky/server/requests/preconditions.py +21 -17
- sky/server/requests/process.py +112 -29
- sky/server/requests/request_names.py +121 -0
- sky/server/requests/requests.py +822 -226
- sky/server/requests/serializers/decoders.py +82 -31
- sky/server/requests/serializers/encoders.py +140 -22
- sky/server/requests/threads.py +117 -0
- sky/server/rest.py +455 -0
- sky/server/server.py +1309 -285
- sky/server/state.py +20 -0
- sky/server/stream_utils.py +327 -61
- sky/server/uvicorn.py +217 -3
- sky/server/versions.py +270 -0
- sky/setup_files/MANIFEST.in +11 -1
- sky/setup_files/alembic.ini +160 -0
- sky/setup_files/dependencies.py +139 -31
- sky/setup_files/setup.py +44 -42
- sky/sky_logging.py +114 -7
- sky/skylet/attempt_skylet.py +106 -24
- sky/skylet/autostop_lib.py +129 -8
- sky/skylet/configs.py +29 -20
- sky/skylet/constants.py +216 -25
- sky/skylet/events.py +101 -21
- sky/skylet/job_lib.py +345 -164
- sky/skylet/log_lib.py +297 -18
- sky/skylet/log_lib.pyi +44 -1
- sky/skylet/providers/ibm/node_provider.py +12 -8
- sky/skylet/providers/ibm/vpc_provider.py +13 -12
- sky/skylet/ray_patches/__init__.py +17 -3
- sky/skylet/ray_patches/autoscaler.py.diff +18 -0
- sky/skylet/ray_patches/cli.py.diff +19 -0
- sky/skylet/ray_patches/command_runner.py.diff +17 -0
- sky/skylet/ray_patches/log_monitor.py.diff +20 -0
- sky/skylet/ray_patches/resource_demand_scheduler.py.diff +32 -0
- sky/skylet/ray_patches/updater.py.diff +18 -0
- sky/skylet/ray_patches/worker.py.diff +41 -0
- sky/skylet/runtime_utils.py +21 -0
- sky/skylet/services.py +568 -0
- sky/skylet/skylet.py +72 -4
- sky/skylet/subprocess_daemon.py +104 -29
- sky/skypilot_config.py +506 -99
- sky/ssh_node_pools/__init__.py +1 -0
- sky/ssh_node_pools/core.py +135 -0
- sky/ssh_node_pools/server.py +233 -0
- sky/task.py +685 -163
- sky/templates/aws-ray.yml.j2 +11 -3
- sky/templates/azure-ray.yml.j2 +2 -1
- sky/templates/cudo-ray.yml.j2 +1 -0
- sky/templates/do-ray.yml.j2 +2 -1
- sky/templates/fluidstack-ray.yml.j2 +1 -0
- sky/templates/gcp-ray.yml.j2 +62 -1
- sky/templates/hyperbolic-ray.yml.j2 +68 -0
- sky/templates/ibm-ray.yml.j2 +2 -1
- sky/templates/jobs-controller.yaml.j2 +27 -24
- sky/templates/kubernetes-loadbalancer.yml.j2 +2 -0
- sky/templates/kubernetes-ray.yml.j2 +611 -50
- sky/templates/lambda-ray.yml.j2 +2 -1
- sky/templates/nebius-ray.yml.j2 +34 -12
- sky/templates/oci-ray.yml.j2 +1 -0
- sky/templates/paperspace-ray.yml.j2 +2 -1
- sky/templates/primeintellect-ray.yml.j2 +72 -0
- sky/templates/runpod-ray.yml.j2 +10 -1
- sky/templates/scp-ray.yml.j2 +4 -50
- sky/templates/seeweb-ray.yml.j2 +171 -0
- sky/templates/shadeform-ray.yml.j2 +73 -0
- sky/templates/sky-serve-controller.yaml.j2 +22 -2
- sky/templates/vast-ray.yml.j2 +1 -0
- sky/templates/vsphere-ray.yml.j2 +1 -0
- sky/templates/websocket_proxy.py +212 -37
- sky/usage/usage_lib.py +31 -15
- sky/users/__init__.py +0 -0
- sky/users/model.conf +15 -0
- sky/users/permission.py +397 -0
- sky/users/rbac.py +121 -0
- sky/users/server.py +720 -0
- sky/users/token_service.py +218 -0
- sky/utils/accelerator_registry.py +35 -5
- sky/utils/admin_policy_utils.py +84 -38
- sky/utils/annotations.py +38 -5
- sky/utils/asyncio_utils.py +78 -0
- sky/utils/atomic.py +1 -1
- sky/utils/auth_utils.py +153 -0
- sky/utils/benchmark_utils.py +60 -0
- sky/utils/cli_utils/status_utils.py +159 -86
- sky/utils/cluster_utils.py +31 -9
- sky/utils/command_runner.py +354 -68
- sky/utils/command_runner.pyi +93 -3
- sky/utils/common.py +35 -8
- sky/utils/common_utils.py +314 -91
- sky/utils/config_utils.py +74 -5
- sky/utils/context.py +403 -0
- sky/utils/context_utils.py +242 -0
- sky/utils/controller_utils.py +383 -89
- sky/utils/dag_utils.py +31 -12
- sky/utils/db/__init__.py +0 -0
- sky/utils/db/db_utils.py +485 -0
- sky/utils/db/kv_cache.py +149 -0
- sky/utils/db/migration_utils.py +137 -0
- sky/utils/directory_utils.py +12 -0
- sky/utils/env_options.py +13 -0
- sky/utils/git.py +567 -0
- sky/utils/git_clone.sh +460 -0
- sky/utils/infra_utils.py +195 -0
- sky/utils/kubernetes/cleanup-tunnel.sh +62 -0
- sky/utils/kubernetes/config_map_utils.py +133 -0
- sky/utils/kubernetes/create_cluster.sh +15 -29
- sky/utils/kubernetes/delete_cluster.sh +10 -7
- sky/utils/kubernetes/deploy_ssh_node_pools.py +1177 -0
- sky/utils/kubernetes/exec_kubeconfig_converter.py +22 -31
- sky/utils/kubernetes/generate_kind_config.py +6 -66
- sky/utils/kubernetes/generate_kubeconfig.sh +4 -1
- sky/utils/kubernetes/gpu_labeler.py +18 -8
- sky/utils/kubernetes/k8s_gpu_labeler_job.yaml +2 -1
- sky/utils/kubernetes/k8s_gpu_labeler_setup.yaml +16 -16
- sky/utils/kubernetes/kubernetes_deploy_utils.py +284 -114
- sky/utils/kubernetes/rsync_helper.sh +11 -3
- sky/utils/kubernetes/ssh-tunnel.sh +379 -0
- sky/utils/kubernetes/ssh_utils.py +221 -0
- sky/utils/kubernetes_enums.py +8 -15
- sky/utils/lock_events.py +94 -0
- sky/utils/locks.py +416 -0
- sky/utils/log_utils.py +82 -107
- sky/utils/perf_utils.py +22 -0
- sky/utils/resource_checker.py +298 -0
- sky/utils/resources_utils.py +249 -32
- sky/utils/rich_utils.py +217 -39
- sky/utils/schemas.py +955 -160
- sky/utils/serialize_utils.py +16 -0
- sky/utils/status_lib.py +10 -0
- sky/utils/subprocess_utils.py +29 -15
- sky/utils/tempstore.py +70 -0
- sky/utils/thread_utils.py +91 -0
- sky/utils/timeline.py +26 -53
- sky/utils/ux_utils.py +84 -15
- sky/utils/validator.py +11 -1
- sky/utils/volume.py +165 -0
- sky/utils/yaml_utils.py +111 -0
- sky/volumes/__init__.py +13 -0
- sky/volumes/client/__init__.py +0 -0
- sky/volumes/client/sdk.py +150 -0
- sky/volumes/server/__init__.py +0 -0
- sky/volumes/server/core.py +270 -0
- sky/volumes/server/server.py +124 -0
- sky/volumes/volume.py +215 -0
- sky/workspaces/__init__.py +0 -0
- sky/workspaces/core.py +655 -0
- sky/workspaces/server.py +101 -0
- sky/workspaces/utils.py +56 -0
- sky_templates/README.md +3 -0
- sky_templates/__init__.py +3 -0
- sky_templates/ray/__init__.py +0 -0
- sky_templates/ray/start_cluster +183 -0
- sky_templates/ray/stop_cluster +75 -0
- skypilot_nightly-1.0.0.dev20251203.dist-info/METADATA +676 -0
- skypilot_nightly-1.0.0.dev20251203.dist-info/RECORD +611 -0
- {skypilot_nightly-1.0.0.dev20250502.dist-info → skypilot_nightly-1.0.0.dev20251203.dist-info}/WHEEL +1 -1
- skypilot_nightly-1.0.0.dev20251203.dist-info/top_level.txt +2 -0
- sky/benchmark/benchmark_state.py +0 -256
- sky/benchmark/benchmark_utils.py +0 -641
- sky/clouds/service_catalog/constants.py +0 -7
- sky/dashboard/out/_next/static/GWvVBSCS7FmUiVmjaL1a7/_buildManifest.js +0 -1
- sky/dashboard/out/_next/static/chunks/236-2db3ee3fba33dd9e.js +0 -6
- sky/dashboard/out/_next/static/chunks/312-c3c8845990db8ffc.js +0 -15
- sky/dashboard/out/_next/static/chunks/37-0a572fe0dbb89c4d.js +0 -6
- sky/dashboard/out/_next/static/chunks/678-206dddca808e6d16.js +0 -59
- sky/dashboard/out/_next/static/chunks/845-9e60713e0c441abc.js +0 -1
- sky/dashboard/out/_next/static/chunks/979-7bf73a4c7cea0f5c.js +0 -1
- sky/dashboard/out/_next/static/chunks/fd9d1056-2821b0f0cabcd8bd.js +0 -1
- sky/dashboard/out/_next/static/chunks/framework-87d061ee6ed71b28.js +0 -33
- sky/dashboard/out/_next/static/chunks/main-app-241eb28595532291.js +0 -1
- sky/dashboard/out/_next/static/chunks/main-e0e2335212e72357.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/_app-e6b013bc3f77ad60.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/_error-1be831200e60c5c0.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-6ac338bc2239cb45.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-f383db7389368ea7.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/clusters-a93b93e10b8b074e.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/index-f9f039532ca8cbc4.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-1c519e1afc523dc9.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/jobs-a75029b67aab6a2e.js +0 -1
- sky/dashboard/out/_next/static/chunks/webpack-830f59b8404e96b8.js +0 -1
- sky/dashboard/out/_next/static/css/c6933bbb2ce7f4dd.css +0 -3
- sky/jobs/dashboard/dashboard.py +0 -223
- sky/jobs/dashboard/static/favicon.ico +0 -0
- sky/jobs/dashboard/templates/index.html +0 -831
- sky/jobs/server/dashboard_utils.py +0 -69
- sky/skylet/providers/scp/__init__.py +0 -2
- sky/skylet/providers/scp/config.py +0 -149
- sky/skylet/providers/scp/node_provider.py +0 -578
- sky/templates/kubernetes-ssh-jump.yml.j2 +0 -94
- sky/utils/db_utils.py +0 -100
- sky/utils/kubernetes/deploy_remote_cluster.sh +0 -308
- sky/utils/kubernetes/ssh_jump_lifecycle_manager.py +0 -191
- skypilot_nightly-1.0.0.dev20250502.dist-info/METADATA +0 -361
- skypilot_nightly-1.0.0.dev20250502.dist-info/RECORD +0 -396
- skypilot_nightly-1.0.0.dev20250502.dist-info/top_level.txt +0 -1
- /sky/{clouds/service_catalog → catalog}/config.py +0 -0
- /sky/{benchmark → catalog/data_fetchers}/__init__.py +0 -0
- /sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_azure.py +0 -0
- /sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_fluidstack.py +0 -0
- /sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_ibm.py +0 -0
- /sky/{clouds/service_catalog/data_fetchers → client/cli}/__init__.py +0 -0
- /sky/dashboard/out/_next/static/{GWvVBSCS7FmUiVmjaL1a7 → 96_E2yl3QAiIJGOYCkSpB}/_ssgManifest.js +0 -0
- {skypilot_nightly-1.0.0.dev20250502.dist-info → skypilot_nightly-1.0.0.dev20251203.dist-info}/entry_points.txt +0 -0
- {skypilot_nightly-1.0.0.dev20250502.dist-info → skypilot_nightly-1.0.0.dev20251203.dist-info}/licenses/LICENSE +0 -0
sky/adaptors/kubernetes.py
CHANGED
|
@@ -1,10 +1,12 @@
|
|
|
1
1
|
"""Kubernetes adaptors"""
|
|
2
|
+
import functools
|
|
2
3
|
import logging
|
|
3
4
|
import os
|
|
5
|
+
import platform
|
|
4
6
|
from typing import Any, Callable, Optional, Set
|
|
5
7
|
|
|
8
|
+
from sky import sky_logging
|
|
6
9
|
from sky.adaptors import common
|
|
7
|
-
from sky.sky_logging import set_logging_level
|
|
8
10
|
from sky.utils import annotations
|
|
9
11
|
from sky.utils import common_utils
|
|
10
12
|
from sky.utils import ux_utils
|
|
@@ -13,12 +15,23 @@ _IMPORT_ERROR_MESSAGE = ('Failed to import dependencies for Kubernetes. '
|
|
|
13
15
|
'Try running: pip install "skypilot[kubernetes]"')
|
|
14
16
|
kubernetes = common.LazyImport('kubernetes',
|
|
15
17
|
import_error_message=_IMPORT_ERROR_MESSAGE)
|
|
18
|
+
models = common.LazyImport('kubernetes.client.models',
|
|
19
|
+
import_error_message=_IMPORT_ERROR_MESSAGE)
|
|
16
20
|
urllib3 = common.LazyImport('urllib3',
|
|
17
21
|
import_error_message=_IMPORT_ERROR_MESSAGE)
|
|
22
|
+
dateutil_parser = common.LazyImport('dateutil.parser',
|
|
23
|
+
import_error_message=_IMPORT_ERROR_MESSAGE)
|
|
18
24
|
|
|
19
25
|
# Timeout to use for API calls
|
|
20
26
|
API_TIMEOUT = 5
|
|
21
27
|
|
|
28
|
+
# Check if KUBECONFIG is set, and use it if it is.
|
|
29
|
+
DEFAULT_KUBECONFIG_PATH = '~/.kube/config'
|
|
30
|
+
# From kubernetes package, keep a copy here to avoid actually importing
|
|
31
|
+
# kubernetes package when parsing the KUBECONFIG env var to do credential
|
|
32
|
+
# file mounts.
|
|
33
|
+
ENV_KUBECONFIG_PATH_SEPARATOR = ';' if platform.system() == 'Windows' else ':'
|
|
34
|
+
|
|
22
35
|
DEFAULT_IN_CLUSTER_REGION = 'in-cluster'
|
|
23
36
|
# The name for the environment variable that stores the in-cluster context name
|
|
24
37
|
# for Kubernetes clusters. This is used to associate a name with the current
|
|
@@ -26,6 +39,8 @@ DEFAULT_IN_CLUSTER_REGION = 'in-cluster'
|
|
|
26
39
|
# set to DEFAULT_IN_CLUSTER_REGION.
|
|
27
40
|
IN_CLUSTER_CONTEXT_NAME_ENV_VAR = 'SKYPILOT_IN_CLUSTER_CONTEXT_NAME'
|
|
28
41
|
|
|
42
|
+
logger = sky_logging.init_logger(__name__)
|
|
43
|
+
|
|
29
44
|
|
|
30
45
|
def _decorate_methods(obj: Any, decorator: Callable, decoration_type: str):
|
|
31
46
|
for attr_name in dir(obj):
|
|
@@ -43,7 +58,7 @@ def _decorate_methods(obj: Any, decorator: Callable, decoration_type: str):
|
|
|
43
58
|
return obj
|
|
44
59
|
|
|
45
60
|
|
|
46
|
-
def _api_logging_decorator(
|
|
61
|
+
def _api_logging_decorator(logger_src: str, level: int):
|
|
47
62
|
"""Decorator to set logging level for API calls.
|
|
48
63
|
|
|
49
64
|
This is used to suppress the verbose logging from urllib3 when calls to the
|
|
@@ -54,7 +69,9 @@ def _api_logging_decorator(logger: str, level: int):
|
|
|
54
69
|
|
|
55
70
|
def wrapped(*args, **kwargs):
|
|
56
71
|
obj = api(*args, **kwargs)
|
|
57
|
-
_decorate_methods(obj,
|
|
72
|
+
_decorate_methods(obj,
|
|
73
|
+
sky_logging.set_logging_level(logger_src, level),
|
|
74
|
+
'api_log')
|
|
58
75
|
return obj
|
|
59
76
|
|
|
60
77
|
return wrapped
|
|
@@ -62,31 +79,61 @@ def _api_logging_decorator(logger: str, level: int):
|
|
|
62
79
|
return decorated_api
|
|
63
80
|
|
|
64
81
|
|
|
82
|
+
def _get_config_file() -> str:
|
|
83
|
+
# Kubernetes loads the kubeconfig from the KUBECONFIG env var on
|
|
84
|
+
# package initialization. So we have to reload the KUBECONFIG env var
|
|
85
|
+
# every time in case the KUBECONFIG env var is changed.
|
|
86
|
+
return os.environ.get('KUBECONFIG', '~/.kube/config')
|
|
87
|
+
|
|
88
|
+
|
|
65
89
|
def _load_config(context: Optional[str] = None):
|
|
66
90
|
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
|
|
67
91
|
|
|
68
92
|
def _load_config_from_kubeconfig(context: Optional[str] = None):
|
|
69
93
|
try:
|
|
70
|
-
kubernetes.config.load_kube_config(
|
|
94
|
+
kubernetes.config.load_kube_config(config_file=_get_config_file(),
|
|
95
|
+
context=context)
|
|
71
96
|
except kubernetes.config.config_exception.ConfigException as e:
|
|
72
97
|
suffix = common_utils.format_exception(e, use_bracket=True)
|
|
73
98
|
context_name = '(current-context)' if context is None else context
|
|
99
|
+
is_ssh_node_pool = False
|
|
100
|
+
if context_name.startswith('ssh-'):
|
|
101
|
+
context_name = common_utils.removeprefix(context_name, 'ssh-')
|
|
102
|
+
is_ssh_node_pool = True
|
|
74
103
|
# Check if exception was due to no current-context
|
|
75
104
|
if 'Expected key current-context' in str(e):
|
|
76
|
-
|
|
77
|
-
|
|
78
|
-
|
|
79
|
-
|
|
80
|
-
|
|
81
|
-
|
|
105
|
+
if is_ssh_node_pool:
|
|
106
|
+
context_name = common_utils.removeprefix(
|
|
107
|
+
context_name, 'ssh-')
|
|
108
|
+
err_str = ('Failed to load SSH Node Pool configuration for '
|
|
109
|
+
f'{context_name!r}.\n'
|
|
110
|
+
' Run `sky ssh up --infra {context_name}` to '
|
|
111
|
+
'set up or repair the cluster.')
|
|
112
|
+
else:
|
|
113
|
+
err_str = (
|
|
114
|
+
'Failed to load Kubernetes configuration for '
|
|
115
|
+
f'{context_name!r}. '
|
|
116
|
+
'Kubeconfig does not contain any valid context(s).'
|
|
117
|
+
f'\n{suffix}\n'
|
|
118
|
+
' If you were running a local Kubernetes '
|
|
119
|
+
'cluster, run `sky local up` to start the cluster.')
|
|
82
120
|
else:
|
|
83
121
|
kubeconfig_path = os.environ.get('KUBECONFIG', '~/.kube/config')
|
|
84
|
-
|
|
85
|
-
|
|
86
|
-
|
|
87
|
-
|
|
88
|
-
|
|
89
|
-
|
|
122
|
+
if is_ssh_node_pool:
|
|
123
|
+
err_str = (
|
|
124
|
+
f'Failed to load SSH Node Pool configuration for '
|
|
125
|
+
f'{context_name!r}. Run `sky ssh up --infra '
|
|
126
|
+
f'{context_name}` to set up or repair the cluster.')
|
|
127
|
+
else:
|
|
128
|
+
err_str = (
|
|
129
|
+
'Failed to load Kubernetes configuration for '
|
|
130
|
+
f'{context_name!r}. Please check if your kubeconfig '
|
|
131
|
+
f'file exists at {kubeconfig_path} and is valid.'
|
|
132
|
+
f'\n{suffix}\n')
|
|
133
|
+
if is_ssh_node_pool:
|
|
134
|
+
err_str += (f'\nTo disable SSH Node Pool {context_name!r}: '
|
|
135
|
+
'run `sky check`.')
|
|
136
|
+
else:
|
|
90
137
|
err_str += (
|
|
91
138
|
'\nHint: Kubernetes attempted to query the current-context '
|
|
92
139
|
'set in kubeconfig. Check if the current-context is valid.')
|
|
@@ -100,8 +147,11 @@ def _load_config(context: Optional[str] = None):
|
|
|
100
147
|
# show up in SkyPilot tasks. For now, we work around by using
|
|
101
148
|
# DNS name instead of environment variables.
|
|
102
149
|
# See issue: https://github.com/skypilot-org/skypilot/issues/2287
|
|
103
|
-
|
|
104
|
-
|
|
150
|
+
# Only set if not already present (preserving existing values)
|
|
151
|
+
if 'KUBERNETES_SERVICE_HOST' not in os.environ:
|
|
152
|
+
os.environ['KUBERNETES_SERVICE_HOST'] = 'kubernetes.default.svc'
|
|
153
|
+
if 'KUBERNETES_SERVICE_PORT' not in os.environ:
|
|
154
|
+
os.environ['KUBERNETES_SERVICE_PORT'] = '443'
|
|
105
155
|
kubernetes.config.load_incluster_config()
|
|
106
156
|
except kubernetes.config.config_exception.ConfigException:
|
|
107
157
|
_load_config_from_kubeconfig()
|
|
@@ -109,8 +159,65 @@ def _load_config(context: Optional[str] = None):
|
|
|
109
159
|
_load_config_from_kubeconfig(context)
|
|
110
160
|
|
|
111
161
|
|
|
162
|
+
def list_kube_config_contexts():
    """List the contexts of the kubeconfig file SkyPilot is using.

    The kubeconfig file is the one returned by ``_get_config_file()``.

    Returns:
        Whatever ``kubernetes.config.list_kube_config_contexts`` returns for
        that file (presumably the ``(contexts, active_context)`` pair of the
        kubernetes Python client -- confirm against the client docs).
    """
    config_file = _get_config_file()
    return kubernetes.config.list_kube_config_contexts(config_file)
|
|
164
|
+
|
|
165
|
+
|
|
166
|
+
class ClientWrapper:
    """Wrapper around the kubernetes API clients.

    This is needed because we cache kubernetes.client.ApiClient and other typed
    clients (e.g. kubernetes.client.CoreV1Api) and lru_cache.cache_clear() does
    not call close() on the client to cleanup external resources like
    semaphores. This class wraps the client and implements __del__ so that the
    external state of the kubernetes client is properly cleaned up on GC.

    All other attribute access is forwarded to the wrapped client, so the
    wrapper can be used wherever the raw client was used.
    """

    def __init__(self, client):
        # The raw kubernetes client (ApiClient, Watch, or a typed API object).
        self._client = client

    def __getattr__(self, name):
        """Delegate to the underlying client.

        Raises:
            AttributeError: if the wrapper has no underlying client (e.g. the
                instance was created without running __init__), or if the
                underlying client does not have the attribute.
        """
        # __getattr__ is only invoked when normal lookup fails. If '_client'
        # itself is missing, `self._client` below would re-enter __getattr__
        # forever and surface as RecursionError (which hasattr() does not
        # swallow). Fail with a regular AttributeError instead.
        if name == '_client':
            raise AttributeError(
                f'{type(self).__name__!r} object has no attribute {name!r}')
        return getattr(self._client, name)

    def __del__(self):
        """Clean up the underlying client"""
        try:
            real_client = None
            if isinstance(self._client, kubernetes.client.ApiClient):
                real_client = self._client
            elif isinstance(self._client, kubernetes.watch.Watch):
                real_client = getattr(self._client, '_api_client', None)
            else:
                # Otherwise, the client is a typed client, the typed client
                # is generated by codegen and all of them should have an
                # 'api_client' attribute referring to the real client.
                real_client = getattr(self._client, 'api_client', None)
            if real_client is not None:
                real_client.close()
            else:
                # logger may already be cleaned up during __del__ at shutdown
                if logger is not None:
                    logger.debug(f'No client found for {self._client}')
        except Exception as e:  # pylint: disable=broad-except
            # Best-effort cleanup: never let a failure escape from __del__.
            if logger is not None:
                logger.debug(f'Error closing Kubernetes client: {e}')
|
|
205
|
+
|
|
206
|
+
|
|
207
|
+
def wrap_kubernetes_client(func):
    """Decorate a client factory so its result is a ``ClientWrapper``.

    Args:
        func: callable returning a kubernetes client object.

    Returns:
        A callable with the same signature metadata as ``func`` (via
        ``functools.wraps``) that returns ``ClientWrapper(func(...))``.
    """

    @functools.wraps(func)
    def _wrapped_factory(*args, **kwargs):
        return ClientWrapper(func(*args, **kwargs))

    return _wrapped_factory
|
|
216
|
+
|
|
217
|
+
|
|
112
218
|
@_api_logging_decorator('urllib3', logging.ERROR)
@annotations.lru_cache(scope='request')
@wrap_kubernetes_client
def core_api(context: Optional[str] = None):
    """Return a ``CoreV1Api`` client after loading config for ``context``.

    The result is passed through ``wrap_kubernetes_client`` and cached by
    ``annotations.lru_cache`` with ``scope='request'``.
    """
    _load_config(context)
    api_cls = kubernetes.client.CoreV1Api
    return api_cls()
|
|
@@ -118,6 +225,15 @@ def core_api(context: Optional[str] = None):
|
|
|
118
225
|
|
|
119
226
|
@_api_logging_decorator('urllib3', logging.ERROR)
@annotations.lru_cache(scope='request')
@wrap_kubernetes_client
def storage_api(context: Optional[str] = None):
    """Return a ``StorageV1Api`` client after loading config for ``context``.

    The result is passed through ``wrap_kubernetes_client`` and cached by
    ``annotations.lru_cache`` with ``scope='request'``.
    """
    _load_config(context)
    api_cls = kubernetes.client.StorageV1Api
    return api_cls()
|
|
232
|
+
|
|
233
|
+
|
|
234
|
+
@_api_logging_decorator('urllib3', logging.ERROR)
@annotations.lru_cache(scope='request')
@wrap_kubernetes_client
def auth_api(context: Optional[str] = None):
    """Return a ``RbacAuthorizationV1Api`` client for ``context``.

    Config for ``context`` is loaded first. The result is passed through
    ``wrap_kubernetes_client`` and cached by ``annotations.lru_cache`` with
    ``scope='request'``.
    """
    _load_config(context)
    api_cls = kubernetes.client.RbacAuthorizationV1Api
    return api_cls()
|
|
@@ -125,6 +241,7 @@ def auth_api(context: Optional[str] = None):
|
|
|
125
241
|
|
|
126
242
|
@_api_logging_decorator('urllib3', logging.ERROR)
@annotations.lru_cache(scope='request')
@wrap_kubernetes_client
def networking_api(context: Optional[str] = None):
    """Return a ``NetworkingV1Api`` client after loading config for ``context``.

    The result is passed through ``wrap_kubernetes_client`` and cached by
    ``annotations.lru_cache`` with ``scope='request'``.
    """
    _load_config(context)
    api_cls = kubernetes.client.NetworkingV1Api
    return api_cls()
|
|
@@ -132,6 +249,7 @@ def networking_api(context: Optional[str] = None):
|
|
|
132
249
|
|
|
133
250
|
@_api_logging_decorator('urllib3', logging.ERROR)
@annotations.lru_cache(scope='request')
@wrap_kubernetes_client
def custom_objects_api(context: Optional[str] = None):
    """Return a ``CustomObjectsApi`` client after loading config for ``context``.

    The result is passed through ``wrap_kubernetes_client`` and cached by
    ``annotations.lru_cache`` with ``scope='request'``.
    """
    _load_config(context)
    api_cls = kubernetes.client.CustomObjectsApi
    return api_cls()
|
|
@@ -139,6 +257,7 @@ def custom_objects_api(context: Optional[str] = None):
|
|
|
139
257
|
|
|
140
258
|
@_api_logging_decorator('urllib3', logging.ERROR)
@annotations.lru_cache(scope='global')
@wrap_kubernetes_client
def node_api(context: Optional[str] = None):
    """Return a ``NodeV1Api`` client after loading config for ``context``.

    The result is passed through ``wrap_kubernetes_client``. Unlike the
    sibling factories it is cached with ``scope='global'``.
    """
    _load_config(context)
    api_cls = kubernetes.client.NodeV1Api
    return api_cls()
|
|
@@ -146,6 +265,7 @@ def node_api(context: Optional[str] = None):
|
|
|
146
265
|
|
|
147
266
|
@_api_logging_decorator('urllib3', logging.ERROR)
@annotations.lru_cache(scope='request')
@wrap_kubernetes_client
def apps_api(context: Optional[str] = None):
    """Return an ``AppsV1Api`` client after loading config for ``context``.

    The result is passed through ``wrap_kubernetes_client`` and cached by
    ``annotations.lru_cache`` with ``scope='request'``.
    """
    _load_config(context)
    api_cls = kubernetes.client.AppsV1Api
    return api_cls()
|
|
@@ -153,6 +273,7 @@ def apps_api(context: Optional[str] = None):
|
|
|
153
273
|
|
|
154
274
|
@_api_logging_decorator('urllib3', logging.ERROR)
@annotations.lru_cache(scope='request')
@wrap_kubernetes_client
def batch_api(context: Optional[str] = None):
    """Return a ``BatchV1Api`` client after loading config for ``context``.

    The result is passed through ``wrap_kubernetes_client`` and cached by
    ``annotations.lru_cache`` with ``scope='request'``.
    """
    _load_config(context)
    api_cls = kubernetes.client.BatchV1Api
    return api_cls()
|
|
@@ -160,6 +281,7 @@ def batch_api(context: Optional[str] = None):
|
|
|
160
281
|
|
|
161
282
|
@_api_logging_decorator('urllib3', logging.ERROR)
@annotations.lru_cache(scope='request')
@wrap_kubernetes_client
def api_client(context: Optional[str] = None):
    """Return a raw ``ApiClient`` after loading config for ``context``.

    The result is passed through ``wrap_kubernetes_client`` and cached by
    ``annotations.lru_cache`` with ``scope='request'``.
    """
    _load_config(context)
    client_cls = kubernetes.client.ApiClient
    return client_cls()
|
|
@@ -167,6 +289,15 @@ def api_client(context: Optional[str] = None):
|
|
|
167
289
|
|
|
168
290
|
@_api_logging_decorator('urllib3', logging.ERROR)
@annotations.lru_cache(scope='request')
@wrap_kubernetes_client
def custom_resources_api(context: Optional[str] = None):
    """Return a ``CustomObjectsApi`` client after loading config for ``context``.

    The result is passed through ``wrap_kubernetes_client`` and cached by
    ``annotations.lru_cache`` with ``scope='request'``.
    """
    # NOTE(review): this constructs the same client type as
    # custom_objects_api(); confirm whether both entry points are needed.
    _load_config(context)
    api_cls = kubernetes.client.CustomObjectsApi
    return api_cls()
|
|
296
|
+
|
|
297
|
+
|
|
298
|
+
@_api_logging_decorator('urllib3', logging.ERROR)
@annotations.lru_cache(scope='request')
@wrap_kubernetes_client
def watch(context: Optional[str] = None):
    """Return a ``kubernetes.watch.Watch`` after loading config for ``context``.

    The result is passed through ``wrap_kubernetes_client`` and cached by
    ``annotations.lru_cache`` with ``scope='request'``.
    """
    _load_config(context)
    watch_cls = kubernetes.watch.Watch
    return watch_cls()
|
sky/adaptors/nebius.py
CHANGED
|
@@ -1,19 +1,106 @@
|
|
|
1
1
|
"""Nebius cloud adaptor."""
|
|
2
|
+
import asyncio
|
|
2
3
|
import os
|
|
3
4
|
import threading
|
|
5
|
+
from typing import Any, Awaitable, List, Optional
|
|
4
6
|
|
|
7
|
+
from sky import sky_logging
|
|
8
|
+
from sky import skypilot_config
|
|
5
9
|
from sky.adaptors import common
|
|
6
10
|
from sky.utils import annotations
|
|
7
11
|
from sky.utils import ux_utils
|
|
8
12
|
|
|
9
|
-
|
|
10
|
-
|
|
11
|
-
|
|
12
|
-
|
|
13
|
-
|
|
14
|
-
|
|
15
|
-
|
|
16
|
-
|
|
13
|
+
# Default read timeout for nebius SDK
# (presumably in seconds -- confirm against the call sites that pass it on).
READ_TIMEOUT = 10

# Module-level logger for this adaptor.
logger = sky_logging.init_logger(__name__)

# Guards creation of the shared background event loop below.
_loop_lock = threading.Lock()
# Lazily created by _get_event_loop(); None until the first call.
_loop = None
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
def _get_event_loop() -> asyncio.AbstractEventLoop:
    """Return the shared event loop for nebius sdk calls, creating it lazily.

    On first use a new event loop is created and started with
    ``run_forever`` on a daemon thread; later calls return the same loop.
    """
    global _loop

    if _loop is None:
        with _loop_lock:
            # Re-check under the lock: another thread may have created the
            # loop while this one was waiting.
            if _loop is None:
                _loop = asyncio.new_event_loop()
                runner = threading.Thread(target=_loop.run_forever,
                                          daemon=True)
                runner.start()

    return _loop
|
|
36
|
+
|
|
37
|
+
|
|
38
|
+
def sync_call(awaitable: Awaitable[Any]) -> Any:
    """Run an awaitable to completion and return its result synchronously.

    This wrapper is used to workaround:
    https://github.com/nebius/pysdk/issues/76

    The awaitable is submitted to the dedicated background event loop from
    ``_get_event_loop()`` and the calling thread blocks on the resulting
    future, which avoids conflicts with existing asyncio contexts and
    prevents BlockingIOError.

    Args:
        awaitable: the awaitable to run.

    Returns:
        The value the awaitable resolves to. An exception raised by the
        awaitable is re-raised here by ``future.result()``.
    """
    background_loop = _get_event_loop()
    wrapped = _coro(awaitable)
    pending = asyncio.run_coroutine_threadsafe(wrapped, background_loop)
    return pending.result()
|
|
50
|
+
|
|
51
|
+
|
|
52
|
+
async def _coro(awaitable: Awaitable[Any]) -> Any:
|
|
53
|
+
"""Wrapper coroutine for awaitable."""
|
|
54
|
+
return await awaitable
|
|
55
|
+
|
|
56
|
+
|
|
57
|
+
def tenant_id_path() -> str:
    """Return the (unexpanded, ``~``-relative) path of the tenant ID file."""
    path = '~/.nebius/NEBIUS_TENANT_ID.txt'
    return path
|
|
59
|
+
|
|
60
|
+
|
|
61
|
+
def iam_token_path() -> str:
    """Return the (unexpanded, ``~``-relative) path of the IAM token file."""
    path = '~/.nebius/NEBIUS_IAM_TOKEN.txt'
    return path
|
|
63
|
+
|
|
64
|
+
|
|
65
|
+
def domain_path() -> str:
    """Return the (unexpanded, ``~``-relative) path of the API domain file."""
    path = '~/.nebius/NEBIUS_DOMAIN.txt'
    return path
|
|
67
|
+
|
|
68
|
+
|
|
69
|
+
def credentials_path() -> str:
    """Return the credentials file path to use for nebius.

    The ``credentials_file_path`` key of the nebius workspace config takes
    precedence; when it is unset, the default credentials path is returned.
    """
    nebius_ws_config = skypilot_config.get_workspace_cloud('nebius')
    configured = nebius_ws_config.get('credentials_file_path', None)
    if configured is None:
        return _get_default_credentials_path()
    return configured
|
|
75
|
+
|
|
76
|
+
|
|
77
|
+
def _get_workspace_credentials_path() -> Optional[str]:
    """Get credentials path if explicitly set in workspace config.

    Returns:
        The ``credentials_file_path`` value of the nebius workspace config,
        or None when the key is absent.
    """
    nebius_ws_config = skypilot_config.get_workspace_cloud('nebius')
    return nebius_ws_config.get('credentials_file_path', None)
|
|
82
|
+
|
|
83
|
+
|
|
84
|
+
def _get_default_credentials_path() -> str:
|
|
85
|
+
"""Get the default credentials path."""
|
|
86
|
+
return '~/.nebius/credentials.json'
|
|
87
|
+
|
|
88
|
+
|
|
89
|
+
def api_domain() -> Optional[str]:
    """Resolve the nebius API domain.

    Lookup order:
      1. ``domain`` in the nebius workspace config.
      2. ``domain`` from ``skypilot_config.get_effective_region_config``.
      3. The stripped contents of the file at ``domain_path()``.

    Returns:
        The first value found, or None if the domain file does not exist.
    """
    ws_domain = skypilot_config.get_workspace_cloud('nebius').get(
        'domain', None)
    if ws_domain is not None:
        return ws_domain

    configured_domain = skypilot_config.get_effective_region_config(
        cloud='nebius', region=None, keys=('domain',), default_value=None)
    if configured_domain is not None:
        return configured_domain

    domain_file = os.path.expanduser(domain_path())
    try:
        with open(domain_file, encoding='utf-8') as handle:
            raw = handle.read()
    except FileNotFoundError:
        return None
    return raw.strip()
|
|
103
|
+
|
|
17
104
|
|
|
18
105
|
DEFAULT_REGION = 'eu-north1'
|
|
19
106
|
|
|
@@ -49,7 +136,9 @@ SKY_CHECK_NAME = 'Nebius (for Nebius Object Storae)'
|
|
|
49
136
|
|
|
50
137
|
|
|
51
138
|
def request_error():
    """Return the ``RequestError`` class from ``nebius.aio.service_error``.

    The import happens inside the function, so the nebius package is only
    imported when this is called.
    """
    # pylint: disable=import-outside-toplevel
    from nebius.aio import service_error as nebius_service_error
    error_cls = nebius_service_error.RequestError
    return error_cls
|
|
53
142
|
|
|
54
143
|
|
|
55
144
|
def compute():
|
|
@@ -64,6 +153,12 @@ def iam():
|
|
|
64
153
|
return iam_v1
|
|
65
154
|
|
|
66
155
|
|
|
156
|
+
def billing():
    """Return the ``nebius.api.nebius.billing.v1alpha1`` module.

    The import happens inside the function, so the nebius package is only
    imported when this is called.
    """
    # pylint: disable=import-outside-toplevel
    from nebius.api.nebius.billing import v1alpha1
    return v1alpha1
|
|
160
|
+
|
|
161
|
+
|
|
67
162
|
def nebius_common():
|
|
68
163
|
# pylint: disable=import-outside-toplevel
|
|
69
164
|
from nebius.api.nebius.common import v1 as common_v1
|
|
@@ -76,49 +171,79 @@ def vpc():
|
|
|
76
171
|
return vpc_v1
|
|
77
172
|
|
|
78
173
|
|
|
79
|
-
@annotations.lru_cache(scope='request')
|
|
80
174
|
def get_iam_token():
    """Read the IAM token from the file at ``iam_token_path()``.

    Returns:
        The file contents with surrounding whitespace stripped, or None if
        the file does not exist.
    """
    token_file = os.path.expanduser(iam_token_path())
    try:
        with open(token_file, encoding='utf-8') as handle:
            raw = handle.read()
    except FileNotFoundError:
        return None
    return raw.strip()
|
|
87
181
|
|
|
88
182
|
|
|
89
|
-
@annotations.lru_cache(scope='request')
|
|
90
183
|
def is_token_or_cred_file_exist():
    """Return True if the IAM token file or the credentials file exists.

    The IAM token path is checked first; the credentials path is only
    resolved when the token file is missing.
    """
    path_getters = (iam_token_path, credentials_path)
    return any(
        os.path.exists(os.path.expanduser(get_path()))
        for get_path in path_getters)
|
|
103
186
|
|
|
104
187
|
|
|
105
|
-
@annotations.lru_cache(scope='request')
|
|
106
188
|
def get_tenant_id():
    """Resolve the nebius tenant ID.

    Lookup order:
      1. ``tenant_id`` in the nebius workspace config.
      2. ``tenant_id`` from ``skypilot_config.get_effective_region_config``.
      3. The stripped contents of the file at ``tenant_id_path()``.

    Returns:
        The first value found, or None if the tenant ID file does not exist.
    """
    ws_tenant_id = skypilot_config.get_workspace_cloud('nebius').get(
        'tenant_id', None)
    if ws_tenant_id is not None:
        return ws_tenant_id

    configured_tenant_id = skypilot_config.get_effective_region_config(
        cloud='nebius', region=None, keys=('tenant_id',), default_value=None)
    if configured_tenant_id is not None:
        return configured_tenant_id

    tenant_file = os.path.expanduser(tenant_id_path())
    try:
        with open(tenant_file, encoding='utf-8') as handle:
            raw = handle.read()
    except FileNotFoundError:
        return None
    return raw.strip()
|
|
113
203
|
|
|
114
204
|
|
|
115
|
-
@annotations.lru_cache(scope='request')
|
|
116
205
|
def sdk():
    """Create the Nebius SDK with the correct credentials.

    The order of priority is:
    1. Credentials file specified in workspace config, if set
    2. IAM token file, if set
    3. Default credentials path

    A warning is logged when both a workspace credentials file and an IAM
    token are available, since the token is then ignored.
    """
    ws_cred_path = _get_workspace_credentials_path()
    iam_token = get_iam_token()

    # 1. Workspace credentials file wins over everything else.
    if ws_cred_path is not None:
        if iam_token is not None:
            logger.warning(
                f'Both workspace credentials file ({ws_cred_path}) and '
                f'IAM token file ({iam_token_path()}) are available. Using '
                'workspace credentials file.')
        return _sdk(None, ws_cred_path)

    # 2. IAM token file.
    if iam_token is not None:
        return _sdk(iam_token, None)

    # 3. Default credentials path as the last resort.
    return _sdk(None, _get_default_credentials_path())
|
|
233
|
+
|
|
234
|
+
|
|
235
|
+
@annotations.lru_cache(scope='request')
def _sdk(token: Optional[str], cred_path: Optional[str]):
    """Build a ``nebius.sdk.SDK`` from either a token or a credentials file.

    Args:
        token: IAM token passed as ``credentials``; None to use ``cred_path``.
        cred_path: credentials file path (``~`` is expanded); None to use
            ``token``.

    Returns:
        The SDK object, constructed with ``domain=api_domain()``.

    Raises:
        AssertionError: unless exactly one of the arguments is not None.
    """
    has_token = token is not None
    has_cred_path = cred_path is not None
    # Exactly one of token or cred_path must be provided
    # NOTE(review): the assertion payload contains the raw token value.
    assert has_token != has_cred_path, (token, cred_path)
    if has_token:
        return nebius.sdk.SDK(credentials=token, domain=api_domain())
    if has_cred_path:
        expanded_cred_path = os.path.expanduser(cred_path)
        return nebius.sdk.SDK(
            credentials_file_name=expanded_cred_path,
            domain=api_domain(),
        )
    raise ValueError('Either token or credentials file path must be provided')
|
|
122
247
|
|
|
123
248
|
|
|
124
249
|
def get_nebius_credentials(boto3_session):
|
|
@@ -196,3 +321,21 @@ def botocore_exceptions():
|
|
|
196
321
|
# pylint: disable=import-outside-toplevel
|
|
197
322
|
from botocore import exceptions
|
|
198
323
|
return exceptions
|
|
324
|
+
|
|
325
|
+
|
|
326
|
+
def get_credential_file_paths() -> List[str]:
    """Get the list of credential file paths based on current configuration.

    Returns:
        The de-duplicated paths in a stable order: tenant ID file, IAM token
        file, the workspace-specific credentials file (if configured), and
        the default credentials file.
    """
    # Always include tenant ID and IAM token paths
    candidates = [tenant_id_path(), iam_token_path()]

    # Add workspace-specific credentials path if set
    workspace_cred_path = _get_workspace_credentials_path()
    if workspace_cred_path is not None:
        candidates.append(workspace_cred_path)
    # Always add default path in case it's needed for fallback
    candidates.append(_get_default_credentials_path())

    # De-duplicate while keeping first-seen order. Going through a set would
    # make the returned order depend on string hashing, which differs from
    # one interpreter run to the next.
    return list(dict.fromkeys(candidates))
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
"""Prime Intellect cloud adaptor."""
|
sky/adaptors/runpod.py
CHANGED
|
@@ -1,8 +1,76 @@
|
|
|
1
1
|
"""RunPod cloud adaptor."""
|
|
2
2
|
|
|
3
|
+
import os
|
|
4
|
+
import time
|
|
5
|
+
from typing import Any, Dict, Optional
|
|
6
|
+
|
|
3
7
|
from sky.adaptors import common
|
|
4
8
|
|
|
5
9
|
runpod = common.LazyImport(
|
|
6
10
|
'runpod',
|
|
7
11
|
import_error_message='Failed to import dependencies for RunPod. '
|
|
8
12
|
'Try running: pip install "skypilot[runpod]"')
|
|
13
|
+
|
|
14
|
+
# Lazy imports
requests = common.LazyImport('requests')

# Base URL that rest_request() prefixes to every request path.
_REST_BASE = 'https://rest.runpod.io/v1'
# rest_request() stops once its attempt counter reaches this value, so this
# is the total number of attempts (first try included), not extra retries.
_MAX_RETRIES = 3
# Passed as `timeout=` to requests.request() for each attempt.
_TIMEOUT = 10
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
def _get_api_key() -> str:
    """Return the RunPod API key as a string.

    ``runpod.api_key`` is used when it is set to a truthy value; otherwise
    the ``RUNPOD_API_KEY`` environment variable is used.

    Raises:
        RuntimeError: if neither source provides a non-empty key.
    """
    key = getattr(runpod, 'api_key', None) or os.environ.get('RUNPOD_API_KEY')
    if key:
        return str(key)
    raise RuntimeError('RunPod API key is not set. Please set runpod.api_key '
                       'or RUNPOD_API_KEY.')
|
|
32
|
+
|
|
33
|
+
|
|
34
|
+
def rest_request(method: str,
                 path: str,
                 json: Optional[Dict[str, Any]] = None) -> Any:
    """Send a request to the RunPod REST API with simple retrying.

    Exceptions raised by the request itself, 5xx responses and 429 responses
    are retried after a one second pause, up to ``_MAX_RETRIES`` attempts in
    total. Other responses with status >= 400 fail immediately.

    Args:
        method: HTTP method passed to ``requests.request``.
        path: path appended to ``_REST_BASE``.
        json: optional JSON body.

    Returns:
        The decoded JSON body; the raw response text if it cannot be decoded
        as JSON; or None when the response body is empty.

    Raises:
        RuntimeError: on an error status, or when the request keeps raising
            after the last attempt.
    """
    endpoint = f'{_REST_BASE}{path}'
    request_headers = {
        'Authorization': f'Bearer {_get_api_key()}',
        'Content-Type': 'application/json',
    }
    tries = 0
    while True:
        tries += 1
        exhausted = tries >= _MAX_RETRIES
        try:
            response = requests.request(method,
                                        endpoint,
                                        headers=request_headers,
                                        json=json,
                                        timeout=_TIMEOUT)
        except Exception as e:  # pylint: disable=broad-except
            # Retry on transient network errors
            if exhausted:
                raise RuntimeError(f'RunPod REST network error: {e}') from e
            time.sleep(1)
            continue

        status = response.status_code
        # Retry on 5xx and 429 while attempts remain.
        if (status >= 500 or status == 429) and not exhausted:
            time.sleep(1)
            continue

        # Either a non-retryable client error, or a retryable status on the
        # final attempt (both are >= 400 and share the same message).
        if status >= 400:
            raise RuntimeError(f'RunPod REST error {status}: {response.text}')

        if not response.text:
            return None
        try:
            return response.json()
        except Exception:  # pylint: disable=broad-except
            return response.text
|