skypilot-nightly 1.0.0.dev20250502__py3-none-any.whl → 1.0.0.dev20251203__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (546) hide show
  1. sky/__init__.py +22 -6
  2. sky/adaptors/aws.py +81 -16
  3. sky/adaptors/common.py +25 -2
  4. sky/adaptors/coreweave.py +278 -0
  5. sky/adaptors/do.py +8 -2
  6. sky/adaptors/gcp.py +11 -0
  7. sky/adaptors/hyperbolic.py +8 -0
  8. sky/adaptors/ibm.py +5 -2
  9. sky/adaptors/kubernetes.py +149 -18
  10. sky/adaptors/nebius.py +173 -30
  11. sky/adaptors/primeintellect.py +1 -0
  12. sky/adaptors/runpod.py +68 -0
  13. sky/adaptors/seeweb.py +183 -0
  14. sky/adaptors/shadeform.py +89 -0
  15. sky/admin_policy.py +187 -4
  16. sky/authentication.py +179 -225
  17. sky/backends/__init__.py +4 -2
  18. sky/backends/backend.py +22 -9
  19. sky/backends/backend_utils.py +1323 -397
  20. sky/backends/cloud_vm_ray_backend.py +1749 -1029
  21. sky/backends/docker_utils.py +1 -1
  22. sky/backends/local_docker_backend.py +11 -6
  23. sky/backends/task_codegen.py +633 -0
  24. sky/backends/wheel_utils.py +55 -9
  25. sky/{clouds/service_catalog → catalog}/__init__.py +21 -19
  26. sky/{clouds/service_catalog → catalog}/aws_catalog.py +27 -8
  27. sky/{clouds/service_catalog → catalog}/azure_catalog.py +10 -7
  28. sky/{clouds/service_catalog → catalog}/common.py +90 -49
  29. sky/{clouds/service_catalog → catalog}/cudo_catalog.py +8 -5
  30. sky/{clouds/service_catalog → catalog}/data_fetchers/analyze.py +1 -1
  31. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_aws.py +116 -80
  32. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_cudo.py +38 -38
  33. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_gcp.py +70 -16
  34. sky/catalog/data_fetchers/fetch_hyperbolic.py +136 -0
  35. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_lambda_cloud.py +1 -0
  36. sky/catalog/data_fetchers/fetch_nebius.py +338 -0
  37. sky/catalog/data_fetchers/fetch_runpod.py +698 -0
  38. sky/catalog/data_fetchers/fetch_seeweb.py +329 -0
  39. sky/catalog/data_fetchers/fetch_shadeform.py +142 -0
  40. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_vast.py +1 -1
  41. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_vsphere.py +1 -1
  42. sky/{clouds/service_catalog → catalog}/do_catalog.py +5 -2
  43. sky/{clouds/service_catalog → catalog}/fluidstack_catalog.py +6 -3
  44. sky/{clouds/service_catalog → catalog}/gcp_catalog.py +41 -15
  45. sky/catalog/hyperbolic_catalog.py +136 -0
  46. sky/{clouds/service_catalog → catalog}/ibm_catalog.py +9 -6
  47. sky/{clouds/service_catalog → catalog}/kubernetes_catalog.py +36 -24
  48. sky/{clouds/service_catalog → catalog}/lambda_catalog.py +9 -6
  49. sky/{clouds/service_catalog → catalog}/nebius_catalog.py +9 -7
  50. sky/{clouds/service_catalog → catalog}/oci_catalog.py +9 -6
  51. sky/{clouds/service_catalog → catalog}/paperspace_catalog.py +5 -2
  52. sky/catalog/primeintellect_catalog.py +95 -0
  53. sky/{clouds/service_catalog → catalog}/runpod_catalog.py +11 -4
  54. sky/{clouds/service_catalog → catalog}/scp_catalog.py +9 -6
  55. sky/catalog/seeweb_catalog.py +184 -0
  56. sky/catalog/shadeform_catalog.py +165 -0
  57. sky/catalog/ssh_catalog.py +167 -0
  58. sky/{clouds/service_catalog → catalog}/vast_catalog.py +6 -3
  59. sky/{clouds/service_catalog → catalog}/vsphere_catalog.py +5 -2
  60. sky/check.py +533 -185
  61. sky/cli.py +5 -5975
  62. sky/client/{cli.py → cli/command.py} +2591 -1956
  63. sky/client/cli/deprecation_utils.py +99 -0
  64. sky/client/cli/flags.py +359 -0
  65. sky/client/cli/table_utils.py +322 -0
  66. sky/client/cli/utils.py +79 -0
  67. sky/client/common.py +78 -32
  68. sky/client/oauth.py +82 -0
  69. sky/client/sdk.py +1219 -319
  70. sky/client/sdk_async.py +827 -0
  71. sky/client/service_account_auth.py +47 -0
  72. sky/cloud_stores.py +82 -3
  73. sky/clouds/__init__.py +13 -0
  74. sky/clouds/aws.py +564 -164
  75. sky/clouds/azure.py +105 -83
  76. sky/clouds/cloud.py +140 -40
  77. sky/clouds/cudo.py +68 -50
  78. sky/clouds/do.py +66 -48
  79. sky/clouds/fluidstack.py +63 -44
  80. sky/clouds/gcp.py +339 -110
  81. sky/clouds/hyperbolic.py +293 -0
  82. sky/clouds/ibm.py +70 -49
  83. sky/clouds/kubernetes.py +570 -162
  84. sky/clouds/lambda_cloud.py +74 -54
  85. sky/clouds/nebius.py +210 -81
  86. sky/clouds/oci.py +88 -66
  87. sky/clouds/paperspace.py +61 -44
  88. sky/clouds/primeintellect.py +317 -0
  89. sky/clouds/runpod.py +164 -74
  90. sky/clouds/scp.py +89 -86
  91. sky/clouds/seeweb.py +477 -0
  92. sky/clouds/shadeform.py +400 -0
  93. sky/clouds/ssh.py +263 -0
  94. sky/clouds/utils/aws_utils.py +10 -4
  95. sky/clouds/utils/gcp_utils.py +87 -11
  96. sky/clouds/utils/oci_utils.py +38 -14
  97. sky/clouds/utils/scp_utils.py +231 -167
  98. sky/clouds/vast.py +99 -77
  99. sky/clouds/vsphere.py +51 -40
  100. sky/core.py +375 -173
  101. sky/dag.py +15 -0
  102. sky/dashboard/out/404.html +1 -1
  103. sky/dashboard/out/_next/static/96_E2yl3QAiIJGOYCkSpB/_buildManifest.js +1 -0
  104. sky/dashboard/out/_next/static/chunks/1141-e6aa9ab418717c59.js +11 -0
  105. sky/dashboard/out/_next/static/chunks/1272-1ef0bf0237faccdb.js +1 -0
  106. sky/dashboard/out/_next/static/chunks/1871-7e202677c42f43fe.js +6 -0
  107. sky/dashboard/out/_next/static/chunks/2260-7703229c33c5ebd5.js +1 -0
  108. sky/dashboard/out/_next/static/chunks/2350.fab69e61bac57b23.js +1 -0
  109. sky/dashboard/out/_next/static/chunks/2369.fc20f0c2c8ed9fe7.js +15 -0
  110. sky/dashboard/out/_next/static/chunks/2755.edd818326d489a1d.js +26 -0
  111. sky/dashboard/out/_next/static/chunks/3294.20a8540fe697d5ee.js +1 -0
  112. sky/dashboard/out/_next/static/chunks/3785.7e245f318f9d1121.js +1 -0
  113. sky/dashboard/out/_next/static/chunks/3800-7b45f9fbb6308557.js +1 -0
  114. sky/dashboard/out/_next/static/chunks/3850-ff4a9a69d978632b.js +1 -0
  115. sky/dashboard/out/_next/static/chunks/3937.210053269f121201.js +1 -0
  116. sky/dashboard/out/_next/static/chunks/4725.172ede95d1b21022.js +1 -0
  117. sky/dashboard/out/_next/static/chunks/4937.a2baa2df5572a276.js +15 -0
  118. sky/dashboard/out/_next/static/chunks/5739-d67458fcb1386c92.js +8 -0
  119. sky/dashboard/out/_next/static/chunks/6130-2be46d70a38f1e82.js +1 -0
  120. sky/dashboard/out/_next/static/chunks/616-3d59f75e2ccf9321.js +39 -0
  121. sky/dashboard/out/_next/static/chunks/6212-7bd06f60ba693125.js +13 -0
  122. sky/dashboard/out/_next/static/chunks/6856-8f27d1c10c98def8.js +1 -0
  123. sky/dashboard/out/_next/static/chunks/6989-01359c57e018caa4.js +1 -0
  124. sky/dashboard/out/_next/static/chunks/6990-9146207c4567fdfd.js +1 -0
  125. sky/dashboard/out/_next/static/chunks/7359-c8d04e06886000b3.js +30 -0
  126. sky/dashboard/out/_next/static/chunks/7411-b15471acd2cba716.js +41 -0
  127. sky/dashboard/out/_next/static/chunks/7615-019513abc55b3b47.js +1 -0
  128. sky/dashboard/out/_next/static/chunks/8640.5b9475a2d18c5416.js +16 -0
  129. sky/dashboard/out/_next/static/chunks/8969-452f9d5cbdd2dc73.js +1 -0
  130. sky/dashboard/out/_next/static/chunks/9025.fa408f3242e9028d.js +6 -0
  131. sky/dashboard/out/_next/static/chunks/9353-cff34f7e773b2e2b.js +1 -0
  132. sky/dashboard/out/_next/static/chunks/9360.a536cf6b1fa42355.js +31 -0
  133. sky/dashboard/out/_next/static/chunks/9847.3aaca6bb33455140.js +30 -0
  134. sky/dashboard/out/_next/static/chunks/fd9d1056-86323a29a8f7e46a.js +1 -0
  135. sky/dashboard/out/_next/static/chunks/framework-cf60a09ccd051a10.js +33 -0
  136. sky/dashboard/out/_next/static/chunks/main-app-587214043926b3cc.js +1 -0
  137. sky/dashboard/out/_next/static/chunks/main-f15ccb73239a3bf1.js +1 -0
  138. sky/dashboard/out/_next/static/chunks/pages/_app-bde01e4a2beec258.js +34 -0
  139. sky/dashboard/out/_next/static/chunks/pages/_error-c66a4e8afc46f17b.js +1 -0
  140. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-792db96d918c98c9.js +16 -0
  141. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-abfcac9c137aa543.js +1 -0
  142. sky/dashboard/out/_next/static/chunks/pages/clusters-ee39056f9851a3ff.js +1 -0
  143. sky/dashboard/out/_next/static/chunks/pages/config-dfb9bf07b13045f4.js +1 -0
  144. sky/dashboard/out/_next/static/chunks/pages/index-444f1804401f04ea.js +1 -0
  145. sky/dashboard/out/_next/static/chunks/pages/infra/[context]-c0b5935149902e6f.js +1 -0
  146. sky/dashboard/out/_next/static/chunks/pages/infra-aed0ea19df7cf961.js +1 -0
  147. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-d66997e2bfc837cf.js +16 -0
  148. sky/dashboard/out/_next/static/chunks/pages/jobs/pools/[pool]-9faf940b253e3e06.js +21 -0
  149. sky/dashboard/out/_next/static/chunks/pages/jobs-2072b48b617989c9.js +1 -0
  150. sky/dashboard/out/_next/static/chunks/pages/users-f42674164aa73423.js +1 -0
  151. sky/dashboard/out/_next/static/chunks/pages/volumes-b84b948ff357c43e.js +1 -0
  152. sky/dashboard/out/_next/static/chunks/pages/workspace/new-3f88a1c7e86a3f86.js +1 -0
  153. sky/dashboard/out/_next/static/chunks/pages/workspaces/[name]-84a40f8c7c627fe4.js +1 -0
  154. sky/dashboard/out/_next/static/chunks/pages/workspaces-531b2f8c4bf89f82.js +1 -0
  155. sky/dashboard/out/_next/static/chunks/webpack-64e05f17bf2cf8ce.js +1 -0
  156. sky/dashboard/out/_next/static/css/0748ce22df867032.css +3 -0
  157. sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
  158. sky/dashboard/out/clusters/[cluster].html +1 -1
  159. sky/dashboard/out/clusters.html +1 -1
  160. sky/dashboard/out/config.html +1 -0
  161. sky/dashboard/out/index.html +1 -1
  162. sky/dashboard/out/infra/[context].html +1 -0
  163. sky/dashboard/out/infra.html +1 -0
  164. sky/dashboard/out/jobs/[job].html +1 -1
  165. sky/dashboard/out/jobs/pools/[pool].html +1 -0
  166. sky/dashboard/out/jobs.html +1 -1
  167. sky/dashboard/out/users.html +1 -0
  168. sky/dashboard/out/volumes.html +1 -0
  169. sky/dashboard/out/workspace/new.html +1 -0
  170. sky/dashboard/out/workspaces/[name].html +1 -0
  171. sky/dashboard/out/workspaces.html +1 -0
  172. sky/data/data_utils.py +137 -1
  173. sky/data/mounting_utils.py +269 -84
  174. sky/data/storage.py +1460 -1807
  175. sky/data/storage_utils.py +43 -57
  176. sky/exceptions.py +126 -2
  177. sky/execution.py +216 -63
  178. sky/global_user_state.py +2390 -586
  179. sky/jobs/__init__.py +7 -0
  180. sky/jobs/client/sdk.py +300 -58
  181. sky/jobs/client/sdk_async.py +161 -0
  182. sky/jobs/constants.py +15 -8
  183. sky/jobs/controller.py +848 -275
  184. sky/jobs/file_content_utils.py +128 -0
  185. sky/jobs/log_gc.py +193 -0
  186. sky/jobs/recovery_strategy.py +402 -152
  187. sky/jobs/scheduler.py +314 -189
  188. sky/jobs/server/core.py +836 -255
  189. sky/jobs/server/server.py +156 -115
  190. sky/jobs/server/utils.py +136 -0
  191. sky/jobs/state.py +2109 -706
  192. sky/jobs/utils.py +1306 -215
  193. sky/logs/__init__.py +21 -0
  194. sky/logs/agent.py +108 -0
  195. sky/logs/aws.py +243 -0
  196. sky/logs/gcp.py +91 -0
  197. sky/metrics/__init__.py +0 -0
  198. sky/metrics/utils.py +453 -0
  199. sky/models.py +78 -1
  200. sky/optimizer.py +164 -70
  201. sky/provision/__init__.py +90 -4
  202. sky/provision/aws/config.py +147 -26
  203. sky/provision/aws/instance.py +136 -50
  204. sky/provision/azure/instance.py +11 -6
  205. sky/provision/common.py +13 -1
  206. sky/provision/cudo/cudo_machine_type.py +1 -1
  207. sky/provision/cudo/cudo_utils.py +14 -8
  208. sky/provision/cudo/cudo_wrapper.py +72 -71
  209. sky/provision/cudo/instance.py +10 -6
  210. sky/provision/do/instance.py +10 -6
  211. sky/provision/do/utils.py +4 -3
  212. sky/provision/docker_utils.py +140 -33
  213. sky/provision/fluidstack/instance.py +13 -8
  214. sky/provision/gcp/__init__.py +1 -0
  215. sky/provision/gcp/config.py +301 -19
  216. sky/provision/gcp/constants.py +218 -0
  217. sky/provision/gcp/instance.py +36 -8
  218. sky/provision/gcp/instance_utils.py +18 -4
  219. sky/provision/gcp/volume_utils.py +247 -0
  220. sky/provision/hyperbolic/__init__.py +12 -0
  221. sky/provision/hyperbolic/config.py +10 -0
  222. sky/provision/hyperbolic/instance.py +437 -0
  223. sky/provision/hyperbolic/utils.py +373 -0
  224. sky/provision/instance_setup.py +101 -20
  225. sky/provision/kubernetes/__init__.py +5 -0
  226. sky/provision/kubernetes/config.py +9 -52
  227. sky/provision/kubernetes/constants.py +17 -0
  228. sky/provision/kubernetes/instance.py +919 -280
  229. sky/provision/kubernetes/manifests/fusermount-server-daemonset.yaml +1 -2
  230. sky/provision/kubernetes/network.py +27 -17
  231. sky/provision/kubernetes/network_utils.py +44 -43
  232. sky/provision/kubernetes/utils.py +1221 -534
  233. sky/provision/kubernetes/volume.py +343 -0
  234. sky/provision/lambda_cloud/instance.py +22 -16
  235. sky/provision/nebius/constants.py +50 -0
  236. sky/provision/nebius/instance.py +19 -6
  237. sky/provision/nebius/utils.py +237 -137
  238. sky/provision/oci/instance.py +10 -5
  239. sky/provision/paperspace/instance.py +10 -7
  240. sky/provision/paperspace/utils.py +1 -1
  241. sky/provision/primeintellect/__init__.py +10 -0
  242. sky/provision/primeintellect/config.py +11 -0
  243. sky/provision/primeintellect/instance.py +454 -0
  244. sky/provision/primeintellect/utils.py +398 -0
  245. sky/provision/provisioner.py +117 -36
  246. sky/provision/runpod/__init__.py +5 -0
  247. sky/provision/runpod/instance.py +27 -6
  248. sky/provision/runpod/utils.py +51 -18
  249. sky/provision/runpod/volume.py +214 -0
  250. sky/provision/scp/__init__.py +15 -0
  251. sky/provision/scp/config.py +93 -0
  252. sky/provision/scp/instance.py +707 -0
  253. sky/provision/seeweb/__init__.py +11 -0
  254. sky/provision/seeweb/config.py +13 -0
  255. sky/provision/seeweb/instance.py +812 -0
  256. sky/provision/shadeform/__init__.py +11 -0
  257. sky/provision/shadeform/config.py +12 -0
  258. sky/provision/shadeform/instance.py +351 -0
  259. sky/provision/shadeform/shadeform_utils.py +83 -0
  260. sky/provision/ssh/__init__.py +18 -0
  261. sky/provision/vast/instance.py +13 -8
  262. sky/provision/vast/utils.py +10 -7
  263. sky/provision/volume.py +164 -0
  264. sky/provision/vsphere/common/ssl_helper.py +1 -1
  265. sky/provision/vsphere/common/vapiconnect.py +2 -1
  266. sky/provision/vsphere/common/vim_utils.py +4 -4
  267. sky/provision/vsphere/instance.py +15 -10
  268. sky/provision/vsphere/vsphere_utils.py +17 -20
  269. sky/py.typed +0 -0
  270. sky/resources.py +845 -119
  271. sky/schemas/__init__.py +0 -0
  272. sky/schemas/api/__init__.py +0 -0
  273. sky/schemas/api/responses.py +227 -0
  274. sky/schemas/db/README +4 -0
  275. sky/schemas/db/env.py +90 -0
  276. sky/schemas/db/global_user_state/001_initial_schema.py +124 -0
  277. sky/schemas/db/global_user_state/002_add_workspace_to_cluster_history.py +35 -0
  278. sky/schemas/db/global_user_state/003_fix_initial_revision.py +61 -0
  279. sky/schemas/db/global_user_state/004_is_managed.py +34 -0
  280. sky/schemas/db/global_user_state/005_cluster_event.py +32 -0
  281. sky/schemas/db/global_user_state/006_provision_log.py +41 -0
  282. sky/schemas/db/global_user_state/007_cluster_event_request_id.py +34 -0
  283. sky/schemas/db/global_user_state/008_skylet_ssh_tunnel_metadata.py +34 -0
  284. sky/schemas/db/global_user_state/009_last_activity_and_launched_at.py +89 -0
  285. sky/schemas/db/global_user_state/010_save_ssh_key.py +66 -0
  286. sky/schemas/db/global_user_state/011_is_ephemeral.py +34 -0
  287. sky/schemas/db/kv_cache/001_initial_schema.py +29 -0
  288. sky/schemas/db/script.py.mako +28 -0
  289. sky/schemas/db/serve_state/001_initial_schema.py +67 -0
  290. sky/schemas/db/serve_state/002_yaml_content.py +34 -0
  291. sky/schemas/db/skypilot_config/001_initial_schema.py +30 -0
  292. sky/schemas/db/spot_jobs/001_initial_schema.py +97 -0
  293. sky/schemas/db/spot_jobs/002_cluster_pool.py +42 -0
  294. sky/schemas/db/spot_jobs/003_pool_hash.py +34 -0
  295. sky/schemas/db/spot_jobs/004_job_file_contents.py +42 -0
  296. sky/schemas/db/spot_jobs/005_logs_gc.py +38 -0
  297. sky/schemas/db/spot_jobs/006_controller_pid_started_at.py +34 -0
  298. sky/schemas/db/spot_jobs/007_config_file_content.py +34 -0
  299. sky/schemas/generated/__init__.py +0 -0
  300. sky/schemas/generated/autostopv1_pb2.py +36 -0
  301. sky/schemas/generated/autostopv1_pb2.pyi +43 -0
  302. sky/schemas/generated/autostopv1_pb2_grpc.py +146 -0
  303. sky/schemas/generated/jobsv1_pb2.py +86 -0
  304. sky/schemas/generated/jobsv1_pb2.pyi +254 -0
  305. sky/schemas/generated/jobsv1_pb2_grpc.py +542 -0
  306. sky/schemas/generated/managed_jobsv1_pb2.py +76 -0
  307. sky/schemas/generated/managed_jobsv1_pb2.pyi +278 -0
  308. sky/schemas/generated/managed_jobsv1_pb2_grpc.py +278 -0
  309. sky/schemas/generated/servev1_pb2.py +58 -0
  310. sky/schemas/generated/servev1_pb2.pyi +115 -0
  311. sky/schemas/generated/servev1_pb2_grpc.py +322 -0
  312. sky/serve/autoscalers.py +357 -5
  313. sky/serve/client/impl.py +310 -0
  314. sky/serve/client/sdk.py +47 -139
  315. sky/serve/client/sdk_async.py +130 -0
  316. sky/serve/constants.py +12 -9
  317. sky/serve/controller.py +68 -17
  318. sky/serve/load_balancer.py +106 -60
  319. sky/serve/load_balancing_policies.py +116 -2
  320. sky/serve/replica_managers.py +434 -249
  321. sky/serve/serve_rpc_utils.py +179 -0
  322. sky/serve/serve_state.py +569 -257
  323. sky/serve/serve_utils.py +775 -265
  324. sky/serve/server/core.py +66 -711
  325. sky/serve/server/impl.py +1093 -0
  326. sky/serve/server/server.py +21 -18
  327. sky/serve/service.py +192 -89
  328. sky/serve/service_spec.py +144 -20
  329. sky/serve/spot_placer.py +3 -0
  330. sky/server/auth/__init__.py +0 -0
  331. sky/server/auth/authn.py +50 -0
  332. sky/server/auth/loopback.py +38 -0
  333. sky/server/auth/oauth2_proxy.py +202 -0
  334. sky/server/common.py +478 -182
  335. sky/server/config.py +85 -23
  336. sky/server/constants.py +44 -6
  337. sky/server/daemons.py +295 -0
  338. sky/server/html/token_page.html +185 -0
  339. sky/server/metrics.py +160 -0
  340. sky/server/middleware_utils.py +166 -0
  341. sky/server/requests/executor.py +558 -138
  342. sky/server/requests/payloads.py +364 -24
  343. sky/server/requests/preconditions.py +21 -17
  344. sky/server/requests/process.py +112 -29
  345. sky/server/requests/request_names.py +121 -0
  346. sky/server/requests/requests.py +822 -226
  347. sky/server/requests/serializers/decoders.py +82 -31
  348. sky/server/requests/serializers/encoders.py +140 -22
  349. sky/server/requests/threads.py +117 -0
  350. sky/server/rest.py +455 -0
  351. sky/server/server.py +1309 -285
  352. sky/server/state.py +20 -0
  353. sky/server/stream_utils.py +327 -61
  354. sky/server/uvicorn.py +217 -3
  355. sky/server/versions.py +270 -0
  356. sky/setup_files/MANIFEST.in +11 -1
  357. sky/setup_files/alembic.ini +160 -0
  358. sky/setup_files/dependencies.py +139 -31
  359. sky/setup_files/setup.py +44 -42
  360. sky/sky_logging.py +114 -7
  361. sky/skylet/attempt_skylet.py +106 -24
  362. sky/skylet/autostop_lib.py +129 -8
  363. sky/skylet/configs.py +29 -20
  364. sky/skylet/constants.py +216 -25
  365. sky/skylet/events.py +101 -21
  366. sky/skylet/job_lib.py +345 -164
  367. sky/skylet/log_lib.py +297 -18
  368. sky/skylet/log_lib.pyi +44 -1
  369. sky/skylet/providers/ibm/node_provider.py +12 -8
  370. sky/skylet/providers/ibm/vpc_provider.py +13 -12
  371. sky/skylet/ray_patches/__init__.py +17 -3
  372. sky/skylet/ray_patches/autoscaler.py.diff +18 -0
  373. sky/skylet/ray_patches/cli.py.diff +19 -0
  374. sky/skylet/ray_patches/command_runner.py.diff +17 -0
  375. sky/skylet/ray_patches/log_monitor.py.diff +20 -0
  376. sky/skylet/ray_patches/resource_demand_scheduler.py.diff +32 -0
  377. sky/skylet/ray_patches/updater.py.diff +18 -0
  378. sky/skylet/ray_patches/worker.py.diff +41 -0
  379. sky/skylet/runtime_utils.py +21 -0
  380. sky/skylet/services.py +568 -0
  381. sky/skylet/skylet.py +72 -4
  382. sky/skylet/subprocess_daemon.py +104 -29
  383. sky/skypilot_config.py +506 -99
  384. sky/ssh_node_pools/__init__.py +1 -0
  385. sky/ssh_node_pools/core.py +135 -0
  386. sky/ssh_node_pools/server.py +233 -0
  387. sky/task.py +685 -163
  388. sky/templates/aws-ray.yml.j2 +11 -3
  389. sky/templates/azure-ray.yml.j2 +2 -1
  390. sky/templates/cudo-ray.yml.j2 +1 -0
  391. sky/templates/do-ray.yml.j2 +2 -1
  392. sky/templates/fluidstack-ray.yml.j2 +1 -0
  393. sky/templates/gcp-ray.yml.j2 +62 -1
  394. sky/templates/hyperbolic-ray.yml.j2 +68 -0
  395. sky/templates/ibm-ray.yml.j2 +2 -1
  396. sky/templates/jobs-controller.yaml.j2 +27 -24
  397. sky/templates/kubernetes-loadbalancer.yml.j2 +2 -0
  398. sky/templates/kubernetes-ray.yml.j2 +611 -50
  399. sky/templates/lambda-ray.yml.j2 +2 -1
  400. sky/templates/nebius-ray.yml.j2 +34 -12
  401. sky/templates/oci-ray.yml.j2 +1 -0
  402. sky/templates/paperspace-ray.yml.j2 +2 -1
  403. sky/templates/primeintellect-ray.yml.j2 +72 -0
  404. sky/templates/runpod-ray.yml.j2 +10 -1
  405. sky/templates/scp-ray.yml.j2 +4 -50
  406. sky/templates/seeweb-ray.yml.j2 +171 -0
  407. sky/templates/shadeform-ray.yml.j2 +73 -0
  408. sky/templates/sky-serve-controller.yaml.j2 +22 -2
  409. sky/templates/vast-ray.yml.j2 +1 -0
  410. sky/templates/vsphere-ray.yml.j2 +1 -0
  411. sky/templates/websocket_proxy.py +212 -37
  412. sky/usage/usage_lib.py +31 -15
  413. sky/users/__init__.py +0 -0
  414. sky/users/model.conf +15 -0
  415. sky/users/permission.py +397 -0
  416. sky/users/rbac.py +121 -0
  417. sky/users/server.py +720 -0
  418. sky/users/token_service.py +218 -0
  419. sky/utils/accelerator_registry.py +35 -5
  420. sky/utils/admin_policy_utils.py +84 -38
  421. sky/utils/annotations.py +38 -5
  422. sky/utils/asyncio_utils.py +78 -0
  423. sky/utils/atomic.py +1 -1
  424. sky/utils/auth_utils.py +153 -0
  425. sky/utils/benchmark_utils.py +60 -0
  426. sky/utils/cli_utils/status_utils.py +159 -86
  427. sky/utils/cluster_utils.py +31 -9
  428. sky/utils/command_runner.py +354 -68
  429. sky/utils/command_runner.pyi +93 -3
  430. sky/utils/common.py +35 -8
  431. sky/utils/common_utils.py +314 -91
  432. sky/utils/config_utils.py +74 -5
  433. sky/utils/context.py +403 -0
  434. sky/utils/context_utils.py +242 -0
  435. sky/utils/controller_utils.py +383 -89
  436. sky/utils/dag_utils.py +31 -12
  437. sky/utils/db/__init__.py +0 -0
  438. sky/utils/db/db_utils.py +485 -0
  439. sky/utils/db/kv_cache.py +149 -0
  440. sky/utils/db/migration_utils.py +137 -0
  441. sky/utils/directory_utils.py +12 -0
  442. sky/utils/env_options.py +13 -0
  443. sky/utils/git.py +567 -0
  444. sky/utils/git_clone.sh +460 -0
  445. sky/utils/infra_utils.py +195 -0
  446. sky/utils/kubernetes/cleanup-tunnel.sh +62 -0
  447. sky/utils/kubernetes/config_map_utils.py +133 -0
  448. sky/utils/kubernetes/create_cluster.sh +15 -29
  449. sky/utils/kubernetes/delete_cluster.sh +10 -7
  450. sky/utils/kubernetes/deploy_ssh_node_pools.py +1177 -0
  451. sky/utils/kubernetes/exec_kubeconfig_converter.py +22 -31
  452. sky/utils/kubernetes/generate_kind_config.py +6 -66
  453. sky/utils/kubernetes/generate_kubeconfig.sh +4 -1
  454. sky/utils/kubernetes/gpu_labeler.py +18 -8
  455. sky/utils/kubernetes/k8s_gpu_labeler_job.yaml +2 -1
  456. sky/utils/kubernetes/k8s_gpu_labeler_setup.yaml +16 -16
  457. sky/utils/kubernetes/kubernetes_deploy_utils.py +284 -114
  458. sky/utils/kubernetes/rsync_helper.sh +11 -3
  459. sky/utils/kubernetes/ssh-tunnel.sh +379 -0
  460. sky/utils/kubernetes/ssh_utils.py +221 -0
  461. sky/utils/kubernetes_enums.py +8 -15
  462. sky/utils/lock_events.py +94 -0
  463. sky/utils/locks.py +416 -0
  464. sky/utils/log_utils.py +82 -107
  465. sky/utils/perf_utils.py +22 -0
  466. sky/utils/resource_checker.py +298 -0
  467. sky/utils/resources_utils.py +249 -32
  468. sky/utils/rich_utils.py +217 -39
  469. sky/utils/schemas.py +955 -160
  470. sky/utils/serialize_utils.py +16 -0
  471. sky/utils/status_lib.py +10 -0
  472. sky/utils/subprocess_utils.py +29 -15
  473. sky/utils/tempstore.py +70 -0
  474. sky/utils/thread_utils.py +91 -0
  475. sky/utils/timeline.py +26 -53
  476. sky/utils/ux_utils.py +84 -15
  477. sky/utils/validator.py +11 -1
  478. sky/utils/volume.py +165 -0
  479. sky/utils/yaml_utils.py +111 -0
  480. sky/volumes/__init__.py +13 -0
  481. sky/volumes/client/__init__.py +0 -0
  482. sky/volumes/client/sdk.py +150 -0
  483. sky/volumes/server/__init__.py +0 -0
  484. sky/volumes/server/core.py +270 -0
  485. sky/volumes/server/server.py +124 -0
  486. sky/volumes/volume.py +215 -0
  487. sky/workspaces/__init__.py +0 -0
  488. sky/workspaces/core.py +655 -0
  489. sky/workspaces/server.py +101 -0
  490. sky/workspaces/utils.py +56 -0
  491. sky_templates/README.md +3 -0
  492. sky_templates/__init__.py +3 -0
  493. sky_templates/ray/__init__.py +0 -0
  494. sky_templates/ray/start_cluster +183 -0
  495. sky_templates/ray/stop_cluster +75 -0
  496. skypilot_nightly-1.0.0.dev20251203.dist-info/METADATA +676 -0
  497. skypilot_nightly-1.0.0.dev20251203.dist-info/RECORD +611 -0
  498. {skypilot_nightly-1.0.0.dev20250502.dist-info → skypilot_nightly-1.0.0.dev20251203.dist-info}/WHEEL +1 -1
  499. skypilot_nightly-1.0.0.dev20251203.dist-info/top_level.txt +2 -0
  500. sky/benchmark/benchmark_state.py +0 -256
  501. sky/benchmark/benchmark_utils.py +0 -641
  502. sky/clouds/service_catalog/constants.py +0 -7
  503. sky/dashboard/out/_next/static/GWvVBSCS7FmUiVmjaL1a7/_buildManifest.js +0 -1
  504. sky/dashboard/out/_next/static/chunks/236-2db3ee3fba33dd9e.js +0 -6
  505. sky/dashboard/out/_next/static/chunks/312-c3c8845990db8ffc.js +0 -15
  506. sky/dashboard/out/_next/static/chunks/37-0a572fe0dbb89c4d.js +0 -6
  507. sky/dashboard/out/_next/static/chunks/678-206dddca808e6d16.js +0 -59
  508. sky/dashboard/out/_next/static/chunks/845-9e60713e0c441abc.js +0 -1
  509. sky/dashboard/out/_next/static/chunks/979-7bf73a4c7cea0f5c.js +0 -1
  510. sky/dashboard/out/_next/static/chunks/fd9d1056-2821b0f0cabcd8bd.js +0 -1
  511. sky/dashboard/out/_next/static/chunks/framework-87d061ee6ed71b28.js +0 -33
  512. sky/dashboard/out/_next/static/chunks/main-app-241eb28595532291.js +0 -1
  513. sky/dashboard/out/_next/static/chunks/main-e0e2335212e72357.js +0 -1
  514. sky/dashboard/out/_next/static/chunks/pages/_app-e6b013bc3f77ad60.js +0 -1
  515. sky/dashboard/out/_next/static/chunks/pages/_error-1be831200e60c5c0.js +0 -1
  516. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-6ac338bc2239cb45.js +0 -1
  517. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-f383db7389368ea7.js +0 -1
  518. sky/dashboard/out/_next/static/chunks/pages/clusters-a93b93e10b8b074e.js +0 -1
  519. sky/dashboard/out/_next/static/chunks/pages/index-f9f039532ca8cbc4.js +0 -1
  520. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-1c519e1afc523dc9.js +0 -1
  521. sky/dashboard/out/_next/static/chunks/pages/jobs-a75029b67aab6a2e.js +0 -1
  522. sky/dashboard/out/_next/static/chunks/webpack-830f59b8404e96b8.js +0 -1
  523. sky/dashboard/out/_next/static/css/c6933bbb2ce7f4dd.css +0 -3
  524. sky/jobs/dashboard/dashboard.py +0 -223
  525. sky/jobs/dashboard/static/favicon.ico +0 -0
  526. sky/jobs/dashboard/templates/index.html +0 -831
  527. sky/jobs/server/dashboard_utils.py +0 -69
  528. sky/skylet/providers/scp/__init__.py +0 -2
  529. sky/skylet/providers/scp/config.py +0 -149
  530. sky/skylet/providers/scp/node_provider.py +0 -578
  531. sky/templates/kubernetes-ssh-jump.yml.j2 +0 -94
  532. sky/utils/db_utils.py +0 -100
  533. sky/utils/kubernetes/deploy_remote_cluster.sh +0 -308
  534. sky/utils/kubernetes/ssh_jump_lifecycle_manager.py +0 -191
  535. skypilot_nightly-1.0.0.dev20250502.dist-info/METADATA +0 -361
  536. skypilot_nightly-1.0.0.dev20250502.dist-info/RECORD +0 -396
  537. skypilot_nightly-1.0.0.dev20250502.dist-info/top_level.txt +0 -1
  538. /sky/{clouds/service_catalog → catalog}/config.py +0 -0
  539. /sky/{benchmark → catalog/data_fetchers}/__init__.py +0 -0
  540. /sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_azure.py +0 -0
  541. /sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_fluidstack.py +0 -0
  542. /sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_ibm.py +0 -0
  543. /sky/{clouds/service_catalog/data_fetchers → client/cli}/__init__.py +0 -0
  544. /sky/dashboard/out/_next/static/{GWvVBSCS7FmUiVmjaL1a7 → 96_E2yl3QAiIJGOYCkSpB}/_ssgManifest.js +0 -0
  545. {skypilot_nightly-1.0.0.dev20250502.dist-info → skypilot_nightly-1.0.0.dev20251203.dist-info}/entry_points.txt +0 -0
  546. {skypilot_nightly-1.0.0.dev20250502.dist-info → skypilot_nightly-1.0.0.dev20251203.dist-info}/licenses/LICENSE +0 -0
@@ -1,10 +1,12 @@
1
1
  """Kubernetes adaptors"""
2
+ import functools
2
3
  import logging
3
4
  import os
5
+ import platform
4
6
  from typing import Any, Callable, Optional, Set
5
7
 
8
+ from sky import sky_logging
6
9
  from sky.adaptors import common
7
- from sky.sky_logging import set_logging_level
8
10
  from sky.utils import annotations
9
11
  from sky.utils import common_utils
10
12
  from sky.utils import ux_utils
@@ -13,12 +15,23 @@ _IMPORT_ERROR_MESSAGE = ('Failed to import dependencies for Kubernetes. '
13
15
  'Try running: pip install "skypilot[kubernetes]"')
14
16
  kubernetes = common.LazyImport('kubernetes',
15
17
  import_error_message=_IMPORT_ERROR_MESSAGE)
18
+ models = common.LazyImport('kubernetes.client.models',
19
+ import_error_message=_IMPORT_ERROR_MESSAGE)
16
20
  urllib3 = common.LazyImport('urllib3',
17
21
  import_error_message=_IMPORT_ERROR_MESSAGE)
22
+ dateutil_parser = common.LazyImport('dateutil.parser',
23
+ import_error_message=_IMPORT_ERROR_MESSAGE)
18
24
 
19
25
  # Timeout to use for API calls
20
26
  API_TIMEOUT = 5
21
27
 
28
+ # Check if KUBECONFIG is set, and use it if it is.
29
+ DEFAULT_KUBECONFIG_PATH = '~/.kube/config'
30
+ # From kubernetes package, keep a copy here to avoid actually importing
31
+ # kubernetes package when parsing the KUBECONFIG env var to do credential
32
+ # file mounts.
33
+ ENV_KUBECONFIG_PATH_SEPARATOR = ';' if platform.system() == 'Windows' else ':'
34
+
22
35
  DEFAULT_IN_CLUSTER_REGION = 'in-cluster'
23
36
  # The name for the environment variable that stores the in-cluster context name
24
37
  # for Kubernetes clusters. This is used to associate a name with the current
@@ -26,6 +39,8 @@ DEFAULT_IN_CLUSTER_REGION = 'in-cluster'
26
39
  # set to DEFAULT_IN_CLUSTER_REGION.
27
40
  IN_CLUSTER_CONTEXT_NAME_ENV_VAR = 'SKYPILOT_IN_CLUSTER_CONTEXT_NAME'
28
41
 
42
+ logger = sky_logging.init_logger(__name__)
43
+
29
44
 
30
45
  def _decorate_methods(obj: Any, decorator: Callable, decoration_type: str):
31
46
  for attr_name in dir(obj):
@@ -43,7 +58,7 @@ def _decorate_methods(obj: Any, decorator: Callable, decoration_type: str):
43
58
  return obj
44
59
 
45
60
 
46
- def _api_logging_decorator(logger: str, level: int):
61
+ def _api_logging_decorator(logger_src: str, level: int):
47
62
  """Decorator to set logging level for API calls.
48
63
 
49
64
  This is used to suppress the verbose logging from urllib3 when calls to the
@@ -54,7 +69,9 @@ def _api_logging_decorator(logger: str, level: int):
54
69
 
55
70
  def wrapped(*args, **kwargs):
56
71
  obj = api(*args, **kwargs)
57
- _decorate_methods(obj, set_logging_level(logger, level), 'api_log')
72
+ _decorate_methods(obj,
73
+ sky_logging.set_logging_level(logger_src, level),
74
+ 'api_log')
58
75
  return obj
59
76
 
60
77
  return wrapped
@@ -62,31 +79,61 @@ def _api_logging_decorator(logger: str, level: int):
62
79
  return decorated_api
63
80
 
64
81
 
82
def _get_config_file() -> str:
    """Return the kubeconfig path(s) to load, read fresh from the environment.

    The kubernetes package reads the KUBECONFIG env var once, when the
    package is initialized. The env var is therefore re-read here on every
    call so that later changes to KUBECONFIG are picked up.

    Returns:
        The value of KUBECONFIG when it is set and non-empty, otherwise
        '~/.kube/config'.
    """
    # An empty KUBECONFIG is treated like an unset one; forwarding '' as the
    # config file would leave the loader with no file to read.
    return os.environ.get('KUBECONFIG') or '~/.kube/config'
87
+
88
+
65
89
  def _load_config(context: Optional[str] = None):
66
90
  urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
67
91
 
68
92
  def _load_config_from_kubeconfig(context: Optional[str] = None):
69
93
  try:
70
- kubernetes.config.load_kube_config(context=context)
94
+ kubernetes.config.load_kube_config(config_file=_get_config_file(),
95
+ context=context)
71
96
  except kubernetes.config.config_exception.ConfigException as e:
72
97
  suffix = common_utils.format_exception(e, use_bracket=True)
73
98
  context_name = '(current-context)' if context is None else context
99
+ is_ssh_node_pool = False
100
+ if context_name.startswith('ssh-'):
101
+ context_name = common_utils.removeprefix(context_name, 'ssh-')
102
+ is_ssh_node_pool = True
74
103
  # Check if exception was due to no current-context
75
104
  if 'Expected key current-context' in str(e):
76
- err_str = ('Failed to load Kubernetes configuration for '
77
- f'{context_name!r}. '
78
- 'Kubeconfig does not contain any valid context(s).'
79
- f'\n{suffix}\n'
80
- ' If you were running a local Kubernetes '
81
- 'cluster, run `sky local up` to start the cluster.')
105
+ if is_ssh_node_pool:
106
+ context_name = common_utils.removeprefix(
107
+ context_name, 'ssh-')
108
+ err_str = ('Failed to load SSH Node Pool configuration for '
109
+ f'{context_name!r}.\n'
110
+ ' Run `sky ssh up --infra {context_name}` to '
111
+ 'set up or repair the cluster.')
112
+ else:
113
+ err_str = (
114
+ 'Failed to load Kubernetes configuration for '
115
+ f'{context_name!r}. '
116
+ 'Kubeconfig does not contain any valid context(s).'
117
+ f'\n{suffix}\n'
118
+ ' If you were running a local Kubernetes '
119
+ 'cluster, run `sky local up` to start the cluster.')
82
120
  else:
83
121
  kubeconfig_path = os.environ.get('KUBECONFIG', '~/.kube/config')
84
- err_str = (
85
- f'Failed to load Kubernetes configuration for '
86
- f'{context_name!r}. Please check if your kubeconfig file '
87
- f'exists at {kubeconfig_path} and is valid.\n{suffix}')
88
- err_str += '\nTo disable Kubernetes for SkyPilot: run `sky check`.'
89
- if context is None: # kubernetes defaults to current-context.
122
+ if is_ssh_node_pool:
123
+ err_str = (
124
+ f'Failed to load SSH Node Pool configuration for '
125
+ f'{context_name!r}. Run `sky ssh up --infra '
126
+ f'{context_name}` to set up or repair the cluster.')
127
+ else:
128
+ err_str = (
129
+ 'Failed to load Kubernetes configuration for '
130
+ f'{context_name!r}. Please check if your kubeconfig '
131
+ f'file exists at {kubeconfig_path} and is valid.'
132
+ f'\n{suffix}\n')
133
+ if is_ssh_node_pool:
134
+ err_str += (f'\nTo disable SSH Node Pool {context_name!r}: '
135
+ 'run `sky check`.')
136
+ else:
90
137
  err_str += (
91
138
  '\nHint: Kubernetes attempted to query the current-context '
92
139
  'set in kubeconfig. Check if the current-context is valid.')
@@ -100,8 +147,11 @@ def _load_config(context: Optional[str] = None):
100
147
  # show up in SkyPilot tasks. For now, we work around by using
101
148
  # DNS name instead of environment variables.
102
149
  # See issue: https://github.com/skypilot-org/skypilot/issues/2287
103
- os.environ['KUBERNETES_SERVICE_HOST'] = 'kubernetes.default.svc'
104
- os.environ['KUBERNETES_SERVICE_PORT'] = '443'
150
+ # Only set if not already present (preserving existing values)
151
+ if 'KUBERNETES_SERVICE_HOST' not in os.environ:
152
+ os.environ['KUBERNETES_SERVICE_HOST'] = 'kubernetes.default.svc'
153
+ if 'KUBERNETES_SERVICE_PORT' not in os.environ:
154
+ os.environ['KUBERNETES_SERVICE_PORT'] = '443'
105
155
  kubernetes.config.load_incluster_config()
106
156
  except kubernetes.config.config_exception.ConfigException:
107
157
  _load_config_from_kubeconfig()
@@ -109,8 +159,65 @@ def _load_config(context: Optional[str] = None):
109
159
  _load_config_from_kubeconfig(context)
110
160
 
111
161
 
162
def list_kube_config_contexts():
    """List the contexts of the kubeconfig reported by _get_config_file().

    Returns whatever kubernetes.config.list_kube_config_contexts() returns
    for that config file.
    """
    kubeconfig = _get_config_file()
    return kubernetes.config.list_kube_config_contexts(kubeconfig)
164
+
165
+
166
class ClientWrapper:
    """Wrapper around the kubernetes API clients.

    This is needed because we cache kubernetes.client.ApiClient and other typed
    clients (e.g. kubernetes.client.CoreV1Api) and lru_cache.cache_clear() does
    not call close() on the client to cleanup external resources like
    semaphores. This class wraps the client and defines __del__ to ensure the
    external state of kubernetes clients is properly cleaned up on GC.

    Any attribute that is not found on the wrapper itself is looked up on the
    wrapped client.
    """

    def __init__(self, client):
        self._client = client

    def __getattr__(self, name):
        """Delegate to the underlying client.

        Raises:
            AttributeError: if the wrapper holds no client yet (the instance
                was created without running __init__), or if the client
                does not have the attribute.
        """
        # __getattr__ only runs after normal lookup failed. Reading
        # self._client here while '_client' itself is missing would re-enter
        # __getattr__ without end, so go through the instance dict instead.
        try:
            client = self.__dict__['_client']
        except KeyError:
            raise AttributeError(name) from None
        return getattr(client, name)

    def __del__(self):
        """Clean up the underlying client"""
        if '_client' not in self.__dict__:
            # __init__ never ran to completion; there is nothing to close.
            return
        try:
            real_client = None
            if isinstance(self._client, kubernetes.client.ApiClient):
                real_client = self._client
            elif isinstance(self._client, kubernetes.watch.Watch):
                real_client = getattr(self._client, '_api_client', None)
            else:
                # Otherwise, the client is a typed client, the typed client
                # is generated by codegen and all of them should have an
                # 'api_client' attribute referring to the real client.
                real_client = getattr(self._client, 'api_client', None)
            if real_client is not None:
                real_client.close()
            else:
                # logger may already be cleaned up during __del__ at shutdown
                if logger is not None:
                    logger.debug(f'No client found for {self._client}')
        except Exception as e:  # pylint: disable=broad-except
            if logger is not None:
                logger.debug(f'Error closing Kubernetes client: {e}')
205
+
206
+
207
def wrap_kubernetes_client(func):
    """Decorate a client factory so its result is returned in a ClientWrapper.

    Args:
        func: callable that builds a kubernetes client object.

    Returns:
        A callable with func's metadata (via functools.wraps) that forwards
        all arguments to func and wraps the returned object.
    """

    @functools.wraps(func)
    def _wrapped_factory(*args, **kwargs):
        return ClientWrapper(func(*args, **kwargs))

    return _wrapped_factory
216
+
217
+
112
218
  @_api_logging_decorator('urllib3', logging.ERROR)
113
219
  @annotations.lru_cache(scope='request')
220
+ @wrap_kubernetes_client
114
221
  def core_api(context: Optional[str] = None):
115
222
  _load_config(context)
116
223
  return kubernetes.client.CoreV1Api()
@@ -118,6 +225,15 @@ def core_api(context: Optional[str] = None):
118
225
 
119
226
  @_api_logging_decorator('urllib3', logging.ERROR)
120
227
  @annotations.lru_cache(scope='request')
228
+ @wrap_kubernetes_client
229
+ def storage_api(context: Optional[str] = None):
230
+ _load_config(context)
231
+ return kubernetes.client.StorageV1Api()
232
+
233
+
234
@_api_logging_decorator('urllib3', logging.ERROR)
@annotations.lru_cache(scope='request')
@wrap_kubernetes_client
def auth_api(context: Optional[str] = None):
    """Load the config for ``context`` and build an RbacAuthorizationV1Api."""
    _load_config(context)
    client = kubernetes.client.RbacAuthorizationV1Api()
    return client
@@ -125,6 +241,7 @@ def auth_api(context: Optional[str] = None):
125
241
 
126
242
  @_api_logging_decorator('urllib3', logging.ERROR)
127
243
  @annotations.lru_cache(scope='request')
244
+ @wrap_kubernetes_client
128
245
  def networking_api(context: Optional[str] = None):
129
246
  _load_config(context)
130
247
  return kubernetes.client.NetworkingV1Api()
@@ -132,6 +249,7 @@ def networking_api(context: Optional[str] = None):
132
249
 
133
250
  @_api_logging_decorator('urllib3', logging.ERROR)
134
251
  @annotations.lru_cache(scope='request')
252
+ @wrap_kubernetes_client
135
253
  def custom_objects_api(context: Optional[str] = None):
136
254
  _load_config(context)
137
255
  return kubernetes.client.CustomObjectsApi()
@@ -139,6 +257,7 @@ def custom_objects_api(context: Optional[str] = None):
139
257
 
140
258
  @_api_logging_decorator('urllib3', logging.ERROR)
141
259
  @annotations.lru_cache(scope='global')
260
+ @wrap_kubernetes_client
142
261
  def node_api(context: Optional[str] = None):
143
262
  _load_config(context)
144
263
  return kubernetes.client.NodeV1Api()
@@ -146,6 +265,7 @@ def node_api(context: Optional[str] = None):
146
265
 
147
266
  @_api_logging_decorator('urllib3', logging.ERROR)
148
267
  @annotations.lru_cache(scope='request')
268
+ @wrap_kubernetes_client
149
269
  def apps_api(context: Optional[str] = None):
150
270
  _load_config(context)
151
271
  return kubernetes.client.AppsV1Api()
@@ -153,6 +273,7 @@ def apps_api(context: Optional[str] = None):
153
273
 
154
274
  @_api_logging_decorator('urllib3', logging.ERROR)
155
275
  @annotations.lru_cache(scope='request')
276
+ @wrap_kubernetes_client
156
277
  def batch_api(context: Optional[str] = None):
157
278
  _load_config(context)
158
279
  return kubernetes.client.BatchV1Api()
@@ -160,6 +281,7 @@ def batch_api(context: Optional[str] = None):
160
281
 
161
282
  @_api_logging_decorator('urllib3', logging.ERROR)
162
283
  @annotations.lru_cache(scope='request')
284
+ @wrap_kubernetes_client
163
285
  def api_client(context: Optional[str] = None):
164
286
  _load_config(context)
165
287
  return kubernetes.client.ApiClient()
@@ -167,6 +289,15 @@ def api_client(context: Optional[str] = None):
167
289
 
168
290
  @_api_logging_decorator('urllib3', logging.ERROR)
169
291
  @annotations.lru_cache(scope='request')
292
+ @wrap_kubernetes_client
293
+ def custom_resources_api(context: Optional[str] = None):
294
+ _load_config(context)
295
+ return kubernetes.client.CustomObjectsApi()
296
+
297
+
298
@_api_logging_decorator('urllib3', logging.ERROR)
@annotations.lru_cache(scope='request')
@wrap_kubernetes_client
def watch(context: Optional[str] = None):
    """Load the config for ``context`` and build a kubernetes Watch object."""
    _load_config(context)
    watcher = kubernetes.watch.Watch()
    return watcher
sky/adaptors/nebius.py CHANGED
@@ -1,19 +1,106 @@
1
1
  """Nebius cloud adaptor."""
2
+ import asyncio
2
3
  import os
3
4
  import threading
5
+ from typing import Any, Awaitable, List, Optional
4
6
 
7
+ from sky import sky_logging
8
+ from sky import skypilot_config
5
9
  from sky.adaptors import common
6
10
  from sky.utils import annotations
7
11
  from sky.utils import ux_utils
8
12
 
9
- NEBIUS_TENANT_ID_FILENAME = 'NEBIUS_TENANT_ID.txt'
10
- NEBIUS_IAM_TOKEN_FILENAME = 'NEBIUS_IAM_TOKEN.txt'
11
- NEBIUS_PROJECT_ID_FILENAME = 'NEBIUS_PROJECT_ID.txt'
12
- NEBIUS_CREDENTIALS_FILENAME = 'credentials.json'
13
- NEBIUS_TENANT_ID_PATH = '~/.nebius/' + NEBIUS_TENANT_ID_FILENAME
14
- NEBIUS_IAM_TOKEN_PATH = '~/.nebius/' + NEBIUS_IAM_TOKEN_FILENAME
15
- NEBIUS_PROJECT_ID_PATH = '~/.nebius/' + NEBIUS_PROJECT_ID_FILENAME
16
- NEBIUS_CREDENTIALS_PATH = '~/.nebius/' + NEBIUS_CREDENTIALS_FILENAME
13
+ # Default read timeout for nebius SDK
14
+ READ_TIMEOUT = 10
15
+
16
+ logger = sky_logging.init_logger(__name__)
17
+
18
+ _loop_lock = threading.Lock()
19
+ _loop = None
20
+
21
+
22
def _get_event_loop() -> asyncio.AbstractEventLoop:
    """Return the shared background event loop used for nebius SDK calls.

    The loop is created on first use and run with run_forever() on a daemon
    thread. Creation happens under _loop_lock, so only one loop is made.
    """
    global _loop

    if _loop is None:
        with _loop_lock:
            # Re-check under the lock: another thread may have created the
            # loop while this one was waiting.
            if _loop is None:
                _loop = asyncio.new_event_loop()
                runner = threading.Thread(target=_loop.run_forever,
                                          daemon=True)
                runner.start()

    return _loop
36
+
37
+
38
def sync_call(awaitable: Awaitable[Any]) -> Any:
    """Run ``awaitable`` to completion and return its result, blocking.

    This wrapper is used to work around:
    https://github.com/nebius/pysdk/issues/76

    The awaitable is scheduled on the dedicated background event loop to
    avoid conflicts with existing asyncio contexts and prevent
    BlockingIOError. The calling thread waits on the resulting future.
    """
    background_loop = _get_event_loop()
    wrapped = _coro(awaitable)
    pending = asyncio.run_coroutine_threadsafe(wrapped, background_loop)
    return pending.result()
50
+
51
+
52
async def _coro(awaitable: Awaitable[Any]) -> Any:
    """Await ``awaitable`` inside a coroutine and hand back its result."""
    outcome = await awaitable
    return outcome
55
+
56
+
57
def tenant_id_path() -> str:
    """Return the (unexpanded) path of the Nebius tenant ID file."""
    location = '~/.nebius/NEBIUS_TENANT_ID.txt'
    return location
59
+
60
+
61
def iam_token_path() -> str:
    """Return the (unexpanded) path of the Nebius IAM token file."""
    location = '~/.nebius/NEBIUS_IAM_TOKEN.txt'
    return location
63
+
64
+
65
def domain_path() -> str:
    """Return the (unexpanded) path of the Nebius domain file."""
    location = '~/.nebius/NEBIUS_DOMAIN.txt'
    return location
67
+
68
+
69
def credentials_path() -> str:
    """Return the credentials file path to use.

    The 'credentials_file_path' from the workspace-level nebius config wins
    when it is set; otherwise the default credentials path is returned.
    """
    configured = _get_workspace_credentials_path()
    if configured is None:
        return _get_default_credentials_path()
    return configured
75
+
76
+
77
def _get_workspace_credentials_path() -> Optional[str]:
    """Return 'credentials_file_path' from the workspace nebius config.

    Returns None when the key is not present in that config.
    """
    nebius_workspace_config = skypilot_config.get_workspace_cloud('nebius')
    return nebius_workspace_config.get('credentials_file_path', None)
82
+
83
+
84
def _get_default_credentials_path() -> str:
    """Return the (unexpanded) default Nebius credentials file path."""
    default_location = '~/.nebius/credentials.json'
    return default_location
87
+
88
+
89
def api_domain() -> Optional[str]:
    """Resolve the Nebius API domain, or None when no source provides one.

    Sources are consulted in order and the first non-None value wins:
        1. 'domain' in the workspace-level nebius config
        2. 'domain' from skypilot_config.get_effective_region_config()
        3. the stripped contents of the file at domain_path()
    """
    workspace_domain = skypilot_config.get_workspace_cloud('nebius').get(
        'domain', None)
    if workspace_domain is not None:
        return workspace_domain

    configured_domain = skypilot_config.get_effective_region_config(
        cloud='nebius', region=None, keys=('domain',), default_value=None)
    if configured_domain is not None:
        return configured_domain

    domain_file = os.path.expanduser(domain_path())
    try:
        with open(domain_file, encoding='utf-8') as handle:
            contents = handle.read()
    except FileNotFoundError:
        return None
    return contents.strip()
103
+
17
104
 
18
105
  DEFAULT_REGION = 'eu-north1'
19
106
 
@@ -49,7 +136,9 @@ SKY_CHECK_NAME = 'Nebius (for Nebius Object Storae)'
49
136
 
50
137
 
51
138
  def request_error():
52
- return nebius.aio.service_error.RequestError
139
+ # pylint: disable=import-outside-toplevel
140
+ from nebius.aio import service_error
141
+ return service_error.RequestError
53
142
 
54
143
 
55
144
  def compute():
@@ -64,6 +153,12 @@ def iam():
64
153
  return iam_v1
65
154
 
66
155
 
156
def billing():
    """Return the nebius billing v1alpha1 API module (imported on call)."""
    # pylint: disable=import-outside-toplevel
    from nebius.api.nebius.billing import v1alpha1
    return v1alpha1
160
+
161
+
67
162
  def nebius_common():
68
163
  # pylint: disable=import-outside-toplevel
69
164
  from nebius.api.nebius.common import v1 as common_v1
@@ -76,49 +171,79 @@ def vpc():
76
171
  return vpc_v1
77
172
 
78
173
 
79
- @annotations.lru_cache(scope='request')
80
174
  def get_iam_token():
81
175
  try:
82
- with open(os.path.expanduser(NEBIUS_IAM_TOKEN_PATH),
176
+ with open(os.path.expanduser(iam_token_path()),
83
177
  encoding='utf-8') as file:
84
178
  return file.read().strip()
85
179
  except FileNotFoundError:
86
180
  return None
87
181
 
88
182
 
89
- @annotations.lru_cache(scope='request')
90
183
  def is_token_or_cred_file_exist():
91
- return (os.path.exists(os.path.expanduser(NEBIUS_IAM_TOKEN_PATH)) or
92
- os.path.exists(os.path.expanduser(NEBIUS_CREDENTIALS_PATH)))
93
-
94
-
95
- @annotations.lru_cache(scope='request')
96
- def get_project_id():
97
- try:
98
- with open(os.path.expanduser(NEBIUS_PROJECT_ID_PATH),
99
- encoding='utf-8') as file:
100
- return file.read().strip()
101
- except FileNotFoundError:
102
- return None
184
+ return (os.path.exists(os.path.expanduser(iam_token_path())) or
185
+ os.path.exists(os.path.expanduser(credentials_path())))
103
186
 
104
187
 
105
- @annotations.lru_cache(scope='request')
106
188
  def get_tenant_id():
189
+ tenant_id_in_ws_config = skypilot_config.get_workspace_cloud('nebius').get(
190
+ 'tenant_id', None)
191
+ if tenant_id_in_ws_config is not None:
192
+ return tenant_id_in_ws_config
193
+ tenant_id_in_config = skypilot_config.get_effective_region_config(
194
+ cloud='nebius', region=None, keys=('tenant_id',), default_value=None)
195
+ if tenant_id_in_config is not None:
196
+ return tenant_id_in_config
107
197
  try:
108
- with open(os.path.expanduser(NEBIUS_TENANT_ID_PATH),
198
+ with open(os.path.expanduser(tenant_id_path()),
109
199
  encoding='utf-8') as file:
110
200
  return file.read().strip()
111
201
  except FileNotFoundError:
112
202
  return None
113
203
 
114
204
 
115
- @annotations.lru_cache(scope='request')
116
205
  def sdk():
206
+ """Create the Nebius SDK with the correct credentials.
207
+
208
+ The order of priority is:
209
+ 1. Credentials file specified in workspace config, if set
210
+ 2. IAM token file, if set
211
+ 3. Default credentials path
212
+ """
213
+ # 1. Check if credentials path is set in workspace config (highest priority)
214
+ workspace_cred_path = _get_workspace_credentials_path()
215
+ if workspace_cred_path is not None:
216
+ # Check if token is also available and warn
217
+ token = get_iam_token()
218
+ if token is not None:
219
+ logger.warning(
220
+ f'Both workspace credentials file ({workspace_cred_path}) and '
221
+ f'IAM token file ({iam_token_path()}) are available. Using '
222
+ 'workspace credentials file.')
223
+ return _sdk(None, workspace_cred_path)
224
+
225
+ # 2. Check for IAM token file (second priority)
117
226
  token = get_iam_token()
118
227
  if token is not None:
119
- return nebius.sdk.SDK(credentials=token)
120
- return nebius.sdk.SDK(
121
- credentials_file_name=os.path.expanduser(NEBIUS_CREDENTIALS_PATH))
228
+ return _sdk(token, None)
229
+
230
+ # 3. Fall back to default credentials path (lowest priority)
231
+ default_cred_path = _get_default_credentials_path()
232
+ return _sdk(None, default_cred_path)
233
+
234
+
235
@annotations.lru_cache(scope='request')
def _sdk(token: Optional[str], cred_path: Optional[str]):
    """Build a nebius SDK object from a token or a credentials file.

    Args:
        token: IAM token to authenticate with, or None.
        cred_path: Credentials file path ('~' is expanded), or None.

    Returns:
        A nebius.sdk.SDK created with the given credential and the domain
        returned by api_domain().

    Raises:
        ValueError: unless exactly one of token and cred_path is given.
    """
    # Validated with explicit branches instead of `assert`: assertions are
    # removed under `python -O`, which would let an invalid combination
    # through (both set would silently pick the token).
    if token is not None and cred_path is None:
        return nebius.sdk.SDK(credentials=token, domain=api_domain())
    if cred_path is not None and token is None:
        return nebius.sdk.SDK(
            credentials_file_name=os.path.expanduser(cred_path),
            domain=api_domain(),
        )
    raise ValueError(
        'Exactly one of token or credentials file path must be provided')
122
247
 
123
248
 
124
249
  def get_nebius_credentials(boto3_session):
@@ -196,3 +321,21 @@ def botocore_exceptions():
196
321
  # pylint: disable=import-outside-toplevel
197
322
  from botocore import exceptions
198
323
  return exceptions
324
+
325
+
326
def get_credential_file_paths() -> List[str]:
    """Get the list of credential file paths based on current configuration.

    Returns:
        The de-duplicated paths in a stable order: tenant ID file, IAM token
        file, the workspace credentials file (only when configured), and
        the default credentials file.
    """
    # Always include tenant ID and IAM token paths
    candidates = [tenant_id_path(), iam_token_path()]

    # Add workspace-specific credentials path if set
    workspace_cred_path = _get_workspace_credentials_path()
    if workspace_cred_path is not None:
        candidates.append(workspace_cred_path)
    # Always add default path in case it's needed for fallback
    candidates.append(_get_default_credentials_path())

    # dict.fromkeys() drops duplicates while keeping first-seen order, so the
    # result is the same on every run (a set's iteration order is not).
    return list(dict.fromkeys(candidates))
@@ -0,0 +1 @@
1
+ """Prime Intellect cloud adaptor."""
sky/adaptors/runpod.py CHANGED
@@ -1,8 +1,76 @@
1
1
  """RunPod cloud adaptor."""
2
2
 
3
+ import os
4
+ import time
5
+ from typing import Any, Dict, Optional
6
+
3
7
  from sky.adaptors import common
4
8
 
5
9
  runpod = common.LazyImport(
6
10
  'runpod',
7
11
  import_error_message='Failed to import dependencies for RunPod. '
8
12
  'Try running: pip install "skypilot[runpod]"')
13
+
14
+ # Lazy imports
15
+ requests = common.LazyImport('requests')
16
+
17
+ _REST_BASE = 'https://rest.runpod.io/v1'
18
+ _MAX_RETRIES = 3
19
+ _TIMEOUT = 10
20
+
21
+
22
def _get_api_key() -> str:
    """Return the RunPod API key as a string.

    The key is taken from the ``api_key`` attribute of the runpod module
    when that is truthy, otherwise from the RUNPOD_API_KEY env var.

    Raises:
        RuntimeError: if neither source provides a non-empty key.
    """
    key = getattr(runpod, 'api_key', None) or os.environ.get('RUNPOD_API_KEY')
    if key:
        return str(key)
    raise RuntimeError('RunPod API key is not set. Please set runpod.api_key '
                       'or RUNPOD_API_KEY.')
32
+
33
+
34
def rest_request(method: str,
                 path: str,
                 json: Optional[Dict[str, Any]] = None) -> Any:
    """Send a request to the RunPod REST API and return the decoded reply.

    Args:
        method: HTTP method passed to requests.request().
        path: Path appended to _REST_BASE to form the URL.
        json: Optional JSON body.

    Returns:
        The parsed JSON body; the raw text if the body cannot be parsed as
        JSON; None if the body is empty.

    Raises:
        RuntimeError: if the request keeps raising, or keeps returning
            5xx/429, until _MAX_RETRIES attempts are used up; or as soon as
            any other status >= 400 is returned.
    """
    endpoint = f'{_REST_BASE}{path}'
    request_headers = {
        'Authorization': f'Bearer {_get_api_key()}',
        'Content-Type': 'application/json',
    }
    attempts_made = 0
    while True:
        attempts_made += 1
        out_of_retries = attempts_made >= _MAX_RETRIES
        try:
            response = requests.request(method,
                                        endpoint,
                                        headers=request_headers,
                                        json=json,
                                        timeout=_TIMEOUT)
        except Exception as e:  # pylint: disable=broad-except
            # Retry on transient network errors
            if out_of_retries:
                raise RuntimeError(f'RunPod REST network error: {e}') from e
            time.sleep(1)
            continue

        status = response.status_code
        # Retry on 5xx and 429
        if status >= 500 or status == 429:
            if out_of_retries:
                raise RuntimeError(
                    f'RunPod REST error {status}: {response.text}')
            time.sleep(1)
            continue

        # Non-retryable client error
        if status >= 400:
            raise RuntimeError(f'RunPod REST error {status}: {response.text}')

        if not response.text:
            return None
        try:
            return response.json()
        except Exception:  # pylint: disable=broad-except
            # Body is not valid JSON; hand back the raw text instead.
            return response.text