skypilot-nightly 1.0.0.dev20250509__py3-none-any.whl → 1.0.0.dev20251107__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of skypilot-nightly might be problematic. Click here for more details.

Files changed (512) hide show
  1. sky/__init__.py +22 -6
  2. sky/adaptors/aws.py +25 -7
  3. sky/adaptors/common.py +24 -1
  4. sky/adaptors/coreweave.py +278 -0
  5. sky/adaptors/do.py +8 -2
  6. sky/adaptors/hyperbolic.py +8 -0
  7. sky/adaptors/kubernetes.py +149 -18
  8. sky/adaptors/nebius.py +170 -17
  9. sky/adaptors/primeintellect.py +1 -0
  10. sky/adaptors/runpod.py +68 -0
  11. sky/adaptors/seeweb.py +167 -0
  12. sky/adaptors/shadeform.py +89 -0
  13. sky/admin_policy.py +187 -4
  14. sky/authentication.py +179 -225
  15. sky/backends/__init__.py +4 -2
  16. sky/backends/backend.py +22 -9
  17. sky/backends/backend_utils.py +1299 -380
  18. sky/backends/cloud_vm_ray_backend.py +1715 -518
  19. sky/backends/docker_utils.py +1 -1
  20. sky/backends/local_docker_backend.py +11 -6
  21. sky/backends/wheel_utils.py +37 -9
  22. sky/{clouds/service_catalog → catalog}/__init__.py +21 -19
  23. sky/{clouds/service_catalog → catalog}/aws_catalog.py +27 -8
  24. sky/{clouds/service_catalog → catalog}/azure_catalog.py +10 -7
  25. sky/{clouds/service_catalog → catalog}/common.py +89 -48
  26. sky/{clouds/service_catalog → catalog}/cudo_catalog.py +8 -5
  27. sky/{clouds/service_catalog → catalog}/data_fetchers/analyze.py +1 -1
  28. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_aws.py +30 -40
  29. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_cudo.py +38 -38
  30. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_gcp.py +42 -15
  31. sky/catalog/data_fetchers/fetch_hyperbolic.py +136 -0
  32. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_lambda_cloud.py +1 -0
  33. sky/catalog/data_fetchers/fetch_nebius.py +335 -0
  34. sky/catalog/data_fetchers/fetch_runpod.py +698 -0
  35. sky/catalog/data_fetchers/fetch_seeweb.py +329 -0
  36. sky/catalog/data_fetchers/fetch_shadeform.py +142 -0
  37. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_vast.py +1 -1
  38. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_vsphere.py +1 -1
  39. sky/{clouds/service_catalog → catalog}/do_catalog.py +5 -2
  40. sky/{clouds/service_catalog → catalog}/fluidstack_catalog.py +6 -3
  41. sky/{clouds/service_catalog → catalog}/gcp_catalog.py +41 -15
  42. sky/catalog/hyperbolic_catalog.py +136 -0
  43. sky/{clouds/service_catalog → catalog}/ibm_catalog.py +9 -6
  44. sky/{clouds/service_catalog → catalog}/kubernetes_catalog.py +36 -24
  45. sky/{clouds/service_catalog → catalog}/lambda_catalog.py +9 -6
  46. sky/{clouds/service_catalog → catalog}/nebius_catalog.py +9 -7
  47. sky/{clouds/service_catalog → catalog}/oci_catalog.py +9 -6
  48. sky/{clouds/service_catalog → catalog}/paperspace_catalog.py +5 -2
  49. sky/catalog/primeintellect_catalog.py +95 -0
  50. sky/{clouds/service_catalog → catalog}/runpod_catalog.py +11 -4
  51. sky/{clouds/service_catalog → catalog}/scp_catalog.py +9 -6
  52. sky/catalog/seeweb_catalog.py +184 -0
  53. sky/catalog/shadeform_catalog.py +165 -0
  54. sky/catalog/ssh_catalog.py +167 -0
  55. sky/{clouds/service_catalog → catalog}/vast_catalog.py +6 -3
  56. sky/{clouds/service_catalog → catalog}/vsphere_catalog.py +5 -2
  57. sky/check.py +491 -203
  58. sky/cli.py +5 -6005
  59. sky/client/{cli.py → cli/command.py} +2477 -1885
  60. sky/client/cli/deprecation_utils.py +99 -0
  61. sky/client/cli/flags.py +359 -0
  62. sky/client/cli/table_utils.py +320 -0
  63. sky/client/common.py +70 -32
  64. sky/client/oauth.py +82 -0
  65. sky/client/sdk.py +1203 -297
  66. sky/client/sdk_async.py +833 -0
  67. sky/client/service_account_auth.py +47 -0
  68. sky/cloud_stores.py +73 -0
  69. sky/clouds/__init__.py +13 -0
  70. sky/clouds/aws.py +358 -93
  71. sky/clouds/azure.py +105 -83
  72. sky/clouds/cloud.py +127 -36
  73. sky/clouds/cudo.py +68 -50
  74. sky/clouds/do.py +66 -48
  75. sky/clouds/fluidstack.py +63 -44
  76. sky/clouds/gcp.py +339 -110
  77. sky/clouds/hyperbolic.py +293 -0
  78. sky/clouds/ibm.py +70 -49
  79. sky/clouds/kubernetes.py +563 -162
  80. sky/clouds/lambda_cloud.py +74 -54
  81. sky/clouds/nebius.py +206 -80
  82. sky/clouds/oci.py +88 -66
  83. sky/clouds/paperspace.py +61 -44
  84. sky/clouds/primeintellect.py +317 -0
  85. sky/clouds/runpod.py +164 -74
  86. sky/clouds/scp.py +89 -83
  87. sky/clouds/seeweb.py +466 -0
  88. sky/clouds/shadeform.py +400 -0
  89. sky/clouds/ssh.py +263 -0
  90. sky/clouds/utils/aws_utils.py +10 -4
  91. sky/clouds/utils/gcp_utils.py +87 -11
  92. sky/clouds/utils/oci_utils.py +38 -14
  93. sky/clouds/utils/scp_utils.py +177 -124
  94. sky/clouds/vast.py +99 -77
  95. sky/clouds/vsphere.py +51 -40
  96. sky/core.py +349 -139
  97. sky/dag.py +15 -0
  98. sky/dashboard/out/404.html +1 -1
  99. sky/dashboard/out/_next/static/chunks/1141-e6aa9ab418717c59.js +11 -0
  100. sky/dashboard/out/_next/static/chunks/1272-1ef0bf0237faccdb.js +1 -0
  101. sky/dashboard/out/_next/static/chunks/1871-74503c8e80fd253b.js +6 -0
  102. sky/dashboard/out/_next/static/chunks/2260-7703229c33c5ebd5.js +1 -0
  103. sky/dashboard/out/_next/static/chunks/2350.fab69e61bac57b23.js +1 -0
  104. sky/dashboard/out/_next/static/chunks/2369.fc20f0c2c8ed9fe7.js +15 -0
  105. sky/dashboard/out/_next/static/chunks/2755.fff53c4a3fcae910.js +26 -0
  106. sky/dashboard/out/_next/static/chunks/3294.72362fa129305b19.js +1 -0
  107. sky/dashboard/out/_next/static/chunks/3785.ad6adaa2a0fa9768.js +1 -0
  108. sky/dashboard/out/_next/static/chunks/3850-ff4a9a69d978632b.js +1 -0
  109. sky/dashboard/out/_next/static/chunks/3937.210053269f121201.js +1 -0
  110. sky/dashboard/out/_next/static/chunks/4725.a830b5c9e7867c92.js +1 -0
  111. sky/dashboard/out/_next/static/chunks/4937.a2baa2df5572a276.js +15 -0
  112. sky/dashboard/out/_next/static/chunks/5739-d67458fcb1386c92.js +8 -0
  113. sky/dashboard/out/_next/static/chunks/6130-2be46d70a38f1e82.js +1 -0
  114. sky/dashboard/out/_next/static/chunks/616-3d59f75e2ccf9321.js +39 -0
  115. sky/dashboard/out/_next/static/chunks/6212-7bd06f60ba693125.js +13 -0
  116. sky/dashboard/out/_next/static/chunks/6601-06114c982db410b6.js +1 -0
  117. sky/dashboard/out/_next/static/chunks/6856-ef8ba11f96d8c4a3.js +1 -0
  118. sky/dashboard/out/_next/static/chunks/6989-01359c57e018caa4.js +1 -0
  119. sky/dashboard/out/_next/static/chunks/6990-32b6e2d3822301fa.js +1 -0
  120. sky/dashboard/out/_next/static/chunks/7359-c8d04e06886000b3.js +30 -0
  121. sky/dashboard/out/_next/static/chunks/7411-b15471acd2cba716.js +41 -0
  122. sky/dashboard/out/_next/static/chunks/7615-3301e838e5f25772.js +1 -0
  123. sky/dashboard/out/_next/static/chunks/8640.5b9475a2d18c5416.js +16 -0
  124. sky/dashboard/out/_next/static/chunks/8969-1e4613c651bf4051.js +1 -0
  125. sky/dashboard/out/_next/static/chunks/9025.fa408f3242e9028d.js +6 -0
  126. sky/dashboard/out/_next/static/chunks/9353-cff34f7e773b2e2b.js +1 -0
  127. sky/dashboard/out/_next/static/chunks/9360.7310982cf5a0dc79.js +31 -0
  128. sky/dashboard/out/_next/static/chunks/9847.3aaca6bb33455140.js +30 -0
  129. sky/dashboard/out/_next/static/chunks/fd9d1056-86323a29a8f7e46a.js +1 -0
  130. sky/dashboard/out/_next/static/chunks/framework-cf60a09ccd051a10.js +33 -0
  131. sky/dashboard/out/_next/static/chunks/main-app-587214043926b3cc.js +1 -0
  132. sky/dashboard/out/_next/static/chunks/main-f15ccb73239a3bf1.js +1 -0
  133. sky/dashboard/out/_next/static/chunks/pages/_app-bde01e4a2beec258.js +34 -0
  134. sky/dashboard/out/_next/static/chunks/pages/_error-c66a4e8afc46f17b.js +1 -0
  135. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-c736ead69c2d86ec.js +16 -0
  136. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-a37d2063af475a1c.js +1 -0
  137. sky/dashboard/out/_next/static/chunks/pages/clusters-d44859594e6f8064.js +1 -0
  138. sky/dashboard/out/_next/static/chunks/pages/config-dfb9bf07b13045f4.js +1 -0
  139. sky/dashboard/out/_next/static/chunks/pages/index-444f1804401f04ea.js +1 -0
  140. sky/dashboard/out/_next/static/chunks/pages/infra/[context]-c0b5935149902e6f.js +1 -0
  141. sky/dashboard/out/_next/static/chunks/pages/infra-aed0ea19df7cf961.js +1 -0
  142. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-5796e8d6aea291a0.js +16 -0
  143. sky/dashboard/out/_next/static/chunks/pages/jobs/pools/[pool]-6edeb7d06032adfc.js +21 -0
  144. sky/dashboard/out/_next/static/chunks/pages/jobs-479dde13399cf270.js +1 -0
  145. sky/dashboard/out/_next/static/chunks/pages/users-5ab3b907622cf0fe.js +1 -0
  146. sky/dashboard/out/_next/static/chunks/pages/volumes-b84b948ff357c43e.js +1 -0
  147. sky/dashboard/out/_next/static/chunks/pages/workspace/new-3f88a1c7e86a3f86.js +1 -0
  148. sky/dashboard/out/_next/static/chunks/pages/workspaces/[name]-c5a3eeee1c218af1.js +1 -0
  149. sky/dashboard/out/_next/static/chunks/pages/workspaces-22b23febb3e89ce1.js +1 -0
  150. sky/dashboard/out/_next/static/chunks/webpack-2679be77fc08a2f8.js +1 -0
  151. sky/dashboard/out/_next/static/css/0748ce22df867032.css +3 -0
  152. sky/dashboard/out/_next/static/zB0ed6ge_W1MDszVHhijS/_buildManifest.js +1 -0
  153. sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
  154. sky/dashboard/out/clusters/[cluster].html +1 -1
  155. sky/dashboard/out/clusters.html +1 -1
  156. sky/dashboard/out/config.html +1 -0
  157. sky/dashboard/out/index.html +1 -1
  158. sky/dashboard/out/infra/[context].html +1 -0
  159. sky/dashboard/out/infra.html +1 -0
  160. sky/dashboard/out/jobs/[job].html +1 -1
  161. sky/dashboard/out/jobs/pools/[pool].html +1 -0
  162. sky/dashboard/out/jobs.html +1 -1
  163. sky/dashboard/out/users.html +1 -0
  164. sky/dashboard/out/volumes.html +1 -0
  165. sky/dashboard/out/workspace/new.html +1 -0
  166. sky/dashboard/out/workspaces/[name].html +1 -0
  167. sky/dashboard/out/workspaces.html +1 -0
  168. sky/data/data_utils.py +137 -1
  169. sky/data/mounting_utils.py +269 -84
  170. sky/data/storage.py +1451 -1807
  171. sky/data/storage_utils.py +43 -57
  172. sky/exceptions.py +132 -2
  173. sky/execution.py +206 -63
  174. sky/global_user_state.py +2374 -586
  175. sky/jobs/__init__.py +5 -0
  176. sky/jobs/client/sdk.py +242 -65
  177. sky/jobs/client/sdk_async.py +143 -0
  178. sky/jobs/constants.py +9 -8
  179. sky/jobs/controller.py +839 -277
  180. sky/jobs/file_content_utils.py +80 -0
  181. sky/jobs/log_gc.py +201 -0
  182. sky/jobs/recovery_strategy.py +398 -152
  183. sky/jobs/scheduler.py +315 -189
  184. sky/jobs/server/core.py +829 -255
  185. sky/jobs/server/server.py +156 -115
  186. sky/jobs/server/utils.py +136 -0
  187. sky/jobs/state.py +2092 -701
  188. sky/jobs/utils.py +1242 -160
  189. sky/logs/__init__.py +21 -0
  190. sky/logs/agent.py +108 -0
  191. sky/logs/aws.py +243 -0
  192. sky/logs/gcp.py +91 -0
  193. sky/metrics/__init__.py +0 -0
  194. sky/metrics/utils.py +443 -0
  195. sky/models.py +78 -1
  196. sky/optimizer.py +164 -70
  197. sky/provision/__init__.py +90 -4
  198. sky/provision/aws/config.py +147 -26
  199. sky/provision/aws/instance.py +135 -50
  200. sky/provision/azure/instance.py +10 -5
  201. sky/provision/common.py +13 -1
  202. sky/provision/cudo/cudo_machine_type.py +1 -1
  203. sky/provision/cudo/cudo_utils.py +14 -8
  204. sky/provision/cudo/cudo_wrapper.py +72 -71
  205. sky/provision/cudo/instance.py +10 -6
  206. sky/provision/do/instance.py +10 -6
  207. sky/provision/do/utils.py +4 -3
  208. sky/provision/docker_utils.py +114 -23
  209. sky/provision/fluidstack/instance.py +13 -8
  210. sky/provision/gcp/__init__.py +1 -0
  211. sky/provision/gcp/config.py +301 -19
  212. sky/provision/gcp/constants.py +218 -0
  213. sky/provision/gcp/instance.py +36 -8
  214. sky/provision/gcp/instance_utils.py +18 -4
  215. sky/provision/gcp/volume_utils.py +247 -0
  216. sky/provision/hyperbolic/__init__.py +12 -0
  217. sky/provision/hyperbolic/config.py +10 -0
  218. sky/provision/hyperbolic/instance.py +437 -0
  219. sky/provision/hyperbolic/utils.py +373 -0
  220. sky/provision/instance_setup.py +93 -14
  221. sky/provision/kubernetes/__init__.py +5 -0
  222. sky/provision/kubernetes/config.py +9 -52
  223. sky/provision/kubernetes/constants.py +17 -0
  224. sky/provision/kubernetes/instance.py +789 -247
  225. sky/provision/kubernetes/manifests/fusermount-server-daemonset.yaml +1 -2
  226. sky/provision/kubernetes/network.py +27 -17
  227. sky/provision/kubernetes/network_utils.py +40 -43
  228. sky/provision/kubernetes/utils.py +1192 -531
  229. sky/provision/kubernetes/volume.py +282 -0
  230. sky/provision/lambda_cloud/instance.py +22 -16
  231. sky/provision/nebius/constants.py +50 -0
  232. sky/provision/nebius/instance.py +19 -6
  233. sky/provision/nebius/utils.py +196 -91
  234. sky/provision/oci/instance.py +10 -5
  235. sky/provision/paperspace/instance.py +10 -7
  236. sky/provision/paperspace/utils.py +1 -1
  237. sky/provision/primeintellect/__init__.py +10 -0
  238. sky/provision/primeintellect/config.py +11 -0
  239. sky/provision/primeintellect/instance.py +454 -0
  240. sky/provision/primeintellect/utils.py +398 -0
  241. sky/provision/provisioner.py +110 -36
  242. sky/provision/runpod/__init__.py +5 -0
  243. sky/provision/runpod/instance.py +27 -6
  244. sky/provision/runpod/utils.py +51 -18
  245. sky/provision/runpod/volume.py +180 -0
  246. sky/provision/scp/__init__.py +15 -0
  247. sky/provision/scp/config.py +93 -0
  248. sky/provision/scp/instance.py +531 -0
  249. sky/provision/seeweb/__init__.py +11 -0
  250. sky/provision/seeweb/config.py +13 -0
  251. sky/provision/seeweb/instance.py +807 -0
  252. sky/provision/shadeform/__init__.py +11 -0
  253. sky/provision/shadeform/config.py +12 -0
  254. sky/provision/shadeform/instance.py +351 -0
  255. sky/provision/shadeform/shadeform_utils.py +83 -0
  256. sky/provision/ssh/__init__.py +18 -0
  257. sky/provision/vast/instance.py +13 -8
  258. sky/provision/vast/utils.py +10 -7
  259. sky/provision/vsphere/common/vim_utils.py +1 -2
  260. sky/provision/vsphere/instance.py +15 -10
  261. sky/provision/vsphere/vsphere_utils.py +9 -19
  262. sky/py.typed +0 -0
  263. sky/resources.py +844 -118
  264. sky/schemas/__init__.py +0 -0
  265. sky/schemas/api/__init__.py +0 -0
  266. sky/schemas/api/responses.py +225 -0
  267. sky/schemas/db/README +4 -0
  268. sky/schemas/db/env.py +90 -0
  269. sky/schemas/db/global_user_state/001_initial_schema.py +124 -0
  270. sky/schemas/db/global_user_state/002_add_workspace_to_cluster_history.py +35 -0
  271. sky/schemas/db/global_user_state/003_fix_initial_revision.py +61 -0
  272. sky/schemas/db/global_user_state/004_is_managed.py +34 -0
  273. sky/schemas/db/global_user_state/005_cluster_event.py +32 -0
  274. sky/schemas/db/global_user_state/006_provision_log.py +41 -0
  275. sky/schemas/db/global_user_state/007_cluster_event_request_id.py +34 -0
  276. sky/schemas/db/global_user_state/008_skylet_ssh_tunnel_metadata.py +34 -0
  277. sky/schemas/db/global_user_state/009_last_activity_and_launched_at.py +89 -0
  278. sky/schemas/db/global_user_state/010_save_ssh_key.py +66 -0
  279. sky/schemas/db/script.py.mako +28 -0
  280. sky/schemas/db/serve_state/001_initial_schema.py +67 -0
  281. sky/schemas/db/skypilot_config/001_initial_schema.py +30 -0
  282. sky/schemas/db/spot_jobs/001_initial_schema.py +97 -0
  283. sky/schemas/db/spot_jobs/002_cluster_pool.py +42 -0
  284. sky/schemas/db/spot_jobs/003_pool_hash.py +34 -0
  285. sky/schemas/db/spot_jobs/004_job_file_contents.py +42 -0
  286. sky/schemas/db/spot_jobs/005_logs_gc.py +38 -0
  287. sky/schemas/generated/__init__.py +0 -0
  288. sky/schemas/generated/autostopv1_pb2.py +36 -0
  289. sky/schemas/generated/autostopv1_pb2.pyi +43 -0
  290. sky/schemas/generated/autostopv1_pb2_grpc.py +146 -0
  291. sky/schemas/generated/jobsv1_pb2.py +86 -0
  292. sky/schemas/generated/jobsv1_pb2.pyi +254 -0
  293. sky/schemas/generated/jobsv1_pb2_grpc.py +542 -0
  294. sky/schemas/generated/managed_jobsv1_pb2.py +74 -0
  295. sky/schemas/generated/managed_jobsv1_pb2.pyi +278 -0
  296. sky/schemas/generated/managed_jobsv1_pb2_grpc.py +278 -0
  297. sky/schemas/generated/servev1_pb2.py +58 -0
  298. sky/schemas/generated/servev1_pb2.pyi +115 -0
  299. sky/schemas/generated/servev1_pb2_grpc.py +322 -0
  300. sky/serve/autoscalers.py +357 -5
  301. sky/serve/client/impl.py +310 -0
  302. sky/serve/client/sdk.py +47 -139
  303. sky/serve/client/sdk_async.py +130 -0
  304. sky/serve/constants.py +10 -8
  305. sky/serve/controller.py +64 -19
  306. sky/serve/load_balancer.py +106 -60
  307. sky/serve/load_balancing_policies.py +115 -1
  308. sky/serve/replica_managers.py +273 -162
  309. sky/serve/serve_rpc_utils.py +179 -0
  310. sky/serve/serve_state.py +554 -251
  311. sky/serve/serve_utils.py +733 -220
  312. sky/serve/server/core.py +66 -711
  313. sky/serve/server/impl.py +1093 -0
  314. sky/serve/server/server.py +21 -18
  315. sky/serve/service.py +133 -48
  316. sky/serve/service_spec.py +135 -16
  317. sky/serve/spot_placer.py +3 -0
  318. sky/server/auth/__init__.py +0 -0
  319. sky/server/auth/authn.py +50 -0
  320. sky/server/auth/loopback.py +38 -0
  321. sky/server/auth/oauth2_proxy.py +200 -0
  322. sky/server/common.py +475 -181
  323. sky/server/config.py +81 -23
  324. sky/server/constants.py +44 -6
  325. sky/server/daemons.py +229 -0
  326. sky/server/html/token_page.html +185 -0
  327. sky/server/metrics.py +160 -0
  328. sky/server/requests/executor.py +528 -138
  329. sky/server/requests/payloads.py +351 -17
  330. sky/server/requests/preconditions.py +21 -17
  331. sky/server/requests/process.py +112 -29
  332. sky/server/requests/request_names.py +120 -0
  333. sky/server/requests/requests.py +817 -224
  334. sky/server/requests/serializers/decoders.py +82 -31
  335. sky/server/requests/serializers/encoders.py +140 -22
  336. sky/server/requests/threads.py +106 -0
  337. sky/server/rest.py +417 -0
  338. sky/server/server.py +1290 -284
  339. sky/server/state.py +20 -0
  340. sky/server/stream_utils.py +345 -57
  341. sky/server/uvicorn.py +217 -3
  342. sky/server/versions.py +270 -0
  343. sky/setup_files/MANIFEST.in +5 -0
  344. sky/setup_files/alembic.ini +156 -0
  345. sky/setup_files/dependencies.py +136 -31
  346. sky/setup_files/setup.py +44 -42
  347. sky/sky_logging.py +102 -5
  348. sky/skylet/attempt_skylet.py +1 -0
  349. sky/skylet/autostop_lib.py +129 -8
  350. sky/skylet/configs.py +27 -20
  351. sky/skylet/constants.py +171 -19
  352. sky/skylet/events.py +105 -21
  353. sky/skylet/job_lib.py +335 -104
  354. sky/skylet/log_lib.py +297 -18
  355. sky/skylet/log_lib.pyi +44 -1
  356. sky/skylet/ray_patches/__init__.py +17 -3
  357. sky/skylet/ray_patches/autoscaler.py.diff +18 -0
  358. sky/skylet/ray_patches/cli.py.diff +19 -0
  359. sky/skylet/ray_patches/command_runner.py.diff +17 -0
  360. sky/skylet/ray_patches/log_monitor.py.diff +20 -0
  361. sky/skylet/ray_patches/resource_demand_scheduler.py.diff +32 -0
  362. sky/skylet/ray_patches/updater.py.diff +18 -0
  363. sky/skylet/ray_patches/worker.py.diff +41 -0
  364. sky/skylet/services.py +564 -0
  365. sky/skylet/skylet.py +63 -4
  366. sky/skylet/subprocess_daemon.py +103 -29
  367. sky/skypilot_config.py +506 -99
  368. sky/ssh_node_pools/__init__.py +1 -0
  369. sky/ssh_node_pools/core.py +135 -0
  370. sky/ssh_node_pools/server.py +233 -0
  371. sky/task.py +621 -137
  372. sky/templates/aws-ray.yml.j2 +10 -3
  373. sky/templates/azure-ray.yml.j2 +1 -1
  374. sky/templates/do-ray.yml.j2 +1 -1
  375. sky/templates/gcp-ray.yml.j2 +57 -0
  376. sky/templates/hyperbolic-ray.yml.j2 +67 -0
  377. sky/templates/jobs-controller.yaml.j2 +27 -24
  378. sky/templates/kubernetes-loadbalancer.yml.j2 +2 -0
  379. sky/templates/kubernetes-ray.yml.j2 +607 -51
  380. sky/templates/lambda-ray.yml.j2 +1 -1
  381. sky/templates/nebius-ray.yml.j2 +33 -12
  382. sky/templates/paperspace-ray.yml.j2 +1 -1
  383. sky/templates/primeintellect-ray.yml.j2 +71 -0
  384. sky/templates/runpod-ray.yml.j2 +9 -1
  385. sky/templates/scp-ray.yml.j2 +3 -50
  386. sky/templates/seeweb-ray.yml.j2 +108 -0
  387. sky/templates/shadeform-ray.yml.j2 +72 -0
  388. sky/templates/sky-serve-controller.yaml.j2 +22 -2
  389. sky/templates/websocket_proxy.py +178 -18
  390. sky/usage/usage_lib.py +18 -11
  391. sky/users/__init__.py +0 -0
  392. sky/users/model.conf +15 -0
  393. sky/users/permission.py +387 -0
  394. sky/users/rbac.py +121 -0
  395. sky/users/server.py +720 -0
  396. sky/users/token_service.py +218 -0
  397. sky/utils/accelerator_registry.py +34 -5
  398. sky/utils/admin_policy_utils.py +84 -38
  399. sky/utils/annotations.py +16 -5
  400. sky/utils/asyncio_utils.py +78 -0
  401. sky/utils/auth_utils.py +153 -0
  402. sky/utils/benchmark_utils.py +60 -0
  403. sky/utils/cli_utils/status_utils.py +159 -86
  404. sky/utils/cluster_utils.py +31 -9
  405. sky/utils/command_runner.py +354 -68
  406. sky/utils/command_runner.pyi +93 -3
  407. sky/utils/common.py +35 -8
  408. sky/utils/common_utils.py +310 -87
  409. sky/utils/config_utils.py +87 -5
  410. sky/utils/context.py +402 -0
  411. sky/utils/context_utils.py +222 -0
  412. sky/utils/controller_utils.py +264 -89
  413. sky/utils/dag_utils.py +31 -12
  414. sky/utils/db/__init__.py +0 -0
  415. sky/utils/db/db_utils.py +470 -0
  416. sky/utils/db/migration_utils.py +133 -0
  417. sky/utils/directory_utils.py +12 -0
  418. sky/utils/env_options.py +13 -0
  419. sky/utils/git.py +567 -0
  420. sky/utils/git_clone.sh +460 -0
  421. sky/utils/infra_utils.py +195 -0
  422. sky/utils/kubernetes/cleanup-tunnel.sh +62 -0
  423. sky/utils/kubernetes/config_map_utils.py +133 -0
  424. sky/utils/kubernetes/create_cluster.sh +13 -27
  425. sky/utils/kubernetes/delete_cluster.sh +10 -7
  426. sky/utils/kubernetes/deploy_remote_cluster.py +1299 -0
  427. sky/utils/kubernetes/exec_kubeconfig_converter.py +22 -31
  428. sky/utils/kubernetes/generate_kind_config.py +6 -66
  429. sky/utils/kubernetes/generate_kubeconfig.sh +4 -1
  430. sky/utils/kubernetes/gpu_labeler.py +5 -5
  431. sky/utils/kubernetes/kubernetes_deploy_utils.py +354 -47
  432. sky/utils/kubernetes/ssh-tunnel.sh +379 -0
  433. sky/utils/kubernetes/ssh_utils.py +221 -0
  434. sky/utils/kubernetes_enums.py +8 -15
  435. sky/utils/lock_events.py +94 -0
  436. sky/utils/locks.py +368 -0
  437. sky/utils/log_utils.py +300 -6
  438. sky/utils/perf_utils.py +22 -0
  439. sky/utils/resource_checker.py +298 -0
  440. sky/utils/resources_utils.py +249 -32
  441. sky/utils/rich_utils.py +213 -37
  442. sky/utils/schemas.py +905 -147
  443. sky/utils/serialize_utils.py +16 -0
  444. sky/utils/status_lib.py +10 -0
  445. sky/utils/subprocess_utils.py +38 -15
  446. sky/utils/tempstore.py +70 -0
  447. sky/utils/timeline.py +24 -52
  448. sky/utils/ux_utils.py +84 -15
  449. sky/utils/validator.py +11 -1
  450. sky/utils/volume.py +86 -0
  451. sky/utils/yaml_utils.py +111 -0
  452. sky/volumes/__init__.py +13 -0
  453. sky/volumes/client/__init__.py +0 -0
  454. sky/volumes/client/sdk.py +149 -0
  455. sky/volumes/server/__init__.py +0 -0
  456. sky/volumes/server/core.py +258 -0
  457. sky/volumes/server/server.py +122 -0
  458. sky/volumes/volume.py +212 -0
  459. sky/workspaces/__init__.py +0 -0
  460. sky/workspaces/core.py +655 -0
  461. sky/workspaces/server.py +101 -0
  462. sky/workspaces/utils.py +56 -0
  463. skypilot_nightly-1.0.0.dev20251107.dist-info/METADATA +675 -0
  464. skypilot_nightly-1.0.0.dev20251107.dist-info/RECORD +594 -0
  465. {skypilot_nightly-1.0.0.dev20250509.dist-info → skypilot_nightly-1.0.0.dev20251107.dist-info}/WHEEL +1 -1
  466. sky/benchmark/benchmark_state.py +0 -256
  467. sky/benchmark/benchmark_utils.py +0 -641
  468. sky/clouds/service_catalog/constants.py +0 -7
  469. sky/dashboard/out/_next/static/LksQgChY5izXjokL3LcEu/_buildManifest.js +0 -1
  470. sky/dashboard/out/_next/static/chunks/236-f49500b82ad5392d.js +0 -6
  471. sky/dashboard/out/_next/static/chunks/312-c3c8845990db8ffc.js +0 -15
  472. sky/dashboard/out/_next/static/chunks/37-0a572fe0dbb89c4d.js +0 -6
  473. sky/dashboard/out/_next/static/chunks/678-206dddca808e6d16.js +0 -59
  474. sky/dashboard/out/_next/static/chunks/845-0f8017370869e269.js +0 -1
  475. sky/dashboard/out/_next/static/chunks/979-7bf73a4c7cea0f5c.js +0 -1
  476. sky/dashboard/out/_next/static/chunks/fd9d1056-2821b0f0cabcd8bd.js +0 -1
  477. sky/dashboard/out/_next/static/chunks/framework-87d061ee6ed71b28.js +0 -33
  478. sky/dashboard/out/_next/static/chunks/main-app-241eb28595532291.js +0 -1
  479. sky/dashboard/out/_next/static/chunks/main-e0e2335212e72357.js +0 -1
  480. sky/dashboard/out/_next/static/chunks/pages/_app-e6b013bc3f77ad60.js +0 -1
  481. sky/dashboard/out/_next/static/chunks/pages/_error-1be831200e60c5c0.js +0 -1
  482. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-e15db85d0ea1fbe1.js +0 -1
  483. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-f383db7389368ea7.js +0 -1
  484. sky/dashboard/out/_next/static/chunks/pages/clusters-a93b93e10b8b074e.js +0 -1
  485. sky/dashboard/out/_next/static/chunks/pages/index-f9f039532ca8cbc4.js +0 -1
  486. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-03f279c6741fb48b.js +0 -1
  487. sky/dashboard/out/_next/static/chunks/pages/jobs-a75029b67aab6a2e.js +0 -1
  488. sky/dashboard/out/_next/static/chunks/webpack-830f59b8404e96b8.js +0 -1
  489. sky/dashboard/out/_next/static/css/c6933bbb2ce7f4dd.css +0 -3
  490. sky/jobs/dashboard/dashboard.py +0 -223
  491. sky/jobs/dashboard/static/favicon.ico +0 -0
  492. sky/jobs/dashboard/templates/index.html +0 -831
  493. sky/jobs/server/dashboard_utils.py +0 -69
  494. sky/skylet/providers/scp/__init__.py +0 -2
  495. sky/skylet/providers/scp/config.py +0 -149
  496. sky/skylet/providers/scp/node_provider.py +0 -578
  497. sky/templates/kubernetes-ssh-jump.yml.j2 +0 -94
  498. sky/utils/db_utils.py +0 -100
  499. sky/utils/kubernetes/deploy_remote_cluster.sh +0 -308
  500. sky/utils/kubernetes/ssh_jump_lifecycle_manager.py +0 -191
  501. skypilot_nightly-1.0.0.dev20250509.dist-info/METADATA +0 -361
  502. skypilot_nightly-1.0.0.dev20250509.dist-info/RECORD +0 -396
  503. /sky/{clouds/service_catalog → catalog}/config.py +0 -0
  504. /sky/{benchmark → catalog/data_fetchers}/__init__.py +0 -0
  505. /sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_azure.py +0 -0
  506. /sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_fluidstack.py +0 -0
  507. /sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_ibm.py +0 -0
  508. /sky/{clouds/service_catalog/data_fetchers → client/cli}/__init__.py +0 -0
  509. /sky/dashboard/out/_next/static/{LksQgChY5izXjokL3LcEu → zB0ed6ge_W1MDszVHhijS}/_ssgManifest.js +0 -0
  510. {skypilot_nightly-1.0.0.dev20250509.dist-info → skypilot_nightly-1.0.0.dev20251107.dist-info}/entry_points.txt +0 -0
  511. {skypilot_nightly-1.0.0.dev20250509.dist-info → skypilot_nightly-1.0.0.dev20251107.dist-info}/licenses/LICENSE +0 -0
  512. {skypilot_nightly-1.0.0.dev20250509.dist-info → skypilot_nightly-1.0.0.dev20251107.dist-info}/top_level.txt +0 -0
@@ -1,10 +1,12 @@
1
1
  """Kubernetes adaptors"""
2
+ import functools
2
3
  import logging
3
4
  import os
5
+ import platform
4
6
  from typing import Any, Callable, Optional, Set
5
7
 
8
+ from sky import sky_logging
6
9
  from sky.adaptors import common
7
- from sky.sky_logging import set_logging_level
8
10
  from sky.utils import annotations
9
11
  from sky.utils import common_utils
10
12
  from sky.utils import ux_utils
@@ -13,12 +15,23 @@ _IMPORT_ERROR_MESSAGE = ('Failed to import dependencies for Kubernetes. '
13
15
  'Try running: pip install "skypilot[kubernetes]"')
14
16
  kubernetes = common.LazyImport('kubernetes',
15
17
  import_error_message=_IMPORT_ERROR_MESSAGE)
18
+ models = common.LazyImport('kubernetes.client.models',
19
+ import_error_message=_IMPORT_ERROR_MESSAGE)
16
20
  urllib3 = common.LazyImport('urllib3',
17
21
  import_error_message=_IMPORT_ERROR_MESSAGE)
22
+ dateutil_parser = common.LazyImport('dateutil.parser',
23
+ import_error_message=_IMPORT_ERROR_MESSAGE)
18
24
 
19
25
  # Timeout to use for API calls
20
26
  API_TIMEOUT = 5
21
27
 
28
+ # Check if KUBECONFIG is set, and use it if it is.
29
+ DEFAULT_KUBECONFIG_PATH = '~/.kube/config'
30
+ # From kubernetes package, keep a copy here to avoid actually importing
31
+ # kubernetes package when parsing the KUBECONFIG env var to do credential
32
+ # file mounts.
33
+ ENV_KUBECONFIG_PATH_SEPARATOR = ';' if platform.system() == 'Windows' else ':'
34
+
22
35
  DEFAULT_IN_CLUSTER_REGION = 'in-cluster'
23
36
  # The name for the environment variable that stores the in-cluster context name
24
37
  # for Kubernetes clusters. This is used to associate a name with the current
@@ -26,6 +39,8 @@ DEFAULT_IN_CLUSTER_REGION = 'in-cluster'
26
39
  # set to DEFAULT_IN_CLUSTER_REGION.
27
40
  IN_CLUSTER_CONTEXT_NAME_ENV_VAR = 'SKYPILOT_IN_CLUSTER_CONTEXT_NAME'
28
41
 
42
+ logger = sky_logging.init_logger(__name__)
43
+
29
44
 
30
45
  def _decorate_methods(obj: Any, decorator: Callable, decoration_type: str):
31
46
  for attr_name in dir(obj):
@@ -43,7 +58,7 @@ def _decorate_methods(obj: Any, decorator: Callable, decoration_type: str):
43
58
  return obj
44
59
 
45
60
 
46
- def _api_logging_decorator(logger: str, level: int):
61
+ def _api_logging_decorator(logger_src: str, level: int):
47
62
  """Decorator to set logging level for API calls.
48
63
 
49
64
  This is used to suppress the verbose logging from urllib3 when calls to the
@@ -54,7 +69,9 @@ def _api_logging_decorator(logger: str, level: int):
54
69
 
55
70
  def wrapped(*args, **kwargs):
56
71
  obj = api(*args, **kwargs)
57
- _decorate_methods(obj, set_logging_level(logger, level), 'api_log')
72
+ _decorate_methods(obj,
73
+ sky_logging.set_logging_level(logger_src, level),
74
+ 'api_log')
58
75
  return obj
59
76
 
60
77
  return wrapped
@@ -62,31 +79,61 @@ def _api_logging_decorator(logger: str, level: int):
62
79
  return decorated_api
63
80
 
64
81
 
82
def _get_config_file() -> str:
    """Returns the kubeconfig path to use, read fresh from the environment.

    The kubernetes package reads the KUBECONFIG env var when the package is
    initialized, so the variable is looked up again on every call in case it
    has changed since then.

    Returns:
        The value of KUBECONFIG if it is set, otherwise '~/.kube/config'.
    """
    try:
        return os.environ['KUBECONFIG']
    except KeyError:
        return '~/.kube/config'
87
+
88
+
65
89
  def _load_config(context: Optional[str] = None):
66
90
  urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
67
91
 
68
92
  def _load_config_from_kubeconfig(context: Optional[str] = None):
69
93
  try:
70
- kubernetes.config.load_kube_config(context=context)
94
+ kubernetes.config.load_kube_config(config_file=_get_config_file(),
95
+ context=context)
71
96
  except kubernetes.config.config_exception.ConfigException as e:
72
97
  suffix = common_utils.format_exception(e, use_bracket=True)
73
98
  context_name = '(current-context)' if context is None else context
99
+ is_ssh_node_pool = False
100
+ if context_name.startswith('ssh-'):
101
+ context_name = common_utils.removeprefix(context_name, 'ssh-')
102
+ is_ssh_node_pool = True
74
103
  # Check if exception was due to no current-context
75
104
  if 'Expected key current-context' in str(e):
76
- err_str = ('Failed to load Kubernetes configuration for '
77
- f'{context_name!r}. '
78
- 'Kubeconfig does not contain any valid context(s).'
79
- f'\n{suffix}\n'
80
- ' If you were running a local Kubernetes '
81
- 'cluster, run `sky local up` to start the cluster.')
105
+ if is_ssh_node_pool:
106
+ context_name = common_utils.removeprefix(
107
+ context_name, 'ssh-')
108
+ err_str = ('Failed to load SSH Node Pool configuration for '
109
+ f'{context_name!r}.\n'
110
+ ' Run `sky ssh up --infra {context_name}` to '
111
+ 'set up or repair the cluster.')
112
+ else:
113
+ err_str = (
114
+ 'Failed to load Kubernetes configuration for '
115
+ f'{context_name!r}. '
116
+ 'Kubeconfig does not contain any valid context(s).'
117
+ f'\n{suffix}\n'
118
+ ' If you were running a local Kubernetes '
119
+ 'cluster, run `sky local up` to start the cluster.')
82
120
  else:
83
121
  kubeconfig_path = os.environ.get('KUBECONFIG', '~/.kube/config')
84
- err_str = (
85
- f'Failed to load Kubernetes configuration for '
86
- f'{context_name!r}. Please check if your kubeconfig file '
87
- f'exists at {kubeconfig_path} and is valid.\n{suffix}')
88
- err_str += '\nTo disable Kubernetes for SkyPilot: run `sky check`.'
89
- if context is None: # kubernetes defaults to current-context.
122
+ if is_ssh_node_pool:
123
+ err_str = (
124
+ f'Failed to load SSH Node Pool configuration for '
125
+ f'{context_name!r}. Run `sky ssh up --infra '
126
+ f'{context_name}` to set up or repair the cluster.')
127
+ else:
128
+ err_str = (
129
+ 'Failed to load Kubernetes configuration for '
130
+ f'{context_name!r}. Please check if your kubeconfig '
131
+ f'file exists at {kubeconfig_path} and is valid.'
132
+ f'\n{suffix}\n')
133
+ if is_ssh_node_pool:
134
+ err_str += (f'\nTo disable SSH Node Pool {context_name!r}: '
135
+ 'run `sky check`.')
136
+ else:
90
137
  err_str += (
91
138
  '\nHint: Kubernetes attempted to query the current-context '
92
139
  'set in kubeconfig. Check if the current-context is valid.')
@@ -100,8 +147,11 @@ def _load_config(context: Optional[str] = None):
100
147
  # show up in SkyPilot tasks. For now, we work around by using
101
148
  # DNS name instead of environment variables.
102
149
  # See issue: https://github.com/skypilot-org/skypilot/issues/2287
103
- os.environ['KUBERNETES_SERVICE_HOST'] = 'kubernetes.default.svc'
104
- os.environ['KUBERNETES_SERVICE_PORT'] = '443'
150
+ # Only set if not already present (preserving existing values)
151
+ if 'KUBERNETES_SERVICE_HOST' not in os.environ:
152
+ os.environ['KUBERNETES_SERVICE_HOST'] = 'kubernetes.default.svc'
153
+ if 'KUBERNETES_SERVICE_PORT' not in os.environ:
154
+ os.environ['KUBERNETES_SERVICE_PORT'] = '443'
105
155
  kubernetes.config.load_incluster_config()
106
156
  except kubernetes.config.config_exception.ConfigException:
107
157
  _load_config_from_kubeconfig()
@@ -109,8 +159,65 @@ def _load_config(context: Optional[str] = None):
109
159
  _load_config_from_kubeconfig(context)
110
160
 
111
161
 
162
def list_kube_config_contexts():
    """Lists the contexts of the kubeconfig resolved by _get_config_file().

    Returns:
        The result of kubernetes.config.list_kube_config_contexts for that
        config file.
    """
    config_file = _get_config_file()
    return kubernetes.config.list_kube_config_contexts(config_file)
164
+
165
+
166
class ClientWrapper:
    """Wrapper around the kubernetes API clients.

    This is needed because we cache kubernetes.client.ApiClient and other typed
    clients (e.g. kubernetes.client.CoreV1Api) and lru_cache.cache_clear() does
    not call close() on the client to cleanup external resources like
    semaphores. This wrapper adds a __del__ to ensure the external state of
    kubernetes clients is properly cleaned up on GC.

    Any attribute that is not found on the wrapper itself is looked up on the
    wrapped client.
    """

    def __init__(self, client):
        self._client = client

    def __getattr__(self, name):
        """Delegate to the underlying client.

        Raises:
            AttributeError: if the wrapper has no `_client` yet, or if the
                wrapped client does not have the attribute.
        """
        # __getattr__ only runs after normal lookup has failed. If the missing
        # name is '_client' itself, the instance was never initialized (e.g.
        # it was created via __new__ by copy/pickle). Reading self._client
        # here would re-enter __getattr__ and recurse until RecursionError.
        if name == '_client':
            raise AttributeError(name)
        return getattr(self._client, name)

    def __del__(self):
        """Clean up the underlying client."""
        if '_client' not in self.__dict__:
            # __init__ never completed; there is nothing to close.
            return
        client = self._client
        try:
            if isinstance(client, kubernetes.client.ApiClient):
                real_client = client
            elif isinstance(client, kubernetes.watch.Watch):
                real_client = getattr(client, '_api_client', None)
            else:
                # Otherwise, the client is a typed client, the typed client
                # is generated by codegen and all of them should have an
                # 'api_client' attribute referring to the real client.
                real_client = getattr(client, 'api_client', None)
            if real_client is not None:
                real_client.close()
            elif logger is not None:
                # logger may already be cleaned up during __del__ at shutdown
                logger.debug(f'No client found for {client}')
        except Exception as e:  # pylint: disable=broad-except
            if logger is not None:
                logger.debug(f'Error closing Kubernetes client: {e}')
205
+
206
+
207
def wrap_kubernetes_client(func):
    """Decorator that returns func's result wrapped in a ClientWrapper.

    Args:
        func: Callable producing a kubernetes client object.

    Returns:
        A callable with func's metadata that forwards all arguments to func
        and wraps the returned object in ClientWrapper.
    """

    @functools.wraps(func)
    def _wrapped_factory(*args, **kwargs):
        return ClientWrapper(func(*args, **kwargs))

    return _wrapped_factory
216
+
217
+
112
218
@_api_logging_decorator('urllib3', logging.ERROR)
@annotations.lru_cache(scope='request')
@wrap_kubernetes_client
def core_api(context: Optional[str] = None):
    """Loads the config for `context` and returns a new CoreV1Api client."""
    _load_config(context)
    client = kubernetes.client.CoreV1Api()
    return client
@@ -118,6 +225,15 @@ def core_api(context: Optional[str] = None):
118
225
 
119
226
@_api_logging_decorator('urllib3', logging.ERROR)
@annotations.lru_cache(scope='request')
@wrap_kubernetes_client
def storage_api(context: Optional[str] = None):
    """Loads the config for `context` and returns a new StorageV1Api client."""
    _load_config(context)
    client = kubernetes.client.StorageV1Api()
    return client
232
+
233
+
234
@_api_logging_decorator('urllib3', logging.ERROR)
@annotations.lru_cache(scope='request')
@wrap_kubernetes_client
def auth_api(context: Optional[str] = None):
    """Loads the config for `context`; returns a RbacAuthorizationV1Api."""
    _load_config(context)
    client = kubernetes.client.RbacAuthorizationV1Api()
    return client
@@ -125,6 +241,7 @@ def auth_api(context: Optional[str] = None):
125
241
 
126
242
@_api_logging_decorator('urllib3', logging.ERROR)
@annotations.lru_cache(scope='request')
@wrap_kubernetes_client
def networking_api(context: Optional[str] = None):
    """Loads the config for `context`; returns a new NetworkingV1Api client."""
    _load_config(context)
    client = kubernetes.client.NetworkingV1Api()
    return client
@@ -132,6 +249,7 @@ def networking_api(context: Optional[str] = None):
132
249
 
133
250
@_api_logging_decorator('urllib3', logging.ERROR)
@annotations.lru_cache(scope='request')
@wrap_kubernetes_client
def custom_objects_api(context: Optional[str] = None):
    """Loads the config for `context`; returns a new CustomObjectsApi client."""
    _load_config(context)
    client = kubernetes.client.CustomObjectsApi()
    return client
@@ -139,6 +257,7 @@ def custom_objects_api(context: Optional[str] = None):
139
257
 
140
258
@_api_logging_decorator('urllib3', logging.ERROR)
@annotations.lru_cache(scope='global')
@wrap_kubernetes_client
def node_api(context: Optional[str] = None):
    """Loads the config for `context` and returns a new NodeV1Api client."""
    _load_config(context)
    client = kubernetes.client.NodeV1Api()
    return client
@@ -146,6 +265,7 @@ def node_api(context: Optional[str] = None):
146
265
 
147
266
@_api_logging_decorator('urllib3', logging.ERROR)
@annotations.lru_cache(scope='request')
@wrap_kubernetes_client
def apps_api(context: Optional[str] = None):
    """Loads the config for `context` and returns a new AppsV1Api client."""
    _load_config(context)
    client = kubernetes.client.AppsV1Api()
    return client
@@ -153,6 +273,7 @@ def apps_api(context: Optional[str] = None):
153
273
 
154
274
@_api_logging_decorator('urllib3', logging.ERROR)
@annotations.lru_cache(scope='request')
@wrap_kubernetes_client
def batch_api(context: Optional[str] = None):
    """Loads the config for `context` and returns a new BatchV1Api client."""
    _load_config(context)
    client = kubernetes.client.BatchV1Api()
    return client
@@ -160,6 +281,7 @@ def batch_api(context: Optional[str] = None):
160
281
 
161
282
@_api_logging_decorator('urllib3', logging.ERROR)
@annotations.lru_cache(scope='request')
@wrap_kubernetes_client
def api_client(context: Optional[str] = None):
    """Loads the config for `context` and returns a new ApiClient."""
    _load_config(context)
    client = kubernetes.client.ApiClient()
    return client
@@ -167,6 +289,15 @@ def api_client(context: Optional[str] = None):
167
289
 
168
290
@_api_logging_decorator('urllib3', logging.ERROR)
@annotations.lru_cache(scope='request')
@wrap_kubernetes_client
def custom_resources_api(context: Optional[str] = None):
    """Loads the config for `context`; returns a new CustomObjectsApi client."""
    _load_config(context)
    client = kubernetes.client.CustomObjectsApi()
    return client
296
+
297
+
298
@_api_logging_decorator('urllib3', logging.ERROR)
@annotations.lru_cache(scope='request')
@wrap_kubernetes_client
def watch(context: Optional[str] = None):
    """Loads the config for `context` and returns a new kubernetes Watch."""
    _load_config(context)
    watcher = kubernetes.watch.Watch()
    return watcher
sky/adaptors/nebius.py CHANGED
@@ -1,17 +1,106 @@
1
1
  """Nebius cloud adaptor."""
2
+ import asyncio
2
3
  import os
3
4
  import threading
5
+ from typing import Any, Awaitable, List, Optional
4
6
 
7
+ from sky import sky_logging
8
+ from sky import skypilot_config
5
9
  from sky.adaptors import common
6
10
  from sky.utils import annotations
7
11
  from sky.utils import ux_utils
8
12
 
9
- NEBIUS_TENANT_ID_FILENAME = 'NEBIUS_TENANT_ID.txt'
10
- NEBIUS_IAM_TOKEN_FILENAME = 'NEBIUS_IAM_TOKEN.txt'
11
- NEBIUS_CREDENTIALS_FILENAME = 'credentials.json'
12
- NEBIUS_TENANT_ID_PATH = '~/.nebius/' + NEBIUS_TENANT_ID_FILENAME
13
- NEBIUS_IAM_TOKEN_PATH = '~/.nebius/' + NEBIUS_IAM_TOKEN_FILENAME
14
- NEBIUS_CREDENTIALS_PATH = '~/.nebius/' + NEBIUS_CREDENTIALS_FILENAME
13
+ # Default read timeout for nebius SDK
14
+ READ_TIMEOUT = 10
15
+
16
+ logger = sky_logging.init_logger(__name__)
17
+
18
+ _loop_lock = threading.Lock()
19
+ _loop = None
20
+
21
+
22
def _get_event_loop() -> asyncio.AbstractEventLoop:
    """Returns the module-level event loop used for the nebius sdk.

    On first use a new event loop is created and started with run_forever()
    in a daemon thread; later calls return that same loop.
    """
    global _loop

    # Only take the lock while the loop has not been created yet.
    if _loop is None:
        with _loop_lock:
            # Check again under the lock in case another thread created the
            # loop while this one was waiting.
            if _loop is None:
                _loop = asyncio.new_event_loop()
                runner = threading.Thread(target=_loop.run_forever,
                                          daemon=True)
                runner.start()

    return _loop
36
+
37
+
38
def sync_call(awaitable: Awaitable[Any]) -> Any:
    """Runs an awaitable to completion and returns its result synchronously.

    This wrapper is used to workaround:
    https://github.com/nebius/pysdk/issues/76

    The awaitable is submitted to the dedicated background event loop to
    avoid conflicts with existing asyncio contexts and prevent
    BlockingIOError. The calling thread blocks until the result is ready.
    """
    background_loop = _get_event_loop()
    wrapped = _coro(awaitable)
    pending = asyncio.run_coroutine_threadsafe(wrapped, background_loop)
    return pending.result()
50
+
51
+
52
async def _coro(awaitable: Awaitable[Any]) -> Any:
    """Coroutine that awaits `awaitable` and returns its result."""
    result = await awaitable
    return result
55
+
56
+
57
def tenant_id_path() -> str:
    """Returns the (unexpanded) path of the Nebius tenant ID file."""
    path = '~/.nebius/NEBIUS_TENANT_ID.txt'
    return path
59
+
60
+
61
def iam_token_path() -> str:
    """Returns the (unexpanded) path of the Nebius IAM token file."""
    path = '~/.nebius/NEBIUS_IAM_TOKEN.txt'
    return path
63
+
64
+
65
def domain_path() -> str:
    """Returns the (unexpanded) path of the Nebius domain file."""
    path = '~/.nebius/NEBIUS_DOMAIN.txt'
    return path
67
+
68
+
69
def credentials_path() -> str:
    """Returns the credentials file path to use.

    The `credentials_file_path` set in the nebius workspace config wins;
    otherwise the default credentials path is returned.
    """
    configured = _get_workspace_credentials_path()
    if configured is None:
        return _get_default_credentials_path()
    return configured
75
+
76
+
77
def _get_workspace_credentials_path() -> Optional[str]:
    """Get credentials path if explicitly set in workspace config.

    Returns:
        The `credentials_file_path` entry of the nebius workspace config, or
        None when that key is absent.
    """
    nebius_ws_config = skypilot_config.get_workspace_cloud('nebius')
    return nebius_ws_config.get('credentials_file_path', None)
82
+
83
+
84
def _get_default_credentials_path() -> str:
    """Returns the (unexpanded) default Nebius credentials file path."""
    default_path = '~/.nebius/credentials.json'
    return default_path
87
+
88
+
89
def api_domain() -> Optional[str]:
    """Resolves the Nebius API domain.

    Sources are tried in this order, and the first non-None value wins:
      1. `domain` in the nebius workspace config.
      2. `domain` in the effective region config for nebius.
      3. The stripped contents of the file at domain_path().

    Returns:
        The domain, or None if no config value is set and the file does not
        exist.
    """
    domain = skypilot_config.get_workspace_cloud('nebius').get('domain', None)
    if domain is None:
        domain = skypilot_config.get_effective_region_config(
            cloud='nebius', region=None, keys=('domain',), default_value=None)
    if domain is not None:
        return domain

    domain_file = os.path.expanduser(domain_path())
    try:
        with open(domain_file, encoding='utf-8') as file:
            return file.read().strip()
    except FileNotFoundError:
        return None
103
+
15
104
 
16
105
  DEFAULT_REGION = 'eu-north1'
17
106
 
@@ -62,6 +151,12 @@ def iam():
62
151
  return iam_v1
63
152
 
64
153
 
154
def billing():
    """Imports the Nebius billing v1alpha1 API on demand and returns it."""
    # pylint: disable=import-outside-toplevel
    from nebius.api.nebius.billing import v1alpha1 as billing_api
    return billing_api
158
+
159
+
65
160
  def nebius_common():
66
161
  # pylint: disable=import-outside-toplevel
67
162
  from nebius.api.nebius.common import v1 as common_v1
@@ -74,39 +169,79 @@ def vpc():
74
169
  return vpc_v1
75
170
 
76
171
 
77
- @annotations.lru_cache(scope='request')
78
172
def get_iam_token():
    """Reads the IAM token from the file at iam_token_path().

    Returns:
        The file contents with surrounding whitespace stripped, or None if
        the file does not exist.
    """
    token_file = os.path.expanduser(iam_token_path())
    try:
        with open(token_file, encoding='utf-8') as file:
            token = file.read().strip()
    except FileNotFoundError:
        return None
    return token
85
179
 
86
180
 
87
- @annotations.lru_cache(scope='request')
88
181
def is_token_or_cred_file_exist():
    """Returns True if the IAM token file or the credentials file exists."""
    # The path getters are called lazily, so credentials_path() is only
    # evaluated when the IAM token file is missing.
    path_getters = (iam_token_path, credentials_path)
    return any(
        os.path.exists(os.path.expanduser(get_path()))
        for get_path in path_getters)
91
184
 
92
185
 
93
- @annotations.lru_cache(scope='request')
94
186
def get_tenant_id():
    """Resolves the Nebius tenant ID.

    Sources are tried in this order, and the first non-None value wins:
      1. `tenant_id` in the nebius workspace config.
      2. `tenant_id` in the effective region config for nebius.
      3. The stripped contents of the file at tenant_id_path().

    Returns:
        The tenant ID, or None if no config value is set and the file does
        not exist.
    """
    tenant_id = skypilot_config.get_workspace_cloud('nebius').get(
        'tenant_id', None)
    if tenant_id is None:
        tenant_id = skypilot_config.get_effective_region_config(
            cloud='nebius',
            region=None,
            keys=('tenant_id',),
            default_value=None)
    if tenant_id is not None:
        return tenant_id

    tenant_file = os.path.expanduser(tenant_id_path())
    try:
        with open(tenant_file, encoding='utf-8') as file:
            return file.read().strip()
    except FileNotFoundError:
        return None
101
201
 
102
202
 
103
- @annotations.lru_cache(scope='request')
104
203
def sdk():
    """Create the Nebius SDK with the correct credentials.

    The order of priority is:
    1. Credentials file specified in workspace config, if set
    2. IAM token file, if set
    3. Default credentials path
    """
    workspace_cred_path = _get_workspace_credentials_path()
    token = get_iam_token()

    if workspace_cred_path is None:
        # No workspace override: prefer the IAM token, then fall back to the
        # default credentials file.
        if token is not None:
            return _sdk(token, None)
        return _sdk(None, _get_default_credentials_path())

    # The workspace credentials file has the highest priority; warn when an
    # IAM token is also present, since the token is not used in this case.
    if token is not None:
        logger.warning(
            f'Both workspace credentials file ({workspace_cred_path}) and '
            f'IAM token file ({iam_token_path()}) are available. Using '
            'workspace credentials file.')
    return _sdk(None, workspace_cred_path)
231
+
232
+
233
@annotations.lru_cache(scope='request')
def _sdk(token: Optional[str], cred_path: Optional[str]):
    """Builds a nebius SDK object from a token or a credentials file.

    Args:
        token: IAM token passed to the SDK as `credentials`.
        cred_path: Credentials file path; it is user-expanded and passed to
            the SDK as `credentials_file_name`.

    Raises:
        ValueError: if neither argument is provided (reachable only when the
            assertion below is disabled).
    """
    # Exactly one of token or cred_path must be provided
    assert (token is None) != (cred_path is None), (token, cred_path)
    if token is not None:
        sdk_kwargs = {'credentials': token}
    elif cred_path is not None:
        sdk_kwargs = {'credentials_file_name': os.path.expanduser(cred_path)}
    else:
        raise ValueError(
            'Either token or credentials file path must be provided')
    return nebius.sdk.SDK(domain=api_domain(), **sdk_kwargs)
110
245
 
111
246
 
112
247
  def get_nebius_credentials(boto3_session):
@@ -184,3 +319,21 @@ def botocore_exceptions():
184
319
  # pylint: disable=import-outside-toplevel
185
320
  from botocore import exceptions
186
321
  return exceptions
322
+
323
+
324
def get_credential_file_paths() -> List[str]:
    """Get the list of credential file paths based on current configuration.

    Returns:
        De-duplicated paths in a stable order: tenant ID file, IAM token
        file, workspace credentials file (only if set in the workspace
        config), then the default credentials file.
    """
    # Always include tenant ID and IAM token paths
    candidates = [tenant_id_path(), iam_token_path()]

    # Add workspace-specific credentials path if set
    workspace_cred_path = _get_workspace_credentials_path()
    if workspace_cred_path is not None:
        candidates.append(workspace_cred_path)
    # Always add default path in case it's needed for fallback
    candidates.append(_get_default_credentials_path())

    # De-duplicate via dict keys rather than a set: a set of strings iterates
    # in an order that can change between processes, while dict keys keep
    # insertion order, so callers get the same list on every run.
    return list(dict.fromkeys(candidates))
@@ -0,0 +1 @@
1
+ """Prime Intellect cloud adaptor."""
sky/adaptors/runpod.py CHANGED
@@ -1,8 +1,76 @@
1
1
  """RunPod cloud adaptor."""
2
2
 
3
+ import os
4
+ import time
5
+ from typing import Any, Dict, Optional
6
+
3
7
  from sky.adaptors import common
4
8
 
5
9
  runpod = common.LazyImport(
6
10
  'runpod',
7
11
  import_error_message='Failed to import dependencies for RunPod. '
8
12
  'Try running: pip install "skypilot[runpod]"')
13
+
14
+ # Lazy imports
15
+ requests = common.LazyImport('requests')
16
+
17
+ _REST_BASE = 'https://rest.runpod.io/v1'
18
+ _MAX_RETRIES = 3
19
+ _TIMEOUT = 10
20
+
21
+
22
def _get_api_key() -> str:
    """Returns the RunPod API key as a string.

    The `api_key` attribute of the runpod module is used when it is truthy;
    otherwise the RUNPOD_API_KEY environment variable is used.

    Raises:
        RuntimeError: if neither source yields a non-empty key.
    """
    # Fallback to env if SDK global not set
    api_key = (getattr(runpod, 'api_key', None) or
               os.environ.get('RUNPOD_API_KEY'))
    if api_key:
        return str(api_key)
    raise RuntimeError('RunPod API key is not set. Please set runpod.api_key '
                       'or RUNPOD_API_KEY.')
32
+
33
+
34
def rest_request(method: str,
                 path: str,
                 json: Optional[Dict[str, Any]] = None) -> Any:
    """Sends a request to the RunPod REST API, retrying failed attempts.

    An attempt is retried after a one second sleep when the request raises,
    or when the response status is 5xx or 429. At most _MAX_RETRIES attempts
    are made.

    Args:
        method: HTTP method passed to requests.request.
        path: Path appended to _REST_BASE to form the URL.
        json: Optional JSON body for the request.

    Returns:
        The decoded JSON body; the raw response text if it cannot be decoded
        as JSON; or None if the response body is empty.

    Raises:
        RuntimeError: if the last attempt raises or returns 5xx/429, or if
            any attempt returns another status >= 400.
    """
    url = f'{_REST_BASE}{path}'
    headers = {
        'Authorization': f'Bearer {_get_api_key()}',
        'Content-Type': 'application/json',
    }
    attempts_left = _MAX_RETRIES
    while True:
        attempts_left -= 1
        out_of_retries = attempts_left <= 0
        try:
            resp = requests.request(method,
                                    url,
                                    headers=headers,
                                    json=json,
                                    timeout=_TIMEOUT)
        except Exception as e:  # pylint: disable=broad-except
            # Retry on transient network errors
            if out_of_retries:
                raise RuntimeError(f'RunPod REST network error: {e}') from e
            time.sleep(1)
            continue

        status = resp.status_code
        if status >= 400:
            # 5xx and 429 are retried; any other 4xx is a non-retryable
            # client error and fails immediately.
            retryable = status >= 500 or status == 429
            if out_of_retries or not retryable:
                raise RuntimeError(
                    f'RunPod REST error {resp.status_code}: {resp.text}')
            time.sleep(1)
            continue

        if not resp.text:
            return None
        try:
            return resp.json()
        except Exception:  # pylint: disable=broad-except
            return resp.text
+ return None