skypilot-nightly 1.0.0.dev20250502__py3-none-any.whl → 1.0.0.dev20251203__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (546)
  1. sky/__init__.py +22 -6
  2. sky/adaptors/aws.py +81 -16
  3. sky/adaptors/common.py +25 -2
  4. sky/adaptors/coreweave.py +278 -0
  5. sky/adaptors/do.py +8 -2
  6. sky/adaptors/gcp.py +11 -0
  7. sky/adaptors/hyperbolic.py +8 -0
  8. sky/adaptors/ibm.py +5 -2
  9. sky/adaptors/kubernetes.py +149 -18
  10. sky/adaptors/nebius.py +173 -30
  11. sky/adaptors/primeintellect.py +1 -0
  12. sky/adaptors/runpod.py +68 -0
  13. sky/adaptors/seeweb.py +183 -0
  14. sky/adaptors/shadeform.py +89 -0
  15. sky/admin_policy.py +187 -4
  16. sky/authentication.py +179 -225
  17. sky/backends/__init__.py +4 -2
  18. sky/backends/backend.py +22 -9
  19. sky/backends/backend_utils.py +1323 -397
  20. sky/backends/cloud_vm_ray_backend.py +1749 -1029
  21. sky/backends/docker_utils.py +1 -1
  22. sky/backends/local_docker_backend.py +11 -6
  23. sky/backends/task_codegen.py +633 -0
  24. sky/backends/wheel_utils.py +55 -9
  25. sky/{clouds/service_catalog → catalog}/__init__.py +21 -19
  26. sky/{clouds/service_catalog → catalog}/aws_catalog.py +27 -8
  27. sky/{clouds/service_catalog → catalog}/azure_catalog.py +10 -7
  28. sky/{clouds/service_catalog → catalog}/common.py +90 -49
  29. sky/{clouds/service_catalog → catalog}/cudo_catalog.py +8 -5
  30. sky/{clouds/service_catalog → catalog}/data_fetchers/analyze.py +1 -1
  31. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_aws.py +116 -80
  32. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_cudo.py +38 -38
  33. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_gcp.py +70 -16
  34. sky/catalog/data_fetchers/fetch_hyperbolic.py +136 -0
  35. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_lambda_cloud.py +1 -0
  36. sky/catalog/data_fetchers/fetch_nebius.py +338 -0
  37. sky/catalog/data_fetchers/fetch_runpod.py +698 -0
  38. sky/catalog/data_fetchers/fetch_seeweb.py +329 -0
  39. sky/catalog/data_fetchers/fetch_shadeform.py +142 -0
  40. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_vast.py +1 -1
  41. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_vsphere.py +1 -1
  42. sky/{clouds/service_catalog → catalog}/do_catalog.py +5 -2
  43. sky/{clouds/service_catalog → catalog}/fluidstack_catalog.py +6 -3
  44. sky/{clouds/service_catalog → catalog}/gcp_catalog.py +41 -15
  45. sky/catalog/hyperbolic_catalog.py +136 -0
  46. sky/{clouds/service_catalog → catalog}/ibm_catalog.py +9 -6
  47. sky/{clouds/service_catalog → catalog}/kubernetes_catalog.py +36 -24
  48. sky/{clouds/service_catalog → catalog}/lambda_catalog.py +9 -6
  49. sky/{clouds/service_catalog → catalog}/nebius_catalog.py +9 -7
  50. sky/{clouds/service_catalog → catalog}/oci_catalog.py +9 -6
  51. sky/{clouds/service_catalog → catalog}/paperspace_catalog.py +5 -2
  52. sky/catalog/primeintellect_catalog.py +95 -0
  53. sky/{clouds/service_catalog → catalog}/runpod_catalog.py +11 -4
  54. sky/{clouds/service_catalog → catalog}/scp_catalog.py +9 -6
  55. sky/catalog/seeweb_catalog.py +184 -0
  56. sky/catalog/shadeform_catalog.py +165 -0
  57. sky/catalog/ssh_catalog.py +167 -0
  58. sky/{clouds/service_catalog → catalog}/vast_catalog.py +6 -3
  59. sky/{clouds/service_catalog → catalog}/vsphere_catalog.py +5 -2
  60. sky/check.py +533 -185
  61. sky/cli.py +5 -5975
  62. sky/client/{cli.py → cli/command.py} +2591 -1956
  63. sky/client/cli/deprecation_utils.py +99 -0
  64. sky/client/cli/flags.py +359 -0
  65. sky/client/cli/table_utils.py +322 -0
  66. sky/client/cli/utils.py +79 -0
  67. sky/client/common.py +78 -32
  68. sky/client/oauth.py +82 -0
  69. sky/client/sdk.py +1219 -319
  70. sky/client/sdk_async.py +827 -0
  71. sky/client/service_account_auth.py +47 -0
  72. sky/cloud_stores.py +82 -3
  73. sky/clouds/__init__.py +13 -0
  74. sky/clouds/aws.py +564 -164
  75. sky/clouds/azure.py +105 -83
  76. sky/clouds/cloud.py +140 -40
  77. sky/clouds/cudo.py +68 -50
  78. sky/clouds/do.py +66 -48
  79. sky/clouds/fluidstack.py +63 -44
  80. sky/clouds/gcp.py +339 -110
  81. sky/clouds/hyperbolic.py +293 -0
  82. sky/clouds/ibm.py +70 -49
  83. sky/clouds/kubernetes.py +570 -162
  84. sky/clouds/lambda_cloud.py +74 -54
  85. sky/clouds/nebius.py +210 -81
  86. sky/clouds/oci.py +88 -66
  87. sky/clouds/paperspace.py +61 -44
  88. sky/clouds/primeintellect.py +317 -0
  89. sky/clouds/runpod.py +164 -74
  90. sky/clouds/scp.py +89 -86
  91. sky/clouds/seeweb.py +477 -0
  92. sky/clouds/shadeform.py +400 -0
  93. sky/clouds/ssh.py +263 -0
  94. sky/clouds/utils/aws_utils.py +10 -4
  95. sky/clouds/utils/gcp_utils.py +87 -11
  96. sky/clouds/utils/oci_utils.py +38 -14
  97. sky/clouds/utils/scp_utils.py +231 -167
  98. sky/clouds/vast.py +99 -77
  99. sky/clouds/vsphere.py +51 -40
  100. sky/core.py +375 -173
  101. sky/dag.py +15 -0
  102. sky/dashboard/out/404.html +1 -1
  103. sky/dashboard/out/_next/static/96_E2yl3QAiIJGOYCkSpB/_buildManifest.js +1 -0
  104. sky/dashboard/out/_next/static/chunks/1141-e6aa9ab418717c59.js +11 -0
  105. sky/dashboard/out/_next/static/chunks/1272-1ef0bf0237faccdb.js +1 -0
  106. sky/dashboard/out/_next/static/chunks/1871-7e202677c42f43fe.js +6 -0
  107. sky/dashboard/out/_next/static/chunks/2260-7703229c33c5ebd5.js +1 -0
  108. sky/dashboard/out/_next/static/chunks/2350.fab69e61bac57b23.js +1 -0
  109. sky/dashboard/out/_next/static/chunks/2369.fc20f0c2c8ed9fe7.js +15 -0
  110. sky/dashboard/out/_next/static/chunks/2755.edd818326d489a1d.js +26 -0
  111. sky/dashboard/out/_next/static/chunks/3294.20a8540fe697d5ee.js +1 -0
  112. sky/dashboard/out/_next/static/chunks/3785.7e245f318f9d1121.js +1 -0
  113. sky/dashboard/out/_next/static/chunks/3800-7b45f9fbb6308557.js +1 -0
  114. sky/dashboard/out/_next/static/chunks/3850-ff4a9a69d978632b.js +1 -0
  115. sky/dashboard/out/_next/static/chunks/3937.210053269f121201.js +1 -0
  116. sky/dashboard/out/_next/static/chunks/4725.172ede95d1b21022.js +1 -0
  117. sky/dashboard/out/_next/static/chunks/4937.a2baa2df5572a276.js +15 -0
  118. sky/dashboard/out/_next/static/chunks/5739-d67458fcb1386c92.js +8 -0
  119. sky/dashboard/out/_next/static/chunks/6130-2be46d70a38f1e82.js +1 -0
  120. sky/dashboard/out/_next/static/chunks/616-3d59f75e2ccf9321.js +39 -0
  121. sky/dashboard/out/_next/static/chunks/6212-7bd06f60ba693125.js +13 -0
  122. sky/dashboard/out/_next/static/chunks/6856-8f27d1c10c98def8.js +1 -0
  123. sky/dashboard/out/_next/static/chunks/6989-01359c57e018caa4.js +1 -0
  124. sky/dashboard/out/_next/static/chunks/6990-9146207c4567fdfd.js +1 -0
  125. sky/dashboard/out/_next/static/chunks/7359-c8d04e06886000b3.js +30 -0
  126. sky/dashboard/out/_next/static/chunks/7411-b15471acd2cba716.js +41 -0
  127. sky/dashboard/out/_next/static/chunks/7615-019513abc55b3b47.js +1 -0
  128. sky/dashboard/out/_next/static/chunks/8640.5b9475a2d18c5416.js +16 -0
  129. sky/dashboard/out/_next/static/chunks/8969-452f9d5cbdd2dc73.js +1 -0
  130. sky/dashboard/out/_next/static/chunks/9025.fa408f3242e9028d.js +6 -0
  131. sky/dashboard/out/_next/static/chunks/9353-cff34f7e773b2e2b.js +1 -0
  132. sky/dashboard/out/_next/static/chunks/9360.a536cf6b1fa42355.js +31 -0
  133. sky/dashboard/out/_next/static/chunks/9847.3aaca6bb33455140.js +30 -0
  134. sky/dashboard/out/_next/static/chunks/fd9d1056-86323a29a8f7e46a.js +1 -0
  135. sky/dashboard/out/_next/static/chunks/framework-cf60a09ccd051a10.js +33 -0
  136. sky/dashboard/out/_next/static/chunks/main-app-587214043926b3cc.js +1 -0
  137. sky/dashboard/out/_next/static/chunks/main-f15ccb73239a3bf1.js +1 -0
  138. sky/dashboard/out/_next/static/chunks/pages/_app-bde01e4a2beec258.js +34 -0
  139. sky/dashboard/out/_next/static/chunks/pages/_error-c66a4e8afc46f17b.js +1 -0
  140. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-792db96d918c98c9.js +16 -0
  141. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-abfcac9c137aa543.js +1 -0
  142. sky/dashboard/out/_next/static/chunks/pages/clusters-ee39056f9851a3ff.js +1 -0
  143. sky/dashboard/out/_next/static/chunks/pages/config-dfb9bf07b13045f4.js +1 -0
  144. sky/dashboard/out/_next/static/chunks/pages/index-444f1804401f04ea.js +1 -0
  145. sky/dashboard/out/_next/static/chunks/pages/infra/[context]-c0b5935149902e6f.js +1 -0
  146. sky/dashboard/out/_next/static/chunks/pages/infra-aed0ea19df7cf961.js +1 -0
  147. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-d66997e2bfc837cf.js +16 -0
  148. sky/dashboard/out/_next/static/chunks/pages/jobs/pools/[pool]-9faf940b253e3e06.js +21 -0
  149. sky/dashboard/out/_next/static/chunks/pages/jobs-2072b48b617989c9.js +1 -0
  150. sky/dashboard/out/_next/static/chunks/pages/users-f42674164aa73423.js +1 -0
  151. sky/dashboard/out/_next/static/chunks/pages/volumes-b84b948ff357c43e.js +1 -0
  152. sky/dashboard/out/_next/static/chunks/pages/workspace/new-3f88a1c7e86a3f86.js +1 -0
  153. sky/dashboard/out/_next/static/chunks/pages/workspaces/[name]-84a40f8c7c627fe4.js +1 -0
  154. sky/dashboard/out/_next/static/chunks/pages/workspaces-531b2f8c4bf89f82.js +1 -0
  155. sky/dashboard/out/_next/static/chunks/webpack-64e05f17bf2cf8ce.js +1 -0
  156. sky/dashboard/out/_next/static/css/0748ce22df867032.css +3 -0
  157. sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
  158. sky/dashboard/out/clusters/[cluster].html +1 -1
  159. sky/dashboard/out/clusters.html +1 -1
  160. sky/dashboard/out/config.html +1 -0
  161. sky/dashboard/out/index.html +1 -1
  162. sky/dashboard/out/infra/[context].html +1 -0
  163. sky/dashboard/out/infra.html +1 -0
  164. sky/dashboard/out/jobs/[job].html +1 -1
  165. sky/dashboard/out/jobs/pools/[pool].html +1 -0
  166. sky/dashboard/out/jobs.html +1 -1
  167. sky/dashboard/out/users.html +1 -0
  168. sky/dashboard/out/volumes.html +1 -0
  169. sky/dashboard/out/workspace/new.html +1 -0
  170. sky/dashboard/out/workspaces/[name].html +1 -0
  171. sky/dashboard/out/workspaces.html +1 -0
  172. sky/data/data_utils.py +137 -1
  173. sky/data/mounting_utils.py +269 -84
  174. sky/data/storage.py +1460 -1807
  175. sky/data/storage_utils.py +43 -57
  176. sky/exceptions.py +126 -2
  177. sky/execution.py +216 -63
  178. sky/global_user_state.py +2390 -586
  179. sky/jobs/__init__.py +7 -0
  180. sky/jobs/client/sdk.py +300 -58
  181. sky/jobs/client/sdk_async.py +161 -0
  182. sky/jobs/constants.py +15 -8
  183. sky/jobs/controller.py +848 -275
  184. sky/jobs/file_content_utils.py +128 -0
  185. sky/jobs/log_gc.py +193 -0
  186. sky/jobs/recovery_strategy.py +402 -152
  187. sky/jobs/scheduler.py +314 -189
  188. sky/jobs/server/core.py +836 -255
  189. sky/jobs/server/server.py +156 -115
  190. sky/jobs/server/utils.py +136 -0
  191. sky/jobs/state.py +2109 -706
  192. sky/jobs/utils.py +1306 -215
  193. sky/logs/__init__.py +21 -0
  194. sky/logs/agent.py +108 -0
  195. sky/logs/aws.py +243 -0
  196. sky/logs/gcp.py +91 -0
  197. sky/metrics/__init__.py +0 -0
  198. sky/metrics/utils.py +453 -0
  199. sky/models.py +78 -1
  200. sky/optimizer.py +164 -70
  201. sky/provision/__init__.py +90 -4
  202. sky/provision/aws/config.py +147 -26
  203. sky/provision/aws/instance.py +136 -50
  204. sky/provision/azure/instance.py +11 -6
  205. sky/provision/common.py +13 -1
  206. sky/provision/cudo/cudo_machine_type.py +1 -1
  207. sky/provision/cudo/cudo_utils.py +14 -8
  208. sky/provision/cudo/cudo_wrapper.py +72 -71
  209. sky/provision/cudo/instance.py +10 -6
  210. sky/provision/do/instance.py +10 -6
  211. sky/provision/do/utils.py +4 -3
  212. sky/provision/docker_utils.py +140 -33
  213. sky/provision/fluidstack/instance.py +13 -8
  214. sky/provision/gcp/__init__.py +1 -0
  215. sky/provision/gcp/config.py +301 -19
  216. sky/provision/gcp/constants.py +218 -0
  217. sky/provision/gcp/instance.py +36 -8
  218. sky/provision/gcp/instance_utils.py +18 -4
  219. sky/provision/gcp/volume_utils.py +247 -0
  220. sky/provision/hyperbolic/__init__.py +12 -0
  221. sky/provision/hyperbolic/config.py +10 -0
  222. sky/provision/hyperbolic/instance.py +437 -0
  223. sky/provision/hyperbolic/utils.py +373 -0
  224. sky/provision/instance_setup.py +101 -20
  225. sky/provision/kubernetes/__init__.py +5 -0
  226. sky/provision/kubernetes/config.py +9 -52
  227. sky/provision/kubernetes/constants.py +17 -0
  228. sky/provision/kubernetes/instance.py +919 -280
  229. sky/provision/kubernetes/manifests/fusermount-server-daemonset.yaml +1 -2
  230. sky/provision/kubernetes/network.py +27 -17
  231. sky/provision/kubernetes/network_utils.py +44 -43
  232. sky/provision/kubernetes/utils.py +1221 -534
  233. sky/provision/kubernetes/volume.py +343 -0
  234. sky/provision/lambda_cloud/instance.py +22 -16
  235. sky/provision/nebius/constants.py +50 -0
  236. sky/provision/nebius/instance.py +19 -6
  237. sky/provision/nebius/utils.py +237 -137
  238. sky/provision/oci/instance.py +10 -5
  239. sky/provision/paperspace/instance.py +10 -7
  240. sky/provision/paperspace/utils.py +1 -1
  241. sky/provision/primeintellect/__init__.py +10 -0
  242. sky/provision/primeintellect/config.py +11 -0
  243. sky/provision/primeintellect/instance.py +454 -0
  244. sky/provision/primeintellect/utils.py +398 -0
  245. sky/provision/provisioner.py +117 -36
  246. sky/provision/runpod/__init__.py +5 -0
  247. sky/provision/runpod/instance.py +27 -6
  248. sky/provision/runpod/utils.py +51 -18
  249. sky/provision/runpod/volume.py +214 -0
  250. sky/provision/scp/__init__.py +15 -0
  251. sky/provision/scp/config.py +93 -0
  252. sky/provision/scp/instance.py +707 -0
  253. sky/provision/seeweb/__init__.py +11 -0
  254. sky/provision/seeweb/config.py +13 -0
  255. sky/provision/seeweb/instance.py +812 -0
  256. sky/provision/shadeform/__init__.py +11 -0
  257. sky/provision/shadeform/config.py +12 -0
  258. sky/provision/shadeform/instance.py +351 -0
  259. sky/provision/shadeform/shadeform_utils.py +83 -0
  260. sky/provision/ssh/__init__.py +18 -0
  261. sky/provision/vast/instance.py +13 -8
  262. sky/provision/vast/utils.py +10 -7
  263. sky/provision/volume.py +164 -0
  264. sky/provision/vsphere/common/ssl_helper.py +1 -1
  265. sky/provision/vsphere/common/vapiconnect.py +2 -1
  266. sky/provision/vsphere/common/vim_utils.py +4 -4
  267. sky/provision/vsphere/instance.py +15 -10
  268. sky/provision/vsphere/vsphere_utils.py +17 -20
  269. sky/py.typed +0 -0
  270. sky/resources.py +845 -119
  271. sky/schemas/__init__.py +0 -0
  272. sky/schemas/api/__init__.py +0 -0
  273. sky/schemas/api/responses.py +227 -0
  274. sky/schemas/db/README +4 -0
  275. sky/schemas/db/env.py +90 -0
  276. sky/schemas/db/global_user_state/001_initial_schema.py +124 -0
  277. sky/schemas/db/global_user_state/002_add_workspace_to_cluster_history.py +35 -0
  278. sky/schemas/db/global_user_state/003_fix_initial_revision.py +61 -0
  279. sky/schemas/db/global_user_state/004_is_managed.py +34 -0
  280. sky/schemas/db/global_user_state/005_cluster_event.py +32 -0
  281. sky/schemas/db/global_user_state/006_provision_log.py +41 -0
  282. sky/schemas/db/global_user_state/007_cluster_event_request_id.py +34 -0
  283. sky/schemas/db/global_user_state/008_skylet_ssh_tunnel_metadata.py +34 -0
  284. sky/schemas/db/global_user_state/009_last_activity_and_launched_at.py +89 -0
  285. sky/schemas/db/global_user_state/010_save_ssh_key.py +66 -0
  286. sky/schemas/db/global_user_state/011_is_ephemeral.py +34 -0
  287. sky/schemas/db/kv_cache/001_initial_schema.py +29 -0
  288. sky/schemas/db/script.py.mako +28 -0
  289. sky/schemas/db/serve_state/001_initial_schema.py +67 -0
  290. sky/schemas/db/serve_state/002_yaml_content.py +34 -0
  291. sky/schemas/db/skypilot_config/001_initial_schema.py +30 -0
  292. sky/schemas/db/spot_jobs/001_initial_schema.py +97 -0
  293. sky/schemas/db/spot_jobs/002_cluster_pool.py +42 -0
  294. sky/schemas/db/spot_jobs/003_pool_hash.py +34 -0
  295. sky/schemas/db/spot_jobs/004_job_file_contents.py +42 -0
  296. sky/schemas/db/spot_jobs/005_logs_gc.py +38 -0
  297. sky/schemas/db/spot_jobs/006_controller_pid_started_at.py +34 -0
  298. sky/schemas/db/spot_jobs/007_config_file_content.py +34 -0
  299. sky/schemas/generated/__init__.py +0 -0
  300. sky/schemas/generated/autostopv1_pb2.py +36 -0
  301. sky/schemas/generated/autostopv1_pb2.pyi +43 -0
  302. sky/schemas/generated/autostopv1_pb2_grpc.py +146 -0
  303. sky/schemas/generated/jobsv1_pb2.py +86 -0
  304. sky/schemas/generated/jobsv1_pb2.pyi +254 -0
  305. sky/schemas/generated/jobsv1_pb2_grpc.py +542 -0
  306. sky/schemas/generated/managed_jobsv1_pb2.py +76 -0
  307. sky/schemas/generated/managed_jobsv1_pb2.pyi +278 -0
  308. sky/schemas/generated/managed_jobsv1_pb2_grpc.py +278 -0
  309. sky/schemas/generated/servev1_pb2.py +58 -0
  310. sky/schemas/generated/servev1_pb2.pyi +115 -0
  311. sky/schemas/generated/servev1_pb2_grpc.py +322 -0
  312. sky/serve/autoscalers.py +357 -5
  313. sky/serve/client/impl.py +310 -0
  314. sky/serve/client/sdk.py +47 -139
  315. sky/serve/client/sdk_async.py +130 -0
  316. sky/serve/constants.py +12 -9
  317. sky/serve/controller.py +68 -17
  318. sky/serve/load_balancer.py +106 -60
  319. sky/serve/load_balancing_policies.py +116 -2
  320. sky/serve/replica_managers.py +434 -249
  321. sky/serve/serve_rpc_utils.py +179 -0
  322. sky/serve/serve_state.py +569 -257
  323. sky/serve/serve_utils.py +775 -265
  324. sky/serve/server/core.py +66 -711
  325. sky/serve/server/impl.py +1093 -0
  326. sky/serve/server/server.py +21 -18
  327. sky/serve/service.py +192 -89
  328. sky/serve/service_spec.py +144 -20
  329. sky/serve/spot_placer.py +3 -0
  330. sky/server/auth/__init__.py +0 -0
  331. sky/server/auth/authn.py +50 -0
  332. sky/server/auth/loopback.py +38 -0
  333. sky/server/auth/oauth2_proxy.py +202 -0
  334. sky/server/common.py +478 -182
  335. sky/server/config.py +85 -23
  336. sky/server/constants.py +44 -6
  337. sky/server/daemons.py +295 -0
  338. sky/server/html/token_page.html +185 -0
  339. sky/server/metrics.py +160 -0
  340. sky/server/middleware_utils.py +166 -0
  341. sky/server/requests/executor.py +558 -138
  342. sky/server/requests/payloads.py +364 -24
  343. sky/server/requests/preconditions.py +21 -17
  344. sky/server/requests/process.py +112 -29
  345. sky/server/requests/request_names.py +121 -0
  346. sky/server/requests/requests.py +822 -226
  347. sky/server/requests/serializers/decoders.py +82 -31
  348. sky/server/requests/serializers/encoders.py +140 -22
  349. sky/server/requests/threads.py +117 -0
  350. sky/server/rest.py +455 -0
  351. sky/server/server.py +1309 -285
  352. sky/server/state.py +20 -0
  353. sky/server/stream_utils.py +327 -61
  354. sky/server/uvicorn.py +217 -3
  355. sky/server/versions.py +270 -0
  356. sky/setup_files/MANIFEST.in +11 -1
  357. sky/setup_files/alembic.ini +160 -0
  358. sky/setup_files/dependencies.py +139 -31
  359. sky/setup_files/setup.py +44 -42
  360. sky/sky_logging.py +114 -7
  361. sky/skylet/attempt_skylet.py +106 -24
  362. sky/skylet/autostop_lib.py +129 -8
  363. sky/skylet/configs.py +29 -20
  364. sky/skylet/constants.py +216 -25
  365. sky/skylet/events.py +101 -21
  366. sky/skylet/job_lib.py +345 -164
  367. sky/skylet/log_lib.py +297 -18
  368. sky/skylet/log_lib.pyi +44 -1
  369. sky/skylet/providers/ibm/node_provider.py +12 -8
  370. sky/skylet/providers/ibm/vpc_provider.py +13 -12
  371. sky/skylet/ray_patches/__init__.py +17 -3
  372. sky/skylet/ray_patches/autoscaler.py.diff +18 -0
  373. sky/skylet/ray_patches/cli.py.diff +19 -0
  374. sky/skylet/ray_patches/command_runner.py.diff +17 -0
  375. sky/skylet/ray_patches/log_monitor.py.diff +20 -0
  376. sky/skylet/ray_patches/resource_demand_scheduler.py.diff +32 -0
  377. sky/skylet/ray_patches/updater.py.diff +18 -0
  378. sky/skylet/ray_patches/worker.py.diff +41 -0
  379. sky/skylet/runtime_utils.py +21 -0
  380. sky/skylet/services.py +568 -0
  381. sky/skylet/skylet.py +72 -4
  382. sky/skylet/subprocess_daemon.py +104 -29
  383. sky/skypilot_config.py +506 -99
  384. sky/ssh_node_pools/__init__.py +1 -0
  385. sky/ssh_node_pools/core.py +135 -0
  386. sky/ssh_node_pools/server.py +233 -0
  387. sky/task.py +685 -163
  388. sky/templates/aws-ray.yml.j2 +11 -3
  389. sky/templates/azure-ray.yml.j2 +2 -1
  390. sky/templates/cudo-ray.yml.j2 +1 -0
  391. sky/templates/do-ray.yml.j2 +2 -1
  392. sky/templates/fluidstack-ray.yml.j2 +1 -0
  393. sky/templates/gcp-ray.yml.j2 +62 -1
  394. sky/templates/hyperbolic-ray.yml.j2 +68 -0
  395. sky/templates/ibm-ray.yml.j2 +2 -1
  396. sky/templates/jobs-controller.yaml.j2 +27 -24
  397. sky/templates/kubernetes-loadbalancer.yml.j2 +2 -0
  398. sky/templates/kubernetes-ray.yml.j2 +611 -50
  399. sky/templates/lambda-ray.yml.j2 +2 -1
  400. sky/templates/nebius-ray.yml.j2 +34 -12
  401. sky/templates/oci-ray.yml.j2 +1 -0
  402. sky/templates/paperspace-ray.yml.j2 +2 -1
  403. sky/templates/primeintellect-ray.yml.j2 +72 -0
  404. sky/templates/runpod-ray.yml.j2 +10 -1
  405. sky/templates/scp-ray.yml.j2 +4 -50
  406. sky/templates/seeweb-ray.yml.j2 +171 -0
  407. sky/templates/shadeform-ray.yml.j2 +73 -0
  408. sky/templates/sky-serve-controller.yaml.j2 +22 -2
  409. sky/templates/vast-ray.yml.j2 +1 -0
  410. sky/templates/vsphere-ray.yml.j2 +1 -0
  411. sky/templates/websocket_proxy.py +212 -37
  412. sky/usage/usage_lib.py +31 -15
  413. sky/users/__init__.py +0 -0
  414. sky/users/model.conf +15 -0
  415. sky/users/permission.py +397 -0
  416. sky/users/rbac.py +121 -0
  417. sky/users/server.py +720 -0
  418. sky/users/token_service.py +218 -0
  419. sky/utils/accelerator_registry.py +35 -5
  420. sky/utils/admin_policy_utils.py +84 -38
  421. sky/utils/annotations.py +38 -5
  422. sky/utils/asyncio_utils.py +78 -0
  423. sky/utils/atomic.py +1 -1
  424. sky/utils/auth_utils.py +153 -0
  425. sky/utils/benchmark_utils.py +60 -0
  426. sky/utils/cli_utils/status_utils.py +159 -86
  427. sky/utils/cluster_utils.py +31 -9
  428. sky/utils/command_runner.py +354 -68
  429. sky/utils/command_runner.pyi +93 -3
  430. sky/utils/common.py +35 -8
  431. sky/utils/common_utils.py +314 -91
  432. sky/utils/config_utils.py +74 -5
  433. sky/utils/context.py +403 -0
  434. sky/utils/context_utils.py +242 -0
  435. sky/utils/controller_utils.py +383 -89
  436. sky/utils/dag_utils.py +31 -12
  437. sky/utils/db/__init__.py +0 -0
  438. sky/utils/db/db_utils.py +485 -0
  439. sky/utils/db/kv_cache.py +149 -0
  440. sky/utils/db/migration_utils.py +137 -0
  441. sky/utils/directory_utils.py +12 -0
  442. sky/utils/env_options.py +13 -0
  443. sky/utils/git.py +567 -0
  444. sky/utils/git_clone.sh +460 -0
  445. sky/utils/infra_utils.py +195 -0
  446. sky/utils/kubernetes/cleanup-tunnel.sh +62 -0
  447. sky/utils/kubernetes/config_map_utils.py +133 -0
  448. sky/utils/kubernetes/create_cluster.sh +15 -29
  449. sky/utils/kubernetes/delete_cluster.sh +10 -7
  450. sky/utils/kubernetes/deploy_ssh_node_pools.py +1177 -0
  451. sky/utils/kubernetes/exec_kubeconfig_converter.py +22 -31
  452. sky/utils/kubernetes/generate_kind_config.py +6 -66
  453. sky/utils/kubernetes/generate_kubeconfig.sh +4 -1
  454. sky/utils/kubernetes/gpu_labeler.py +18 -8
  455. sky/utils/kubernetes/k8s_gpu_labeler_job.yaml +2 -1
  456. sky/utils/kubernetes/k8s_gpu_labeler_setup.yaml +16 -16
  457. sky/utils/kubernetes/kubernetes_deploy_utils.py +284 -114
  458. sky/utils/kubernetes/rsync_helper.sh +11 -3
  459. sky/utils/kubernetes/ssh-tunnel.sh +379 -0
  460. sky/utils/kubernetes/ssh_utils.py +221 -0
  461. sky/utils/kubernetes_enums.py +8 -15
  462. sky/utils/lock_events.py +94 -0
  463. sky/utils/locks.py +416 -0
  464. sky/utils/log_utils.py +82 -107
  465. sky/utils/perf_utils.py +22 -0
  466. sky/utils/resource_checker.py +298 -0
  467. sky/utils/resources_utils.py +249 -32
  468. sky/utils/rich_utils.py +217 -39
  469. sky/utils/schemas.py +955 -160
  470. sky/utils/serialize_utils.py +16 -0
  471. sky/utils/status_lib.py +10 -0
  472. sky/utils/subprocess_utils.py +29 -15
  473. sky/utils/tempstore.py +70 -0
  474. sky/utils/thread_utils.py +91 -0
  475. sky/utils/timeline.py +26 -53
  476. sky/utils/ux_utils.py +84 -15
  477. sky/utils/validator.py +11 -1
  478. sky/utils/volume.py +165 -0
  479. sky/utils/yaml_utils.py +111 -0
  480. sky/volumes/__init__.py +13 -0
  481. sky/volumes/client/__init__.py +0 -0
  482. sky/volumes/client/sdk.py +150 -0
  483. sky/volumes/server/__init__.py +0 -0
  484. sky/volumes/server/core.py +270 -0
  485. sky/volumes/server/server.py +124 -0
  486. sky/volumes/volume.py +215 -0
  487. sky/workspaces/__init__.py +0 -0
  488. sky/workspaces/core.py +655 -0
  489. sky/workspaces/server.py +101 -0
  490. sky/workspaces/utils.py +56 -0
  491. sky_templates/README.md +3 -0
  492. sky_templates/__init__.py +3 -0
  493. sky_templates/ray/__init__.py +0 -0
  494. sky_templates/ray/start_cluster +183 -0
  495. sky_templates/ray/stop_cluster +75 -0
  496. skypilot_nightly-1.0.0.dev20251203.dist-info/METADATA +676 -0
  497. skypilot_nightly-1.0.0.dev20251203.dist-info/RECORD +611 -0
  498. {skypilot_nightly-1.0.0.dev20250502.dist-info → skypilot_nightly-1.0.0.dev20251203.dist-info}/WHEEL +1 -1
  499. skypilot_nightly-1.0.0.dev20251203.dist-info/top_level.txt +2 -0
  500. sky/benchmark/benchmark_state.py +0 -256
  501. sky/benchmark/benchmark_utils.py +0 -641
  502. sky/clouds/service_catalog/constants.py +0 -7
  503. sky/dashboard/out/_next/static/GWvVBSCS7FmUiVmjaL1a7/_buildManifest.js +0 -1
  504. sky/dashboard/out/_next/static/chunks/236-2db3ee3fba33dd9e.js +0 -6
  505. sky/dashboard/out/_next/static/chunks/312-c3c8845990db8ffc.js +0 -15
  506. sky/dashboard/out/_next/static/chunks/37-0a572fe0dbb89c4d.js +0 -6
  507. sky/dashboard/out/_next/static/chunks/678-206dddca808e6d16.js +0 -59
  508. sky/dashboard/out/_next/static/chunks/845-9e60713e0c441abc.js +0 -1
  509. sky/dashboard/out/_next/static/chunks/979-7bf73a4c7cea0f5c.js +0 -1
  510. sky/dashboard/out/_next/static/chunks/fd9d1056-2821b0f0cabcd8bd.js +0 -1
  511. sky/dashboard/out/_next/static/chunks/framework-87d061ee6ed71b28.js +0 -33
  512. sky/dashboard/out/_next/static/chunks/main-app-241eb28595532291.js +0 -1
  513. sky/dashboard/out/_next/static/chunks/main-e0e2335212e72357.js +0 -1
  514. sky/dashboard/out/_next/static/chunks/pages/_app-e6b013bc3f77ad60.js +0 -1
  515. sky/dashboard/out/_next/static/chunks/pages/_error-1be831200e60c5c0.js +0 -1
  516. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-6ac338bc2239cb45.js +0 -1
  517. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-f383db7389368ea7.js +0 -1
  518. sky/dashboard/out/_next/static/chunks/pages/clusters-a93b93e10b8b074e.js +0 -1
  519. sky/dashboard/out/_next/static/chunks/pages/index-f9f039532ca8cbc4.js +0 -1
  520. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-1c519e1afc523dc9.js +0 -1
  521. sky/dashboard/out/_next/static/chunks/pages/jobs-a75029b67aab6a2e.js +0 -1
  522. sky/dashboard/out/_next/static/chunks/webpack-830f59b8404e96b8.js +0 -1
  523. sky/dashboard/out/_next/static/css/c6933bbb2ce7f4dd.css +0 -3
  524. sky/jobs/dashboard/dashboard.py +0 -223
  525. sky/jobs/dashboard/static/favicon.ico +0 -0
  526. sky/jobs/dashboard/templates/index.html +0 -831
  527. sky/jobs/server/dashboard_utils.py +0 -69
  528. sky/skylet/providers/scp/__init__.py +0 -2
  529. sky/skylet/providers/scp/config.py +0 -149
  530. sky/skylet/providers/scp/node_provider.py +0 -578
  531. sky/templates/kubernetes-ssh-jump.yml.j2 +0 -94
  532. sky/utils/db_utils.py +0 -100
  533. sky/utils/kubernetes/deploy_remote_cluster.sh +0 -308
  534. sky/utils/kubernetes/ssh_jump_lifecycle_manager.py +0 -191
  535. skypilot_nightly-1.0.0.dev20250502.dist-info/METADATA +0 -361
  536. skypilot_nightly-1.0.0.dev20250502.dist-info/RECORD +0 -396
  537. skypilot_nightly-1.0.0.dev20250502.dist-info/top_level.txt +0 -1
  538. /sky/{clouds/service_catalog → catalog}/config.py +0 -0
  539. /sky/{benchmark → catalog/data_fetchers}/__init__.py +0 -0
  540. /sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_azure.py +0 -0
  541. /sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_fluidstack.py +0 -0
  542. /sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_ibm.py +0 -0
  543. /sky/{clouds/service_catalog/data_fetchers → client/cli}/__init__.py +0 -0
  544. /sky/dashboard/out/_next/static/{GWvVBSCS7FmUiVmjaL1a7 → 96_E2yl3QAiIJGOYCkSpB}/_ssgManifest.js +0 -0
  545. {skypilot_nightly-1.0.0.dev20250502.dist-info → skypilot_nightly-1.0.0.dev20251203.dist-info}/entry_points.txt +0 -0
  546. {skypilot_nightly-1.0.0.dev20250502.dist-info → skypilot_nightly-1.0.0.dev20251203.dist-info}/licenses/LICENSE +0 -0
sky/utils/common_utils.py CHANGED
@@ -1,16 +1,19 @@
1
1
  """Utils shared between all of sky"""
2
2
 
3
+ import ctypes
3
4
  import difflib
5
+ import enum
4
6
  import functools
7
+ import gc
5
8
  import getpass
6
9
  import hashlib
7
10
  import inspect
8
- import io
9
11
  import os
10
12
  import platform
11
13
  import random
12
14
  import re
13
15
  import socket
16
+ import subprocess
14
17
  import sys
15
18
  import time
16
19
  import typing
@@ -20,6 +23,7 @@ import uuid
20
23
  import jsonschema
21
24
 
22
25
  from sky import exceptions
26
+ from sky import models
23
27
  from sky import sky_logging
24
28
  from sky.adaptors import common as adaptors_common
25
29
  from sky.skylet import constants
@@ -31,13 +35,11 @@ from sky.utils import validator
31
35
  if typing.TYPE_CHECKING:
32
36
  import jinja2
33
37
  import psutil
34
- import yaml
35
38
  else:
36
39
  jinja2 = adaptors_common.LazyImport('jinja2')
37
40
  psutil = adaptors_common.LazyImport('psutil')
38
- yaml = adaptors_common.LazyImport('yaml')
39
41
 
40
- _USER_HASH_FILE = os.path.expanduser('~/.sky/user_hash')
42
+ USER_HASH_FILE = os.path.expanduser('~/.sky/user_hash')
41
43
  USER_HASH_LENGTH = 8
42
44
 
43
45
  # We are using base36 to reduce the length of the hash. 2 chars -> 36^2 = 1296
@@ -52,6 +54,25 @@ _VALID_ENV_VAR_REGEX = '[a-zA-Z_][a-zA-Z0-9_]*'
52
54
  logger = sky_logging.init_logger(__name__)
53
55
 
54
56
 
57
+ class ProcessStatus(enum.Enum):
58
+ """Process status."""
59
+
60
+ # The process is scheduled to run, but not started yet.
61
+ SCHEDULED = 'SCHEDULED'
62
+
63
+ # The process is running
64
+ RUNNING = 'RUNNING'
65
+
66
+ # The process is finished and succeeded
67
+ SUCCEEDED = 'SUCCEEDED'
68
+
69
+ # The process is interrupted
70
+ INTERRUPTED = 'INTERRUPTED'
71
+
72
+ # The process failed
73
+ FAILED = 'FAILED'
74
+
75
+
55
76
  @annotations.lru_cache(scope='request')
56
77
  def get_usage_run_id() -> str:
57
78
  """Returns a unique run id for each 'run'.
@@ -66,26 +87,37 @@ def get_usage_run_id() -> str:
66
87
  return str(uuid.uuid4())
67
88
 
68
89
 
69
- def _is_valid_user_hash(user_hash: Optional[str]) -> bool:
90
+ def is_valid_user_hash(user_hash: Optional[str]) -> bool:
70
91
  if user_hash is None:
71
92
  return False
72
- try:
73
- int(user_hash, 16)
74
- except (TypeError, ValueError):
75
- return False
76
- return len(user_hash) == USER_HASH_LENGTH
93
+ # Must start with an alphanumeric character, followed by alphanumeric characters and hyphens
94
+ # This covers both old hex format (e.g., "abc123") and new service account
95
+ # format (e.g., "sa-abc123-token-xyz")
96
+ return bool(re.match(r'^[a-zA-Z0-9][a-zA-Z0-9-]*$', user_hash))
77
97
 
78
98
 
79
99
  def generate_user_hash() -> str:
80
100
  """Generates a unique user-machine specific hash."""
81
101
  hash_str = user_and_hostname_hash()
82
102
  user_hash = hashlib.md5(hash_str.encode()).hexdigest()[:USER_HASH_LENGTH]
83
- if not _is_valid_user_hash(user_hash):
103
+ if not is_valid_user_hash(user_hash):
84
104
  # A fallback in case the hash is invalid.
85
105
  user_hash = uuid.uuid4().hex[:USER_HASH_LENGTH]
86
106
  return user_hash
87
107
 
88
108
 
109
+ def get_git_commit(path: Optional[str] = None) -> Optional[str]:
110
+ try:
111
+ result = subprocess.run(['git', 'rev-parse', 'HEAD'],
112
+ capture_output=True,
113
+ text=True,
114
+ cwd=path,
115
+ check=True)
116
+ return result.stdout.strip()
117
+ except subprocess.CalledProcessError:
118
+ return None
119
+
120
+
89
121
  def get_user_hash() -> str:
90
122
  """Returns a unique user-machine specific hash as a user id.
91
123
 
@@ -93,25 +125,30 @@ def get_user_hash() -> str:
93
125
  hostname changes causing a new user hash to be generated.
94
126
  """
95
127
  user_hash = os.getenv(constants.USER_ID_ENV_VAR)
96
- if _is_valid_user_hash(user_hash):
128
+ if is_valid_user_hash(user_hash):
97
129
  assert user_hash is not None
98
130
  return user_hash
99
131
 
100
- if os.path.exists(_USER_HASH_FILE):
132
+ if os.path.exists(USER_HASH_FILE):
101
133
  # Read from cached user hash file.
102
- with open(_USER_HASH_FILE, 'r', encoding='utf-8') as f:
134
+ with open(USER_HASH_FILE, 'r', encoding='utf-8') as f:
103
135
  # Remove invalid characters.
104
136
  user_hash = f.read().strip()
105
- if _is_valid_user_hash(user_hash):
137
+ if is_valid_user_hash(user_hash):
106
138
  return user_hash
107
139
 
108
140
  user_hash = generate_user_hash()
109
- os.makedirs(os.path.dirname(_USER_HASH_FILE), exist_ok=True)
110
- with open(_USER_HASH_FILE, 'w', encoding='utf-8') as f:
111
- f.write(user_hash)
141
+ set_user_hash_locally(user_hash)
112
142
  return user_hash
113
143
 
114
144
 
145
def set_user_hash_locally(user_hash: str) -> None:
    """Writes the user hash to the local user hash file.

    The parent directory of USER_HASH_FILE is created if it does not exist,
    and any existing file content is overwritten.

    Args:
        user_hash: The hash string to persist.
    """
    hash_dir = os.path.dirname(USER_HASH_FILE)
    os.makedirs(hash_dir, exist_ok=True)
    with open(USER_HASH_FILE, 'w', encoding='utf-8') as hash_file:
        hash_file.write(user_hash)
150
+
151
+
115
152
  def base36_encode(hex_str: str) -> str:
116
153
  """Converts a hex string to a base36 string."""
117
154
  int_value = int(hex_str, 16)
@@ -228,13 +265,16 @@ def get_global_job_id(job_timestamp: str,
228
265
 
229
266
  class Backoff:
230
267
  """Exponential backoff with jittering."""
231
- MULTIPLIER = 1.6
232
268
  JITTER = 0.4
233
269
 
234
- def __init__(self, initial_backoff: float = 5, max_backoff_factor: int = 5):
270
+ def __init__(self,
271
+ initial_backoff: float = 5,
272
+ max_backoff_factor: int = 5,
273
+ multiplier: float = 1.6):
235
274
  self._initial = True
236
275
  self._backoff = 0.0
237
276
  self._initial_backoff = initial_backoff
277
+ self._multiplier = multiplier
238
278
  self._max_backoff = max_backoff_factor * self._initial_backoff
239
279
 
240
280
  # https://github.com/grpc/grpc/blob/2d4f3c56001cd1e1f85734b2f7c5ce5f2797c38a/doc/connection-backoff.md
@@ -246,7 +286,7 @@ class Backoff:
246
286
  self._initial = False
247
287
  self._backoff = min(self._initial_backoff, self._max_backoff)
248
288
  else:
249
- self._backoff = min(self._backoff * self.MULTIPLIER,
289
+ self._backoff = min(self._backoff * self._multiplier,
250
290
  self._max_backoff)
251
291
  self._backoff += random.uniform(-self.JITTER * self._backoff,
252
292
  self.JITTER * self._backoff)
@@ -256,11 +296,14 @@ class Backoff:
256
296
  _current_command: Optional[str] = None
257
297
  _current_client_entrypoint: Optional[str] = None
258
298
  _using_remote_api_server: Optional[bool] = None
299
+ _current_user: Optional['models.User'] = None
300
+ _current_request_id: Optional[str] = None
259
301
 
260
302
 
261
- def set_client_status(client_entrypoint: Optional[str],
262
- client_command: Optional[str],
263
- using_remote_api_server: bool):
303
+ def set_request_context(client_entrypoint: Optional[str],
304
+ client_command: Optional[str],
305
+ using_remote_api_server: bool,
306
+ user: Optional['models.User'], request_id: str) -> None:
264
307
  """Override the current client entrypoint and command.
265
308
 
266
309
  This is useful when we are on the SkyPilot API server side and we have a
@@ -269,9 +312,20 @@ def set_client_status(client_entrypoint: Optional[str],
269
312
  global _current_command
270
313
  global _current_client_entrypoint
271
314
  global _using_remote_api_server
315
+ global _current_user
316
+ global _current_request_id
272
317
  _current_command = client_command
273
318
  _current_client_entrypoint = client_entrypoint
274
319
  _using_remote_api_server = using_remote_api_server
320
+ _current_user = user
321
+ _current_request_id = request_id
322
+
323
+
324
def get_current_request_id() -> str:
    """Returns the current request id.

    Falls back to the placeholder 'dummy-request-id' when no request id has
    been set for this process.
    """
    if _current_request_id is None:
        return 'dummy-request-id'
    return _current_request_id
275
329
 
276
330
 
277
331
  def get_current_command() -> str:
@@ -286,6 +340,26 @@ def get_current_command() -> str:
286
340
  return get_pretty_entrypoint_cmd()
287
341
 
288
342
 
343
def get_current_user() -> 'models.User':
    """Returns the current user.

    The user stored in the module-level request context takes precedence;
    when none has been set, `models.User.get_current_user()` is used.
    """
    if _current_user is None:
        return models.User.get_current_user()
    return _current_user
348
+
349
+
350
def get_current_user_name() -> str:
    """Returns the name of the current user.

    The user is obtained from `get_current_user()`; its name is asserted to
    be set before it is returned.
    """
    current_user = get_current_user()
    user_name = current_user.name
    assert user_name is not None
    return user_name
355
+
356
+
357
def set_current_user(user: 'models.User'):
    """Sets the current user.

    Args:
        user: The user to store in the module-level request context; it is
            what `get_current_user()` returns afterwards.
    """
    # Rebind the module-level variable rather than creating a local.
    global _current_user
    _current_user = user
361
+
362
+
289
363
  def get_current_client_entrypoint(server_entrypoint: str) -> str:
290
364
  """Returns the current client entrypoint.
291
365
 
@@ -324,9 +398,154 @@ def get_pretty_entrypoint_cmd() -> str:
324
398
  # Turn '/.../anaconda/envs/py36/bin/sky' into 'sky', but keep other
325
399
  # things like 'examples/app.py'.
326
400
  argv[0] = basename
401
+
402
+ # Redact sensitive values from secrets arguments
403
+ argv = _redact_secrets_values(argv)
404
+
327
405
  return ' '.join(argv)
328
406
 
329
407
 
408
def read_last_n_lines(file_path: str,
                      n: int,
                      chunk_size: int = 8192,
                      encoding: str = 'utf-8',
                      errors: str = 'replace') -> List[str]:
    """Read the last N lines of a file.

    The file is read backwards in binary chunks, so only the tail of the file
    is loaded.

    Args:
        file_path: Path to the file to read.
        n: Number of lines to read from the end of the file.
        chunk_size: Size of chunks in bytes.
        encoding: Encoding to use when decoding binary chunks.
        errors: Error handling for decode errors (e.g., 'replace', 'ignore').

    Returns:
        A list of at most N lines. A trailing '\r' is stripped from each
        line; every element except the last ends with '\n', and the last
        element is returned without a newline. Empty list if N is 0 or the
        file is empty.

    Raises:
        RuntimeError: If the file cannot be opened or read (OSError).
    """

    assert n >= 0, f'n must be non-negative. Got {n}'
    assert chunk_size > 0, f'chunk_size must be positive. Got {chunk_size}'
    assert os.path.exists(file_path), f'File not found: {file_path}'

    if n == 0:
        return []

    try:
        with open(file_path, 'rb') as tail_file:
            # Position at the end; the offset doubles as the file size.
            tail_file.seek(0, os.SEEK_END)
            offset = tail_file.tell()
            if offset == 0:
                return []

            newline_count = 0
            tail_chunks = []

            # Walk backwards until more than n newlines have been seen (or
            # the start of the file is reached), so that the n-th line from
            # the end is guaranteed to be complete.
            while offset > 0 and newline_count <= n:
                step = min(chunk_size, offset)
                offset -= step
                tail_file.seek(offset)
                piece = tail_file.read(step)
                tail_chunks.append(piece)
                newline_count += piece.count(b'\n')

            # Chunks were collected back-to-front; restore file order before
            # splitting on '\n'.
            raw_lines = b''.join(reversed(tail_chunks)).split(b'\n')

            # A file ending in '\n' yields a trailing empty element that is
            # not a line of its own.
            if raw_lines and raw_lines[-1] == b'':
                selected = raw_lines[-n - 1:-1]
            else:
                selected = raw_lines[-n:]

            # Decode and drop '\r' so CRLF and LF files look the same.
            texts = [
                raw.decode(encoding, errors=errors).rstrip('\r')
                for raw in selected
            ]

            # All but the final line get their newline back; the final line
            # is returned bare.
            return [text + '\n' for text in texts[:-1]] + [texts[-1]]

    except OSError as e:
        with ux_utils.print_exception_no_traceback():
            raise RuntimeError(
                f'Failed to read last {n} lines from {file_path}: {e}') from e
483
+
484
+
485
def _redact_secrets_values(argv: List[str]) -> List[str]:
    """Redact sensitive values from --secret arguments.

    Both `--secret KEY=VALUE` and `--secret=KEY=VALUE` are handled. Everything
    after the first '=' following a non-empty key is replaced, including any
    embedded newlines, so multi-line secrets are fully redacted.

    Args:
        argv: Command line arguments

    Returns:
        A new list with redacted --secret values, or the original argv if any
        error occurs (an empty list if argv is empty or None).

    Examples:
        ['sky', 'launch', '--secret', 'HF_TOKEN=secret'] ->
        ['sky', 'launch', '--secret', 'HF_TOKEN=<redacted>']

        ['sky', 'launch', '--secret=HF_TOKEN=secret'] ->
        ['sky', 'launch', '--secret=HF_TOKEN=<redacted>']

        ['sky', 'launch', '--secret', 'HF_TOKEN'] ->
        ['sky', 'launch', '--secret', 'HF_TOKEN'] (no change)
    """
    flag = '--secret'
    flag_eq = '--secret='

    def _redact(key_value: str) -> str:
        # Split on the first '=' only. No '=' (value comes from the
        # environment) or an empty key means there is nothing to redact.
        key, sep, _ = key_value.partition('=')
        if not sep or not key:
            return key_value
        return f'{key}=<redacted>'

    try:
        if not argv:
            return argv or []

        result = []
        i = 0

        while i < len(argv):
            arg = argv[i]

            # Non-string entries are passed through untouched.
            if not isinstance(arg, str):
                result.append(arg)
                i += 1
                continue

            if arg == flag and i + 1 < len(argv):
                # Space-separated form: the value is the next argument.
                result.append(arg)
                next_arg = argv[i + 1]
                if isinstance(next_arg, str):
                    result.append(_redact(next_arg))
                else:
                    result.append(next_arg)
                i += 2
            elif arg.startswith(flag_eq):
                # Inline form: redact only the part after '--secret='.
                result.append(flag_eq + _redact(arg[len(flag_eq):]))
                i += 1
            else:
                result.append(arg)
                i += 1

        return result
    except Exception:  # pylint: disable=broad-except
        # If anything goes wrong with redaction, return original argv
        # This ensures the command can still execute
        return argv or []
547
+
548
+
330
549
  def user_and_hostname_hash() -> str:
331
550
  """Returns a string containing <user>-<hostname hash last 4 chars>.
332
551
 
@@ -356,69 +575,6 @@ def user_and_hostname_hash() -> str:
356
575
  return f'{getpass.getuser()}-{hostname_hash}'
357
576
 
358
577
 
359
- def read_yaml(path: Optional[str]) -> Dict[str, Any]:
360
- if path is None:
361
- raise ValueError('Attempted to read a None YAML.')
362
- with open(path, 'r', encoding='utf-8') as f:
363
- config = yaml.safe_load(f)
364
- return config
365
-
366
-
367
- def read_yaml_all_str(yaml_str: str) -> List[Dict[str, Any]]:
368
- stream = io.StringIO(yaml_str)
369
- config = yaml.safe_load_all(stream)
370
- configs = list(config)
371
- if not configs:
372
- # Empty YAML file.
373
- return [{}]
374
- return configs
375
-
376
-
377
- def read_yaml_all(path: str) -> List[Dict[str, Any]]:
378
- with open(path, 'r', encoding='utf-8') as f:
379
- return read_yaml_all_str(f.read())
380
-
381
-
382
- def dump_yaml(path: str, config: Union[List[Dict[str, Any]],
383
- Dict[str, Any]]) -> None:
384
- """Dumps a YAML file.
385
-
386
- Args:
387
- path: the path to the YAML file.
388
- config: the configuration to dump.
389
- """
390
- with open(path, 'w', encoding='utf-8') as f:
391
- f.write(dump_yaml_str(config))
392
-
393
-
394
- def dump_yaml_str(config: Union[List[Dict[str, Any]], Dict[str, Any]]) -> str:
395
- """Dumps a YAML string.
396
-
397
- Args:
398
- config: the configuration to dump.
399
-
400
- Returns:
401
- The YAML string.
402
- """
403
-
404
- # https://github.com/yaml/pyyaml/issues/127
405
- class LineBreakDumper(yaml.SafeDumper):
406
-
407
- def write_line_break(self, data=None):
408
- super().write_line_break(data)
409
- if len(self.indents) == 1:
410
- super().write_line_break()
411
-
412
- if isinstance(config, list):
413
- dump_func = yaml.dump_all # type: ignore
414
- else:
415
- dump_func = yaml.dump # type: ignore
416
- return dump_func(config,
417
- Dumper=LineBreakDumper,
418
- sort_keys=False,
419
- default_flow_style=False)
420
-
421
-
422
578
  def make_decorator(cls, name_or_fn: Union[str, Callable],
423
579
  **ctx_kwargs) -> Callable:
424
580
  """Make the cls a decorator.
@@ -668,7 +824,7 @@ def get_cleaned_username(username: str = '') -> str:
668
824
  Returns:
669
825
  A cleaned username.
670
826
  """
671
- username = username or getpass.getuser()
827
+ username = username or get_current_user_name()
672
828
  username = username.lower()
673
829
  username = re.sub(r'[^a-z0-9-_]', '', username)
674
830
  username = re.sub(r'^[0-9-]+', '', username)
@@ -723,10 +879,43 @@ def deprecated_function(
723
879
  return new_func
724
880
 
725
881
 
726
- def truncate_long_string(s: str, max_length: int = 35) -> str:
727
- """Truncate a string to a maximum length, preserving whole words."""
882
+ def truncate_long_string(s: str,
883
+ max_length: int = 35,
884
+ truncate_middle: bool = False) -> str:
885
+ """Truncate a string to a maximum length.
886
+
887
+ Args:
888
+ s: String to truncate.
889
+ max_length: Maximum length of the truncated string.
890
+ truncate_middle: Whether to truncate in the middle of the string.
891
+ If True, the middle part of the string is replaced with '...'.
892
+ If False, truncation happens at the end preserving whole words.
893
+
894
+ Returns:
895
+ Truncated string.
896
+ """
728
897
  if len(s) <= max_length:
729
898
  return s
899
+
900
+ if truncate_middle:
901
+ # Reserve 3 characters for '...'
902
+ if max_length <= 3:
903
+ return '...'
904
+
905
+ # Calculate how many characters to keep from beginning and end
906
+ half_length = (max_length - 3) // 2
907
+ remainder = (max_length - 3) % 2
908
+
909
+ # Keep one more character at the beginning if max_length - 3 is odd
910
+ start_length = half_length + remainder
911
+ end_length = half_length
912
+
913
+ # When end_length is 0, just show the start part and '...'
914
+ if end_length == 0:
915
+ return s[:start_length] + '...'
916
+ return s[:start_length] + '...' + s[-end_length:]
917
+
918
+ # Original end-truncation logic
730
919
  splits = s.split(' ')
731
920
  if len(splits[0]) > max_length:
732
921
  return splits[0][:max_length] + '...' # Use '…'?
@@ -810,7 +999,17 @@ def get_mem_size_gb() -> float:
810
999
  except ValueError as e:
811
1000
  with ux_utils.print_exception_no_traceback():
812
1001
  raise ValueError(
813
- f'Failed to parse the memory size from {mem_size}') from e
1002
+ f'Failed to parse the memory size from {mem_size} (GB)'
1003
+ ) from e
1004
+ mem_size = os.getenv('SKYPILOT_POD_MEMORY_BYTES_LIMIT')
1005
+ if mem_size is not None:
1006
+ try:
1007
+ return float(mem_size) / (1024**3)
1008
+ except ValueError as e:
1009
+ with ux_utils.print_exception_no_traceback():
1010
+ raise ValueError(
1011
+ f'Failed to parse the memory size from {mem_size} (bytes)'
1012
+ ) from e
814
1013
  return _mem_size_gb()
815
1014
 
816
1015
 
@@ -900,3 +1099,27 @@ def _get_cgroup_memory_limit() -> Optional[int]:
900
1099
  def _is_cgroup_v2() -> bool:
901
1100
  """Return True if the environment is running cgroup v2."""
902
1101
  return os.path.isfile('/sys/fs/cgroup/cgroup.controllers')
1102
+
1103
+
1104
def removeprefix(string: str, prefix: str) -> str:
    """Returns `string` with a leading `prefix` removed.

    Args:
        string: The input string.
        prefix: The prefix to strip.

    Returns:
        `string` without the prefix if it starts with `prefix`; otherwise
        `string` unchanged.
    """
    if not string.startswith(prefix):
        return string
    return string[len(prefix):]
1108
+
1109
+
1110
def release_memory():
    """Release the process memory.

    Runs a garbage collection pass and, on Linux, calls `malloc_trim(0)` from
    'libc.so.6'.

    Returns:
        The return value of `malloc_trim(0)` on Linux; 0 on other platforms
        or when any step fails (the failure is logged, not raised).
    """
    # Do the best effort to release the python heap and let malloc_trim
    # be more efficient.
    try:
        gc.collect()
        if not sys.platform.startswith('linux'):
            return 0
        # Will fail on musl (alpine), but at least it works on our
        # official docker images.
        glibc = ctypes.CDLL('libc.so.6')
        return glibc.malloc_trim(0)
    except Exception as exc:  # pylint: disable=broad-except
        logger.error(f'Failed to release memory: '
                     f'{format_exception(exc)}')
        return 0
sky/utils/config_utils.py CHANGED
@@ -6,6 +6,28 @@ from sky import sky_logging
6
6
 
7
7
  logger = sky_logging.init_logger(__name__)
8
8
 
9
+ _REGION_CONFIG_CLOUDS = ['nebius', 'oci']
10
+
11
+ # The Kubernetes API uses lists to represent dictionary fields with patch strategy
12
+ # merge and each item is indexed by the patch merge key. The following map
13
+ # maps the field name to the patch merge key.
14
+ # pylint: disable=line-too-long
15
+ # Ref: https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.27/#podspec-v1-core
16
+ # NOTE: field containers and imagePullSecrets are not included deliberately for
17
+ # backward compatibility (we only support one container per pod now).
18
+ _PATCH_MERGE_KEYS = {
19
+ 'initContainers': 'name',
20
+ 'ephemeralContainers': 'name',
21
+ 'volumes': 'name',
22
+ 'volumeMounts': 'name',
23
+ 'resourceClaims': 'name',
24
+ 'env': 'name',
25
+ 'hostAliases': 'ip',
26
+ 'topologySpreadConstraints': 'topologyKey',
27
+ 'ports': 'containerPort',
28
+ 'volumeDevices': 'devicePath',
29
+ }
30
+
9
31
 
10
32
  class Config(Dict[str, Any]):
11
33
  """SkyPilot config that supports setting/getting values with nested keys."""
@@ -209,20 +231,67 @@ def merge_k8s_configs(
209
231
  merge_k8s_configs(base_config[key][0], value[0],
210
232
  next_allowed_override_keys,
211
233
  next_disallowed_override_keys)
212
- elif key in ['volumes', 'volumeMounts', 'initContainers']:
213
- # If the key is 'volumes', 'volumeMounts', or 'initContainers',
214
- # we search for item with the same name and merge it.
234
+ # For list fields with patch strategy "merge", we merge the list
235
+ # by the patch merge key.
236
+ elif key in _PATCH_MERGE_KEYS:
237
+ patch_merge_key = _PATCH_MERGE_KEYS[key]
215
238
  for override_item in value:
216
- override_item_name = override_item.get('name')
239
+ override_item_name = override_item.get(patch_merge_key)
217
240
  if override_item_name is not None:
218
241
  existing_base_item = next(
219
242
  (v for v in base_config[key]
220
- if v.get('name') == override_item_name), None)
243
+ if v.get(patch_merge_key) == override_item_name),
244
+ None)
221
245
  if existing_base_item is not None:
222
246
  merge_k8s_configs(existing_base_item, override_item)
223
247
  else:
224
248
  base_config[key].append(override_item)
249
+ else:
250
+ base_config[key].append(override_item)
225
251
  else:
226
252
  base_config[key].extend(value)
227
253
  else:
228
254
  base_config[key] = value
255
+
256
+
257
def get_cloud_config_value_from_dict(
        dict_config: Dict[str, Any],
        cloud: str,
        keys: Tuple[str, ...],
        region: Optional[str] = None,
        default_value: Optional[Any] = None,
        override_configs: Optional[Dict[str, Any]] = None) -> Any:
    """Returns a nested config value for a cloud, honoring per-region config.

    Lookup order:
      1. If `region` is given and the cloud supports per-region settings
         ('context_configs' for kubernetes/ssh, 'region_configs' for clouds
         in _REGION_CONFIG_CLOUDS), read <cloud>/<region_key>/<region>/keys.
      2. Read the cloud-level value at <cloud>/keys, falling back to
         `default_value`.
      3. For kubernetes, when both values are dicts, the per-region value is
         merged into the cloud-level value with `merge_k8s_configs` and the
         merged dict is returned. Otherwise the per-region value wins when it
         is not None, else the cloud-level value is returned.

    Args:
        dict_config: The config dictionary to read from.
        cloud: Cloud name used as the top-level key.
        keys: Nested key path below the cloud (or region) level.
        region: Optional region/context name.
        default_value: Value used when the cloud-level key is absent.
        override_configs: Passed through to `Config.get_nested`.
    """
    config = Config(dict_config)

    if cloud in ('kubernetes', 'ssh'):
        region_key = 'context_configs'
    elif cloud in _REGION_CONFIG_CLOUDS:
        region_key = 'region_configs'
    else:
        region_key = None

    region_value = None
    if region is not None and region_key is not None:
        region_path = (cloud, region_key, region) + keys
        region_value = config.get_nested(keys=region_path,
                                         default_value=None,
                                         override_configs=override_configs)

    # Cloud-level value; also the fallback if no per-region value was found.
    cloud_value = config.get_nested(keys=(cloud,) + keys,
                                    default_value=default_value,
                                    override_configs=override_configs)

    both_dicts = (isinstance(cloud_value, dict) and
                  isinstance(region_value, dict))
    if cloud == 'kubernetes' and both_dicts:
        merge_k8s_configs(cloud_value, region_value)
        return cloud_value
    if region_value is not None:
        return region_value
    return cloud_value