skypilot-nightly 1.0.0.dev20250509__py3-none-any.whl → 1.0.0.dev20251107__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of skypilot-nightly might be problematic. See the package's registry page for more details.

Files changed (512)
  1. sky/__init__.py +22 -6
  2. sky/adaptors/aws.py +25 -7
  3. sky/adaptors/common.py +24 -1
  4. sky/adaptors/coreweave.py +278 -0
  5. sky/adaptors/do.py +8 -2
  6. sky/adaptors/hyperbolic.py +8 -0
  7. sky/adaptors/kubernetes.py +149 -18
  8. sky/adaptors/nebius.py +170 -17
  9. sky/adaptors/primeintellect.py +1 -0
  10. sky/adaptors/runpod.py +68 -0
  11. sky/adaptors/seeweb.py +167 -0
  12. sky/adaptors/shadeform.py +89 -0
  13. sky/admin_policy.py +187 -4
  14. sky/authentication.py +179 -225
  15. sky/backends/__init__.py +4 -2
  16. sky/backends/backend.py +22 -9
  17. sky/backends/backend_utils.py +1299 -380
  18. sky/backends/cloud_vm_ray_backend.py +1715 -518
  19. sky/backends/docker_utils.py +1 -1
  20. sky/backends/local_docker_backend.py +11 -6
  21. sky/backends/wheel_utils.py +37 -9
  22. sky/{clouds/service_catalog → catalog}/__init__.py +21 -19
  23. sky/{clouds/service_catalog → catalog}/aws_catalog.py +27 -8
  24. sky/{clouds/service_catalog → catalog}/azure_catalog.py +10 -7
  25. sky/{clouds/service_catalog → catalog}/common.py +89 -48
  26. sky/{clouds/service_catalog → catalog}/cudo_catalog.py +8 -5
  27. sky/{clouds/service_catalog → catalog}/data_fetchers/analyze.py +1 -1
  28. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_aws.py +30 -40
  29. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_cudo.py +38 -38
  30. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_gcp.py +42 -15
  31. sky/catalog/data_fetchers/fetch_hyperbolic.py +136 -0
  32. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_lambda_cloud.py +1 -0
  33. sky/catalog/data_fetchers/fetch_nebius.py +335 -0
  34. sky/catalog/data_fetchers/fetch_runpod.py +698 -0
  35. sky/catalog/data_fetchers/fetch_seeweb.py +329 -0
  36. sky/catalog/data_fetchers/fetch_shadeform.py +142 -0
  37. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_vast.py +1 -1
  38. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_vsphere.py +1 -1
  39. sky/{clouds/service_catalog → catalog}/do_catalog.py +5 -2
  40. sky/{clouds/service_catalog → catalog}/fluidstack_catalog.py +6 -3
  41. sky/{clouds/service_catalog → catalog}/gcp_catalog.py +41 -15
  42. sky/catalog/hyperbolic_catalog.py +136 -0
  43. sky/{clouds/service_catalog → catalog}/ibm_catalog.py +9 -6
  44. sky/{clouds/service_catalog → catalog}/kubernetes_catalog.py +36 -24
  45. sky/{clouds/service_catalog → catalog}/lambda_catalog.py +9 -6
  46. sky/{clouds/service_catalog → catalog}/nebius_catalog.py +9 -7
  47. sky/{clouds/service_catalog → catalog}/oci_catalog.py +9 -6
  48. sky/{clouds/service_catalog → catalog}/paperspace_catalog.py +5 -2
  49. sky/catalog/primeintellect_catalog.py +95 -0
  50. sky/{clouds/service_catalog → catalog}/runpod_catalog.py +11 -4
  51. sky/{clouds/service_catalog → catalog}/scp_catalog.py +9 -6
  52. sky/catalog/seeweb_catalog.py +184 -0
  53. sky/catalog/shadeform_catalog.py +165 -0
  54. sky/catalog/ssh_catalog.py +167 -0
  55. sky/{clouds/service_catalog → catalog}/vast_catalog.py +6 -3
  56. sky/{clouds/service_catalog → catalog}/vsphere_catalog.py +5 -2
  57. sky/check.py +491 -203
  58. sky/cli.py +5 -6005
  59. sky/client/{cli.py → cli/command.py} +2477 -1885
  60. sky/client/cli/deprecation_utils.py +99 -0
  61. sky/client/cli/flags.py +359 -0
  62. sky/client/cli/table_utils.py +320 -0
  63. sky/client/common.py +70 -32
  64. sky/client/oauth.py +82 -0
  65. sky/client/sdk.py +1203 -297
  66. sky/client/sdk_async.py +833 -0
  67. sky/client/service_account_auth.py +47 -0
  68. sky/cloud_stores.py +73 -0
  69. sky/clouds/__init__.py +13 -0
  70. sky/clouds/aws.py +358 -93
  71. sky/clouds/azure.py +105 -83
  72. sky/clouds/cloud.py +127 -36
  73. sky/clouds/cudo.py +68 -50
  74. sky/clouds/do.py +66 -48
  75. sky/clouds/fluidstack.py +63 -44
  76. sky/clouds/gcp.py +339 -110
  77. sky/clouds/hyperbolic.py +293 -0
  78. sky/clouds/ibm.py +70 -49
  79. sky/clouds/kubernetes.py +563 -162
  80. sky/clouds/lambda_cloud.py +74 -54
  81. sky/clouds/nebius.py +206 -80
  82. sky/clouds/oci.py +88 -66
  83. sky/clouds/paperspace.py +61 -44
  84. sky/clouds/primeintellect.py +317 -0
  85. sky/clouds/runpod.py +164 -74
  86. sky/clouds/scp.py +89 -83
  87. sky/clouds/seeweb.py +466 -0
  88. sky/clouds/shadeform.py +400 -0
  89. sky/clouds/ssh.py +263 -0
  90. sky/clouds/utils/aws_utils.py +10 -4
  91. sky/clouds/utils/gcp_utils.py +87 -11
  92. sky/clouds/utils/oci_utils.py +38 -14
  93. sky/clouds/utils/scp_utils.py +177 -124
  94. sky/clouds/vast.py +99 -77
  95. sky/clouds/vsphere.py +51 -40
  96. sky/core.py +349 -139
  97. sky/dag.py +15 -0
  98. sky/dashboard/out/404.html +1 -1
  99. sky/dashboard/out/_next/static/chunks/1141-e6aa9ab418717c59.js +11 -0
  100. sky/dashboard/out/_next/static/chunks/1272-1ef0bf0237faccdb.js +1 -0
  101. sky/dashboard/out/_next/static/chunks/1871-74503c8e80fd253b.js +6 -0
  102. sky/dashboard/out/_next/static/chunks/2260-7703229c33c5ebd5.js +1 -0
  103. sky/dashboard/out/_next/static/chunks/2350.fab69e61bac57b23.js +1 -0
  104. sky/dashboard/out/_next/static/chunks/2369.fc20f0c2c8ed9fe7.js +15 -0
  105. sky/dashboard/out/_next/static/chunks/2755.fff53c4a3fcae910.js +26 -0
  106. sky/dashboard/out/_next/static/chunks/3294.72362fa129305b19.js +1 -0
  107. sky/dashboard/out/_next/static/chunks/3785.ad6adaa2a0fa9768.js +1 -0
  108. sky/dashboard/out/_next/static/chunks/3850-ff4a9a69d978632b.js +1 -0
  109. sky/dashboard/out/_next/static/chunks/3937.210053269f121201.js +1 -0
  110. sky/dashboard/out/_next/static/chunks/4725.a830b5c9e7867c92.js +1 -0
  111. sky/dashboard/out/_next/static/chunks/4937.a2baa2df5572a276.js +15 -0
  112. sky/dashboard/out/_next/static/chunks/5739-d67458fcb1386c92.js +8 -0
  113. sky/dashboard/out/_next/static/chunks/6130-2be46d70a38f1e82.js +1 -0
  114. sky/dashboard/out/_next/static/chunks/616-3d59f75e2ccf9321.js +39 -0
  115. sky/dashboard/out/_next/static/chunks/6212-7bd06f60ba693125.js +13 -0
  116. sky/dashboard/out/_next/static/chunks/6601-06114c982db410b6.js +1 -0
  117. sky/dashboard/out/_next/static/chunks/6856-ef8ba11f96d8c4a3.js +1 -0
  118. sky/dashboard/out/_next/static/chunks/6989-01359c57e018caa4.js +1 -0
  119. sky/dashboard/out/_next/static/chunks/6990-32b6e2d3822301fa.js +1 -0
  120. sky/dashboard/out/_next/static/chunks/7359-c8d04e06886000b3.js +30 -0
  121. sky/dashboard/out/_next/static/chunks/7411-b15471acd2cba716.js +41 -0
  122. sky/dashboard/out/_next/static/chunks/7615-3301e838e5f25772.js +1 -0
  123. sky/dashboard/out/_next/static/chunks/8640.5b9475a2d18c5416.js +16 -0
  124. sky/dashboard/out/_next/static/chunks/8969-1e4613c651bf4051.js +1 -0
  125. sky/dashboard/out/_next/static/chunks/9025.fa408f3242e9028d.js +6 -0
  126. sky/dashboard/out/_next/static/chunks/9353-cff34f7e773b2e2b.js +1 -0
  127. sky/dashboard/out/_next/static/chunks/9360.7310982cf5a0dc79.js +31 -0
  128. sky/dashboard/out/_next/static/chunks/9847.3aaca6bb33455140.js +30 -0
  129. sky/dashboard/out/_next/static/chunks/fd9d1056-86323a29a8f7e46a.js +1 -0
  130. sky/dashboard/out/_next/static/chunks/framework-cf60a09ccd051a10.js +33 -0
  131. sky/dashboard/out/_next/static/chunks/main-app-587214043926b3cc.js +1 -0
  132. sky/dashboard/out/_next/static/chunks/main-f15ccb73239a3bf1.js +1 -0
  133. sky/dashboard/out/_next/static/chunks/pages/_app-bde01e4a2beec258.js +34 -0
  134. sky/dashboard/out/_next/static/chunks/pages/_error-c66a4e8afc46f17b.js +1 -0
  135. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-c736ead69c2d86ec.js +16 -0
  136. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-a37d2063af475a1c.js +1 -0
  137. sky/dashboard/out/_next/static/chunks/pages/clusters-d44859594e6f8064.js +1 -0
  138. sky/dashboard/out/_next/static/chunks/pages/config-dfb9bf07b13045f4.js +1 -0
  139. sky/dashboard/out/_next/static/chunks/pages/index-444f1804401f04ea.js +1 -0
  140. sky/dashboard/out/_next/static/chunks/pages/infra/[context]-c0b5935149902e6f.js +1 -0
  141. sky/dashboard/out/_next/static/chunks/pages/infra-aed0ea19df7cf961.js +1 -0
  142. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-5796e8d6aea291a0.js +16 -0
  143. sky/dashboard/out/_next/static/chunks/pages/jobs/pools/[pool]-6edeb7d06032adfc.js +21 -0
  144. sky/dashboard/out/_next/static/chunks/pages/jobs-479dde13399cf270.js +1 -0
  145. sky/dashboard/out/_next/static/chunks/pages/users-5ab3b907622cf0fe.js +1 -0
  146. sky/dashboard/out/_next/static/chunks/pages/volumes-b84b948ff357c43e.js +1 -0
  147. sky/dashboard/out/_next/static/chunks/pages/workspace/new-3f88a1c7e86a3f86.js +1 -0
  148. sky/dashboard/out/_next/static/chunks/pages/workspaces/[name]-c5a3eeee1c218af1.js +1 -0
  149. sky/dashboard/out/_next/static/chunks/pages/workspaces-22b23febb3e89ce1.js +1 -0
  150. sky/dashboard/out/_next/static/chunks/webpack-2679be77fc08a2f8.js +1 -0
  151. sky/dashboard/out/_next/static/css/0748ce22df867032.css +3 -0
  152. sky/dashboard/out/_next/static/zB0ed6ge_W1MDszVHhijS/_buildManifest.js +1 -0
  153. sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
  154. sky/dashboard/out/clusters/[cluster].html +1 -1
  155. sky/dashboard/out/clusters.html +1 -1
  156. sky/dashboard/out/config.html +1 -0
  157. sky/dashboard/out/index.html +1 -1
  158. sky/dashboard/out/infra/[context].html +1 -0
  159. sky/dashboard/out/infra.html +1 -0
  160. sky/dashboard/out/jobs/[job].html +1 -1
  161. sky/dashboard/out/jobs/pools/[pool].html +1 -0
  162. sky/dashboard/out/jobs.html +1 -1
  163. sky/dashboard/out/users.html +1 -0
  164. sky/dashboard/out/volumes.html +1 -0
  165. sky/dashboard/out/workspace/new.html +1 -0
  166. sky/dashboard/out/workspaces/[name].html +1 -0
  167. sky/dashboard/out/workspaces.html +1 -0
  168. sky/data/data_utils.py +137 -1
  169. sky/data/mounting_utils.py +269 -84
  170. sky/data/storage.py +1451 -1807
  171. sky/data/storage_utils.py +43 -57
  172. sky/exceptions.py +132 -2
  173. sky/execution.py +206 -63
  174. sky/global_user_state.py +2374 -586
  175. sky/jobs/__init__.py +5 -0
  176. sky/jobs/client/sdk.py +242 -65
  177. sky/jobs/client/sdk_async.py +143 -0
  178. sky/jobs/constants.py +9 -8
  179. sky/jobs/controller.py +839 -277
  180. sky/jobs/file_content_utils.py +80 -0
  181. sky/jobs/log_gc.py +201 -0
  182. sky/jobs/recovery_strategy.py +398 -152
  183. sky/jobs/scheduler.py +315 -189
  184. sky/jobs/server/core.py +829 -255
  185. sky/jobs/server/server.py +156 -115
  186. sky/jobs/server/utils.py +136 -0
  187. sky/jobs/state.py +2092 -701
  188. sky/jobs/utils.py +1242 -160
  189. sky/logs/__init__.py +21 -0
  190. sky/logs/agent.py +108 -0
  191. sky/logs/aws.py +243 -0
  192. sky/logs/gcp.py +91 -0
  193. sky/metrics/__init__.py +0 -0
  194. sky/metrics/utils.py +443 -0
  195. sky/models.py +78 -1
  196. sky/optimizer.py +164 -70
  197. sky/provision/__init__.py +90 -4
  198. sky/provision/aws/config.py +147 -26
  199. sky/provision/aws/instance.py +135 -50
  200. sky/provision/azure/instance.py +10 -5
  201. sky/provision/common.py +13 -1
  202. sky/provision/cudo/cudo_machine_type.py +1 -1
  203. sky/provision/cudo/cudo_utils.py +14 -8
  204. sky/provision/cudo/cudo_wrapper.py +72 -71
  205. sky/provision/cudo/instance.py +10 -6
  206. sky/provision/do/instance.py +10 -6
  207. sky/provision/do/utils.py +4 -3
  208. sky/provision/docker_utils.py +114 -23
  209. sky/provision/fluidstack/instance.py +13 -8
  210. sky/provision/gcp/__init__.py +1 -0
  211. sky/provision/gcp/config.py +301 -19
  212. sky/provision/gcp/constants.py +218 -0
  213. sky/provision/gcp/instance.py +36 -8
  214. sky/provision/gcp/instance_utils.py +18 -4
  215. sky/provision/gcp/volume_utils.py +247 -0
  216. sky/provision/hyperbolic/__init__.py +12 -0
  217. sky/provision/hyperbolic/config.py +10 -0
  218. sky/provision/hyperbolic/instance.py +437 -0
  219. sky/provision/hyperbolic/utils.py +373 -0
  220. sky/provision/instance_setup.py +93 -14
  221. sky/provision/kubernetes/__init__.py +5 -0
  222. sky/provision/kubernetes/config.py +9 -52
  223. sky/provision/kubernetes/constants.py +17 -0
  224. sky/provision/kubernetes/instance.py +789 -247
  225. sky/provision/kubernetes/manifests/fusermount-server-daemonset.yaml +1 -2
  226. sky/provision/kubernetes/network.py +27 -17
  227. sky/provision/kubernetes/network_utils.py +40 -43
  228. sky/provision/kubernetes/utils.py +1192 -531
  229. sky/provision/kubernetes/volume.py +282 -0
  230. sky/provision/lambda_cloud/instance.py +22 -16
  231. sky/provision/nebius/constants.py +50 -0
  232. sky/provision/nebius/instance.py +19 -6
  233. sky/provision/nebius/utils.py +196 -91
  234. sky/provision/oci/instance.py +10 -5
  235. sky/provision/paperspace/instance.py +10 -7
  236. sky/provision/paperspace/utils.py +1 -1
  237. sky/provision/primeintellect/__init__.py +10 -0
  238. sky/provision/primeintellect/config.py +11 -0
  239. sky/provision/primeintellect/instance.py +454 -0
  240. sky/provision/primeintellect/utils.py +398 -0
  241. sky/provision/provisioner.py +110 -36
  242. sky/provision/runpod/__init__.py +5 -0
  243. sky/provision/runpod/instance.py +27 -6
  244. sky/provision/runpod/utils.py +51 -18
  245. sky/provision/runpod/volume.py +180 -0
  246. sky/provision/scp/__init__.py +15 -0
  247. sky/provision/scp/config.py +93 -0
  248. sky/provision/scp/instance.py +531 -0
  249. sky/provision/seeweb/__init__.py +11 -0
  250. sky/provision/seeweb/config.py +13 -0
  251. sky/provision/seeweb/instance.py +807 -0
  252. sky/provision/shadeform/__init__.py +11 -0
  253. sky/provision/shadeform/config.py +12 -0
  254. sky/provision/shadeform/instance.py +351 -0
  255. sky/provision/shadeform/shadeform_utils.py +83 -0
  256. sky/provision/ssh/__init__.py +18 -0
  257. sky/provision/vast/instance.py +13 -8
  258. sky/provision/vast/utils.py +10 -7
  259. sky/provision/vsphere/common/vim_utils.py +1 -2
  260. sky/provision/vsphere/instance.py +15 -10
  261. sky/provision/vsphere/vsphere_utils.py +9 -19
  262. sky/py.typed +0 -0
  263. sky/resources.py +844 -118
  264. sky/schemas/__init__.py +0 -0
  265. sky/schemas/api/__init__.py +0 -0
  266. sky/schemas/api/responses.py +225 -0
  267. sky/schemas/db/README +4 -0
  268. sky/schemas/db/env.py +90 -0
  269. sky/schemas/db/global_user_state/001_initial_schema.py +124 -0
  270. sky/schemas/db/global_user_state/002_add_workspace_to_cluster_history.py +35 -0
  271. sky/schemas/db/global_user_state/003_fix_initial_revision.py +61 -0
  272. sky/schemas/db/global_user_state/004_is_managed.py +34 -0
  273. sky/schemas/db/global_user_state/005_cluster_event.py +32 -0
  274. sky/schemas/db/global_user_state/006_provision_log.py +41 -0
  275. sky/schemas/db/global_user_state/007_cluster_event_request_id.py +34 -0
  276. sky/schemas/db/global_user_state/008_skylet_ssh_tunnel_metadata.py +34 -0
  277. sky/schemas/db/global_user_state/009_last_activity_and_launched_at.py +89 -0
  278. sky/schemas/db/global_user_state/010_save_ssh_key.py +66 -0
  279. sky/schemas/db/script.py.mako +28 -0
  280. sky/schemas/db/serve_state/001_initial_schema.py +67 -0
  281. sky/schemas/db/skypilot_config/001_initial_schema.py +30 -0
  282. sky/schemas/db/spot_jobs/001_initial_schema.py +97 -0
  283. sky/schemas/db/spot_jobs/002_cluster_pool.py +42 -0
  284. sky/schemas/db/spot_jobs/003_pool_hash.py +34 -0
  285. sky/schemas/db/spot_jobs/004_job_file_contents.py +42 -0
  286. sky/schemas/db/spot_jobs/005_logs_gc.py +38 -0
  287. sky/schemas/generated/__init__.py +0 -0
  288. sky/schemas/generated/autostopv1_pb2.py +36 -0
  289. sky/schemas/generated/autostopv1_pb2.pyi +43 -0
  290. sky/schemas/generated/autostopv1_pb2_grpc.py +146 -0
  291. sky/schemas/generated/jobsv1_pb2.py +86 -0
  292. sky/schemas/generated/jobsv1_pb2.pyi +254 -0
  293. sky/schemas/generated/jobsv1_pb2_grpc.py +542 -0
  294. sky/schemas/generated/managed_jobsv1_pb2.py +74 -0
  295. sky/schemas/generated/managed_jobsv1_pb2.pyi +278 -0
  296. sky/schemas/generated/managed_jobsv1_pb2_grpc.py +278 -0
  297. sky/schemas/generated/servev1_pb2.py +58 -0
  298. sky/schemas/generated/servev1_pb2.pyi +115 -0
  299. sky/schemas/generated/servev1_pb2_grpc.py +322 -0
  300. sky/serve/autoscalers.py +357 -5
  301. sky/serve/client/impl.py +310 -0
  302. sky/serve/client/sdk.py +47 -139
  303. sky/serve/client/sdk_async.py +130 -0
  304. sky/serve/constants.py +10 -8
  305. sky/serve/controller.py +64 -19
  306. sky/serve/load_balancer.py +106 -60
  307. sky/serve/load_balancing_policies.py +115 -1
  308. sky/serve/replica_managers.py +273 -162
  309. sky/serve/serve_rpc_utils.py +179 -0
  310. sky/serve/serve_state.py +554 -251
  311. sky/serve/serve_utils.py +733 -220
  312. sky/serve/server/core.py +66 -711
  313. sky/serve/server/impl.py +1093 -0
  314. sky/serve/server/server.py +21 -18
  315. sky/serve/service.py +133 -48
  316. sky/serve/service_spec.py +135 -16
  317. sky/serve/spot_placer.py +3 -0
  318. sky/server/auth/__init__.py +0 -0
  319. sky/server/auth/authn.py +50 -0
  320. sky/server/auth/loopback.py +38 -0
  321. sky/server/auth/oauth2_proxy.py +200 -0
  322. sky/server/common.py +475 -181
  323. sky/server/config.py +81 -23
  324. sky/server/constants.py +44 -6
  325. sky/server/daemons.py +229 -0
  326. sky/server/html/token_page.html +185 -0
  327. sky/server/metrics.py +160 -0
  328. sky/server/requests/executor.py +528 -138
  329. sky/server/requests/payloads.py +351 -17
  330. sky/server/requests/preconditions.py +21 -17
  331. sky/server/requests/process.py +112 -29
  332. sky/server/requests/request_names.py +120 -0
  333. sky/server/requests/requests.py +817 -224
  334. sky/server/requests/serializers/decoders.py +82 -31
  335. sky/server/requests/serializers/encoders.py +140 -22
  336. sky/server/requests/threads.py +106 -0
  337. sky/server/rest.py +417 -0
  338. sky/server/server.py +1290 -284
  339. sky/server/state.py +20 -0
  340. sky/server/stream_utils.py +345 -57
  341. sky/server/uvicorn.py +217 -3
  342. sky/server/versions.py +270 -0
  343. sky/setup_files/MANIFEST.in +5 -0
  344. sky/setup_files/alembic.ini +156 -0
  345. sky/setup_files/dependencies.py +136 -31
  346. sky/setup_files/setup.py +44 -42
  347. sky/sky_logging.py +102 -5
  348. sky/skylet/attempt_skylet.py +1 -0
  349. sky/skylet/autostop_lib.py +129 -8
  350. sky/skylet/configs.py +27 -20
  351. sky/skylet/constants.py +171 -19
  352. sky/skylet/events.py +105 -21
  353. sky/skylet/job_lib.py +335 -104
  354. sky/skylet/log_lib.py +297 -18
  355. sky/skylet/log_lib.pyi +44 -1
  356. sky/skylet/ray_patches/__init__.py +17 -3
  357. sky/skylet/ray_patches/autoscaler.py.diff +18 -0
  358. sky/skylet/ray_patches/cli.py.diff +19 -0
  359. sky/skylet/ray_patches/command_runner.py.diff +17 -0
  360. sky/skylet/ray_patches/log_monitor.py.diff +20 -0
  361. sky/skylet/ray_patches/resource_demand_scheduler.py.diff +32 -0
  362. sky/skylet/ray_patches/updater.py.diff +18 -0
  363. sky/skylet/ray_patches/worker.py.diff +41 -0
  364. sky/skylet/services.py +564 -0
  365. sky/skylet/skylet.py +63 -4
  366. sky/skylet/subprocess_daemon.py +103 -29
  367. sky/skypilot_config.py +506 -99
  368. sky/ssh_node_pools/__init__.py +1 -0
  369. sky/ssh_node_pools/core.py +135 -0
  370. sky/ssh_node_pools/server.py +233 -0
  371. sky/task.py +621 -137
  372. sky/templates/aws-ray.yml.j2 +10 -3
  373. sky/templates/azure-ray.yml.j2 +1 -1
  374. sky/templates/do-ray.yml.j2 +1 -1
  375. sky/templates/gcp-ray.yml.j2 +57 -0
  376. sky/templates/hyperbolic-ray.yml.j2 +67 -0
  377. sky/templates/jobs-controller.yaml.j2 +27 -24
  378. sky/templates/kubernetes-loadbalancer.yml.j2 +2 -0
  379. sky/templates/kubernetes-ray.yml.j2 +607 -51
  380. sky/templates/lambda-ray.yml.j2 +1 -1
  381. sky/templates/nebius-ray.yml.j2 +33 -12
  382. sky/templates/paperspace-ray.yml.j2 +1 -1
  383. sky/templates/primeintellect-ray.yml.j2 +71 -0
  384. sky/templates/runpod-ray.yml.j2 +9 -1
  385. sky/templates/scp-ray.yml.j2 +3 -50
  386. sky/templates/seeweb-ray.yml.j2 +108 -0
  387. sky/templates/shadeform-ray.yml.j2 +72 -0
  388. sky/templates/sky-serve-controller.yaml.j2 +22 -2
  389. sky/templates/websocket_proxy.py +178 -18
  390. sky/usage/usage_lib.py +18 -11
  391. sky/users/__init__.py +0 -0
  392. sky/users/model.conf +15 -0
  393. sky/users/permission.py +387 -0
  394. sky/users/rbac.py +121 -0
  395. sky/users/server.py +720 -0
  396. sky/users/token_service.py +218 -0
  397. sky/utils/accelerator_registry.py +34 -5
  398. sky/utils/admin_policy_utils.py +84 -38
  399. sky/utils/annotations.py +16 -5
  400. sky/utils/asyncio_utils.py +78 -0
  401. sky/utils/auth_utils.py +153 -0
  402. sky/utils/benchmark_utils.py +60 -0
  403. sky/utils/cli_utils/status_utils.py +159 -86
  404. sky/utils/cluster_utils.py +31 -9
  405. sky/utils/command_runner.py +354 -68
  406. sky/utils/command_runner.pyi +93 -3
  407. sky/utils/common.py +35 -8
  408. sky/utils/common_utils.py +310 -87
  409. sky/utils/config_utils.py +87 -5
  410. sky/utils/context.py +402 -0
  411. sky/utils/context_utils.py +222 -0
  412. sky/utils/controller_utils.py +264 -89
  413. sky/utils/dag_utils.py +31 -12
  414. sky/utils/db/__init__.py +0 -0
  415. sky/utils/db/db_utils.py +470 -0
  416. sky/utils/db/migration_utils.py +133 -0
  417. sky/utils/directory_utils.py +12 -0
  418. sky/utils/env_options.py +13 -0
  419. sky/utils/git.py +567 -0
  420. sky/utils/git_clone.sh +460 -0
  421. sky/utils/infra_utils.py +195 -0
  422. sky/utils/kubernetes/cleanup-tunnel.sh +62 -0
  423. sky/utils/kubernetes/config_map_utils.py +133 -0
  424. sky/utils/kubernetes/create_cluster.sh +13 -27
  425. sky/utils/kubernetes/delete_cluster.sh +10 -7
  426. sky/utils/kubernetes/deploy_remote_cluster.py +1299 -0
  427. sky/utils/kubernetes/exec_kubeconfig_converter.py +22 -31
  428. sky/utils/kubernetes/generate_kind_config.py +6 -66
  429. sky/utils/kubernetes/generate_kubeconfig.sh +4 -1
  430. sky/utils/kubernetes/gpu_labeler.py +5 -5
  431. sky/utils/kubernetes/kubernetes_deploy_utils.py +354 -47
  432. sky/utils/kubernetes/ssh-tunnel.sh +379 -0
  433. sky/utils/kubernetes/ssh_utils.py +221 -0
  434. sky/utils/kubernetes_enums.py +8 -15
  435. sky/utils/lock_events.py +94 -0
  436. sky/utils/locks.py +368 -0
  437. sky/utils/log_utils.py +300 -6
  438. sky/utils/perf_utils.py +22 -0
  439. sky/utils/resource_checker.py +298 -0
  440. sky/utils/resources_utils.py +249 -32
  441. sky/utils/rich_utils.py +213 -37
  442. sky/utils/schemas.py +905 -147
  443. sky/utils/serialize_utils.py +16 -0
  444. sky/utils/status_lib.py +10 -0
  445. sky/utils/subprocess_utils.py +38 -15
  446. sky/utils/tempstore.py +70 -0
  447. sky/utils/timeline.py +24 -52
  448. sky/utils/ux_utils.py +84 -15
  449. sky/utils/validator.py +11 -1
  450. sky/utils/volume.py +86 -0
  451. sky/utils/yaml_utils.py +111 -0
  452. sky/volumes/__init__.py +13 -0
  453. sky/volumes/client/__init__.py +0 -0
  454. sky/volumes/client/sdk.py +149 -0
  455. sky/volumes/server/__init__.py +0 -0
  456. sky/volumes/server/core.py +258 -0
  457. sky/volumes/server/server.py +122 -0
  458. sky/volumes/volume.py +212 -0
  459. sky/workspaces/__init__.py +0 -0
  460. sky/workspaces/core.py +655 -0
  461. sky/workspaces/server.py +101 -0
  462. sky/workspaces/utils.py +56 -0
  463. skypilot_nightly-1.0.0.dev20251107.dist-info/METADATA +675 -0
  464. skypilot_nightly-1.0.0.dev20251107.dist-info/RECORD +594 -0
  465. {skypilot_nightly-1.0.0.dev20250509.dist-info → skypilot_nightly-1.0.0.dev20251107.dist-info}/WHEEL +1 -1
  466. sky/benchmark/benchmark_state.py +0 -256
  467. sky/benchmark/benchmark_utils.py +0 -641
  468. sky/clouds/service_catalog/constants.py +0 -7
  469. sky/dashboard/out/_next/static/LksQgChY5izXjokL3LcEu/_buildManifest.js +0 -1
  470. sky/dashboard/out/_next/static/chunks/236-f49500b82ad5392d.js +0 -6
  471. sky/dashboard/out/_next/static/chunks/312-c3c8845990db8ffc.js +0 -15
  472. sky/dashboard/out/_next/static/chunks/37-0a572fe0dbb89c4d.js +0 -6
  473. sky/dashboard/out/_next/static/chunks/678-206dddca808e6d16.js +0 -59
  474. sky/dashboard/out/_next/static/chunks/845-0f8017370869e269.js +0 -1
  475. sky/dashboard/out/_next/static/chunks/979-7bf73a4c7cea0f5c.js +0 -1
  476. sky/dashboard/out/_next/static/chunks/fd9d1056-2821b0f0cabcd8bd.js +0 -1
  477. sky/dashboard/out/_next/static/chunks/framework-87d061ee6ed71b28.js +0 -33
  478. sky/dashboard/out/_next/static/chunks/main-app-241eb28595532291.js +0 -1
  479. sky/dashboard/out/_next/static/chunks/main-e0e2335212e72357.js +0 -1
  480. sky/dashboard/out/_next/static/chunks/pages/_app-e6b013bc3f77ad60.js +0 -1
  481. sky/dashboard/out/_next/static/chunks/pages/_error-1be831200e60c5c0.js +0 -1
  482. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-e15db85d0ea1fbe1.js +0 -1
  483. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-f383db7389368ea7.js +0 -1
  484. sky/dashboard/out/_next/static/chunks/pages/clusters-a93b93e10b8b074e.js +0 -1
  485. sky/dashboard/out/_next/static/chunks/pages/index-f9f039532ca8cbc4.js +0 -1
  486. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-03f279c6741fb48b.js +0 -1
  487. sky/dashboard/out/_next/static/chunks/pages/jobs-a75029b67aab6a2e.js +0 -1
  488. sky/dashboard/out/_next/static/chunks/webpack-830f59b8404e96b8.js +0 -1
  489. sky/dashboard/out/_next/static/css/c6933bbb2ce7f4dd.css +0 -3
  490. sky/jobs/dashboard/dashboard.py +0 -223
  491. sky/jobs/dashboard/static/favicon.ico +0 -0
  492. sky/jobs/dashboard/templates/index.html +0 -831
  493. sky/jobs/server/dashboard_utils.py +0 -69
  494. sky/skylet/providers/scp/__init__.py +0 -2
  495. sky/skylet/providers/scp/config.py +0 -149
  496. sky/skylet/providers/scp/node_provider.py +0 -578
  497. sky/templates/kubernetes-ssh-jump.yml.j2 +0 -94
  498. sky/utils/db_utils.py +0 -100
  499. sky/utils/kubernetes/deploy_remote_cluster.sh +0 -308
  500. sky/utils/kubernetes/ssh_jump_lifecycle_manager.py +0 -191
  501. skypilot_nightly-1.0.0.dev20250509.dist-info/METADATA +0 -361
  502. skypilot_nightly-1.0.0.dev20250509.dist-info/RECORD +0 -396
  503. /sky/{clouds/service_catalog → catalog}/config.py +0 -0
  504. /sky/{benchmark → catalog/data_fetchers}/__init__.py +0 -0
  505. /sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_azure.py +0 -0
  506. /sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_fluidstack.py +0 -0
  507. /sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_ibm.py +0 -0
  508. /sky/{clouds/service_catalog/data_fetchers → client/cli}/__init__.py +0 -0
  509. /sky/dashboard/out/_next/static/{LksQgChY5izXjokL3LcEu → zB0ed6ge_W1MDszVHhijS}/_ssgManifest.js +0 -0
  510. {skypilot_nightly-1.0.0.dev20250509.dist-info → skypilot_nightly-1.0.0.dev20251107.dist-info}/entry_points.txt +0 -0
  511. {skypilot_nightly-1.0.0.dev20250509.dist-info → skypilot_nightly-1.0.0.dev20251107.dist-info}/licenses/LICENSE +0 -0
  512. {skypilot_nightly-1.0.0.dev20250509.dist-info → skypilot_nightly-1.0.0.dev20251107.dist-info}/top_level.txt +0 -0
sky/utils/common_utils.py CHANGED
@@ -1,16 +1,19 @@
1
1
  """Utils shared between all of sky"""
2
2
 
3
+ import ctypes
3
4
  import difflib
5
+ import enum
4
6
  import functools
7
+ import gc
5
8
  import getpass
6
9
  import hashlib
7
10
  import inspect
8
- import io
9
11
  import os
10
12
  import platform
11
13
  import random
12
14
  import re
13
15
  import socket
16
+ import subprocess
14
17
  import sys
15
18
  import time
16
19
  import typing
@@ -20,6 +23,7 @@ import uuid
20
23
  import jsonschema
21
24
 
22
25
  from sky import exceptions
26
+ from sky import models
23
27
  from sky import sky_logging
24
28
  from sky.adaptors import common as adaptors_common
25
29
  from sky.skylet import constants
@@ -31,13 +35,11 @@ from sky.utils import validator
31
35
  if typing.TYPE_CHECKING:
32
36
  import jinja2
33
37
  import psutil
34
- import yaml
35
38
  else:
36
39
  jinja2 = adaptors_common.LazyImport('jinja2')
37
40
  psutil = adaptors_common.LazyImport('psutil')
38
- yaml = adaptors_common.LazyImport('yaml')
39
41
 
40
- _USER_HASH_FILE = os.path.expanduser('~/.sky/user_hash')
42
+ USER_HASH_FILE = os.path.expanduser('~/.sky/user_hash')
41
43
  USER_HASH_LENGTH = 8
42
44
 
43
45
  # We are using base36 to reduce the length of the hash. 2 chars -> 36^2 = 1296
@@ -52,6 +54,25 @@ _VALID_ENV_VAR_REGEX = '[a-zA-Z_][a-zA-Z0-9_]*'
52
54
  logger = sky_logging.init_logger(__name__)
53
55
 
54
56
 
57
+ class ProcessStatus(enum.Enum):
58
+ """Process status."""
59
+
60
+ # The process is scheduled to run, but not started yet.
61
+ SCHEDULED = 'SCHEDULED'
62
+
63
+ # The process is running
64
+ RUNNING = 'RUNNING'
65
+
66
+ # The process is finished and succeeded
67
+ SUCCEEDED = 'SUCCEEDED'
68
+
69
+ # The process is interrupted
70
+ INTERRUPTED = 'INTERRUPTED'
71
+
72
+ # The process failed
73
+ FAILED = 'FAILED'
74
+
75
+
55
76
  @annotations.lru_cache(scope='request')
56
77
  def get_usage_run_id() -> str:
57
78
  """Returns a unique run id for each 'run'.
@@ -69,11 +90,10 @@ def get_usage_run_id() -> str:
69
90
  def is_valid_user_hash(user_hash: Optional[str]) -> bool:
70
91
  if user_hash is None:
71
92
  return False
72
- try:
73
- int(user_hash, 16)
74
- except (TypeError, ValueError):
75
- return False
76
- return len(user_hash) == USER_HASH_LENGTH
93
+ # Must start with a letter or digit, followed by alphanumeric characters and hyphens
94
+ # This covers both old hex format (e.g., "abc123") and new service account
95
+ # format (e.g., "sa-abc123-token-xyz")
96
+ return bool(re.match(r'^[a-zA-Z0-9][a-zA-Z0-9-]*$', user_hash))
77
97
 
78
98
 
79
99
  def generate_user_hash() -> str:
@@ -86,6 +106,18 @@ def generate_user_hash() -> str:
86
106
  return user_hash
87
107
 
88
108
 
109
+ def get_git_commit(path: Optional[str] = None) -> Optional[str]:
110
+ try:
111
+ result = subprocess.run(['git', 'rev-parse', 'HEAD'],
112
+ capture_output=True,
113
+ text=True,
114
+ cwd=path,
115
+ check=True)
116
+ return result.stdout.strip()
117
+ except subprocess.CalledProcessError:
118
+ return None
119
+
120
+
89
121
  def get_user_hash() -> str:
90
122
  """Returns a unique user-machine specific hash as a user id.
91
123
 
@@ -97,21 +129,26 @@ def get_user_hash() -> str:
97
129
  assert user_hash is not None
98
130
  return user_hash
99
131
 
100
- if os.path.exists(_USER_HASH_FILE):
132
+ if os.path.exists(USER_HASH_FILE):
101
133
  # Read from cached user hash file.
102
- with open(_USER_HASH_FILE, 'r', encoding='utf-8') as f:
134
+ with open(USER_HASH_FILE, 'r', encoding='utf-8') as f:
103
135
  # Remove invalid characters.
104
136
  user_hash = f.read().strip()
105
137
  if is_valid_user_hash(user_hash):
106
138
  return user_hash
107
139
 
108
140
  user_hash = generate_user_hash()
109
- os.makedirs(os.path.dirname(_USER_HASH_FILE), exist_ok=True)
110
- with open(_USER_HASH_FILE, 'w', encoding='utf-8') as f:
111
- f.write(user_hash)
141
+ set_user_hash_locally(user_hash)
112
142
  return user_hash
113
143
 
114
144
 
145
+ def set_user_hash_locally(user_hash: str) -> None:
146
+ """Sets the user hash to local file."""
147
+ os.makedirs(os.path.dirname(USER_HASH_FILE), exist_ok=True)
148
+ with open(USER_HASH_FILE, 'w', encoding='utf-8') as f:
149
+ f.write(user_hash)
150
+
151
+
115
152
  def base36_encode(hex_str: str) -> str:
116
153
  """Converts a hex string to a base36 string."""
117
154
  int_value = int(hex_str, 16)
@@ -228,13 +265,16 @@ def get_global_job_id(job_timestamp: str,
228
265
 
229
266
  class Backoff:
230
267
  """Exponential backoff with jittering."""
231
- MULTIPLIER = 1.6
232
268
  JITTER = 0.4
233
269
 
234
- def __init__(self, initial_backoff: float = 5, max_backoff_factor: int = 5):
270
+ def __init__(self,
271
+ initial_backoff: float = 5,
272
+ max_backoff_factor: int = 5,
273
+ multiplier: float = 1.6):
235
274
  self._initial = True
236
275
  self._backoff = 0.0
237
276
  self._initial_backoff = initial_backoff
277
+ self._multiplier = multiplier
238
278
  self._max_backoff = max_backoff_factor * self._initial_backoff
239
279
 
240
280
  # https://github.com/grpc/grpc/blob/2d4f3c56001cd1e1f85734b2f7c5ce5f2797c38a/doc/connection-backoff.md
@@ -246,7 +286,7 @@ class Backoff:
246
286
  self._initial = False
247
287
  self._backoff = min(self._initial_backoff, self._max_backoff)
248
288
  else:
249
- self._backoff = min(self._backoff * self.MULTIPLIER,
289
+ self._backoff = min(self._backoff * self._multiplier,
250
290
  self._max_backoff)
251
291
  self._backoff += random.uniform(-self.JITTER * self._backoff,
252
292
  self.JITTER * self._backoff)
@@ -256,11 +296,14 @@ class Backoff:
256
296
  _current_command: Optional[str] = None
257
297
  _current_client_entrypoint: Optional[str] = None
258
298
  _using_remote_api_server: Optional[bool] = None
299
+ _current_user: Optional['models.User'] = None
300
+ _current_request_id: Optional[str] = None
259
301
 
260
302
 
261
- def set_client_status(client_entrypoint: Optional[str],
262
- client_command: Optional[str],
263
- using_remote_api_server: bool):
303
+ def set_request_context(client_entrypoint: Optional[str],
304
+ client_command: Optional[str],
305
+ using_remote_api_server: bool,
306
+ user: Optional['models.User'], request_id: str) -> None:
264
307
  """Override the current client entrypoint and command.
265
308
 
266
309
  This is useful when we are on the SkyPilot API server side and we have a
@@ -269,9 +312,20 @@ def set_client_status(client_entrypoint: Optional[str],
269
312
  global _current_command
270
313
  global _current_client_entrypoint
271
314
  global _using_remote_api_server
315
+ global _current_user
316
+ global _current_request_id
272
317
  _current_command = client_command
273
318
  _current_client_entrypoint = client_entrypoint
274
319
  _using_remote_api_server = using_remote_api_server
320
+ _current_user = user
321
+ _current_request_id = request_id
322
+
323
+
324
def get_current_request_id() -> str:
    """Returns the request id stored in the module-level request context.

    Falls back to the placeholder 'dummy-request-id' when no request id has
    been set.
    """
    if _current_request_id is None:
        return 'dummy-request-id'
    return _current_request_id
275
329
 
276
330
 
277
331
  def get_current_command() -> str:
@@ -286,6 +340,26 @@ def get_current_command() -> str:
286
340
  return get_pretty_entrypoint_cmd()
287
341
 
288
342
 
343
def get_current_user() -> 'models.User':
    """Returns the user stored in the module-level request context.

    When no user has been set, the result of
    `models.User.get_current_user()` is returned instead.
    """
    if _current_user is None:
        return models.User.get_current_user()
    return _current_user
348
+
349
+
350
def get_current_user_name() -> str:
    """Returns the `name` of the user given by `get_current_user()`.

    The name is asserted to be non-None before it is returned.
    """
    user_name = get_current_user().name
    assert user_name is not None
    return user_name
355
+
356
+
357
def set_current_user(user: 'models.User'):
    """Stores `user` in the module-level `_current_user`.

    Subsequent `get_current_user()` calls return this value while it is not
    None.
    """
    global _current_user
    _current_user = user
361
+
362
+
289
363
  def get_current_client_entrypoint(server_entrypoint: str) -> str:
290
364
  """Returns the current client entrypoint.
291
365
 
@@ -324,9 +398,154 @@ def get_pretty_entrypoint_cmd() -> str:
324
398
  # Turn '/.../anaconda/envs/py36/bin/sky' into 'sky', but keep other
325
399
  # things like 'examples/app.py'.
326
400
  argv[0] = basename
401
+
402
+ # Redact sensitive values from secrets arguments
403
+ argv = _redact_secrets_values(argv)
404
+
327
405
  return ' '.join(argv)
328
406
 
329
407
 
408
def read_last_n_lines(file_path: str,
                      n: int,
                      chunk_size: int = 8192,
                      encoding: str = 'utf-8',
                      errors: str = 'replace') -> List[str]:
    """Returns the final `n` lines of a file, scanning it from the end.

    The file is read backwards in binary chunks until more than `n` newline
    bytes have been seen (or the start of the file is reached), so only the
    tail of a large file is loaded.

    Args:
        file_path: Path to the file to read.
        n: Number of lines to return from the end of the file.
        chunk_size: Number of bytes read per backwards step.
        encoding: Encoding used to decode each line.
        errors: Decode error handling scheme (e.g., 'replace', 'ignore').

    Returns:
        At most `n` decoded lines in file order. A trailing carriage return
        is stripped from every line. Every line except the last is
        terminated with a newline; the last returned line carries no line
        terminator, whether or not the file ends with one. An empty list is
        returned when `n` is 0 or the file is empty.

    Raises:
        AssertionError: If `n` is negative, `chunk_size` is not positive, or
            `file_path` does not exist.
        RuntimeError: If an OSError occurs while reading the file.
    """
    assert n >= 0, f'n must be non-negative. Got {n}'
    assert chunk_size > 0, f'chunk_size must be positive. Got {chunk_size}'
    assert os.path.exists(file_path), f'File not found: {file_path}'

    if n == 0:
        return []

    try:
        with open(file_path, 'rb') as f:
            f.seek(0, os.SEEK_END)
            offset = f.tell()
            if offset == 0:
                return []

            # Walk towards the start of the file one chunk at a time. More
            # than n newlines are needed so that the oldest requested line is
            # complete rather than cut at a chunk boundary.
            newline_count = 0
            pieces = []
            while offset > 0 and newline_count <= n:
                step = min(chunk_size, offset)
                offset -= step
                f.seek(offset)
                piece = f.read(step)
                pieces.append(piece)
                newline_count += piece.count(b'\n')

            # Pieces were collected newest-first; restore file order before
            # splitting so multi-byte characters spanning chunks stay intact.
            raw_lines = b''.join(pieces[::-1]).split(b'\n')

            # A file ending in a newline yields a trailing empty element that
            # is not a line of its own.
            if raw_lines[-1] == b'':
                raw_lines.pop()
            tail = raw_lines[-n:]

            texts = [
                raw.decode(encoding, errors=errors).rstrip('\r')
                for raw in tail
            ]
            return [text + '\n' for text in texts[:-1]] + [texts[-1]]

    except OSError as e:
        with ux_utils.print_exception_no_traceback():
            raise RuntimeError(
                f'Failed to read last {n} lines from {file_path}: {e}') from e
483
+
484
+
485
def _redact_secrets_values(argv: List[str]) -> List[str]:
    """Redact sensitive values from --secret arguments.

    Both the two-token form (`--secret KEY=value`) and the single-token form
    (`--secret=KEY=value`) are handled. The whole value after the first `=`
    following the key is replaced, including any embedded newlines, so that
    multi-line secrets (e.g. PEM keys) are not partially leaked.

    Args:
        argv: Command line arguments

    Returns:
        A new list with redacted --secret values, or the original argv (or an
        empty list if argv is falsy) if any error occurs.

    Examples:
        ['sky', 'launch', '--secret', 'HF_TOKEN=secret'] ->
        ['sky', 'launch', '--secret', 'HF_TOKEN=<redacted>']

        ['sky', 'launch', '--secret=HF_TOKEN=secret'] ->
        ['sky', 'launch', '--secret=HF_TOKEN=<redacted>']

        ['sky', 'launch', '--secret', 'HF_TOKEN'] ->
        ['sky', 'launch', '--secret', 'HF_TOKEN'] (no change)
    """
    try:
        if not argv:
            return argv or []

        result = []
        i = 0

        while i < len(argv):
            arg = argv[i]

            # Non-string entries are passed through untouched.
            if not isinstance(arg, str):
                result.append(arg)
                i += 1
                continue

            if arg == '--secret' and i + 1 < len(argv):
                result.append(arg)
                next_arg = argv[i + 1]
                if isinstance(next_arg, str):
                    # re.DOTALL: '.' must also match newlines, otherwise
                    # everything after the first line of the value would be
                    # left in clear text.
                    redacted = re.sub(r'^([^=]+)=.*',
                                      r'\1=<redacted>',
                                      next_arg,
                                      flags=re.DOTALL)
                    result.append(redacted)
                else:
                    result.append(next_arg)
                # The value token has been consumed together with the flag.
                i += 2
            elif arg.startswith('--secret='):
                # Redact only if there's a value after the key.
                redacted = re.sub(r'^(--secret=[^=]+)=.*',
                                  r'\1=<redacted>',
                                  arg,
                                  flags=re.DOTALL)
                result.append(redacted)
                i += 1
            else:
                result.append(arg)
                i += 1

        return result
    except Exception:  # pylint: disable=broad-except
        # If anything goes wrong with redaction, return original argv
        # This ensures the command can still execute
        return argv or []
547
+
548
+
330
549
  def user_and_hostname_hash() -> str:
331
550
  """Returns a string containing <user>-<hostname hash last 4 chars>.
332
551
 
@@ -356,69 +575,6 @@ def user_and_hostname_hash() -> str:
356
575
  return f'{getpass.getuser()}-{hostname_hash}'
357
576
 
358
577
 
359
- def read_yaml(path: Optional[str]) -> Dict[str, Any]:
360
- if path is None:
361
- raise ValueError('Attempted to read a None YAML.')
362
- with open(path, 'r', encoding='utf-8') as f:
363
- config = yaml.safe_load(f)
364
- return config
365
-
366
-
367
- def read_yaml_all_str(yaml_str: str) -> List[Dict[str, Any]]:
368
- stream = io.StringIO(yaml_str)
369
- config = yaml.safe_load_all(stream)
370
- configs = list(config)
371
- if not configs:
372
- # Empty YAML file.
373
- return [{}]
374
- return configs
375
-
376
-
377
- def read_yaml_all(path: str) -> List[Dict[str, Any]]:
378
- with open(path, 'r', encoding='utf-8') as f:
379
- return read_yaml_all_str(f.read())
380
-
381
-
382
- def dump_yaml(path: str, config: Union[List[Dict[str, Any]],
383
- Dict[str, Any]]) -> None:
384
- """Dumps a YAML file.
385
-
386
- Args:
387
- path: the path to the YAML file.
388
- config: the configuration to dump.
389
- """
390
- with open(path, 'w', encoding='utf-8') as f:
391
- f.write(dump_yaml_str(config))
392
-
393
-
394
- def dump_yaml_str(config: Union[List[Dict[str, Any]], Dict[str, Any]]) -> str:
395
- """Dumps a YAML string.
396
-
397
- Args:
398
- config: the configuration to dump.
399
-
400
- Returns:
401
- The YAML string.
402
- """
403
-
404
- # https://github.com/yaml/pyyaml/issues/127
405
- class LineBreakDumper(yaml.SafeDumper):
406
-
407
- def write_line_break(self, data=None):
408
- super().write_line_break(data)
409
- if len(self.indents) == 1:
410
- super().write_line_break()
411
-
412
- if isinstance(config, list):
413
- dump_func = yaml.dump_all # type: ignore
414
- else:
415
- dump_func = yaml.dump # type: ignore
416
- return dump_func(config,
417
- Dumper=LineBreakDumper,
418
- sort_keys=False,
419
- default_flow_style=False)
420
-
421
-
422
578
  def make_decorator(cls, name_or_fn: Union[str, Callable],
423
579
  **ctx_kwargs) -> Callable:
424
580
  """Make the cls a decorator.
@@ -668,7 +824,7 @@ def get_cleaned_username(username: str = '') -> str:
668
824
  Returns:
669
825
  A cleaned username.
670
826
  """
671
- username = username or getpass.getuser()
827
+ username = username or get_current_user_name()
672
828
  username = username.lower()
673
829
  username = re.sub(r'[^a-z0-9-_]', '', username)
674
830
  username = re.sub(r'^[0-9-]+', '', username)
@@ -723,10 +879,43 @@ def deprecated_function(
723
879
  return new_func
724
880
 
725
881
 
726
- def truncate_long_string(s: str, max_length: int = 35) -> str:
727
- """Truncate a string to a maximum length, preserving whole words."""
882
+ def truncate_long_string(s: str,
883
+ max_length: int = 35,
884
+ truncate_middle: bool = False) -> str:
885
+ """Truncate a string to a maximum length.
886
+
887
+ Args:
888
+ s: String to truncate.
889
+ max_length: Maximum length of the truncated string.
890
+ truncate_middle: Whether to truncate in the middle of the string.
891
+ If True, the middle part of the string is replaced with '...'.
892
+ If False, truncation happens at the end preserving whole words.
893
+
894
+ Returns:
895
+ Truncated string.
896
+ """
728
897
  if len(s) <= max_length:
729
898
  return s
899
+
900
+ if truncate_middle:
901
+ # Reserve 3 characters for '...'
902
+ if max_length <= 3:
903
+ return '...'
904
+
905
+ # Calculate how many characters to keep from beginning and end
906
+ half_length = (max_length - 3) // 2
907
+ remainder = (max_length - 3) % 2
908
+
909
+ # Keep one more character at the beginning if max_length - 3 is odd
910
+ start_length = half_length + remainder
911
+ end_length = half_length
912
+
913
+ # When end_length is 0, just show the start part and '...'
914
+ if end_length == 0:
915
+ return s[:start_length] + '...'
916
+ return s[:start_length] + '...' + s[-end_length:]
917
+
918
+ # Original end-truncation logic
730
919
  splits = s.split(' ')
731
920
  if len(splits[0]) > max_length:
732
921
  return splits[0][:max_length] + '...' # Use '…'?
@@ -810,7 +999,17 @@ def get_mem_size_gb() -> float:
810
999
  except ValueError as e:
811
1000
  with ux_utils.print_exception_no_traceback():
812
1001
  raise ValueError(
813
- f'Failed to parse the memory size from {mem_size}') from e
1002
+ f'Failed to parse the memory size from {mem_size} (GB)'
1003
+ ) from e
1004
+ mem_size = os.getenv('SKYPILOT_POD_MEMORY_BYTES_LIMIT')
1005
+ if mem_size is not None:
1006
+ try:
1007
+ return float(mem_size) / (1024**3)
1008
+ except ValueError as e:
1009
+ with ux_utils.print_exception_no_traceback():
1010
+ raise ValueError(
1011
+ f'Failed to parse the memory size from {mem_size} (bytes)'
1012
+ ) from e
814
1013
  return _mem_size_gb()
815
1014
 
816
1015
 
@@ -900,3 +1099,27 @@ def _get_cgroup_memory_limit() -> Optional[int]:
900
1099
def _is_cgroup_v2() -> bool:
    """Reports whether cgroup v2 is in use.

    Detection is based on the presence of the file
    '/sys/fs/cgroup/cgroup.controllers'.
    """
    controllers_file = '/sys/fs/cgroup/cgroup.controllers'
    return os.path.isfile(controllers_file)
1102
+
1103
+
1104
def removeprefix(string: str, prefix: str) -> str:
    """Returns `string` with a leading `prefix` stripped once.

    If `string` does not start with `prefix`, it is returned unchanged.
    """
    if not string.startswith(prefix):
        return string
    return string[len(prefix):]
1108
+
1109
+
1110
def release_memory():
    """Makes a best-effort attempt to release unused process memory.

    Runs a garbage collection pass and then, on Linux, calls
    `malloc_trim(0)` from 'libc.so.6'.

    Returns:
        The return value of `malloc_trim(0)` on Linux; 0 on other platforms
        or when any step raises (the failure is logged, not propagated).
    """
    try:
        # Collecting first frees the Python heap as far as possible, which
        # lets malloc_trim be more efficient.
        gc.collect()
        if not sys.platform.startswith('linux'):
            return 0
        # Loading 'libc.so.6' will fail on musl (alpine), but it works on
        # our official docker images.
        glibc = ctypes.CDLL('libc.so.6')
        return glibc.malloc_trim(0)
    except Exception as e:  # pylint: disable=broad-except
        logger.error(f'Failed to release memory: '
                     f'{format_exception(e)}')
        return 0
sky/utils/config_utils.py CHANGED
@@ -6,6 +6,28 @@ from sky import sky_logging
6
6
 
7
7
  logger = sky_logging.init_logger(__name__)
8
8
 
9
+ _REGION_CONFIG_CLOUDS = ['nebius', 'oci']
10
+
11
+ # The Kubernetes API uses lists to represent dictionary-like fields whose patch
12
+ # strategy is "merge"; each item is indexed by its patch merge key. The map
13
+ # below maps each such field name to its patch merge key.
14
+ # pylint: disable=line-too-long
15
+ # Ref: https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.27/#podspec-v1-core
16
+ # NOTE: field containers and imagePullSecrets are not included deliberately for
17
+ # backward compatibility (we only support one container per pod now).
18
+ _PATCH_MERGE_KEYS = {
19
+ 'initContainers': 'name',
20
+ 'ephemeralContainers': 'name',
21
+ 'volumes': 'name',
22
+ 'volumeMounts': 'name',
23
+ 'resourceClaims': 'name',
24
+ 'env': 'name',
25
+ 'hostAliases': 'ip',
26
+ 'topologySpreadConstraints': 'topologyKey',
27
+ 'ports': 'containerPort',
28
+ 'volumeDevices': 'devicePath',
29
+ }
30
+
9
31
 
10
32
  class Config(Dict[str, Any]):
11
33
  """SkyPilot config that supports setting/getting values with nested keys."""
@@ -209,20 +231,80 @@ def merge_k8s_configs(
209
231
  merge_k8s_configs(base_config[key][0], value[0],
210
232
  next_allowed_override_keys,
211
233
  next_disallowed_override_keys)
212
- elif key in ['volumes', 'volumeMounts', 'initContainers']:
213
- # If the key is 'volumes', 'volumeMounts', or 'initContainers',
214
- # we search for item with the same name and merge it.
234
+ # For list fields with patch strategy "merge", we merge the list
235
+ # by the patch merge key.
236
+ elif key in _PATCH_MERGE_KEYS:
237
+ patch_merge_key = _PATCH_MERGE_KEYS[key]
215
238
  for override_item in value:
216
- override_item_name = override_item.get('name')
239
+ override_item_name = override_item.get(patch_merge_key)
217
240
  if override_item_name is not None:
218
241
  existing_base_item = next(
219
242
  (v for v in base_config[key]
220
- if v.get('name') == override_item_name), None)
243
+ if v.get(patch_merge_key) == override_item_name),
244
+ None)
221
245
  if existing_base_item is not None:
222
246
  merge_k8s_configs(existing_base_item, override_item)
223
247
  else:
224
248
  base_config[key].append(override_item)
249
+ else:
250
+ base_config[key].append(override_item)
225
251
  else:
226
252
  base_config[key].extend(value)
227
253
  else:
228
254
  base_config[key] = value
255
+
256
+
257
def get_cloud_config_value_from_dict(
        dict_config: Dict[str, Any],
        cloud: str,
        keys: Tuple[str, ...],
        region: Optional[str] = None,
        default_value: Optional[Any] = None,
        override_configs: Optional[Dict[str, Any]] = None) -> Any:
    """Looks up a nested config value for a cloud, preferring region scope.

    Lookup order:
      1. If `region` is given and the cloud supports per-region config
         ('context_configs' for kubernetes, 'region_configs' for clouds in
         _REGION_CONFIG_CLOUDS), read <cloud>/<region_key>/<region>/keys.
         For clouds in _REGION_CONFIG_CLOUDS, if that value is falsy, the
         legacy location <cloud>/<region>/keys is tried and a notice is
         logged when it is found.
      2. Read the cloud-level value at <cloud>/keys, falling back to
         `default_value`.

    For kubernetes, when both the cloud-level and the per-context values are
    dicts, the per-context value is merged into the cloud-level one via
    `merge_k8s_configs` and the merged dict is returned. Otherwise the
    region-level value is returned if it is not None, else the cloud-level
    value.
    """
    config = Config(dict_config)

    if cloud == 'kubernetes':
        region_key = 'context_configs'
    elif cloud in _REGION_CONFIG_CLOUDS:
        region_key = 'region_configs'
    else:
        region_key = None

    region_value = None
    if region is not None and region_key is not None:
        region_value = config.get_nested(
            keys=(cloud, region_key, region) + keys,
            default_value=None,
            override_configs=override_configs)
        if not region_value and cloud in _REGION_CONFIG_CLOUDS:
            # TODO (kyuds): Backward compatibility, remove after 0.11.0.
            region_value = config.get_nested(
                keys=(cloud, region) + keys,
                default_value=None,
                override_configs=override_configs)
            if region_value is not None:
                logger.info(
                    f'{cloud} configuration is using the legacy format. \n'
                    'This format will be deprecated after 0.11.0, refer to '
                    '`https://docs.skypilot.co/en/latest/reference/config.html` '  # pylint: disable=line-too-long
                    'for the new format. Please use `region_configs` to specify region specific configuration.'  # pylint: disable=line-too-long
                )

    # Cloud-level value; also the fallback when no region override exists.
    cloud_value = config.get_nested(keys=(cloud,) + keys,
                                    default_value=default_value,
                                    override_configs=override_configs)

    both_are_dicts = (isinstance(cloud_value, dict) and
                      isinstance(region_value, dict))
    if cloud == 'kubernetes' and both_are_dicts:
        merge_k8s_configs(cloud_value, region_value)
        return cloud_value
    if region_value is None:
        return cloud_value
    return region_value