skypilot-nightly 1.0.0.dev20250502__py3-none-any.whl → 1.0.0.dev20251203__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (546)
  1. sky/__init__.py +22 -6
  2. sky/adaptors/aws.py +81 -16
  3. sky/adaptors/common.py +25 -2
  4. sky/adaptors/coreweave.py +278 -0
  5. sky/adaptors/do.py +8 -2
  6. sky/adaptors/gcp.py +11 -0
  7. sky/adaptors/hyperbolic.py +8 -0
  8. sky/adaptors/ibm.py +5 -2
  9. sky/adaptors/kubernetes.py +149 -18
  10. sky/adaptors/nebius.py +173 -30
  11. sky/adaptors/primeintellect.py +1 -0
  12. sky/adaptors/runpod.py +68 -0
  13. sky/adaptors/seeweb.py +183 -0
  14. sky/adaptors/shadeform.py +89 -0
  15. sky/admin_policy.py +187 -4
  16. sky/authentication.py +179 -225
  17. sky/backends/__init__.py +4 -2
  18. sky/backends/backend.py +22 -9
  19. sky/backends/backend_utils.py +1323 -397
  20. sky/backends/cloud_vm_ray_backend.py +1749 -1029
  21. sky/backends/docker_utils.py +1 -1
  22. sky/backends/local_docker_backend.py +11 -6
  23. sky/backends/task_codegen.py +633 -0
  24. sky/backends/wheel_utils.py +55 -9
  25. sky/{clouds/service_catalog → catalog}/__init__.py +21 -19
  26. sky/{clouds/service_catalog → catalog}/aws_catalog.py +27 -8
  27. sky/{clouds/service_catalog → catalog}/azure_catalog.py +10 -7
  28. sky/{clouds/service_catalog → catalog}/common.py +90 -49
  29. sky/{clouds/service_catalog → catalog}/cudo_catalog.py +8 -5
  30. sky/{clouds/service_catalog → catalog}/data_fetchers/analyze.py +1 -1
  31. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_aws.py +116 -80
  32. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_cudo.py +38 -38
  33. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_gcp.py +70 -16
  34. sky/catalog/data_fetchers/fetch_hyperbolic.py +136 -0
  35. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_lambda_cloud.py +1 -0
  36. sky/catalog/data_fetchers/fetch_nebius.py +338 -0
  37. sky/catalog/data_fetchers/fetch_runpod.py +698 -0
  38. sky/catalog/data_fetchers/fetch_seeweb.py +329 -0
  39. sky/catalog/data_fetchers/fetch_shadeform.py +142 -0
  40. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_vast.py +1 -1
  41. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_vsphere.py +1 -1
  42. sky/{clouds/service_catalog → catalog}/do_catalog.py +5 -2
  43. sky/{clouds/service_catalog → catalog}/fluidstack_catalog.py +6 -3
  44. sky/{clouds/service_catalog → catalog}/gcp_catalog.py +41 -15
  45. sky/catalog/hyperbolic_catalog.py +136 -0
  46. sky/{clouds/service_catalog → catalog}/ibm_catalog.py +9 -6
  47. sky/{clouds/service_catalog → catalog}/kubernetes_catalog.py +36 -24
  48. sky/{clouds/service_catalog → catalog}/lambda_catalog.py +9 -6
  49. sky/{clouds/service_catalog → catalog}/nebius_catalog.py +9 -7
  50. sky/{clouds/service_catalog → catalog}/oci_catalog.py +9 -6
  51. sky/{clouds/service_catalog → catalog}/paperspace_catalog.py +5 -2
  52. sky/catalog/primeintellect_catalog.py +95 -0
  53. sky/{clouds/service_catalog → catalog}/runpod_catalog.py +11 -4
  54. sky/{clouds/service_catalog → catalog}/scp_catalog.py +9 -6
  55. sky/catalog/seeweb_catalog.py +184 -0
  56. sky/catalog/shadeform_catalog.py +165 -0
  57. sky/catalog/ssh_catalog.py +167 -0
  58. sky/{clouds/service_catalog → catalog}/vast_catalog.py +6 -3
  59. sky/{clouds/service_catalog → catalog}/vsphere_catalog.py +5 -2
  60. sky/check.py +533 -185
  61. sky/cli.py +5 -5975
  62. sky/client/{cli.py → cli/command.py} +2591 -1956
  63. sky/client/cli/deprecation_utils.py +99 -0
  64. sky/client/cli/flags.py +359 -0
  65. sky/client/cli/table_utils.py +322 -0
  66. sky/client/cli/utils.py +79 -0
  67. sky/client/common.py +78 -32
  68. sky/client/oauth.py +82 -0
  69. sky/client/sdk.py +1219 -319
  70. sky/client/sdk_async.py +827 -0
  71. sky/client/service_account_auth.py +47 -0
  72. sky/cloud_stores.py +82 -3
  73. sky/clouds/__init__.py +13 -0
  74. sky/clouds/aws.py +564 -164
  75. sky/clouds/azure.py +105 -83
  76. sky/clouds/cloud.py +140 -40
  77. sky/clouds/cudo.py +68 -50
  78. sky/clouds/do.py +66 -48
  79. sky/clouds/fluidstack.py +63 -44
  80. sky/clouds/gcp.py +339 -110
  81. sky/clouds/hyperbolic.py +293 -0
  82. sky/clouds/ibm.py +70 -49
  83. sky/clouds/kubernetes.py +570 -162
  84. sky/clouds/lambda_cloud.py +74 -54
  85. sky/clouds/nebius.py +210 -81
  86. sky/clouds/oci.py +88 -66
  87. sky/clouds/paperspace.py +61 -44
  88. sky/clouds/primeintellect.py +317 -0
  89. sky/clouds/runpod.py +164 -74
  90. sky/clouds/scp.py +89 -86
  91. sky/clouds/seeweb.py +477 -0
  92. sky/clouds/shadeform.py +400 -0
  93. sky/clouds/ssh.py +263 -0
  94. sky/clouds/utils/aws_utils.py +10 -4
  95. sky/clouds/utils/gcp_utils.py +87 -11
  96. sky/clouds/utils/oci_utils.py +38 -14
  97. sky/clouds/utils/scp_utils.py +231 -167
  98. sky/clouds/vast.py +99 -77
  99. sky/clouds/vsphere.py +51 -40
  100. sky/core.py +375 -173
  101. sky/dag.py +15 -0
  102. sky/dashboard/out/404.html +1 -1
  103. sky/dashboard/out/_next/static/96_E2yl3QAiIJGOYCkSpB/_buildManifest.js +1 -0
  104. sky/dashboard/out/_next/static/chunks/1141-e6aa9ab418717c59.js +11 -0
  105. sky/dashboard/out/_next/static/chunks/1272-1ef0bf0237faccdb.js +1 -0
  106. sky/dashboard/out/_next/static/chunks/1871-7e202677c42f43fe.js +6 -0
  107. sky/dashboard/out/_next/static/chunks/2260-7703229c33c5ebd5.js +1 -0
  108. sky/dashboard/out/_next/static/chunks/2350.fab69e61bac57b23.js +1 -0
  109. sky/dashboard/out/_next/static/chunks/2369.fc20f0c2c8ed9fe7.js +15 -0
  110. sky/dashboard/out/_next/static/chunks/2755.edd818326d489a1d.js +26 -0
  111. sky/dashboard/out/_next/static/chunks/3294.20a8540fe697d5ee.js +1 -0
  112. sky/dashboard/out/_next/static/chunks/3785.7e245f318f9d1121.js +1 -0
  113. sky/dashboard/out/_next/static/chunks/3800-7b45f9fbb6308557.js +1 -0
  114. sky/dashboard/out/_next/static/chunks/3850-ff4a9a69d978632b.js +1 -0
  115. sky/dashboard/out/_next/static/chunks/3937.210053269f121201.js +1 -0
  116. sky/dashboard/out/_next/static/chunks/4725.172ede95d1b21022.js +1 -0
  117. sky/dashboard/out/_next/static/chunks/4937.a2baa2df5572a276.js +15 -0
  118. sky/dashboard/out/_next/static/chunks/5739-d67458fcb1386c92.js +8 -0
  119. sky/dashboard/out/_next/static/chunks/6130-2be46d70a38f1e82.js +1 -0
  120. sky/dashboard/out/_next/static/chunks/616-3d59f75e2ccf9321.js +39 -0
  121. sky/dashboard/out/_next/static/chunks/6212-7bd06f60ba693125.js +13 -0
  122. sky/dashboard/out/_next/static/chunks/6856-8f27d1c10c98def8.js +1 -0
  123. sky/dashboard/out/_next/static/chunks/6989-01359c57e018caa4.js +1 -0
  124. sky/dashboard/out/_next/static/chunks/6990-9146207c4567fdfd.js +1 -0
  125. sky/dashboard/out/_next/static/chunks/7359-c8d04e06886000b3.js +30 -0
  126. sky/dashboard/out/_next/static/chunks/7411-b15471acd2cba716.js +41 -0
  127. sky/dashboard/out/_next/static/chunks/7615-019513abc55b3b47.js +1 -0
  128. sky/dashboard/out/_next/static/chunks/8640.5b9475a2d18c5416.js +16 -0
  129. sky/dashboard/out/_next/static/chunks/8969-452f9d5cbdd2dc73.js +1 -0
  130. sky/dashboard/out/_next/static/chunks/9025.fa408f3242e9028d.js +6 -0
  131. sky/dashboard/out/_next/static/chunks/9353-cff34f7e773b2e2b.js +1 -0
  132. sky/dashboard/out/_next/static/chunks/9360.a536cf6b1fa42355.js +31 -0
  133. sky/dashboard/out/_next/static/chunks/9847.3aaca6bb33455140.js +30 -0
  134. sky/dashboard/out/_next/static/chunks/fd9d1056-86323a29a8f7e46a.js +1 -0
  135. sky/dashboard/out/_next/static/chunks/framework-cf60a09ccd051a10.js +33 -0
  136. sky/dashboard/out/_next/static/chunks/main-app-587214043926b3cc.js +1 -0
  137. sky/dashboard/out/_next/static/chunks/main-f15ccb73239a3bf1.js +1 -0
  138. sky/dashboard/out/_next/static/chunks/pages/_app-bde01e4a2beec258.js +34 -0
  139. sky/dashboard/out/_next/static/chunks/pages/_error-c66a4e8afc46f17b.js +1 -0
  140. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-792db96d918c98c9.js +16 -0
  141. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-abfcac9c137aa543.js +1 -0
  142. sky/dashboard/out/_next/static/chunks/pages/clusters-ee39056f9851a3ff.js +1 -0
  143. sky/dashboard/out/_next/static/chunks/pages/config-dfb9bf07b13045f4.js +1 -0
  144. sky/dashboard/out/_next/static/chunks/pages/index-444f1804401f04ea.js +1 -0
  145. sky/dashboard/out/_next/static/chunks/pages/infra/[context]-c0b5935149902e6f.js +1 -0
  146. sky/dashboard/out/_next/static/chunks/pages/infra-aed0ea19df7cf961.js +1 -0
  147. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-d66997e2bfc837cf.js +16 -0
  148. sky/dashboard/out/_next/static/chunks/pages/jobs/pools/[pool]-9faf940b253e3e06.js +21 -0
  149. sky/dashboard/out/_next/static/chunks/pages/jobs-2072b48b617989c9.js +1 -0
  150. sky/dashboard/out/_next/static/chunks/pages/users-f42674164aa73423.js +1 -0
  151. sky/dashboard/out/_next/static/chunks/pages/volumes-b84b948ff357c43e.js +1 -0
  152. sky/dashboard/out/_next/static/chunks/pages/workspace/new-3f88a1c7e86a3f86.js +1 -0
  153. sky/dashboard/out/_next/static/chunks/pages/workspaces/[name]-84a40f8c7c627fe4.js +1 -0
  154. sky/dashboard/out/_next/static/chunks/pages/workspaces-531b2f8c4bf89f82.js +1 -0
  155. sky/dashboard/out/_next/static/chunks/webpack-64e05f17bf2cf8ce.js +1 -0
  156. sky/dashboard/out/_next/static/css/0748ce22df867032.css +3 -0
  157. sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
  158. sky/dashboard/out/clusters/[cluster].html +1 -1
  159. sky/dashboard/out/clusters.html +1 -1
  160. sky/dashboard/out/config.html +1 -0
  161. sky/dashboard/out/index.html +1 -1
  162. sky/dashboard/out/infra/[context].html +1 -0
  163. sky/dashboard/out/infra.html +1 -0
  164. sky/dashboard/out/jobs/[job].html +1 -1
  165. sky/dashboard/out/jobs/pools/[pool].html +1 -0
  166. sky/dashboard/out/jobs.html +1 -1
  167. sky/dashboard/out/users.html +1 -0
  168. sky/dashboard/out/volumes.html +1 -0
  169. sky/dashboard/out/workspace/new.html +1 -0
  170. sky/dashboard/out/workspaces/[name].html +1 -0
  171. sky/dashboard/out/workspaces.html +1 -0
  172. sky/data/data_utils.py +137 -1
  173. sky/data/mounting_utils.py +269 -84
  174. sky/data/storage.py +1460 -1807
  175. sky/data/storage_utils.py +43 -57
  176. sky/exceptions.py +126 -2
  177. sky/execution.py +216 -63
  178. sky/global_user_state.py +2390 -586
  179. sky/jobs/__init__.py +7 -0
  180. sky/jobs/client/sdk.py +300 -58
  181. sky/jobs/client/sdk_async.py +161 -0
  182. sky/jobs/constants.py +15 -8
  183. sky/jobs/controller.py +848 -275
  184. sky/jobs/file_content_utils.py +128 -0
  185. sky/jobs/log_gc.py +193 -0
  186. sky/jobs/recovery_strategy.py +402 -152
  187. sky/jobs/scheduler.py +314 -189
  188. sky/jobs/server/core.py +836 -255
  189. sky/jobs/server/server.py +156 -115
  190. sky/jobs/server/utils.py +136 -0
  191. sky/jobs/state.py +2109 -706
  192. sky/jobs/utils.py +1306 -215
  193. sky/logs/__init__.py +21 -0
  194. sky/logs/agent.py +108 -0
  195. sky/logs/aws.py +243 -0
  196. sky/logs/gcp.py +91 -0
  197. sky/metrics/__init__.py +0 -0
  198. sky/metrics/utils.py +453 -0
  199. sky/models.py +78 -1
  200. sky/optimizer.py +164 -70
  201. sky/provision/__init__.py +90 -4
  202. sky/provision/aws/config.py +147 -26
  203. sky/provision/aws/instance.py +136 -50
  204. sky/provision/azure/instance.py +11 -6
  205. sky/provision/common.py +13 -1
  206. sky/provision/cudo/cudo_machine_type.py +1 -1
  207. sky/provision/cudo/cudo_utils.py +14 -8
  208. sky/provision/cudo/cudo_wrapper.py +72 -71
  209. sky/provision/cudo/instance.py +10 -6
  210. sky/provision/do/instance.py +10 -6
  211. sky/provision/do/utils.py +4 -3
  212. sky/provision/docker_utils.py +140 -33
  213. sky/provision/fluidstack/instance.py +13 -8
  214. sky/provision/gcp/__init__.py +1 -0
  215. sky/provision/gcp/config.py +301 -19
  216. sky/provision/gcp/constants.py +218 -0
  217. sky/provision/gcp/instance.py +36 -8
  218. sky/provision/gcp/instance_utils.py +18 -4
  219. sky/provision/gcp/volume_utils.py +247 -0
  220. sky/provision/hyperbolic/__init__.py +12 -0
  221. sky/provision/hyperbolic/config.py +10 -0
  222. sky/provision/hyperbolic/instance.py +437 -0
  223. sky/provision/hyperbolic/utils.py +373 -0
  224. sky/provision/instance_setup.py +101 -20
  225. sky/provision/kubernetes/__init__.py +5 -0
  226. sky/provision/kubernetes/config.py +9 -52
  227. sky/provision/kubernetes/constants.py +17 -0
  228. sky/provision/kubernetes/instance.py +919 -280
  229. sky/provision/kubernetes/manifests/fusermount-server-daemonset.yaml +1 -2
  230. sky/provision/kubernetes/network.py +27 -17
  231. sky/provision/kubernetes/network_utils.py +44 -43
  232. sky/provision/kubernetes/utils.py +1221 -534
  233. sky/provision/kubernetes/volume.py +343 -0
  234. sky/provision/lambda_cloud/instance.py +22 -16
  235. sky/provision/nebius/constants.py +50 -0
  236. sky/provision/nebius/instance.py +19 -6
  237. sky/provision/nebius/utils.py +237 -137
  238. sky/provision/oci/instance.py +10 -5
  239. sky/provision/paperspace/instance.py +10 -7
  240. sky/provision/paperspace/utils.py +1 -1
  241. sky/provision/primeintellect/__init__.py +10 -0
  242. sky/provision/primeintellect/config.py +11 -0
  243. sky/provision/primeintellect/instance.py +454 -0
  244. sky/provision/primeintellect/utils.py +398 -0
  245. sky/provision/provisioner.py +117 -36
  246. sky/provision/runpod/__init__.py +5 -0
  247. sky/provision/runpod/instance.py +27 -6
  248. sky/provision/runpod/utils.py +51 -18
  249. sky/provision/runpod/volume.py +214 -0
  250. sky/provision/scp/__init__.py +15 -0
  251. sky/provision/scp/config.py +93 -0
  252. sky/provision/scp/instance.py +707 -0
  253. sky/provision/seeweb/__init__.py +11 -0
  254. sky/provision/seeweb/config.py +13 -0
  255. sky/provision/seeweb/instance.py +812 -0
  256. sky/provision/shadeform/__init__.py +11 -0
  257. sky/provision/shadeform/config.py +12 -0
  258. sky/provision/shadeform/instance.py +351 -0
  259. sky/provision/shadeform/shadeform_utils.py +83 -0
  260. sky/provision/ssh/__init__.py +18 -0
  261. sky/provision/vast/instance.py +13 -8
  262. sky/provision/vast/utils.py +10 -7
  263. sky/provision/volume.py +164 -0
  264. sky/provision/vsphere/common/ssl_helper.py +1 -1
  265. sky/provision/vsphere/common/vapiconnect.py +2 -1
  266. sky/provision/vsphere/common/vim_utils.py +4 -4
  267. sky/provision/vsphere/instance.py +15 -10
  268. sky/provision/vsphere/vsphere_utils.py +17 -20
  269. sky/py.typed +0 -0
  270. sky/resources.py +845 -119
  271. sky/schemas/__init__.py +0 -0
  272. sky/schemas/api/__init__.py +0 -0
  273. sky/schemas/api/responses.py +227 -0
  274. sky/schemas/db/README +4 -0
  275. sky/schemas/db/env.py +90 -0
  276. sky/schemas/db/global_user_state/001_initial_schema.py +124 -0
  277. sky/schemas/db/global_user_state/002_add_workspace_to_cluster_history.py +35 -0
  278. sky/schemas/db/global_user_state/003_fix_initial_revision.py +61 -0
  279. sky/schemas/db/global_user_state/004_is_managed.py +34 -0
  280. sky/schemas/db/global_user_state/005_cluster_event.py +32 -0
  281. sky/schemas/db/global_user_state/006_provision_log.py +41 -0
  282. sky/schemas/db/global_user_state/007_cluster_event_request_id.py +34 -0
  283. sky/schemas/db/global_user_state/008_skylet_ssh_tunnel_metadata.py +34 -0
  284. sky/schemas/db/global_user_state/009_last_activity_and_launched_at.py +89 -0
  285. sky/schemas/db/global_user_state/010_save_ssh_key.py +66 -0
  286. sky/schemas/db/global_user_state/011_is_ephemeral.py +34 -0
  287. sky/schemas/db/kv_cache/001_initial_schema.py +29 -0
  288. sky/schemas/db/script.py.mako +28 -0
  289. sky/schemas/db/serve_state/001_initial_schema.py +67 -0
  290. sky/schemas/db/serve_state/002_yaml_content.py +34 -0
  291. sky/schemas/db/skypilot_config/001_initial_schema.py +30 -0
  292. sky/schemas/db/spot_jobs/001_initial_schema.py +97 -0
  293. sky/schemas/db/spot_jobs/002_cluster_pool.py +42 -0
  294. sky/schemas/db/spot_jobs/003_pool_hash.py +34 -0
  295. sky/schemas/db/spot_jobs/004_job_file_contents.py +42 -0
  296. sky/schemas/db/spot_jobs/005_logs_gc.py +38 -0
  297. sky/schemas/db/spot_jobs/006_controller_pid_started_at.py +34 -0
  298. sky/schemas/db/spot_jobs/007_config_file_content.py +34 -0
  299. sky/schemas/generated/__init__.py +0 -0
  300. sky/schemas/generated/autostopv1_pb2.py +36 -0
  301. sky/schemas/generated/autostopv1_pb2.pyi +43 -0
  302. sky/schemas/generated/autostopv1_pb2_grpc.py +146 -0
  303. sky/schemas/generated/jobsv1_pb2.py +86 -0
  304. sky/schemas/generated/jobsv1_pb2.pyi +254 -0
  305. sky/schemas/generated/jobsv1_pb2_grpc.py +542 -0
  306. sky/schemas/generated/managed_jobsv1_pb2.py +76 -0
  307. sky/schemas/generated/managed_jobsv1_pb2.pyi +278 -0
  308. sky/schemas/generated/managed_jobsv1_pb2_grpc.py +278 -0
  309. sky/schemas/generated/servev1_pb2.py +58 -0
  310. sky/schemas/generated/servev1_pb2.pyi +115 -0
  311. sky/schemas/generated/servev1_pb2_grpc.py +322 -0
  312. sky/serve/autoscalers.py +357 -5
  313. sky/serve/client/impl.py +310 -0
  314. sky/serve/client/sdk.py +47 -139
  315. sky/serve/client/sdk_async.py +130 -0
  316. sky/serve/constants.py +12 -9
  317. sky/serve/controller.py +68 -17
  318. sky/serve/load_balancer.py +106 -60
  319. sky/serve/load_balancing_policies.py +116 -2
  320. sky/serve/replica_managers.py +434 -249
  321. sky/serve/serve_rpc_utils.py +179 -0
  322. sky/serve/serve_state.py +569 -257
  323. sky/serve/serve_utils.py +775 -265
  324. sky/serve/server/core.py +66 -711
  325. sky/serve/server/impl.py +1093 -0
  326. sky/serve/server/server.py +21 -18
  327. sky/serve/service.py +192 -89
  328. sky/serve/service_spec.py +144 -20
  329. sky/serve/spot_placer.py +3 -0
  330. sky/server/auth/__init__.py +0 -0
  331. sky/server/auth/authn.py +50 -0
  332. sky/server/auth/loopback.py +38 -0
  333. sky/server/auth/oauth2_proxy.py +202 -0
  334. sky/server/common.py +478 -182
  335. sky/server/config.py +85 -23
  336. sky/server/constants.py +44 -6
  337. sky/server/daemons.py +295 -0
  338. sky/server/html/token_page.html +185 -0
  339. sky/server/metrics.py +160 -0
  340. sky/server/middleware_utils.py +166 -0
  341. sky/server/requests/executor.py +558 -138
  342. sky/server/requests/payloads.py +364 -24
  343. sky/server/requests/preconditions.py +21 -17
  344. sky/server/requests/process.py +112 -29
  345. sky/server/requests/request_names.py +121 -0
  346. sky/server/requests/requests.py +822 -226
  347. sky/server/requests/serializers/decoders.py +82 -31
  348. sky/server/requests/serializers/encoders.py +140 -22
  349. sky/server/requests/threads.py +117 -0
  350. sky/server/rest.py +455 -0
  351. sky/server/server.py +1309 -285
  352. sky/server/state.py +20 -0
  353. sky/server/stream_utils.py +327 -61
  354. sky/server/uvicorn.py +217 -3
  355. sky/server/versions.py +270 -0
  356. sky/setup_files/MANIFEST.in +11 -1
  357. sky/setup_files/alembic.ini +160 -0
  358. sky/setup_files/dependencies.py +139 -31
  359. sky/setup_files/setup.py +44 -42
  360. sky/sky_logging.py +114 -7
  361. sky/skylet/attempt_skylet.py +106 -24
  362. sky/skylet/autostop_lib.py +129 -8
  363. sky/skylet/configs.py +29 -20
  364. sky/skylet/constants.py +216 -25
  365. sky/skylet/events.py +101 -21
  366. sky/skylet/job_lib.py +345 -164
  367. sky/skylet/log_lib.py +297 -18
  368. sky/skylet/log_lib.pyi +44 -1
  369. sky/skylet/providers/ibm/node_provider.py +12 -8
  370. sky/skylet/providers/ibm/vpc_provider.py +13 -12
  371. sky/skylet/ray_patches/__init__.py +17 -3
  372. sky/skylet/ray_patches/autoscaler.py.diff +18 -0
  373. sky/skylet/ray_patches/cli.py.diff +19 -0
  374. sky/skylet/ray_patches/command_runner.py.diff +17 -0
  375. sky/skylet/ray_patches/log_monitor.py.diff +20 -0
  376. sky/skylet/ray_patches/resource_demand_scheduler.py.diff +32 -0
  377. sky/skylet/ray_patches/updater.py.diff +18 -0
  378. sky/skylet/ray_patches/worker.py.diff +41 -0
  379. sky/skylet/runtime_utils.py +21 -0
  380. sky/skylet/services.py +568 -0
  381. sky/skylet/skylet.py +72 -4
  382. sky/skylet/subprocess_daemon.py +104 -29
  383. sky/skypilot_config.py +506 -99
  384. sky/ssh_node_pools/__init__.py +1 -0
  385. sky/ssh_node_pools/core.py +135 -0
  386. sky/ssh_node_pools/server.py +233 -0
  387. sky/task.py +685 -163
  388. sky/templates/aws-ray.yml.j2 +11 -3
  389. sky/templates/azure-ray.yml.j2 +2 -1
  390. sky/templates/cudo-ray.yml.j2 +1 -0
  391. sky/templates/do-ray.yml.j2 +2 -1
  392. sky/templates/fluidstack-ray.yml.j2 +1 -0
  393. sky/templates/gcp-ray.yml.j2 +62 -1
  394. sky/templates/hyperbolic-ray.yml.j2 +68 -0
  395. sky/templates/ibm-ray.yml.j2 +2 -1
  396. sky/templates/jobs-controller.yaml.j2 +27 -24
  397. sky/templates/kubernetes-loadbalancer.yml.j2 +2 -0
  398. sky/templates/kubernetes-ray.yml.j2 +611 -50
  399. sky/templates/lambda-ray.yml.j2 +2 -1
  400. sky/templates/nebius-ray.yml.j2 +34 -12
  401. sky/templates/oci-ray.yml.j2 +1 -0
  402. sky/templates/paperspace-ray.yml.j2 +2 -1
  403. sky/templates/primeintellect-ray.yml.j2 +72 -0
  404. sky/templates/runpod-ray.yml.j2 +10 -1
  405. sky/templates/scp-ray.yml.j2 +4 -50
  406. sky/templates/seeweb-ray.yml.j2 +171 -0
  407. sky/templates/shadeform-ray.yml.j2 +73 -0
  408. sky/templates/sky-serve-controller.yaml.j2 +22 -2
  409. sky/templates/vast-ray.yml.j2 +1 -0
  410. sky/templates/vsphere-ray.yml.j2 +1 -0
  411. sky/templates/websocket_proxy.py +212 -37
  412. sky/usage/usage_lib.py +31 -15
  413. sky/users/__init__.py +0 -0
  414. sky/users/model.conf +15 -0
  415. sky/users/permission.py +397 -0
  416. sky/users/rbac.py +121 -0
  417. sky/users/server.py +720 -0
  418. sky/users/token_service.py +218 -0
  419. sky/utils/accelerator_registry.py +35 -5
  420. sky/utils/admin_policy_utils.py +84 -38
  421. sky/utils/annotations.py +38 -5
  422. sky/utils/asyncio_utils.py +78 -0
  423. sky/utils/atomic.py +1 -1
  424. sky/utils/auth_utils.py +153 -0
  425. sky/utils/benchmark_utils.py +60 -0
  426. sky/utils/cli_utils/status_utils.py +159 -86
  427. sky/utils/cluster_utils.py +31 -9
  428. sky/utils/command_runner.py +354 -68
  429. sky/utils/command_runner.pyi +93 -3
  430. sky/utils/common.py +35 -8
  431. sky/utils/common_utils.py +314 -91
  432. sky/utils/config_utils.py +74 -5
  433. sky/utils/context.py +403 -0
  434. sky/utils/context_utils.py +242 -0
  435. sky/utils/controller_utils.py +383 -89
  436. sky/utils/dag_utils.py +31 -12
  437. sky/utils/db/__init__.py +0 -0
  438. sky/utils/db/db_utils.py +485 -0
  439. sky/utils/db/kv_cache.py +149 -0
  440. sky/utils/db/migration_utils.py +137 -0
  441. sky/utils/directory_utils.py +12 -0
  442. sky/utils/env_options.py +13 -0
  443. sky/utils/git.py +567 -0
  444. sky/utils/git_clone.sh +460 -0
  445. sky/utils/infra_utils.py +195 -0
  446. sky/utils/kubernetes/cleanup-tunnel.sh +62 -0
  447. sky/utils/kubernetes/config_map_utils.py +133 -0
  448. sky/utils/kubernetes/create_cluster.sh +15 -29
  449. sky/utils/kubernetes/delete_cluster.sh +10 -7
  450. sky/utils/kubernetes/deploy_ssh_node_pools.py +1177 -0
  451. sky/utils/kubernetes/exec_kubeconfig_converter.py +22 -31
  452. sky/utils/kubernetes/generate_kind_config.py +6 -66
  453. sky/utils/kubernetes/generate_kubeconfig.sh +4 -1
  454. sky/utils/kubernetes/gpu_labeler.py +18 -8
  455. sky/utils/kubernetes/k8s_gpu_labeler_job.yaml +2 -1
  456. sky/utils/kubernetes/k8s_gpu_labeler_setup.yaml +16 -16
  457. sky/utils/kubernetes/kubernetes_deploy_utils.py +284 -114
  458. sky/utils/kubernetes/rsync_helper.sh +11 -3
  459. sky/utils/kubernetes/ssh-tunnel.sh +379 -0
  460. sky/utils/kubernetes/ssh_utils.py +221 -0
  461. sky/utils/kubernetes_enums.py +8 -15
  462. sky/utils/lock_events.py +94 -0
  463. sky/utils/locks.py +416 -0
  464. sky/utils/log_utils.py +82 -107
  465. sky/utils/perf_utils.py +22 -0
  466. sky/utils/resource_checker.py +298 -0
  467. sky/utils/resources_utils.py +249 -32
  468. sky/utils/rich_utils.py +217 -39
  469. sky/utils/schemas.py +955 -160
  470. sky/utils/serialize_utils.py +16 -0
  471. sky/utils/status_lib.py +10 -0
  472. sky/utils/subprocess_utils.py +29 -15
  473. sky/utils/tempstore.py +70 -0
  474. sky/utils/thread_utils.py +91 -0
  475. sky/utils/timeline.py +26 -53
  476. sky/utils/ux_utils.py +84 -15
  477. sky/utils/validator.py +11 -1
  478. sky/utils/volume.py +165 -0
  479. sky/utils/yaml_utils.py +111 -0
  480. sky/volumes/__init__.py +13 -0
  481. sky/volumes/client/__init__.py +0 -0
  482. sky/volumes/client/sdk.py +150 -0
  483. sky/volumes/server/__init__.py +0 -0
  484. sky/volumes/server/core.py +270 -0
  485. sky/volumes/server/server.py +124 -0
  486. sky/volumes/volume.py +215 -0
  487. sky/workspaces/__init__.py +0 -0
  488. sky/workspaces/core.py +655 -0
  489. sky/workspaces/server.py +101 -0
  490. sky/workspaces/utils.py +56 -0
  491. sky_templates/README.md +3 -0
  492. sky_templates/__init__.py +3 -0
  493. sky_templates/ray/__init__.py +0 -0
  494. sky_templates/ray/start_cluster +183 -0
  495. sky_templates/ray/stop_cluster +75 -0
  496. skypilot_nightly-1.0.0.dev20251203.dist-info/METADATA +676 -0
  497. skypilot_nightly-1.0.0.dev20251203.dist-info/RECORD +611 -0
  498. {skypilot_nightly-1.0.0.dev20250502.dist-info → skypilot_nightly-1.0.0.dev20251203.dist-info}/WHEEL +1 -1
  499. skypilot_nightly-1.0.0.dev20251203.dist-info/top_level.txt +2 -0
  500. sky/benchmark/benchmark_state.py +0 -256
  501. sky/benchmark/benchmark_utils.py +0 -641
  502. sky/clouds/service_catalog/constants.py +0 -7
  503. sky/dashboard/out/_next/static/GWvVBSCS7FmUiVmjaL1a7/_buildManifest.js +0 -1
  504. sky/dashboard/out/_next/static/chunks/236-2db3ee3fba33dd9e.js +0 -6
  505. sky/dashboard/out/_next/static/chunks/312-c3c8845990db8ffc.js +0 -15
  506. sky/dashboard/out/_next/static/chunks/37-0a572fe0dbb89c4d.js +0 -6
  507. sky/dashboard/out/_next/static/chunks/678-206dddca808e6d16.js +0 -59
  508. sky/dashboard/out/_next/static/chunks/845-9e60713e0c441abc.js +0 -1
  509. sky/dashboard/out/_next/static/chunks/979-7bf73a4c7cea0f5c.js +0 -1
  510. sky/dashboard/out/_next/static/chunks/fd9d1056-2821b0f0cabcd8bd.js +0 -1
  511. sky/dashboard/out/_next/static/chunks/framework-87d061ee6ed71b28.js +0 -33
  512. sky/dashboard/out/_next/static/chunks/main-app-241eb28595532291.js +0 -1
  513. sky/dashboard/out/_next/static/chunks/main-e0e2335212e72357.js +0 -1
  514. sky/dashboard/out/_next/static/chunks/pages/_app-e6b013bc3f77ad60.js +0 -1
  515. sky/dashboard/out/_next/static/chunks/pages/_error-1be831200e60c5c0.js +0 -1
  516. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-6ac338bc2239cb45.js +0 -1
  517. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-f383db7389368ea7.js +0 -1
  518. sky/dashboard/out/_next/static/chunks/pages/clusters-a93b93e10b8b074e.js +0 -1
  519. sky/dashboard/out/_next/static/chunks/pages/index-f9f039532ca8cbc4.js +0 -1
  520. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-1c519e1afc523dc9.js +0 -1
  521. sky/dashboard/out/_next/static/chunks/pages/jobs-a75029b67aab6a2e.js +0 -1
  522. sky/dashboard/out/_next/static/chunks/webpack-830f59b8404e96b8.js +0 -1
  523. sky/dashboard/out/_next/static/css/c6933bbb2ce7f4dd.css +0 -3
  524. sky/jobs/dashboard/dashboard.py +0 -223
  525. sky/jobs/dashboard/static/favicon.ico +0 -0
  526. sky/jobs/dashboard/templates/index.html +0 -831
  527. sky/jobs/server/dashboard_utils.py +0 -69
  528. sky/skylet/providers/scp/__init__.py +0 -2
  529. sky/skylet/providers/scp/config.py +0 -149
  530. sky/skylet/providers/scp/node_provider.py +0 -578
  531. sky/templates/kubernetes-ssh-jump.yml.j2 +0 -94
  532. sky/utils/db_utils.py +0 -100
  533. sky/utils/kubernetes/deploy_remote_cluster.sh +0 -308
  534. sky/utils/kubernetes/ssh_jump_lifecycle_manager.py +0 -191
  535. skypilot_nightly-1.0.0.dev20250502.dist-info/METADATA +0 -361
  536. skypilot_nightly-1.0.0.dev20250502.dist-info/RECORD +0 -396
  537. skypilot_nightly-1.0.0.dev20250502.dist-info/top_level.txt +0 -1
  538. /sky/{clouds/service_catalog → catalog}/config.py +0 -0
  539. /sky/{benchmark → catalog/data_fetchers}/__init__.py +0 -0
  540. /sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_azure.py +0 -0
  541. /sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_fluidstack.py +0 -0
  542. /sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_ibm.py +0 -0
  543. /sky/{clouds/service_catalog/data_fetchers → client/cli}/__init__.py +0 -0
  544. /sky/dashboard/out/_next/static/{GWvVBSCS7FmUiVmjaL1a7 → 96_E2yl3QAiIJGOYCkSpB}/_ssgManifest.js +0 -0
  545. {skypilot_nightly-1.0.0.dev20250502.dist-info → skypilot_nightly-1.0.0.dev20251203.dist-info}/entry_points.txt +0 -0
  546. {skypilot_nightly-1.0.0.dev20250502.dist-info → skypilot_nightly-1.0.0.dev20251203.dist-info}/licenses/LICENSE +0 -0
@@ -17,6 +17,16 @@ _TYPE_CACHE_TTL = '5s'
17
17
  _RENAME_DIR_LIMIT = 10000
18
18
  # https://github.com/GoogleCloudPlatform/gcsfuse/releases
19
19
  GCSFUSE_VERSION = '2.2.0'
20
+
21
+ # Some machines do not have fuse/fuse3 installed by default
22
+ # hence rclone will fail on these machines
23
+ FUSE3_INSTALL_CMD = ('(command -v fusermount3 > /dev/null 2>&1 || '
24
+ '((which apt-get > /dev/null 2>&1 && '
25
+ 'sudo apt-get update && sudo apt-get install -y fuse3) || '
26
+ '(which yum > /dev/null 2>&1 && '
27
+ 'sudo yum install -y fuse3) || '
28
+ 'true)) || true')
29
+
20
30
  # Creates a fusermount3 soft link on older (<22) Ubuntu systems to utilize
21
31
  # Rclone's mounting utility.
22
32
  FUSERMOUNT3_SOFT_LINK_CMD = ('[ ! -f /bin/fusermount3 ] && '
@@ -38,20 +48,45 @@ _GOOFYS_WRAPPER = ('$(if [ -S /dev/log ] ; then '
38
48
  'fi)')
39
49
 
40
50
 
51
+ def get_rclone_install_cmd() -> str:
52
+ """ RClone installation for both apt-get and rpm.
53
+ This would be common command.
54
+ """
55
+ # pylint: disable=line-too-long
56
+ install_cmd = (
57
+ 'ARCH=$(uname -m) && '
58
+ 'if [ "$ARCH" = "aarch64" ] || [ "$ARCH" = "arm64" ]; then '
59
+ ' ARCH_SUFFIX="arm64"; '
60
+ 'else '
61
+ ' ARCH_SUFFIX="amd64"; '
62
+ 'fi && '
63
+ f'(which dpkg > /dev/null 2>&1 && (which rclone > /dev/null || (cd ~ > /dev/null'
64
+ f' && curl -O https://downloads.rclone.org/{RCLONE_VERSION}/rclone-{RCLONE_VERSION}-linux-${{ARCH_SUFFIX}}.deb'
65
+ f' && sudo dpkg -i rclone-{RCLONE_VERSION}-linux-${{ARCH_SUFFIX}}.deb'
66
+ f' && rm -f rclone-{RCLONE_VERSION}-linux-${{ARCH_SUFFIX}}.deb)))'
67
+ f' || (which yum > /dev/null 2>&1 && (which rclone > /dev/null || (cd ~ > /dev/null'
68
+ f' && curl -O https://downloads.rclone.org/{RCLONE_VERSION}/rclone-{RCLONE_VERSION}-linux-${{ARCH_SUFFIX}}.rpm'
69
+ f' && sudo yum --nogpgcheck install rclone-{RCLONE_VERSION}-linux-${{ARCH_SUFFIX}}.rpm -y'
70
+ f' && rm -f rclone-{RCLONE_VERSION}-linux-${{ARCH_SUFFIX}}.rpm)))')
71
+ return install_cmd
72
+
73
+
41
74
  def get_s3_mount_install_cmd() -> str:
42
- """Returns a command to install S3 mount utility goofys."""
75
+ """Returns command for basic S3 mounting (goofys by default, rclone for
76
+ ARM64)."""
43
77
  # TODO(aylei): maintain our goofys fork under skypilot-org
44
- install_cmd = ('ARCH=$(uname -m) && '
45
- 'if [ "$ARCH" = "aarch64" ] || [ "$ARCH" = "arm64" ]; then '
46
- ' echo "goofys is not supported on $ARCH" && '
47
- f' exit {exceptions.ARCH_NOT_SUPPORTED_EXIT_CODE}; '
48
- 'else '
49
- ' ARCH_SUFFIX="amd64"; '
50
- 'fi && '
51
- 'sudo wget -nc https://github.com/aylei/goofys/'
52
- 'releases/download/0.24.0-aylei-upstream/goofys '
53
- '-O /usr/local/bin/goofys && '
54
- 'sudo chmod 755 /usr/local/bin/goofys')
78
+ install_cmd = (
79
+ 'ARCH=$(uname -m) && '
80
+ 'if [ "$ARCH" = "aarch64" ] || [ "$ARCH" = "arm64" ]; then '
81
+ # Use rclone for ARM64 since goofys doesn't support it
82
+ # Extract core rclone installation logic without redundant ARCH check
83
+ f' {get_rclone_install_cmd()}; '
84
+ 'else '
85
+ ' sudo wget -nc https://github.com/aylei/goofys/'
86
+ 'releases/download/0.24.0-aylei-upstream/goofys '
87
+ '-O /usr/local/bin/goofys && '
88
+ 'sudo chmod 755 /usr/local/bin/goofys; '
89
+ 'fi')
55
90
  return install_cmd
56
91
 
57
92
 
@@ -59,15 +94,33 @@ def get_s3_mount_install_cmd() -> str:
59
94
def get_s3_mount_cmd(bucket_name: str,
                     mount_path: str,
                     _bucket_sub_path: Optional[str] = None) -> str:
    """Returns a command to mount an S3 bucket (goofys by default, rclone for
    ARM64).

    Args:
        bucket_name: Name of the bucket to mount.
        mount_path: Path on the host where the bucket is mounted.
        _bucket_sub_path: Optional path inside the bucket; when given it is
            appended to the bucket name as ':<sub_path>'.

    Returns:
        A shell command that selects the mount tool from `uname -m` at
        runtime.
    """
    sub_path_suffix = ('' if _bucket_sub_path is None else
                       f':{_bucket_sub_path}')
    source = f'{bucket_name}{sub_path_suffix}'

    # Have to add --s3-env-auth=true to allow rclone to access private
    # buckets.
    rclone_flags = '--daemon --allow-other --s3-env-auth=true'
    # Use rclone for ARM64 architectures since goofys doesn't support them.
    rclone_branch = (f'{FUSE3_INSTALL_CMD} && '
                     f'{FUSERMOUNT3_SOFT_LINK_CMD} && '
                     f'rclone mount :s3:{source} {mount_path} {rclone_flags}')
    goofys_branch = (f'{_GOOFYS_WRAPPER} -o allow_other '
                     f'--stat-cache-ttl {_STAT_CACHE_TTL} '
                     f'--type-cache-ttl {_TYPE_CACHE_TTL} '
                     f'{source} {mount_path}')

    return ('ARCH=$(uname -m) && '
            'if [ "$ARCH" = "aarch64" ] || [ "$ARCH" = "arm64" ]; then '
            f' {rclone_branch}; '
            'else '
            f' {goofys_branch}; '
            'fi')
72
125
 
73
126
 
@@ -76,17 +129,74 @@ def get_nebius_mount_cmd(nebius_profile_name: str,
76
129
  endpoint_url: str,
77
130
  mount_path: str,
78
131
  _bucket_sub_path: Optional[str] = None) -> str:
79
- """Returns a command to install Nebius mount utility goofys."""
132
+ """Returns a command to mount Nebius bucket (goofys by default, rclone for
133
+ ARM64)."""
80
134
  if _bucket_sub_path is None:
81
135
  _bucket_sub_path = ''
82
136
  else:
83
137
  _bucket_sub_path = f':{_bucket_sub_path}'
84
- mount_cmd = (f'AWS_PROFILE={nebius_profile_name} {_GOOFYS_WRAPPER} '
85
- '-o allow_other '
86
- f'--stat-cache-ttl {_STAT_CACHE_TTL} '
87
- f'--type-cache-ttl {_TYPE_CACHE_TTL} '
88
- f'--endpoint {endpoint_url} '
89
- f'{bucket_name}{_bucket_sub_path} {mount_path}')
138
+
139
+ # Use rclone for ARM64 architectures since goofys doesn't support them
140
+ arch_check = 'ARCH=$(uname -m) && '
141
+ rclone_mount = (
142
+ f'{FUSE3_INSTALL_CMD} && '
143
+ f'{FUSERMOUNT3_SOFT_LINK_CMD} && '
144
+ f'AWS_PROFILE={nebius_profile_name} '
145
+ f'rclone mount :s3:{bucket_name}{_bucket_sub_path} {mount_path} '
146
+ f'--s3-endpoint {endpoint_url} --daemon --allow-other')
147
+ goofys_mount = (f'AWS_PROFILE={nebius_profile_name} {_GOOFYS_WRAPPER} '
148
+ '-o allow_other '
149
+ f'--stat-cache-ttl {_STAT_CACHE_TTL} '
150
+ f'--type-cache-ttl {_TYPE_CACHE_TTL} '
151
+ f'--endpoint {endpoint_url} '
152
+ f'{bucket_name}{_bucket_sub_path} {mount_path}')
153
+
154
+ mount_cmd = (f'{arch_check}'
155
+ f'if [ "$ARCH" = "aarch64" ] || [ "$ARCH" = "arm64" ]; then '
156
+ f' {rclone_mount}; '
157
+ f'else '
158
+ f' {goofys_mount}; '
159
+ f'fi')
160
+ return mount_cmd
161
+
162
+
163
def get_coreweave_mount_cmd(cw_credentials_path: str,
                            coreweave_profile_name: str,
                            bucket_name: str,
                            endpoint_url: str,
                            mount_path: str,
                            _bucket_sub_path: Optional[str] = None) -> str:
    """Returns a command to mount CoreWeave bucket (goofys by default, rclone
    for ARM64).

    Args:
        cw_credentials_path: Path exported as AWS_SHARED_CREDENTIALS_FILE for
            the mount process.
        coreweave_profile_name: Profile name exported as AWS_PROFILE for the
            mount process.
        bucket_name: Name of the bucket to mount.
        endpoint_url: Endpoint URL passed to the mount tool.
        mount_path: Path on the host where the bucket is mounted.
        _bucket_sub_path: Optional path inside the bucket; when given it is
            appended to the bucket name as ':<sub_path>'.

    Returns:
        A shell command that selects the mount tool from `uname -m` at
        runtime.
    """
    if _bucket_sub_path is None:
        _bucket_sub_path = ''
    else:
        _bucket_sub_path = f':{_bucket_sub_path}'

    # Use rclone for ARM64 architectures since goofys doesn't support them
    arch_check = 'ARCH=$(uname -m) && '
    rclone_mount = (
        f'{FUSE3_INSTALL_CMD} && '
        f'{FUSERMOUNT3_SOFT_LINK_CMD} && '
        f'AWS_SHARED_CREDENTIALS_FILE={cw_credentials_path} '
        f'AWS_PROFILE={coreweave_profile_name} '
        f'rclone mount :s3:{bucket_name}{_bucket_sub_path} {mount_path} '
        # NOTE(review): this appears to be the rclone counterpart of the
        # goofys --subdomain flag used below; confirm.
        f'--s3-force-path-style=false '
        # rclone only picks up AWS_SHARED_CREDENTIALS_FILE / AWS_PROFILE when
        # env auth is enabled; without this flag the on-the-fly :s3: remote
        # is unauthenticated. get_s3_mount_cmd passes the same flag so that
        # private buckets can be accessed.
        f'--s3-env-auth=true '
        f'--s3-endpoint {endpoint_url} --daemon --allow-other')
    goofys_mount = (f'AWS_SHARED_CREDENTIALS_FILE={cw_credentials_path} '
                    f'AWS_PROFILE={coreweave_profile_name} {_GOOFYS_WRAPPER} '
                    '-o allow_other '
                    f'--stat-cache-ttl {_STAT_CACHE_TTL} '
                    f'--type-cache-ttl {_TYPE_CACHE_TTL} '
                    f'--subdomain '
                    f'--endpoint {endpoint_url} '
                    f'{bucket_name}{_bucket_sub_path} {mount_path}')

    mount_cmd = (f'{arch_check}'
                 f'if [ "$ARCH" = "aarch64" ] || [ "$ARCH" = "arm64" ]; then '
                 f' {rclone_mount}; '
                 f'else '
                 f' {goofys_mount}; '
                 f'fi')
    return mount_cmd
91
201
 
92
202
 
@@ -127,27 +237,72 @@ def get_gcs_mount_cmd(bucket_name: str,
127
237
def get_az_mount_install_cmd() -> str:
    """Returns a command to install AZ Container mount utility blobfuse2.

    The emitted shell command runs these stages in order:
      1. Exit with ARCH_NOT_SUPPORTED_EXIT_CODE on aarch64/arm64.
      2. Install the fuse3 packages from the default apt repositories.
      3. If that fails on Ubuntu <= 20.04, retry with a temporary focal apt
         source, which is removed again afterwards.
      4. Install the blobfuse2 .deb if fuse3 was installed; otherwise print
         an error and exit with ARCH_NOT_SUPPORTED_EXIT_CODE.
      5. Create the blobfuse cache root directory.

    Returns:
        A single-line shell command string.
    """
    unsupported_exit = f' exit {exceptions.ARCH_NOT_SUPPORTED_EXIT_CODE}; '
    focal_list = '/etc/apt/sources.list.d/focal-fuse3.list'
    apt_install = ('sudo apt-get install -y '
                   '-o Dpkg::Options::="--force-confdef" ')
    deb_url = ('https://github.com/Azure/azure-storage-fuse'
               f'/releases/download/blobfuse2-{BLOBFUSE2_VERSION}/'
               f'blobfuse2-{BLOBFUSE2_VERSION}-Debian-11.0.x86_64.deb')

    # Check architecture first - blobfuse2 only supports x86_64
    arch_guard = ('ARCH=$(uname -m) && '
                  'if [ "$ARCH" = "aarch64" ] || [ "$ARCH" = "arm64" ]; then '
                  ' echo "blobfuse2 is not supported on $ARCH" && '
                  f'{unsupported_exit}'
                  'fi && ')

    # On Kubernetes, if FUSERMOUNT_SHARED_DIR is set, it means
    # fusermount and fusermount3 are symlinked to fusermount-shim.
    # If we reinstall fuse3, it may overwrite the symlink, so
    # just install libfuse3, which is needed by blobfuse2.
    select_packages = ('sudo apt-get update && '
                       'FUSE3_INSTALLED=0 && '
                       'if [ -n "${FUSERMOUNT_SHARED_DIR:-}" ]; then '
                       ' PACKAGES="libfuse3-3 libfuse3-dev"; '
                       'else '
                       ' PACKAGES="fuse3 libfuse3-3 libfuse3-dev"; '
                       'fi && ')

    # Try to install fuse3 from default repos
    default_repo_attempt = (f'if {apt_install}'
                            '$PACKAGES; then '
                            ' FUSE3_INSTALLED=1; '
                            ' echo "fuse3 installed from default repos"; '
                            'else ')

    # If fuse3 not available, try focal for Ubuntu <= 20.04
    detect_distro = (
        ' DISTRO=$(grep "^ID=" /etc/os-release | cut -d= -f2 | '
        'tr -d \'"\' | tr "[:upper:]" "[:lower:]") && '
        ' VERSION=$(grep "^VERSION_ID=" /etc/os-release | cut -d= -f2 | '
        'tr -d \'"\') && ')
    focal_attempt = (
        ' if [ "$DISTRO" = "ubuntu" ] && '
        '[ "$(echo "$VERSION 20.04" | '
        'awk \'{ print ($1 <= $2) }\')" = "1" ]; then '
        ' echo "Trying to install fuse3 from focal for '
        'Ubuntu $VERSION"; '
        ' echo "deb http://archive.ubuntu.com/ubuntu '
        'focal main universe" | '
        f'sudo tee {focal_list} && '
        ' sudo apt-get update && '
        f' if {apt_install}'
        '-o Dpkg::Options::="--force-confold" '
        '$PACKAGES; then '
        ' FUSE3_INSTALLED=1; '
        ' echo "fuse3 installed from focal"; '
        f' sudo rm {focal_list}; '
        ' sudo apt-get update; '
        ' else '
        f' sudo rm -f {focal_list}; '
        ' sudo apt-get update; '
        ' fi; '
        ' fi; '
        'fi && ')

    # Install blobfuse2 only if fuse3 is available
    install_blobfuse2 = (
        'if [ "$FUSE3_INSTALLED" = "1" ]; then '
        ' echo "Installing blobfuse2 with libfuse3 support"; '
        f' wget -nc {deb_url} -O /tmp/blobfuse2.deb && '
        ' sudo dpkg --install /tmp/blobfuse2.deb; '
        'else '
        ' echo "Error: libfuse3 is required for Azure storage '
        'mounting with fusermount-wrapper."; '
        ' echo "libfuse3 could not be installed on this system."; '
        f'{unsupported_exit}'
        'fi && ')

    make_cache_dir = f'mkdir -p {_BLOBFUSE_CACHE_ROOT_DIR};'

    return ''.join([
        arch_guard,
        select_packages,
        default_repo_attempt,
        detect_distro,
        focal_attempt,
        install_blobfuse2,
        make_cache_dir,
    ])
@@ -219,7 +374,10 @@ def get_az_mount_cmd(container_name: str,
219
374
  f'-- {blobfuse2_cmd} -o nonempty --foreground {{}}')
220
375
  original = f'{blobfuse2_cmd} {blobfuse2_options} {mount_path}'
221
376
  # If fusermount-wrapper is available, use it to wrap the blobfuse2 command
222
- # to avoid requiring root privilege.
377
+ # to avoid requiring privileged containers.
378
+ # fusermount-wrapper requires libfuse3;
379
+ # we install libfuse3 even on older distros like Ubuntu 18.04 by using
380
+ # Ubuntu 20.04 (focal) repositories.
223
381
  # TODO(aylei): feeling hacky, refactor this.
224
382
  get_mount_cmd = ('command -v fusermount-wrapper >/dev/null 2>&1 && '
225
383
  f'echo "{wrapped}" || echo "{original}"')
@@ -236,18 +394,36 @@ def get_r2_mount_cmd(r2_credentials_path: str,
236
394
  bucket_name: str,
237
395
  mount_path: str,
238
396
  _bucket_sub_path: Optional[str] = None) -> str:
239
- """Returns a command to install R2 mount utility goofys."""
397
+ """Returns a command to mount R2 bucket (goofys by default, rclone for
398
+ ARM64)."""
240
399
  if _bucket_sub_path is None:
241
400
  _bucket_sub_path = ''
242
401
  else:
243
402
  _bucket_sub_path = f':{_bucket_sub_path}'
244
- mount_cmd = (f'AWS_SHARED_CREDENTIALS_FILE={r2_credentials_path} '
245
- f'AWS_PROFILE={r2_profile_name} {_GOOFYS_WRAPPER} '
246
- '-o allow_other '
247
- f'--stat-cache-ttl {_STAT_CACHE_TTL} '
248
- f'--type-cache-ttl {_TYPE_CACHE_TTL} '
249
- f'--endpoint {endpoint_url} '
250
- f'{bucket_name}{_bucket_sub_path} {mount_path}')
403
+
404
+ # Use rclone for ARM64 architectures since goofys doesn't support them
405
+ arch_check = 'ARCH=$(uname -m) && '
406
+ rclone_mount = (
407
+ f'{FUSE3_INSTALL_CMD} && '
408
+ f'{FUSERMOUNT3_SOFT_LINK_CMD} && '
409
+ f'AWS_SHARED_CREDENTIALS_FILE={r2_credentials_path} '
410
+ f'AWS_PROFILE={r2_profile_name} '
411
+ f'rclone mount :s3:{bucket_name}{_bucket_sub_path} {mount_path} '
412
+ f'--s3-endpoint {endpoint_url} --daemon --allow-other')
413
+ goofys_mount = (f'AWS_SHARED_CREDENTIALS_FILE={r2_credentials_path} '
414
+ f'AWS_PROFILE={r2_profile_name} {_GOOFYS_WRAPPER} '
415
+ '-o allow_other '
416
+ f'--stat-cache-ttl {_STAT_CACHE_TTL} '
417
+ f'--type-cache-ttl {_TYPE_CACHE_TTL} '
418
+ f'--endpoint {endpoint_url} '
419
+ f'{bucket_name}{_bucket_sub_path} {mount_path}')
420
+
421
+ mount_cmd = (f'{arch_check}'
422
+ f'if [ "$ARCH" = "aarch64" ] || [ "$ARCH" = "arm64" ]; then '
423
+ f' {rclone_mount}; '
424
+ f'else '
425
+ f' {goofys_mount}; '
426
+ f'fi')
251
427
  return mount_cmd
252
428
 
253
429
 
@@ -258,7 +434,8 @@ def get_cos_mount_cmd(rclone_config: str,
258
434
  _bucket_sub_path: Optional[str] = None) -> str:
259
435
  """Returns a command to mount an IBM COS bucket using rclone."""
260
436
  # stores bucket profile in rclone config file at the cluster's nodes.
261
- configure_rclone_profile = (f'{FUSERMOUNT3_SOFT_LINK_CMD}; '
437
+ configure_rclone_profile = (f'{FUSE3_INSTALL_CMD} && '
438
+ f'{FUSERMOUNT3_SOFT_LINK_CMD}; '
262
439
  f'mkdir -p {constants.RCLONE_CONFIG_DIR} && '
263
440
  f'echo "{rclone_config}" >> '
264
441
  f'{constants.RCLONE_CONFIG_PATH}')
@@ -278,7 +455,8 @@ def get_mount_cached_cmd(rclone_config: str, rclone_profile_name: str,
278
455
  bucket_name: str, mount_path: str) -> str:
279
456
  """Returns a command to mount a bucket using rclone with vfs cache."""
280
457
  # stores bucket profile in rclone config file at the remote nodes.
281
- configure_rclone_profile = (f'{FUSERMOUNT3_SOFT_LINK_CMD}; '
458
+ configure_rclone_profile = (f'{FUSE3_INSTALL_CMD} && '
459
+ f'{FUSERMOUNT3_SOFT_LINK_CMD}; '
282
460
  f'mkdir -p {constants.RCLONE_CONFIG_DIR} && '
283
461
  f'echo {shlex.quote(rclone_config)} >> '
284
462
  f'{constants.RCLONE_CONFIG_PATH}')
@@ -288,9 +466,9 @@ def get_mount_cached_cmd(rclone_config: str, rclone_profile_name: str,
288
466
  # the filename length limit.
289
467
  # The hash is a non-negative integer in string form.
290
468
  hashed_mount_path = hashlib.md5(mount_path.encode()).hexdigest()
291
- log_file_path = os.path.join(constants.RCLONE_LOG_DIR,
469
+ log_file_path = os.path.join(constants.RCLONE_MOUNT_CACHED_LOG_DIR,
292
470
  f'{hashed_mount_path}.log')
293
- create_log_cmd = (f'mkdir -p {constants.RCLONE_LOG_DIR} && '
471
+ create_log_cmd = (f'mkdir -p {constants.RCLONE_MOUNT_CACHED_LOG_DIR} && '
294
472
  f'touch {log_file_path}')
295
473
  # when mounting multiple directories with vfs cache mode, it's handled by
296
474
  # rclone to create separate cache directories at ~/.cache/rclone/vfs. It is
@@ -331,29 +509,6 @@ def get_mount_cached_cmd(rclone_config: str, rclone_profile_name: str,
331
509
  return mount_cmd
332
510
 
333
511
 
334
- def get_rclone_install_cmd() -> str:
335
- """ RClone installation for both apt-get and rpm.
336
- This would be common command.
337
- """
338
- # pylint: disable=line-too-long
339
- install_cmd = (
340
- 'ARCH=$(uname -m) && '
341
- 'if [ "$ARCH" = "aarch64" ] || [ "$ARCH" = "arm64" ]; then '
342
- ' ARCH_SUFFIX="arm"; '
343
- 'else '
344
- ' ARCH_SUFFIX="amd64"; '
345
- 'fi && '
346
- f'(which dpkg > /dev/null 2>&1 && (which rclone > /dev/null || (cd ~ > /dev/null'
347
- f' && curl -O https://downloads.rclone.org/{RCLONE_VERSION}/rclone-{RCLONE_VERSION}-linux-${{ARCH_SUFFIX}}.deb'
348
- f' && sudo dpkg -i rclone-{RCLONE_VERSION}-linux-${{ARCH_SUFFIX}}.deb'
349
- f' && rm -f rclone-{RCLONE_VERSION}-linux-${{ARCH_SUFFIX}}.deb)))'
350
- f' || (which rclone > /dev/null || (cd ~ > /dev/null'
351
- f' && curl -O https://downloads.rclone.org/{RCLONE_VERSION}/rclone-{RCLONE_VERSION}-linux-${{ARCH_SUFFIX}}.rpm'
352
- f' && sudo yum --nogpgcheck install rclone-{RCLONE_VERSION}-linux-${{ARCH_SUFFIX}}.rpm -y'
353
- f' && rm -f rclone-{RCLONE_VERSION}-linux-${{ARCH_SUFFIX}}.rpm))')
354
- return install_cmd
355
-
356
-
357
512
  def get_oci_mount_cmd(mount_path: str, store_name: str, region: str,
358
513
  namespace: str, compartment: str, config_file: str,
359
514
  config_profile: str) -> str:
@@ -433,13 +588,20 @@ def get_mounting_script(
433
588
 
434
589
  {command_runner.ALIAS_SUDO_TO_EMPTY_FOR_ROOT_CMD}
435
590
 
436
- MOUNT_PATH={mount_path}
591
+ MOUNT_PATH=$(eval echo {mount_path})
437
592
  MOUNT_BINARY={mount_binary}
438
593
 
439
594
  # Check if path is already mounted
440
- if grep -q $MOUNT_PATH /proc/mounts ; then
595
+ if findmnt -rn -T "$MOUNT_PATH" >/dev/null 2>&1; then
441
596
  echo "Path already mounted - unmounting..."
442
- fusermount -uz "$MOUNT_PATH"
597
+ (command -v fusermount >/dev/null 2>&1 && fusermount -uz "$MOUNT_PATH") \
598
+ || (command -v fusermount3 >/dev/null 2>&1 && fusermount3 -uz "$MOUNT_PATH") \
599
+ || sudo umount -l "$MOUNT_PATH" || true
600
+ # Ensure it's really gone (avoids races)
601
+ for i in $(seq 1 20); do
602
+ if ! findmnt -rn -T "$MOUNT_PATH" >/dev/null 2>&1; then break; fi
603
+ sleep 0.2
604
+ done
443
605
  echo "Successfully unmounted $MOUNT_PATH."
444
606
  fi
445
607
 
@@ -454,17 +616,40 @@ def get_mounting_script(
454
616
  # Check if mount path exists
455
617
  if [ ! -d "$MOUNT_PATH" ]; then
456
618
  echo "Mount path $MOUNT_PATH does not exist. Creating..."
457
- sudo mkdir -p $MOUNT_PATH
458
- sudo chmod 777 $MOUNT_PATH
619
+ sudo mkdir -p "$MOUNT_PATH"
620
+ sudo chmod 777 "$MOUNT_PATH"
459
621
  else
460
- # Check if mount path contains files
461
- if [ "$(ls -A $MOUNT_PATH)" ]; then
462
- echo "Mount path $MOUNT_PATH is not empty. Please mount to another path or remove it first."
463
- exit {exceptions.MOUNT_PATH_NON_EMPTY_CODE}
464
- fi
622
+ # If not a mountpoint and contains files, clean it to satisfy SkyPilot check
623
+ if ! findmnt -rn -T "$MOUNT_PATH" >/dev/null 2>&1; then
624
+ if [ -n "$(ls -A "$MOUNT_PATH" 2>/dev/null)" ]; then
625
+ echo "Cleaning non-empty mount path before mount..."
626
+ sudo bash -lc 'shopt -s dotglob nullglob; rm -rf --one-file-system -- '"$MOUNT_PATH"'/*' 2>/dev/null || true
627
+ fi
628
+ fi
465
629
  fi
466
630
  echo "Mounting $SOURCE_BUCKET to $MOUNT_PATH with $MOUNT_BINARY..."
631
+ set +e
467
632
  {mount_cmd}
633
+ MOUNT_EXIT_CODE=$?
634
+ set -e
635
+ if [ $MOUNT_EXIT_CODE -ne 0 ]; then
636
+ echo "Mount failed with exit code $MOUNT_EXIT_CODE."
637
+ if [ "$MOUNT_BINARY" = "goofys" ]; then
638
+ echo "Looking for goofys log files..."
639
+ # Find goofys log files in /tmp (created by mktemp -t goofys.XXXX.log)
640
+ # Note: if /dev/log exists, goofys logs to syslog instead of a file
641
+ GOOFYS_LOGS=$(ls -t /tmp/goofys.*.log 2>/dev/null | head -1)
642
+ if [ -n "$GOOFYS_LOGS" ]; then
643
+ echo "=== Goofys log file contents ==="
644
+ cat "$GOOFYS_LOGS"
645
+ echo "=== End of goofys log file ==="
646
+ else
647
+ echo "No goofys log file found in /tmp"
648
+ fi
649
+ fi
650
+ # TODO(kevin): Print logs from rclone, etc too for observability.
651
+ exit $MOUNT_EXIT_CODE
652
+ fi
468
653
  echo "Mounting done."
469
654
  """)
470
655