skypilot-nightly 1.0.0.dev20250502__py3-none-any.whl → 1.0.0.dev20251203__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (546) hide show
  1. sky/__init__.py +22 -6
  2. sky/adaptors/aws.py +81 -16
  3. sky/adaptors/common.py +25 -2
  4. sky/adaptors/coreweave.py +278 -0
  5. sky/adaptors/do.py +8 -2
  6. sky/adaptors/gcp.py +11 -0
  7. sky/adaptors/hyperbolic.py +8 -0
  8. sky/adaptors/ibm.py +5 -2
  9. sky/adaptors/kubernetes.py +149 -18
  10. sky/adaptors/nebius.py +173 -30
  11. sky/adaptors/primeintellect.py +1 -0
  12. sky/adaptors/runpod.py +68 -0
  13. sky/adaptors/seeweb.py +183 -0
  14. sky/adaptors/shadeform.py +89 -0
  15. sky/admin_policy.py +187 -4
  16. sky/authentication.py +179 -225
  17. sky/backends/__init__.py +4 -2
  18. sky/backends/backend.py +22 -9
  19. sky/backends/backend_utils.py +1323 -397
  20. sky/backends/cloud_vm_ray_backend.py +1749 -1029
  21. sky/backends/docker_utils.py +1 -1
  22. sky/backends/local_docker_backend.py +11 -6
  23. sky/backends/task_codegen.py +633 -0
  24. sky/backends/wheel_utils.py +55 -9
  25. sky/{clouds/service_catalog → catalog}/__init__.py +21 -19
  26. sky/{clouds/service_catalog → catalog}/aws_catalog.py +27 -8
  27. sky/{clouds/service_catalog → catalog}/azure_catalog.py +10 -7
  28. sky/{clouds/service_catalog → catalog}/common.py +90 -49
  29. sky/{clouds/service_catalog → catalog}/cudo_catalog.py +8 -5
  30. sky/{clouds/service_catalog → catalog}/data_fetchers/analyze.py +1 -1
  31. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_aws.py +116 -80
  32. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_cudo.py +38 -38
  33. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_gcp.py +70 -16
  34. sky/catalog/data_fetchers/fetch_hyperbolic.py +136 -0
  35. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_lambda_cloud.py +1 -0
  36. sky/catalog/data_fetchers/fetch_nebius.py +338 -0
  37. sky/catalog/data_fetchers/fetch_runpod.py +698 -0
  38. sky/catalog/data_fetchers/fetch_seeweb.py +329 -0
  39. sky/catalog/data_fetchers/fetch_shadeform.py +142 -0
  40. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_vast.py +1 -1
  41. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_vsphere.py +1 -1
  42. sky/{clouds/service_catalog → catalog}/do_catalog.py +5 -2
  43. sky/{clouds/service_catalog → catalog}/fluidstack_catalog.py +6 -3
  44. sky/{clouds/service_catalog → catalog}/gcp_catalog.py +41 -15
  45. sky/catalog/hyperbolic_catalog.py +136 -0
  46. sky/{clouds/service_catalog → catalog}/ibm_catalog.py +9 -6
  47. sky/{clouds/service_catalog → catalog}/kubernetes_catalog.py +36 -24
  48. sky/{clouds/service_catalog → catalog}/lambda_catalog.py +9 -6
  49. sky/{clouds/service_catalog → catalog}/nebius_catalog.py +9 -7
  50. sky/{clouds/service_catalog → catalog}/oci_catalog.py +9 -6
  51. sky/{clouds/service_catalog → catalog}/paperspace_catalog.py +5 -2
  52. sky/catalog/primeintellect_catalog.py +95 -0
  53. sky/{clouds/service_catalog → catalog}/runpod_catalog.py +11 -4
  54. sky/{clouds/service_catalog → catalog}/scp_catalog.py +9 -6
  55. sky/catalog/seeweb_catalog.py +184 -0
  56. sky/catalog/shadeform_catalog.py +165 -0
  57. sky/catalog/ssh_catalog.py +167 -0
  58. sky/{clouds/service_catalog → catalog}/vast_catalog.py +6 -3
  59. sky/{clouds/service_catalog → catalog}/vsphere_catalog.py +5 -2
  60. sky/check.py +533 -185
  61. sky/cli.py +5 -5975
  62. sky/client/{cli.py → cli/command.py} +2591 -1956
  63. sky/client/cli/deprecation_utils.py +99 -0
  64. sky/client/cli/flags.py +359 -0
  65. sky/client/cli/table_utils.py +322 -0
  66. sky/client/cli/utils.py +79 -0
  67. sky/client/common.py +78 -32
  68. sky/client/oauth.py +82 -0
  69. sky/client/sdk.py +1219 -319
  70. sky/client/sdk_async.py +827 -0
  71. sky/client/service_account_auth.py +47 -0
  72. sky/cloud_stores.py +82 -3
  73. sky/clouds/__init__.py +13 -0
  74. sky/clouds/aws.py +564 -164
  75. sky/clouds/azure.py +105 -83
  76. sky/clouds/cloud.py +140 -40
  77. sky/clouds/cudo.py +68 -50
  78. sky/clouds/do.py +66 -48
  79. sky/clouds/fluidstack.py +63 -44
  80. sky/clouds/gcp.py +339 -110
  81. sky/clouds/hyperbolic.py +293 -0
  82. sky/clouds/ibm.py +70 -49
  83. sky/clouds/kubernetes.py +570 -162
  84. sky/clouds/lambda_cloud.py +74 -54
  85. sky/clouds/nebius.py +210 -81
  86. sky/clouds/oci.py +88 -66
  87. sky/clouds/paperspace.py +61 -44
  88. sky/clouds/primeintellect.py +317 -0
  89. sky/clouds/runpod.py +164 -74
  90. sky/clouds/scp.py +89 -86
  91. sky/clouds/seeweb.py +477 -0
  92. sky/clouds/shadeform.py +400 -0
  93. sky/clouds/ssh.py +263 -0
  94. sky/clouds/utils/aws_utils.py +10 -4
  95. sky/clouds/utils/gcp_utils.py +87 -11
  96. sky/clouds/utils/oci_utils.py +38 -14
  97. sky/clouds/utils/scp_utils.py +231 -167
  98. sky/clouds/vast.py +99 -77
  99. sky/clouds/vsphere.py +51 -40
  100. sky/core.py +375 -173
  101. sky/dag.py +15 -0
  102. sky/dashboard/out/404.html +1 -1
  103. sky/dashboard/out/_next/static/96_E2yl3QAiIJGOYCkSpB/_buildManifest.js +1 -0
  104. sky/dashboard/out/_next/static/chunks/1141-e6aa9ab418717c59.js +11 -0
  105. sky/dashboard/out/_next/static/chunks/1272-1ef0bf0237faccdb.js +1 -0
  106. sky/dashboard/out/_next/static/chunks/1871-7e202677c42f43fe.js +6 -0
  107. sky/dashboard/out/_next/static/chunks/2260-7703229c33c5ebd5.js +1 -0
  108. sky/dashboard/out/_next/static/chunks/2350.fab69e61bac57b23.js +1 -0
  109. sky/dashboard/out/_next/static/chunks/2369.fc20f0c2c8ed9fe7.js +15 -0
  110. sky/dashboard/out/_next/static/chunks/2755.edd818326d489a1d.js +26 -0
  111. sky/dashboard/out/_next/static/chunks/3294.20a8540fe697d5ee.js +1 -0
  112. sky/dashboard/out/_next/static/chunks/3785.7e245f318f9d1121.js +1 -0
  113. sky/dashboard/out/_next/static/chunks/3800-7b45f9fbb6308557.js +1 -0
  114. sky/dashboard/out/_next/static/chunks/3850-ff4a9a69d978632b.js +1 -0
  115. sky/dashboard/out/_next/static/chunks/3937.210053269f121201.js +1 -0
  116. sky/dashboard/out/_next/static/chunks/4725.172ede95d1b21022.js +1 -0
  117. sky/dashboard/out/_next/static/chunks/4937.a2baa2df5572a276.js +15 -0
  118. sky/dashboard/out/_next/static/chunks/5739-d67458fcb1386c92.js +8 -0
  119. sky/dashboard/out/_next/static/chunks/6130-2be46d70a38f1e82.js +1 -0
  120. sky/dashboard/out/_next/static/chunks/616-3d59f75e2ccf9321.js +39 -0
  121. sky/dashboard/out/_next/static/chunks/6212-7bd06f60ba693125.js +13 -0
  122. sky/dashboard/out/_next/static/chunks/6856-8f27d1c10c98def8.js +1 -0
  123. sky/dashboard/out/_next/static/chunks/6989-01359c57e018caa4.js +1 -0
  124. sky/dashboard/out/_next/static/chunks/6990-9146207c4567fdfd.js +1 -0
  125. sky/dashboard/out/_next/static/chunks/7359-c8d04e06886000b3.js +30 -0
  126. sky/dashboard/out/_next/static/chunks/7411-b15471acd2cba716.js +41 -0
  127. sky/dashboard/out/_next/static/chunks/7615-019513abc55b3b47.js +1 -0
  128. sky/dashboard/out/_next/static/chunks/8640.5b9475a2d18c5416.js +16 -0
  129. sky/dashboard/out/_next/static/chunks/8969-452f9d5cbdd2dc73.js +1 -0
  130. sky/dashboard/out/_next/static/chunks/9025.fa408f3242e9028d.js +6 -0
  131. sky/dashboard/out/_next/static/chunks/9353-cff34f7e773b2e2b.js +1 -0
  132. sky/dashboard/out/_next/static/chunks/9360.a536cf6b1fa42355.js +31 -0
  133. sky/dashboard/out/_next/static/chunks/9847.3aaca6bb33455140.js +30 -0
  134. sky/dashboard/out/_next/static/chunks/fd9d1056-86323a29a8f7e46a.js +1 -0
  135. sky/dashboard/out/_next/static/chunks/framework-cf60a09ccd051a10.js +33 -0
  136. sky/dashboard/out/_next/static/chunks/main-app-587214043926b3cc.js +1 -0
  137. sky/dashboard/out/_next/static/chunks/main-f15ccb73239a3bf1.js +1 -0
  138. sky/dashboard/out/_next/static/chunks/pages/_app-bde01e4a2beec258.js +34 -0
  139. sky/dashboard/out/_next/static/chunks/pages/_error-c66a4e8afc46f17b.js +1 -0
  140. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-792db96d918c98c9.js +16 -0
  141. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-abfcac9c137aa543.js +1 -0
  142. sky/dashboard/out/_next/static/chunks/pages/clusters-ee39056f9851a3ff.js +1 -0
  143. sky/dashboard/out/_next/static/chunks/pages/config-dfb9bf07b13045f4.js +1 -0
  144. sky/dashboard/out/_next/static/chunks/pages/index-444f1804401f04ea.js +1 -0
  145. sky/dashboard/out/_next/static/chunks/pages/infra/[context]-c0b5935149902e6f.js +1 -0
  146. sky/dashboard/out/_next/static/chunks/pages/infra-aed0ea19df7cf961.js +1 -0
  147. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-d66997e2bfc837cf.js +16 -0
  148. sky/dashboard/out/_next/static/chunks/pages/jobs/pools/[pool]-9faf940b253e3e06.js +21 -0
  149. sky/dashboard/out/_next/static/chunks/pages/jobs-2072b48b617989c9.js +1 -0
  150. sky/dashboard/out/_next/static/chunks/pages/users-f42674164aa73423.js +1 -0
  151. sky/dashboard/out/_next/static/chunks/pages/volumes-b84b948ff357c43e.js +1 -0
  152. sky/dashboard/out/_next/static/chunks/pages/workspace/new-3f88a1c7e86a3f86.js +1 -0
  153. sky/dashboard/out/_next/static/chunks/pages/workspaces/[name]-84a40f8c7c627fe4.js +1 -0
  154. sky/dashboard/out/_next/static/chunks/pages/workspaces-531b2f8c4bf89f82.js +1 -0
  155. sky/dashboard/out/_next/static/chunks/webpack-64e05f17bf2cf8ce.js +1 -0
  156. sky/dashboard/out/_next/static/css/0748ce22df867032.css +3 -0
  157. sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
  158. sky/dashboard/out/clusters/[cluster].html +1 -1
  159. sky/dashboard/out/clusters.html +1 -1
  160. sky/dashboard/out/config.html +1 -0
  161. sky/dashboard/out/index.html +1 -1
  162. sky/dashboard/out/infra/[context].html +1 -0
  163. sky/dashboard/out/infra.html +1 -0
  164. sky/dashboard/out/jobs/[job].html +1 -1
  165. sky/dashboard/out/jobs/pools/[pool].html +1 -0
  166. sky/dashboard/out/jobs.html +1 -1
  167. sky/dashboard/out/users.html +1 -0
  168. sky/dashboard/out/volumes.html +1 -0
  169. sky/dashboard/out/workspace/new.html +1 -0
  170. sky/dashboard/out/workspaces/[name].html +1 -0
  171. sky/dashboard/out/workspaces.html +1 -0
  172. sky/data/data_utils.py +137 -1
  173. sky/data/mounting_utils.py +269 -84
  174. sky/data/storage.py +1460 -1807
  175. sky/data/storage_utils.py +43 -57
  176. sky/exceptions.py +126 -2
  177. sky/execution.py +216 -63
  178. sky/global_user_state.py +2390 -586
  179. sky/jobs/__init__.py +7 -0
  180. sky/jobs/client/sdk.py +300 -58
  181. sky/jobs/client/sdk_async.py +161 -0
  182. sky/jobs/constants.py +15 -8
  183. sky/jobs/controller.py +848 -275
  184. sky/jobs/file_content_utils.py +128 -0
  185. sky/jobs/log_gc.py +193 -0
  186. sky/jobs/recovery_strategy.py +402 -152
  187. sky/jobs/scheduler.py +314 -189
  188. sky/jobs/server/core.py +836 -255
  189. sky/jobs/server/server.py +156 -115
  190. sky/jobs/server/utils.py +136 -0
  191. sky/jobs/state.py +2109 -706
  192. sky/jobs/utils.py +1306 -215
  193. sky/logs/__init__.py +21 -0
  194. sky/logs/agent.py +108 -0
  195. sky/logs/aws.py +243 -0
  196. sky/logs/gcp.py +91 -0
  197. sky/metrics/__init__.py +0 -0
  198. sky/metrics/utils.py +453 -0
  199. sky/models.py +78 -1
  200. sky/optimizer.py +164 -70
  201. sky/provision/__init__.py +90 -4
  202. sky/provision/aws/config.py +147 -26
  203. sky/provision/aws/instance.py +136 -50
  204. sky/provision/azure/instance.py +11 -6
  205. sky/provision/common.py +13 -1
  206. sky/provision/cudo/cudo_machine_type.py +1 -1
  207. sky/provision/cudo/cudo_utils.py +14 -8
  208. sky/provision/cudo/cudo_wrapper.py +72 -71
  209. sky/provision/cudo/instance.py +10 -6
  210. sky/provision/do/instance.py +10 -6
  211. sky/provision/do/utils.py +4 -3
  212. sky/provision/docker_utils.py +140 -33
  213. sky/provision/fluidstack/instance.py +13 -8
  214. sky/provision/gcp/__init__.py +1 -0
  215. sky/provision/gcp/config.py +301 -19
  216. sky/provision/gcp/constants.py +218 -0
  217. sky/provision/gcp/instance.py +36 -8
  218. sky/provision/gcp/instance_utils.py +18 -4
  219. sky/provision/gcp/volume_utils.py +247 -0
  220. sky/provision/hyperbolic/__init__.py +12 -0
  221. sky/provision/hyperbolic/config.py +10 -0
  222. sky/provision/hyperbolic/instance.py +437 -0
  223. sky/provision/hyperbolic/utils.py +373 -0
  224. sky/provision/instance_setup.py +101 -20
  225. sky/provision/kubernetes/__init__.py +5 -0
  226. sky/provision/kubernetes/config.py +9 -52
  227. sky/provision/kubernetes/constants.py +17 -0
  228. sky/provision/kubernetes/instance.py +919 -280
  229. sky/provision/kubernetes/manifests/fusermount-server-daemonset.yaml +1 -2
  230. sky/provision/kubernetes/network.py +27 -17
  231. sky/provision/kubernetes/network_utils.py +44 -43
  232. sky/provision/kubernetes/utils.py +1221 -534
  233. sky/provision/kubernetes/volume.py +343 -0
  234. sky/provision/lambda_cloud/instance.py +22 -16
  235. sky/provision/nebius/constants.py +50 -0
  236. sky/provision/nebius/instance.py +19 -6
  237. sky/provision/nebius/utils.py +237 -137
  238. sky/provision/oci/instance.py +10 -5
  239. sky/provision/paperspace/instance.py +10 -7
  240. sky/provision/paperspace/utils.py +1 -1
  241. sky/provision/primeintellect/__init__.py +10 -0
  242. sky/provision/primeintellect/config.py +11 -0
  243. sky/provision/primeintellect/instance.py +454 -0
  244. sky/provision/primeintellect/utils.py +398 -0
  245. sky/provision/provisioner.py +117 -36
  246. sky/provision/runpod/__init__.py +5 -0
  247. sky/provision/runpod/instance.py +27 -6
  248. sky/provision/runpod/utils.py +51 -18
  249. sky/provision/runpod/volume.py +214 -0
  250. sky/provision/scp/__init__.py +15 -0
  251. sky/provision/scp/config.py +93 -0
  252. sky/provision/scp/instance.py +707 -0
  253. sky/provision/seeweb/__init__.py +11 -0
  254. sky/provision/seeweb/config.py +13 -0
  255. sky/provision/seeweb/instance.py +812 -0
  256. sky/provision/shadeform/__init__.py +11 -0
  257. sky/provision/shadeform/config.py +12 -0
  258. sky/provision/shadeform/instance.py +351 -0
  259. sky/provision/shadeform/shadeform_utils.py +83 -0
  260. sky/provision/ssh/__init__.py +18 -0
  261. sky/provision/vast/instance.py +13 -8
  262. sky/provision/vast/utils.py +10 -7
  263. sky/provision/volume.py +164 -0
  264. sky/provision/vsphere/common/ssl_helper.py +1 -1
  265. sky/provision/vsphere/common/vapiconnect.py +2 -1
  266. sky/provision/vsphere/common/vim_utils.py +4 -4
  267. sky/provision/vsphere/instance.py +15 -10
  268. sky/provision/vsphere/vsphere_utils.py +17 -20
  269. sky/py.typed +0 -0
  270. sky/resources.py +845 -119
  271. sky/schemas/__init__.py +0 -0
  272. sky/schemas/api/__init__.py +0 -0
  273. sky/schemas/api/responses.py +227 -0
  274. sky/schemas/db/README +4 -0
  275. sky/schemas/db/env.py +90 -0
  276. sky/schemas/db/global_user_state/001_initial_schema.py +124 -0
  277. sky/schemas/db/global_user_state/002_add_workspace_to_cluster_history.py +35 -0
  278. sky/schemas/db/global_user_state/003_fix_initial_revision.py +61 -0
  279. sky/schemas/db/global_user_state/004_is_managed.py +34 -0
  280. sky/schemas/db/global_user_state/005_cluster_event.py +32 -0
  281. sky/schemas/db/global_user_state/006_provision_log.py +41 -0
  282. sky/schemas/db/global_user_state/007_cluster_event_request_id.py +34 -0
  283. sky/schemas/db/global_user_state/008_skylet_ssh_tunnel_metadata.py +34 -0
  284. sky/schemas/db/global_user_state/009_last_activity_and_launched_at.py +89 -0
  285. sky/schemas/db/global_user_state/010_save_ssh_key.py +66 -0
  286. sky/schemas/db/global_user_state/011_is_ephemeral.py +34 -0
  287. sky/schemas/db/kv_cache/001_initial_schema.py +29 -0
  288. sky/schemas/db/script.py.mako +28 -0
  289. sky/schemas/db/serve_state/001_initial_schema.py +67 -0
  290. sky/schemas/db/serve_state/002_yaml_content.py +34 -0
  291. sky/schemas/db/skypilot_config/001_initial_schema.py +30 -0
  292. sky/schemas/db/spot_jobs/001_initial_schema.py +97 -0
  293. sky/schemas/db/spot_jobs/002_cluster_pool.py +42 -0
  294. sky/schemas/db/spot_jobs/003_pool_hash.py +34 -0
  295. sky/schemas/db/spot_jobs/004_job_file_contents.py +42 -0
  296. sky/schemas/db/spot_jobs/005_logs_gc.py +38 -0
  297. sky/schemas/db/spot_jobs/006_controller_pid_started_at.py +34 -0
  298. sky/schemas/db/spot_jobs/007_config_file_content.py +34 -0
  299. sky/schemas/generated/__init__.py +0 -0
  300. sky/schemas/generated/autostopv1_pb2.py +36 -0
  301. sky/schemas/generated/autostopv1_pb2.pyi +43 -0
  302. sky/schemas/generated/autostopv1_pb2_grpc.py +146 -0
  303. sky/schemas/generated/jobsv1_pb2.py +86 -0
  304. sky/schemas/generated/jobsv1_pb2.pyi +254 -0
  305. sky/schemas/generated/jobsv1_pb2_grpc.py +542 -0
  306. sky/schemas/generated/managed_jobsv1_pb2.py +76 -0
  307. sky/schemas/generated/managed_jobsv1_pb2.pyi +278 -0
  308. sky/schemas/generated/managed_jobsv1_pb2_grpc.py +278 -0
  309. sky/schemas/generated/servev1_pb2.py +58 -0
  310. sky/schemas/generated/servev1_pb2.pyi +115 -0
  311. sky/schemas/generated/servev1_pb2_grpc.py +322 -0
  312. sky/serve/autoscalers.py +357 -5
  313. sky/serve/client/impl.py +310 -0
  314. sky/serve/client/sdk.py +47 -139
  315. sky/serve/client/sdk_async.py +130 -0
  316. sky/serve/constants.py +12 -9
  317. sky/serve/controller.py +68 -17
  318. sky/serve/load_balancer.py +106 -60
  319. sky/serve/load_balancing_policies.py +116 -2
  320. sky/serve/replica_managers.py +434 -249
  321. sky/serve/serve_rpc_utils.py +179 -0
  322. sky/serve/serve_state.py +569 -257
  323. sky/serve/serve_utils.py +775 -265
  324. sky/serve/server/core.py +66 -711
  325. sky/serve/server/impl.py +1093 -0
  326. sky/serve/server/server.py +21 -18
  327. sky/serve/service.py +192 -89
  328. sky/serve/service_spec.py +144 -20
  329. sky/serve/spot_placer.py +3 -0
  330. sky/server/auth/__init__.py +0 -0
  331. sky/server/auth/authn.py +50 -0
  332. sky/server/auth/loopback.py +38 -0
  333. sky/server/auth/oauth2_proxy.py +202 -0
  334. sky/server/common.py +478 -182
  335. sky/server/config.py +85 -23
  336. sky/server/constants.py +44 -6
  337. sky/server/daemons.py +295 -0
  338. sky/server/html/token_page.html +185 -0
  339. sky/server/metrics.py +160 -0
  340. sky/server/middleware_utils.py +166 -0
  341. sky/server/requests/executor.py +558 -138
  342. sky/server/requests/payloads.py +364 -24
  343. sky/server/requests/preconditions.py +21 -17
  344. sky/server/requests/process.py +112 -29
  345. sky/server/requests/request_names.py +121 -0
  346. sky/server/requests/requests.py +822 -226
  347. sky/server/requests/serializers/decoders.py +82 -31
  348. sky/server/requests/serializers/encoders.py +140 -22
  349. sky/server/requests/threads.py +117 -0
  350. sky/server/rest.py +455 -0
  351. sky/server/server.py +1309 -285
  352. sky/server/state.py +20 -0
  353. sky/server/stream_utils.py +327 -61
  354. sky/server/uvicorn.py +217 -3
  355. sky/server/versions.py +270 -0
  356. sky/setup_files/MANIFEST.in +11 -1
  357. sky/setup_files/alembic.ini +160 -0
  358. sky/setup_files/dependencies.py +139 -31
  359. sky/setup_files/setup.py +44 -42
  360. sky/sky_logging.py +114 -7
  361. sky/skylet/attempt_skylet.py +106 -24
  362. sky/skylet/autostop_lib.py +129 -8
  363. sky/skylet/configs.py +29 -20
  364. sky/skylet/constants.py +216 -25
  365. sky/skylet/events.py +101 -21
  366. sky/skylet/job_lib.py +345 -164
  367. sky/skylet/log_lib.py +297 -18
  368. sky/skylet/log_lib.pyi +44 -1
  369. sky/skylet/providers/ibm/node_provider.py +12 -8
  370. sky/skylet/providers/ibm/vpc_provider.py +13 -12
  371. sky/skylet/ray_patches/__init__.py +17 -3
  372. sky/skylet/ray_patches/autoscaler.py.diff +18 -0
  373. sky/skylet/ray_patches/cli.py.diff +19 -0
  374. sky/skylet/ray_patches/command_runner.py.diff +17 -0
  375. sky/skylet/ray_patches/log_monitor.py.diff +20 -0
  376. sky/skylet/ray_patches/resource_demand_scheduler.py.diff +32 -0
  377. sky/skylet/ray_patches/updater.py.diff +18 -0
  378. sky/skylet/ray_patches/worker.py.diff +41 -0
  379. sky/skylet/runtime_utils.py +21 -0
  380. sky/skylet/services.py +568 -0
  381. sky/skylet/skylet.py +72 -4
  382. sky/skylet/subprocess_daemon.py +104 -29
  383. sky/skypilot_config.py +506 -99
  384. sky/ssh_node_pools/__init__.py +1 -0
  385. sky/ssh_node_pools/core.py +135 -0
  386. sky/ssh_node_pools/server.py +233 -0
  387. sky/task.py +685 -163
  388. sky/templates/aws-ray.yml.j2 +11 -3
  389. sky/templates/azure-ray.yml.j2 +2 -1
  390. sky/templates/cudo-ray.yml.j2 +1 -0
  391. sky/templates/do-ray.yml.j2 +2 -1
  392. sky/templates/fluidstack-ray.yml.j2 +1 -0
  393. sky/templates/gcp-ray.yml.j2 +62 -1
  394. sky/templates/hyperbolic-ray.yml.j2 +68 -0
  395. sky/templates/ibm-ray.yml.j2 +2 -1
  396. sky/templates/jobs-controller.yaml.j2 +27 -24
  397. sky/templates/kubernetes-loadbalancer.yml.j2 +2 -0
  398. sky/templates/kubernetes-ray.yml.j2 +611 -50
  399. sky/templates/lambda-ray.yml.j2 +2 -1
  400. sky/templates/nebius-ray.yml.j2 +34 -12
  401. sky/templates/oci-ray.yml.j2 +1 -0
  402. sky/templates/paperspace-ray.yml.j2 +2 -1
  403. sky/templates/primeintellect-ray.yml.j2 +72 -0
  404. sky/templates/runpod-ray.yml.j2 +10 -1
  405. sky/templates/scp-ray.yml.j2 +4 -50
  406. sky/templates/seeweb-ray.yml.j2 +171 -0
  407. sky/templates/shadeform-ray.yml.j2 +73 -0
  408. sky/templates/sky-serve-controller.yaml.j2 +22 -2
  409. sky/templates/vast-ray.yml.j2 +1 -0
  410. sky/templates/vsphere-ray.yml.j2 +1 -0
  411. sky/templates/websocket_proxy.py +212 -37
  412. sky/usage/usage_lib.py +31 -15
  413. sky/users/__init__.py +0 -0
  414. sky/users/model.conf +15 -0
  415. sky/users/permission.py +397 -0
  416. sky/users/rbac.py +121 -0
  417. sky/users/server.py +720 -0
  418. sky/users/token_service.py +218 -0
  419. sky/utils/accelerator_registry.py +35 -5
  420. sky/utils/admin_policy_utils.py +84 -38
  421. sky/utils/annotations.py +38 -5
  422. sky/utils/asyncio_utils.py +78 -0
  423. sky/utils/atomic.py +1 -1
  424. sky/utils/auth_utils.py +153 -0
  425. sky/utils/benchmark_utils.py +60 -0
  426. sky/utils/cli_utils/status_utils.py +159 -86
  427. sky/utils/cluster_utils.py +31 -9
  428. sky/utils/command_runner.py +354 -68
  429. sky/utils/command_runner.pyi +93 -3
  430. sky/utils/common.py +35 -8
  431. sky/utils/common_utils.py +314 -91
  432. sky/utils/config_utils.py +74 -5
  433. sky/utils/context.py +403 -0
  434. sky/utils/context_utils.py +242 -0
  435. sky/utils/controller_utils.py +383 -89
  436. sky/utils/dag_utils.py +31 -12
  437. sky/utils/db/__init__.py +0 -0
  438. sky/utils/db/db_utils.py +485 -0
  439. sky/utils/db/kv_cache.py +149 -0
  440. sky/utils/db/migration_utils.py +137 -0
  441. sky/utils/directory_utils.py +12 -0
  442. sky/utils/env_options.py +13 -0
  443. sky/utils/git.py +567 -0
  444. sky/utils/git_clone.sh +460 -0
  445. sky/utils/infra_utils.py +195 -0
  446. sky/utils/kubernetes/cleanup-tunnel.sh +62 -0
  447. sky/utils/kubernetes/config_map_utils.py +133 -0
  448. sky/utils/kubernetes/create_cluster.sh +15 -29
  449. sky/utils/kubernetes/delete_cluster.sh +10 -7
  450. sky/utils/kubernetes/deploy_ssh_node_pools.py +1177 -0
  451. sky/utils/kubernetes/exec_kubeconfig_converter.py +22 -31
  452. sky/utils/kubernetes/generate_kind_config.py +6 -66
  453. sky/utils/kubernetes/generate_kubeconfig.sh +4 -1
  454. sky/utils/kubernetes/gpu_labeler.py +18 -8
  455. sky/utils/kubernetes/k8s_gpu_labeler_job.yaml +2 -1
  456. sky/utils/kubernetes/k8s_gpu_labeler_setup.yaml +16 -16
  457. sky/utils/kubernetes/kubernetes_deploy_utils.py +284 -114
  458. sky/utils/kubernetes/rsync_helper.sh +11 -3
  459. sky/utils/kubernetes/ssh-tunnel.sh +379 -0
  460. sky/utils/kubernetes/ssh_utils.py +221 -0
  461. sky/utils/kubernetes_enums.py +8 -15
  462. sky/utils/lock_events.py +94 -0
  463. sky/utils/locks.py +416 -0
  464. sky/utils/log_utils.py +82 -107
  465. sky/utils/perf_utils.py +22 -0
  466. sky/utils/resource_checker.py +298 -0
  467. sky/utils/resources_utils.py +249 -32
  468. sky/utils/rich_utils.py +217 -39
  469. sky/utils/schemas.py +955 -160
  470. sky/utils/serialize_utils.py +16 -0
  471. sky/utils/status_lib.py +10 -0
  472. sky/utils/subprocess_utils.py +29 -15
  473. sky/utils/tempstore.py +70 -0
  474. sky/utils/thread_utils.py +91 -0
  475. sky/utils/timeline.py +26 -53
  476. sky/utils/ux_utils.py +84 -15
  477. sky/utils/validator.py +11 -1
  478. sky/utils/volume.py +165 -0
  479. sky/utils/yaml_utils.py +111 -0
  480. sky/volumes/__init__.py +13 -0
  481. sky/volumes/client/__init__.py +0 -0
  482. sky/volumes/client/sdk.py +150 -0
  483. sky/volumes/server/__init__.py +0 -0
  484. sky/volumes/server/core.py +270 -0
  485. sky/volumes/server/server.py +124 -0
  486. sky/volumes/volume.py +215 -0
  487. sky/workspaces/__init__.py +0 -0
  488. sky/workspaces/core.py +655 -0
  489. sky/workspaces/server.py +101 -0
  490. sky/workspaces/utils.py +56 -0
  491. sky_templates/README.md +3 -0
  492. sky_templates/__init__.py +3 -0
  493. sky_templates/ray/__init__.py +0 -0
  494. sky_templates/ray/start_cluster +183 -0
  495. sky_templates/ray/stop_cluster +75 -0
  496. skypilot_nightly-1.0.0.dev20251203.dist-info/METADATA +676 -0
  497. skypilot_nightly-1.0.0.dev20251203.dist-info/RECORD +611 -0
  498. {skypilot_nightly-1.0.0.dev20250502.dist-info → skypilot_nightly-1.0.0.dev20251203.dist-info}/WHEEL +1 -1
  499. skypilot_nightly-1.0.0.dev20251203.dist-info/top_level.txt +2 -0
  500. sky/benchmark/benchmark_state.py +0 -256
  501. sky/benchmark/benchmark_utils.py +0 -641
  502. sky/clouds/service_catalog/constants.py +0 -7
  503. sky/dashboard/out/_next/static/GWvVBSCS7FmUiVmjaL1a7/_buildManifest.js +0 -1
  504. sky/dashboard/out/_next/static/chunks/236-2db3ee3fba33dd9e.js +0 -6
  505. sky/dashboard/out/_next/static/chunks/312-c3c8845990db8ffc.js +0 -15
  506. sky/dashboard/out/_next/static/chunks/37-0a572fe0dbb89c4d.js +0 -6
  507. sky/dashboard/out/_next/static/chunks/678-206dddca808e6d16.js +0 -59
  508. sky/dashboard/out/_next/static/chunks/845-9e60713e0c441abc.js +0 -1
  509. sky/dashboard/out/_next/static/chunks/979-7bf73a4c7cea0f5c.js +0 -1
  510. sky/dashboard/out/_next/static/chunks/fd9d1056-2821b0f0cabcd8bd.js +0 -1
  511. sky/dashboard/out/_next/static/chunks/framework-87d061ee6ed71b28.js +0 -33
  512. sky/dashboard/out/_next/static/chunks/main-app-241eb28595532291.js +0 -1
  513. sky/dashboard/out/_next/static/chunks/main-e0e2335212e72357.js +0 -1
  514. sky/dashboard/out/_next/static/chunks/pages/_app-e6b013bc3f77ad60.js +0 -1
  515. sky/dashboard/out/_next/static/chunks/pages/_error-1be831200e60c5c0.js +0 -1
  516. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-6ac338bc2239cb45.js +0 -1
  517. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-f383db7389368ea7.js +0 -1
  518. sky/dashboard/out/_next/static/chunks/pages/clusters-a93b93e10b8b074e.js +0 -1
  519. sky/dashboard/out/_next/static/chunks/pages/index-f9f039532ca8cbc4.js +0 -1
  520. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-1c519e1afc523dc9.js +0 -1
  521. sky/dashboard/out/_next/static/chunks/pages/jobs-a75029b67aab6a2e.js +0 -1
  522. sky/dashboard/out/_next/static/chunks/webpack-830f59b8404e96b8.js +0 -1
  523. sky/dashboard/out/_next/static/css/c6933bbb2ce7f4dd.css +0 -3
  524. sky/jobs/dashboard/dashboard.py +0 -223
  525. sky/jobs/dashboard/static/favicon.ico +0 -0
  526. sky/jobs/dashboard/templates/index.html +0 -831
  527. sky/jobs/server/dashboard_utils.py +0 -69
  528. sky/skylet/providers/scp/__init__.py +0 -2
  529. sky/skylet/providers/scp/config.py +0 -149
  530. sky/skylet/providers/scp/node_provider.py +0 -578
  531. sky/templates/kubernetes-ssh-jump.yml.j2 +0 -94
  532. sky/utils/db_utils.py +0 -100
  533. sky/utils/kubernetes/deploy_remote_cluster.sh +0 -308
  534. sky/utils/kubernetes/ssh_jump_lifecycle_manager.py +0 -191
  535. skypilot_nightly-1.0.0.dev20250502.dist-info/METADATA +0 -361
  536. skypilot_nightly-1.0.0.dev20250502.dist-info/RECORD +0 -396
  537. skypilot_nightly-1.0.0.dev20250502.dist-info/top_level.txt +0 -1
  538. /sky/{clouds/service_catalog → catalog}/config.py +0 -0
  539. /sky/{benchmark → catalog/data_fetchers}/__init__.py +0 -0
  540. /sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_azure.py +0 -0
  541. /sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_fluidstack.py +0 -0
  542. /sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_ibm.py +0 -0
  543. /sky/{clouds/service_catalog/data_fetchers → client/cli}/__init__.py +0 -0
  544. /sky/dashboard/out/_next/static/{GWvVBSCS7FmUiVmjaL1a7 → 96_E2yl3QAiIJGOYCkSpB}/_ssgManifest.js +0 -0
  545. {skypilot_nightly-1.0.0.dev20250502.dist-info → skypilot_nightly-1.0.0.dev20251203.dist-info}/entry_points.txt +0 -0
  546. {skypilot_nightly-1.0.0.dev20250502.dist-info → skypilot_nightly-1.0.0.dev20251203.dist-info}/licenses/LICENSE +0 -0
@@ -3,20 +3,30 @@ import enum
3
3
  import hashlib
4
4
  import os
5
5
  import pathlib
6
+ import re
6
7
  import shlex
8
+ import sys
7
9
  import time
8
- from typing import Any, Callable, Iterable, List, Optional, Tuple, Type, Union
10
+ from typing import (Any, Callable, Dict, Iterable, List, Optional, Tuple, Type,
11
+ Union)
9
12
 
13
+ from sky import exceptions
10
14
  from sky import sky_logging
11
15
  from sky.skylet import constants
12
16
  from sky.skylet import log_lib
17
+ from sky.utils import auth_utils
13
18
  from sky.utils import common_utils
19
+ from sky.utils import context_utils
14
20
  from sky.utils import control_master_utils
21
+ from sky.utils import git as git_utils
15
22
  from sky.utils import subprocess_utils
16
23
  from sky.utils import timeline
17
24
 
18
25
  logger = sky_logging.init_logger(__name__)
19
26
 
27
+ # Pattern to extract home directory from command output
28
+ _HOME_DIR_PATTERN = re.compile(r'SKYPILOT_HOME_DIR: ([^\s\n]+)')
29
+
20
30
  # Rsync options
21
31
  # TODO(zhwu): This will print a per-file progress bar (with -P),
22
32
  # shooting a lot of messages to the output. --info=progress2 is used
@@ -36,6 +46,8 @@ RSYNC_FILTER_GITIGNORE = f'--filter=\'dir-merge,- {constants.GIT_IGNORE_FILE}\''
36
46
  # The git exclude file to support.
37
47
  GIT_EXCLUDE = '.git/info/exclude'
38
48
  RSYNC_EXCLUDE_OPTION = '--exclude-from={}'
49
+ # Owner and group metadata is not needed for downloads.
50
+ RSYNC_NO_OWNER_NO_GROUP_OPTION = '--no-owner --no-group'
39
51
 
40
52
  _HASH_MAX_LENGTH = 10
41
53
  _DEFAULT_CONNECT_TIMEOUT = 30
@@ -175,6 +187,28 @@ class CommandRunner:
175
187
  def node_id(self) -> str:
176
188
  return '-'.join(str(x) for x in self.node)
177
189
 
190
+ def _get_remote_home_dir(self) -> str:
191
+ # Use pattern matching to extract home directory.
192
+ # Some container images print MOTD when login shells start, which can
193
+ # contaminate command output. We use a unique pattern to extract the
194
+ # actual home directory reliably.
195
+ rc, output, stderr = self.run('echo "SKYPILOT_HOME_DIR: $(echo ~)"',
196
+ require_outputs=True,
197
+ separate_stderr=True,
198
+ stream_logs=False)
199
+ if rc != 0:
200
+ raise ValueError('Failed to get remote home directory: '
201
+ f'{output + stderr}')
202
+
203
+ # Extract home directory using pattern matching
204
+ home_dir_match = _HOME_DIR_PATTERN.search(output)
205
+ if home_dir_match:
206
+ remote_home_dir = home_dir_match.group(1)
207
+ else:
208
+ raise ValueError('Failed to find remote home directory identifier: '
209
+ f'{output + stderr}')
210
+ return remote_home_dir
211
+
178
212
  def _get_command_to_run(
179
213
  self,
180
214
  cmd: Union[str, List[str]],
@@ -182,6 +216,7 @@ class CommandRunner:
182
216
  separate_stderr: bool,
183
217
  skip_num_lines: int,
184
218
  source_bashrc: bool = False,
219
+ use_login: bool = True,
185
220
  ) -> str:
186
221
  """Returns the command to run."""
187
222
  if isinstance(cmd, list):
@@ -192,7 +227,7 @@ class CommandRunner:
192
227
  '/bin/bash',
193
228
  '--login',
194
229
  '-c',
195
- ]
230
+ ] if use_login else ['/bin/bash', '-c']
196
231
  if source_bashrc:
197
232
  command += [
198
233
  # Need this `-i` option to make sure `source ~/.bashrc` work.
@@ -226,13 +261,34 @@ class CommandRunner:
226
261
  command_str = ' '.join(command)
227
262
  return command_str
228
263
 
264
+ def _get_remote_home_dir_with_retry(
265
+ self,
266
+ max_retry: int,
267
+ get_remote_home_dir: Callable[[], str],
268
+ ) -> str:
269
+ """Returns the remote home directory with retry."""
270
+ backoff = common_utils.Backoff(initial_backoff=1, max_backoff_factor=5)
271
+ retries_left = max_retry
272
+ assert retries_left > 0, f'max_retry {max_retry} must be positive.'
273
+ while retries_left >= 0:
274
+ try:
275
+ return get_remote_home_dir()
276
+ except Exception: # pylint: disable=broad-except
277
+ if retries_left == 0:
278
+ raise
279
+ sleep_time = backoff.current_backoff()
280
+ logger.warning(f'Failed to get remote home dir '
281
+ f'- retrying in {sleep_time} seconds.')
282
+ retries_left -= 1
283
+ time.sleep(sleep_time)
284
+
229
285
  def _rsync(
230
286
  self,
231
287
  source: str,
232
288
  target: str,
233
- node_destination: str,
289
+ node_destination: Optional[str],
234
290
  up: bool,
235
- rsh_option: str,
291
+ rsh_option: Optional[str],
236
292
  # Advanced options.
237
293
  log_path: str = os.devnull,
238
294
  stream_logs: bool = True,
@@ -245,23 +301,8 @@ class CommandRunner:
245
301
  if prefix_command is not None:
246
302
  rsync_command.append(prefix_command)
247
303
  rsync_command += ['rsync', RSYNC_DISPLAY_OPTION]
248
-
249
- def _get_remote_home_dir_with_retry():
250
- backoff = common_utils.Backoff(initial_backoff=1,
251
- max_backoff_factor=5)
252
- retries_left = max_retry
253
- assert retries_left > 0, f'max_retry {max_retry} must be positive.'
254
- while retries_left >= 0:
255
- try:
256
- return get_remote_home_dir()
257
- except Exception: # pylint: disable=broad-except
258
- if retries_left == 0:
259
- raise
260
- sleep_time = backoff.current_backoff()
261
- logger.warning(f'Failed to get remote home dir '
262
- f'- retrying in {sleep_time} seconds.')
263
- retries_left -= 1
264
- time.sleep(sleep_time)
304
+ if not up:
305
+ rsync_command.append(RSYNC_NO_OWNER_NO_GROUP_OPTION)
265
306
 
266
307
  # --filter
267
308
  # The source is a local path, so we need to resolve it.
@@ -282,28 +323,47 @@ class CommandRunner:
282
323
  RSYNC_EXCLUDE_OPTION.format(
283
324
  shlex.quote(str(resolved_source / GIT_EXCLUDE))))
284
325
 
285
- rsync_command.append(f'-e {shlex.quote(rsh_option)}')
326
+ if rsh_option is not None:
327
+ rsync_command.append(f'-e {shlex.quote(rsh_option)}')
328
+ maybe_dest_prefix = ('' if node_destination is None else
329
+ f'{node_destination}:')
286
330
 
287
331
  if up:
288
332
  resolved_target = target
289
- if target.startswith('~'):
290
- remote_home_dir = _get_remote_home_dir_with_retry()
291
- resolved_target = target.replace('~', remote_home_dir)
333
+ if node_destination is None:
334
+ # Is a local rsync. Directly resolve the target.
335
+ resolved_target = str(
336
+ pathlib.Path(target).expanduser().resolve())
337
+ else:
338
+ if target.startswith('~'):
339
+ remote_home_dir = self._get_remote_home_dir_with_retry(
340
+ max_retry=max_retry,
341
+ get_remote_home_dir=get_remote_home_dir)
342
+ resolved_target = target.replace('~', remote_home_dir)
292
343
  full_source_str = str(resolved_source)
293
344
  if resolved_source.is_dir():
294
345
  full_source_str = os.path.join(full_source_str, '')
295
346
  rsync_command.extend([
296
347
  f'{full_source_str!r}',
297
- f'{node_destination}:{resolved_target!r}',
348
+ f'{maybe_dest_prefix}{resolved_target!r}',
298
349
  ])
299
350
  else:
300
351
  resolved_source = source
301
- if source.startswith('~'):
302
- remote_home_dir = _get_remote_home_dir_with_retry()
303
- resolved_source = source.replace('~', remote_home_dir)
352
+ if node_destination is None:
353
+ resolved_target = str(
354
+ pathlib.Path(target).expanduser().resolve())
355
+ resolved_source = str(
356
+ pathlib.Path(source).expanduser().resolve())
357
+ else:
358
+ resolved_target = os.path.expanduser(target)
359
+ if source.startswith('~'):
360
+ remote_home_dir = self._get_remote_home_dir_with_retry(
361
+ max_retry=max_retry,
362
+ get_remote_home_dir=get_remote_home_dir)
363
+ resolved_source = source.replace('~', remote_home_dir)
304
364
  rsync_command.extend([
305
- f'{node_destination}:{resolved_source!r}',
306
- f'{os.path.expanduser(target)!r}',
365
+ f'{maybe_dest_prefix}{resolved_source!r}',
366
+ f'{resolved_target!r}',
307
367
  ])
308
368
  command = ' '.join(rsync_command)
309
369
  logger.debug(f'Running rsync command: {command}')
@@ -367,7 +427,6 @@ class CommandRunner:
367
427
  SkyPilot but we still want to get rid of some warning messages,
368
428
  such as SSH warnings.
369
429
 
370
-
371
430
  Returns:
372
431
  returncode
373
432
  or
@@ -422,18 +481,120 @@ class CommandRunner:
422
481
  """Close the cached connection to the remote machine."""
423
482
  pass
424
483
 
425
- def port_forward_command(self,
426
- port_forward: List[Tuple[int, int]],
427
- connect_timeout: int = 1) -> List[str]:
484
+ def port_forward_command(
485
+ self,
486
+ port_forward: List[Tuple[int, int]],
487
+ connect_timeout: int = 1,
488
+ ssh_mode: SshMode = SshMode.INTERACTIVE) -> List[str]:
428
489
  """Command for forwarding ports from localhost to the remote machine.
429
490
 
430
491
  Args:
431
492
  port_forward: A list of ports to forward from the localhost to the
432
493
  remote host.
433
494
  connect_timeout: The timeout for the connection.
495
+ ssh_mode: The mode to use for ssh.
496
+ See SshMode for more details.
434
497
  """
435
498
  raise NotImplementedError
436
499
 
500
+ @timeline.event
501
+ def git_clone(
502
+ self,
503
+ target_dir: str,
504
+ *,
505
+ # Advanced options.
506
+ log_path: str = os.devnull,
507
+ stream_logs: bool = True,
508
+ connect_timeout: Optional[int] = None,
509
+ max_retry: int = 1,
510
+ envs_and_secrets: Optional[Dict[str, str]] = None,
511
+ ) -> None:
512
+ """Clones a Git repository on the remote machine using git_clone.sh.
513
+
514
+ Note: Git environment variables (GIT_URL, GIT_BRANCH, GIT_TOKEN, etc.)
515
+ must be set before calling this function.
516
+
517
+ Args:
518
+ target_dir: Target directory where the repository will be cloned.
519
+ log_path: Redirect stdout/stderr to the log_path.
520
+ stream_logs: Stream logs to the stdout/stderr.
521
+ connect_timeout: timeout in seconds for the connection.
522
+ max_retry: The maximum number of retries for the git clone command.
523
+ This value should be positive.
524
+ envs_and_secrets: Environment variables and secrets to be set
525
+ before running the script.
526
+ Raises:
527
+ exceptions.CommandError: git clone command failed.
528
+ """
529
+ # Find the git_clone.sh script path
530
+ git_clone_script_path = os.path.join(
531
+ os.path.dirname(os.path.abspath(__file__)), 'git_clone.sh')
532
+
533
+ if not os.path.exists(git_clone_script_path):
534
+ error_msg = f'git_clone.sh {git_clone_script_path} not found'
535
+ logger.error(error_msg)
536
+ raise exceptions.CommandError(1, '', error_msg, None)
537
+
538
+ # Remote script path (use a unique name to avoid conflicts)
539
+ script_hash = hashlib.md5(
540
+ f'{self.node_id}_{target_dir}'.encode()).hexdigest()[:8]
541
+ remote_script_path = f'/tmp/sky_git_clone_{script_hash}.sh'
542
+
543
+ # Step 1: Transfer the script to remote machine using rsync
544
+ logger.debug(
545
+ f'Transferring git_clone.sh to {self.node_id}:{remote_script_path}')
546
+ self.rsync(
547
+ source=git_clone_script_path,
548
+ target=remote_script_path,
549
+ up=True,
550
+ log_path=log_path,
551
+ stream_logs=False # Don't spam logs for script transfer
552
+ )
553
+
554
+ # Step 2: Execute the script on remote machine
555
+ if target_dir.startswith('~'):
556
+ remote_home_dir = self._get_remote_home_dir_with_retry(
557
+ max_retry=max_retry,
558
+ get_remote_home_dir=self._get_remote_home_dir)
559
+ target_dir = target_dir.replace('~', remote_home_dir)
560
+ quoted_target_dir = shlex.quote(target_dir)
561
+ quoted_script_path = shlex.quote(remote_script_path)
562
+ cmd = ''
563
+ log_cmd = ''
564
+ if envs_and_secrets:
565
+ for key, value in envs_and_secrets.items():
566
+ value = shlex.quote(value)
567
+ cmd += f'export {key}={value} && '
568
+ if (key == git_utils.GIT_TOKEN_ENV_VAR or
569
+ key == git_utils.GIT_SSH_KEY_ENV_VAR):
570
+ log_cmd += f'export {key}=******** && '
571
+ else:
572
+ log_cmd += f'export {key}={value} && '
573
+ exec_cmd = (f'bash {quoted_script_path} {quoted_target_dir} '
574
+ f'&& rm -f {quoted_script_path}')
575
+ cmd += exec_cmd
576
+ log_cmd += exec_cmd
577
+
578
+ logger.debug(f'Running git clone script on {self.node_id}: {log_cmd}')
579
+
580
+ backoff = common_utils.Backoff(initial_backoff=5, max_backoff_factor=5)
581
+ assert max_retry > 0, f'max_retry {max_retry} must be positive.'
582
+ while max_retry >= 0:
583
+ returncode = self.run(cmd,
584
+ log_path=log_path,
585
+ stream_logs=stream_logs,
586
+ connect_timeout=connect_timeout,
587
+ require_outputs=False)
588
+ if returncode == 0:
589
+ break
590
+ max_retry -= 1
591
+ time.sleep(backoff.current_backoff())
592
+
593
+ if returncode != 0:
594
+ error_msg = f'Git clone failed on {self.node_id}: {target_dir}'
595
+ logger.error(error_msg)
596
+ raise exceptions.CommandError(returncode, log_cmd, error_msg, None)
597
+
437
598
 
438
599
  class SSHCommandRunner(CommandRunner):
439
600
  """Runner for SSH commands."""
@@ -447,6 +608,7 @@ class SSHCommandRunner(CommandRunner):
447
608
  ssh_proxy_command: Optional[str] = None,
448
609
  docker_user: Optional[str] = None,
449
610
  disable_control_master: Optional[bool] = False,
611
+ port_forward_execute_remote_command: Optional[bool] = False,
450
612
  ):
451
613
  """Initialize SSHCommandRunner.
452
614
 
@@ -473,6 +635,10 @@ class SSHCommandRunner(CommandRunner):
473
635
  disable_control_master: bool; specifies whether or not the ssh
474
636
  command will utilize ControlMaster. We currently disable
475
637
  it for k8s instance.
638
+ port_forward_execute_remote_command: bool; if True, -N is omitted
639
+ from the port forwarding command (-L instead of -NL). This is useful if you
640
+ want to run a command on the remote machine to make sure the
641
+ SSH tunnel is established.
476
642
  """
477
643
  super().__init__(node)
478
644
  ip, port = node
@@ -484,39 +650,63 @@ class SSHCommandRunner(CommandRunner):
484
650
  self.disable_control_master = (
485
651
  disable_control_master or
486
652
  control_master_utils.should_disable_control_master())
653
+ # ensure the ssh key files are created from the database
654
+ auth_utils.create_ssh_key_files_from_db(ssh_private_key)
487
655
  if docker_user is not None:
488
656
  assert port is None or port == 22, (
489
657
  f'port must be None or 22 for docker_user, got {port}.')
490
- # Already checked in resources
491
- assert ssh_proxy_command is None, (
492
- 'ssh_proxy_command is not supported when using docker.')
658
+ # When connecting via docker, the outer SSH hop points to the
659
+ # container's sshd (localhost). Preserve the user proxy for the
660
+ # inner hop that reaches the host VM, and clear the outer proxy to
661
+ # avoid forwarding localhost through the jump host.
662
+ inner_proxy_command = ssh_proxy_command
663
+ inner_proxy_port = port or 22
664
+ self._ssh_proxy_command = None
493
665
  self.ip = 'localhost'
494
666
  self.ssh_user = docker_user
495
667
  self.port = constants.DEFAULT_DOCKER_PORT
668
+ if inner_proxy_command is not None:
669
+ # Replace %h/%p placeholders with actual host values, since the
670
+ # final destination from the perspective of the user proxy is
671
+ # the host VM (ip, inner_proxy_port).
672
+ inner_proxy_command = inner_proxy_command.replace('%h', ip)
673
+ inner_proxy_command = inner_proxy_command.replace(
674
+ '%p', str(inner_proxy_port))
496
675
  self._docker_ssh_proxy_command = lambda ssh: ' '.join(
497
- ssh + ssh_options_list(ssh_private_key, None
498
- ) + ['-W', '%h:%p', f'{ssh_user}@{ip}'])
676
+ ssh + ssh_options_list(ssh_private_key,
677
+ None,
678
+ ssh_proxy_command=inner_proxy_command,
679
+ port=inner_proxy_port,
680
+ disable_control_master=self.
681
+ disable_control_master) +
682
+ ['-W', '%h:%p', f'{ssh_user}@{ip}'])
499
683
  else:
500
684
  self.ip = ip
501
685
  self.ssh_user = ssh_user
502
686
  self.port = port
503
687
  self._docker_ssh_proxy_command = None
688
+ self.port_forward_execute_remote_command = (
689
+ port_forward_execute_remote_command)
504
690
 
505
- def port_forward_command(self,
506
- port_forward: List[Tuple[int, int]],
507
- connect_timeout: int = 1) -> List[str]:
691
+ def port_forward_command(
692
+ self,
693
+ port_forward: List[Tuple[int, int]],
694
+ connect_timeout: int = 1,
695
+ ssh_mode: SshMode = SshMode.INTERACTIVE) -> List[str]:
508
696
  """Command for forwarding ports from localhost to the remote machine.
509
697
 
510
698
  Args:
511
699
  port_forward: A list of ports to forward from the local port to the
512
700
  remote port.
513
701
  connect_timeout: The timeout for the ssh connection.
702
+ ssh_mode: The mode to use for ssh.
703
+ See SshMode for more details.
514
704
 
515
705
  Returns:
516
706
  The command for forwarding ports from localhost to the remote
517
707
  machine.
518
708
  """
519
- return self.ssh_base_command(ssh_mode=SshMode.INTERACTIVE,
709
+ return self.ssh_base_command(ssh_mode=ssh_mode,
520
710
  port_forward=port_forward,
521
711
  connect_timeout=connect_timeout)
522
712
 
@@ -533,9 +723,13 @@ class SSHCommandRunner(CommandRunner):
533
723
  ssh += ['-tt']
534
724
  if port_forward is not None:
535
725
  for local, remote in port_forward:
536
- logger.info(
726
+ logger.debug(
537
727
  f'Forwarding local port {local} to remote port {remote}.')
538
- ssh += ['-NL', f'{local}:localhost:{remote}']
728
+ if self.port_forward_execute_remote_command:
729
+ ssh += ['-L']
730
+ else:
731
+ ssh += ['-NL']
732
+ ssh += [f'{local}:localhost:{remote}']
539
733
  if self._docker_ssh_proxy_command is not None:
540
734
  docker_ssh_proxy_command = self._docker_ssh_proxy_command(ssh)
541
735
  else:
@@ -560,7 +754,7 @@ class SSHCommandRunner(CommandRunner):
560
754
  if self.ssh_control_name is not None:
561
755
  control_path = _ssh_control_path(self.ssh_control_name)
562
756
  if control_path is not None:
563
- # Suppress the `Exit request sent.` output for this comamnd
757
+ # Suppress the `Exit request sent.` output for this command
564
758
  # which would interrupt the CLI spinner.
565
759
  cmd = (f'ssh -O exit -S {control_path}/%C '
566
760
  f'{self.ssh_user}@{self.ip} > /dev/null 2>&1')
@@ -574,6 +768,7 @@ class SSHCommandRunner(CommandRunner):
574
768
  shell=True)
575
769
 
576
770
  @timeline.event
771
+ @context_utils.cancellation_guard
577
772
  def run(
578
773
  self,
579
774
  cmd: Union[str, List[str]],
@@ -748,9 +943,11 @@ class KubernetesCommandRunner(CommandRunner):
748
943
  else:
749
944
  return f'pod/{self.pod_name}'
750
945
 
751
- def port_forward_command(self,
752
- port_forward: List[Tuple[int, int]],
753
- connect_timeout: int = 1) -> List[str]:
946
+ def port_forward_command(
947
+ self,
948
+ port_forward: List[Tuple[int, int]],
949
+ connect_timeout: int = 1,
950
+ ssh_mode: SshMode = SshMode.INTERACTIVE) -> List[str]:
754
951
  """Command for forwarding ports from localhost to the remote machine.
755
952
 
756
953
  Args:
@@ -758,14 +955,25 @@ class KubernetesCommandRunner(CommandRunner):
758
955
  remote port. Currently, only one port is supported, i.e. the
759
956
  list should have only one element.
760
957
  connect_timeout: The timeout for the ssh connection.
958
+ ssh_mode: The mode to use for ssh.
959
+ See SshMode for more details.
761
960
  """
961
+ del ssh_mode # unused
762
962
  assert port_forward and len(port_forward) == 1, (
763
963
  'Only one port is supported for Kubernetes port-forward.')
764
964
  kubectl_args = [
765
965
  '--pod-running-timeout', f'{connect_timeout}s', '-n', self.namespace
766
966
  ]
967
+ # The same logic to either set `--context` to the k8s context where
968
+ # the sky cluster is hosted, or `--kubeconfig` to /dev/null for
969
+ # in-cluster k8s is used below in the `run()` method.
767
970
  if self.context:
768
971
  kubectl_args += ['--context', self.context]
972
+ # If context is none, it means the cluster is hosted on in-cluster k8s.
973
+ # In this case, we need to set KUBECONFIG to /dev/null to avoid looking
974
+ # for the cluster in whatever active context is set in the kubeconfig.
975
+ else:
976
+ kubectl_args += ['--kubeconfig', '/dev/null']
769
977
  local_port, remote_port = port_forward[0]
770
978
  local_port_str = f'{local_port}' if local_port is not None else ''
771
979
 
@@ -779,6 +987,7 @@ class KubernetesCommandRunner(CommandRunner):
779
987
  return kubectl_cmd
780
988
 
781
989
  @timeline.event
990
+ @context_utils.cancellation_guard
782
991
  def run(
783
992
  self,
784
993
  cmd: Union[str, List[str]],
@@ -820,7 +1029,6 @@ class KubernetesCommandRunner(CommandRunner):
820
1029
  SkyPilot but we still want to get rid of some warning messages,
821
1030
  such as SSH warnings.
822
1031
 
823
-
824
1032
  Returns:
825
1033
  returncode
826
1034
  or
@@ -922,23 +1130,10 @@ class KubernetesCommandRunner(CommandRunner):
922
1130
  exceptions.CommandError: rsync command failed.
923
1131
  """
924
1132
 
925
- def get_remote_home_dir() -> str:
926
- # Use `echo ~` to get the remote home directory, instead of pwd or
927
- # echo $HOME, because pwd can be `/` when the remote user is root
928
- # and $HOME is not always set.
929
- rc, remote_home_dir, stderr = self.run('echo ~',
930
- require_outputs=True,
931
- separate_stderr=True,
932
- stream_logs=False)
933
- if rc != 0:
934
- raise ValueError('Failed to get remote home directory: '
935
- f'{remote_home_dir + stderr}')
936
- remote_home_dir = remote_home_dir.strip()
937
- return remote_home_dir
938
-
939
1133
  # Build command.
940
- helper_path = os.path.join(os.path.abspath(os.path.dirname(__file__)),
941
- 'kubernetes', 'rsync_helper.sh')
1134
+ helper_path = shlex.quote(
1135
+ os.path.join(os.path.abspath(os.path.dirname(__file__)),
1136
+ 'kubernetes', 'rsync_helper.sh'))
942
1137
  namespace_context = f'{self.namespace}+{self.context}'
943
1138
  # Avoid rsync interpreting :, /, and + in namespace_context as the
944
1139
  # default delimiter for options and arguments.
@@ -960,4 +1155,95 @@ class KubernetesCommandRunner(CommandRunner):
960
1155
  # rsync with `kubectl` as the rsh command will cause ~/xx parsed as
961
1156
  # /~/xx, so we need to replace ~ with the remote home directory. We
962
1157
  # only need to do this when ~ is at the beginning of the path.
963
- get_remote_home_dir=get_remote_home_dir)
1158
+ get_remote_home_dir=self._get_remote_home_dir)
1159
+
1160
+
1161
+ class LocalProcessCommandRunner(CommandRunner):
1162
+ """Runner for local process commands."""
1163
+
1164
+ def __init__(self):
1165
+ super().__init__('local')
1166
+
1167
+ @timeline.event
1168
+ @context_utils.cancellation_guard
1169
+ def run(
1170
+ self,
1171
+ cmd: Union[str, List[str]],
1172
+ *,
1173
+ require_outputs: bool = False,
1174
+ port_forward: Optional[List[Tuple[int, int]]] = None,
1175
+ # Advanced options.
1176
+ log_path: str = os.devnull,
1177
+ # If False, do not redirect stdout/stderr to optimize performance.
1178
+ process_stream: bool = True,
1179
+ stream_logs: bool = True,
1180
+ ssh_mode: SshMode = SshMode.NON_INTERACTIVE,
1181
+ separate_stderr: bool = False,
1182
+ connect_timeout: Optional[int] = None,
1183
+ source_bashrc: bool = False,
1184
+ skip_num_lines: int = 0,
1185
+ **kwargs) -> Union[int, Tuple[int, str, str]]:
1186
+ """Use subprocess to run the command."""
1187
+ del port_forward, ssh_mode, connect_timeout # Unused.
1188
+
1189
+ command_str = self._get_command_to_run(cmd,
1190
+ process_stream,
1191
+ separate_stderr,
1192
+ skip_num_lines=skip_num_lines,
1193
+ source_bashrc=source_bashrc,
1194
+ use_login=False)
1195
+
1196
+ log_dir = os.path.expanduser(os.path.dirname(log_path))
1197
+ os.makedirs(log_dir, exist_ok=True)
1198
+
1199
+ executable = None
1200
+ command = [command_str]
1201
+ if not process_stream:
1202
+ if stream_logs:
1203
+ command += [
1204
+ f'| tee {log_path}',
1205
+ # This also requires the executor to be '/bin/bash' instead
1206
+ # of the default '/bin/sh'.
1207
+ '; exit ${PIPESTATUS[0]}'
1208
+ ]
1209
+ else:
1210
+ command += [f'> {log_path}']
1211
+ executable = '/bin/bash'
1212
+ command_str = ' '.join(command)
1213
+ # For local process, the API server might not have this python path
1214
+ # set up. But this command runner should only be triggered from the API
1215
+ # server (in controller consolidation mode), so we can safely replace
1216
+ # the python path with the executable of the API server.
1217
+ command_str = command_str.replace(constants.SKY_PYTHON_CMD,
1218
+ sys.executable)
1219
+ logger.debug(f'Running command locally: {command_str}')
1220
+ return log_lib.run_with_log(command_str,
1221
+ log_path,
1222
+ require_outputs=require_outputs,
1223
+ stream_logs=stream_logs,
1224
+ process_stream=process_stream,
1225
+ shell=True,
1226
+ executable=executable,
1227
+ **kwargs)
1228
+
1229
+ @timeline.event
1230
+ def rsync(
1231
+ self,
1232
+ source: str,
1233
+ target: str,
1234
+ *,
1235
+ up: bool,
1236
+ # Advanced options.
1237
+ log_path: str = os.devnull,
1238
+ stream_logs: bool = True,
1239
+ max_retry: int = 1,
1240
+ ) -> None:
1241
+ """Use rsync to sync the source to the target."""
1242
+ self._rsync(source,
1243
+ target,
1244
+ node_destination=None,
1245
+ up=up,
1246
+ rsh_option=None,
1247
+ log_path=log_path,
1248
+ stream_logs=stream_logs,
1249
+ max_retry=max_retry)