skypilot-nightly 1.0.0.dev20250509__py3-none-any.whl → 1.0.0.dev20251107__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of skypilot-nightly might be problematic. Click here for more details.

Files changed (512)
  1. sky/__init__.py +22 -6
  2. sky/adaptors/aws.py +25 -7
  3. sky/adaptors/common.py +24 -1
  4. sky/adaptors/coreweave.py +278 -0
  5. sky/adaptors/do.py +8 -2
  6. sky/adaptors/hyperbolic.py +8 -0
  7. sky/adaptors/kubernetes.py +149 -18
  8. sky/adaptors/nebius.py +170 -17
  9. sky/adaptors/primeintellect.py +1 -0
  10. sky/adaptors/runpod.py +68 -0
  11. sky/adaptors/seeweb.py +167 -0
  12. sky/adaptors/shadeform.py +89 -0
  13. sky/admin_policy.py +187 -4
  14. sky/authentication.py +179 -225
  15. sky/backends/__init__.py +4 -2
  16. sky/backends/backend.py +22 -9
  17. sky/backends/backend_utils.py +1299 -380
  18. sky/backends/cloud_vm_ray_backend.py +1715 -518
  19. sky/backends/docker_utils.py +1 -1
  20. sky/backends/local_docker_backend.py +11 -6
  21. sky/backends/wheel_utils.py +37 -9
  22. sky/{clouds/service_catalog → catalog}/__init__.py +21 -19
  23. sky/{clouds/service_catalog → catalog}/aws_catalog.py +27 -8
  24. sky/{clouds/service_catalog → catalog}/azure_catalog.py +10 -7
  25. sky/{clouds/service_catalog → catalog}/common.py +89 -48
  26. sky/{clouds/service_catalog → catalog}/cudo_catalog.py +8 -5
  27. sky/{clouds/service_catalog → catalog}/data_fetchers/analyze.py +1 -1
  28. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_aws.py +30 -40
  29. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_cudo.py +38 -38
  30. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_gcp.py +42 -15
  31. sky/catalog/data_fetchers/fetch_hyperbolic.py +136 -0
  32. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_lambda_cloud.py +1 -0
  33. sky/catalog/data_fetchers/fetch_nebius.py +335 -0
  34. sky/catalog/data_fetchers/fetch_runpod.py +698 -0
  35. sky/catalog/data_fetchers/fetch_seeweb.py +329 -0
  36. sky/catalog/data_fetchers/fetch_shadeform.py +142 -0
  37. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_vast.py +1 -1
  38. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_vsphere.py +1 -1
  39. sky/{clouds/service_catalog → catalog}/do_catalog.py +5 -2
  40. sky/{clouds/service_catalog → catalog}/fluidstack_catalog.py +6 -3
  41. sky/{clouds/service_catalog → catalog}/gcp_catalog.py +41 -15
  42. sky/catalog/hyperbolic_catalog.py +136 -0
  43. sky/{clouds/service_catalog → catalog}/ibm_catalog.py +9 -6
  44. sky/{clouds/service_catalog → catalog}/kubernetes_catalog.py +36 -24
  45. sky/{clouds/service_catalog → catalog}/lambda_catalog.py +9 -6
  46. sky/{clouds/service_catalog → catalog}/nebius_catalog.py +9 -7
  47. sky/{clouds/service_catalog → catalog}/oci_catalog.py +9 -6
  48. sky/{clouds/service_catalog → catalog}/paperspace_catalog.py +5 -2
  49. sky/catalog/primeintellect_catalog.py +95 -0
  50. sky/{clouds/service_catalog → catalog}/runpod_catalog.py +11 -4
  51. sky/{clouds/service_catalog → catalog}/scp_catalog.py +9 -6
  52. sky/catalog/seeweb_catalog.py +184 -0
  53. sky/catalog/shadeform_catalog.py +165 -0
  54. sky/catalog/ssh_catalog.py +167 -0
  55. sky/{clouds/service_catalog → catalog}/vast_catalog.py +6 -3
  56. sky/{clouds/service_catalog → catalog}/vsphere_catalog.py +5 -2
  57. sky/check.py +491 -203
  58. sky/cli.py +5 -6005
  59. sky/client/{cli.py → cli/command.py} +2477 -1885
  60. sky/client/cli/deprecation_utils.py +99 -0
  61. sky/client/cli/flags.py +359 -0
  62. sky/client/cli/table_utils.py +320 -0
  63. sky/client/common.py +70 -32
  64. sky/client/oauth.py +82 -0
  65. sky/client/sdk.py +1203 -297
  66. sky/client/sdk_async.py +833 -0
  67. sky/client/service_account_auth.py +47 -0
  68. sky/cloud_stores.py +73 -0
  69. sky/clouds/__init__.py +13 -0
  70. sky/clouds/aws.py +358 -93
  71. sky/clouds/azure.py +105 -83
  72. sky/clouds/cloud.py +127 -36
  73. sky/clouds/cudo.py +68 -50
  74. sky/clouds/do.py +66 -48
  75. sky/clouds/fluidstack.py +63 -44
  76. sky/clouds/gcp.py +339 -110
  77. sky/clouds/hyperbolic.py +293 -0
  78. sky/clouds/ibm.py +70 -49
  79. sky/clouds/kubernetes.py +563 -162
  80. sky/clouds/lambda_cloud.py +74 -54
  81. sky/clouds/nebius.py +206 -80
  82. sky/clouds/oci.py +88 -66
  83. sky/clouds/paperspace.py +61 -44
  84. sky/clouds/primeintellect.py +317 -0
  85. sky/clouds/runpod.py +164 -74
  86. sky/clouds/scp.py +89 -83
  87. sky/clouds/seeweb.py +466 -0
  88. sky/clouds/shadeform.py +400 -0
  89. sky/clouds/ssh.py +263 -0
  90. sky/clouds/utils/aws_utils.py +10 -4
  91. sky/clouds/utils/gcp_utils.py +87 -11
  92. sky/clouds/utils/oci_utils.py +38 -14
  93. sky/clouds/utils/scp_utils.py +177 -124
  94. sky/clouds/vast.py +99 -77
  95. sky/clouds/vsphere.py +51 -40
  96. sky/core.py +349 -139
  97. sky/dag.py +15 -0
  98. sky/dashboard/out/404.html +1 -1
  99. sky/dashboard/out/_next/static/chunks/1141-e6aa9ab418717c59.js +11 -0
  100. sky/dashboard/out/_next/static/chunks/1272-1ef0bf0237faccdb.js +1 -0
  101. sky/dashboard/out/_next/static/chunks/1871-74503c8e80fd253b.js +6 -0
  102. sky/dashboard/out/_next/static/chunks/2260-7703229c33c5ebd5.js +1 -0
  103. sky/dashboard/out/_next/static/chunks/2350.fab69e61bac57b23.js +1 -0
  104. sky/dashboard/out/_next/static/chunks/2369.fc20f0c2c8ed9fe7.js +15 -0
  105. sky/dashboard/out/_next/static/chunks/2755.fff53c4a3fcae910.js +26 -0
  106. sky/dashboard/out/_next/static/chunks/3294.72362fa129305b19.js +1 -0
  107. sky/dashboard/out/_next/static/chunks/3785.ad6adaa2a0fa9768.js +1 -0
  108. sky/dashboard/out/_next/static/chunks/3850-ff4a9a69d978632b.js +1 -0
  109. sky/dashboard/out/_next/static/chunks/3937.210053269f121201.js +1 -0
  110. sky/dashboard/out/_next/static/chunks/4725.a830b5c9e7867c92.js +1 -0
  111. sky/dashboard/out/_next/static/chunks/4937.a2baa2df5572a276.js +15 -0
  112. sky/dashboard/out/_next/static/chunks/5739-d67458fcb1386c92.js +8 -0
  113. sky/dashboard/out/_next/static/chunks/6130-2be46d70a38f1e82.js +1 -0
  114. sky/dashboard/out/_next/static/chunks/616-3d59f75e2ccf9321.js +39 -0
  115. sky/dashboard/out/_next/static/chunks/6212-7bd06f60ba693125.js +13 -0
  116. sky/dashboard/out/_next/static/chunks/6601-06114c982db410b6.js +1 -0
  117. sky/dashboard/out/_next/static/chunks/6856-ef8ba11f96d8c4a3.js +1 -0
  118. sky/dashboard/out/_next/static/chunks/6989-01359c57e018caa4.js +1 -0
  119. sky/dashboard/out/_next/static/chunks/6990-32b6e2d3822301fa.js +1 -0
  120. sky/dashboard/out/_next/static/chunks/7359-c8d04e06886000b3.js +30 -0
  121. sky/dashboard/out/_next/static/chunks/7411-b15471acd2cba716.js +41 -0
  122. sky/dashboard/out/_next/static/chunks/7615-3301e838e5f25772.js +1 -0
  123. sky/dashboard/out/_next/static/chunks/8640.5b9475a2d18c5416.js +16 -0
  124. sky/dashboard/out/_next/static/chunks/8969-1e4613c651bf4051.js +1 -0
  125. sky/dashboard/out/_next/static/chunks/9025.fa408f3242e9028d.js +6 -0
  126. sky/dashboard/out/_next/static/chunks/9353-cff34f7e773b2e2b.js +1 -0
  127. sky/dashboard/out/_next/static/chunks/9360.7310982cf5a0dc79.js +31 -0
  128. sky/dashboard/out/_next/static/chunks/9847.3aaca6bb33455140.js +30 -0
  129. sky/dashboard/out/_next/static/chunks/fd9d1056-86323a29a8f7e46a.js +1 -0
  130. sky/dashboard/out/_next/static/chunks/framework-cf60a09ccd051a10.js +33 -0
  131. sky/dashboard/out/_next/static/chunks/main-app-587214043926b3cc.js +1 -0
  132. sky/dashboard/out/_next/static/chunks/main-f15ccb73239a3bf1.js +1 -0
  133. sky/dashboard/out/_next/static/chunks/pages/_app-bde01e4a2beec258.js +34 -0
  134. sky/dashboard/out/_next/static/chunks/pages/_error-c66a4e8afc46f17b.js +1 -0
  135. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-c736ead69c2d86ec.js +16 -0
  136. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-a37d2063af475a1c.js +1 -0
  137. sky/dashboard/out/_next/static/chunks/pages/clusters-d44859594e6f8064.js +1 -0
  138. sky/dashboard/out/_next/static/chunks/pages/config-dfb9bf07b13045f4.js +1 -0
  139. sky/dashboard/out/_next/static/chunks/pages/index-444f1804401f04ea.js +1 -0
  140. sky/dashboard/out/_next/static/chunks/pages/infra/[context]-c0b5935149902e6f.js +1 -0
  141. sky/dashboard/out/_next/static/chunks/pages/infra-aed0ea19df7cf961.js +1 -0
  142. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-5796e8d6aea291a0.js +16 -0
  143. sky/dashboard/out/_next/static/chunks/pages/jobs/pools/[pool]-6edeb7d06032adfc.js +21 -0
  144. sky/dashboard/out/_next/static/chunks/pages/jobs-479dde13399cf270.js +1 -0
  145. sky/dashboard/out/_next/static/chunks/pages/users-5ab3b907622cf0fe.js +1 -0
  146. sky/dashboard/out/_next/static/chunks/pages/volumes-b84b948ff357c43e.js +1 -0
  147. sky/dashboard/out/_next/static/chunks/pages/workspace/new-3f88a1c7e86a3f86.js +1 -0
  148. sky/dashboard/out/_next/static/chunks/pages/workspaces/[name]-c5a3eeee1c218af1.js +1 -0
  149. sky/dashboard/out/_next/static/chunks/pages/workspaces-22b23febb3e89ce1.js +1 -0
  150. sky/dashboard/out/_next/static/chunks/webpack-2679be77fc08a2f8.js +1 -0
  151. sky/dashboard/out/_next/static/css/0748ce22df867032.css +3 -0
  152. sky/dashboard/out/_next/static/zB0ed6ge_W1MDszVHhijS/_buildManifest.js +1 -0
  153. sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
  154. sky/dashboard/out/clusters/[cluster].html +1 -1
  155. sky/dashboard/out/clusters.html +1 -1
  156. sky/dashboard/out/config.html +1 -0
  157. sky/dashboard/out/index.html +1 -1
  158. sky/dashboard/out/infra/[context].html +1 -0
  159. sky/dashboard/out/infra.html +1 -0
  160. sky/dashboard/out/jobs/[job].html +1 -1
  161. sky/dashboard/out/jobs/pools/[pool].html +1 -0
  162. sky/dashboard/out/jobs.html +1 -1
  163. sky/dashboard/out/users.html +1 -0
  164. sky/dashboard/out/volumes.html +1 -0
  165. sky/dashboard/out/workspace/new.html +1 -0
  166. sky/dashboard/out/workspaces/[name].html +1 -0
  167. sky/dashboard/out/workspaces.html +1 -0
  168. sky/data/data_utils.py +137 -1
  169. sky/data/mounting_utils.py +269 -84
  170. sky/data/storage.py +1451 -1807
  171. sky/data/storage_utils.py +43 -57
  172. sky/exceptions.py +132 -2
  173. sky/execution.py +206 -63
  174. sky/global_user_state.py +2374 -586
  175. sky/jobs/__init__.py +5 -0
  176. sky/jobs/client/sdk.py +242 -65
  177. sky/jobs/client/sdk_async.py +143 -0
  178. sky/jobs/constants.py +9 -8
  179. sky/jobs/controller.py +839 -277
  180. sky/jobs/file_content_utils.py +80 -0
  181. sky/jobs/log_gc.py +201 -0
  182. sky/jobs/recovery_strategy.py +398 -152
  183. sky/jobs/scheduler.py +315 -189
  184. sky/jobs/server/core.py +829 -255
  185. sky/jobs/server/server.py +156 -115
  186. sky/jobs/server/utils.py +136 -0
  187. sky/jobs/state.py +2092 -701
  188. sky/jobs/utils.py +1242 -160
  189. sky/logs/__init__.py +21 -0
  190. sky/logs/agent.py +108 -0
  191. sky/logs/aws.py +243 -0
  192. sky/logs/gcp.py +91 -0
  193. sky/metrics/__init__.py +0 -0
  194. sky/metrics/utils.py +443 -0
  195. sky/models.py +78 -1
  196. sky/optimizer.py +164 -70
  197. sky/provision/__init__.py +90 -4
  198. sky/provision/aws/config.py +147 -26
  199. sky/provision/aws/instance.py +135 -50
  200. sky/provision/azure/instance.py +10 -5
  201. sky/provision/common.py +13 -1
  202. sky/provision/cudo/cudo_machine_type.py +1 -1
  203. sky/provision/cudo/cudo_utils.py +14 -8
  204. sky/provision/cudo/cudo_wrapper.py +72 -71
  205. sky/provision/cudo/instance.py +10 -6
  206. sky/provision/do/instance.py +10 -6
  207. sky/provision/do/utils.py +4 -3
  208. sky/provision/docker_utils.py +114 -23
  209. sky/provision/fluidstack/instance.py +13 -8
  210. sky/provision/gcp/__init__.py +1 -0
  211. sky/provision/gcp/config.py +301 -19
  212. sky/provision/gcp/constants.py +218 -0
  213. sky/provision/gcp/instance.py +36 -8
  214. sky/provision/gcp/instance_utils.py +18 -4
  215. sky/provision/gcp/volume_utils.py +247 -0
  216. sky/provision/hyperbolic/__init__.py +12 -0
  217. sky/provision/hyperbolic/config.py +10 -0
  218. sky/provision/hyperbolic/instance.py +437 -0
  219. sky/provision/hyperbolic/utils.py +373 -0
  220. sky/provision/instance_setup.py +93 -14
  221. sky/provision/kubernetes/__init__.py +5 -0
  222. sky/provision/kubernetes/config.py +9 -52
  223. sky/provision/kubernetes/constants.py +17 -0
  224. sky/provision/kubernetes/instance.py +789 -247
  225. sky/provision/kubernetes/manifests/fusermount-server-daemonset.yaml +1 -2
  226. sky/provision/kubernetes/network.py +27 -17
  227. sky/provision/kubernetes/network_utils.py +40 -43
  228. sky/provision/kubernetes/utils.py +1192 -531
  229. sky/provision/kubernetes/volume.py +282 -0
  230. sky/provision/lambda_cloud/instance.py +22 -16
  231. sky/provision/nebius/constants.py +50 -0
  232. sky/provision/nebius/instance.py +19 -6
  233. sky/provision/nebius/utils.py +196 -91
  234. sky/provision/oci/instance.py +10 -5
  235. sky/provision/paperspace/instance.py +10 -7
  236. sky/provision/paperspace/utils.py +1 -1
  237. sky/provision/primeintellect/__init__.py +10 -0
  238. sky/provision/primeintellect/config.py +11 -0
  239. sky/provision/primeintellect/instance.py +454 -0
  240. sky/provision/primeintellect/utils.py +398 -0
  241. sky/provision/provisioner.py +110 -36
  242. sky/provision/runpod/__init__.py +5 -0
  243. sky/provision/runpod/instance.py +27 -6
  244. sky/provision/runpod/utils.py +51 -18
  245. sky/provision/runpod/volume.py +180 -0
  246. sky/provision/scp/__init__.py +15 -0
  247. sky/provision/scp/config.py +93 -0
  248. sky/provision/scp/instance.py +531 -0
  249. sky/provision/seeweb/__init__.py +11 -0
  250. sky/provision/seeweb/config.py +13 -0
  251. sky/provision/seeweb/instance.py +807 -0
  252. sky/provision/shadeform/__init__.py +11 -0
  253. sky/provision/shadeform/config.py +12 -0
  254. sky/provision/shadeform/instance.py +351 -0
  255. sky/provision/shadeform/shadeform_utils.py +83 -0
  256. sky/provision/ssh/__init__.py +18 -0
  257. sky/provision/vast/instance.py +13 -8
  258. sky/provision/vast/utils.py +10 -7
  259. sky/provision/vsphere/common/vim_utils.py +1 -2
  260. sky/provision/vsphere/instance.py +15 -10
  261. sky/provision/vsphere/vsphere_utils.py +9 -19
  262. sky/py.typed +0 -0
  263. sky/resources.py +844 -118
  264. sky/schemas/__init__.py +0 -0
  265. sky/schemas/api/__init__.py +0 -0
  266. sky/schemas/api/responses.py +225 -0
  267. sky/schemas/db/README +4 -0
  268. sky/schemas/db/env.py +90 -0
  269. sky/schemas/db/global_user_state/001_initial_schema.py +124 -0
  270. sky/schemas/db/global_user_state/002_add_workspace_to_cluster_history.py +35 -0
  271. sky/schemas/db/global_user_state/003_fix_initial_revision.py +61 -0
  272. sky/schemas/db/global_user_state/004_is_managed.py +34 -0
  273. sky/schemas/db/global_user_state/005_cluster_event.py +32 -0
  274. sky/schemas/db/global_user_state/006_provision_log.py +41 -0
  275. sky/schemas/db/global_user_state/007_cluster_event_request_id.py +34 -0
  276. sky/schemas/db/global_user_state/008_skylet_ssh_tunnel_metadata.py +34 -0
  277. sky/schemas/db/global_user_state/009_last_activity_and_launched_at.py +89 -0
  278. sky/schemas/db/global_user_state/010_save_ssh_key.py +66 -0
  279. sky/schemas/db/script.py.mako +28 -0
  280. sky/schemas/db/serve_state/001_initial_schema.py +67 -0
  281. sky/schemas/db/skypilot_config/001_initial_schema.py +30 -0
  282. sky/schemas/db/spot_jobs/001_initial_schema.py +97 -0
  283. sky/schemas/db/spot_jobs/002_cluster_pool.py +42 -0
  284. sky/schemas/db/spot_jobs/003_pool_hash.py +34 -0
  285. sky/schemas/db/spot_jobs/004_job_file_contents.py +42 -0
  286. sky/schemas/db/spot_jobs/005_logs_gc.py +38 -0
  287. sky/schemas/generated/__init__.py +0 -0
  288. sky/schemas/generated/autostopv1_pb2.py +36 -0
  289. sky/schemas/generated/autostopv1_pb2.pyi +43 -0
  290. sky/schemas/generated/autostopv1_pb2_grpc.py +146 -0
  291. sky/schemas/generated/jobsv1_pb2.py +86 -0
  292. sky/schemas/generated/jobsv1_pb2.pyi +254 -0
  293. sky/schemas/generated/jobsv1_pb2_grpc.py +542 -0
  294. sky/schemas/generated/managed_jobsv1_pb2.py +74 -0
  295. sky/schemas/generated/managed_jobsv1_pb2.pyi +278 -0
  296. sky/schemas/generated/managed_jobsv1_pb2_grpc.py +278 -0
  297. sky/schemas/generated/servev1_pb2.py +58 -0
  298. sky/schemas/generated/servev1_pb2.pyi +115 -0
  299. sky/schemas/generated/servev1_pb2_grpc.py +322 -0
  300. sky/serve/autoscalers.py +357 -5
  301. sky/serve/client/impl.py +310 -0
  302. sky/serve/client/sdk.py +47 -139
  303. sky/serve/client/sdk_async.py +130 -0
  304. sky/serve/constants.py +10 -8
  305. sky/serve/controller.py +64 -19
  306. sky/serve/load_balancer.py +106 -60
  307. sky/serve/load_balancing_policies.py +115 -1
  308. sky/serve/replica_managers.py +273 -162
  309. sky/serve/serve_rpc_utils.py +179 -0
  310. sky/serve/serve_state.py +554 -251
  311. sky/serve/serve_utils.py +733 -220
  312. sky/serve/server/core.py +66 -711
  313. sky/serve/server/impl.py +1093 -0
  314. sky/serve/server/server.py +21 -18
  315. sky/serve/service.py +133 -48
  316. sky/serve/service_spec.py +135 -16
  317. sky/serve/spot_placer.py +3 -0
  318. sky/server/auth/__init__.py +0 -0
  319. sky/server/auth/authn.py +50 -0
  320. sky/server/auth/loopback.py +38 -0
  321. sky/server/auth/oauth2_proxy.py +200 -0
  322. sky/server/common.py +475 -181
  323. sky/server/config.py +81 -23
  324. sky/server/constants.py +44 -6
  325. sky/server/daemons.py +229 -0
  326. sky/server/html/token_page.html +185 -0
  327. sky/server/metrics.py +160 -0
  328. sky/server/requests/executor.py +528 -138
  329. sky/server/requests/payloads.py +351 -17
  330. sky/server/requests/preconditions.py +21 -17
  331. sky/server/requests/process.py +112 -29
  332. sky/server/requests/request_names.py +120 -0
  333. sky/server/requests/requests.py +817 -224
  334. sky/server/requests/serializers/decoders.py +82 -31
  335. sky/server/requests/serializers/encoders.py +140 -22
  336. sky/server/requests/threads.py +106 -0
  337. sky/server/rest.py +417 -0
  338. sky/server/server.py +1290 -284
  339. sky/server/state.py +20 -0
  340. sky/server/stream_utils.py +345 -57
  341. sky/server/uvicorn.py +217 -3
  342. sky/server/versions.py +270 -0
  343. sky/setup_files/MANIFEST.in +5 -0
  344. sky/setup_files/alembic.ini +156 -0
  345. sky/setup_files/dependencies.py +136 -31
  346. sky/setup_files/setup.py +44 -42
  347. sky/sky_logging.py +102 -5
  348. sky/skylet/attempt_skylet.py +1 -0
  349. sky/skylet/autostop_lib.py +129 -8
  350. sky/skylet/configs.py +27 -20
  351. sky/skylet/constants.py +171 -19
  352. sky/skylet/events.py +105 -21
  353. sky/skylet/job_lib.py +335 -104
  354. sky/skylet/log_lib.py +297 -18
  355. sky/skylet/log_lib.pyi +44 -1
  356. sky/skylet/ray_patches/__init__.py +17 -3
  357. sky/skylet/ray_patches/autoscaler.py.diff +18 -0
  358. sky/skylet/ray_patches/cli.py.diff +19 -0
  359. sky/skylet/ray_patches/command_runner.py.diff +17 -0
  360. sky/skylet/ray_patches/log_monitor.py.diff +20 -0
  361. sky/skylet/ray_patches/resource_demand_scheduler.py.diff +32 -0
  362. sky/skylet/ray_patches/updater.py.diff +18 -0
  363. sky/skylet/ray_patches/worker.py.diff +41 -0
  364. sky/skylet/services.py +564 -0
  365. sky/skylet/skylet.py +63 -4
  366. sky/skylet/subprocess_daemon.py +103 -29
  367. sky/skypilot_config.py +506 -99
  368. sky/ssh_node_pools/__init__.py +1 -0
  369. sky/ssh_node_pools/core.py +135 -0
  370. sky/ssh_node_pools/server.py +233 -0
  371. sky/task.py +621 -137
  372. sky/templates/aws-ray.yml.j2 +10 -3
  373. sky/templates/azure-ray.yml.j2 +1 -1
  374. sky/templates/do-ray.yml.j2 +1 -1
  375. sky/templates/gcp-ray.yml.j2 +57 -0
  376. sky/templates/hyperbolic-ray.yml.j2 +67 -0
  377. sky/templates/jobs-controller.yaml.j2 +27 -24
  378. sky/templates/kubernetes-loadbalancer.yml.j2 +2 -0
  379. sky/templates/kubernetes-ray.yml.j2 +607 -51
  380. sky/templates/lambda-ray.yml.j2 +1 -1
  381. sky/templates/nebius-ray.yml.j2 +33 -12
  382. sky/templates/paperspace-ray.yml.j2 +1 -1
  383. sky/templates/primeintellect-ray.yml.j2 +71 -0
  384. sky/templates/runpod-ray.yml.j2 +9 -1
  385. sky/templates/scp-ray.yml.j2 +3 -50
  386. sky/templates/seeweb-ray.yml.j2 +108 -0
  387. sky/templates/shadeform-ray.yml.j2 +72 -0
  388. sky/templates/sky-serve-controller.yaml.j2 +22 -2
  389. sky/templates/websocket_proxy.py +178 -18
  390. sky/usage/usage_lib.py +18 -11
  391. sky/users/__init__.py +0 -0
  392. sky/users/model.conf +15 -0
  393. sky/users/permission.py +387 -0
  394. sky/users/rbac.py +121 -0
  395. sky/users/server.py +720 -0
  396. sky/users/token_service.py +218 -0
  397. sky/utils/accelerator_registry.py +34 -5
  398. sky/utils/admin_policy_utils.py +84 -38
  399. sky/utils/annotations.py +16 -5
  400. sky/utils/asyncio_utils.py +78 -0
  401. sky/utils/auth_utils.py +153 -0
  402. sky/utils/benchmark_utils.py +60 -0
  403. sky/utils/cli_utils/status_utils.py +159 -86
  404. sky/utils/cluster_utils.py +31 -9
  405. sky/utils/command_runner.py +354 -68
  406. sky/utils/command_runner.pyi +93 -3
  407. sky/utils/common.py +35 -8
  408. sky/utils/common_utils.py +310 -87
  409. sky/utils/config_utils.py +87 -5
  410. sky/utils/context.py +402 -0
  411. sky/utils/context_utils.py +222 -0
  412. sky/utils/controller_utils.py +264 -89
  413. sky/utils/dag_utils.py +31 -12
  414. sky/utils/db/__init__.py +0 -0
  415. sky/utils/db/db_utils.py +470 -0
  416. sky/utils/db/migration_utils.py +133 -0
  417. sky/utils/directory_utils.py +12 -0
  418. sky/utils/env_options.py +13 -0
  419. sky/utils/git.py +567 -0
  420. sky/utils/git_clone.sh +460 -0
  421. sky/utils/infra_utils.py +195 -0
  422. sky/utils/kubernetes/cleanup-tunnel.sh +62 -0
  423. sky/utils/kubernetes/config_map_utils.py +133 -0
  424. sky/utils/kubernetes/create_cluster.sh +13 -27
  425. sky/utils/kubernetes/delete_cluster.sh +10 -7
  426. sky/utils/kubernetes/deploy_remote_cluster.py +1299 -0
  427. sky/utils/kubernetes/exec_kubeconfig_converter.py +22 -31
  428. sky/utils/kubernetes/generate_kind_config.py +6 -66
  429. sky/utils/kubernetes/generate_kubeconfig.sh +4 -1
  430. sky/utils/kubernetes/gpu_labeler.py +5 -5
  431. sky/utils/kubernetes/kubernetes_deploy_utils.py +354 -47
  432. sky/utils/kubernetes/ssh-tunnel.sh +379 -0
  433. sky/utils/kubernetes/ssh_utils.py +221 -0
  434. sky/utils/kubernetes_enums.py +8 -15
  435. sky/utils/lock_events.py +94 -0
  436. sky/utils/locks.py +368 -0
  437. sky/utils/log_utils.py +300 -6
  438. sky/utils/perf_utils.py +22 -0
  439. sky/utils/resource_checker.py +298 -0
  440. sky/utils/resources_utils.py +249 -32
  441. sky/utils/rich_utils.py +213 -37
  442. sky/utils/schemas.py +905 -147
  443. sky/utils/serialize_utils.py +16 -0
  444. sky/utils/status_lib.py +10 -0
  445. sky/utils/subprocess_utils.py +38 -15
  446. sky/utils/tempstore.py +70 -0
  447. sky/utils/timeline.py +24 -52
  448. sky/utils/ux_utils.py +84 -15
  449. sky/utils/validator.py +11 -1
  450. sky/utils/volume.py +86 -0
  451. sky/utils/yaml_utils.py +111 -0
  452. sky/volumes/__init__.py +13 -0
  453. sky/volumes/client/__init__.py +0 -0
  454. sky/volumes/client/sdk.py +149 -0
  455. sky/volumes/server/__init__.py +0 -0
  456. sky/volumes/server/core.py +258 -0
  457. sky/volumes/server/server.py +122 -0
  458. sky/volumes/volume.py +212 -0
  459. sky/workspaces/__init__.py +0 -0
  460. sky/workspaces/core.py +655 -0
  461. sky/workspaces/server.py +101 -0
  462. sky/workspaces/utils.py +56 -0
  463. skypilot_nightly-1.0.0.dev20251107.dist-info/METADATA +675 -0
  464. skypilot_nightly-1.0.0.dev20251107.dist-info/RECORD +594 -0
  465. {skypilot_nightly-1.0.0.dev20250509.dist-info → skypilot_nightly-1.0.0.dev20251107.dist-info}/WHEEL +1 -1
  466. sky/benchmark/benchmark_state.py +0 -256
  467. sky/benchmark/benchmark_utils.py +0 -641
  468. sky/clouds/service_catalog/constants.py +0 -7
  469. sky/dashboard/out/_next/static/LksQgChY5izXjokL3LcEu/_buildManifest.js +0 -1
  470. sky/dashboard/out/_next/static/chunks/236-f49500b82ad5392d.js +0 -6
  471. sky/dashboard/out/_next/static/chunks/312-c3c8845990db8ffc.js +0 -15
  472. sky/dashboard/out/_next/static/chunks/37-0a572fe0dbb89c4d.js +0 -6
  473. sky/dashboard/out/_next/static/chunks/678-206dddca808e6d16.js +0 -59
  474. sky/dashboard/out/_next/static/chunks/845-0f8017370869e269.js +0 -1
  475. sky/dashboard/out/_next/static/chunks/979-7bf73a4c7cea0f5c.js +0 -1
  476. sky/dashboard/out/_next/static/chunks/fd9d1056-2821b0f0cabcd8bd.js +0 -1
  477. sky/dashboard/out/_next/static/chunks/framework-87d061ee6ed71b28.js +0 -33
  478. sky/dashboard/out/_next/static/chunks/main-app-241eb28595532291.js +0 -1
  479. sky/dashboard/out/_next/static/chunks/main-e0e2335212e72357.js +0 -1
  480. sky/dashboard/out/_next/static/chunks/pages/_app-e6b013bc3f77ad60.js +0 -1
  481. sky/dashboard/out/_next/static/chunks/pages/_error-1be831200e60c5c0.js +0 -1
  482. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-e15db85d0ea1fbe1.js +0 -1
  483. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-f383db7389368ea7.js +0 -1
  484. sky/dashboard/out/_next/static/chunks/pages/clusters-a93b93e10b8b074e.js +0 -1
  485. sky/dashboard/out/_next/static/chunks/pages/index-f9f039532ca8cbc4.js +0 -1
  486. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-03f279c6741fb48b.js +0 -1
  487. sky/dashboard/out/_next/static/chunks/pages/jobs-a75029b67aab6a2e.js +0 -1
  488. sky/dashboard/out/_next/static/chunks/webpack-830f59b8404e96b8.js +0 -1
  489. sky/dashboard/out/_next/static/css/c6933bbb2ce7f4dd.css +0 -3
  490. sky/jobs/dashboard/dashboard.py +0 -223
  491. sky/jobs/dashboard/static/favicon.ico +0 -0
  492. sky/jobs/dashboard/templates/index.html +0 -831
  493. sky/jobs/server/dashboard_utils.py +0 -69
  494. sky/skylet/providers/scp/__init__.py +0 -2
  495. sky/skylet/providers/scp/config.py +0 -149
  496. sky/skylet/providers/scp/node_provider.py +0 -578
  497. sky/templates/kubernetes-ssh-jump.yml.j2 +0 -94
  498. sky/utils/db_utils.py +0 -100
  499. sky/utils/kubernetes/deploy_remote_cluster.sh +0 -308
  500. sky/utils/kubernetes/ssh_jump_lifecycle_manager.py +0 -191
  501. skypilot_nightly-1.0.0.dev20250509.dist-info/METADATA +0 -361
  502. skypilot_nightly-1.0.0.dev20250509.dist-info/RECORD +0 -396
  503. /sky/{clouds/service_catalog → catalog}/config.py +0 -0
  504. /sky/{benchmark → catalog/data_fetchers}/__init__.py +0 -0
  505. /sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_azure.py +0 -0
  506. /sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_fluidstack.py +0 -0
  507. /sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_ibm.py +0 -0
  508. /sky/{clouds/service_catalog/data_fetchers → client/cli}/__init__.py +0 -0
  509. /sky/dashboard/out/_next/static/{LksQgChY5izXjokL3LcEu → zB0ed6ge_W1MDszVHhijS}/_ssgManifest.js +0 -0
  510. {skypilot_nightly-1.0.0.dev20250509.dist-info → skypilot_nightly-1.0.0.dev20251107.dist-info}/entry_points.txt +0 -0
  511. {skypilot_nightly-1.0.0.dev20250509.dist-info → skypilot_nightly-1.0.0.dev20251107.dist-info}/licenses/LICENSE +0 -0
  512. {skypilot_nightly-1.0.0.dev20250509.dist-info → skypilot_nightly-1.0.0.dev20251107.dist-info}/top_level.txt +0 -0
sky/skylet/log_lib.py CHANGED
@@ -4,14 +4,17 @@ This is a remote utility module that provides logging functionality.
4
4
  """
5
5
  import collections
6
6
  import copy
7
+ import functools
7
8
  import io
8
9
  import multiprocessing.pool
9
10
  import os
11
+ import queue as queue_lib
10
12
  import shlex
11
13
  import subprocess
12
14
  import sys
13
15
  import tempfile
14
16
  import textwrap
17
+ import threading
15
18
  import time
16
19
  from typing import (Deque, Dict, Iterable, Iterator, List, Optional, TextIO,
17
20
  Tuple, Union)
@@ -21,6 +24,8 @@ import colorama
21
24
  from sky import sky_logging
22
25
  from sky.skylet import constants
23
26
  from sky.skylet import job_lib
27
+ from sky.utils import context
28
+ from sky.utils import context_utils
24
29
  from sky.utils import log_utils
25
30
  from sky.utils import subprocess_utils
26
31
  from sky.utils import ux_utils
@@ -36,6 +41,11 @@ logger = sky_logging.init_logger(__name__)
36
41
 
37
42
  LOG_FILE_START_STREAMING_AT = 'Waiting for task resources on '
38
43
 
44
+ # 16-64KiB seems to be the sweet spot:
45
+ # https://github.com/grpc/grpc.github.io/issues/371
46
+ # TODO(kevin): Benchmark this ourselves and verify.
47
+ DEFAULT_LOG_CHUNK_SIZE = 16 * 1024 # 16KiB
48
+
39
49
 
40
50
  class _ProcessingArgs:
41
51
  """Arguments for processing logs."""
@@ -59,6 +69,16 @@ class _ProcessingArgs:
59
69
  self.streaming_prefix = streaming_prefix
60
70
 
61
71
 
72
+ def _get_context():
73
+ # TODO(aylei): remove this after we drop the backward-compatibility for
74
+ # 0.9.x in 0.12.0
75
+ # Keep backward-compatibility for the old version of SkyPilot runtimes.
76
+ if 'context' in globals():
77
+ return context.get()
78
+ else:
79
+ return None
80
+
81
+
62
82
  def _handle_io_stream(io_stream, out_stream, args: _ProcessingArgs):
63
83
  """Process the stream of a process."""
64
84
  out_io = io.TextIOWrapper(io_stream,
@@ -77,6 +97,9 @@ def _handle_io_stream(io_stream, out_stream, args: _ProcessingArgs):
77
97
  with open(args.log_path, 'a', encoding='utf-8') as fout:
78
98
  with line_processor:
79
99
  while True:
100
+ ctx = _get_context()
101
+ if ctx is not None and ctx.is_canceled():
102
+ return
80
103
  line = out_io.readline()
81
104
  if not line:
82
105
  break
@@ -111,26 +134,24 @@ def _handle_io_stream(io_stream, out_stream, args: _ProcessingArgs):
111
134
  return ''.join(out)
112
135
 
113
136
 
114
- def process_subprocess_stream(proc, args: _ProcessingArgs) -> Tuple[str, str]:
115
- """Redirect the process's filtered stdout/stderr to both stream and file"""
137
+ def process_subprocess_stream(proc, stdout_stream_handler,
138
+ stderr_stream_handler) -> Tuple[str, str]:
139
+ """Process the stream of a process in threads, blocking."""
116
140
  if proc.stderr is not None:
117
141
  # Asyncio does not work as the output processing can be executed in a
118
142
  # different thread.
119
143
  # selectors is possible to handle the multiplexing of stdout/stderr,
120
144
  # but it introduces buffering making the output not streaming.
121
145
  with multiprocessing.pool.ThreadPool(processes=1) as pool:
122
- err_args = copy.copy(args)
123
- err_args.line_processor = None
124
- stderr_fut = pool.apply_async(_handle_io_stream,
125
- args=(proc.stderr, sys.stderr,
126
- err_args))
146
+ stderr_fut = pool.apply_async(stderr_stream_handler,
147
+ args=(proc.stderr, sys.stderr))
127
148
  # Do not launch a thread for stdout as the rich.status does not
128
149
  # work in a thread, which is used in
129
150
  # log_utils.RayUpLineProcessor.
130
- stdout = _handle_io_stream(proc.stdout, sys.stdout, args)
151
+ stdout = stdout_stream_handler(proc.stdout, sys.stdout)
131
152
  stderr = stderr_fut.get()
132
153
  else:
133
- stdout = _handle_io_stream(proc.stdout, sys.stdout, args)
154
+ stdout = stdout_stream_handler(proc.stdout, sys.stdout)
134
155
  stderr = ''
135
156
  return stdout, stderr
136
157
 
@@ -176,7 +197,12 @@ def run_with_log(
176
197
  # Redirect stderr to stdout when using ray, to preserve the order of
177
198
  # stdout and stderr.
178
199
  stdout_arg = stderr_arg = None
179
- if process_stream:
200
+ ctx = _get_context()
201
+ if process_stream or ctx is not None:
202
+ # Capture stdout/stderr of the subprocess if:
203
+ # 1. Post-processing is needed (process_stream=True)
204
+ # 2. Potential contextual handling is needed (ctx is not None)
205
+ # TODO(aylei): can we always capture the stdout/stderr?
180
206
  stdout_arg = subprocess.PIPE
181
207
  stderr_arg = subprocess.PIPE if not with_ray else subprocess.STDOUT
182
208
  # Use stdin=subprocess.DEVNULL by default, as allowing inputs will mess up
@@ -194,9 +220,18 @@ def run_with_log(
194
220
  stdin=stdin,
195
221
  **kwargs) as proc:
196
222
  try:
197
- subprocess_utils.kill_process_daemon(proc.pid)
223
+ if ctx is not None:
224
+ # When running in a coroutine, use kill_pg if available to avoid
225
+ # the overhead of refreshing the process tree in the daemon.
226
+ subprocess_utils.kill_process_daemon(proc.pid, use_kill_pg=True)
227
+ else:
228
+ # For backward compatibility, do not specify use_kill_pg by
229
+ # default.
230
+ subprocess_utils.kill_process_daemon(proc.pid)
198
231
  stdout = ''
199
232
  stderr = ''
233
+ stdout_stream_handler = None
234
+ stderr_stream_handler = None
200
235
 
201
236
  if process_stream:
202
237
  if skip_lines is None:
@@ -223,7 +258,34 @@ def run_with_log(
223
258
  replace_crlf=with_ray,
224
259
  streaming_prefix=streaming_prefix,
225
260
  )
226
- stdout, stderr = process_subprocess_stream(proc, args)
261
+ stdout_stream_handler = functools.partial(
262
+ _handle_io_stream,
263
+ args=args,
264
+ )
265
+ if proc.stderr is not None:
266
+ err_args = copy.copy(args)
267
+ err_args.line_processor = None
268
+ stderr_stream_handler = functools.partial(
269
+ _handle_io_stream,
270
+ args=err_args,
271
+ )
272
+ if ctx is not None:
273
+ # When running in a coroutine, always process the subprocess
274
+ # stream to:
275
+ # 1. handle context cancellation
276
+ # 2. redirect subprocess stdout/stderr to the contextual
277
+ # stdout/stderr of current coroutine.
278
+ stdout, stderr = context_utils.pipe_and_wait_process(
279
+ ctx,
280
+ proc,
281
+ stdout_stream_handler=stdout_stream_handler,
282
+ stderr_stream_handler=stderr_stream_handler)
283
+ elif process_stream:
284
+ # When running in a process, only process the subprocess stream if
285
+ # necessary to avoid unnecessary stream handling overhead.
286
+ stdout, stderr = process_subprocess_stream(
287
+ proc, stdout_stream_handler, stderr_stream_handler)
288
+ # Ensure returncode is set.
227
289
  proc.wait()
228
290
  if require_outputs:
229
291
  return proc.returncode, stdout, stderr
@@ -305,6 +367,17 @@ def run_bash_command_with_log(bash_command: str,
305
367
  shell=True)
306
368
 
307
369
 
370
def run_bash_command_with_log_and_return_pid(
        bash_command: str,
        log_path: str,
        env_vars: Optional[Dict[str, str]] = None,
        stream_logs: bool = False,
        with_ray: bool = False):
    """Run a bash command with logging and report the executing PID.

    Thin wrapper around run_bash_command_with_log(); all arguments are
    forwarded to it unchanged.

    Returns:
        A dict with two keys:
          - 'return_code': the value returned by run_bash_command_with_log().
          - 'pid': os.getpid() of the process that executed this function
            (i.e. the wrapper process, not the bash subprocess).
    """
    result = {
        'return_code': run_bash_command_with_log(bash_command, log_path,
                                                 env_vars, stream_logs,
                                                 with_ray),
    }
    # Evaluated after the command returns, same as the original ordering.
    result['pid'] = os.getpid()
    return result
379
+
380
+
308
381
  def _follow_job_logs(file,
309
382
  job_id: int,
310
383
  start_streaming: bool,
@@ -346,9 +419,9 @@ def _follow_job_logs(file,
346
419
  wait_last_logs = False
347
420
  continue
348
421
  status_str = status.value if status is not None else 'None'
349
- print(ux_utils.finishing_message(
350
- f'Job finished (status: {status_str}).'),
351
- flush=True)
422
+ finish = ux_utils.finishing_message(
423
+ f'Job finished (status: {status_str}).')
424
+ yield finish + '\n'
352
425
  return
353
426
 
354
427
  time.sleep(SKY_LOG_TAILING_GAP_SECONDS)
@@ -495,9 +568,215 @@ def tail_logs(job_id: Optional[int],
495
568
  if start_streaming:
496
569
  print(line, end='', flush=True)
497
570
  status_str = status.value if status is not None else 'None'
498
- print(ux_utils.finishing_message(
499
- f'Job finished (status: {status_str}).'),
500
- flush=True)
571
+ # Only show "Job finished" for actually terminal states
572
+ if status is not None and status.is_terminal():
573
+ print(ux_utils.finishing_message(
574
+ f'Job finished (status: {status_str}).'),
575
+ flush=True)
501
576
  except FileNotFoundError:
502
577
  print(f'{colorama.Fore.RED}ERROR: Logs for job {job_id} (status:'
503
578
  f' {status.value}) does not exist.{colorama.Style.RESET_ALL}')
579
+
580
+
581
def tail_logs_iter(job_id: Optional[int],
                   log_dir: Optional[str],
                   managed_job_id: Optional[int] = None,
                   follow: bool = True,
                   tail: int = 0) -> Iterator[str]:
    """Tail the logs of a job, yielding the output instead of printing it.

    This is mostly the same as tail_logs, but returns an iterator instead of
    printing to stdout/stderr.

    Args:
        job_id: The job whose `run.log` should be tailed. If None, nothing is
            yielded (no job has been submitted to this cluster).
        log_dir: Directory containing `run.log`. If None, a single
            "not found" message is yielded.
        managed_job_id: If set, messages refer to the managed job id instead
            of the real job id.
        follow: Keep streaming new log lines while the job is in
            SETTING_UP / PENDING / RUNNING.
        tail: If > 0, only the last `tail` lines of the existing log are
            considered; 0 means the whole file.

    Yields:
        Log lines (with their original line endings) and newline-terminated
        status/error messages.
    """
    if job_id is None:
        # This only happens when job_lib.get_latest_job_id() returns None,
        # which means no job has been submitted to this cluster. See
        # sky.skylet.job_lib.JobLibCodeGen.tail_logs for more details.
        logger.info('Skip streaming logs as no job has been submitted.')
        return
    job_str = f'job {job_id}'
    if managed_job_id is not None:
        job_str = f'managed job {managed_job_id}'
    if log_dir is None:
        msg = f'{job_str.capitalize()} not found (see `sky queue`).'
        yield msg + '\n'
        return
    logger.debug(f'Tailing logs for job, real job_id {job_id}, managed_job_id '
                 f'{managed_job_id}.')
    log_path = os.path.join(log_dir, 'run.log')
    log_path = os.path.expanduser(log_path)

    status = job_lib.update_job_status([job_id], silent=True)[0]

    # Wait for the log to be written. This is needed because `ray submit`
    # takes some time to start the job and write the log.
    retry_cnt = 0
    while status is not None and not status.is_terminal():
        retry_cnt += 1
        if os.path.exists(log_path) and status != job_lib.JobStatus.INIT:
            break
        if retry_cnt >= SKY_LOG_WAITING_MAX_RETRY:
            err = (f'{colorama.Fore.RED}ERROR: Logs for '
                   f'{job_str} (status: {status.value}) does not exist '
                   f'after retrying {retry_cnt} times.'
                   f'{colorama.Style.RESET_ALL}')
            yield err + '\n'
            return
        waiting = (f'INFO: Waiting {SKY_LOG_WAITING_GAP_SECONDS}s for the logs '
                   'to be written...')
        yield waiting + '\n'
        time.sleep(SKY_LOG_WAITING_GAP_SECONDS)
        status = job_lib.update_job_status([job_id], silent=True)[0]

    # `status` may be None here (the loop above exits on None), so derive a
    # None-safe string once and use it for every message below.
    status_str = status.value if status is not None else 'None'

    start_stream_at = LOG_FILE_START_STREAMING_AT
    # Explicitly declare the type to avoid mypy warning.
    lines: Iterable[str] = []
    if follow and status in [
            job_lib.JobStatus.SETTING_UP,
            job_lib.JobStatus.PENDING,
            job_lib.JobStatus.RUNNING,
    ]:
        # Not using `ray job logs` because it will put progress bar in
        # multiple lines.
        with open(log_path, 'r', newline='', encoding='utf-8') as log_file:
            # Using `_follow_job_logs` instead of `tail -f` so that the whole
            # log can be streamed without creating a new process for tail.
            start_streaming = False
            if tail > 0:
                head_lines_of_log_file = _peek_head_lines(log_file)
                lines = collections.deque(log_file, maxlen=tail)
                start_streaming = _should_stream_the_whole_tail_lines(
                    head_lines_of_log_file, lines, start_stream_at)
            for line in lines:
                if start_stream_at in line:
                    start_streaming = True
                if start_streaming:
                    yield line
            # Now, the cursor is at the end of the last lines
            # if tail > 0
            yield from _follow_job_logs(log_file,
                                        job_id=job_id,
                                        start_streaming=start_streaming,
                                        start_streaming_at=start_stream_at)
    else:
        try:
            start_streaming = False
            with open(log_path, 'r', encoding='utf-8') as log_file:
                if tail > 0:
                    # If tail > 0, we need to read the last n lines.
                    # We use double ended queue to rotate the last n lines.
                    head_lines_of_log_file = _peek_head_lines(log_file)
                    lines = collections.deque(log_file, maxlen=tail)
                    start_streaming = _should_stream_the_whole_tail_lines(
                        head_lines_of_log_file, lines, start_stream_at)
                else:
                    lines = log_file
                for line in lines:
                    if start_stream_at in line:
                        start_streaming = True
                    if start_streaming:
                        yield line
            # Only show "Job finished" for actually terminal states
            if status is not None and status.is_terminal():
                finish = ux_utils.finishing_message(
                    f'Job finished (status: {status_str}).')
                yield finish + '\n'
            return
        except FileNotFoundError:
            # Use the None-safe status_str: `status.value` would raise
            # AttributeError here when the job status is unknown (None).
            err = (
                f'{colorama.Fore.RED}ERROR: Logs for job {job_id} (status:'
                f' {status_str}) does not exist.{colorama.Style.RESET_ALL}')
            yield err + '\n'
689
+
690
+
691
class LogBuffer:
    """In-memory buffer for chunking log lines for streaming."""

    def __init__(self, max_chars: int = DEFAULT_LOG_CHUNK_SIZE):
        """Initialize the log buffer.

        Args:
            max_chars: Maximum buffer size (in characters, not bytes) before
                       flushing. The actual amount of bytes (UTF-8 encoding)
                       could be more than this, depending on the characters,
                       i.e. ASCII characters take 1 byte, while others
                       may take 2-4 bytes. But this is fine as our default
                       chunk size is well below the default value of
                       grpc.max_receive_message_length which is 4MB.
        """
        self.max_chars = max_chars
        self._buffer = io.StringIO()
        # Number of characters written since the last flush. Tracked
        # explicitly rather than via self._buffer.tell(), because tell() on a
        # text stream is documented to return an opaque number.
        self._num_chars = 0

    def _should_flush(self) -> bool:
        """Whether the buffered content has reached max_chars."""
        return self._num_chars >= self.max_chars

    def flush(self) -> str:
        """Get the current buffered content and clear the buffer.

        Returns:
            The buffered log lines as a single string, or '' if nothing has
            been written since the last flush.
        """
        if not self._num_chars:
            return ''
        chunk = self._buffer.getvalue()
        # Rewind before truncating so the next write starts at position 0.
        self._buffer.seek(0)
        self._buffer.truncate(0)
        self._num_chars = 0
        return chunk

    def write(self, line: str) -> bool:
        """Add a line to the buffer.

        Args:
            line: The log line to add

        Returns:
            True if buffer should be flushed after adding the line
        """
        self._buffer.write(line)
        self._num_chars += len(line)
        return self._should_flush()

    def close(self):
        """Release the underlying in-memory stream."""
        self._buffer.close()
739
+
740
+
741
def buffered_iter_with_timeout(buffer: 'LogBuffer', iterable: Iterable[str],
                               timeout: float) -> Iterable[str]:
    """Iterates over an iterable, writing each item to a buffer,
    and flushing the buffer when it is full or no item is
    yielded within the timeout duration.

    Args:
        buffer: Accumulates items; `write()` reports when it is full and
            `flush()` returns and clears the accumulated content.
        iterable: Source of items. It is consumed in a background thread.
        timeout: Seconds to wait for the next item before flushing whatever
            is currently buffered.

    Yields:
        Non-empty chunks produced by `buffer.flush()`.

    Raises:
        Exception: Any exception raised while iterating `iterable` is
            re-raised here, after the content buffered so far has been
            yielded, instead of being lost in the background thread.
    """
    # TODO(kevin): Simplify this using asyncio.timeout, once we move
    # the skylet event loop and gRPC server to asyncio.
    # https://docs.python.org/3/library/asyncio-task.html#timeouts

    queue: queue_lib.Queue = queue_lib.Queue()
    sentinel = object()
    # Holds at most one exception raised by `iterable` in the producer
    # thread, so that the consumer can surface it to the caller.
    producer_errors: list = []

    def producer():
        try:
            for item in iterable:
                queue.put(item)
        except Exception as e:  # pylint: disable=broad-except
            producer_errors.append(e)
        finally:
            # Always signal completion so the consumer never blocks forever.
            queue.put(sentinel)

    thread = threading.Thread(target=producer, daemon=True)
    thread.start()

    while True:
        try:
            item = queue.get(timeout=timeout)
        except queue_lib.Empty:
            # No new item within the timeout: emit what we have so far.
            out = buffer.flush()
            if out:
                yield out
            continue

        if item is sentinel:
            thread.join()
            out = buffer.flush()
            if out:
                yield out
            if producer_errors:
                raise producer_errors[0]
            return

        if buffer.write(item):
            out = buffer.flush()
            if out:
                yield out
sky/skylet/log_lib.pyi CHANGED
@@ -4,13 +4,14 @@ overloaded type hints for run_with_log(), as we need to determine
4
4
  the return type based on the value of require_outputs.
5
5
  """
6
6
  import typing
7
- from typing import Dict, List, Optional, Tuple, Union
7
+ from typing import Dict, Iterable, Iterator, List, Optional, Tuple, Union
8
8
 
9
9
  from typing_extensions import Literal
10
10
 
11
11
  from sky import sky_logging as sky_logging
12
12
  from sky.skylet import constants as constants
13
13
  from sky.skylet import job_lib as job_lib
14
+ from sky.utils import context
14
15
  from sky.utils import log_utils as log_utils
15
16
 
16
17
  SKY_LOG_WAITING_GAP_SECONDS: int = ...
@@ -41,6 +42,10 @@ class _ProcessingArgs:
41
42
  ...
42
43
 
43
44
 
45
+ def _get_context() -> Optional[context.SkyPilotContext]:
46
+ ...
47
+
48
+
44
49
  def _handle_io_stream(io_stream, out_stream, args: _ProcessingArgs):
45
50
  ...
46
51
 
@@ -124,8 +129,46 @@ def run_bash_command_with_log(bash_command: str,
124
129
  ...
125
130
 
126
131
 
132
+ def run_bash_command_with_log_and_return_pid(
133
+ bash_command: str,
134
+ log_path: str,
135
+ env_vars: Optional[Dict[str, str]] = ...,
136
+ stream_logs: bool = ...,
137
+ with_ray: bool = ...):
138
+ ...
139
+
140
+
127
141
  def tail_logs(job_id: int,
128
142
  log_dir: Optional[str],
129
143
  managed_job_id: Optional[int] = ...,
130
144
  follow: bool = ...) -> None:
131
145
  ...
146
+
147
+
148
+ def tail_logs_iter(job_id: Optional[int],
149
+ log_dir: Optional[str],
150
+ managed_job_id: Optional[int] = ...,
151
+ follow: bool = ...,
152
+ tail: int = ...) -> Iterator[str]:
153
+ ...
154
+
155
+
156
+ class LogBuffer:
157
+ max_chars: int
158
+
159
+ def __init__(self, max_chars: int = ...):
160
+ ...
161
+
162
+ def flush(self) -> str:
163
+ ...
164
+
165
+ def write(self, line: str) -> bool:
166
+ ...
167
+
168
+ def close(self):
169
+ ...
170
+
171
+
172
+ def buffered_iter_with_timeout(buffer: LogBuffer, iterable: Iterable[str],
173
+ timeout: float) -> Iterable[str]:
174
+ ...
@@ -40,15 +40,29 @@ def _run_patch(target_file,
40
40
  """Applies a patch if it has not been applied already."""
41
41
  # .orig is the original file that is not patched.
42
42
  orig_file = os.path.abspath(f'{target_file}-v{version}.orig')
43
+ # Get diff filename by replacing .patch with .diff
44
+ diff_file = patch_file.replace('.patch', '.diff')
45
+
43
46
  script = f"""\
44
47
  which patch >/dev/null 2>&1 || sudo yum install -y patch || true
45
- which patch >/dev/null 2>&1 || (echo "`patch` is not found. Failed to setup ray." && exit 1)
46
48
  if [ ! -f {orig_file} ]; then
47
49
  echo Create backup file {orig_file}
48
50
  cp {target_file} {orig_file}
49
51
  fi
50
- # It is ok to patch again from the original file.
51
- patch {orig_file} -i {patch_file} -o {target_file}
52
+ if which patch >/dev/null 2>&1; then
53
+ # System patch command is available, use it
54
+ # It is ok to patch again from the original file.
55
+ patch {orig_file} -i {patch_file} -o {target_file}
56
+ else
57
+ # System patch command not available, use Python patch library
58
+ echo "System patch command not available, using Python patch library..."
59
+ python -m pip install patch
60
+ # Get target directory
61
+ target_dir="$(dirname {target_file})"
62
+ # Execute python patch command
63
+ echo "Executing python -m patch -d $target_dir {diff_file}"
64
+ python -m patch -d "$target_dir" "{diff_file}"
65
+ fi
52
66
  """
53
67
  subprocess.run(script, shell=True, check=True)
54
68
 
@@ -0,0 +1,18 @@
1
+ --- a/autoscaler.py
2
+ +++ b/autoscaler.py
3
+ @@ -1,3 +1,6 @@
4
+ +# From https://github.com/ray-project/ray/blob/ray-2.9.3/python/ray/autoscaler/_private/autoscaler.py
5
+ +# Sky patch changes:
6
+ +# - enable upscaling_speed to be 0.0
7
+ import copy
8
+ import logging
9
+ import math
10
+ @@ -1071,7 +1074,7 @@
11
+ upscaling_speed = self.config.get("upscaling_speed")
12
+ aggressive = self.config.get("autoscaling_mode") == "aggressive"
13
+ target_utilization_fraction = self.config.get("target_utilization_fraction")
14
+ - if upscaling_speed:
15
+ + if upscaling_speed is not None: # NOTE(sky): enable 0.0
16
+ upscaling_speed = float(upscaling_speed)
17
+ # TODO(ameer): consider adding (if users ask) an option of
18
+ # initial_upscaling_num_workers.
@@ -0,0 +1,19 @@
1
+ --- a/cli.py
2
+ +++ b/cli.py
3
+ @@ -1,3 +1,7 @@
4
+ +# Adapted from https://github.com/ray-project/ray/blob/ray-2.9.3/dashboard/modules/job/cli.py
5
+ +# Fixed the problem in ray's issue https://github.com/ray-project/ray/issues/26514
6
+ +# Otherwise, the output redirection ">" will not work.
7
+ +
8
+ import json
9
+ import os
10
+ import sys
11
+ @@ -270,7 +274,7 @@
12
+ working_dir=working_dir,
13
+ )
14
+ job_id = client.submit_job(
15
+ - entrypoint=list2cmdline(entrypoint),
16
+ + entrypoint=" ".join(entrypoint),
17
+ submission_id=submission_id,
18
+ runtime_env=final_runtime_env,
19
+ metadata=metadata_json,
@@ -0,0 +1,17 @@
1
+ --- a/command_runner.py
2
+ +++ b/command_runner.py
3
+ @@ -1,3 +1,5 @@
4
+ +# From https://github.com/ray-project/ray/blob/ray-2.9.3/python/ray/autoscaler/_private/command_runner.py
5
+ +
6
+ import hashlib
7
+ import json
8
+ import logging
9
+ @@ -137,7 +139,7 @@
10
+ {
11
+ "ControlMaster": "auto",
12
+ "ControlPath": "{}/%C".format(control_path),
13
+ - "ControlPersist": "10s",
14
+ + "ControlPersist": "300s",
15
+ }
16
+ )
17
+ self.arg_dict.update(kwargs)
@@ -0,0 +1,20 @@
1
+ --- a/log_monitor.py
2
+ +++ b/log_monitor.py
3
+ @@ -1,3 +1,7 @@
4
+ +# Original file https://github.com/ray-project/ray/blob/ray-2.9.3/python/ray/_private/log_monitor.py
5
+ +# Fixed the problem for progress bar, as the latest version does not preserve \r for progress bar.
6
+ +# We change the newline handling back to https://github.com/ray-project/ray/blob/ray-1.10.0/python/ray/_private/log_monitor.py#L299-L300
7
+ +
8
+ import argparse
9
+ import errno
10
+ import glob
11
+ @@ -374,7 +378,8 @@
12
+ next_line = next_line.decode("utf-8", "replace")
13
+ if next_line == "":
14
+ break
15
+ - next_line = next_line.rstrip("\r\n")
16
+ + if next_line.endswith("\n"):
17
+ + next_line = next_line[:-1]
18
+
19
+ if next_line.startswith(ray_constants.LOG_PREFIX_ACTOR_NAME):
20
+ flush() # Possible change of task/actor name.
@@ -0,0 +1,32 @@
1
+ --- a/resource_demand_scheduler.py
2
+ +++ b/resource_demand_scheduler.py
3
+ @@ -1,3 +1,8 @@
4
+ +# From https://github.com/ray-project/ray/blob/ray-2.9.3/python/ray/autoscaler/_private/resource_demand_scheduler.py
5
+ +# Sky patch changes:
6
+ +# - no new nodes are allowed to be launched when the upscaling_speed is 0
7
+ +# - comment out "assert not unfulfilled": this seems a buggy assert
8
+ +
9
+ """Implements multi-node-type autoscaling.
10
+
11
+ This file implements an autoscaling algorithm that is aware of multiple node
12
+ @@ -448,7 +453,10 @@
13
+ + placement_group_nodes.get(node_type, 0),
14
+ )
15
+
16
+ - if upper_bound > 0:
17
+ + # NOTE(sky): do not autoscale when upscaling speed is 0.
18
+ + if self.upscaling_speed == 0:
19
+ + upper_bound = 0
20
+ + if upper_bound >= 0:
21
+ updated_nodes_to_launch[node_type] = min(
22
+ upper_bound, to_launch[node_type]
23
+ )
24
+ @@ -592,7 +600,7 @@
25
+ unfulfilled, including_reserved = get_bin_pack_residual(
26
+ new_node_resources, unfulfilled, strict_spread=True
27
+ )
28
+ - assert not unfulfilled
29
+ + # assert not unfulfilled # NOTE(sky): buggy assert.
30
+ node_resources += including_reserved
31
+ return to_add, node_resources, node_type_counts
32
+
@@ -0,0 +1,18 @@
1
+ --- a/updater.py
2
+ +++ b/updater.py
3
+ @@ -1,3 +1,7 @@
4
+ +# From https://github.com/ray-project/ray/blob/releases/2.9.3/python/ray/autoscaler/_private/updater.py
5
+ +# Sky patch changes:
6
+ +# - Ensure the node state is refreshed before checking the node is terminated.
7
+ +
8
+ import logging
9
+ import os
10
+ import subprocess
11
+ @@ -325,6 +329,7 @@
12
+ )
13
+
14
+ time.sleep(READY_CHECK_INTERVAL)
15
+ + self.provider.non_terminated_nodes({})
16
+
17
+ def do_update(self):
18
+ self.provider.set_node_tags(