skypilot-nightly 1.0.0.dev20250502__py3-none-any.whl → 1.0.0.dev20251203__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (546) hide show
  1. sky/__init__.py +22 -6
  2. sky/adaptors/aws.py +81 -16
  3. sky/adaptors/common.py +25 -2
  4. sky/adaptors/coreweave.py +278 -0
  5. sky/adaptors/do.py +8 -2
  6. sky/adaptors/gcp.py +11 -0
  7. sky/adaptors/hyperbolic.py +8 -0
  8. sky/adaptors/ibm.py +5 -2
  9. sky/adaptors/kubernetes.py +149 -18
  10. sky/adaptors/nebius.py +173 -30
  11. sky/adaptors/primeintellect.py +1 -0
  12. sky/adaptors/runpod.py +68 -0
  13. sky/adaptors/seeweb.py +183 -0
  14. sky/adaptors/shadeform.py +89 -0
  15. sky/admin_policy.py +187 -4
  16. sky/authentication.py +179 -225
  17. sky/backends/__init__.py +4 -2
  18. sky/backends/backend.py +22 -9
  19. sky/backends/backend_utils.py +1323 -397
  20. sky/backends/cloud_vm_ray_backend.py +1749 -1029
  21. sky/backends/docker_utils.py +1 -1
  22. sky/backends/local_docker_backend.py +11 -6
  23. sky/backends/task_codegen.py +633 -0
  24. sky/backends/wheel_utils.py +55 -9
  25. sky/{clouds/service_catalog → catalog}/__init__.py +21 -19
  26. sky/{clouds/service_catalog → catalog}/aws_catalog.py +27 -8
  27. sky/{clouds/service_catalog → catalog}/azure_catalog.py +10 -7
  28. sky/{clouds/service_catalog → catalog}/common.py +90 -49
  29. sky/{clouds/service_catalog → catalog}/cudo_catalog.py +8 -5
  30. sky/{clouds/service_catalog → catalog}/data_fetchers/analyze.py +1 -1
  31. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_aws.py +116 -80
  32. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_cudo.py +38 -38
  33. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_gcp.py +70 -16
  34. sky/catalog/data_fetchers/fetch_hyperbolic.py +136 -0
  35. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_lambda_cloud.py +1 -0
  36. sky/catalog/data_fetchers/fetch_nebius.py +338 -0
  37. sky/catalog/data_fetchers/fetch_runpod.py +698 -0
  38. sky/catalog/data_fetchers/fetch_seeweb.py +329 -0
  39. sky/catalog/data_fetchers/fetch_shadeform.py +142 -0
  40. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_vast.py +1 -1
  41. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_vsphere.py +1 -1
  42. sky/{clouds/service_catalog → catalog}/do_catalog.py +5 -2
  43. sky/{clouds/service_catalog → catalog}/fluidstack_catalog.py +6 -3
  44. sky/{clouds/service_catalog → catalog}/gcp_catalog.py +41 -15
  45. sky/catalog/hyperbolic_catalog.py +136 -0
  46. sky/{clouds/service_catalog → catalog}/ibm_catalog.py +9 -6
  47. sky/{clouds/service_catalog → catalog}/kubernetes_catalog.py +36 -24
  48. sky/{clouds/service_catalog → catalog}/lambda_catalog.py +9 -6
  49. sky/{clouds/service_catalog → catalog}/nebius_catalog.py +9 -7
  50. sky/{clouds/service_catalog → catalog}/oci_catalog.py +9 -6
  51. sky/{clouds/service_catalog → catalog}/paperspace_catalog.py +5 -2
  52. sky/catalog/primeintellect_catalog.py +95 -0
  53. sky/{clouds/service_catalog → catalog}/runpod_catalog.py +11 -4
  54. sky/{clouds/service_catalog → catalog}/scp_catalog.py +9 -6
  55. sky/catalog/seeweb_catalog.py +184 -0
  56. sky/catalog/shadeform_catalog.py +165 -0
  57. sky/catalog/ssh_catalog.py +167 -0
  58. sky/{clouds/service_catalog → catalog}/vast_catalog.py +6 -3
  59. sky/{clouds/service_catalog → catalog}/vsphere_catalog.py +5 -2
  60. sky/check.py +533 -185
  61. sky/cli.py +5 -5975
  62. sky/client/{cli.py → cli/command.py} +2591 -1956
  63. sky/client/cli/deprecation_utils.py +99 -0
  64. sky/client/cli/flags.py +359 -0
  65. sky/client/cli/table_utils.py +322 -0
  66. sky/client/cli/utils.py +79 -0
  67. sky/client/common.py +78 -32
  68. sky/client/oauth.py +82 -0
  69. sky/client/sdk.py +1219 -319
  70. sky/client/sdk_async.py +827 -0
  71. sky/client/service_account_auth.py +47 -0
  72. sky/cloud_stores.py +82 -3
  73. sky/clouds/__init__.py +13 -0
  74. sky/clouds/aws.py +564 -164
  75. sky/clouds/azure.py +105 -83
  76. sky/clouds/cloud.py +140 -40
  77. sky/clouds/cudo.py +68 -50
  78. sky/clouds/do.py +66 -48
  79. sky/clouds/fluidstack.py +63 -44
  80. sky/clouds/gcp.py +339 -110
  81. sky/clouds/hyperbolic.py +293 -0
  82. sky/clouds/ibm.py +70 -49
  83. sky/clouds/kubernetes.py +570 -162
  84. sky/clouds/lambda_cloud.py +74 -54
  85. sky/clouds/nebius.py +210 -81
  86. sky/clouds/oci.py +88 -66
  87. sky/clouds/paperspace.py +61 -44
  88. sky/clouds/primeintellect.py +317 -0
  89. sky/clouds/runpod.py +164 -74
  90. sky/clouds/scp.py +89 -86
  91. sky/clouds/seeweb.py +477 -0
  92. sky/clouds/shadeform.py +400 -0
  93. sky/clouds/ssh.py +263 -0
  94. sky/clouds/utils/aws_utils.py +10 -4
  95. sky/clouds/utils/gcp_utils.py +87 -11
  96. sky/clouds/utils/oci_utils.py +38 -14
  97. sky/clouds/utils/scp_utils.py +231 -167
  98. sky/clouds/vast.py +99 -77
  99. sky/clouds/vsphere.py +51 -40
  100. sky/core.py +375 -173
  101. sky/dag.py +15 -0
  102. sky/dashboard/out/404.html +1 -1
  103. sky/dashboard/out/_next/static/96_E2yl3QAiIJGOYCkSpB/_buildManifest.js +1 -0
  104. sky/dashboard/out/_next/static/chunks/1141-e6aa9ab418717c59.js +11 -0
  105. sky/dashboard/out/_next/static/chunks/1272-1ef0bf0237faccdb.js +1 -0
  106. sky/dashboard/out/_next/static/chunks/1871-7e202677c42f43fe.js +6 -0
  107. sky/dashboard/out/_next/static/chunks/2260-7703229c33c5ebd5.js +1 -0
  108. sky/dashboard/out/_next/static/chunks/2350.fab69e61bac57b23.js +1 -0
  109. sky/dashboard/out/_next/static/chunks/2369.fc20f0c2c8ed9fe7.js +15 -0
  110. sky/dashboard/out/_next/static/chunks/2755.edd818326d489a1d.js +26 -0
  111. sky/dashboard/out/_next/static/chunks/3294.20a8540fe697d5ee.js +1 -0
  112. sky/dashboard/out/_next/static/chunks/3785.7e245f318f9d1121.js +1 -0
  113. sky/dashboard/out/_next/static/chunks/3800-7b45f9fbb6308557.js +1 -0
  114. sky/dashboard/out/_next/static/chunks/3850-ff4a9a69d978632b.js +1 -0
  115. sky/dashboard/out/_next/static/chunks/3937.210053269f121201.js +1 -0
  116. sky/dashboard/out/_next/static/chunks/4725.172ede95d1b21022.js +1 -0
  117. sky/dashboard/out/_next/static/chunks/4937.a2baa2df5572a276.js +15 -0
  118. sky/dashboard/out/_next/static/chunks/5739-d67458fcb1386c92.js +8 -0
  119. sky/dashboard/out/_next/static/chunks/6130-2be46d70a38f1e82.js +1 -0
  120. sky/dashboard/out/_next/static/chunks/616-3d59f75e2ccf9321.js +39 -0
  121. sky/dashboard/out/_next/static/chunks/6212-7bd06f60ba693125.js +13 -0
  122. sky/dashboard/out/_next/static/chunks/6856-8f27d1c10c98def8.js +1 -0
  123. sky/dashboard/out/_next/static/chunks/6989-01359c57e018caa4.js +1 -0
  124. sky/dashboard/out/_next/static/chunks/6990-9146207c4567fdfd.js +1 -0
  125. sky/dashboard/out/_next/static/chunks/7359-c8d04e06886000b3.js +30 -0
  126. sky/dashboard/out/_next/static/chunks/7411-b15471acd2cba716.js +41 -0
  127. sky/dashboard/out/_next/static/chunks/7615-019513abc55b3b47.js +1 -0
  128. sky/dashboard/out/_next/static/chunks/8640.5b9475a2d18c5416.js +16 -0
  129. sky/dashboard/out/_next/static/chunks/8969-452f9d5cbdd2dc73.js +1 -0
  130. sky/dashboard/out/_next/static/chunks/9025.fa408f3242e9028d.js +6 -0
  131. sky/dashboard/out/_next/static/chunks/9353-cff34f7e773b2e2b.js +1 -0
  132. sky/dashboard/out/_next/static/chunks/9360.a536cf6b1fa42355.js +31 -0
  133. sky/dashboard/out/_next/static/chunks/9847.3aaca6bb33455140.js +30 -0
  134. sky/dashboard/out/_next/static/chunks/fd9d1056-86323a29a8f7e46a.js +1 -0
  135. sky/dashboard/out/_next/static/chunks/framework-cf60a09ccd051a10.js +33 -0
  136. sky/dashboard/out/_next/static/chunks/main-app-587214043926b3cc.js +1 -0
  137. sky/dashboard/out/_next/static/chunks/main-f15ccb73239a3bf1.js +1 -0
  138. sky/dashboard/out/_next/static/chunks/pages/_app-bde01e4a2beec258.js +34 -0
  139. sky/dashboard/out/_next/static/chunks/pages/_error-c66a4e8afc46f17b.js +1 -0
  140. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-792db96d918c98c9.js +16 -0
  141. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-abfcac9c137aa543.js +1 -0
  142. sky/dashboard/out/_next/static/chunks/pages/clusters-ee39056f9851a3ff.js +1 -0
  143. sky/dashboard/out/_next/static/chunks/pages/config-dfb9bf07b13045f4.js +1 -0
  144. sky/dashboard/out/_next/static/chunks/pages/index-444f1804401f04ea.js +1 -0
  145. sky/dashboard/out/_next/static/chunks/pages/infra/[context]-c0b5935149902e6f.js +1 -0
  146. sky/dashboard/out/_next/static/chunks/pages/infra-aed0ea19df7cf961.js +1 -0
  147. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-d66997e2bfc837cf.js +16 -0
  148. sky/dashboard/out/_next/static/chunks/pages/jobs/pools/[pool]-9faf940b253e3e06.js +21 -0
  149. sky/dashboard/out/_next/static/chunks/pages/jobs-2072b48b617989c9.js +1 -0
  150. sky/dashboard/out/_next/static/chunks/pages/users-f42674164aa73423.js +1 -0
  151. sky/dashboard/out/_next/static/chunks/pages/volumes-b84b948ff357c43e.js +1 -0
  152. sky/dashboard/out/_next/static/chunks/pages/workspace/new-3f88a1c7e86a3f86.js +1 -0
  153. sky/dashboard/out/_next/static/chunks/pages/workspaces/[name]-84a40f8c7c627fe4.js +1 -0
  154. sky/dashboard/out/_next/static/chunks/pages/workspaces-531b2f8c4bf89f82.js +1 -0
  155. sky/dashboard/out/_next/static/chunks/webpack-64e05f17bf2cf8ce.js +1 -0
  156. sky/dashboard/out/_next/static/css/0748ce22df867032.css +3 -0
  157. sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
  158. sky/dashboard/out/clusters/[cluster].html +1 -1
  159. sky/dashboard/out/clusters.html +1 -1
  160. sky/dashboard/out/config.html +1 -0
  161. sky/dashboard/out/index.html +1 -1
  162. sky/dashboard/out/infra/[context].html +1 -0
  163. sky/dashboard/out/infra.html +1 -0
  164. sky/dashboard/out/jobs/[job].html +1 -1
  165. sky/dashboard/out/jobs/pools/[pool].html +1 -0
  166. sky/dashboard/out/jobs.html +1 -1
  167. sky/dashboard/out/users.html +1 -0
  168. sky/dashboard/out/volumes.html +1 -0
  169. sky/dashboard/out/workspace/new.html +1 -0
  170. sky/dashboard/out/workspaces/[name].html +1 -0
  171. sky/dashboard/out/workspaces.html +1 -0
  172. sky/data/data_utils.py +137 -1
  173. sky/data/mounting_utils.py +269 -84
  174. sky/data/storage.py +1460 -1807
  175. sky/data/storage_utils.py +43 -57
  176. sky/exceptions.py +126 -2
  177. sky/execution.py +216 -63
  178. sky/global_user_state.py +2390 -586
  179. sky/jobs/__init__.py +7 -0
  180. sky/jobs/client/sdk.py +300 -58
  181. sky/jobs/client/sdk_async.py +161 -0
  182. sky/jobs/constants.py +15 -8
  183. sky/jobs/controller.py +848 -275
  184. sky/jobs/file_content_utils.py +128 -0
  185. sky/jobs/log_gc.py +193 -0
  186. sky/jobs/recovery_strategy.py +402 -152
  187. sky/jobs/scheduler.py +314 -189
  188. sky/jobs/server/core.py +836 -255
  189. sky/jobs/server/server.py +156 -115
  190. sky/jobs/server/utils.py +136 -0
  191. sky/jobs/state.py +2109 -706
  192. sky/jobs/utils.py +1306 -215
  193. sky/logs/__init__.py +21 -0
  194. sky/logs/agent.py +108 -0
  195. sky/logs/aws.py +243 -0
  196. sky/logs/gcp.py +91 -0
  197. sky/metrics/__init__.py +0 -0
  198. sky/metrics/utils.py +453 -0
  199. sky/models.py +78 -1
  200. sky/optimizer.py +164 -70
  201. sky/provision/__init__.py +90 -4
  202. sky/provision/aws/config.py +147 -26
  203. sky/provision/aws/instance.py +136 -50
  204. sky/provision/azure/instance.py +11 -6
  205. sky/provision/common.py +13 -1
  206. sky/provision/cudo/cudo_machine_type.py +1 -1
  207. sky/provision/cudo/cudo_utils.py +14 -8
  208. sky/provision/cudo/cudo_wrapper.py +72 -71
  209. sky/provision/cudo/instance.py +10 -6
  210. sky/provision/do/instance.py +10 -6
  211. sky/provision/do/utils.py +4 -3
  212. sky/provision/docker_utils.py +140 -33
  213. sky/provision/fluidstack/instance.py +13 -8
  214. sky/provision/gcp/__init__.py +1 -0
  215. sky/provision/gcp/config.py +301 -19
  216. sky/provision/gcp/constants.py +218 -0
  217. sky/provision/gcp/instance.py +36 -8
  218. sky/provision/gcp/instance_utils.py +18 -4
  219. sky/provision/gcp/volume_utils.py +247 -0
  220. sky/provision/hyperbolic/__init__.py +12 -0
  221. sky/provision/hyperbolic/config.py +10 -0
  222. sky/provision/hyperbolic/instance.py +437 -0
  223. sky/provision/hyperbolic/utils.py +373 -0
  224. sky/provision/instance_setup.py +101 -20
  225. sky/provision/kubernetes/__init__.py +5 -0
  226. sky/provision/kubernetes/config.py +9 -52
  227. sky/provision/kubernetes/constants.py +17 -0
  228. sky/provision/kubernetes/instance.py +919 -280
  229. sky/provision/kubernetes/manifests/fusermount-server-daemonset.yaml +1 -2
  230. sky/provision/kubernetes/network.py +27 -17
  231. sky/provision/kubernetes/network_utils.py +44 -43
  232. sky/provision/kubernetes/utils.py +1221 -534
  233. sky/provision/kubernetes/volume.py +343 -0
  234. sky/provision/lambda_cloud/instance.py +22 -16
  235. sky/provision/nebius/constants.py +50 -0
  236. sky/provision/nebius/instance.py +19 -6
  237. sky/provision/nebius/utils.py +237 -137
  238. sky/provision/oci/instance.py +10 -5
  239. sky/provision/paperspace/instance.py +10 -7
  240. sky/provision/paperspace/utils.py +1 -1
  241. sky/provision/primeintellect/__init__.py +10 -0
  242. sky/provision/primeintellect/config.py +11 -0
  243. sky/provision/primeintellect/instance.py +454 -0
  244. sky/provision/primeintellect/utils.py +398 -0
  245. sky/provision/provisioner.py +117 -36
  246. sky/provision/runpod/__init__.py +5 -0
  247. sky/provision/runpod/instance.py +27 -6
  248. sky/provision/runpod/utils.py +51 -18
  249. sky/provision/runpod/volume.py +214 -0
  250. sky/provision/scp/__init__.py +15 -0
  251. sky/provision/scp/config.py +93 -0
  252. sky/provision/scp/instance.py +707 -0
  253. sky/provision/seeweb/__init__.py +11 -0
  254. sky/provision/seeweb/config.py +13 -0
  255. sky/provision/seeweb/instance.py +812 -0
  256. sky/provision/shadeform/__init__.py +11 -0
  257. sky/provision/shadeform/config.py +12 -0
  258. sky/provision/shadeform/instance.py +351 -0
  259. sky/provision/shadeform/shadeform_utils.py +83 -0
  260. sky/provision/ssh/__init__.py +18 -0
  261. sky/provision/vast/instance.py +13 -8
  262. sky/provision/vast/utils.py +10 -7
  263. sky/provision/volume.py +164 -0
  264. sky/provision/vsphere/common/ssl_helper.py +1 -1
  265. sky/provision/vsphere/common/vapiconnect.py +2 -1
  266. sky/provision/vsphere/common/vim_utils.py +4 -4
  267. sky/provision/vsphere/instance.py +15 -10
  268. sky/provision/vsphere/vsphere_utils.py +17 -20
  269. sky/py.typed +0 -0
  270. sky/resources.py +845 -119
  271. sky/schemas/__init__.py +0 -0
  272. sky/schemas/api/__init__.py +0 -0
  273. sky/schemas/api/responses.py +227 -0
  274. sky/schemas/db/README +4 -0
  275. sky/schemas/db/env.py +90 -0
  276. sky/schemas/db/global_user_state/001_initial_schema.py +124 -0
  277. sky/schemas/db/global_user_state/002_add_workspace_to_cluster_history.py +35 -0
  278. sky/schemas/db/global_user_state/003_fix_initial_revision.py +61 -0
  279. sky/schemas/db/global_user_state/004_is_managed.py +34 -0
  280. sky/schemas/db/global_user_state/005_cluster_event.py +32 -0
  281. sky/schemas/db/global_user_state/006_provision_log.py +41 -0
  282. sky/schemas/db/global_user_state/007_cluster_event_request_id.py +34 -0
  283. sky/schemas/db/global_user_state/008_skylet_ssh_tunnel_metadata.py +34 -0
  284. sky/schemas/db/global_user_state/009_last_activity_and_launched_at.py +89 -0
  285. sky/schemas/db/global_user_state/010_save_ssh_key.py +66 -0
  286. sky/schemas/db/global_user_state/011_is_ephemeral.py +34 -0
  287. sky/schemas/db/kv_cache/001_initial_schema.py +29 -0
  288. sky/schemas/db/script.py.mako +28 -0
  289. sky/schemas/db/serve_state/001_initial_schema.py +67 -0
  290. sky/schemas/db/serve_state/002_yaml_content.py +34 -0
  291. sky/schemas/db/skypilot_config/001_initial_schema.py +30 -0
  292. sky/schemas/db/spot_jobs/001_initial_schema.py +97 -0
  293. sky/schemas/db/spot_jobs/002_cluster_pool.py +42 -0
  294. sky/schemas/db/spot_jobs/003_pool_hash.py +34 -0
  295. sky/schemas/db/spot_jobs/004_job_file_contents.py +42 -0
  296. sky/schemas/db/spot_jobs/005_logs_gc.py +38 -0
  297. sky/schemas/db/spot_jobs/006_controller_pid_started_at.py +34 -0
  298. sky/schemas/db/spot_jobs/007_config_file_content.py +34 -0
  299. sky/schemas/generated/__init__.py +0 -0
  300. sky/schemas/generated/autostopv1_pb2.py +36 -0
  301. sky/schemas/generated/autostopv1_pb2.pyi +43 -0
  302. sky/schemas/generated/autostopv1_pb2_grpc.py +146 -0
  303. sky/schemas/generated/jobsv1_pb2.py +86 -0
  304. sky/schemas/generated/jobsv1_pb2.pyi +254 -0
  305. sky/schemas/generated/jobsv1_pb2_grpc.py +542 -0
  306. sky/schemas/generated/managed_jobsv1_pb2.py +76 -0
  307. sky/schemas/generated/managed_jobsv1_pb2.pyi +278 -0
  308. sky/schemas/generated/managed_jobsv1_pb2_grpc.py +278 -0
  309. sky/schemas/generated/servev1_pb2.py +58 -0
  310. sky/schemas/generated/servev1_pb2.pyi +115 -0
  311. sky/schemas/generated/servev1_pb2_grpc.py +322 -0
  312. sky/serve/autoscalers.py +357 -5
  313. sky/serve/client/impl.py +310 -0
  314. sky/serve/client/sdk.py +47 -139
  315. sky/serve/client/sdk_async.py +130 -0
  316. sky/serve/constants.py +12 -9
  317. sky/serve/controller.py +68 -17
  318. sky/serve/load_balancer.py +106 -60
  319. sky/serve/load_balancing_policies.py +116 -2
  320. sky/serve/replica_managers.py +434 -249
  321. sky/serve/serve_rpc_utils.py +179 -0
  322. sky/serve/serve_state.py +569 -257
  323. sky/serve/serve_utils.py +775 -265
  324. sky/serve/server/core.py +66 -711
  325. sky/serve/server/impl.py +1093 -0
  326. sky/serve/server/server.py +21 -18
  327. sky/serve/service.py +192 -89
  328. sky/serve/service_spec.py +144 -20
  329. sky/serve/spot_placer.py +3 -0
  330. sky/server/auth/__init__.py +0 -0
  331. sky/server/auth/authn.py +50 -0
  332. sky/server/auth/loopback.py +38 -0
  333. sky/server/auth/oauth2_proxy.py +202 -0
  334. sky/server/common.py +478 -182
  335. sky/server/config.py +85 -23
  336. sky/server/constants.py +44 -6
  337. sky/server/daemons.py +295 -0
  338. sky/server/html/token_page.html +185 -0
  339. sky/server/metrics.py +160 -0
  340. sky/server/middleware_utils.py +166 -0
  341. sky/server/requests/executor.py +558 -138
  342. sky/server/requests/payloads.py +364 -24
  343. sky/server/requests/preconditions.py +21 -17
  344. sky/server/requests/process.py +112 -29
  345. sky/server/requests/request_names.py +121 -0
  346. sky/server/requests/requests.py +822 -226
  347. sky/server/requests/serializers/decoders.py +82 -31
  348. sky/server/requests/serializers/encoders.py +140 -22
  349. sky/server/requests/threads.py +117 -0
  350. sky/server/rest.py +455 -0
  351. sky/server/server.py +1309 -285
  352. sky/server/state.py +20 -0
  353. sky/server/stream_utils.py +327 -61
  354. sky/server/uvicorn.py +217 -3
  355. sky/server/versions.py +270 -0
  356. sky/setup_files/MANIFEST.in +11 -1
  357. sky/setup_files/alembic.ini +160 -0
  358. sky/setup_files/dependencies.py +139 -31
  359. sky/setup_files/setup.py +44 -42
  360. sky/sky_logging.py +114 -7
  361. sky/skylet/attempt_skylet.py +106 -24
  362. sky/skylet/autostop_lib.py +129 -8
  363. sky/skylet/configs.py +29 -20
  364. sky/skylet/constants.py +216 -25
  365. sky/skylet/events.py +101 -21
  366. sky/skylet/job_lib.py +345 -164
  367. sky/skylet/log_lib.py +297 -18
  368. sky/skylet/log_lib.pyi +44 -1
  369. sky/skylet/providers/ibm/node_provider.py +12 -8
  370. sky/skylet/providers/ibm/vpc_provider.py +13 -12
  371. sky/skylet/ray_patches/__init__.py +17 -3
  372. sky/skylet/ray_patches/autoscaler.py.diff +18 -0
  373. sky/skylet/ray_patches/cli.py.diff +19 -0
  374. sky/skylet/ray_patches/command_runner.py.diff +17 -0
  375. sky/skylet/ray_patches/log_monitor.py.diff +20 -0
  376. sky/skylet/ray_patches/resource_demand_scheduler.py.diff +32 -0
  377. sky/skylet/ray_patches/updater.py.diff +18 -0
  378. sky/skylet/ray_patches/worker.py.diff +41 -0
  379. sky/skylet/runtime_utils.py +21 -0
  380. sky/skylet/services.py +568 -0
  381. sky/skylet/skylet.py +72 -4
  382. sky/skylet/subprocess_daemon.py +104 -29
  383. sky/skypilot_config.py +506 -99
  384. sky/ssh_node_pools/__init__.py +1 -0
  385. sky/ssh_node_pools/core.py +135 -0
  386. sky/ssh_node_pools/server.py +233 -0
  387. sky/task.py +685 -163
  388. sky/templates/aws-ray.yml.j2 +11 -3
  389. sky/templates/azure-ray.yml.j2 +2 -1
  390. sky/templates/cudo-ray.yml.j2 +1 -0
  391. sky/templates/do-ray.yml.j2 +2 -1
  392. sky/templates/fluidstack-ray.yml.j2 +1 -0
  393. sky/templates/gcp-ray.yml.j2 +62 -1
  394. sky/templates/hyperbolic-ray.yml.j2 +68 -0
  395. sky/templates/ibm-ray.yml.j2 +2 -1
  396. sky/templates/jobs-controller.yaml.j2 +27 -24
  397. sky/templates/kubernetes-loadbalancer.yml.j2 +2 -0
  398. sky/templates/kubernetes-ray.yml.j2 +611 -50
  399. sky/templates/lambda-ray.yml.j2 +2 -1
  400. sky/templates/nebius-ray.yml.j2 +34 -12
  401. sky/templates/oci-ray.yml.j2 +1 -0
  402. sky/templates/paperspace-ray.yml.j2 +2 -1
  403. sky/templates/primeintellect-ray.yml.j2 +72 -0
  404. sky/templates/runpod-ray.yml.j2 +10 -1
  405. sky/templates/scp-ray.yml.j2 +4 -50
  406. sky/templates/seeweb-ray.yml.j2 +171 -0
  407. sky/templates/shadeform-ray.yml.j2 +73 -0
  408. sky/templates/sky-serve-controller.yaml.j2 +22 -2
  409. sky/templates/vast-ray.yml.j2 +1 -0
  410. sky/templates/vsphere-ray.yml.j2 +1 -0
  411. sky/templates/websocket_proxy.py +212 -37
  412. sky/usage/usage_lib.py +31 -15
  413. sky/users/__init__.py +0 -0
  414. sky/users/model.conf +15 -0
  415. sky/users/permission.py +397 -0
  416. sky/users/rbac.py +121 -0
  417. sky/users/server.py +720 -0
  418. sky/users/token_service.py +218 -0
  419. sky/utils/accelerator_registry.py +35 -5
  420. sky/utils/admin_policy_utils.py +84 -38
  421. sky/utils/annotations.py +38 -5
  422. sky/utils/asyncio_utils.py +78 -0
  423. sky/utils/atomic.py +1 -1
  424. sky/utils/auth_utils.py +153 -0
  425. sky/utils/benchmark_utils.py +60 -0
  426. sky/utils/cli_utils/status_utils.py +159 -86
  427. sky/utils/cluster_utils.py +31 -9
  428. sky/utils/command_runner.py +354 -68
  429. sky/utils/command_runner.pyi +93 -3
  430. sky/utils/common.py +35 -8
  431. sky/utils/common_utils.py +314 -91
  432. sky/utils/config_utils.py +74 -5
  433. sky/utils/context.py +403 -0
  434. sky/utils/context_utils.py +242 -0
  435. sky/utils/controller_utils.py +383 -89
  436. sky/utils/dag_utils.py +31 -12
  437. sky/utils/db/__init__.py +0 -0
  438. sky/utils/db/db_utils.py +485 -0
  439. sky/utils/db/kv_cache.py +149 -0
  440. sky/utils/db/migration_utils.py +137 -0
  441. sky/utils/directory_utils.py +12 -0
  442. sky/utils/env_options.py +13 -0
  443. sky/utils/git.py +567 -0
  444. sky/utils/git_clone.sh +460 -0
  445. sky/utils/infra_utils.py +195 -0
  446. sky/utils/kubernetes/cleanup-tunnel.sh +62 -0
  447. sky/utils/kubernetes/config_map_utils.py +133 -0
  448. sky/utils/kubernetes/create_cluster.sh +15 -29
  449. sky/utils/kubernetes/delete_cluster.sh +10 -7
  450. sky/utils/kubernetes/deploy_ssh_node_pools.py +1177 -0
  451. sky/utils/kubernetes/exec_kubeconfig_converter.py +22 -31
  452. sky/utils/kubernetes/generate_kind_config.py +6 -66
  453. sky/utils/kubernetes/generate_kubeconfig.sh +4 -1
  454. sky/utils/kubernetes/gpu_labeler.py +18 -8
  455. sky/utils/kubernetes/k8s_gpu_labeler_job.yaml +2 -1
  456. sky/utils/kubernetes/k8s_gpu_labeler_setup.yaml +16 -16
  457. sky/utils/kubernetes/kubernetes_deploy_utils.py +284 -114
  458. sky/utils/kubernetes/rsync_helper.sh +11 -3
  459. sky/utils/kubernetes/ssh-tunnel.sh +379 -0
  460. sky/utils/kubernetes/ssh_utils.py +221 -0
  461. sky/utils/kubernetes_enums.py +8 -15
  462. sky/utils/lock_events.py +94 -0
  463. sky/utils/locks.py +416 -0
  464. sky/utils/log_utils.py +82 -107
  465. sky/utils/perf_utils.py +22 -0
  466. sky/utils/resource_checker.py +298 -0
  467. sky/utils/resources_utils.py +249 -32
  468. sky/utils/rich_utils.py +217 -39
  469. sky/utils/schemas.py +955 -160
  470. sky/utils/serialize_utils.py +16 -0
  471. sky/utils/status_lib.py +10 -0
  472. sky/utils/subprocess_utils.py +29 -15
  473. sky/utils/tempstore.py +70 -0
  474. sky/utils/thread_utils.py +91 -0
  475. sky/utils/timeline.py +26 -53
  476. sky/utils/ux_utils.py +84 -15
  477. sky/utils/validator.py +11 -1
  478. sky/utils/volume.py +165 -0
  479. sky/utils/yaml_utils.py +111 -0
  480. sky/volumes/__init__.py +13 -0
  481. sky/volumes/client/__init__.py +0 -0
  482. sky/volumes/client/sdk.py +150 -0
  483. sky/volumes/server/__init__.py +0 -0
  484. sky/volumes/server/core.py +270 -0
  485. sky/volumes/server/server.py +124 -0
  486. sky/volumes/volume.py +215 -0
  487. sky/workspaces/__init__.py +0 -0
  488. sky/workspaces/core.py +655 -0
  489. sky/workspaces/server.py +101 -0
  490. sky/workspaces/utils.py +56 -0
  491. sky_templates/README.md +3 -0
  492. sky_templates/__init__.py +3 -0
  493. sky_templates/ray/__init__.py +0 -0
  494. sky_templates/ray/start_cluster +183 -0
  495. sky_templates/ray/stop_cluster +75 -0
  496. skypilot_nightly-1.0.0.dev20251203.dist-info/METADATA +676 -0
  497. skypilot_nightly-1.0.0.dev20251203.dist-info/RECORD +611 -0
  498. {skypilot_nightly-1.0.0.dev20250502.dist-info → skypilot_nightly-1.0.0.dev20251203.dist-info}/WHEEL +1 -1
  499. skypilot_nightly-1.0.0.dev20251203.dist-info/top_level.txt +2 -0
  500. sky/benchmark/benchmark_state.py +0 -256
  501. sky/benchmark/benchmark_utils.py +0 -641
  502. sky/clouds/service_catalog/constants.py +0 -7
  503. sky/dashboard/out/_next/static/GWvVBSCS7FmUiVmjaL1a7/_buildManifest.js +0 -1
  504. sky/dashboard/out/_next/static/chunks/236-2db3ee3fba33dd9e.js +0 -6
  505. sky/dashboard/out/_next/static/chunks/312-c3c8845990db8ffc.js +0 -15
  506. sky/dashboard/out/_next/static/chunks/37-0a572fe0dbb89c4d.js +0 -6
  507. sky/dashboard/out/_next/static/chunks/678-206dddca808e6d16.js +0 -59
  508. sky/dashboard/out/_next/static/chunks/845-9e60713e0c441abc.js +0 -1
  509. sky/dashboard/out/_next/static/chunks/979-7bf73a4c7cea0f5c.js +0 -1
  510. sky/dashboard/out/_next/static/chunks/fd9d1056-2821b0f0cabcd8bd.js +0 -1
  511. sky/dashboard/out/_next/static/chunks/framework-87d061ee6ed71b28.js +0 -33
  512. sky/dashboard/out/_next/static/chunks/main-app-241eb28595532291.js +0 -1
  513. sky/dashboard/out/_next/static/chunks/main-e0e2335212e72357.js +0 -1
  514. sky/dashboard/out/_next/static/chunks/pages/_app-e6b013bc3f77ad60.js +0 -1
  515. sky/dashboard/out/_next/static/chunks/pages/_error-1be831200e60c5c0.js +0 -1
  516. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-6ac338bc2239cb45.js +0 -1
  517. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-f383db7389368ea7.js +0 -1
  518. sky/dashboard/out/_next/static/chunks/pages/clusters-a93b93e10b8b074e.js +0 -1
  519. sky/dashboard/out/_next/static/chunks/pages/index-f9f039532ca8cbc4.js +0 -1
  520. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-1c519e1afc523dc9.js +0 -1
  521. sky/dashboard/out/_next/static/chunks/pages/jobs-a75029b67aab6a2e.js +0 -1
  522. sky/dashboard/out/_next/static/chunks/webpack-830f59b8404e96b8.js +0 -1
  523. sky/dashboard/out/_next/static/css/c6933bbb2ce7f4dd.css +0 -3
  524. sky/jobs/dashboard/dashboard.py +0 -223
  525. sky/jobs/dashboard/static/favicon.ico +0 -0
  526. sky/jobs/dashboard/templates/index.html +0 -831
  527. sky/jobs/server/dashboard_utils.py +0 -69
  528. sky/skylet/providers/scp/__init__.py +0 -2
  529. sky/skylet/providers/scp/config.py +0 -149
  530. sky/skylet/providers/scp/node_provider.py +0 -578
  531. sky/templates/kubernetes-ssh-jump.yml.j2 +0 -94
  532. sky/utils/db_utils.py +0 -100
  533. sky/utils/kubernetes/deploy_remote_cluster.sh +0 -308
  534. sky/utils/kubernetes/ssh_jump_lifecycle_manager.py +0 -191
  535. skypilot_nightly-1.0.0.dev20250502.dist-info/METADATA +0 -361
  536. skypilot_nightly-1.0.0.dev20250502.dist-info/RECORD +0 -396
  537. skypilot_nightly-1.0.0.dev20250502.dist-info/top_level.txt +0 -1
  538. /sky/{clouds/service_catalog → catalog}/config.py +0 -0
  539. /sky/{benchmark → catalog/data_fetchers}/__init__.py +0 -0
  540. /sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_azure.py +0 -0
  541. /sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_fluidstack.py +0 -0
  542. /sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_ibm.py +0 -0
  543. /sky/{clouds/service_catalog/data_fetchers → client/cli}/__init__.py +0 -0
  544. /sky/dashboard/out/_next/static/{GWvVBSCS7FmUiVmjaL1a7 → 96_E2yl3QAiIJGOYCkSpB}/_ssgManifest.js +0 -0
  545. {skypilot_nightly-1.0.0.dev20250502.dist-info → skypilot_nightly-1.0.0.dev20251203.dist-info}/entry_points.txt +0 -0
  546. {skypilot_nightly-1.0.0.dev20250502.dist-info → skypilot_nightly-1.0.0.dev20251203.dist-info}/licenses/LICENSE +0 -0
sky/skylet/log_lib.py CHANGED
@@ -4,14 +4,17 @@ This is a remote utility module that provides logging functionality.
4
4
  """
5
5
  import collections
6
6
  import copy
7
+ import functools
7
8
  import io
8
9
  import multiprocessing.pool
9
10
  import os
11
+ import queue as queue_lib
10
12
  import shlex
11
13
  import subprocess
12
14
  import sys
13
15
  import tempfile
14
16
  import textwrap
17
+ import threading
15
18
  import time
16
19
  from typing import (Deque, Dict, Iterable, Iterator, List, Optional, TextIO,
17
20
  Tuple, Union)
@@ -21,6 +24,8 @@ import colorama
21
24
  from sky import sky_logging
22
25
  from sky.skylet import constants
23
26
  from sky.skylet import job_lib
27
+ from sky.utils import context
28
+ from sky.utils import context_utils
24
29
  from sky.utils import log_utils
25
30
  from sky.utils import subprocess_utils
26
31
  from sky.utils import ux_utils
@@ -36,6 +41,11 @@ logger = sky_logging.init_logger(__name__)
36
41
 
37
42
  LOG_FILE_START_STREAMING_AT = 'Waiting for task resources on '
38
43
 
44
+ # 16-64KiB seems to be the sweet spot:
45
+ # https://github.com/grpc/grpc.github.io/issues/371
46
+ # TODO(kevin): Benchmark this ourselves and verify.
47
+ DEFAULT_LOG_CHUNK_SIZE = 16 * 1024 # 16KiB
48
+
39
49
 
40
50
  class _ProcessingArgs:
41
51
  """Arguments for processing logs."""
@@ -59,6 +69,16 @@ class _ProcessingArgs:
59
69
  self.streaming_prefix = streaming_prefix
60
70
 
61
71
 
72
+ def _get_context():
73
+ # TODO(aylei): remove this after we drop the backward-compatibility for
74
+ # 0.9.x in 0.12.0
75
+ # Keep backward-compatibility for the old version of SkyPilot runtimes.
76
+ if 'context' in globals():
77
+ return context.get()
78
+ else:
79
+ return None
80
+
81
+
62
82
  def _handle_io_stream(io_stream, out_stream, args: _ProcessingArgs):
63
83
  """Process the stream of a process."""
64
84
  out_io = io.TextIOWrapper(io_stream,
@@ -77,6 +97,9 @@ def _handle_io_stream(io_stream, out_stream, args: _ProcessingArgs):
77
97
  with open(args.log_path, 'a', encoding='utf-8') as fout:
78
98
  with line_processor:
79
99
  while True:
100
+ ctx = _get_context()
101
+ if ctx is not None and ctx.is_canceled():
102
+ return
80
103
  line = out_io.readline()
81
104
  if not line:
82
105
  break
@@ -111,26 +134,24 @@ def _handle_io_stream(io_stream, out_stream, args: _ProcessingArgs):
111
134
  return ''.join(out)
112
135
 
113
136
 
114
- def process_subprocess_stream(proc, args: _ProcessingArgs) -> Tuple[str, str]:
115
- """Redirect the process's filtered stdout/stderr to both stream and file"""
137
+ def process_subprocess_stream(proc, stdout_stream_handler,
138
+ stderr_stream_handler) -> Tuple[str, str]:
139
+ """Process the stream of a process in threads, blocking."""
116
140
  if proc.stderr is not None:
117
141
  # Asyncio does not work as the output processing can be executed in a
118
142
  # different thread.
119
143
  # selectors is possible to handle the multiplexing of stdout/stderr,
120
144
  # but it introduces buffering making the output not streaming.
121
145
  with multiprocessing.pool.ThreadPool(processes=1) as pool:
122
- err_args = copy.copy(args)
123
- err_args.line_processor = None
124
- stderr_fut = pool.apply_async(_handle_io_stream,
125
- args=(proc.stderr, sys.stderr,
126
- err_args))
146
+ stderr_fut = pool.apply_async(stderr_stream_handler,
147
+ args=(proc.stderr, sys.stderr))
127
148
  # Do not launch a thread for stdout as the rich.status does not
128
149
  # work in a thread, which is used in
129
150
  # log_utils.RayUpLineProcessor.
130
- stdout = _handle_io_stream(proc.stdout, sys.stdout, args)
151
+ stdout = stdout_stream_handler(proc.stdout, sys.stdout)
131
152
  stderr = stderr_fut.get()
132
153
  else:
133
- stdout = _handle_io_stream(proc.stdout, sys.stdout, args)
154
+ stdout = stdout_stream_handler(proc.stdout, sys.stdout)
134
155
  stderr = ''
135
156
  return stdout, stderr
136
157
 
@@ -176,7 +197,12 @@ def run_with_log(
176
197
  # Redirect stderr to stdout when using ray, to preserve the order of
177
198
  # stdout and stderr.
178
199
  stdout_arg = stderr_arg = None
179
- if process_stream:
200
+ ctx = _get_context()
201
+ if process_stream or ctx is not None:
202
+ # Capture stdout/stderr of the subprocess if:
203
+ # 1. Post-processing is needed (process_stream=True)
204
+ # 2. Potential contextual handling is needed (ctx is not None)
205
+ # TODO(aylei): can we always capture the stdout/stderr?
180
206
  stdout_arg = subprocess.PIPE
181
207
  stderr_arg = subprocess.PIPE if not with_ray else subprocess.STDOUT
182
208
  # Use stdin=subprocess.DEVNULL by default, as allowing inputs will mess up
@@ -194,9 +220,18 @@ def run_with_log(
194
220
  stdin=stdin,
195
221
  **kwargs) as proc:
196
222
  try:
197
- subprocess_utils.kill_process_daemon(proc.pid)
223
+ if ctx is not None:
224
+ # When runs in coroutine, use kill_pg if available to avoid
225
+ # the overhead of refreshing the process tree in the daemon.
226
+ subprocess_utils.kill_process_daemon(proc.pid, use_kill_pg=True)
227
+ else:
228
+ # For backward compatibility, do not specify use_kill_pg by
229
+ # default.
230
+ subprocess_utils.kill_process_daemon(proc.pid)
198
231
  stdout = ''
199
232
  stderr = ''
233
+ stdout_stream_handler = None
234
+ stderr_stream_handler = None
200
235
 
201
236
  if process_stream:
202
237
  if skip_lines is None:
@@ -223,7 +258,34 @@ def run_with_log(
223
258
  replace_crlf=with_ray,
224
259
  streaming_prefix=streaming_prefix,
225
260
  )
226
- stdout, stderr = process_subprocess_stream(proc, args)
261
+ stdout_stream_handler = functools.partial(
262
+ _handle_io_stream,
263
+ args=args,
264
+ )
265
+ if proc.stderr is not None:
266
+ err_args = copy.copy(args)
267
+ err_args.line_processor = None
268
+ stderr_stream_handler = functools.partial(
269
+ _handle_io_stream,
270
+ args=err_args,
271
+ )
272
+ if ctx is not None:
273
+ # When runs in a coroutine, always process the subprocess
274
+ # stream to:
275
+ # 1. handle context cancellation
276
+ # 2. redirect subprocess stdout/stderr to the contextual
277
+ # stdout/stderr of current coroutine.
278
+ stdout, stderr = context_utils.pipe_and_wait_process(
279
+ ctx,
280
+ proc,
281
+ stdout_stream_handler=stdout_stream_handler,
282
+ stderr_stream_handler=stderr_stream_handler)
283
+ elif process_stream:
284
+ # When runs in a process, only process subprocess stream if
285
+ # necessary to avoid unnecessary stream handling overhead.
286
+ stdout, stderr = process_subprocess_stream(
287
+ proc, stdout_stream_handler, stderr_stream_handler)
288
+ # Ensure returncode is set.
227
289
  proc.wait()
228
290
  if require_outputs:
229
291
  return proc.returncode, stdout, stderr
@@ -305,6 +367,17 @@ def run_bash_command_with_log(bash_command: str,
305
367
  shell=True)
306
368
 
307
369
 
370
def run_bash_command_with_log_and_return_pid(
        bash_command: str,
        log_path: str,
        env_vars: Optional[Dict[str, str]] = None,
        stream_logs: bool = False,
        with_ray: bool = False):
    """Run a bash command with logging and also report the current pid.

    All arguments are forwarded unchanged, in order, to
    `run_bash_command_with_log`.

    Returns:
        A dict with two keys: 'return_code', the value returned by
        `run_bash_command_with_log`, and 'pid', the result of `os.getpid()`
        in the process that executed this function.
    """
    result = {
        'return_code': run_bash_command_with_log(bash_command, log_path,
                                                 env_vars, stream_logs,
                                                 with_ray),
    }
    result['pid'] = os.getpid()
    return result
379
+
380
+
308
381
  def _follow_job_logs(file,
309
382
  job_id: int,
310
383
  start_streaming: bool,
@@ -346,9 +419,9 @@ def _follow_job_logs(file,
346
419
  wait_last_logs = False
347
420
  continue
348
421
  status_str = status.value if status is not None else 'None'
349
- print(ux_utils.finishing_message(
350
- f'Job finished (status: {status_str}).'),
351
- flush=True)
422
+ finish = ux_utils.finishing_message(
423
+ f'Job finished (status: {status_str}).')
424
+ yield finish + '\n'
352
425
  return
353
426
 
354
427
  time.sleep(SKY_LOG_TAILING_GAP_SECONDS)
@@ -495,9 +568,215 @@ def tail_logs(job_id: Optional[int],
495
568
  if start_streaming:
496
569
  print(line, end='', flush=True)
497
570
  status_str = status.value if status is not None else 'None'
498
- print(ux_utils.finishing_message(
499
- f'Job finished (status: {status_str}).'),
500
- flush=True)
571
+ # Only show "Job finished" for actually terminal states
572
+ if status is not None and status.is_terminal():
573
+ print(ux_utils.finishing_message(
574
+ f'Job finished (status: {status_str}).'),
575
+ flush=True)
501
576
  except FileNotFoundError:
502
577
  print(f'{colorama.Fore.RED}ERROR: Logs for job {job_id} (status:'
503
578
  f' {status.value}) does not exist.{colorama.Style.RESET_ALL}')
579
+
580
+
581
def tail_logs_iter(job_id: Optional[int],
                   log_dir: Optional[str],
                   managed_job_id: Optional[int] = None,
                   follow: bool = True,
                   tail: int = 0) -> Iterator[str]:
    """Tail the logs of a job, yielding the output instead of printing it.

    This is mostly the same as tail_logs, but returns an iterator instead of
    printing to stdout/stderr.

    Args:
        job_id: The job whose `run.log` should be streamed. If None, nothing
            is yielded.
        log_dir: Directory containing `run.log`. If None, a single
            "not found" message is yielded.
        managed_job_id: If given, used (instead of job_id) in user-facing
            messages that refer to the job.
        follow: If True and the job is SETTING_UP, PENDING or RUNNING, keep
            following the log file via `_follow_job_logs`.
        tail: If > 0, only the last `tail` lines of the existing log content
            are considered.

    Yields:
        Log lines (with their original line endings) and status/error
        messages terminated by a newline.
    """
    if job_id is None:
        # This only happens when job_lib.get_latest_job_id() returns None,
        # which means no job has been submitted to this cluster. See
        # sky.skylet.job_lib.JobLibCodeGen.tail_logs for more details.
        logger.info('Skip streaming logs as no job has been submitted.')
        return
    job_str = f'job {job_id}'
    if managed_job_id is not None:
        job_str = f'managed job {managed_job_id}'
    if log_dir is None:
        msg = f'{job_str.capitalize()} not found (see `sky queue`).'
        yield msg + '\n'
        return
    logger.debug(f'Tailing logs for job, real job_id {job_id}, managed_job_id '
                 f'{managed_job_id}.')
    log_path = os.path.join(log_dir, 'run.log')
    log_path = os.path.expanduser(log_path)

    status = job_lib.update_job_status([job_id], silent=True)[0]

    # Wait for the log to be written. This is needed due to the `ray submit`
    # will take some time to start the job and write the log.
    retry_cnt = 0
    while status is not None and not status.is_terminal():
        retry_cnt += 1
        if os.path.exists(log_path) and status != job_lib.JobStatus.INIT:
            break
        if retry_cnt >= SKY_LOG_WAITING_MAX_RETRY:
            err = (f'{colorama.Fore.RED}ERROR: Logs for '
                   f'{job_str} (status: {status.value}) does not exist '
                   f'after retrying {retry_cnt} times.'
                   f'{colorama.Style.RESET_ALL}')
            yield err + '\n'
            return
        waiting = (f'INFO: Waiting {SKY_LOG_WAITING_GAP_SECONDS}s for the logs '
                   'to be written...')
        yield waiting + '\n'
        time.sleep(SKY_LOG_WAITING_GAP_SECONDS)
        status = job_lib.update_job_status([job_id], silent=True)[0]

    start_stream_at = LOG_FILE_START_STREAMING_AT
    # Explicitly declare the type to avoid mypy warning.
    lines: Iterable[str] = []
    if follow and status in [
            job_lib.JobStatus.SETTING_UP,
            job_lib.JobStatus.PENDING,
            job_lib.JobStatus.RUNNING,
    ]:
        # Not using `ray job logs` because it will put progress bar in
        # multiple lines.
        with open(log_path, 'r', newline='', encoding='utf-8') as log_file:
            # Using `_follow` instead of `tail -f` to streaming the whole
            # log and creating a new process for tail.
            start_streaming = False
            if tail > 0:
                head_lines_of_log_file = _peek_head_lines(log_file)
                lines = collections.deque(log_file, maxlen=tail)
                start_streaming = _should_stream_the_whole_tail_lines(
                    head_lines_of_log_file, lines, start_stream_at)
            for line in lines:
                if start_stream_at in line:
                    start_streaming = True
                if start_streaming:
                    yield line
            # Now, the cursor is at the end of the last lines
            # if tail > 0
            yield from _follow_job_logs(log_file,
                                        job_id=job_id,
                                        start_streaming=start_streaming,
                                        start_streaming_at=start_stream_at)
    else:
        # `status` can be None here (the waiting loop above exits on None), so
        # resolve the printable form once and reuse it in every message,
        # including the FileNotFoundError handler below.
        status_str = status.value if status is not None else 'None'
        try:
            start_streaming = False
            with open(log_path, 'r', encoding='utf-8') as log_file:
                if tail > 0:
                    # If tail > 0, we need to read the last n lines.
                    # We use double ended queue to rotate the last n lines.
                    head_lines_of_log_file = _peek_head_lines(log_file)
                    lines = collections.deque(log_file, maxlen=tail)
                    start_streaming = _should_stream_the_whole_tail_lines(
                        head_lines_of_log_file, lines, start_stream_at)
                else:
                    lines = log_file
                for line in lines:
                    if start_stream_at in line:
                        start_streaming = True
                    if start_streaming:
                        yield line
            # Only show "Job finished" for actually terminal states
            if status is not None and status.is_terminal():
                finish = ux_utils.finishing_message(
                    f'Job finished (status: {status_str}).')
                yield finish + '\n'
            return
        except FileNotFoundError:
            err = (
                f'{colorama.Fore.RED}ERROR: Logs for job {job_id} (status:'
                f' {status_str}) does not exist.{colorama.Style.RESET_ALL}')
            yield err + '\n'
689
+
690
+
691
class LogBuffer:
    """Accumulates log lines in memory so they can be streamed in chunks."""

    def __init__(self, max_chars: int = DEFAULT_LOG_CHUNK_SIZE):
        """Create an empty buffer.

        Args:
            max_chars: Number of buffered characters (not bytes) at which the
                buffer reports that it should be flushed. The UTF-8 encoded
                size may exceed this, since non-ASCII characters take 2-4
                bytes each. That is acceptable because the default chunk
                size is well below the default value of
                grpc.max_receive_message_length, which is 4MB.
        """
        self.max_chars = max_chars
        self._buffer = io.StringIO()

    def _should_flush(self) -> bool:
        # The stream position sits at the end of the written content, so it
        # equals the number of buffered characters.
        buffered_chars = self._buffer.tell()
        return self.max_chars <= buffered_chars

    def flush(self) -> str:
        """Return everything buffered so far and reset the buffer.

        Returns:
            The buffered log lines as a single string, or '' if nothing has
            been written since the last flush.
        """
        if self._buffer.tell() == 0:
            return ''
        pending = self._buffer.getvalue()
        # Rewind first, then cut the stream at the (now zero) position.
        self._buffer.seek(0)
        self._buffer.truncate()
        return pending

    def write(self, line: str) -> bool:
        """Append a line to the buffer.

        Args:
            line: The log line to add.

        Returns:
            True if the buffer has reached max_chars and should be flushed.
        """
        self._buffer.write(line)
        return self._should_flush()

    def close(self):
        """Close the underlying in-memory stream."""
        self._buffer.close()
739
+
740
+
741
def buffered_iter_with_timeout(buffer: LogBuffer, iterable: Iterable[str],
                               timeout: float) -> Iterable[str]:
    """Re-chunk an iterable of strings through a buffer, with a flush timeout.

    Each item of `iterable` is written to `buffer`. The buffered content is
    yielded when the buffer reports it is full, when no item arrives within
    `timeout` seconds, or when `iterable` is exhausted.

    `iterable` is consumed in a background daemon thread so that waiting for
    the next item can time out.

    Args:
        buffer: The buffer items are written to and flushed from.
        iterable: Source of strings to be chunked.
        timeout: Maximum time in seconds to wait for the next item before
            flushing whatever is buffered.

    Yields:
        Non-empty chunks of buffered content.

    Raises:
        Exception: Any exception raised while iterating `iterable` is
            re-raised here, after the content buffered so far has been
            yielded, instead of silently ending the stream.
    """
    # TODO(kevin): Simplify this using asyncio.timeout, once we move
    # the skylet event loop and gRPC server to asyncio.
    # https://docs.python.org/3/library/asyncio-task.html#timeouts

    queue: queue_lib.Queue = queue_lib.Queue()
    sentinel = object()
    # Set once the consumer is done (exhausted, failed or closed early) so the
    # producer stops pulling from `iterable` and filling the unbounded queue.
    stop_event = threading.Event()
    # Holds at most one exception raised by `iterable` in the producer thread.
    producer_errors: List[BaseException] = []

    def producer():
        try:
            for item in iterable:
                if stop_event.is_set():
                    break
                queue.put(item)
        except Exception as e:  # pylint: disable=broad-except
            # Hand the failure over to the consumer; it is re-raised there.
            producer_errors.append(e)
        finally:
            # Always wake up the consumer, even on failure.
            queue.put(sentinel)

    thread = threading.Thread(target=producer, daemon=True)
    thread.start()

    try:
        while True:
            try:
                item = queue.get(timeout=timeout)
            except queue_lib.Empty:
                # Nothing arrived in time: emit what we have so far.
                out = buffer.flush()
                if out:
                    yield out
                continue

            if item is sentinel:
                thread.join()
                out = buffer.flush()
                if out:
                    yield out
                if producer_errors:
                    raise producer_errors[0]
                return

            if buffer.write(item):
                out = buffer.flush()
                if out:
                    yield out
    finally:
        stop_event.set()
sky/skylet/log_lib.pyi CHANGED
@@ -4,13 +4,14 @@ overloaded type hints for run_with_log(), as we need to determine
4
4
  the return type based on the value of require_outputs.
5
5
  """
6
6
  import typing
7
- from typing import Dict, List, Optional, Tuple, Union
7
+ from typing import Dict, Iterable, Iterator, List, Optional, Tuple, Union
8
8
 
9
9
  from typing_extensions import Literal
10
10
 
11
11
  from sky import sky_logging as sky_logging
12
12
  from sky.skylet import constants as constants
13
13
  from sky.skylet import job_lib as job_lib
14
+ from sky.utils import context
14
15
  from sky.utils import log_utils as log_utils
15
16
 
16
17
  SKY_LOG_WAITING_GAP_SECONDS: int = ...
@@ -41,6 +42,10 @@ class _ProcessingArgs:
41
42
  ...
42
43
 
43
44
 
45
+ def _get_context() -> Optional[context.SkyPilotContext]:
46
+ ...
47
+
48
+
44
49
  def _handle_io_stream(io_stream, out_stream, args: _ProcessingArgs):
45
50
  ...
46
51
 
@@ -124,8 +129,46 @@ def run_bash_command_with_log(bash_command: str,
124
129
  ...
125
130
 
126
131
 
132
+ def run_bash_command_with_log_and_return_pid(
133
+ bash_command: str,
134
+ log_path: str,
135
+ env_vars: Optional[Dict[str, str]] = ...,
136
+ stream_logs: bool = ...,
137
+ with_ray: bool = ...):
138
+ ...
139
+
140
+
127
141
  def tail_logs(job_id: int,
128
142
  log_dir: Optional[str],
129
143
  managed_job_id: Optional[int] = ...,
130
144
  follow: bool = ...) -> None:
131
145
  ...
146
+
147
+
148
+ def tail_logs_iter(job_id: Optional[int],
149
+ log_dir: Optional[str],
150
+ managed_job_id: Optional[int] = ...,
151
+ follow: bool = ...,
152
+ tail: int = ...) -> Iterator[str]:
153
+ ...
154
+
155
+
156
+ class LogBuffer:
157
+ max_chars: int
158
+
159
+ def __init__(self, max_chars: int = ...):
160
+ ...
161
+
162
+ def flush(self) -> str:
163
+ ...
164
+
165
+ def write(self, line: str) -> bool:
166
+ ...
167
+
168
+ def close(self):
169
+ ...
170
+
171
+
172
+ def buffered_iter_with_timeout(buffer: LogBuffer, iterable: Iterable[str],
173
+ timeout: float) -> Iterable[str]:
174
+ ...
@@ -24,7 +24,7 @@ import socket
24
24
  import threading
25
25
  import time
26
26
  from pathlib import Path
27
- from pprint import pprint
27
+ from pprint import pformat, pprint
28
28
  from typing import Any, Dict, List, Optional
29
29
  from uuid import uuid4
30
30
 
@@ -67,13 +67,13 @@ def log_in_out(func):
67
67
  logger.debug(
68
68
  f"\n\nEnter {name} from {inspect.stack()[0][3]} "
69
69
  f"{inspect.stack()[1][3]} {inspect.stack()[2][3]} with args: "
70
- f"entered with args:\n{pprint(args)} and kwargs {pprint(kwargs)}"
70
+ f"entered with args:\n{pformat(args)} and kwargs {pformat(kwargs)}"
71
71
  )
72
72
  try:
73
73
  result = func(*args, **kwargs)
74
74
  logger.debug(
75
75
  f"Leave {name} from {inspect.stack()[1][3]} with result "
76
- f"Func Result:{pprint(result)}\n\n"
76
+ f"Func Result:{pformat(result)}\n\n"
77
77
  )
78
78
  except Exception:
79
79
  cli_logger.error(f"Error in {name}")
@@ -445,7 +445,7 @@ class IBMVPCNodeProvider(NodeProvider):
445
445
  """returns the worker's node private ip address"""
446
446
  node = self._get_cached_node(node_id)
447
447
 
448
- # if a bug ocurred, or node data was fetched before primary_ip
448
+ # if a bug occurred, or node data was fetched before primary_ip
449
449
  # was assigned, refetch node data from cloud.
450
450
  try:
451
451
  primary_ip = node["network_interfaces"][0].get("primary_ip")["address"]
@@ -502,8 +502,12 @@ class IBMVPCNodeProvider(NodeProvider):
502
502
 
503
503
  logger.info(f"Creating new VM instance {name}")
504
504
 
505
- security_group_identity_model = {"id": self.vpc_tags["security_group_id"]}
506
- subnet_identity_model = {"id": self.vpc_tags["subnet_id"]}
505
+ if self.vpc_tags is None:
506
+ raise ValueError("vpc_tags must be initialized before creating instances")
507
+ vpc_tags = self.vpc_tags # Help mypy with type narrowing
508
+
509
+ security_group_identity_model = {"id": vpc_tags["security_group_id"]}
510
+ subnet_identity_model = {"id": vpc_tags["subnet_id"]}
507
511
  primary_network_interface = {
508
512
  "name": "eth0",
509
513
  "subnet": subnet_identity_model,
@@ -536,7 +540,7 @@ class IBMVPCNodeProvider(NodeProvider):
536
540
  instance_prototype["keys"] = [key_identity_model]
537
541
  instance_prototype["profile"] = {"name": profile_name}
538
542
  instance_prototype["resource_group"] = {"id": self.resource_group_id}
539
- instance_prototype["vpc"] = {"id": self.vpc_tags["vpc_id"]}
543
+ instance_prototype["vpc"] = {"id": vpc_tags["vpc_id"]}
540
544
  instance_prototype["image"] = {"id": base_config["image_id"]}
541
545
 
542
546
  instance_prototype["zone"] = {"name": self.zone}
@@ -584,7 +588,7 @@ class IBMVPCNodeProvider(NodeProvider):
584
588
  floating_ip_name = f"{RAY_RECYCLABLE}-{uuid4().hex[:4]}"
585
589
  # create a new floating ip
586
590
  logger.debug(f"Creating floating IP {floating_ip_name}")
587
- floating_ip_prototype = {}
591
+ floating_ip_prototype: Dict[str, Any] = {}
588
592
  floating_ip_prototype["name"] = floating_ip_name
589
593
  floating_ip_prototype["zone"] = {"name": self.zone}
590
594
  floating_ip_prototype["resource_group"] = {"id": self.resource_group_id}
@@ -10,6 +10,7 @@ import textwrap
10
10
  import time
11
11
  import uuid
12
12
  from concurrent.futures import ThreadPoolExecutor
13
+ from typing import Any, Dict
13
14
 
14
15
  import requests
15
16
 
@@ -173,7 +174,7 @@ class IBMVPCProvider:
173
174
  "a subnet"
174
175
  )
175
176
 
176
- subnet_prototype = {}
177
+ subnet_prototype: Dict[str, Any] = {}
177
178
  subnet_prototype["zone"] = {"name": zone_name}
178
179
  subnet_prototype["ip_version"] = "ipv4"
179
180
  subnet_prototype["name"] = subnet_name
@@ -186,7 +187,7 @@ class IBMVPCProvider:
186
187
 
187
188
  def create_public_gateway(self, vpc_id, zone_name, subnet_data):
188
189
 
189
- gateway_prototype = {}
190
+ gateway_prototype: Dict[str, Any] = {}
190
191
  gateway_prototype["vpc"] = {"id": vpc_id}
191
192
  gateway_prototype["zone"] = {"name": zone_name}
192
193
  gateway_prototype["name"] = f"{subnet_data['name']}-gw"
@@ -345,7 +346,7 @@ class IBMVPCProvider:
345
346
  return True
346
347
  tries -= 1
347
348
  time.sleep(sleep_interval)
348
- logger.error("Failed to delete instance within the alloted time\n")
349
+ logger.error("Failed to delete instance within the allotted time\n")
349
350
  return False
350
351
 
351
352
  for subnet_id in self.get_vpc_subnets(vpc_data, region, field="id"):
@@ -522,7 +523,7 @@ class ClusterCleaner:
522
523
  if e.code == 404:
523
524
  print(("VPC doesn't exist."))
524
525
  return None
525
- else: raise
526
+ else: raise
526
527
 
527
528
  def delete_subnets(vpc_data):
528
529
  def _poll_subnet_exists(subnet_id):
@@ -560,12 +561,12 @@ class ClusterCleaner:
560
561
  deleting_resource = False
561
562
  except ibm_cloud_sdk_core.ApiException as e:
562
563
  if e.code == 404:
563
- print("gateway doesn't exist.")
564
+ print("gateway doesn't exist.")
564
565
  deleting_resource = False
565
566
  if e.code == 409:
566
567
  print("gateway still in use.")
567
- # will retry until cloud functions timeout.
568
- time.sleep(5)
568
+ # will retry until cloud functions timeout.
569
+ time.sleep(5)
569
570
 
570
571
  def delete_vms(vpc_id):
571
572
  def _poll_vpc_contains_vms(vpc_id):
@@ -586,7 +587,7 @@ class ClusterCleaner:
586
587
  )
587
588
 
588
589
  def _del_instance(vm_data):
589
- # first delete ips created by node_provider
590
+ # first delete ips created by node_provider
590
591
  nic_id = vm_data["network_interfaces"][0]["id"]
591
592
  res = ibm_vpc_client.list_instance_network_interface_floating_ips(
592
593
  vm_data["id"], nic_id
@@ -598,7 +599,7 @@ class ClusterCleaner:
598
599
  ibm_vpc_client.delete_floating_ip(ip["id"])
599
600
  print(f"Deleting VM: {vm_data['id']}")
600
601
  ibm_vpc_client.delete_instance(id=vm_data["id"])
601
-
602
+
602
603
  res = ibm_vpc_client.list_instances(vpc_id=vpc_id).get_result()
603
604
  num_instances = res["total_count"]
604
605
 
@@ -619,12 +620,12 @@ class ClusterCleaner:
619
620
  deleting_resource = False
620
621
  except ibm_cloud_sdk_core.ApiException as e:
621
622
  if e.code == 404:
622
- print("VPC doesn't exist.")
623
+ print("VPC doesn't exist.")
623
624
  deleting_resource = False
624
625
  if e.code == 409:
625
626
  print("VPC still in use.")
626
- # will retry until cloud functions timeout.
627
- time.sleep(5)
627
+ # will retry until cloud functions timeout.
628
+ time.sleep(5)
628
629
 
629
630
  def delete_vpc(vpc_id):
630
631
  vpc_data = get_vpc_data(vpc_id)
@@ -40,15 +40,29 @@ def _run_patch(target_file,
40
40
  """Applies a patch if it has not been applied already."""
41
41
  # .orig is the original file that is not patched.
42
42
  orig_file = os.path.abspath(f'{target_file}-v{version}.orig')
43
+ # Get diff filename by replacing .patch with .diff
44
+ diff_file = patch_file.replace('.patch', '.diff')
45
+
43
46
  script = f"""\
44
47
  which patch >/dev/null 2>&1 || sudo yum install -y patch || true
45
- which patch >/dev/null 2>&1 || (echo "`patch` is not found. Failed to setup ray." && exit 1)
46
48
  if [ ! -f {orig_file} ]; then
47
49
  echo Create backup file {orig_file}
48
50
  cp {target_file} {orig_file}
49
51
  fi
50
- # It is ok to patch again from the original file.
51
- patch {orig_file} -i {patch_file} -o {target_file}
52
+ if which patch >/dev/null 2>&1; then
53
+ # System patch command is available, use it
54
+ # It is ok to patch again from the original file.
55
+ patch {orig_file} -i {patch_file} -o {target_file}
56
+ else
57
+ # System patch command not available, use Python patch library
58
+ echo "System patch command not available, using Python patch library..."
59
+ python -m pip install patch
60
+ # Get target directory
61
+ target_dir="$(dirname {target_file})"
62
+ # Execute python patch command
63
+ echo "Executing python -m patch -d $target_dir {diff_file}"
64
+ python -m patch -d "$target_dir" "{diff_file}"
65
+ fi
52
66
  """
53
67
  subprocess.run(script, shell=True, check=True)
54
68