skypilot-nightly 1.0.0.dev20250905__py3-none-any.whl → 1.0.0.dev20251210__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (429) hide show
  1. sky/__init__.py +12 -2
  2. sky/adaptors/aws.py +27 -22
  3. sky/adaptors/common.py +25 -2
  4. sky/adaptors/coreweave.py +278 -0
  5. sky/adaptors/do.py +8 -2
  6. sky/adaptors/gcp.py +11 -0
  7. sky/adaptors/ibm.py +5 -2
  8. sky/adaptors/kubernetes.py +64 -0
  9. sky/adaptors/nebius.py +3 -1
  10. sky/adaptors/primeintellect.py +1 -0
  11. sky/adaptors/seeweb.py +183 -0
  12. sky/adaptors/shadeform.py +89 -0
  13. sky/adaptors/slurm.py +478 -0
  14. sky/admin_policy.py +20 -0
  15. sky/authentication.py +157 -263
  16. sky/backends/__init__.py +3 -2
  17. sky/backends/backend.py +11 -3
  18. sky/backends/backend_utils.py +630 -185
  19. sky/backends/cloud_vm_ray_backend.py +1111 -928
  20. sky/backends/local_docker_backend.py +9 -5
  21. sky/backends/task_codegen.py +971 -0
  22. sky/backends/wheel_utils.py +18 -0
  23. sky/catalog/__init__.py +8 -3
  24. sky/catalog/aws_catalog.py +4 -0
  25. sky/catalog/common.py +19 -1
  26. sky/catalog/data_fetchers/fetch_aws.py +102 -80
  27. sky/catalog/data_fetchers/fetch_gcp.py +30 -3
  28. sky/catalog/data_fetchers/fetch_nebius.py +9 -6
  29. sky/catalog/data_fetchers/fetch_runpod.py +698 -0
  30. sky/catalog/data_fetchers/fetch_seeweb.py +329 -0
  31. sky/catalog/data_fetchers/fetch_shadeform.py +142 -0
  32. sky/catalog/kubernetes_catalog.py +36 -32
  33. sky/catalog/primeintellect_catalog.py +95 -0
  34. sky/catalog/runpod_catalog.py +5 -1
  35. sky/catalog/seeweb_catalog.py +184 -0
  36. sky/catalog/shadeform_catalog.py +165 -0
  37. sky/catalog/slurm_catalog.py +243 -0
  38. sky/check.py +87 -46
  39. sky/client/cli/command.py +1004 -434
  40. sky/client/cli/flags.py +4 -2
  41. sky/{volumes/utils.py → client/cli/table_utils.py} +111 -13
  42. sky/client/cli/utils.py +79 -0
  43. sky/client/common.py +12 -2
  44. sky/client/sdk.py +188 -65
  45. sky/client/sdk_async.py +34 -33
  46. sky/cloud_stores.py +82 -3
  47. sky/clouds/__init__.py +8 -0
  48. sky/clouds/aws.py +337 -129
  49. sky/clouds/azure.py +24 -18
  50. sky/clouds/cloud.py +47 -13
  51. sky/clouds/cudo.py +16 -13
  52. sky/clouds/do.py +9 -7
  53. sky/clouds/fluidstack.py +12 -5
  54. sky/clouds/gcp.py +14 -7
  55. sky/clouds/hyperbolic.py +12 -5
  56. sky/clouds/ibm.py +12 -5
  57. sky/clouds/kubernetes.py +80 -45
  58. sky/clouds/lambda_cloud.py +12 -5
  59. sky/clouds/nebius.py +23 -9
  60. sky/clouds/oci.py +19 -12
  61. sky/clouds/paperspace.py +4 -1
  62. sky/clouds/primeintellect.py +317 -0
  63. sky/clouds/runpod.py +85 -24
  64. sky/clouds/scp.py +12 -8
  65. sky/clouds/seeweb.py +477 -0
  66. sky/clouds/shadeform.py +400 -0
  67. sky/clouds/slurm.py +578 -0
  68. sky/clouds/ssh.py +6 -3
  69. sky/clouds/utils/scp_utils.py +61 -50
  70. sky/clouds/vast.py +43 -27
  71. sky/clouds/vsphere.py +14 -16
  72. sky/core.py +296 -195
  73. sky/dashboard/out/404.html +1 -1
  74. sky/dashboard/out/_next/static/KYAhEFa3FTfq4JyKVgo-s/_buildManifest.js +1 -0
  75. sky/dashboard/out/_next/static/chunks/1141-9c810f01ff4f398a.js +11 -0
  76. sky/dashboard/out/_next/static/chunks/1871-7e202677c42f43fe.js +6 -0
  77. sky/dashboard/out/_next/static/chunks/2260-7703229c33c5ebd5.js +1 -0
  78. sky/dashboard/out/_next/static/chunks/2369.fc20f0c2c8ed9fe7.js +15 -0
  79. sky/dashboard/out/_next/static/chunks/2755.edd818326d489a1d.js +26 -0
  80. sky/dashboard/out/_next/static/chunks/3294.ddda8c6c6f9f24dc.js +1 -0
  81. sky/dashboard/out/_next/static/chunks/3785.7e245f318f9d1121.js +1 -0
  82. sky/dashboard/out/_next/static/chunks/{6601-06114c982db410b6.js → 3800-b589397dc09c5b4e.js} +1 -1
  83. sky/dashboard/out/_next/static/chunks/3850-fd5696f3bbbaddae.js +1 -0
  84. sky/dashboard/out/_next/static/chunks/4725.172ede95d1b21022.js +1 -0
  85. sky/dashboard/out/_next/static/chunks/4937.a2baa2df5572a276.js +15 -0
  86. sky/dashboard/out/_next/static/chunks/6212-7bd06f60ba693125.js +13 -0
  87. sky/dashboard/out/_next/static/chunks/6856-da20c5fd999f319c.js +1 -0
  88. sky/dashboard/out/_next/static/chunks/6990-09cbf02d3cd518c3.js +1 -0
  89. sky/dashboard/out/_next/static/chunks/7359-c8d04e06886000b3.js +30 -0
  90. sky/dashboard/out/_next/static/chunks/7615-019513abc55b3b47.js +1 -0
  91. sky/dashboard/out/_next/static/chunks/8640.5b9475a2d18c5416.js +16 -0
  92. sky/dashboard/out/_next/static/chunks/8969-452f9d5cbdd2dc73.js +1 -0
  93. sky/dashboard/out/_next/static/chunks/9025.fa408f3242e9028d.js +6 -0
  94. sky/dashboard/out/_next/static/chunks/9353-8369df1cf105221c.js +1 -0
  95. sky/dashboard/out/_next/static/chunks/9360.a536cf6b1fa42355.js +31 -0
  96. sky/dashboard/out/_next/static/chunks/9847.3aaca6bb33455140.js +30 -0
  97. sky/dashboard/out/_next/static/chunks/pages/_app-68b647e26f9d2793.js +34 -0
  98. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-33f525539665fdfd.js +16 -0
  99. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-a7565f586ef86467.js +1 -0
  100. sky/dashboard/out/_next/static/chunks/pages/clusters-9e5d47818b9bdadd.js +1 -0
  101. sky/dashboard/out/_next/static/chunks/pages/{config-dfb9bf07b13045f4.js → config-718cdc365de82689.js} +1 -1
  102. sky/dashboard/out/_next/static/chunks/pages/infra/{[context]-6563820e094f68ca.js → [context]-12c559ec4d81fdbd.js} +1 -1
  103. sky/dashboard/out/_next/static/chunks/pages/{infra-aabba60d57826e0f.js → infra-d187cd0413d72475.js} +1 -1
  104. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-895847b6cf200b04.js +16 -0
  105. sky/dashboard/out/_next/static/chunks/pages/jobs/pools/[pool]-8d0f4655400b4eb9.js +21 -0
  106. sky/dashboard/out/_next/static/chunks/pages/jobs-e5a98f17f8513a96.js +1 -0
  107. sky/dashboard/out/_next/static/chunks/pages/plugins/[...slug]-4f46050ca065d8f8.js +1 -0
  108. sky/dashboard/out/_next/static/chunks/pages/users-2f7646eb77785a2c.js +1 -0
  109. sky/dashboard/out/_next/static/chunks/pages/volumes-ef19d49c6d0e8500.js +1 -0
  110. sky/dashboard/out/_next/static/chunks/pages/workspaces/{[name]-af76bb06dbb3954f.js → [name]-96e0f298308da7e2.js} +1 -1
  111. sky/dashboard/out/_next/static/chunks/pages/{workspaces-7598c33a746cdc91.js → workspaces-cb4da3abe08ebf19.js} +1 -1
  112. sky/dashboard/out/_next/static/chunks/webpack-fba3de387ff6bb08.js +1 -0
  113. sky/dashboard/out/_next/static/css/c5a4cfd2600fc715.css +3 -0
  114. sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
  115. sky/dashboard/out/clusters/[cluster].html +1 -1
  116. sky/dashboard/out/clusters.html +1 -1
  117. sky/dashboard/out/config.html +1 -1
  118. sky/dashboard/out/index.html +1 -1
  119. sky/dashboard/out/infra/[context].html +1 -1
  120. sky/dashboard/out/infra.html +1 -1
  121. sky/dashboard/out/jobs/[job].html +1 -1
  122. sky/dashboard/out/jobs/pools/[pool].html +1 -1
  123. sky/dashboard/out/jobs.html +1 -1
  124. sky/dashboard/out/plugins/[...slug].html +1 -0
  125. sky/dashboard/out/users.html +1 -1
  126. sky/dashboard/out/volumes.html +1 -1
  127. sky/dashboard/out/workspace/new.html +1 -1
  128. sky/dashboard/out/workspaces/[name].html +1 -1
  129. sky/dashboard/out/workspaces.html +1 -1
  130. sky/data/data_utils.py +92 -1
  131. sky/data/mounting_utils.py +177 -30
  132. sky/data/storage.py +200 -19
  133. sky/data/storage_utils.py +10 -45
  134. sky/exceptions.py +18 -7
  135. sky/execution.py +74 -31
  136. sky/global_user_state.py +605 -191
  137. sky/jobs/__init__.py +2 -0
  138. sky/jobs/client/sdk.py +101 -4
  139. sky/jobs/client/sdk_async.py +31 -5
  140. sky/jobs/constants.py +15 -8
  141. sky/jobs/controller.py +726 -284
  142. sky/jobs/file_content_utils.py +128 -0
  143. sky/jobs/log_gc.py +193 -0
  144. sky/jobs/recovery_strategy.py +250 -100
  145. sky/jobs/scheduler.py +271 -173
  146. sky/jobs/server/core.py +367 -114
  147. sky/jobs/server/server.py +81 -35
  148. sky/jobs/server/utils.py +89 -35
  149. sky/jobs/state.py +1498 -620
  150. sky/jobs/utils.py +771 -306
  151. sky/logs/agent.py +40 -5
  152. sky/logs/aws.py +9 -19
  153. sky/metrics/utils.py +282 -39
  154. sky/models.py +2 -0
  155. sky/optimizer.py +7 -6
  156. sky/provision/__init__.py +38 -1
  157. sky/provision/aws/config.py +34 -13
  158. sky/provision/aws/instance.py +5 -2
  159. sky/provision/azure/instance.py +5 -3
  160. sky/provision/common.py +22 -0
  161. sky/provision/cudo/instance.py +4 -3
  162. sky/provision/do/instance.py +4 -3
  163. sky/provision/docker_utils.py +112 -28
  164. sky/provision/fluidstack/instance.py +6 -5
  165. sky/provision/gcp/config.py +6 -1
  166. sky/provision/gcp/instance.py +4 -2
  167. sky/provision/hyperbolic/instance.py +4 -2
  168. sky/provision/instance_setup.py +66 -20
  169. sky/provision/kubernetes/__init__.py +2 -0
  170. sky/provision/kubernetes/config.py +7 -44
  171. sky/provision/kubernetes/constants.py +0 -1
  172. sky/provision/kubernetes/instance.py +609 -213
  173. sky/provision/kubernetes/manifests/fusermount-server-daemonset.yaml +1 -2
  174. sky/provision/kubernetes/network.py +12 -8
  175. sky/provision/kubernetes/network_utils.py +8 -25
  176. sky/provision/kubernetes/utils.py +422 -422
  177. sky/provision/kubernetes/volume.py +150 -18
  178. sky/provision/lambda_cloud/instance.py +16 -13
  179. sky/provision/nebius/instance.py +6 -2
  180. sky/provision/nebius/utils.py +103 -86
  181. sky/provision/oci/instance.py +4 -2
  182. sky/provision/paperspace/instance.py +4 -3
  183. sky/provision/primeintellect/__init__.py +10 -0
  184. sky/provision/primeintellect/config.py +11 -0
  185. sky/provision/primeintellect/instance.py +454 -0
  186. sky/provision/primeintellect/utils.py +398 -0
  187. sky/provision/provisioner.py +45 -15
  188. sky/provision/runpod/__init__.py +2 -0
  189. sky/provision/runpod/instance.py +4 -3
  190. sky/provision/runpod/volume.py +69 -13
  191. sky/provision/scp/instance.py +307 -130
  192. sky/provision/seeweb/__init__.py +11 -0
  193. sky/provision/seeweb/config.py +13 -0
  194. sky/provision/seeweb/instance.py +812 -0
  195. sky/provision/shadeform/__init__.py +11 -0
  196. sky/provision/shadeform/config.py +12 -0
  197. sky/provision/shadeform/instance.py +351 -0
  198. sky/provision/shadeform/shadeform_utils.py +83 -0
  199. sky/provision/slurm/__init__.py +12 -0
  200. sky/provision/slurm/config.py +13 -0
  201. sky/provision/slurm/instance.py +572 -0
  202. sky/provision/slurm/utils.py +583 -0
  203. sky/provision/vast/instance.py +9 -4
  204. sky/provision/vast/utils.py +10 -6
  205. sky/provision/volume.py +164 -0
  206. sky/provision/vsphere/common/ssl_helper.py +1 -1
  207. sky/provision/vsphere/common/vapiconnect.py +2 -1
  208. sky/provision/vsphere/common/vim_utils.py +3 -2
  209. sky/provision/vsphere/instance.py +8 -6
  210. sky/provision/vsphere/vsphere_utils.py +8 -1
  211. sky/resources.py +11 -3
  212. sky/schemas/api/responses.py +107 -6
  213. sky/schemas/db/global_user_state/008_skylet_ssh_tunnel_metadata.py +34 -0
  214. sky/schemas/db/global_user_state/009_last_activity_and_launched_at.py +89 -0
  215. sky/schemas/db/global_user_state/010_save_ssh_key.py +66 -0
  216. sky/schemas/db/global_user_state/011_is_ephemeral.py +34 -0
  217. sky/schemas/db/kv_cache/001_initial_schema.py +29 -0
  218. sky/schemas/db/serve_state/002_yaml_content.py +34 -0
  219. sky/schemas/db/skypilot_config/001_initial_schema.py +30 -0
  220. sky/schemas/db/spot_jobs/002_cluster_pool.py +3 -3
  221. sky/schemas/db/spot_jobs/004_job_file_contents.py +42 -0
  222. sky/schemas/db/spot_jobs/005_logs_gc.py +38 -0
  223. sky/schemas/db/spot_jobs/006_controller_pid_started_at.py +34 -0
  224. sky/schemas/db/spot_jobs/007_config_file_content.py +34 -0
  225. sky/schemas/generated/jobsv1_pb2.py +86 -0
  226. sky/schemas/generated/jobsv1_pb2.pyi +254 -0
  227. sky/schemas/generated/jobsv1_pb2_grpc.py +542 -0
  228. sky/schemas/generated/managed_jobsv1_pb2.py +76 -0
  229. sky/schemas/generated/managed_jobsv1_pb2.pyi +278 -0
  230. sky/schemas/generated/managed_jobsv1_pb2_grpc.py +278 -0
  231. sky/schemas/generated/servev1_pb2.py +58 -0
  232. sky/schemas/generated/servev1_pb2.pyi +115 -0
  233. sky/schemas/generated/servev1_pb2_grpc.py +322 -0
  234. sky/serve/autoscalers.py +2 -0
  235. sky/serve/client/impl.py +55 -21
  236. sky/serve/constants.py +4 -3
  237. sky/serve/controller.py +17 -11
  238. sky/serve/load_balancing_policies.py +1 -1
  239. sky/serve/replica_managers.py +219 -142
  240. sky/serve/serve_rpc_utils.py +179 -0
  241. sky/serve/serve_state.py +63 -54
  242. sky/serve/serve_utils.py +145 -109
  243. sky/serve/server/core.py +46 -25
  244. sky/serve/server/impl.py +311 -162
  245. sky/serve/server/server.py +21 -19
  246. sky/serve/service.py +84 -68
  247. sky/serve/service_spec.py +45 -7
  248. sky/server/auth/loopback.py +38 -0
  249. sky/server/auth/oauth2_proxy.py +12 -7
  250. sky/server/common.py +47 -24
  251. sky/server/config.py +62 -28
  252. sky/server/constants.py +9 -1
  253. sky/server/daemons.py +109 -38
  254. sky/server/metrics.py +76 -96
  255. sky/server/middleware_utils.py +166 -0
  256. sky/server/plugins.py +222 -0
  257. sky/server/requests/executor.py +384 -145
  258. sky/server/requests/payloads.py +83 -19
  259. sky/server/requests/preconditions.py +15 -13
  260. sky/server/requests/request_names.py +123 -0
  261. sky/server/requests/requests.py +511 -157
  262. sky/server/requests/serializers/decoders.py +48 -17
  263. sky/server/requests/serializers/encoders.py +102 -20
  264. sky/server/requests/serializers/return_value_serializers.py +60 -0
  265. sky/server/requests/threads.py +117 -0
  266. sky/server/rest.py +116 -24
  267. sky/server/server.py +497 -179
  268. sky/server/server_utils.py +30 -0
  269. sky/server/stream_utils.py +219 -45
  270. sky/server/uvicorn.py +30 -19
  271. sky/setup_files/MANIFEST.in +6 -1
  272. sky/setup_files/alembic.ini +8 -0
  273. sky/setup_files/dependencies.py +64 -19
  274. sky/setup_files/setup.py +44 -44
  275. sky/sky_logging.py +13 -5
  276. sky/skylet/attempt_skylet.py +116 -24
  277. sky/skylet/configs.py +3 -1
  278. sky/skylet/constants.py +139 -29
  279. sky/skylet/events.py +74 -14
  280. sky/skylet/executor/__init__.py +1 -0
  281. sky/skylet/executor/slurm.py +189 -0
  282. sky/skylet/job_lib.py +143 -105
  283. sky/skylet/log_lib.py +252 -8
  284. sky/skylet/log_lib.pyi +47 -7
  285. sky/skylet/providers/ibm/node_provider.py +12 -8
  286. sky/skylet/providers/ibm/vpc_provider.py +13 -12
  287. sky/skylet/runtime_utils.py +21 -0
  288. sky/skylet/services.py +524 -0
  289. sky/skylet/skylet.py +27 -2
  290. sky/skylet/subprocess_daemon.py +104 -28
  291. sky/skypilot_config.py +99 -79
  292. sky/ssh_node_pools/constants.py +12 -0
  293. sky/ssh_node_pools/core.py +40 -3
  294. sky/ssh_node_pools/deploy/__init__.py +4 -0
  295. sky/ssh_node_pools/deploy/deploy.py +952 -0
  296. sky/ssh_node_pools/deploy/tunnel_utils.py +199 -0
  297. sky/ssh_node_pools/deploy/utils.py +173 -0
  298. sky/ssh_node_pools/server.py +20 -21
  299. sky/{utils/kubernetes/ssh_utils.py → ssh_node_pools/utils.py} +9 -6
  300. sky/task.py +221 -104
  301. sky/templates/aws-ray.yml.j2 +1 -0
  302. sky/templates/azure-ray.yml.j2 +1 -0
  303. sky/templates/cudo-ray.yml.j2 +1 -0
  304. sky/templates/do-ray.yml.j2 +1 -0
  305. sky/templates/fluidstack-ray.yml.j2 +1 -0
  306. sky/templates/gcp-ray.yml.j2 +1 -0
  307. sky/templates/hyperbolic-ray.yml.j2 +1 -0
  308. sky/templates/ibm-ray.yml.j2 +2 -1
  309. sky/templates/jobs-controller.yaml.j2 +3 -0
  310. sky/templates/kubernetes-ray.yml.j2 +204 -55
  311. sky/templates/lambda-ray.yml.j2 +1 -0
  312. sky/templates/nebius-ray.yml.j2 +3 -0
  313. sky/templates/oci-ray.yml.j2 +1 -0
  314. sky/templates/paperspace-ray.yml.j2 +1 -0
  315. sky/templates/primeintellect-ray.yml.j2 +72 -0
  316. sky/templates/runpod-ray.yml.j2 +1 -0
  317. sky/templates/scp-ray.yml.j2 +1 -0
  318. sky/templates/seeweb-ray.yml.j2 +171 -0
  319. sky/templates/shadeform-ray.yml.j2 +73 -0
  320. sky/templates/slurm-ray.yml.j2 +85 -0
  321. sky/templates/vast-ray.yml.j2 +2 -0
  322. sky/templates/vsphere-ray.yml.j2 +1 -0
  323. sky/templates/websocket_proxy.py +188 -43
  324. sky/usage/usage_lib.py +16 -4
  325. sky/users/model.conf +1 -1
  326. sky/users/permission.py +84 -44
  327. sky/users/rbac.py +31 -3
  328. sky/utils/accelerator_registry.py +6 -3
  329. sky/utils/admin_policy_utils.py +18 -5
  330. sky/utils/annotations.py +128 -6
  331. sky/utils/asyncio_utils.py +78 -0
  332. sky/utils/atomic.py +1 -1
  333. sky/utils/auth_utils.py +153 -0
  334. sky/utils/cli_utils/status_utils.py +12 -7
  335. sky/utils/cluster_utils.py +28 -6
  336. sky/utils/command_runner.py +283 -30
  337. sky/utils/command_runner.pyi +63 -7
  338. sky/utils/common.py +3 -1
  339. sky/utils/common_utils.py +55 -7
  340. sky/utils/config_utils.py +1 -14
  341. sky/utils/context.py +127 -40
  342. sky/utils/context_utils.py +73 -18
  343. sky/utils/controller_utils.py +229 -70
  344. sky/utils/db/db_utils.py +95 -18
  345. sky/utils/db/kv_cache.py +149 -0
  346. sky/utils/db/migration_utils.py +24 -7
  347. sky/utils/env_options.py +4 -0
  348. sky/utils/git.py +559 -1
  349. sky/utils/kubernetes/create_cluster.sh +15 -30
  350. sky/utils/kubernetes/delete_cluster.sh +10 -7
  351. sky/utils/kubernetes/generate_kind_config.py +6 -66
  352. sky/utils/kubernetes/gpu_labeler.py +13 -3
  353. sky/utils/kubernetes/k8s_gpu_labeler_job.yaml +2 -1
  354. sky/utils/kubernetes/k8s_gpu_labeler_setup.yaml +16 -16
  355. sky/utils/kubernetes/kubernetes_deploy_utils.py +187 -260
  356. sky/utils/kubernetes/rsync_helper.sh +11 -3
  357. sky/utils/kubernetes/ssh-tunnel.sh +7 -376
  358. sky/utils/kubernetes_enums.py +7 -15
  359. sky/utils/lock_events.py +4 -4
  360. sky/utils/locks.py +128 -31
  361. sky/utils/log_utils.py +0 -319
  362. sky/utils/resource_checker.py +13 -10
  363. sky/utils/resources_utils.py +53 -29
  364. sky/utils/rich_utils.py +8 -4
  365. sky/utils/schemas.py +138 -52
  366. sky/utils/subprocess_utils.py +17 -4
  367. sky/utils/thread_utils.py +91 -0
  368. sky/utils/timeline.py +2 -1
  369. sky/utils/ux_utils.py +35 -1
  370. sky/utils/volume.py +88 -4
  371. sky/utils/yaml_utils.py +9 -0
  372. sky/volumes/client/sdk.py +48 -10
  373. sky/volumes/server/core.py +59 -22
  374. sky/volumes/server/server.py +46 -17
  375. sky/volumes/volume.py +54 -42
  376. sky/workspaces/core.py +57 -21
  377. sky/workspaces/server.py +13 -12
  378. sky_templates/README.md +3 -0
  379. sky_templates/__init__.py +3 -0
  380. sky_templates/ray/__init__.py +0 -0
  381. sky_templates/ray/start_cluster +183 -0
  382. sky_templates/ray/stop_cluster +75 -0
  383. {skypilot_nightly-1.0.0.dev20250905.dist-info → skypilot_nightly-1.0.0.dev20251210.dist-info}/METADATA +343 -65
  384. skypilot_nightly-1.0.0.dev20251210.dist-info/RECORD +629 -0
  385. skypilot_nightly-1.0.0.dev20251210.dist-info/top_level.txt +2 -0
  386. sky/client/cli/git.py +0 -549
  387. sky/dashboard/out/_next/static/chunks/1121-408ed10b2f9fce17.js +0 -1
  388. sky/dashboard/out/_next/static/chunks/1141-943efc7aff0f0c06.js +0 -1
  389. sky/dashboard/out/_next/static/chunks/1836-37fede578e2da5f8.js +0 -40
  390. sky/dashboard/out/_next/static/chunks/3015-86cabed5d4669ad0.js +0 -1
  391. sky/dashboard/out/_next/static/chunks/3294.c80326aec9bfed40.js +0 -6
  392. sky/dashboard/out/_next/static/chunks/3785.4872a2f3aa489880.js +0 -1
  393. sky/dashboard/out/_next/static/chunks/3850-ff4a9a69d978632b.js +0 -1
  394. sky/dashboard/out/_next/static/chunks/4045.b30465273dc5e468.js +0 -21
  395. sky/dashboard/out/_next/static/chunks/4676-9da7fdbde90b5549.js +0 -10
  396. sky/dashboard/out/_next/static/chunks/4725.10f7a9a5d3ea8208.js +0 -1
  397. sky/dashboard/out/_next/static/chunks/5339.3fda4a4010ff4e06.js +0 -51
  398. sky/dashboard/out/_next/static/chunks/6135-4b4d5e824b7f9d3c.js +0 -1
  399. sky/dashboard/out/_next/static/chunks/649.b9d7f7d10c1b8c53.js +0 -45
  400. sky/dashboard/out/_next/static/chunks/6856-dca7962af4814e1b.js +0 -1
  401. sky/dashboard/out/_next/static/chunks/6990-08b2a1cae076a943.js +0 -1
  402. sky/dashboard/out/_next/static/chunks/7325.b4bc99ce0892dcd5.js +0 -6
  403. sky/dashboard/out/_next/static/chunks/754-d0da8ab45f9509e9.js +0 -18
  404. sky/dashboard/out/_next/static/chunks/7669.1f5d9a402bf5cc42.js +0 -36
  405. sky/dashboard/out/_next/static/chunks/8969-0be3036bf86f8256.js +0 -1
  406. sky/dashboard/out/_next/static/chunks/9025.c12318fb6a1a9093.js +0 -6
  407. sky/dashboard/out/_next/static/chunks/9037-fa1737818d0a0969.js +0 -6
  408. sky/dashboard/out/_next/static/chunks/pages/_app-ce361c6959bc2001.js +0 -34
  409. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-1cbba24bd1bd35f8.js +0 -16
  410. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-0b4b35dc1dfe046c.js +0 -16
  411. sky/dashboard/out/_next/static/chunks/pages/clusters-469814d711d63b1b.js +0 -1
  412. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-dd64309c3fe67ed2.js +0 -11
  413. sky/dashboard/out/_next/static/chunks/pages/jobs/pools/[pool]-07349868f7905d37.js +0 -16
  414. sky/dashboard/out/_next/static/chunks/pages/jobs-1f70d9faa564804f.js +0 -1
  415. sky/dashboard/out/_next/static/chunks/pages/users-018bf31cda52e11b.js +0 -1
  416. sky/dashboard/out/_next/static/chunks/pages/volumes-739726d6b823f532.js +0 -1
  417. sky/dashboard/out/_next/static/chunks/webpack-4fe903277b57b523.js +0 -1
  418. sky/dashboard/out/_next/static/css/4614e06482d7309e.css +0 -3
  419. sky/dashboard/out/_next/static/mS-4qZPSkRuA1u-g2wQhg/_buildManifest.js +0 -1
  420. sky/templates/kubernetes-ssh-jump.yml.j2 +0 -94
  421. sky/utils/kubernetes/cleanup-tunnel.sh +0 -62
  422. sky/utils/kubernetes/deploy_remote_cluster.py +0 -1299
  423. sky/utils/kubernetes/ssh_jump_lifecycle_manager.py +0 -191
  424. skypilot_nightly-1.0.0.dev20250905.dist-info/RECORD +0 -547
  425. skypilot_nightly-1.0.0.dev20250905.dist-info/top_level.txt +0 -1
  426. /sky/dashboard/out/_next/static/{mS-4qZPSkRuA1u-g2wQhg → KYAhEFa3FTfq4JyKVgo-s}/_ssgManifest.js +0 -0
  427. {skypilot_nightly-1.0.0.dev20250905.dist-info → skypilot_nightly-1.0.0.dev20251210.dist-info}/WHEEL +0 -0
  428. {skypilot_nightly-1.0.0.dev20250905.dist-info → skypilot_nightly-1.0.0.dev20251210.dist-info}/entry_points.txt +0 -0
  429. {skypilot_nightly-1.0.0.dev20250905.dist-info → skypilot_nightly-1.0.0.dev20251210.dist-info}/licenses/LICENSE +0 -0
sky/utils/common_utils.py CHANGED
@@ -1,8 +1,10 @@
1
1
  """Utils shared between all of sky"""
2
2
 
3
+ import ctypes
3
4
  import difflib
4
5
  import enum
5
6
  import functools
7
+ import gc
6
8
  import getpass
7
9
  import hashlib
8
10
  import inspect
@@ -263,13 +265,16 @@ def get_global_job_id(job_timestamp: str,
263
265
 
264
266
  class Backoff:
265
267
  """Exponential backoff with jittering."""
266
- MULTIPLIER = 1.6
267
268
  JITTER = 0.4
268
269
 
269
- def __init__(self, initial_backoff: float = 5, max_backoff_factor: int = 5):
270
+ def __init__(self,
271
+ initial_backoff: float = 5,
272
+ max_backoff_factor: int = 5,
273
+ multiplier: float = 1.6):
270
274
  self._initial = True
271
275
  self._backoff = 0.0
272
276
  self._initial_backoff = initial_backoff
277
+ self._multiplier = multiplier
273
278
  self._max_backoff = max_backoff_factor * self._initial_backoff
274
279
 
275
280
  # https://github.com/grpc/grpc/blob/2d4f3c56001cd1e1f85734b2f7c5ce5f2797c38a/doc/connection-backoff.md
@@ -281,7 +286,7 @@ class Backoff:
281
286
  self._initial = False
282
287
  self._backoff = min(self._initial_backoff, self._max_backoff)
283
288
  else:
284
- self._backoff = min(self._backoff * self.MULTIPLIER,
289
+ self._backoff = min(self._backoff * self._multiplier,
285
290
  self._max_backoff)
286
291
  self._backoff += random.uniform(-self.JITTER * self._backoff,
287
292
  self.JITTER * self._backoff)
@@ -295,6 +300,7 @@ _current_user: Optional['models.User'] = None
295
300
  _current_request_id: Optional[str] = None
296
301
 
297
302
 
303
+ # TODO(aylei,hailong): request context should be contextual
298
304
  def set_request_context(client_entrypoint: Optional[str],
299
305
  client_command: Optional[str],
300
306
  using_remote_api_server: bool,
@@ -336,19 +342,32 @@ def get_current_command() -> str:
336
342
 
337
343
 
338
344
  def get_current_user() -> 'models.User':
339
- """Returns the current user."""
345
+ """Returns the user in current server session."""
340
346
  if _current_user is not None:
341
347
  return _current_user
342
348
  return models.User.get_current_user()
343
349
 
344
350
 
345
351
  def get_current_user_name() -> str:
346
- """Returns the current user name."""
352
+ """Returns the user name in current server session."""
347
353
  name = get_current_user().name
348
354
  assert name is not None
349
355
  return name
350
356
 
351
357
 
358
+ def get_local_user_name() -> str:
359
+ """Returns the user name in local environment.
360
+
361
+ This is for backward compatibility where anonymous access is implicitly
362
+ allowed when no authentication method at server-side is configured and
363
+ the username from client environment variable will be used to identify the
364
+ user.
365
+ """
366
+ name = os.getenv(constants.USER_ENV_VAR, getpass.getuser())
367
+ assert name is not None
368
+ return name
369
+
370
+
352
371
  def set_current_user(user: 'models.User'):
353
372
  """Sets the current user."""
354
373
  global _current_user
@@ -719,7 +738,8 @@ def find_free_port(start_port: int) -> int:
719
738
  try:
720
739
  s.bind(('', port))
721
740
  return port
722
- except OSError:
741
+ except OSError as e:
742
+ logger.debug(f'Error binding port {port}: {e}')
723
743
  pass
724
744
  raise OSError('No free ports available.')
725
745
 
@@ -994,7 +1014,17 @@ def get_mem_size_gb() -> float:
994
1014
  except ValueError as e:
995
1015
  with ux_utils.print_exception_no_traceback():
996
1016
  raise ValueError(
997
- f'Failed to parse the memory size from {mem_size}') from e
1017
+ f'Failed to parse the memory size from {mem_size} (GB)'
1018
+ ) from e
1019
+ mem_size = os.getenv('SKYPILOT_POD_MEMORY_BYTES_LIMIT')
1020
+ if mem_size is not None:
1021
+ try:
1022
+ return float(mem_size) / (1024**3)
1023
+ except ValueError as e:
1024
+ with ux_utils.print_exception_no_traceback():
1025
+ raise ValueError(
1026
+ f'Failed to parse the memory size from {mem_size} (bytes)'
1027
+ ) from e
998
1028
  return _mem_size_gb()
999
1029
 
1000
1030
 
@@ -1090,3 +1120,21 @@ def removeprefix(string: str, prefix: str) -> str:
1090
1120
  if string.startswith(prefix):
1091
1121
  return string[len(prefix):]
1092
1122
  return string
1123
+
1124
+
1125
+ def release_memory():
1126
+ """Release the process memory"""
1127
+ # Do the best effort to release the python heap and let malloc_trim
1128
+ # be more efficient.
1129
+ try:
1130
+ gc.collect()
1131
+ if sys.platform.startswith('linux'):
1132
+ # Will fail on musl (alpine), but at least it works on our
1133
+ # official docker images.
1134
+ libc = ctypes.CDLL('libc.so.6')
1135
+ return libc.malloc_trim(0)
1136
+ return 0
1137
+ except Exception as e: # pylint: disable=broad-except
1138
+ logger.error(f'Failed to release memory: '
1139
+ f'{format_exception(e)}')
1140
+ return 0
sky/utils/config_utils.py CHANGED
@@ -272,7 +272,7 @@ def get_cloud_config_value_from_dict(
272
272
  """
273
273
  input_config = Config(dict_config)
274
274
  region_key = None
275
- if cloud == 'kubernetes':
275
+ if cloud in ('kubernetes', 'ssh'):
276
276
  region_key = 'context_configs'
277
277
  elif cloud in _REGION_CONFIG_CLOUDS:
278
278
  region_key = 'region_configs'
@@ -283,19 +283,6 @@ def get_cloud_config_value_from_dict(
283
283
  keys=(cloud, region_key, region) + keys,
284
284
  default_value=None,
285
285
  override_configs=override_configs)
286
- if not per_context_config and cloud in _REGION_CONFIG_CLOUDS:
287
- # TODO (kyuds): Backward compatibility, remove after 0.11.0.
288
- per_context_config = input_config.get_nested(
289
- keys=(cloud, region) + keys,
290
- default_value=None,
291
- override_configs=override_configs)
292
- if per_context_config is not None:
293
- logger.info(
294
- f'{cloud} configuration is using the legacy format. \n'
295
- 'This format will be deprecated after 0.11.0, refer to '
296
- '`https://docs.skypilot.co/en/latest/reference/config.html` ' # pylint: disable=line-too-long
297
- 'for the new format. Please use `region_configs` to specify region specific configuration.'
298
- )
299
286
  # if no override found for specified region
300
287
  general_config = input_config.get_nested(keys=(cloud,) + keys,
301
288
  default_value=default_value,
sky/utils/context.py CHANGED
@@ -2,18 +2,23 @@
2
2
 
3
3
  import asyncio
4
4
  from collections.abc import Mapping
5
- from collections.abc import MutableMapping
6
5
  import contextvars
6
+ import copy
7
7
  import functools
8
8
  import os
9
9
  import pathlib
10
10
  import subprocess
11
11
  import sys
12
- import typing
13
- from typing import Any, Callable, Dict, Optional, TextIO, TypeVar
12
+ from typing import (Any, Callable, Coroutine, Dict, Iterator, MutableMapping,
13
+ Optional, TextIO, TYPE_CHECKING, TypeVar)
14
14
 
15
+ from typing_extensions import ParamSpec
15
16
 
16
- class Context(object):
17
+ if TYPE_CHECKING:
18
+ from sky.skypilot_config import ConfigContext
19
+
20
+
21
+ class SkyPilotContext(object):
17
22
  """SkyPilot typed context vars for threads and coroutines.
18
23
 
19
24
  This is a wrapper around `contextvars.ContextVar` that provides a typed
@@ -88,7 +93,7 @@ class Context(object):
88
93
  else:
89
94
  self._log_file_handle = open(log_file, 'a', encoding='utf-8')
90
95
  self._log_file = log_file
91
- if original_log_file is not None:
96
+ if original_log_handle is not None:
92
97
  original_log_handle.close()
93
98
  return original_log_file
94
99
 
@@ -102,11 +107,40 @@ class Context(object):
102
107
  for k, v in envs.items():
103
108
  self.env_overrides[k] = v
104
109
 
110
+ def cleanup(self):
111
+ """Clean up the context."""
112
+ if self._log_file_handle is not None:
113
+ self._log_file_handle.close()
114
+ self._log_file_handle = None
115
+
116
+ def __enter__(self):
117
+ return self
118
+
119
+ def __exit__(self, exc_type, exc_val, exc_tb):
120
+ del exc_type, exc_val, exc_tb
121
+ self.cleanup()
122
+
123
+ def copy(self) -> 'SkyPilotContext':
124
+ """Create a copy of the context.
125
+
126
+ Changes to the current context after this call will not affect the copy.
127
+ The new context will get its own handle/fd for the log file.
128
+ The new context will get an independent copy of the env var overrides.
129
+ The new context will get an independent copy of the config context.
130
+ Cancellation of the current context will not be propagated to the copy.
131
+ """
132
+ new_context = SkyPilotContext()
133
+ new_context.redirect_log(self._log_file)
134
+ new_context.env_overrides = self.env_overrides.copy()
135
+ new_context.config_context = copy.deepcopy(self.config_context)
136
+ return new_context
137
+
105
138
 
106
- _CONTEXT = contextvars.ContextVar('sky_context', default=None)
139
+ _CONTEXT = contextvars.ContextVar[Optional[SkyPilotContext]]('sky_context',
140
+ default=None)
107
141
 
108
142
 
109
- def get() -> Optional[Context]:
143
+ def get() -> Optional[SkyPilotContext]:
110
144
  """Get the current SkyPilot context.
111
145
 
112
146
  If the context is not initialized, get() will return None. This helps
@@ -116,7 +150,7 @@ def get() -> Optional[Context]:
116
150
  return _CONTEXT.get()
117
151
 
118
152
 
119
- class ContextualEnviron(MutableMapping):
153
+ class ContextualEnviron(MutableMapping[str, str]):
120
154
  """Environment variables wrapper with contextual overrides.
121
155
 
122
156
  An instance of ContextualEnviron will typically be used to replace
@@ -124,7 +158,7 @@ class ContextualEnviron(MutableMapping):
124
158
  aware.
125
159
 
126
160
  Behavior of spawning a subprocess:
127
- - The contexual overrides will not be applied to the subprocess by
161
+ - The contextual overrides will not be applied to the subprocess by
128
162
  default.
129
163
  - When using env=os.environ to pass the environment variables to the
130
164
  subprocess explicitly. The subprocess will inherit the contextual
@@ -155,10 +189,10 @@ class ContextualEnviron(MutableMapping):
155
189
  assert os.environ['FOO'] == 'BAR1'
156
190
  """
157
191
 
158
- def __init__(self, environ):
192
+ def __init__(self, environ: 'os._Environ[str]') -> None:
159
193
  self._environ = environ
160
194
 
161
- def __getitem__(self, key):
195
+ def __getitem__(self, key: str) -> str:
162
196
  ctx = get()
163
197
  if ctx is not None:
164
198
  if key in ctx.env_overrides:
@@ -170,51 +204,63 @@ class ContextualEnviron(MutableMapping):
170
204
  return value
171
205
  return self._environ[key]
172
206
 
173
- def __iter__(self):
174
- ctx = get()
175
- deleted_keys = set()
176
- if ctx is not None:
207
+ def __iter__(self) -> Iterator[str]:
208
+
209
+ def iter_from_context(ctx: SkyPilotContext) -> Iterator[str]:
210
+ deleted_keys = set()
177
211
  for key, value in ctx.env_overrides.items():
178
212
  if value is None:
179
213
  deleted_keys.add(key)
180
- yield key
214
+ else:
215
+ yield key
181
216
  for key in self._environ:
182
217
  # Deduplicate the keys
183
218
  if key not in ctx.env_overrides and key not in deleted_keys:
184
219
  yield key
220
+
221
+ ctx = get()
222
+ if ctx is not None:
223
+ return iter_from_context(ctx)
185
224
  else:
186
225
  return self._environ.__iter__()
187
226
 
188
- def __len__(self):
227
+ def __len__(self) -> int:
189
228
  return len(dict(self))
190
229
 
191
- def __setitem__(self, key, value):
230
+ def __setitem__(self, key: str, value: str) -> None:
192
231
  ctx = get()
193
232
  if ctx is not None:
194
233
  ctx.env_overrides[key] = value
195
234
  else:
196
235
  self._environ.__setitem__(key, value)
197
236
 
198
- def __delitem__(self, key):
237
+ def __delitem__(self, key: str) -> None:
199
238
  ctx = get()
200
239
  if ctx is not None:
201
- if key in ctx.env_overrides:
202
- del ctx.env_overrides[key]
203
- elif key in self._environ:
204
- # If the key is not set in the context but set in the environ
205
- # of the process, we mark it as deleted in the context by
206
- # setting the value to None.
240
+ if key in self._environ:
241
+ # If the key is set in the environ of the process, we mark it as
242
+ # deleted in the context by setting the value to None.
243
+ # Note: we must do this even if it was also set in the context,
244
+ # since it could be set in both, and deleting should delete it
245
+ # from both.
207
246
  ctx.env_overrides[key] = None
247
+ elif key in ctx.env_overrides:
248
+ # If the key is set in the context, but not the original
249
+ # environ, we can just delete the override.
250
+ del ctx.env_overrides[key]
208
251
  else:
209
252
  # The key is not set in the context nor the process.
210
253
  raise KeyError(key)
211
254
  else:
212
255
  self._environ.__delitem__(key)
213
256
 
214
- def __repr__(self):
215
- return self._environ.__repr__()
257
+ def __repr__(self) -> str:
258
+ # Adapted from os._Environ.__repr__
259
+ formatted_items = ', '.join(
260
+ f'{key!r}: {value!r}' for key, value in self.items())
261
+ return f'ctx_environ({{{formatted_items}}})'
216
262
 
217
- def copy(self):
263
+ def copy(self) -> Dict[str, str]:
218
264
  copied = self._environ.copy()
219
265
  ctx = get()
220
266
  if ctx is not None:
@@ -225,7 +271,7 @@ class ContextualEnviron(MutableMapping):
225
271
  copied[key] = ctx.env_overrides[key]
226
272
  return copied
227
273
 
228
- def setdefault(self, key, default=None):
274
+ def setdefault(self, key: str, default: str) -> str:
229
275
  return self._environ.setdefault(key, default)
230
276
 
231
277
  def __ior__(self, other):
@@ -257,30 +303,71 @@ class Popen(subprocess.Popen):
257
303
  # Pass a copy of current context.environ to avoid race condition
258
304
  # when the context is updated after the Popen is created.
259
305
  env = os.environ.copy()
260
- super().__init__(*args, env=env, **kwargs)
306
+ super().__init__(*args, env=env,
307
+ **kwargs) # type: ignore[call-overload]
261
308
 
262
309
 
263
- F = TypeVar('F', bound=Callable[..., Any])
310
+ P = ParamSpec('P')
311
+ T = TypeVar('T')
264
312
 
265
313
 
266
- def contextual(func: F) -> F:
314
+ def contextual(func: Callable[P, T]) -> Callable[P, T]:
267
315
  """Decorator to initialize a context before executing the function.
268
316
 
269
- If a context is already initialized, this decorator will reset the context,
270
- i.e. all contextual variables set previously will be cleared.
317
+ If a context is already initialized, this decorator will create a new
318
+ context that inherits the values from the existing context.
271
319
  """
272
320
 
321
+ def run_in_context(*args: P.args, **kwargs: P.kwargs) -> T:
322
+ # Within the new contextvars Context, set up the SkyPilotContext.
323
+ original_ctx = get()
324
+ with initialize(original_ctx):
325
+ return func(*args, **kwargs)
326
+
327
+ @functools.wraps(func)
328
+ def wrapper(*args: P.args, **kwargs: P.kwargs) -> T:
329
+ # Create a copy of the current contextvars Context so that setting the
330
+ # SkyPilotContext does not affect the caller's context in async
331
+ # environments.
332
+ context = contextvars.copy_context()
333
+ return context.run(run_in_context, *args, **kwargs)
334
+
335
+ return wrapper
336
+
337
+
338
+ def contextual_async(
339
+ func: Callable[P, Coroutine[Any, Any, T]]
340
+ ) -> Callable[P, Coroutine[Any, Any, T]]:
341
+ """Decorator to initialize a context before executing the function.
342
+
343
+ If a context is already initialized, this decorator will create a new
344
+ context that inherits the values from the existing context.
345
+ """
346
+
347
+ async def run_in_context(*args: P.args, **kwargs: P.kwargs) -> T:
348
+ # Within the new contextvars Context, set up the SkyPilotContext.
349
+ original_ctx = get()
350
+ with initialize(original_ctx):
351
+ return await func(*args, **kwargs)
352
+
273
353
  @functools.wraps(func)
274
- def wrapper(*args, **kwargs):
275
- initialize()
276
- return func(*args, **kwargs)
354
+ async def wrapper(*args: P.args, **kwargs: P.kwargs) -> T:
355
+ # Create a copy of the current contextvars Context so that setting the
356
+ # SkyPilotContext does not affect the caller's context in async
357
+ # environments.
358
+ context = contextvars.copy_context()
359
+ return await context.run(run_in_context, *args, **kwargs)
277
360
 
278
- return typing.cast(F, wrapper)
361
+ return wrapper
279
362
 
280
363
 
281
- def initialize():
364
+ def initialize(
365
+ base_context: Optional[SkyPilotContext] = None) -> SkyPilotContext:
282
366
  """Initialize the current SkyPilot context."""
283
- _CONTEXT.set(Context())
367
+ new_context = base_context.copy(
368
+ ) if base_context is not None else SkyPilotContext()
369
+ _CONTEXT.set(new_context)
370
+ return new_context
284
371
 
285
372
 
286
373
  class _ContextualStream:
@@ -1,20 +1,27 @@
1
1
  """Utilities for SkyPilot context."""
2
2
  import asyncio
3
+ import concurrent.futures
3
4
  import contextvars
4
5
  import functools
5
- import io
6
6
  import multiprocessing
7
7
  import os
8
+ import select
8
9
  import subprocess
9
10
  import sys
11
+ import time
10
12
  import typing
11
13
  from typing import Any, Callable, IO, Optional, Tuple, TypeVar
12
14
 
15
+ from typing_extensions import ParamSpec
16
+
13
17
  from sky import sky_logging
14
18
  from sky.utils import context
15
19
  from sky.utils import subprocess_utils
16
20
 
17
21
  StreamHandler = Callable[[IO[Any], IO[Any]], str]
22
+ PASSTHROUGH_FLUSH_INTERVAL_SECONDS = 0.5
23
+
24
+ logger = sky_logging.init_logger(__name__)
18
25
 
19
26
 
20
27
  # TODO(aylei): call hijack_sys_attrs() proactivly in module init at server-side
@@ -41,23 +48,53 @@ def hijack_sys_attrs():
41
48
 
42
49
  def passthrough_stream_handler(in_stream: IO[Any], out_stream: IO[Any]) -> str:
43
50
  """Passthrough the stream from the process to the output stream"""
44
- wrapped = io.TextIOWrapper(in_stream,
45
- encoding='utf-8',
46
- newline='',
47
- errors='replace',
48
- write_through=True)
51
+ last_flush_time = time.time()
52
+ has_unflushed_content = False
53
+
54
+ # Use poll() with timeout instead of readline() to avoid blocking.
55
+ # readline() blocks until a newline is available, which can take minutes
56
+ # for tasks that emit logs infrequently (e.g. jupyter lab server).
57
+ # While readline() is blocked, the timing code never executes, so buffered
58
+ # logs never get flushed. poll() with timeout allows us to periodically
59
+ # flush even when no new data is available, ensuring logs appear promptly.
60
+ fd = in_stream.fileno()
61
+ poller = select.poll()
62
+ poller.register(fd, select.POLLIN)
63
+
64
+ # Timeout in milliseconds for poll()
65
+ poll_timeout_ms = int(PASSTHROUGH_FLUSH_INTERVAL_SECONDS * 1000)
66
+
49
67
  while True:
50
- line = wrapped.readline()
51
- if line:
52
- out_stream.write(line)
68
+ # Poll with timeout - returns when data available or timeout
69
+ events = poller.poll(poll_timeout_ms)
70
+
71
+ current_time = time.time()
72
+
73
+ if events:
74
+ # Data is available, read a chunk
75
+ chunk = os.read(fd, 4096) # Read up to 4KB
76
+ if not chunk:
77
+ break # EOF
78
+ out_stream.write(chunk.decode('utf-8', errors='replace'))
79
+ has_unflushed_content = True
80
+
81
+ # Flush only if we have unflushed content and timeout reached
82
+ if (has_unflushed_content and current_time - last_flush_time >=
83
+ PASSTHROUGH_FLUSH_INTERVAL_SECONDS):
53
84
  out_stream.flush()
54
- else:
55
- break
85
+ last_flush_time = current_time
86
+ has_unflushed_content = False
87
+
88
+ poller.unregister(fd)
89
+ # Final flush to ensure all data is written
90
+ if has_unflushed_content:
91
+ out_stream.flush()
92
+
56
93
  return ''
57
94
 
58
95
 
59
96
  def pipe_and_wait_process(
60
- ctx: context.Context,
97
+ ctx: context.SkyPilotContext,
61
98
  proc: subprocess.Popen,
62
99
  poll_interval: float = 0.5,
63
100
  cancel_callback: Optional[Callable[[], None]] = None,
@@ -110,7 +147,7 @@ def pipe_and_wait_process(
110
147
  return stdout, stderr
111
148
 
112
149
 
113
- def wait_process(ctx: context.Context,
150
+ def wait_process(ctx: context.SkyPilotContext,
114
151
  proc: subprocess.Popen,
115
152
  poll_interval: float = 0.5,
116
153
  cancel_callback: Optional[Callable[[], None]] = None):
@@ -128,7 +165,11 @@ def wait_process(ctx: context.Context,
128
165
  # Kill the process despite the caller's callback, the utility
129
166
  # function gracefully handles the case where the process is
130
167
  # already terminated.
131
- subprocess_utils.kill_process_with_grace_period(proc)
168
+ # Bash script typically does not forward SIGTERM to childs, thus
169
+ # cannot be killed gracefully, shorten the grace period for faster
170
+ # termination.
171
+ subprocess_utils.kill_process_with_grace_period(proc,
172
+ grace_period=1)
132
173
  raise asyncio.CancelledError()
133
174
  try:
134
175
  proc.wait(poll_interval)
@@ -173,15 +214,29 @@ def cancellation_guard(func: F) -> F:
173
214
  return typing.cast(F, wrapper)
174
215
 
175
216
 
217
+ P = ParamSpec('P')
218
+ T = TypeVar('T')
219
+
220
+
176
221
  # TODO(aylei): replace this with asyncio.to_thread once we drop support for
177
222
  # python 3.8
178
- def to_thread(func, /, *args, **kwargs):
223
+ def to_thread(func: Callable[P, T], /, *args: P.args,
224
+ **kwargs: P.kwargs) -> 'asyncio.Future[T]':
179
225
  """Asynchronously run function *func* in a separate thread.
180
226
 
181
227
  This is same as asyncio.to_thread added in python 3.9
182
228
  """
229
+ return to_thread_with_executor(None, func, *args, **kwargs)
230
+
231
+
232
+ def to_thread_with_executor(executor: Optional[concurrent.futures.Executor],
233
+ func: Callable[P, T], /, *args: P.args,
234
+ **kwargs: P.kwargs) -> 'asyncio.Future[T]':
235
+ """Asynchronously run function *func* in a separate thread with
236
+ a custom executor."""
237
+
183
238
  loop = asyncio.get_running_loop()
184
- # This is critical to pass the current coroutine context to the new thread
185
239
  pyctx = contextvars.copy_context()
186
- func_call = functools.partial(pyctx.run, func, *args, **kwargs)
187
- return loop.run_in_executor(None, func_call)
240
+ func_call: Callable[..., T] = functools.partial(pyctx.run, func, *args,
241
+ **kwargs)
242
+ return loop.run_in_executor(executor, func_call)