skypilot-nightly 1.0.0.dev20250905__py3-none-any.whl → 1.0.0.dev20251203__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (397)
  1. sky/__init__.py +10 -2
  2. sky/adaptors/aws.py +81 -16
  3. sky/adaptors/common.py +25 -2
  4. sky/adaptors/coreweave.py +278 -0
  5. sky/adaptors/do.py +8 -2
  6. sky/adaptors/gcp.py +11 -0
  7. sky/adaptors/ibm.py +5 -2
  8. sky/adaptors/kubernetes.py +64 -0
  9. sky/adaptors/nebius.py +3 -1
  10. sky/adaptors/primeintellect.py +1 -0
  11. sky/adaptors/seeweb.py +183 -0
  12. sky/adaptors/shadeform.py +89 -0
  13. sky/admin_policy.py +20 -0
  14. sky/authentication.py +157 -263
  15. sky/backends/__init__.py +3 -2
  16. sky/backends/backend.py +11 -3
  17. sky/backends/backend_utils.py +588 -184
  18. sky/backends/cloud_vm_ray_backend.py +1088 -904
  19. sky/backends/local_docker_backend.py +9 -5
  20. sky/backends/task_codegen.py +633 -0
  21. sky/backends/wheel_utils.py +18 -0
  22. sky/catalog/__init__.py +8 -0
  23. sky/catalog/aws_catalog.py +4 -0
  24. sky/catalog/common.py +19 -1
  25. sky/catalog/data_fetchers/fetch_aws.py +102 -80
  26. sky/catalog/data_fetchers/fetch_gcp.py +30 -3
  27. sky/catalog/data_fetchers/fetch_nebius.py +9 -6
  28. sky/catalog/data_fetchers/fetch_runpod.py +698 -0
  29. sky/catalog/data_fetchers/fetch_seeweb.py +329 -0
  30. sky/catalog/data_fetchers/fetch_shadeform.py +142 -0
  31. sky/catalog/kubernetes_catalog.py +24 -28
  32. sky/catalog/primeintellect_catalog.py +95 -0
  33. sky/catalog/runpod_catalog.py +5 -1
  34. sky/catalog/seeweb_catalog.py +184 -0
  35. sky/catalog/shadeform_catalog.py +165 -0
  36. sky/check.py +73 -43
  37. sky/client/cli/command.py +675 -412
  38. sky/client/cli/flags.py +4 -2
  39. sky/{volumes/utils.py → client/cli/table_utils.py} +111 -13
  40. sky/client/cli/utils.py +79 -0
  41. sky/client/common.py +12 -2
  42. sky/client/sdk.py +132 -63
  43. sky/client/sdk_async.py +34 -33
  44. sky/cloud_stores.py +82 -3
  45. sky/clouds/__init__.py +6 -0
  46. sky/clouds/aws.py +337 -129
  47. sky/clouds/azure.py +24 -18
  48. sky/clouds/cloud.py +40 -13
  49. sky/clouds/cudo.py +16 -13
  50. sky/clouds/do.py +9 -7
  51. sky/clouds/fluidstack.py +12 -5
  52. sky/clouds/gcp.py +14 -7
  53. sky/clouds/hyperbolic.py +12 -5
  54. sky/clouds/ibm.py +12 -5
  55. sky/clouds/kubernetes.py +80 -45
  56. sky/clouds/lambda_cloud.py +12 -5
  57. sky/clouds/nebius.py +23 -9
  58. sky/clouds/oci.py +19 -12
  59. sky/clouds/paperspace.py +4 -1
  60. sky/clouds/primeintellect.py +317 -0
  61. sky/clouds/runpod.py +85 -24
  62. sky/clouds/scp.py +12 -8
  63. sky/clouds/seeweb.py +477 -0
  64. sky/clouds/shadeform.py +400 -0
  65. sky/clouds/ssh.py +4 -2
  66. sky/clouds/utils/scp_utils.py +61 -50
  67. sky/clouds/vast.py +33 -27
  68. sky/clouds/vsphere.py +14 -16
  69. sky/core.py +174 -165
  70. sky/dashboard/out/404.html +1 -1
  71. sky/dashboard/out/_next/static/96_E2yl3QAiIJGOYCkSpB/_buildManifest.js +1 -0
  72. sky/dashboard/out/_next/static/chunks/1141-e6aa9ab418717c59.js +11 -0
  73. sky/dashboard/out/_next/static/chunks/1871-7e202677c42f43fe.js +6 -0
  74. sky/dashboard/out/_next/static/chunks/2260-7703229c33c5ebd5.js +1 -0
  75. sky/dashboard/out/_next/static/chunks/2369.fc20f0c2c8ed9fe7.js +15 -0
  76. sky/dashboard/out/_next/static/chunks/2755.edd818326d489a1d.js +26 -0
  77. sky/dashboard/out/_next/static/chunks/3294.20a8540fe697d5ee.js +1 -0
  78. sky/dashboard/out/_next/static/chunks/3785.7e245f318f9d1121.js +1 -0
  79. sky/dashboard/out/_next/static/chunks/{6601-06114c982db410b6.js → 3800-7b45f9fbb6308557.js} +1 -1
  80. sky/dashboard/out/_next/static/chunks/4725.172ede95d1b21022.js +1 -0
  81. sky/dashboard/out/_next/static/chunks/4937.a2baa2df5572a276.js +15 -0
  82. sky/dashboard/out/_next/static/chunks/6212-7bd06f60ba693125.js +13 -0
  83. sky/dashboard/out/_next/static/chunks/6856-8f27d1c10c98def8.js +1 -0
  84. sky/dashboard/out/_next/static/chunks/6990-9146207c4567fdfd.js +1 -0
  85. sky/dashboard/out/_next/static/chunks/7359-c8d04e06886000b3.js +30 -0
  86. sky/dashboard/out/_next/static/chunks/7615-019513abc55b3b47.js +1 -0
  87. sky/dashboard/out/_next/static/chunks/8640.5b9475a2d18c5416.js +16 -0
  88. sky/dashboard/out/_next/static/chunks/8969-452f9d5cbdd2dc73.js +1 -0
  89. sky/dashboard/out/_next/static/chunks/9025.fa408f3242e9028d.js +6 -0
  90. sky/dashboard/out/_next/static/chunks/9353-cff34f7e773b2e2b.js +1 -0
  91. sky/dashboard/out/_next/static/chunks/9360.a536cf6b1fa42355.js +31 -0
  92. sky/dashboard/out/_next/static/chunks/9847.3aaca6bb33455140.js +30 -0
  93. sky/dashboard/out/_next/static/chunks/pages/{_app-ce361c6959bc2001.js → _app-bde01e4a2beec258.js} +1 -1
  94. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-792db96d918c98c9.js +16 -0
  95. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-abfcac9c137aa543.js +1 -0
  96. sky/dashboard/out/_next/static/chunks/pages/clusters-ee39056f9851a3ff.js +1 -0
  97. sky/dashboard/out/_next/static/chunks/pages/infra/{[context]-6563820e094f68ca.js → [context]-c0b5935149902e6f.js} +1 -1
  98. sky/dashboard/out/_next/static/chunks/pages/{infra-aabba60d57826e0f.js → infra-aed0ea19df7cf961.js} +1 -1
  99. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-d66997e2bfc837cf.js +16 -0
  100. sky/dashboard/out/_next/static/chunks/pages/jobs/pools/[pool]-9faf940b253e3e06.js +21 -0
  101. sky/dashboard/out/_next/static/chunks/pages/jobs-2072b48b617989c9.js +1 -0
  102. sky/dashboard/out/_next/static/chunks/pages/users-f42674164aa73423.js +1 -0
  103. sky/dashboard/out/_next/static/chunks/pages/volumes-b84b948ff357c43e.js +1 -0
  104. sky/dashboard/out/_next/static/chunks/pages/workspaces/{[name]-af76bb06dbb3954f.js → [name]-84a40f8c7c627fe4.js} +1 -1
  105. sky/dashboard/out/_next/static/chunks/pages/{workspaces-7598c33a746cdc91.js → workspaces-531b2f8c4bf89f82.js} +1 -1
  106. sky/dashboard/out/_next/static/chunks/webpack-64e05f17bf2cf8ce.js +1 -0
  107. sky/dashboard/out/_next/static/css/0748ce22df867032.css +3 -0
  108. sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
  109. sky/dashboard/out/clusters/[cluster].html +1 -1
  110. sky/dashboard/out/clusters.html +1 -1
  111. sky/dashboard/out/config.html +1 -1
  112. sky/dashboard/out/index.html +1 -1
  113. sky/dashboard/out/infra/[context].html +1 -1
  114. sky/dashboard/out/infra.html +1 -1
  115. sky/dashboard/out/jobs/[job].html +1 -1
  116. sky/dashboard/out/jobs/pools/[pool].html +1 -1
  117. sky/dashboard/out/jobs.html +1 -1
  118. sky/dashboard/out/users.html +1 -1
  119. sky/dashboard/out/volumes.html +1 -1
  120. sky/dashboard/out/workspace/new.html +1 -1
  121. sky/dashboard/out/workspaces/[name].html +1 -1
  122. sky/dashboard/out/workspaces.html +1 -1
  123. sky/data/data_utils.py +92 -1
  124. sky/data/mounting_utils.py +162 -29
  125. sky/data/storage.py +200 -19
  126. sky/data/storage_utils.py +10 -45
  127. sky/exceptions.py +18 -7
  128. sky/execution.py +74 -31
  129. sky/global_user_state.py +605 -191
  130. sky/jobs/__init__.py +2 -0
  131. sky/jobs/client/sdk.py +101 -4
  132. sky/jobs/client/sdk_async.py +31 -5
  133. sky/jobs/constants.py +15 -8
  134. sky/jobs/controller.py +726 -284
  135. sky/jobs/file_content_utils.py +128 -0
  136. sky/jobs/log_gc.py +193 -0
  137. sky/jobs/recovery_strategy.py +250 -100
  138. sky/jobs/scheduler.py +271 -173
  139. sky/jobs/server/core.py +367 -114
  140. sky/jobs/server/server.py +81 -35
  141. sky/jobs/server/utils.py +89 -35
  142. sky/jobs/state.py +1498 -620
  143. sky/jobs/utils.py +771 -306
  144. sky/logs/agent.py +40 -5
  145. sky/logs/aws.py +9 -19
  146. sky/metrics/utils.py +282 -39
  147. sky/optimizer.py +1 -1
  148. sky/provision/__init__.py +37 -1
  149. sky/provision/aws/config.py +34 -13
  150. sky/provision/aws/instance.py +5 -2
  151. sky/provision/azure/instance.py +5 -3
  152. sky/provision/common.py +2 -0
  153. sky/provision/cudo/instance.py +4 -3
  154. sky/provision/do/instance.py +4 -3
  155. sky/provision/docker_utils.py +97 -26
  156. sky/provision/fluidstack/instance.py +6 -5
  157. sky/provision/gcp/config.py +6 -1
  158. sky/provision/gcp/instance.py +4 -2
  159. sky/provision/hyperbolic/instance.py +4 -2
  160. sky/provision/instance_setup.py +66 -20
  161. sky/provision/kubernetes/__init__.py +2 -0
  162. sky/provision/kubernetes/config.py +7 -44
  163. sky/provision/kubernetes/constants.py +0 -1
  164. sky/provision/kubernetes/instance.py +609 -213
  165. sky/provision/kubernetes/manifests/fusermount-server-daemonset.yaml +1 -2
  166. sky/provision/kubernetes/network.py +12 -8
  167. sky/provision/kubernetes/network_utils.py +8 -25
  168. sky/provision/kubernetes/utils.py +382 -418
  169. sky/provision/kubernetes/volume.py +150 -18
  170. sky/provision/lambda_cloud/instance.py +16 -13
  171. sky/provision/nebius/instance.py +6 -2
  172. sky/provision/nebius/utils.py +103 -86
  173. sky/provision/oci/instance.py +4 -2
  174. sky/provision/paperspace/instance.py +4 -3
  175. sky/provision/primeintellect/__init__.py +10 -0
  176. sky/provision/primeintellect/config.py +11 -0
  177. sky/provision/primeintellect/instance.py +454 -0
  178. sky/provision/primeintellect/utils.py +398 -0
  179. sky/provision/provisioner.py +30 -9
  180. sky/provision/runpod/__init__.py +2 -0
  181. sky/provision/runpod/instance.py +4 -3
  182. sky/provision/runpod/volume.py +69 -13
  183. sky/provision/scp/instance.py +307 -130
  184. sky/provision/seeweb/__init__.py +11 -0
  185. sky/provision/seeweb/config.py +13 -0
  186. sky/provision/seeweb/instance.py +812 -0
  187. sky/provision/shadeform/__init__.py +11 -0
  188. sky/provision/shadeform/config.py +12 -0
  189. sky/provision/shadeform/instance.py +351 -0
  190. sky/provision/shadeform/shadeform_utils.py +83 -0
  191. sky/provision/vast/instance.py +5 -3
  192. sky/provision/volume.py +164 -0
  193. sky/provision/vsphere/common/ssl_helper.py +1 -1
  194. sky/provision/vsphere/common/vapiconnect.py +2 -1
  195. sky/provision/vsphere/common/vim_utils.py +3 -2
  196. sky/provision/vsphere/instance.py +8 -6
  197. sky/provision/vsphere/vsphere_utils.py +8 -1
  198. sky/resources.py +11 -3
  199. sky/schemas/api/responses.py +107 -6
  200. sky/schemas/db/global_user_state/008_skylet_ssh_tunnel_metadata.py +34 -0
  201. sky/schemas/db/global_user_state/009_last_activity_and_launched_at.py +89 -0
  202. sky/schemas/db/global_user_state/010_save_ssh_key.py +66 -0
  203. sky/schemas/db/global_user_state/011_is_ephemeral.py +34 -0
  204. sky/schemas/db/kv_cache/001_initial_schema.py +29 -0
  205. sky/schemas/db/serve_state/002_yaml_content.py +34 -0
  206. sky/schemas/db/skypilot_config/001_initial_schema.py +30 -0
  207. sky/schemas/db/spot_jobs/002_cluster_pool.py +3 -3
  208. sky/schemas/db/spot_jobs/004_job_file_contents.py +42 -0
  209. sky/schemas/db/spot_jobs/005_logs_gc.py +38 -0
  210. sky/schemas/db/spot_jobs/006_controller_pid_started_at.py +34 -0
  211. sky/schemas/db/spot_jobs/007_config_file_content.py +34 -0
  212. sky/schemas/generated/jobsv1_pb2.py +86 -0
  213. sky/schemas/generated/jobsv1_pb2.pyi +254 -0
  214. sky/schemas/generated/jobsv1_pb2_grpc.py +542 -0
  215. sky/schemas/generated/managed_jobsv1_pb2.py +76 -0
  216. sky/schemas/generated/managed_jobsv1_pb2.pyi +278 -0
  217. sky/schemas/generated/managed_jobsv1_pb2_grpc.py +278 -0
  218. sky/schemas/generated/servev1_pb2.py +58 -0
  219. sky/schemas/generated/servev1_pb2.pyi +115 -0
  220. sky/schemas/generated/servev1_pb2_grpc.py +322 -0
  221. sky/serve/autoscalers.py +2 -0
  222. sky/serve/client/impl.py +55 -21
  223. sky/serve/constants.py +4 -3
  224. sky/serve/controller.py +17 -11
  225. sky/serve/load_balancing_policies.py +1 -1
  226. sky/serve/replica_managers.py +219 -142
  227. sky/serve/serve_rpc_utils.py +179 -0
  228. sky/serve/serve_state.py +63 -54
  229. sky/serve/serve_utils.py +145 -109
  230. sky/serve/server/core.py +46 -25
  231. sky/serve/server/impl.py +311 -162
  232. sky/serve/server/server.py +21 -19
  233. sky/serve/service.py +84 -68
  234. sky/serve/service_spec.py +45 -7
  235. sky/server/auth/loopback.py +38 -0
  236. sky/server/auth/oauth2_proxy.py +12 -7
  237. sky/server/common.py +47 -24
  238. sky/server/config.py +62 -28
  239. sky/server/constants.py +9 -1
  240. sky/server/daemons.py +109 -38
  241. sky/server/metrics.py +76 -96
  242. sky/server/middleware_utils.py +166 -0
  243. sky/server/requests/executor.py +381 -145
  244. sky/server/requests/payloads.py +71 -18
  245. sky/server/requests/preconditions.py +15 -13
  246. sky/server/requests/request_names.py +121 -0
  247. sky/server/requests/requests.py +507 -157
  248. sky/server/requests/serializers/decoders.py +48 -17
  249. sky/server/requests/serializers/encoders.py +85 -20
  250. sky/server/requests/threads.py +117 -0
  251. sky/server/rest.py +116 -24
  252. sky/server/server.py +420 -172
  253. sky/server/stream_utils.py +219 -45
  254. sky/server/uvicorn.py +30 -19
  255. sky/setup_files/MANIFEST.in +6 -1
  256. sky/setup_files/alembic.ini +8 -0
  257. sky/setup_files/dependencies.py +62 -19
  258. sky/setup_files/setup.py +44 -44
  259. sky/sky_logging.py +13 -5
  260. sky/skylet/attempt_skylet.py +106 -24
  261. sky/skylet/configs.py +3 -1
  262. sky/skylet/constants.py +111 -26
  263. sky/skylet/events.py +64 -10
  264. sky/skylet/job_lib.py +141 -104
  265. sky/skylet/log_lib.py +233 -5
  266. sky/skylet/log_lib.pyi +40 -2
  267. sky/skylet/providers/ibm/node_provider.py +12 -8
  268. sky/skylet/providers/ibm/vpc_provider.py +13 -12
  269. sky/skylet/runtime_utils.py +21 -0
  270. sky/skylet/services.py +524 -0
  271. sky/skylet/skylet.py +22 -1
  272. sky/skylet/subprocess_daemon.py +104 -29
  273. sky/skypilot_config.py +99 -79
  274. sky/ssh_node_pools/server.py +9 -8
  275. sky/task.py +221 -104
  276. sky/templates/aws-ray.yml.j2 +1 -0
  277. sky/templates/azure-ray.yml.j2 +1 -0
  278. sky/templates/cudo-ray.yml.j2 +1 -0
  279. sky/templates/do-ray.yml.j2 +1 -0
  280. sky/templates/fluidstack-ray.yml.j2 +1 -0
  281. sky/templates/gcp-ray.yml.j2 +1 -0
  282. sky/templates/hyperbolic-ray.yml.j2 +1 -0
  283. sky/templates/ibm-ray.yml.j2 +2 -1
  284. sky/templates/jobs-controller.yaml.j2 +3 -0
  285. sky/templates/kubernetes-ray.yml.j2 +196 -55
  286. sky/templates/lambda-ray.yml.j2 +1 -0
  287. sky/templates/nebius-ray.yml.j2 +3 -0
  288. sky/templates/oci-ray.yml.j2 +1 -0
  289. sky/templates/paperspace-ray.yml.j2 +1 -0
  290. sky/templates/primeintellect-ray.yml.j2 +72 -0
  291. sky/templates/runpod-ray.yml.j2 +1 -0
  292. sky/templates/scp-ray.yml.j2 +1 -0
  293. sky/templates/seeweb-ray.yml.j2 +171 -0
  294. sky/templates/shadeform-ray.yml.j2 +73 -0
  295. sky/templates/vast-ray.yml.j2 +1 -0
  296. sky/templates/vsphere-ray.yml.j2 +1 -0
  297. sky/templates/websocket_proxy.py +188 -43
  298. sky/usage/usage_lib.py +16 -4
  299. sky/users/permission.py +60 -43
  300. sky/utils/accelerator_registry.py +6 -3
  301. sky/utils/admin_policy_utils.py +18 -5
  302. sky/utils/annotations.py +22 -0
  303. sky/utils/asyncio_utils.py +78 -0
  304. sky/utils/atomic.py +1 -1
  305. sky/utils/auth_utils.py +153 -0
  306. sky/utils/cli_utils/status_utils.py +12 -7
  307. sky/utils/cluster_utils.py +28 -6
  308. sky/utils/command_runner.py +88 -27
  309. sky/utils/command_runner.pyi +36 -3
  310. sky/utils/common.py +3 -1
  311. sky/utils/common_utils.py +37 -4
  312. sky/utils/config_utils.py +1 -14
  313. sky/utils/context.py +127 -40
  314. sky/utils/context_utils.py +73 -18
  315. sky/utils/controller_utils.py +229 -70
  316. sky/utils/db/db_utils.py +95 -18
  317. sky/utils/db/kv_cache.py +149 -0
  318. sky/utils/db/migration_utils.py +24 -7
  319. sky/utils/env_options.py +4 -0
  320. sky/utils/git.py +559 -1
  321. sky/utils/kubernetes/create_cluster.sh +15 -30
  322. sky/utils/kubernetes/delete_cluster.sh +10 -7
  323. sky/utils/kubernetes/{deploy_remote_cluster.py → deploy_ssh_node_pools.py} +258 -380
  324. sky/utils/kubernetes/generate_kind_config.py +6 -66
  325. sky/utils/kubernetes/gpu_labeler.py +13 -3
  326. sky/utils/kubernetes/k8s_gpu_labeler_job.yaml +2 -1
  327. sky/utils/kubernetes/k8s_gpu_labeler_setup.yaml +16 -16
  328. sky/utils/kubernetes/kubernetes_deploy_utils.py +213 -194
  329. sky/utils/kubernetes/rsync_helper.sh +11 -3
  330. sky/utils/kubernetes_enums.py +7 -15
  331. sky/utils/lock_events.py +4 -4
  332. sky/utils/locks.py +128 -31
  333. sky/utils/log_utils.py +0 -319
  334. sky/utils/resource_checker.py +13 -10
  335. sky/utils/resources_utils.py +53 -29
  336. sky/utils/rich_utils.py +8 -4
  337. sky/utils/schemas.py +107 -52
  338. sky/utils/subprocess_utils.py +17 -4
  339. sky/utils/thread_utils.py +91 -0
  340. sky/utils/timeline.py +2 -1
  341. sky/utils/ux_utils.py +35 -1
  342. sky/utils/volume.py +88 -4
  343. sky/utils/yaml_utils.py +9 -0
  344. sky/volumes/client/sdk.py +48 -10
  345. sky/volumes/server/core.py +59 -22
  346. sky/volumes/server/server.py +46 -17
  347. sky/volumes/volume.py +54 -42
  348. sky/workspaces/core.py +57 -21
  349. sky/workspaces/server.py +13 -12
  350. sky_templates/README.md +3 -0
  351. sky_templates/__init__.py +3 -0
  352. sky_templates/ray/__init__.py +0 -0
  353. sky_templates/ray/start_cluster +183 -0
  354. sky_templates/ray/stop_cluster +75 -0
  355. {skypilot_nightly-1.0.0.dev20250905.dist-info → skypilot_nightly-1.0.0.dev20251203.dist-info}/METADATA +331 -65
  356. skypilot_nightly-1.0.0.dev20251203.dist-info/RECORD +611 -0
  357. skypilot_nightly-1.0.0.dev20251203.dist-info/top_level.txt +2 -0
  358. sky/client/cli/git.py +0 -549
  359. sky/dashboard/out/_next/static/chunks/1121-408ed10b2f9fce17.js +0 -1
  360. sky/dashboard/out/_next/static/chunks/1141-943efc7aff0f0c06.js +0 -1
  361. sky/dashboard/out/_next/static/chunks/1836-37fede578e2da5f8.js +0 -40
  362. sky/dashboard/out/_next/static/chunks/3015-86cabed5d4669ad0.js +0 -1
  363. sky/dashboard/out/_next/static/chunks/3294.c80326aec9bfed40.js +0 -6
  364. sky/dashboard/out/_next/static/chunks/3785.4872a2f3aa489880.js +0 -1
  365. sky/dashboard/out/_next/static/chunks/4045.b30465273dc5e468.js +0 -21
  366. sky/dashboard/out/_next/static/chunks/4676-9da7fdbde90b5549.js +0 -10
  367. sky/dashboard/out/_next/static/chunks/4725.10f7a9a5d3ea8208.js +0 -1
  368. sky/dashboard/out/_next/static/chunks/5339.3fda4a4010ff4e06.js +0 -51
  369. sky/dashboard/out/_next/static/chunks/6135-4b4d5e824b7f9d3c.js +0 -1
  370. sky/dashboard/out/_next/static/chunks/649.b9d7f7d10c1b8c53.js +0 -45
  371. sky/dashboard/out/_next/static/chunks/6856-dca7962af4814e1b.js +0 -1
  372. sky/dashboard/out/_next/static/chunks/6990-08b2a1cae076a943.js +0 -1
  373. sky/dashboard/out/_next/static/chunks/7325.b4bc99ce0892dcd5.js +0 -6
  374. sky/dashboard/out/_next/static/chunks/754-d0da8ab45f9509e9.js +0 -18
  375. sky/dashboard/out/_next/static/chunks/7669.1f5d9a402bf5cc42.js +0 -36
  376. sky/dashboard/out/_next/static/chunks/8969-0be3036bf86f8256.js +0 -1
  377. sky/dashboard/out/_next/static/chunks/9025.c12318fb6a1a9093.js +0 -6
  378. sky/dashboard/out/_next/static/chunks/9037-fa1737818d0a0969.js +0 -6
  379. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-1cbba24bd1bd35f8.js +0 -16
  380. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-0b4b35dc1dfe046c.js +0 -16
  381. sky/dashboard/out/_next/static/chunks/pages/clusters-469814d711d63b1b.js +0 -1
  382. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-dd64309c3fe67ed2.js +0 -11
  383. sky/dashboard/out/_next/static/chunks/pages/jobs/pools/[pool]-07349868f7905d37.js +0 -16
  384. sky/dashboard/out/_next/static/chunks/pages/jobs-1f70d9faa564804f.js +0 -1
  385. sky/dashboard/out/_next/static/chunks/pages/users-018bf31cda52e11b.js +0 -1
  386. sky/dashboard/out/_next/static/chunks/pages/volumes-739726d6b823f532.js +0 -1
  387. sky/dashboard/out/_next/static/chunks/webpack-4fe903277b57b523.js +0 -1
  388. sky/dashboard/out/_next/static/css/4614e06482d7309e.css +0 -3
  389. sky/dashboard/out/_next/static/mS-4qZPSkRuA1u-g2wQhg/_buildManifest.js +0 -1
  390. sky/templates/kubernetes-ssh-jump.yml.j2 +0 -94
  391. sky/utils/kubernetes/ssh_jump_lifecycle_manager.py +0 -191
  392. skypilot_nightly-1.0.0.dev20250905.dist-info/RECORD +0 -547
  393. skypilot_nightly-1.0.0.dev20250905.dist-info/top_level.txt +0 -1
  394. /sky/dashboard/out/_next/static/{mS-4qZPSkRuA1u-g2wQhg → 96_E2yl3QAiIJGOYCkSpB}/_ssgManifest.js +0 -0
  395. {skypilot_nightly-1.0.0.dev20250905.dist-info → skypilot_nightly-1.0.0.dev20251203.dist-info}/WHEEL +0 -0
  396. {skypilot_nightly-1.0.0.dev20250905.dist-info → skypilot_nightly-1.0.0.dev20251203.dist-info}/entry_points.txt +0 -0
  397. {skypilot_nightly-1.0.0.dev20250905.dist-info → skypilot_nightly-1.0.0.dev20251203.dist-info}/licenses/LICENSE +0 -0
sky/skylet/job_lib.py CHANGED
@@ -23,20 +23,22 @@ from sky import global_user_state
 from sky import sky_logging
 from sky.adaptors import common as adaptors_common
 from sky.skylet import constants
+from sky.skylet import runtime_utils
 from sky.utils import common_utils
-from sky.utils import log_utils
 from sky.utils import message_utils
 from sky.utils import subprocess_utils
 from sky.utils.db import db_utils
 
 if typing.TYPE_CHECKING:
     import psutil
+
+    from sky.schemas.generated import jobsv1_pb2
 else:
     psutil = adaptors_common.LazyImport('psutil')
+    jobsv1_pb2 = adaptors_common.LazyImport('sky.schemas.generated.jobsv1_pb2')
 
 logger = sky_logging.init_logger(__name__)
 
-_LINUX_NEW_LINE = '\n'
 _JOB_STATUS_LOCK = '~/.sky/locks/.job_{}.lock'
 # JOB_CMD_IDENTIFIER is used for identifying the process retrieved
 # with pid is the same driver process to guard against the case where
@@ -82,13 +84,9 @@ def create_table(cursor, conn):
     # is not critical and is likely to be enabled by other processes.
 
     # Pid column is used for keeping track of the driver process of a job. It
-    # can be in three states:
-    # -1: The job was submitted with SkyPilot older than #4318, where we use
-    # ray job submit to submit the job, i.e. no pid is recorded. This is for
-    # backward compatibility and should be removed after 0.10.0.
+    # can be in two states:
     # 0: The job driver process has never been started. When adding a job with
-    # INIT state, the pid will be set to 0 (the default -1 value is just for
-    # backward compatibility).
+    # INIT state, the pid will be set to 0.
     # >=0: The job has been started. The pid is the driver process's pid.
     # The driver can be actually running or finished.
     # TODO(SKY-1213): username is actually user hash, should rename.
@@ -144,7 +142,7 @@ def init_db(func):
 
     with _db_init_lock:
         if _DB is None:
-            db_path = os.path.expanduser('~/.sky/jobs.db')
+            db_path = runtime_utils.get_runtime_dir_path('.sky/jobs.db')
             os.makedirs(pathlib.Path(db_path).parents[0], exist_ok=True)
             _DB = db_utils.SQLiteConn(db_path, create_table)
     return func(*args, **kwargs)
@@ -220,6 +218,45 @@ class JobStatus(enum.Enum):
         color = _JOB_STATUS_TO_COLOR[self]
         return f'{color}{self.value}{colorama.Style.RESET_ALL}'
 
+    @classmethod
+    def from_protobuf(
+            cls,
+            protobuf_value: 'jobsv1_pb2.JobStatus') -> Optional['JobStatus']:
+        """Convert protobuf JobStatus enum to Python enum value."""
+        protobuf_to_enum = {
+            jobsv1_pb2.JOB_STATUS_INIT: cls.INIT,
+            jobsv1_pb2.JOB_STATUS_PENDING: cls.PENDING,
+            jobsv1_pb2.JOB_STATUS_SETTING_UP: cls.SETTING_UP,
+            jobsv1_pb2.JOB_STATUS_RUNNING: cls.RUNNING,
+            jobsv1_pb2.JOB_STATUS_FAILED_DRIVER: cls.FAILED_DRIVER,
+            jobsv1_pb2.JOB_STATUS_SUCCEEDED: cls.SUCCEEDED,
+            jobsv1_pb2.JOB_STATUS_FAILED: cls.FAILED,
+            jobsv1_pb2.JOB_STATUS_FAILED_SETUP: cls.FAILED_SETUP,
+            jobsv1_pb2.JOB_STATUS_CANCELLED: cls.CANCELLED,
+            jobsv1_pb2.JOB_STATUS_UNSPECIFIED: None,
+        }
+        if protobuf_value not in protobuf_to_enum:
+            raise ValueError(
+                f'Unknown protobuf JobStatus value: {protobuf_value}')
+        return protobuf_to_enum[protobuf_value]
+
+    def to_protobuf(self) -> 'jobsv1_pb2.JobStatus':
+        """Convert this Python enum value to protobuf enum value."""
+        enum_to_protobuf = {
+            JobStatus.INIT: jobsv1_pb2.JOB_STATUS_INIT,
+            JobStatus.PENDING: jobsv1_pb2.JOB_STATUS_PENDING,
+            JobStatus.SETTING_UP: jobsv1_pb2.JOB_STATUS_SETTING_UP,
+            JobStatus.RUNNING: jobsv1_pb2.JOB_STATUS_RUNNING,
+            JobStatus.FAILED_DRIVER: jobsv1_pb2.JOB_STATUS_FAILED_DRIVER,
+            JobStatus.SUCCEEDED: jobsv1_pb2.JOB_STATUS_SUCCEEDED,
+            JobStatus.FAILED: jobsv1_pb2.JOB_STATUS_FAILED,
+            JobStatus.FAILED_SETUP: jobsv1_pb2.JOB_STATUS_FAILED_SETUP,
+            JobStatus.CANCELLED: jobsv1_pb2.JOB_STATUS_CANCELLED,
+        }
+        if self not in enum_to_protobuf:
+            raise ValueError(f'Unknown JobStatus value: {self}')
+        return enum_to_protobuf[self]
+
 
 # We have two steps for job submissions:
 # 1. Client reserve a job id from the job table by adding a INIT state job.
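
The two converters above give a lossless mapping between the Python JobStatus
enum and the gRPC wire enum. A minimal round-trip sketch, assuming an
environment with this nightly wheel (and its generated jobsv1_pb2 stubs)
installed:

    from sky.schemas.generated import jobsv1_pb2
    from sky.skylet import job_lib

    # Python enum -> protobuf wire value, and back.
    wire = job_lib.JobStatus.RUNNING.to_protobuf()
    assert wire == jobsv1_pb2.JOB_STATUS_RUNNING
    assert job_lib.JobStatus.from_protobuf(wire) is job_lib.JobStatus.RUNNING

    # JOB_STATUS_UNSPECIFIED deliberately maps to None instead of raising.
    assert job_lib.JobStatus.from_protobuf(
        jobsv1_pb2.JOB_STATUS_UNSPECIFIED) is None
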
@@ -261,11 +298,7 @@ class JobScheduler:
                              f'WHERE job_id={job_id!r}'))
         _DB.conn.commit()
         pid = subprocess_utils.launch_new_process_tree(run_cmd)
-        # TODO(zhwu): Backward compatibility, remove this check after 0.10.0.
-        # This is for the case where the job is submitted with SkyPilot older
-        # than #4318, using ray job submit.
-        if 'job submit' in run_cmd:
-            pid = -1
+
         _DB.cursor.execute((f'UPDATE jobs SET pid={pid} '
                             f'WHERE job_id={job_id!r}'))
         _DB.conn.commit()
@@ -475,6 +508,11 @@ def get_status(job_id: int) -> Optional[JobStatus]:
 
 @init_db
 def get_statuses_payload(job_ids: List[Optional[int]]) -> str:
+    return message_utils.encode_payload(get_statuses(job_ids))
+
+
+@init_db
+def get_statuses(job_ids: List[int]) -> Dict[int, Optional[str]]:
     assert _DB is not None
     # Per-job lock is not required here, since the staled job status will not
     # affect the caller.
@@ -482,10 +520,51 @@ def get_statuses_payload(job_ids: List[Optional[int]]) -> str:
     rows = _DB.cursor.execute(
         f'SELECT job_id, status FROM jobs WHERE job_id IN ({query_str})',
         job_ids)
-    statuses = {job_id: None for job_id in job_ids}
+    statuses: Dict[int, Optional[str]] = {job_id: None for job_id in job_ids}
     for (job_id, status) in rows:
         statuses[job_id] = status
-    return message_utils.encode_payload(statuses)
+    return statuses
+
+
+@init_db
+def get_jobs_info(user_hash: Optional[str] = None,
+                  all_jobs: bool = False) -> List['jobsv1_pb2.JobInfo']:
+    """Get detailed job information.
+
+    Similar to dump_job_queue but returns structured protobuf objects instead
+    of encoded strings.
+
+    Args:
+        user_hash: The user hash to show jobs for. Show all the users if None.
+        all_jobs: Whether to show all jobs, not just the pending/running ones.
+    """
+    assert _DB is not None
+
+    status_list: Optional[List[JobStatus]] = [
+        JobStatus.SETTING_UP, JobStatus.PENDING, JobStatus.RUNNING
+    ]
+    if all_jobs:
+        status_list = None
+
+    jobs = _get_jobs(user_hash, status_list=status_list)
+    jobs_info = []
+    for job in jobs:
+        jobs_info.append(
+            jobsv1_pb2.JobInfo(job_id=job['job_id'],
+                               job_name=job['job_name'],
+                               username=job['username'],
+                               submitted_at=job['submitted_at'],
+                               status=job['status'].to_protobuf(),
+                               run_timestamp=job['run_timestamp'],
+                               start_at=job['start_at'],
+                               end_at=job['end_at'],
+                               resources=job['resources'],
+                               pid=job['pid'],
+                               log_path=os.path.join(
+                                   constants.SKY_LOGS_DIRECTORY,
+                                   job['run_timestamp']),
+                               metadata=json.dumps(job['metadata'])))
+    return jobs_info
 
 
 def load_statuses_payload(
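
This hunk shows the pattern repeated throughout this release: each legacy
*_payload function becomes a thin encoder over a new typed function that the
gRPC handlers can call directly, and get_jobs_info() returns jobsv1_pb2.JobInfo
messages instead of an encoded queue. A hedged sketch; it assumes it runs on a
cluster node where the skylet job table (~/.sky/jobs.db or its runtime-dir
equivalent) exists:

    from sky.skylet import job_lib

    # Typed path, used by the new gRPC services: Dict[int, Optional[str]].
    statuses = job_lib.get_statuses([1, 2])

    # Legacy path, kept for older remote callers: the same data encoded as
    # a payload string via message_utils.encode_payload().
    payload = job_lib.get_statuses_payload([1, 2])

    # Structured protobuf objects, ready to embed in a gRPC response.
    infos = job_lib.get_jobs_info(all_jobs=True)
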
@@ -524,16 +603,27 @@ def get_job_submitted_or_ended_timestamp_payload(job_id: int,
     PENDING state.
 
     The normal job duration will use `start_at` instead of `submitted_at` (in
-    `format_job_queue()`), because the job may stay in PENDING if the cluster is
-    busy.
+    `table_utils.format_job_queue()`), because the job may stay in PENDING if
+    the cluster is busy.
+    """
+    return message_utils.encode_payload(
+        get_job_submitted_or_ended_timestamp(job_id, get_ended_time))
+
+
+@init_db
+def get_job_submitted_or_ended_timestamp(
+        job_id: int, get_ended_time: bool) -> Optional[float]:
+    """Get the job submitted timestamp.
+
+    Returns the raw timestamp or None if job doesn't exist.
     """
     assert _DB is not None
     field = 'end_at' if get_ended_time else 'submitted_at'
     rows = _DB.cursor.execute(f'SELECT {field} FROM jobs WHERE job_id=(?)',
                               (job_id,))
     for (timestamp,) in rows:
-        return message_utils.encode_payload(timestamp)
-    return message_utils.encode_payload(None)
+        return timestamp
+    return None
 
 
 def get_ray_port():
@@ -542,7 +632,8 @@ def get_ray_port():
     If the port file does not exist, the cluster was launched before #1790,
     return the default port.
     """
-    port_path = os.path.expanduser(constants.SKY_REMOTE_RAY_PORT_FILE)
+    port_path = runtime_utils.get_runtime_dir_path(
+        constants.SKY_REMOTE_RAY_PORT_FILE)
     if not os.path.exists(port_path):
         return 6379
     port = json.load(open(port_path, 'r', encoding='utf-8'))['ray_port']
@@ -555,7 +646,8 @@ def get_job_submission_port():
     If the port file does not exist, the cluster was launched before #1790,
     return the default port.
     """
-    port_path = os.path.expanduser(constants.SKY_REMOTE_RAY_PORT_FILE)
+    port_path = runtime_utils.get_runtime_dir_path(
+        constants.SKY_REMOTE_RAY_PORT_FILE)
     if not os.path.exists(port_path):
         return 8265
     port = json.load(open(port_path, 'r',
@@ -673,7 +765,7 @@ def update_job_status(job_ids: List[int],
     statuses = []
     for job_id in job_ids:
         # Per-job status lock is required because between the job status
-        # query and the job status update, the job status in the databse
+        # query and the job status update, the job status in the database
         # can be modified by the generated ray program.
         with filelock.FileLock(_get_lock_path(job_id)):
             status = None
@@ -724,12 +816,6 @@ def update_job_status(job_ids: List[int],
                         'the job state is not in terminal states, setting '
                         'it to FAILED_DRIVER')
                     status = JobStatus.FAILED_DRIVER
-                elif job_pid < 0:
-                    # TODO(zhwu): Backward compatibility, remove after 0.10.0.
-                    # We set the job status to PENDING instead of actually
-                    # checking ray job status and let the status in job table
-                    # take effect in the later max.
-                    status = JobStatus.PENDING
 
             pending_job = _get_pending_job(job_id)
             if pending_job is not None:
@@ -842,35 +928,6 @@ def is_cluster_idle() -> bool:
     assert False, 'Should not reach here'
 
 
-def format_job_queue(jobs: List[Dict[str, Any]]):
-    """Format the job queue for display.
-
-    Usage:
-        jobs = get_job_queue()
-        print(format_job_queue(jobs))
-    """
-    job_table = log_utils.create_table([
-        'ID', 'NAME', 'USER', 'SUBMITTED', 'STARTED', 'DURATION', 'RESOURCES',
-        'STATUS', 'LOG', 'GIT COMMIT'
-    ])
-    for job in jobs:
-        job_table.add_row([
-            job['job_id'],
-            job['job_name'],
-            job['username'],
-            log_utils.readable_time_duration(job['submitted_at']),
-            log_utils.readable_time_duration(job['start_at']),
-            log_utils.readable_time_duration(job['start_at'],
-                                             job['end_at'],
-                                             absolute=True),
-            job['resources'],
-            job['status'].colored_str(),
-            job['log_path'],
-            job.get('metadata', {}).get('git_commit', '-'),
-        ])
-    return job_table
-
-
 def dump_job_queue(user_hash: Optional[str], all_jobs: bool) -> str:
     """Get the job queue in encoded json format.
 
@@ -907,27 +964,6 @@ def load_job_queue(payload: str) -> List[Dict[str, Any]]:
     return jobs
 
 
-# TODO(zhwu): Backward compatibility for jobs submitted before #4318, remove
-# after 0.10.0.
-def _create_ray_job_submission_client():
-    """Import the ray job submission client."""
-    try:
-        import ray  # pylint: disable=import-outside-toplevel
-    except ImportError:
-        logger.error('Failed to import ray')
-        raise
-    try:
-        # pylint: disable=import-outside-toplevel
-        from ray import job_submission
-    except ImportError:
-        logger.error(
-            f'Failed to import job_submission with ray=={ray.__version__}')
-        raise
-    port = get_job_submission_port()
-    return job_submission.JobSubmissionClient(
-        address=f'http://127.0.0.1:{port}')
-
-
 def _make_ray_job_id(sky_job_id: int) -> str:
     return f'{sky_job_id}-{getpass.getuser()}'
 
@@ -947,6 +983,13 @@ def cancel_jobs_encoded_results(jobs: Optional[List[int]],
         Encoded job IDs that are actually cancelled. Caller should use
         message_utils.decode_payload() to parse.
     """
+    return message_utils.encode_payload(cancel_jobs(jobs, cancel_all,
+                                                    user_hash))
+
+
+def cancel_jobs(jobs: Optional[List[int]],
+                cancel_all: bool = False,
+                user_hash: Optional[str] = None) -> List[int]:
     job_records = []
     all_status = [JobStatus.PENDING, JobStatus.SETTING_UP, JobStatus.RUNNING]
     if jobs is None and not cancel_all:
@@ -989,18 +1032,6 @@ def cancel_jobs_encoded_results(jobs: Optional[List[int]],
                 # We don't have to start a daemon to forcefully kill the process
                 # as our job driver process will clean up the underlying
                 # child processes.
-            elif job['pid'] < 0:
-                try:
-                    # TODO(zhwu): Backward compatibility, remove after 0.10.0.
-                    # The job was submitted with ray job submit before #4318.
-                    job_client = _create_ray_job_submission_client()
-                    job_client.stop_job(_make_ray_job_id(job['job_id']))
-                except RuntimeError as e:
-                    # If the request to the job server fails, we should not
-                    # set the job to CANCELLED.
-                    if 'does not exist' not in str(e):
-                        logger.warning(str(e))
-                        continue
             # Get the job status again to avoid race condition.
             job_status = get_status_no_lock(job['job_id'])
             if job_status in [
@@ -1010,7 +1041,7 @@ def cancel_jobs_encoded_results(jobs: Optional[List[int]],
                 cancelled_ids.append(job['job_id'])
 
     scheduler.schedule_step()
-    return message_utils.encode_payload(cancelled_ids)
+    return cancelled_ids
 
 
 @init_db
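
Cancellation follows the same split: cancel_jobs() returns the plain List[int]
of job IDs that were actually cancelled, while cancel_jobs_encoded_results()
keeps the old wire format by wrapping that list in encode_payload(). A sketch,
again assuming a cluster-side job table; the job IDs are illustrative:

    from sky.skylet import job_lib

    # Returns e.g. [3, 4], or fewer if some jobs were already terminal.
    cancelled = job_lib.cancel_jobs([3, 4])
    # Legacy callers receive message_utils.encode_payload(cancelled) instead.
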
@@ -1030,6 +1061,17 @@ def get_run_timestamp(job_id: Optional[int]) -> Optional[str]:
 
 @init_db
 def get_log_dir_for_jobs(job_ids: List[Optional[str]]) -> str:
+    """Returns the relative paths to the log files for jobs with globbing,
+    encoded."""
+    job_to_dir = get_job_log_dirs(job_ids)
+    job_to_dir_str: Dict[str, str] = {}
+    for job_id, log_dir in job_to_dir.items():
+        job_to_dir_str[str(job_id)] = log_dir
+    return message_utils.encode_payload(job_to_dir_str)
+
+
+@init_db
+def get_job_log_dirs(job_ids: List[int]) -> Dict[int, str]:
     """Returns the relative paths to the log files for jobs with globbing."""
     assert _DB is not None
     query_str = ' OR '.join(['job_id GLOB (?)'] * len(job_ids))
@@ -1038,16 +1080,16 @@ def get_log_dir_for_jobs(job_ids: List[Optional[str]]) -> str:
         SELECT * FROM jobs
         WHERE {query_str}""", job_ids)
     rows = _DB.cursor.fetchall()
-    job_to_dir = {}
+    job_to_dir: Dict[int, str] = {}
     for row in rows:
         job_id = row[JobInfoLoc.JOB_ID.value]
         if row[JobInfoLoc.LOG_PATH.value]:
-            job_to_dir[str(job_id)] = row[JobInfoLoc.LOG_PATH.value]
+            job_to_dir[job_id] = row[JobInfoLoc.LOG_PATH.value]
         else:
             run_timestamp = row[JobInfoLoc.RUN_TIMESTAMP.value]
-            job_to_dir[str(job_id)] = os.path.join(constants.SKY_LOGS_DIRECTORY,
-                                                   run_timestamp)
-    return message_utils.encode_payload(job_to_dir)
+            job_to_dir[job_id] = os.path.join(constants.SKY_LOGS_DIRECTORY,
+                                              run_timestamp)
+    return job_to_dir
 
 
 class JobLibCodeGen:
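
A note on why get_log_dir_for_jobs() stringifies the keys before encoding
while the new get_job_log_dirs() keeps them as ints: the payload appears to be
JSON-based, and JSON objects only admit string keys. A self-contained
illustration of the pitfall the wrapper guards against:

    import json

    job_to_dir = {1: 'sky_logs/sky-2025-01-01', 2: 'sky_logs/sky-2025-01-02'}
    decoded = json.loads(json.dumps(job_to_dir))
    assert list(decoded.keys()) == ['1', '2']  # int keys silently became str
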
@@ -1176,15 +1218,10 @@ class JobLibCodeGen:
                f'    log_dir = None if run_timestamp is None else os.path.join({constants.SKY_LOGS_DIRECTORY!r}, run_timestamp)'
            ),
            # Add a newline to leave the if indent block above.
-           f'\ntail_log_kwargs = {{"job_id": job_id, "log_dir": log_dir, "managed_job_id": {managed_job_id!r}, "follow": {follow}}}',
-           f'{_LINUX_NEW_LINE}if getattr(constants, "SKYLET_LIB_VERSION", 1) > 1: tail_log_kwargs["tail"] = {tail}',
-           f'{_LINUX_NEW_LINE}log_lib.tail_logs(**tail_log_kwargs)',
+           f'\nlog_lib.tail_logs(job_id=job_id, log_dir=log_dir, managed_job_id={managed_job_id!r}, follow={follow}, tail={tail})',
            # After tailing, check the job status and exit with appropriate code
            'job_status = job_lib.get_status(job_id)',
-           # Backward compatibility for returning exit code: Skylet versions 2
-           # and older did not have JobExitCode, so we use 0 for those versions
-           # TODO: Remove this special handling after 0.10.0.
-           'exit_code = exceptions.JobExitCode.from_job_status(job_status) if getattr(constants, "SKYLET_LIB_VERSION", 1) > 2 else 0',
+           'exit_code = exceptions.JobExitCode.from_job_status(job_status)',
            # Fix for dashboard: When follow=False and job is still running (NOT_FINISHED=101),
            # exit with success (0) since fetching current logs is a successful operation.
            # This prevents shell wrappers from printing "command terminated with exit code 101".
sky/skylet/log_lib.py CHANGED
@@ -8,11 +8,13 @@ import functools
 import io
 import multiprocessing.pool
 import os
+import queue as queue_lib
 import shlex
 import subprocess
 import sys
 import tempfile
 import textwrap
+import threading
 import time
 from typing import (Deque, Dict, Iterable, Iterator, List, Optional, TextIO,
                     Tuple, Union)
@@ -39,6 +41,11 @@ logger = sky_logging.init_logger(__name__)
 
 LOG_FILE_START_STREAMING_AT = 'Waiting for task resources on '
 
+# 16-64KiB seems to be the sweet spot:
+# https://github.com/grpc/grpc.github.io/issues/371
+# TODO(kevin): Benchmark this ourselves and verify.
+DEFAULT_LOG_CHUNK_SIZE = 16 * 1024  # 16KiB
+
 
 class _ProcessingArgs:
     """Arguments for processing logs."""
@@ -213,7 +220,14 @@ def run_with_log(
             stdin=stdin,
             **kwargs) as proc:
         try:
-            subprocess_utils.kill_process_daemon(proc.pid)
+            if ctx is not None:
+                # When runs in coroutine, use kill_pg if available to avoid
+                # the overhead of refreshing the process tree in the daemon.
+                subprocess_utils.kill_process_daemon(proc.pid, use_kill_pg=True)
+            else:
+                # For backward compatibility, do not specify use_kill_pg by
+                # default.
+                subprocess_utils.kill_process_daemon(proc.pid)
             stdout = ''
             stderr = ''
             stdout_stream_handler = None
@@ -264,7 +278,6 @@ def run_with_log(
                 stdout, stderr = context_utils.pipe_and_wait_process(
                     ctx,
                     proc,
-                    cancel_callback=subprocess_utils.kill_children_processes,
                     stdout_stream_handler=stdout_stream_handler,
                     stderr_stream_handler=stderr_stream_handler)
             elif process_stream:
@@ -354,6 +367,17 @@ def run_bash_command_with_log(bash_command: str,
                         shell=True)
 
 
+def run_bash_command_with_log_and_return_pid(
+        bash_command: str,
+        log_path: str,
+        env_vars: Optional[Dict[str, str]] = None,
+        stream_logs: bool = False,
+        with_ray: bool = False):
+    return_code = run_bash_command_with_log(bash_command, log_path, env_vars,
+                                            stream_logs, with_ray)
+    return {'return_code': return_code, 'pid': os.getpid()}
+
+
 def _follow_job_logs(file,
                      job_id: int,
                      start_streaming: bool,
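
The new wrapper returns the exit code together with os.getpid() of the process
that executed the command, presumably so a remote caller can later target that
worker process. A hedged local sketch; the log path is illustrative:

    from sky.skylet import log_lib

    result = log_lib.run_bash_command_with_log_and_return_pid(
        'echo hello', log_path='/tmp/sky_demo_run.log')
    # result looks like {'return_code': 0, 'pid': <pid of this process>}
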
@@ -395,9 +419,9 @@ def _follow_job_logs(file,
                     wait_last_logs = False
                     continue
                 status_str = status.value if status is not None else 'None'
-                print(ux_utils.finishing_message(
-                    f'Job finished (status: {status_str}).'),
-                      flush=True)
+                finish = ux_utils.finishing_message(
+                    f'Job finished (status: {status_str}).')
+                yield finish + '\n'
                 return
 
             time.sleep(SKY_LOG_TAILING_GAP_SECONDS)
@@ -552,3 +576,207 @@ def tail_logs(job_id: Optional[int],
     except FileNotFoundError:
         print(f'{colorama.Fore.RED}ERROR: Logs for job {job_id} (status:'
               f' {status.value}) does not exist.{colorama.Style.RESET_ALL}')
+
+
+def tail_logs_iter(job_id: Optional[int],
+                   log_dir: Optional[str],
+                   managed_job_id: Optional[int] = None,
+                   follow: bool = True,
+                   tail: int = 0) -> Iterator[str]:
+    """Tail the logs of a job. This is mostly the same as tail_logs, but
+    returns an iterator instead of printing to stdout/stderr."""
+    if job_id is None:
+        # This only happens when job_lib.get_latest_job_id() returns None,
+        # which means no job has been submitted to this cluster. See
+        # sky.skylet.job_lib.JobLibCodeGen.tail_logs for more details.
+        logger.info('Skip streaming logs as no job has been submitted.')
+        return
+    job_str = f'job {job_id}'
+    if managed_job_id is not None:
+        job_str = f'managed job {managed_job_id}'
+    if log_dir is None:
+        msg = f'{job_str.capitalize()} not found (see `sky queue`).'
+        yield msg + '\n'
+        return
+    logger.debug(f'Tailing logs for job, real job_id {job_id}, managed_job_id '
+                 f'{managed_job_id}.')
+    log_path = os.path.join(log_dir, 'run.log')
+    log_path = os.path.expanduser(log_path)
+
+    status = job_lib.update_job_status([job_id], silent=True)[0]
+
+    # Wait for the log to be written. This is needed due to the `ray submit`
+    # will take some time to start the job and write the log.
+    retry_cnt = 0
+    while status is not None and not status.is_terminal():
+        retry_cnt += 1
+        if os.path.exists(log_path) and status != job_lib.JobStatus.INIT:
+            break
+        if retry_cnt >= SKY_LOG_WAITING_MAX_RETRY:
+            err = (f'{colorama.Fore.RED}ERROR: Logs for '
+                   f'{job_str} (status: {status.value}) does not exist '
+                   f'after retrying {retry_cnt} times.'
+                   f'{colorama.Style.RESET_ALL}')
+            yield err + '\n'
+            return
+        waiting = (f'INFO: Waiting {SKY_LOG_WAITING_GAP_SECONDS}s for the logs '
+                   'to be written...')
+        yield waiting + '\n'
+        time.sleep(SKY_LOG_WAITING_GAP_SECONDS)
+        status = job_lib.update_job_status([job_id], silent=True)[0]
+
+    start_stream_at = LOG_FILE_START_STREAMING_AT
+    # Explicitly declare the type to avoid mypy warning.
+    lines: Iterable[str] = []
+    if follow and status in [
+            job_lib.JobStatus.SETTING_UP,
+            job_lib.JobStatus.PENDING,
+            job_lib.JobStatus.RUNNING,
+    ]:
+        # Not using `ray job logs` because it will put progress bar in
+        # multiple lines.
+        with open(log_path, 'r', newline='', encoding='utf-8') as log_file:
+            # Using `_follow` instead of `tail -f` to streaming the whole
+            # log and creating a new process for tail.
+            start_streaming = False
+            if tail > 0:
+                head_lines_of_log_file = _peek_head_lines(log_file)
+                lines = collections.deque(log_file, maxlen=tail)
+                start_streaming = _should_stream_the_whole_tail_lines(
+                    head_lines_of_log_file, lines, start_stream_at)
+            for line in lines:
+                if start_stream_at in line:
+                    start_streaming = True
+                if start_streaming:
+                    yield line
+            # Now, the cursor is at the end of the last lines
+            # if tail > 0
+            for line in _follow_job_logs(log_file,
+                                         job_id=job_id,
+                                         start_streaming=start_streaming,
+                                         start_streaming_at=start_stream_at):
+                yield line
+    else:
+        try:
+            start_streaming = False
+            with open(log_path, 'r', encoding='utf-8') as log_file:
+                if tail > 0:
+                    # If tail > 0, we need to read the last n lines.
+                    # We use double ended queue to rotate the last n lines.
+                    head_lines_of_log_file = _peek_head_lines(log_file)
+                    lines = collections.deque(log_file, maxlen=tail)
+                    start_streaming = _should_stream_the_whole_tail_lines(
+                        head_lines_of_log_file, lines, start_stream_at)
+                else:
+                    lines = log_file
+                for line in lines:
+                    if start_stream_at in line:
+                        start_streaming = True
+                    if start_streaming:
+                        yield line
+            status_str = status.value if status is not None else 'None'
+            # Only show "Job finished" for actually terminal states
+            if status is not None and status.is_terminal():
+                finish = ux_utils.finishing_message(
+                    f'Job finished (status: {status_str}).')
+                yield finish + '\n'
+            return
+        except FileNotFoundError:
+            err = (
+                f'{colorama.Fore.RED}ERROR: Logs for job {job_id} (status:'
+                f' {status.value}) does not exist.{colorama.Style.RESET_ALL}')
+            yield err + '\n'
+
+
+class LogBuffer:
+    """In-memory buffer for chunking log lines for streaming."""
+
+    def __init__(self, max_chars: int = DEFAULT_LOG_CHUNK_SIZE):
+        """Initialize the log buffer.
+
+        Args:
+            max_chars: Maximum buffer size (in characters, not bytes) before
+                flushing. The actual amount of bytes (UTF-8 encoding)
+                could be more than this, depending on the characters,
+                i.e. ASCII characters take 1 byte, while others
+                may take 2-4 bytes. But this is fine as our default
+                chunk size is well below the default value of
+                grpc.max_receive_message_length which is 4MB.
+        """
+        self.max_chars = max_chars
+        self._buffer = io.StringIO()
+
+    def _should_flush(self) -> bool:
+        return self._buffer.tell() >= self.max_chars
+
+    def flush(self) -> str:
+        """Get the current buffered content and clear the buffer.
+
+        Returns:
+            The buffered log lines as a single string
+        """
+        if not self._buffer.tell():
+            return ''
+        chunk = self._buffer.getvalue()
+        self._buffer.truncate(0)
+        self._buffer.seek(0)
+        return chunk
+
+    def write(self, line: str) -> bool:
+        """Add a line to the buffer.
+
+        Args:
+            line: The log line to add
+
+        Returns:
+            True if buffer should be flushed after adding the line
+        """
+        self._buffer.write(line)
+        return self._should_flush()
+
+    def close(self):
+        self._buffer.close()
+
+
+def buffered_iter_with_timeout(buffer: LogBuffer, iterable: Iterable[str],
+                               timeout: float) -> Iterable[str]:
+    """Iterates over an iterable, writing each item to a buffer,
+    and flushing the buffer when it is full or no item is
+    yielded within the timeout duration."""
+    # TODO(kevin): Simplify this using asyncio.timeout, once we move
+    # the skylet event loop and gRPC server to asyncio.
+    # https://docs.python.org/3/library/asyncio-task.html#timeouts
+
+    queue: queue_lib.Queue = queue_lib.Queue()
+    sentinel = object()
+
+    def producer():
+        try:
+            for item in iterable:
+                queue.put(item)
+        finally:
+            queue.put(sentinel)
+
+    thread = threading.Thread(target=producer, daemon=True)
+    thread.start()
+
+    while True:
+        try:
+            item = queue.get(timeout=timeout)
+        except queue_lib.Empty:
+            out = buffer.flush()
+            if out:
+                yield out
+            continue
+
+        if item is sentinel:
+            thread.join()
+            out = buffer.flush()
+            if out:
+                yield out
+            return
+
+        if buffer.write(item):
+            out = buffer.flush()
+            if out:
+                yield out
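
Putting the new pieces together: a streaming handler can pump tail_logs_iter()
through a LogBuffer so that each streamed message carries roughly 16KiB of
concatenated lines, flushing early whenever the source goes quiet for the
timeout. A self-contained sketch using only the definitions added above
(assuming this wheel is installed):

    from sky.skylet import log_lib

    lines = (f'line {i}\n' for i in range(10000))
    buf = log_lib.LogBuffer()  # max_chars defaults to DEFAULT_LOG_CHUNK_SIZE
    try:
        for chunk in log_lib.buffered_iter_with_timeout(buf, lines,
                                                        timeout=0.5):
            # Each chunk batches many lines; a gRPC servicer would yield it
            # as one streaming response message here.
            assert chunk
    finally:
        buf.close()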