skypilot-nightly 1.0.0.dev20250502__py3-none-any.whl → 1.0.0.dev20251203__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (546) hide show
  1. sky/__init__.py +22 -6
  2. sky/adaptors/aws.py +81 -16
  3. sky/adaptors/common.py +25 -2
  4. sky/adaptors/coreweave.py +278 -0
  5. sky/adaptors/do.py +8 -2
  6. sky/adaptors/gcp.py +11 -0
  7. sky/adaptors/hyperbolic.py +8 -0
  8. sky/adaptors/ibm.py +5 -2
  9. sky/adaptors/kubernetes.py +149 -18
  10. sky/adaptors/nebius.py +173 -30
  11. sky/adaptors/primeintellect.py +1 -0
  12. sky/adaptors/runpod.py +68 -0
  13. sky/adaptors/seeweb.py +183 -0
  14. sky/adaptors/shadeform.py +89 -0
  15. sky/admin_policy.py +187 -4
  16. sky/authentication.py +179 -225
  17. sky/backends/__init__.py +4 -2
  18. sky/backends/backend.py +22 -9
  19. sky/backends/backend_utils.py +1323 -397
  20. sky/backends/cloud_vm_ray_backend.py +1749 -1029
  21. sky/backends/docker_utils.py +1 -1
  22. sky/backends/local_docker_backend.py +11 -6
  23. sky/backends/task_codegen.py +633 -0
  24. sky/backends/wheel_utils.py +55 -9
  25. sky/{clouds/service_catalog → catalog}/__init__.py +21 -19
  26. sky/{clouds/service_catalog → catalog}/aws_catalog.py +27 -8
  27. sky/{clouds/service_catalog → catalog}/azure_catalog.py +10 -7
  28. sky/{clouds/service_catalog → catalog}/common.py +90 -49
  29. sky/{clouds/service_catalog → catalog}/cudo_catalog.py +8 -5
  30. sky/{clouds/service_catalog → catalog}/data_fetchers/analyze.py +1 -1
  31. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_aws.py +116 -80
  32. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_cudo.py +38 -38
  33. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_gcp.py +70 -16
  34. sky/catalog/data_fetchers/fetch_hyperbolic.py +136 -0
  35. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_lambda_cloud.py +1 -0
  36. sky/catalog/data_fetchers/fetch_nebius.py +338 -0
  37. sky/catalog/data_fetchers/fetch_runpod.py +698 -0
  38. sky/catalog/data_fetchers/fetch_seeweb.py +329 -0
  39. sky/catalog/data_fetchers/fetch_shadeform.py +142 -0
  40. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_vast.py +1 -1
  41. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_vsphere.py +1 -1
  42. sky/{clouds/service_catalog → catalog}/do_catalog.py +5 -2
  43. sky/{clouds/service_catalog → catalog}/fluidstack_catalog.py +6 -3
  44. sky/{clouds/service_catalog → catalog}/gcp_catalog.py +41 -15
  45. sky/catalog/hyperbolic_catalog.py +136 -0
  46. sky/{clouds/service_catalog → catalog}/ibm_catalog.py +9 -6
  47. sky/{clouds/service_catalog → catalog}/kubernetes_catalog.py +36 -24
  48. sky/{clouds/service_catalog → catalog}/lambda_catalog.py +9 -6
  49. sky/{clouds/service_catalog → catalog}/nebius_catalog.py +9 -7
  50. sky/{clouds/service_catalog → catalog}/oci_catalog.py +9 -6
  51. sky/{clouds/service_catalog → catalog}/paperspace_catalog.py +5 -2
  52. sky/catalog/primeintellect_catalog.py +95 -0
  53. sky/{clouds/service_catalog → catalog}/runpod_catalog.py +11 -4
  54. sky/{clouds/service_catalog → catalog}/scp_catalog.py +9 -6
  55. sky/catalog/seeweb_catalog.py +184 -0
  56. sky/catalog/shadeform_catalog.py +165 -0
  57. sky/catalog/ssh_catalog.py +167 -0
  58. sky/{clouds/service_catalog → catalog}/vast_catalog.py +6 -3
  59. sky/{clouds/service_catalog → catalog}/vsphere_catalog.py +5 -2
  60. sky/check.py +533 -185
  61. sky/cli.py +5 -5975
  62. sky/client/{cli.py → cli/command.py} +2591 -1956
  63. sky/client/cli/deprecation_utils.py +99 -0
  64. sky/client/cli/flags.py +359 -0
  65. sky/client/cli/table_utils.py +322 -0
  66. sky/client/cli/utils.py +79 -0
  67. sky/client/common.py +78 -32
  68. sky/client/oauth.py +82 -0
  69. sky/client/sdk.py +1219 -319
  70. sky/client/sdk_async.py +827 -0
  71. sky/client/service_account_auth.py +47 -0
  72. sky/cloud_stores.py +82 -3
  73. sky/clouds/__init__.py +13 -0
  74. sky/clouds/aws.py +564 -164
  75. sky/clouds/azure.py +105 -83
  76. sky/clouds/cloud.py +140 -40
  77. sky/clouds/cudo.py +68 -50
  78. sky/clouds/do.py +66 -48
  79. sky/clouds/fluidstack.py +63 -44
  80. sky/clouds/gcp.py +339 -110
  81. sky/clouds/hyperbolic.py +293 -0
  82. sky/clouds/ibm.py +70 -49
  83. sky/clouds/kubernetes.py +570 -162
  84. sky/clouds/lambda_cloud.py +74 -54
  85. sky/clouds/nebius.py +210 -81
  86. sky/clouds/oci.py +88 -66
  87. sky/clouds/paperspace.py +61 -44
  88. sky/clouds/primeintellect.py +317 -0
  89. sky/clouds/runpod.py +164 -74
  90. sky/clouds/scp.py +89 -86
  91. sky/clouds/seeweb.py +477 -0
  92. sky/clouds/shadeform.py +400 -0
  93. sky/clouds/ssh.py +263 -0
  94. sky/clouds/utils/aws_utils.py +10 -4
  95. sky/clouds/utils/gcp_utils.py +87 -11
  96. sky/clouds/utils/oci_utils.py +38 -14
  97. sky/clouds/utils/scp_utils.py +231 -167
  98. sky/clouds/vast.py +99 -77
  99. sky/clouds/vsphere.py +51 -40
  100. sky/core.py +375 -173
  101. sky/dag.py +15 -0
  102. sky/dashboard/out/404.html +1 -1
  103. sky/dashboard/out/_next/static/96_E2yl3QAiIJGOYCkSpB/_buildManifest.js +1 -0
  104. sky/dashboard/out/_next/static/chunks/1141-e6aa9ab418717c59.js +11 -0
  105. sky/dashboard/out/_next/static/chunks/1272-1ef0bf0237faccdb.js +1 -0
  106. sky/dashboard/out/_next/static/chunks/1871-7e202677c42f43fe.js +6 -0
  107. sky/dashboard/out/_next/static/chunks/2260-7703229c33c5ebd5.js +1 -0
  108. sky/dashboard/out/_next/static/chunks/2350.fab69e61bac57b23.js +1 -0
  109. sky/dashboard/out/_next/static/chunks/2369.fc20f0c2c8ed9fe7.js +15 -0
  110. sky/dashboard/out/_next/static/chunks/2755.edd818326d489a1d.js +26 -0
  111. sky/dashboard/out/_next/static/chunks/3294.20a8540fe697d5ee.js +1 -0
  112. sky/dashboard/out/_next/static/chunks/3785.7e245f318f9d1121.js +1 -0
  113. sky/dashboard/out/_next/static/chunks/3800-7b45f9fbb6308557.js +1 -0
  114. sky/dashboard/out/_next/static/chunks/3850-ff4a9a69d978632b.js +1 -0
  115. sky/dashboard/out/_next/static/chunks/3937.210053269f121201.js +1 -0
  116. sky/dashboard/out/_next/static/chunks/4725.172ede95d1b21022.js +1 -0
  117. sky/dashboard/out/_next/static/chunks/4937.a2baa2df5572a276.js +15 -0
  118. sky/dashboard/out/_next/static/chunks/5739-d67458fcb1386c92.js +8 -0
  119. sky/dashboard/out/_next/static/chunks/6130-2be46d70a38f1e82.js +1 -0
  120. sky/dashboard/out/_next/static/chunks/616-3d59f75e2ccf9321.js +39 -0
  121. sky/dashboard/out/_next/static/chunks/6212-7bd06f60ba693125.js +13 -0
  122. sky/dashboard/out/_next/static/chunks/6856-8f27d1c10c98def8.js +1 -0
  123. sky/dashboard/out/_next/static/chunks/6989-01359c57e018caa4.js +1 -0
  124. sky/dashboard/out/_next/static/chunks/6990-9146207c4567fdfd.js +1 -0
  125. sky/dashboard/out/_next/static/chunks/7359-c8d04e06886000b3.js +30 -0
  126. sky/dashboard/out/_next/static/chunks/7411-b15471acd2cba716.js +41 -0
  127. sky/dashboard/out/_next/static/chunks/7615-019513abc55b3b47.js +1 -0
  128. sky/dashboard/out/_next/static/chunks/8640.5b9475a2d18c5416.js +16 -0
  129. sky/dashboard/out/_next/static/chunks/8969-452f9d5cbdd2dc73.js +1 -0
  130. sky/dashboard/out/_next/static/chunks/9025.fa408f3242e9028d.js +6 -0
  131. sky/dashboard/out/_next/static/chunks/9353-cff34f7e773b2e2b.js +1 -0
  132. sky/dashboard/out/_next/static/chunks/9360.a536cf6b1fa42355.js +31 -0
  133. sky/dashboard/out/_next/static/chunks/9847.3aaca6bb33455140.js +30 -0
  134. sky/dashboard/out/_next/static/chunks/fd9d1056-86323a29a8f7e46a.js +1 -0
  135. sky/dashboard/out/_next/static/chunks/framework-cf60a09ccd051a10.js +33 -0
  136. sky/dashboard/out/_next/static/chunks/main-app-587214043926b3cc.js +1 -0
  137. sky/dashboard/out/_next/static/chunks/main-f15ccb73239a3bf1.js +1 -0
  138. sky/dashboard/out/_next/static/chunks/pages/_app-bde01e4a2beec258.js +34 -0
  139. sky/dashboard/out/_next/static/chunks/pages/_error-c66a4e8afc46f17b.js +1 -0
  140. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-792db96d918c98c9.js +16 -0
  141. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-abfcac9c137aa543.js +1 -0
  142. sky/dashboard/out/_next/static/chunks/pages/clusters-ee39056f9851a3ff.js +1 -0
  143. sky/dashboard/out/_next/static/chunks/pages/config-dfb9bf07b13045f4.js +1 -0
  144. sky/dashboard/out/_next/static/chunks/pages/index-444f1804401f04ea.js +1 -0
  145. sky/dashboard/out/_next/static/chunks/pages/infra/[context]-c0b5935149902e6f.js +1 -0
  146. sky/dashboard/out/_next/static/chunks/pages/infra-aed0ea19df7cf961.js +1 -0
  147. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-d66997e2bfc837cf.js +16 -0
  148. sky/dashboard/out/_next/static/chunks/pages/jobs/pools/[pool]-9faf940b253e3e06.js +21 -0
  149. sky/dashboard/out/_next/static/chunks/pages/jobs-2072b48b617989c9.js +1 -0
  150. sky/dashboard/out/_next/static/chunks/pages/users-f42674164aa73423.js +1 -0
  151. sky/dashboard/out/_next/static/chunks/pages/volumes-b84b948ff357c43e.js +1 -0
  152. sky/dashboard/out/_next/static/chunks/pages/workspace/new-3f88a1c7e86a3f86.js +1 -0
  153. sky/dashboard/out/_next/static/chunks/pages/workspaces/[name]-84a40f8c7c627fe4.js +1 -0
  154. sky/dashboard/out/_next/static/chunks/pages/workspaces-531b2f8c4bf89f82.js +1 -0
  155. sky/dashboard/out/_next/static/chunks/webpack-64e05f17bf2cf8ce.js +1 -0
  156. sky/dashboard/out/_next/static/css/0748ce22df867032.css +3 -0
  157. sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
  158. sky/dashboard/out/clusters/[cluster].html +1 -1
  159. sky/dashboard/out/clusters.html +1 -1
  160. sky/dashboard/out/config.html +1 -0
  161. sky/dashboard/out/index.html +1 -1
  162. sky/dashboard/out/infra/[context].html +1 -0
  163. sky/dashboard/out/infra.html +1 -0
  164. sky/dashboard/out/jobs/[job].html +1 -1
  165. sky/dashboard/out/jobs/pools/[pool].html +1 -0
  166. sky/dashboard/out/jobs.html +1 -1
  167. sky/dashboard/out/users.html +1 -0
  168. sky/dashboard/out/volumes.html +1 -0
  169. sky/dashboard/out/workspace/new.html +1 -0
  170. sky/dashboard/out/workspaces/[name].html +1 -0
  171. sky/dashboard/out/workspaces.html +1 -0
  172. sky/data/data_utils.py +137 -1
  173. sky/data/mounting_utils.py +269 -84
  174. sky/data/storage.py +1460 -1807
  175. sky/data/storage_utils.py +43 -57
  176. sky/exceptions.py +126 -2
  177. sky/execution.py +216 -63
  178. sky/global_user_state.py +2390 -586
  179. sky/jobs/__init__.py +7 -0
  180. sky/jobs/client/sdk.py +300 -58
  181. sky/jobs/client/sdk_async.py +161 -0
  182. sky/jobs/constants.py +15 -8
  183. sky/jobs/controller.py +848 -275
  184. sky/jobs/file_content_utils.py +128 -0
  185. sky/jobs/log_gc.py +193 -0
  186. sky/jobs/recovery_strategy.py +402 -152
  187. sky/jobs/scheduler.py +314 -189
  188. sky/jobs/server/core.py +836 -255
  189. sky/jobs/server/server.py +156 -115
  190. sky/jobs/server/utils.py +136 -0
  191. sky/jobs/state.py +2109 -706
  192. sky/jobs/utils.py +1306 -215
  193. sky/logs/__init__.py +21 -0
  194. sky/logs/agent.py +108 -0
  195. sky/logs/aws.py +243 -0
  196. sky/logs/gcp.py +91 -0
  197. sky/metrics/__init__.py +0 -0
  198. sky/metrics/utils.py +453 -0
  199. sky/models.py +78 -1
  200. sky/optimizer.py +164 -70
  201. sky/provision/__init__.py +90 -4
  202. sky/provision/aws/config.py +147 -26
  203. sky/provision/aws/instance.py +136 -50
  204. sky/provision/azure/instance.py +11 -6
  205. sky/provision/common.py +13 -1
  206. sky/provision/cudo/cudo_machine_type.py +1 -1
  207. sky/provision/cudo/cudo_utils.py +14 -8
  208. sky/provision/cudo/cudo_wrapper.py +72 -71
  209. sky/provision/cudo/instance.py +10 -6
  210. sky/provision/do/instance.py +10 -6
  211. sky/provision/do/utils.py +4 -3
  212. sky/provision/docker_utils.py +140 -33
  213. sky/provision/fluidstack/instance.py +13 -8
  214. sky/provision/gcp/__init__.py +1 -0
  215. sky/provision/gcp/config.py +301 -19
  216. sky/provision/gcp/constants.py +218 -0
  217. sky/provision/gcp/instance.py +36 -8
  218. sky/provision/gcp/instance_utils.py +18 -4
  219. sky/provision/gcp/volume_utils.py +247 -0
  220. sky/provision/hyperbolic/__init__.py +12 -0
  221. sky/provision/hyperbolic/config.py +10 -0
  222. sky/provision/hyperbolic/instance.py +437 -0
  223. sky/provision/hyperbolic/utils.py +373 -0
  224. sky/provision/instance_setup.py +101 -20
  225. sky/provision/kubernetes/__init__.py +5 -0
  226. sky/provision/kubernetes/config.py +9 -52
  227. sky/provision/kubernetes/constants.py +17 -0
  228. sky/provision/kubernetes/instance.py +919 -280
  229. sky/provision/kubernetes/manifests/fusermount-server-daemonset.yaml +1 -2
  230. sky/provision/kubernetes/network.py +27 -17
  231. sky/provision/kubernetes/network_utils.py +44 -43
  232. sky/provision/kubernetes/utils.py +1221 -534
  233. sky/provision/kubernetes/volume.py +343 -0
  234. sky/provision/lambda_cloud/instance.py +22 -16
  235. sky/provision/nebius/constants.py +50 -0
  236. sky/provision/nebius/instance.py +19 -6
  237. sky/provision/nebius/utils.py +237 -137
  238. sky/provision/oci/instance.py +10 -5
  239. sky/provision/paperspace/instance.py +10 -7
  240. sky/provision/paperspace/utils.py +1 -1
  241. sky/provision/primeintellect/__init__.py +10 -0
  242. sky/provision/primeintellect/config.py +11 -0
  243. sky/provision/primeintellect/instance.py +454 -0
  244. sky/provision/primeintellect/utils.py +398 -0
  245. sky/provision/provisioner.py +117 -36
  246. sky/provision/runpod/__init__.py +5 -0
  247. sky/provision/runpod/instance.py +27 -6
  248. sky/provision/runpod/utils.py +51 -18
  249. sky/provision/runpod/volume.py +214 -0
  250. sky/provision/scp/__init__.py +15 -0
  251. sky/provision/scp/config.py +93 -0
  252. sky/provision/scp/instance.py +707 -0
  253. sky/provision/seeweb/__init__.py +11 -0
  254. sky/provision/seeweb/config.py +13 -0
  255. sky/provision/seeweb/instance.py +812 -0
  256. sky/provision/shadeform/__init__.py +11 -0
  257. sky/provision/shadeform/config.py +12 -0
  258. sky/provision/shadeform/instance.py +351 -0
  259. sky/provision/shadeform/shadeform_utils.py +83 -0
  260. sky/provision/ssh/__init__.py +18 -0
  261. sky/provision/vast/instance.py +13 -8
  262. sky/provision/vast/utils.py +10 -7
  263. sky/provision/volume.py +164 -0
  264. sky/provision/vsphere/common/ssl_helper.py +1 -1
  265. sky/provision/vsphere/common/vapiconnect.py +2 -1
  266. sky/provision/vsphere/common/vim_utils.py +4 -4
  267. sky/provision/vsphere/instance.py +15 -10
  268. sky/provision/vsphere/vsphere_utils.py +17 -20
  269. sky/py.typed +0 -0
  270. sky/resources.py +845 -119
  271. sky/schemas/__init__.py +0 -0
  272. sky/schemas/api/__init__.py +0 -0
  273. sky/schemas/api/responses.py +227 -0
  274. sky/schemas/db/README +4 -0
  275. sky/schemas/db/env.py +90 -0
  276. sky/schemas/db/global_user_state/001_initial_schema.py +124 -0
  277. sky/schemas/db/global_user_state/002_add_workspace_to_cluster_history.py +35 -0
  278. sky/schemas/db/global_user_state/003_fix_initial_revision.py +61 -0
  279. sky/schemas/db/global_user_state/004_is_managed.py +34 -0
  280. sky/schemas/db/global_user_state/005_cluster_event.py +32 -0
  281. sky/schemas/db/global_user_state/006_provision_log.py +41 -0
  282. sky/schemas/db/global_user_state/007_cluster_event_request_id.py +34 -0
  283. sky/schemas/db/global_user_state/008_skylet_ssh_tunnel_metadata.py +34 -0
  284. sky/schemas/db/global_user_state/009_last_activity_and_launched_at.py +89 -0
  285. sky/schemas/db/global_user_state/010_save_ssh_key.py +66 -0
  286. sky/schemas/db/global_user_state/011_is_ephemeral.py +34 -0
  287. sky/schemas/db/kv_cache/001_initial_schema.py +29 -0
  288. sky/schemas/db/script.py.mako +28 -0
  289. sky/schemas/db/serve_state/001_initial_schema.py +67 -0
  290. sky/schemas/db/serve_state/002_yaml_content.py +34 -0
  291. sky/schemas/db/skypilot_config/001_initial_schema.py +30 -0
  292. sky/schemas/db/spot_jobs/001_initial_schema.py +97 -0
  293. sky/schemas/db/spot_jobs/002_cluster_pool.py +42 -0
  294. sky/schemas/db/spot_jobs/003_pool_hash.py +34 -0
  295. sky/schemas/db/spot_jobs/004_job_file_contents.py +42 -0
  296. sky/schemas/db/spot_jobs/005_logs_gc.py +38 -0
  297. sky/schemas/db/spot_jobs/006_controller_pid_started_at.py +34 -0
  298. sky/schemas/db/spot_jobs/007_config_file_content.py +34 -0
  299. sky/schemas/generated/__init__.py +0 -0
  300. sky/schemas/generated/autostopv1_pb2.py +36 -0
  301. sky/schemas/generated/autostopv1_pb2.pyi +43 -0
  302. sky/schemas/generated/autostopv1_pb2_grpc.py +146 -0
  303. sky/schemas/generated/jobsv1_pb2.py +86 -0
  304. sky/schemas/generated/jobsv1_pb2.pyi +254 -0
  305. sky/schemas/generated/jobsv1_pb2_grpc.py +542 -0
  306. sky/schemas/generated/managed_jobsv1_pb2.py +76 -0
  307. sky/schemas/generated/managed_jobsv1_pb2.pyi +278 -0
  308. sky/schemas/generated/managed_jobsv1_pb2_grpc.py +278 -0
  309. sky/schemas/generated/servev1_pb2.py +58 -0
  310. sky/schemas/generated/servev1_pb2.pyi +115 -0
  311. sky/schemas/generated/servev1_pb2_grpc.py +322 -0
  312. sky/serve/autoscalers.py +357 -5
  313. sky/serve/client/impl.py +310 -0
  314. sky/serve/client/sdk.py +47 -139
  315. sky/serve/client/sdk_async.py +130 -0
  316. sky/serve/constants.py +12 -9
  317. sky/serve/controller.py +68 -17
  318. sky/serve/load_balancer.py +106 -60
  319. sky/serve/load_balancing_policies.py +116 -2
  320. sky/serve/replica_managers.py +434 -249
  321. sky/serve/serve_rpc_utils.py +179 -0
  322. sky/serve/serve_state.py +569 -257
  323. sky/serve/serve_utils.py +775 -265
  324. sky/serve/server/core.py +66 -711
  325. sky/serve/server/impl.py +1093 -0
  326. sky/serve/server/server.py +21 -18
  327. sky/serve/service.py +192 -89
  328. sky/serve/service_spec.py +144 -20
  329. sky/serve/spot_placer.py +3 -0
  330. sky/server/auth/__init__.py +0 -0
  331. sky/server/auth/authn.py +50 -0
  332. sky/server/auth/loopback.py +38 -0
  333. sky/server/auth/oauth2_proxy.py +202 -0
  334. sky/server/common.py +478 -182
  335. sky/server/config.py +85 -23
  336. sky/server/constants.py +44 -6
  337. sky/server/daemons.py +295 -0
  338. sky/server/html/token_page.html +185 -0
  339. sky/server/metrics.py +160 -0
  340. sky/server/middleware_utils.py +166 -0
  341. sky/server/requests/executor.py +558 -138
  342. sky/server/requests/payloads.py +364 -24
  343. sky/server/requests/preconditions.py +21 -17
  344. sky/server/requests/process.py +112 -29
  345. sky/server/requests/request_names.py +121 -0
  346. sky/server/requests/requests.py +822 -226
  347. sky/server/requests/serializers/decoders.py +82 -31
  348. sky/server/requests/serializers/encoders.py +140 -22
  349. sky/server/requests/threads.py +117 -0
  350. sky/server/rest.py +455 -0
  351. sky/server/server.py +1309 -285
  352. sky/server/state.py +20 -0
  353. sky/server/stream_utils.py +327 -61
  354. sky/server/uvicorn.py +217 -3
  355. sky/server/versions.py +270 -0
  356. sky/setup_files/MANIFEST.in +11 -1
  357. sky/setup_files/alembic.ini +160 -0
  358. sky/setup_files/dependencies.py +139 -31
  359. sky/setup_files/setup.py +44 -42
  360. sky/sky_logging.py +114 -7
  361. sky/skylet/attempt_skylet.py +106 -24
  362. sky/skylet/autostop_lib.py +129 -8
  363. sky/skylet/configs.py +29 -20
  364. sky/skylet/constants.py +216 -25
  365. sky/skylet/events.py +101 -21
  366. sky/skylet/job_lib.py +345 -164
  367. sky/skylet/log_lib.py +297 -18
  368. sky/skylet/log_lib.pyi +44 -1
  369. sky/skylet/providers/ibm/node_provider.py +12 -8
  370. sky/skylet/providers/ibm/vpc_provider.py +13 -12
  371. sky/skylet/ray_patches/__init__.py +17 -3
  372. sky/skylet/ray_patches/autoscaler.py.diff +18 -0
  373. sky/skylet/ray_patches/cli.py.diff +19 -0
  374. sky/skylet/ray_patches/command_runner.py.diff +17 -0
  375. sky/skylet/ray_patches/log_monitor.py.diff +20 -0
  376. sky/skylet/ray_patches/resource_demand_scheduler.py.diff +32 -0
  377. sky/skylet/ray_patches/updater.py.diff +18 -0
  378. sky/skylet/ray_patches/worker.py.diff +41 -0
  379. sky/skylet/runtime_utils.py +21 -0
  380. sky/skylet/services.py +568 -0
  381. sky/skylet/skylet.py +72 -4
  382. sky/skylet/subprocess_daemon.py +104 -29
  383. sky/skypilot_config.py +506 -99
  384. sky/ssh_node_pools/__init__.py +1 -0
  385. sky/ssh_node_pools/core.py +135 -0
  386. sky/ssh_node_pools/server.py +233 -0
  387. sky/task.py +685 -163
  388. sky/templates/aws-ray.yml.j2 +11 -3
  389. sky/templates/azure-ray.yml.j2 +2 -1
  390. sky/templates/cudo-ray.yml.j2 +1 -0
  391. sky/templates/do-ray.yml.j2 +2 -1
  392. sky/templates/fluidstack-ray.yml.j2 +1 -0
  393. sky/templates/gcp-ray.yml.j2 +62 -1
  394. sky/templates/hyperbolic-ray.yml.j2 +68 -0
  395. sky/templates/ibm-ray.yml.j2 +2 -1
  396. sky/templates/jobs-controller.yaml.j2 +27 -24
  397. sky/templates/kubernetes-loadbalancer.yml.j2 +2 -0
  398. sky/templates/kubernetes-ray.yml.j2 +611 -50
  399. sky/templates/lambda-ray.yml.j2 +2 -1
  400. sky/templates/nebius-ray.yml.j2 +34 -12
  401. sky/templates/oci-ray.yml.j2 +1 -0
  402. sky/templates/paperspace-ray.yml.j2 +2 -1
  403. sky/templates/primeintellect-ray.yml.j2 +72 -0
  404. sky/templates/runpod-ray.yml.j2 +10 -1
  405. sky/templates/scp-ray.yml.j2 +4 -50
  406. sky/templates/seeweb-ray.yml.j2 +171 -0
  407. sky/templates/shadeform-ray.yml.j2 +73 -0
  408. sky/templates/sky-serve-controller.yaml.j2 +22 -2
  409. sky/templates/vast-ray.yml.j2 +1 -0
  410. sky/templates/vsphere-ray.yml.j2 +1 -0
  411. sky/templates/websocket_proxy.py +212 -37
  412. sky/usage/usage_lib.py +31 -15
  413. sky/users/__init__.py +0 -0
  414. sky/users/model.conf +15 -0
  415. sky/users/permission.py +397 -0
  416. sky/users/rbac.py +121 -0
  417. sky/users/server.py +720 -0
  418. sky/users/token_service.py +218 -0
  419. sky/utils/accelerator_registry.py +35 -5
  420. sky/utils/admin_policy_utils.py +84 -38
  421. sky/utils/annotations.py +38 -5
  422. sky/utils/asyncio_utils.py +78 -0
  423. sky/utils/atomic.py +1 -1
  424. sky/utils/auth_utils.py +153 -0
  425. sky/utils/benchmark_utils.py +60 -0
  426. sky/utils/cli_utils/status_utils.py +159 -86
  427. sky/utils/cluster_utils.py +31 -9
  428. sky/utils/command_runner.py +354 -68
  429. sky/utils/command_runner.pyi +93 -3
  430. sky/utils/common.py +35 -8
  431. sky/utils/common_utils.py +314 -91
  432. sky/utils/config_utils.py +74 -5
  433. sky/utils/context.py +403 -0
  434. sky/utils/context_utils.py +242 -0
  435. sky/utils/controller_utils.py +383 -89
  436. sky/utils/dag_utils.py +31 -12
  437. sky/utils/db/__init__.py +0 -0
  438. sky/utils/db/db_utils.py +485 -0
  439. sky/utils/db/kv_cache.py +149 -0
  440. sky/utils/db/migration_utils.py +137 -0
  441. sky/utils/directory_utils.py +12 -0
  442. sky/utils/env_options.py +13 -0
  443. sky/utils/git.py +567 -0
  444. sky/utils/git_clone.sh +460 -0
  445. sky/utils/infra_utils.py +195 -0
  446. sky/utils/kubernetes/cleanup-tunnel.sh +62 -0
  447. sky/utils/kubernetes/config_map_utils.py +133 -0
  448. sky/utils/kubernetes/create_cluster.sh +15 -29
  449. sky/utils/kubernetes/delete_cluster.sh +10 -7
  450. sky/utils/kubernetes/deploy_ssh_node_pools.py +1177 -0
  451. sky/utils/kubernetes/exec_kubeconfig_converter.py +22 -31
  452. sky/utils/kubernetes/generate_kind_config.py +6 -66
  453. sky/utils/kubernetes/generate_kubeconfig.sh +4 -1
  454. sky/utils/kubernetes/gpu_labeler.py +18 -8
  455. sky/utils/kubernetes/k8s_gpu_labeler_job.yaml +2 -1
  456. sky/utils/kubernetes/k8s_gpu_labeler_setup.yaml +16 -16
  457. sky/utils/kubernetes/kubernetes_deploy_utils.py +284 -114
  458. sky/utils/kubernetes/rsync_helper.sh +11 -3
  459. sky/utils/kubernetes/ssh-tunnel.sh +379 -0
  460. sky/utils/kubernetes/ssh_utils.py +221 -0
  461. sky/utils/kubernetes_enums.py +8 -15
  462. sky/utils/lock_events.py +94 -0
  463. sky/utils/locks.py +416 -0
  464. sky/utils/log_utils.py +82 -107
  465. sky/utils/perf_utils.py +22 -0
  466. sky/utils/resource_checker.py +298 -0
  467. sky/utils/resources_utils.py +249 -32
  468. sky/utils/rich_utils.py +217 -39
  469. sky/utils/schemas.py +955 -160
  470. sky/utils/serialize_utils.py +16 -0
  471. sky/utils/status_lib.py +10 -0
  472. sky/utils/subprocess_utils.py +29 -15
  473. sky/utils/tempstore.py +70 -0
  474. sky/utils/thread_utils.py +91 -0
  475. sky/utils/timeline.py +26 -53
  476. sky/utils/ux_utils.py +84 -15
  477. sky/utils/validator.py +11 -1
  478. sky/utils/volume.py +165 -0
  479. sky/utils/yaml_utils.py +111 -0
  480. sky/volumes/__init__.py +13 -0
  481. sky/volumes/client/__init__.py +0 -0
  482. sky/volumes/client/sdk.py +150 -0
  483. sky/volumes/server/__init__.py +0 -0
  484. sky/volumes/server/core.py +270 -0
  485. sky/volumes/server/server.py +124 -0
  486. sky/volumes/volume.py +215 -0
  487. sky/workspaces/__init__.py +0 -0
  488. sky/workspaces/core.py +655 -0
  489. sky/workspaces/server.py +101 -0
  490. sky/workspaces/utils.py +56 -0
  491. sky_templates/README.md +3 -0
  492. sky_templates/__init__.py +3 -0
  493. sky_templates/ray/__init__.py +0 -0
  494. sky_templates/ray/start_cluster +183 -0
  495. sky_templates/ray/stop_cluster +75 -0
  496. skypilot_nightly-1.0.0.dev20251203.dist-info/METADATA +676 -0
  497. skypilot_nightly-1.0.0.dev20251203.dist-info/RECORD +611 -0
  498. {skypilot_nightly-1.0.0.dev20250502.dist-info → skypilot_nightly-1.0.0.dev20251203.dist-info}/WHEEL +1 -1
  499. skypilot_nightly-1.0.0.dev20251203.dist-info/top_level.txt +2 -0
  500. sky/benchmark/benchmark_state.py +0 -256
  501. sky/benchmark/benchmark_utils.py +0 -641
  502. sky/clouds/service_catalog/constants.py +0 -7
  503. sky/dashboard/out/_next/static/GWvVBSCS7FmUiVmjaL1a7/_buildManifest.js +0 -1
  504. sky/dashboard/out/_next/static/chunks/236-2db3ee3fba33dd9e.js +0 -6
  505. sky/dashboard/out/_next/static/chunks/312-c3c8845990db8ffc.js +0 -15
  506. sky/dashboard/out/_next/static/chunks/37-0a572fe0dbb89c4d.js +0 -6
  507. sky/dashboard/out/_next/static/chunks/678-206dddca808e6d16.js +0 -59
  508. sky/dashboard/out/_next/static/chunks/845-9e60713e0c441abc.js +0 -1
  509. sky/dashboard/out/_next/static/chunks/979-7bf73a4c7cea0f5c.js +0 -1
  510. sky/dashboard/out/_next/static/chunks/fd9d1056-2821b0f0cabcd8bd.js +0 -1
  511. sky/dashboard/out/_next/static/chunks/framework-87d061ee6ed71b28.js +0 -33
  512. sky/dashboard/out/_next/static/chunks/main-app-241eb28595532291.js +0 -1
  513. sky/dashboard/out/_next/static/chunks/main-e0e2335212e72357.js +0 -1
  514. sky/dashboard/out/_next/static/chunks/pages/_app-e6b013bc3f77ad60.js +0 -1
  515. sky/dashboard/out/_next/static/chunks/pages/_error-1be831200e60c5c0.js +0 -1
  516. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-6ac338bc2239cb45.js +0 -1
  517. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-f383db7389368ea7.js +0 -1
  518. sky/dashboard/out/_next/static/chunks/pages/clusters-a93b93e10b8b074e.js +0 -1
  519. sky/dashboard/out/_next/static/chunks/pages/index-f9f039532ca8cbc4.js +0 -1
  520. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-1c519e1afc523dc9.js +0 -1
  521. sky/dashboard/out/_next/static/chunks/pages/jobs-a75029b67aab6a2e.js +0 -1
  522. sky/dashboard/out/_next/static/chunks/webpack-830f59b8404e96b8.js +0 -1
  523. sky/dashboard/out/_next/static/css/c6933bbb2ce7f4dd.css +0 -3
  524. sky/jobs/dashboard/dashboard.py +0 -223
  525. sky/jobs/dashboard/static/favicon.ico +0 -0
  526. sky/jobs/dashboard/templates/index.html +0 -831
  527. sky/jobs/server/dashboard_utils.py +0 -69
  528. sky/skylet/providers/scp/__init__.py +0 -2
  529. sky/skylet/providers/scp/config.py +0 -149
  530. sky/skylet/providers/scp/node_provider.py +0 -578
  531. sky/templates/kubernetes-ssh-jump.yml.j2 +0 -94
  532. sky/utils/db_utils.py +0 -100
  533. sky/utils/kubernetes/deploy_remote_cluster.sh +0 -308
  534. sky/utils/kubernetes/ssh_jump_lifecycle_manager.py +0 -191
  535. skypilot_nightly-1.0.0.dev20250502.dist-info/METADATA +0 -361
  536. skypilot_nightly-1.0.0.dev20250502.dist-info/RECORD +0 -396
  537. skypilot_nightly-1.0.0.dev20250502.dist-info/top_level.txt +0 -1
  538. /sky/{clouds/service_catalog → catalog}/config.py +0 -0
  539. /sky/{benchmark → catalog/data_fetchers}/__init__.py +0 -0
  540. /sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_azure.py +0 -0
  541. /sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_fluidstack.py +0 -0
  542. /sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_ibm.py +0 -0
  543. /sky/{clouds/service_catalog/data_fetchers → client/cli}/__init__.py +0 -0
  544. /sky/dashboard/out/_next/static/{GWvVBSCS7FmUiVmjaL1a7 → 96_E2yl3QAiIJGOYCkSpB}/_ssgManifest.js +0 -0
  545. {skypilot_nightly-1.0.0.dev20250502.dist-info → skypilot_nightly-1.0.0.dev20251203.dist-info}/entry_points.txt +0 -0
  546. {skypilot_nightly-1.0.0.dev20250502.dist-info → skypilot_nightly-1.0.0.dev20251203.dist-info}/licenses/LICENSE +0 -0
sky/skylet/job_lib.py CHANGED
@@ -3,6 +3,7 @@
3
3
  This is a remote utility module that provides job queue functionality.
4
4
  """
5
5
  import enum
6
+ import functools
6
7
  import getpass
7
8
  import json
8
9
  import os
@@ -10,9 +11,10 @@ import pathlib
10
11
  import shlex
11
12
  import signal
12
13
  import sqlite3
14
+ import threading
13
15
  import time
14
16
  import typing
15
- from typing import Any, Dict, List, Optional, Sequence
17
+ from typing import Any, Dict, List, Optional, Sequence, Tuple
16
18
 
17
19
  import colorama
18
20
  import filelock
@@ -21,20 +23,22 @@ from sky import global_user_state
21
23
  from sky import sky_logging
22
24
  from sky.adaptors import common as adaptors_common
23
25
  from sky.skylet import constants
26
+ from sky.skylet import runtime_utils
24
27
  from sky.utils import common_utils
25
- from sky.utils import db_utils
26
- from sky.utils import log_utils
27
28
  from sky.utils import message_utils
28
29
  from sky.utils import subprocess_utils
30
+ from sky.utils.db import db_utils
29
31
 
30
32
  if typing.TYPE_CHECKING:
31
33
  import psutil
34
+
35
+ from sky.schemas.generated import jobsv1_pb2
32
36
  else:
33
37
  psutil = adaptors_common.LazyImport('psutil')
38
+ jobsv1_pb2 = adaptors_common.LazyImport('sky.schemas.generated.jobsv1_pb2')
34
39
 
35
40
  logger = sky_logging.init_logger(__name__)
36
41
 
37
- _LINUX_NEW_LINE = '\n'
38
42
  _JOB_STATUS_LOCK = '~/.sky/locks/.job_{}.lock'
39
43
  # JOB_CMD_IDENTIFIER is used for identifying the process retrieved
40
44
  # with pid is the same driver process to guard against the case where
@@ -60,10 +64,8 @@ class JobInfoLoc(enum.IntEnum):
60
64
  END_AT = 7
61
65
  RESOURCES = 8
62
66
  PID = 9
63
-
64
-
65
- _DB_PATH = os.path.expanduser('~/.sky/jobs.db')
66
- os.makedirs(pathlib.Path(_DB_PATH).parents[0], exist_ok=True)
67
+ LOG_PATH = 10
68
+ METADATA = 11
67
69
 
68
70
 
69
71
  def create_table(cursor, conn):
@@ -82,13 +84,9 @@ def create_table(cursor, conn):
82
84
  # is not critical and is likely to be enabled by other processes.
83
85
 
84
86
  # Pid column is used for keeping track of the driver process of a job. It
85
- # can be in three states:
86
- # -1: The job was submitted with SkyPilot older than #4318, where we use
87
- # ray job submit to submit the job, i.e. no pid is recorded. This is for
88
- # backward compatibility and should be removed after 0.10.0.
87
+ # can be in two states:
89
88
  # 0: The job driver process has never been started. When adding a job with
90
- # INIT state, the pid will be set to 0 (the default -1 value is just for
91
- # backward compatibility).
89
+ # INIT state, the pid will be set to 0.
92
90
  # >=0: The job has been started. The pid is the driver process's pid.
93
91
  # The driver can be actually running or finished.
94
92
  # TODO(SKY-1213): username is actually user hash, should rename.
@@ -103,7 +101,9 @@ def create_table(cursor, conn):
103
101
  start_at FLOAT DEFAULT -1,
104
102
  end_at FLOAT DEFAULT NULL,
105
103
  resources TEXT DEFAULT NULL,
106
- pid INTEGER DEFAULT -1)""")
104
+ pid INTEGER DEFAULT -1,
105
+ log_dir TEXT DEFAULT NULL,
106
+ metadata TEXT DEFAULT '{}')""")
107
107
 
108
108
  cursor.execute("""CREATE TABLE IF NOT EXISTS pending_jobs(
109
109
  job_id INTEGER,
@@ -116,12 +116,38 @@ def create_table(cursor, conn):
116
116
  db_utils.add_column_to_table(cursor, conn, 'jobs', 'resources', 'TEXT')
117
117
  db_utils.add_column_to_table(cursor, conn, 'jobs', 'pid',
118
118
  'INTEGER DEFAULT -1')
119
+ db_utils.add_column_to_table(cursor, conn, 'jobs', 'log_dir',
120
+ 'TEXT DEFAULT NULL')
121
+ db_utils.add_column_to_table(cursor,
122
+ conn,
123
+ 'jobs',
124
+ 'metadata',
125
+ 'TEXT DEFAULT \'{}\'',
126
+ value_to_replace_existing_entries='{}')
119
127
  conn.commit()
120
128
 
121
129
 
122
- _DB = db_utils.SQLiteConn(_DB_PATH, create_table)
123
- _CURSOR = _DB.cursor
124
- _CONN = _DB.conn
130
+ _DB = None
131
+ _db_init_lock = threading.Lock()
132
+
133
+
134
+ def init_db(func):
135
+ """Initialize the database."""
136
+
137
+ @functools.wraps(func)
138
+ def wrapper(*args, **kwargs):
139
+ global _DB
140
+ if _DB is not None:
141
+ return func(*args, **kwargs)
142
+
143
+ with _db_init_lock:
144
+ if _DB is None:
145
+ db_path = runtime_utils.get_runtime_dir_path('.sky/jobs.db')
146
+ os.makedirs(pathlib.Path(db_path).parents[0], exist_ok=True)
147
+ _DB = db_utils.SQLiteConn(db_path, create_table)
148
+ return func(*args, **kwargs)
149
+
150
+ return wrapper
125
151
 
126
152
 
127
153
  class JobStatus(enum.Enum):
@@ -192,6 +218,45 @@ class JobStatus(enum.Enum):
192
218
  color = _JOB_STATUS_TO_COLOR[self]
193
219
  return f'{color}{self.value}{colorama.Style.RESET_ALL}'
194
220
 
221
+ @classmethod
222
+ def from_protobuf(
223
+ cls,
224
+ protobuf_value: 'jobsv1_pb2.JobStatus') -> Optional['JobStatus']:
225
+ """Convert protobuf JobStatus enum to Python enum value."""
226
+ protobuf_to_enum = {
227
+ jobsv1_pb2.JOB_STATUS_INIT: cls.INIT,
228
+ jobsv1_pb2.JOB_STATUS_PENDING: cls.PENDING,
229
+ jobsv1_pb2.JOB_STATUS_SETTING_UP: cls.SETTING_UP,
230
+ jobsv1_pb2.JOB_STATUS_RUNNING: cls.RUNNING,
231
+ jobsv1_pb2.JOB_STATUS_FAILED_DRIVER: cls.FAILED_DRIVER,
232
+ jobsv1_pb2.JOB_STATUS_SUCCEEDED: cls.SUCCEEDED,
233
+ jobsv1_pb2.JOB_STATUS_FAILED: cls.FAILED,
234
+ jobsv1_pb2.JOB_STATUS_FAILED_SETUP: cls.FAILED_SETUP,
235
+ jobsv1_pb2.JOB_STATUS_CANCELLED: cls.CANCELLED,
236
+ jobsv1_pb2.JOB_STATUS_UNSPECIFIED: None,
237
+ }
238
+ if protobuf_value not in protobuf_to_enum:
239
+ raise ValueError(
240
+ f'Unknown protobuf JobStatus value: {protobuf_value}')
241
+ return protobuf_to_enum[protobuf_value]
242
+
243
+ def to_protobuf(self) -> 'jobsv1_pb2.JobStatus':
244
+ """Convert this Python enum value to protobuf enum value."""
245
+ enum_to_protobuf = {
246
+ JobStatus.INIT: jobsv1_pb2.JOB_STATUS_INIT,
247
+ JobStatus.PENDING: jobsv1_pb2.JOB_STATUS_PENDING,
248
+ JobStatus.SETTING_UP: jobsv1_pb2.JOB_STATUS_SETTING_UP,
249
+ JobStatus.RUNNING: jobsv1_pb2.JOB_STATUS_RUNNING,
250
+ JobStatus.FAILED_DRIVER: jobsv1_pb2.JOB_STATUS_FAILED_DRIVER,
251
+ JobStatus.SUCCEEDED: jobsv1_pb2.JOB_STATUS_SUCCEEDED,
252
+ JobStatus.FAILED: jobsv1_pb2.JOB_STATUS_FAILED,
253
+ JobStatus.FAILED_SETUP: jobsv1_pb2.JOB_STATUS_FAILED_SETUP,
254
+ JobStatus.CANCELLED: jobsv1_pb2.JOB_STATUS_CANCELLED,
255
+ }
256
+ if self not in enum_to_protobuf:
257
+ raise ValueError(f'Unknown JobStatus value: {self}')
258
+ return enum_to_protobuf[self]
259
+
195
260
 
196
261
  # We have two steps for job submissions:
197
262
  # 1. Client reserves a job id from the job table by adding an INIT state job.
@@ -210,30 +275,33 @@ _PRE_RESOURCE_STATUSES = [JobStatus.PENDING]
210
275
  class JobScheduler:
211
276
  """Base class for job scheduler"""
212
277
 
278
+ @init_db
213
279
  def queue(self, job_id: int, cmd: str) -> None:
214
- _CURSOR.execute('INSERT INTO pending_jobs VALUES (?,?,?,?)',
215
- (job_id, cmd, 0, int(time.time())))
216
- _CONN.commit()
280
+ assert _DB is not None
281
+ _DB.cursor.execute('INSERT INTO pending_jobs VALUES (?,?,?,?)',
282
+ (job_id, cmd, 0, int(time.time())))
283
+ _DB.conn.commit()
217
284
  set_status(job_id, JobStatus.PENDING)
218
285
  self.schedule_step()
219
286
 
287
+ @init_db
220
288
  def remove_job_no_lock(self, job_id: int) -> None:
221
- _CURSOR.execute(f'DELETE FROM pending_jobs WHERE job_id={job_id!r}')
222
- _CONN.commit()
289
+ assert _DB is not None
290
+ _DB.cursor.execute(f'DELETE FROM pending_jobs WHERE job_id={job_id!r}')
291
+ _DB.conn.commit()
223
292
 
293
+ @init_db
224
294
  def _run_job(self, job_id: int, run_cmd: str):
225
- _CURSOR.execute((f'UPDATE pending_jobs SET submit={int(time.time())} '
226
- f'WHERE job_id={job_id!r}'))
227
- _CONN.commit()
295
+ assert _DB is not None
296
+ _DB.cursor.execute(
297
+ (f'UPDATE pending_jobs SET submit={int(time.time())} '
298
+ f'WHERE job_id={job_id!r}'))
299
+ _DB.conn.commit()
228
300
  pid = subprocess_utils.launch_new_process_tree(run_cmd)
229
- # TODO(zhwu): Backward compatibility, remove this check after 0.10.0.
230
- # This is for the case where the job is submitted with SkyPilot older
231
- # than #4318, using ray job submit.
232
- if 'job submit' in run_cmd:
233
- pid = -1
234
- _CURSOR.execute((f'UPDATE jobs SET pid={pid} '
235
- f'WHERE job_id={job_id!r}'))
236
- _CONN.commit()
301
+
302
+ _DB.cursor.execute((f'UPDATE jobs SET pid={pid} '
303
+ f'WHERE job_id={job_id!r}'))
304
+ _DB.conn.commit()
237
305
 
238
306
  def schedule_step(self, force_update_jobs: bool = False) -> None:
239
307
  if force_update_jobs:
@@ -282,8 +350,10 @@ class JobScheduler:
282
350
  class FIFOScheduler(JobScheduler):
283
351
  """First in first out job scheduler"""
284
352
 
353
+ @init_db
285
354
  def _get_pending_job_ids(self) -> List[int]:
286
- rows = _CURSOR.execute(
355
+ assert _DB is not None
356
+ rows = _DB.cursor.execute(
287
357
  'SELECT job_id FROM pending_jobs ORDER BY job_id').fetchall()
288
358
  return [row[0] for row in rows]
289
359
 
@@ -308,26 +378,67 @@ def make_job_command_with_user_switching(username: str,
308
378
  return ['sudo', '-H', 'su', '--login', username, '-c', command]
309
379
 
310
380
 
311
- def add_job(job_name: str, username: str, run_timestamp: str,
312
- resources_str: str) -> int:
381
+ @init_db
382
+ def add_job(job_name: str,
383
+ username: str,
384
+ run_timestamp: str,
385
+ resources_str: str,
386
+ metadata: str = '{}') -> Tuple[int, str]:
313
387
  """Atomically reserve the next available job id for the user."""
388
+ assert _DB is not None
314
389
  job_submitted_at = time.time()
315
390
  # job_id will autoincrement with the null value
316
- _CURSOR.execute(
317
- 'INSERT INTO jobs VALUES (null, ?, ?, ?, ?, ?, ?, null, ?, 0)',
391
+ _DB.cursor.execute(
392
+ 'INSERT INTO jobs VALUES (null, ?, ?, ?, ?, ?, ?, null, ?, 0, null, ?)',
318
393
  (job_name, username, job_submitted_at, JobStatus.INIT.value,
319
- run_timestamp, None, resources_str))
320
- _CONN.commit()
321
- rows = _CURSOR.execute('SELECT job_id FROM jobs WHERE run_timestamp=(?)',
322
- (run_timestamp,))
394
+ run_timestamp, None, resources_str, metadata))
395
+ _DB.conn.commit()
396
+ rows = _DB.cursor.execute('SELECT job_id FROM jobs WHERE run_timestamp=(?)',
397
+ (run_timestamp,))
323
398
  for row in rows:
324
399
  job_id = row[0]
325
400
  assert job_id is not None
326
- return job_id
401
+ log_dir = os.path.join(constants.SKY_LOGS_DIRECTORY, f'{job_id}-{job_name}')
402
+ set_log_dir_no_lock(job_id, log_dir)
403
+ return job_id, log_dir
327
404
 
328
405
 
406
+ @init_db
407
+ def set_log_dir_no_lock(job_id: int, log_dir: str) -> None:
408
+ """Set the log directory for the job.
409
+
410
+ We persist the log directory for the job to allow changing the log directory
411
+ generation logic over versions.
412
+
413
+ Args:
414
+ job_id: The ID of the job.
415
+ log_dir: The log directory for the job.
416
+ """
417
+ assert _DB is not None
418
+ _DB.cursor.execute('UPDATE jobs SET log_dir=(?) WHERE job_id=(?)',
419
+ (log_dir, job_id))
420
+ _DB.conn.commit()
421
+
422
+
423
+ @init_db
424
+ def get_log_dir_for_job(job_id: int) -> Optional[str]:
425
+ """Get the log directory for the job.
426
+
427
+ Args:
428
+ job_id: The ID of the job.
429
+ """
430
+ assert _DB is not None
431
+ rows = _DB.cursor.execute('SELECT log_dir FROM jobs WHERE job_id=(?)',
432
+ (job_id,))
433
+ for row in rows:
434
+ return row[0]
435
+ return None
436
+
437
+
438
+ @init_db
329
439
  def _set_status_no_lock(job_id: int, status: JobStatus) -> None:
330
440
  """Setting the status of the job in the database."""
441
+ assert _DB is not None
331
442
  assert status != JobStatus.RUNNING, (
332
443
  'Please use set_job_started() to set job status to RUNNING')
333
444
  if status.is_terminal():
@@ -339,15 +450,15 @@ def _set_status_no_lock(job_id: int, status: JobStatus) -> None:
339
450
  check_end_at_str = ' AND end_at IS NULL'
340
451
  if status != JobStatus.FAILED_SETUP:
341
452
  check_end_at_str = ''
342
- _CURSOR.execute(
453
+ _DB.cursor.execute(
343
454
  'UPDATE jobs SET status=(?), end_at=(?) '
344
455
  f'WHERE job_id=(?) {check_end_at_str}',
345
456
  (status.value, end_at, job_id))
346
457
  else:
347
- _CURSOR.execute(
458
+ _DB.cursor.execute(
348
459
  'UPDATE jobs SET status=(?), end_at=NULL '
349
460
  'WHERE job_id=(?)', (status.value, job_id))
350
- _CONN.commit()
461
+ _DB.conn.commit()
351
462
 
352
463
 
353
464
  def set_status(job_id: int, status: JobStatus) -> None:
@@ -357,16 +468,19 @@ def set_status(job_id: int, status: JobStatus) -> None:
357
468
  _set_status_no_lock(job_id, status)
358
469
 
359
470
 
471
+ @init_db
360
472
  def set_job_started(job_id: int) -> None:
361
473
  # TODO(mraheja): remove pylint disabling when filelock version updated.
362
474
  # pylint: disable=abstract-class-instantiated
475
+ assert _DB is not None
363
476
  with filelock.FileLock(_get_lock_path(job_id)):
364
- _CURSOR.execute(
477
+ _DB.cursor.execute(
365
478
  'UPDATE jobs SET status=(?), start_at=(?), end_at=NULL '
366
479
  'WHERE job_id=(?)', (JobStatus.RUNNING.value, time.time(), job_id))
367
- _CONN.commit()
480
+ _DB.conn.commit()
368
481
 
369
482
 
483
+ @init_db
370
484
  def get_status_no_lock(job_id: int) -> Optional[JobStatus]:
371
485
  """Get the status of the job with the given id.
372
486
 
@@ -375,8 +489,9 @@ def get_status_no_lock(job_id: int) -> Optional[JobStatus]:
375
489
  the status in a while loop as in `log_lib._follow_job_logs`. Otherwise, use
376
490
  `get_status`.
377
491
  """
378
- rows = _CURSOR.execute('SELECT status FROM jobs WHERE job_id=(?)',
379
- (job_id,))
492
+ assert _DB is not None
493
+ rows = _DB.cursor.execute('SELECT status FROM jobs WHERE job_id=(?)',
494
+ (job_id,))
380
495
  for (status,) in rows:
381
496
  if status is None:
382
497
  return None
@@ -391,17 +506,65 @@ def get_status(job_id: int) -> Optional[JobStatus]:
391
506
  return get_status_no_lock(job_id)
392
507
 
393
508
 
509
+ @init_db
394
510
  def get_statuses_payload(job_ids: List[Optional[int]]) -> str:
511
+ return message_utils.encode_payload(get_statuses(job_ids))
512
+
513
+
514
+ @init_db
515
+ def get_statuses(job_ids: List[int]) -> Dict[int, Optional[str]]:
516
+ assert _DB is not None
395
517
  # Per-job lock is not required here, since the stale job status will not
396
518
  # affect the caller.
397
519
  query_str = ','.join(['?'] * len(job_ids))
398
- rows = _CURSOR.execute(
520
+ rows = _DB.cursor.execute(
399
521
  f'SELECT job_id, status FROM jobs WHERE job_id IN ({query_str})',
400
522
  job_ids)
401
- statuses = {job_id: None for job_id in job_ids}
523
+ statuses: Dict[int, Optional[str]] = {job_id: None for job_id in job_ids}
402
524
  for (job_id, status) in rows:
403
525
  statuses[job_id] = status
404
- return message_utils.encode_payload(statuses)
526
+ return statuses
527
+
528
+
529
+ @init_db
530
+ def get_jobs_info(user_hash: Optional[str] = None,
531
+ all_jobs: bool = False) -> List['jobsv1_pb2.JobInfo']:
532
+ """Get detailed job information.
533
+
534
+ Similar to dump_job_queue but returns structured protobuf objects instead
535
+ of encoded strings.
536
+
537
+ Args:
538
+ user_hash: The user hash to show jobs for. Show all the users if None.
539
+ all_jobs: Whether to show all jobs, not just the pending/running ones.
540
+ """
541
+ assert _DB is not None
542
+
543
+ status_list: Optional[List[JobStatus]] = [
544
+ JobStatus.SETTING_UP, JobStatus.PENDING, JobStatus.RUNNING
545
+ ]
546
+ if all_jobs:
547
+ status_list = None
548
+
549
+ jobs = _get_jobs(user_hash, status_list=status_list)
550
+ jobs_info = []
551
+ for job in jobs:
552
+ jobs_info.append(
553
+ jobsv1_pb2.JobInfo(job_id=job['job_id'],
554
+ job_name=job['job_name'],
555
+ username=job['username'],
556
+ submitted_at=job['submitted_at'],
557
+ status=job['status'].to_protobuf(),
558
+ run_timestamp=job['run_timestamp'],
559
+ start_at=job['start_at'],
560
+ end_at=job['end_at'],
561
+ resources=job['resources'],
562
+ pid=job['pid'],
563
+ log_path=os.path.join(
564
+ constants.SKY_LOGS_DIRECTORY,
565
+ job['run_timestamp']),
566
+ metadata=json.dumps(job['metadata'])))
567
+ return jobs_info
405
568
 
406
569
 
407
570
  def load_statuses_payload(
@@ -419,14 +582,17 @@ def load_statuses_payload(
419
582
  return statuses
420
583
 
421
584
 
585
+ @init_db
422
586
  def get_latest_job_id() -> Optional[int]:
423
- rows = _CURSOR.execute(
587
+ assert _DB is not None
588
+ rows = _DB.cursor.execute(
424
589
  'SELECT job_id FROM jobs ORDER BY job_id DESC LIMIT 1')
425
590
  for (job_id,) in rows:
426
591
  return job_id
427
592
  return None
428
593
 
429
594
 
595
+ @init_db
430
596
  def get_job_submitted_or_ended_timestamp_payload(job_id: int,
431
597
  get_ended_time: bool) -> str:
432
598
  """Get the job submitted/ended timestamp.
@@ -437,15 +603,27 @@ def get_job_submitted_or_ended_timestamp_payload(job_id: int,
437
603
  PENDING state.
438
604
 
439
605
  The normal job duration will use `start_at` instead of `submitted_at` (in
440
- `format_job_queue()`), because the job may stay in PENDING if the cluster is
441
- busy.
606
+ `table_utils.format_job_queue()`), because the job may stay in PENDING if
607
+ the cluster is busy.
608
+ """
609
+ return message_utils.encode_payload(
610
+ get_job_submitted_or_ended_timestamp(job_id, get_ended_time))
611
+
612
+
613
+ @init_db
614
+ def get_job_submitted_or_ended_timestamp(
615
+ job_id: int, get_ended_time: bool) -> Optional[float]:
616
+ """Get the job submitted or ended timestamp.
617
+
618
+ Returns the raw timestamp or None if job doesn't exist.
442
619
  """
620
+ assert _DB is not None
443
621
  field = 'end_at' if get_ended_time else 'submitted_at'
444
- rows = _CURSOR.execute(f'SELECT {field} FROM jobs WHERE job_id=(?)',
445
- (job_id,))
622
+ rows = _DB.cursor.execute(f'SELECT {field} FROM jobs WHERE job_id=(?)',
623
+ (job_id,))
446
624
  for (timestamp,) in rows:
447
- return message_utils.encode_payload(timestamp)
448
- return message_utils.encode_payload(None)
625
+ return timestamp
626
+ return None
449
627
 
450
628
 
451
629
  def get_ray_port():
@@ -454,7 +632,8 @@ def get_ray_port():
454
632
  If the port file does not exist, the cluster was launched before #1790,
455
633
  return the default port.
456
634
  """
457
- port_path = os.path.expanduser(constants.SKY_REMOTE_RAY_PORT_FILE)
635
+ port_path = runtime_utils.get_runtime_dir_path(
636
+ constants.SKY_REMOTE_RAY_PORT_FILE)
458
637
  if not os.path.exists(port_path):
459
638
  return 6379
460
639
  port = json.load(open(port_path, 'r', encoding='utf-8'))['ray_port']
@@ -467,7 +646,8 @@ def get_job_submission_port():
467
646
  If the port file does not exist, the cluster was launched before #1790,
468
647
  return the default port.
469
648
  """
470
- port_path = os.path.expanduser(constants.SKY_REMOTE_RAY_PORT_FILE)
649
+ port_path = runtime_utils.get_runtime_dir_path(
650
+ constants.SKY_REMOTE_RAY_PORT_FILE)
471
651
  if not os.path.exists(port_path):
472
652
  return 8265
473
653
  port = json.load(open(port_path, 'r',
@@ -492,14 +672,17 @@ def _get_records_from_rows(rows) -> List[Dict[str, Any]]:
492
672
  'end_at': row[JobInfoLoc.END_AT.value],
493
673
  'resources': row[JobInfoLoc.RESOURCES.value],
494
674
  'pid': row[JobInfoLoc.PID.value],
675
+ 'metadata': json.loads(row[JobInfoLoc.METADATA.value]),
495
676
  })
496
677
  return records
497
678
 
498
679
 
680
+ @init_db
499
681
  def _get_jobs(
500
682
  user_hash: Optional[str],
501
683
  status_list: Optional[List[JobStatus]] = None) -> List[Dict[str, Any]]:
502
684
  """Returns jobs with the given fields, sorted by job_id, descending."""
685
+ assert _DB is not None
503
686
  if status_list is None:
504
687
  status_list = list(JobStatus)
505
688
  status_str_list = [repr(status.value) for status in status_list]
@@ -509,14 +692,16 @@ def _get_jobs(
509
692
  # We use the old username field for compatibility.
510
693
  filter_str += ' AND username=(?)'
511
694
  params.append(user_hash)
512
- rows = _CURSOR.execute(
695
+ rows = _DB.cursor.execute(
513
696
  f'SELECT * FROM jobs {filter_str} ORDER BY job_id DESC', params)
514
697
  records = _get_records_from_rows(rows)
515
698
  return records
516
699
 
517
700
 
701
+ @init_db
518
702
  def _get_jobs_by_ids(job_ids: List[int]) -> List[Dict[str, Any]]:
519
- rows = _CURSOR.execute(
703
+ assert _DB is not None
704
+ rows = _DB.cursor.execute(
520
705
  f"""\
521
706
  SELECT * FROM jobs
522
707
  WHERE job_id IN ({','.join(['?'] * len(job_ids))})
@@ -527,8 +712,10 @@ def _get_jobs_by_ids(job_ids: List[int]) -> List[Dict[str, Any]]:
527
712
  return records
528
713
 
529
714
 
715
+ @init_db
530
716
  def _get_pending_job(job_id: int) -> Optional[Dict[str, Any]]:
531
- rows = _CURSOR.execute(
717
+ assert _DB is not None
718
+ rows = _DB.cursor.execute(
532
719
  'SELECT created_time, submit, run_cmd FROM pending_jobs '
533
720
  f'WHERE job_id={job_id!r}')
534
721
  for row in rows:
@@ -578,7 +765,7 @@ def update_job_status(job_ids: List[int],
578
765
  statuses = []
579
766
  for job_id in job_ids:
580
767
  # Per-job status lock is required because between the job status
581
- # query and the job status update, the job status in the databse
768
+ # query and the job status update, the job status in the database
582
769
  # can be modified by the generated ray program.
583
770
  with filelock.FileLock(_get_lock_path(job_id)):
584
771
  status = None
@@ -629,12 +816,6 @@ def update_job_status(job_ids: List[int],
629
816
  'the job state is not in terminal states, setting '
630
817
  'it to FAILED_DRIVER')
631
818
  status = JobStatus.FAILED_DRIVER
632
- elif job_pid < 0:
633
- # TODO(zhwu): Backward compatibility, remove after 0.10.0.
634
- # We set the job status to PENDING instead of actually
635
- # checking ray job status and let the status in job table
636
- # take effect in the later max.
637
- status = JobStatus.PENDING
638
819
 
639
820
  pending_job = _get_pending_job(job_id)
640
821
  if pending_job is not None:
@@ -698,19 +879,29 @@ def update_job_status(job_ids: List[int],
698
879
  return statuses
699
880
 
700
881
 
882
+ @init_db
701
883
  def fail_all_jobs_in_progress() -> None:
884
+ assert _DB is not None
702
885
  in_progress_status = [
703
886
  status.value for status in JobStatus.nonterminal_statuses()
704
887
  ]
705
- _CURSOR.execute(
888
+ _DB.cursor.execute(
706
889
  f"""\
707
890
  UPDATE jobs SET status=(?)
708
891
  WHERE status IN ({','.join(['?'] * len(in_progress_status))})
709
892
  """, (JobStatus.FAILED_DRIVER.value, *in_progress_status))
710
- _CONN.commit()
893
+ _DB.conn.commit()
711
894
 
712
895
 
713
896
  def update_status() -> None:
897
+ # This signal file suggests that the controller is recovering from a
898
+ # failure. See sky/jobs/utils.py::update_managed_jobs_statuses for more
899
+ # details. When recovering, we should not update the job status to failed
900
+ # driver as they will be recovered later.
901
+ if os.path.exists(
902
+ os.path.expanduser(
903
+ constants.PERSISTENT_RUN_RESTARTING_SIGNAL_FILE)):
904
+ return
714
905
  # This will be called periodically by the skylet to update the status
715
906
  # of the jobs in the database, to avoid stale job status.
716
907
  nonterminal_jobs = _get_jobs(user_hash=None,
@@ -720,12 +911,14 @@ def update_status() -> None:
720
911
  update_job_status(nonterminal_job_ids)
721
912
 
722
913
 
914
+ @init_db
723
915
  def is_cluster_idle() -> bool:
724
916
  """Returns if the cluster is idle (no in-flight jobs)."""
917
+ assert _DB is not None
725
918
  in_progress_status = [
726
919
  status.value for status in JobStatus.nonterminal_statuses()
727
920
  ]
728
- rows = _CURSOR.execute(
921
+ rows = _DB.cursor.execute(
729
922
  f"""\
730
923
  SELECT COUNT(*) FROM jobs
731
924
  WHERE status IN ({','.join(['?'] * len(in_progress_status))})
@@ -735,34 +928,6 @@ def is_cluster_idle() -> bool:
735
928
  assert False, 'Should not reach here'
736
929
 
737
930
 
738
- def format_job_queue(jobs: List[Dict[str, Any]]):
739
- """Format the job queue for display.
740
-
741
- Usage:
742
- jobs = get_job_queue()
743
- print(format_job_queue(jobs))
744
- """
745
- job_table = log_utils.create_table([
746
- 'ID', 'NAME', 'USER', 'SUBMITTED', 'STARTED', 'DURATION', 'RESOURCES',
747
- 'STATUS', 'LOG'
748
- ])
749
- for job in jobs:
750
- job_table.add_row([
751
- job['job_id'],
752
- job['job_name'],
753
- job['username'],
754
- log_utils.readable_time_duration(job['submitted_at']),
755
- log_utils.readable_time_duration(job['start_at']),
756
- log_utils.readable_time_duration(job['start_at'],
757
- job['end_at'],
758
- absolute=True),
759
- job['resources'],
760
- job['status'].colored_str(),
761
- job['log_path'],
762
- ])
763
- return job_table
764
-
765
-
766
931
  def dump_job_queue(user_hash: Optional[str], all_jobs: bool) -> str:
767
932
  """Get the job queue in encoded json format.
768
933
 
@@ -794,31 +959,11 @@ def load_job_queue(payload: str) -> List[Dict[str, Any]]:
794
959
  for job in jobs:
795
960
  job['status'] = JobStatus(job['status'])
796
961
  job['user_hash'] = job['username']
797
- job['username'] = global_user_state.get_user(job['user_hash']).name
962
+ user = global_user_state.get_user(job['user_hash'])
963
+ job['username'] = user.name if user is not None else None
798
964
  return jobs
799
965
 
800
966
 
801
- # TODO(zhwu): Backward compatibility for jobs submitted before #4318, remove
802
- # after 0.10.0.
803
- def _create_ray_job_submission_client():
804
- """Import the ray job submission client."""
805
- try:
806
- import ray # pylint: disable=import-outside-toplevel
807
- except ImportError:
808
- logger.error('Failed to import ray')
809
- raise
810
- try:
811
- # pylint: disable=import-outside-toplevel
812
- from ray import job_submission
813
- except ImportError:
814
- logger.error(
815
- f'Failed to import job_submission with ray=={ray.__version__}')
816
- raise
817
- port = get_job_submission_port()
818
- return job_submission.JobSubmissionClient(
819
- address=f'http://127.0.0.1:{port}')
820
-
821
-
822
967
  def _make_ray_job_id(sky_job_id: int) -> str:
823
968
  return f'{sky_job_id}-{getpass.getuser()}'
824
969
 
@@ -838,6 +983,13 @@ def cancel_jobs_encoded_results(jobs: Optional[List[int]],
838
983
  Encoded job IDs that are actually cancelled. Caller should use
839
984
  message_utils.decode_payload() to parse.
840
985
  """
986
+ return message_utils.encode_payload(cancel_jobs(jobs, cancel_all,
987
+ user_hash))
988
+
989
+
990
+ def cancel_jobs(jobs: Optional[List[int]],
991
+ cancel_all: bool = False,
992
+ user_hash: Optional[str] = None) -> List[int]:
841
993
  job_records = []
842
994
  all_status = [JobStatus.PENDING, JobStatus.SETTING_UP, JobStatus.RUNNING]
843
995
  if jobs is None and not cancel_all:
@@ -880,18 +1032,6 @@ def cancel_jobs_encoded_results(jobs: Optional[List[int]],
880
1032
  # We don't have to start a daemon to forcefully kill the process
881
1033
  # as our job driver process will clean up the underlying
882
1034
  # child processes.
883
- elif job['pid'] < 0:
884
- try:
885
- # TODO(zhwu): Backward compatibility, remove after 0.10.0.
886
- # The job was submitted with ray job submit before #4318.
887
- job_client = _create_ray_job_submission_client()
888
- job_client.stop_job(_make_ray_job_id(job['job_id']))
889
- except RuntimeError as e:
890
- # If the request to the job server fails, we should not
891
- # set the job to CANCELLED.
892
- if 'does not exist' not in str(e):
893
- logger.warning(str(e))
894
- continue
895
1035
  # Get the job status again to avoid race condition.
896
1036
  job_status = get_status_no_lock(job['job_id'])
897
1037
  if job_status in [
@@ -901,36 +1041,55 @@ def cancel_jobs_encoded_results(jobs: Optional[List[int]],
901
1041
  cancelled_ids.append(job['job_id'])
902
1042
 
903
1043
  scheduler.schedule_step()
904
- return message_utils.encode_payload(cancelled_ids)
1044
+ return cancelled_ids
905
1045
 
906
1046
 
1047
+ @init_db
907
1048
  def get_run_timestamp(job_id: Optional[int]) -> Optional[str]:
908
1049
  """Returns the relative path to the log file for a job."""
909
- _CURSOR.execute(
1050
+ assert _DB is not None
1051
+ _DB.cursor.execute(
910
1052
  """\
911
1053
  SELECT * FROM jobs
912
1054
  WHERE job_id=(?)""", (job_id,))
913
- row = _CURSOR.fetchone()
1055
+ row = _DB.cursor.fetchone()
914
1056
  if row is None:
915
1057
  return None
916
1058
  run_timestamp = row[JobInfoLoc.RUN_TIMESTAMP.value]
917
1059
  return run_timestamp
918
1060
 
919
1061
 
920
- def run_timestamp_with_globbing_payload(job_ids: List[Optional[str]]) -> str:
921
- """Returns the relative paths to the log files for job with globbing."""
1062
+ @init_db
1063
+ def get_log_dir_for_jobs(job_ids: List[Optional[str]]) -> str:
1064
+ """Returns the relative paths to the log files for jobs with globbing,
1065
+ encoded."""
1066
+ job_to_dir = get_job_log_dirs(job_ids)
1067
+ job_to_dir_str: Dict[str, str] = {}
1068
+ for job_id, log_dir in job_to_dir.items():
1069
+ job_to_dir_str[str(job_id)] = log_dir
1070
+ return message_utils.encode_payload(job_to_dir_str)
1071
+
1072
+
1073
+ @init_db
1074
+ def get_job_log_dirs(job_ids: List[int]) -> Dict[int, str]:
1075
+ """Returns the relative paths to the log files for jobs with globbing."""
1076
+ assert _DB is not None
922
1077
  query_str = ' OR '.join(['job_id GLOB (?)'] * len(job_ids))
923
- _CURSOR.execute(
1078
+ _DB.cursor.execute(
924
1079
  f"""\
925
1080
  SELECT * FROM jobs
926
1081
  WHERE {query_str}""", job_ids)
927
- rows = _CURSOR.fetchall()
928
- run_timestamps = {}
1082
+ rows = _DB.cursor.fetchall()
1083
+ job_to_dir: Dict[int, str] = {}
929
1084
  for row in rows:
930
1085
  job_id = row[JobInfoLoc.JOB_ID.value]
931
- run_timestamp = row[JobInfoLoc.RUN_TIMESTAMP.value]
932
- run_timestamps[str(job_id)] = run_timestamp
933
- return message_utils.encode_payload(run_timestamps)
1086
+ if row[JobInfoLoc.LOG_PATH.value]:
1087
+ job_to_dir[job_id] = row[JobInfoLoc.LOG_PATH.value]
1088
+ else:
1089
+ run_timestamp = row[JobInfoLoc.RUN_TIMESTAMP.value]
1090
+ job_to_dir[job_id] = os.path.join(constants.SKY_LOGS_DIRECTORY,
1091
+ run_timestamp)
1092
+ return job_to_dir
934
1093
 
935
1094
 
936
1095
  class JobLibCodeGen:
@@ -951,7 +1110,7 @@ class JobLibCodeGen:
951
1110
 
952
1111
  @classmethod
953
1112
  def add_job(cls, job_name: Optional[str], username: str, run_timestamp: str,
954
- resources_str: str) -> str:
1113
+ resources_str: str, metadata: str) -> str:
955
1114
  if job_name is None:
956
1115
  job_name = '-'
957
1116
  code = [
@@ -962,12 +1121,25 @@ class JobLibCodeGen:
962
1121
  '\nif int(constants.SKYLET_VERSION) < 9: '
963
1122
  'raise RuntimeError("SkyPilot runtime is too old, which does not '
964
1123
  'support submitting jobs.")',
965
- '\njob_id = job_lib.add_job('
1124
+ '\nresult = None',
1125
+ '\nif int(constants.SKYLET_VERSION) < 15: '
1126
+ '\n result = job_lib.add_job('
966
1127
  f'{job_name!r},'
967
1128
  f'{username!r},'
968
1129
  f'{run_timestamp!r},'
969
1130
  f'{resources_str!r})',
970
- 'print("Job ID: " + str(job_id), flush=True)',
1131
+ '\nelse: '
1132
+ '\n result = job_lib.add_job('
1133
+ f'{job_name!r},'
1134
+ f'{username!r},'
1135
+ f'{run_timestamp!r},'
1136
+ f'{resources_str!r},'
1137
+ f'metadata={metadata!r})',
1138
+ ('\nif isinstance(result, tuple):'
1139
+ '\n print("Job ID: " + str(result[0]), flush=True)'
1140
+ '\n print("Log Dir: " + str(result[1]), flush=True)'
1141
+ '\nelse:'
1142
+ '\n print("Job ID: " + str(result), flush=True)'),
971
1143
  ]
972
1144
  return cls._build(code)
973
1145
 
@@ -1036,17 +1208,24 @@ class JobLibCodeGen:
1036
1208
  # We use != instead of is not because 1 is not None will print a warning:
1037
1209
  # <stdin>:1: SyntaxWarning: "is not" with a literal. Did you mean "!="?
1038
1210
  f'job_id = {job_id} if {job_id} != None else job_lib.get_latest_job_id()',
1039
- 'run_timestamp = job_lib.get_run_timestamp(job_id)',
1040
- f'log_dir = None if run_timestamp is None else os.path.join({constants.SKY_LOGS_DIRECTORY!r}, run_timestamp)',
1041
- f'tail_log_kwargs = {{"job_id": job_id, "log_dir": log_dir, "managed_job_id": {managed_job_id!r}, "follow": {follow}}}',
1042
- f'{_LINUX_NEW_LINE}if getattr(constants, "SKYLET_LIB_VERSION", 1) > 1: tail_log_kwargs["tail"] = {tail}',
1043
- f'{_LINUX_NEW_LINE}log_lib.tail_logs(**tail_log_kwargs)',
1211
+ # For backward compatibility, use the legacy generation rule for
1212
+ # jobs submitted before 0.11.0.
1213
+ ('log_dir = None\n'
1214
+ 'if hasattr(job_lib, "get_log_dir_for_job"):\n'
1215
+ ' log_dir = job_lib.get_log_dir_for_job(job_id)\n'
1216
+ 'if log_dir is None:\n'
1217
+ ' run_timestamp = job_lib.get_run_timestamp(job_id)\n'
1218
+ f' log_dir = None if run_timestamp is None else os.path.join({constants.SKY_LOGS_DIRECTORY!r}, run_timestamp)'
1219
+ ),
1220
+ # Add a newline to leave the if indent block above.
1221
+ f'\nlog_lib.tail_logs(job_id=job_id, log_dir=log_dir, managed_job_id={managed_job_id!r}, follow={follow}, tail={tail})',
1044
1222
  # After tailing, check the job status and exit with appropriate code
1045
1223
  'job_status = job_lib.get_status(job_id)',
1046
- # Backward compatibility for returning exit code: Skylet versions 2
1047
- # and older did not have JobExitCode, so we use 0 for those versions
1048
- # TODO: Remove this special handling after 0.10.0.
1049
- 'exit_code = exceptions.JobExitCode.from_job_status(job_status) if getattr(constants, "SKYLET_LIB_VERSION", 1) > 2 else 0',
1224
+ 'exit_code = exceptions.JobExitCode.from_job_status(job_status)',
1225
+ # Fix for dashboard: When follow=False and job is still running (NOT_FINISHED=101),
1226
+ # exit with success (0) since fetching current logs is a successful operation.
1227
+ # This prevents shell wrappers from printing "command terminated with exit code 101".
1228
+ f'exit_code = 0 if not {follow} and exit_code == 101 else exit_code',
1050
1229
  'sys.exit(exit_code)',
1051
1230
  ]
1052
1231
  return cls._build(code)
@@ -1078,12 +1257,14 @@ class JobLibCodeGen:
1078
1257
  return cls._build(code)
1079
1258
 
1080
1259
  @classmethod
1081
- def get_run_timestamp_with_globbing(cls,
1082
- job_ids: Optional[List[str]]) -> str:
1260
+ def get_log_dirs_for_jobs(cls, job_ids: Optional[List[str]]) -> str:
1083
1261
  code = [
1084
1262
  f'job_ids = {job_ids} if {job_ids} is not None '
1085
1263
  'else [job_lib.get_latest_job_id()]',
1086
- 'log_dirs = job_lib.run_timestamp_with_globbing_payload(job_ids)',
1264
+ # TODO(aylei): backward compatibility, remove after 0.12.0.
1265
+ 'log_dirs = job_lib.get_log_dir_for_jobs(job_ids) if '
1266
+ 'hasattr(job_lib, "get_log_dir_for_jobs") else '
1267
+ 'job_lib.run_timestamp_with_globbing_payload(job_ids)',
1087
1268
  'print(log_dirs, flush=True)',
1088
1269
  ]
1089
1270
  return cls._build(code)