skypilot-nightly 1.0.0.dev20250509__py3-none-any.whl → 1.0.0.dev20251107__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of skypilot-nightly might be problematic. Click here for more details.

Files changed (512)
  1. sky/__init__.py +22 -6
  2. sky/adaptors/aws.py +25 -7
  3. sky/adaptors/common.py +24 -1
  4. sky/adaptors/coreweave.py +278 -0
  5. sky/adaptors/do.py +8 -2
  6. sky/adaptors/hyperbolic.py +8 -0
  7. sky/adaptors/kubernetes.py +149 -18
  8. sky/adaptors/nebius.py +170 -17
  9. sky/adaptors/primeintellect.py +1 -0
  10. sky/adaptors/runpod.py +68 -0
  11. sky/adaptors/seeweb.py +167 -0
  12. sky/adaptors/shadeform.py +89 -0
  13. sky/admin_policy.py +187 -4
  14. sky/authentication.py +179 -225
  15. sky/backends/__init__.py +4 -2
  16. sky/backends/backend.py +22 -9
  17. sky/backends/backend_utils.py +1299 -380
  18. sky/backends/cloud_vm_ray_backend.py +1715 -518
  19. sky/backends/docker_utils.py +1 -1
  20. sky/backends/local_docker_backend.py +11 -6
  21. sky/backends/wheel_utils.py +37 -9
  22. sky/{clouds/service_catalog → catalog}/__init__.py +21 -19
  23. sky/{clouds/service_catalog → catalog}/aws_catalog.py +27 -8
  24. sky/{clouds/service_catalog → catalog}/azure_catalog.py +10 -7
  25. sky/{clouds/service_catalog → catalog}/common.py +89 -48
  26. sky/{clouds/service_catalog → catalog}/cudo_catalog.py +8 -5
  27. sky/{clouds/service_catalog → catalog}/data_fetchers/analyze.py +1 -1
  28. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_aws.py +30 -40
  29. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_cudo.py +38 -38
  30. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_gcp.py +42 -15
  31. sky/catalog/data_fetchers/fetch_hyperbolic.py +136 -0
  32. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_lambda_cloud.py +1 -0
  33. sky/catalog/data_fetchers/fetch_nebius.py +335 -0
  34. sky/catalog/data_fetchers/fetch_runpod.py +698 -0
  35. sky/catalog/data_fetchers/fetch_seeweb.py +329 -0
  36. sky/catalog/data_fetchers/fetch_shadeform.py +142 -0
  37. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_vast.py +1 -1
  38. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_vsphere.py +1 -1
  39. sky/{clouds/service_catalog → catalog}/do_catalog.py +5 -2
  40. sky/{clouds/service_catalog → catalog}/fluidstack_catalog.py +6 -3
  41. sky/{clouds/service_catalog → catalog}/gcp_catalog.py +41 -15
  42. sky/catalog/hyperbolic_catalog.py +136 -0
  43. sky/{clouds/service_catalog → catalog}/ibm_catalog.py +9 -6
  44. sky/{clouds/service_catalog → catalog}/kubernetes_catalog.py +36 -24
  45. sky/{clouds/service_catalog → catalog}/lambda_catalog.py +9 -6
  46. sky/{clouds/service_catalog → catalog}/nebius_catalog.py +9 -7
  47. sky/{clouds/service_catalog → catalog}/oci_catalog.py +9 -6
  48. sky/{clouds/service_catalog → catalog}/paperspace_catalog.py +5 -2
  49. sky/catalog/primeintellect_catalog.py +95 -0
  50. sky/{clouds/service_catalog → catalog}/runpod_catalog.py +11 -4
  51. sky/{clouds/service_catalog → catalog}/scp_catalog.py +9 -6
  52. sky/catalog/seeweb_catalog.py +184 -0
  53. sky/catalog/shadeform_catalog.py +165 -0
  54. sky/catalog/ssh_catalog.py +167 -0
  55. sky/{clouds/service_catalog → catalog}/vast_catalog.py +6 -3
  56. sky/{clouds/service_catalog → catalog}/vsphere_catalog.py +5 -2
  57. sky/check.py +491 -203
  58. sky/cli.py +5 -6005
  59. sky/client/{cli.py → cli/command.py} +2477 -1885
  60. sky/client/cli/deprecation_utils.py +99 -0
  61. sky/client/cli/flags.py +359 -0
  62. sky/client/cli/table_utils.py +320 -0
  63. sky/client/common.py +70 -32
  64. sky/client/oauth.py +82 -0
  65. sky/client/sdk.py +1203 -297
  66. sky/client/sdk_async.py +833 -0
  67. sky/client/service_account_auth.py +47 -0
  68. sky/cloud_stores.py +73 -0
  69. sky/clouds/__init__.py +13 -0
  70. sky/clouds/aws.py +358 -93
  71. sky/clouds/azure.py +105 -83
  72. sky/clouds/cloud.py +127 -36
  73. sky/clouds/cudo.py +68 -50
  74. sky/clouds/do.py +66 -48
  75. sky/clouds/fluidstack.py +63 -44
  76. sky/clouds/gcp.py +339 -110
  77. sky/clouds/hyperbolic.py +293 -0
  78. sky/clouds/ibm.py +70 -49
  79. sky/clouds/kubernetes.py +563 -162
  80. sky/clouds/lambda_cloud.py +74 -54
  81. sky/clouds/nebius.py +206 -80
  82. sky/clouds/oci.py +88 -66
  83. sky/clouds/paperspace.py +61 -44
  84. sky/clouds/primeintellect.py +317 -0
  85. sky/clouds/runpod.py +164 -74
  86. sky/clouds/scp.py +89 -83
  87. sky/clouds/seeweb.py +466 -0
  88. sky/clouds/shadeform.py +400 -0
  89. sky/clouds/ssh.py +263 -0
  90. sky/clouds/utils/aws_utils.py +10 -4
  91. sky/clouds/utils/gcp_utils.py +87 -11
  92. sky/clouds/utils/oci_utils.py +38 -14
  93. sky/clouds/utils/scp_utils.py +177 -124
  94. sky/clouds/vast.py +99 -77
  95. sky/clouds/vsphere.py +51 -40
  96. sky/core.py +349 -139
  97. sky/dag.py +15 -0
  98. sky/dashboard/out/404.html +1 -1
  99. sky/dashboard/out/_next/static/chunks/1141-e6aa9ab418717c59.js +11 -0
  100. sky/dashboard/out/_next/static/chunks/1272-1ef0bf0237faccdb.js +1 -0
  101. sky/dashboard/out/_next/static/chunks/1871-74503c8e80fd253b.js +6 -0
  102. sky/dashboard/out/_next/static/chunks/2260-7703229c33c5ebd5.js +1 -0
  103. sky/dashboard/out/_next/static/chunks/2350.fab69e61bac57b23.js +1 -0
  104. sky/dashboard/out/_next/static/chunks/2369.fc20f0c2c8ed9fe7.js +15 -0
  105. sky/dashboard/out/_next/static/chunks/2755.fff53c4a3fcae910.js +26 -0
  106. sky/dashboard/out/_next/static/chunks/3294.72362fa129305b19.js +1 -0
  107. sky/dashboard/out/_next/static/chunks/3785.ad6adaa2a0fa9768.js +1 -0
  108. sky/dashboard/out/_next/static/chunks/3850-ff4a9a69d978632b.js +1 -0
  109. sky/dashboard/out/_next/static/chunks/3937.210053269f121201.js +1 -0
  110. sky/dashboard/out/_next/static/chunks/4725.a830b5c9e7867c92.js +1 -0
  111. sky/dashboard/out/_next/static/chunks/4937.a2baa2df5572a276.js +15 -0
  112. sky/dashboard/out/_next/static/chunks/5739-d67458fcb1386c92.js +8 -0
  113. sky/dashboard/out/_next/static/chunks/6130-2be46d70a38f1e82.js +1 -0
  114. sky/dashboard/out/_next/static/chunks/616-3d59f75e2ccf9321.js +39 -0
  115. sky/dashboard/out/_next/static/chunks/6212-7bd06f60ba693125.js +13 -0
  116. sky/dashboard/out/_next/static/chunks/6601-06114c982db410b6.js +1 -0
  117. sky/dashboard/out/_next/static/chunks/6856-ef8ba11f96d8c4a3.js +1 -0
  118. sky/dashboard/out/_next/static/chunks/6989-01359c57e018caa4.js +1 -0
  119. sky/dashboard/out/_next/static/chunks/6990-32b6e2d3822301fa.js +1 -0
  120. sky/dashboard/out/_next/static/chunks/7359-c8d04e06886000b3.js +30 -0
  121. sky/dashboard/out/_next/static/chunks/7411-b15471acd2cba716.js +41 -0
  122. sky/dashboard/out/_next/static/chunks/7615-3301e838e5f25772.js +1 -0
  123. sky/dashboard/out/_next/static/chunks/8640.5b9475a2d18c5416.js +16 -0
  124. sky/dashboard/out/_next/static/chunks/8969-1e4613c651bf4051.js +1 -0
  125. sky/dashboard/out/_next/static/chunks/9025.fa408f3242e9028d.js +6 -0
  126. sky/dashboard/out/_next/static/chunks/9353-cff34f7e773b2e2b.js +1 -0
  127. sky/dashboard/out/_next/static/chunks/9360.7310982cf5a0dc79.js +31 -0
  128. sky/dashboard/out/_next/static/chunks/9847.3aaca6bb33455140.js +30 -0
  129. sky/dashboard/out/_next/static/chunks/fd9d1056-86323a29a8f7e46a.js +1 -0
  130. sky/dashboard/out/_next/static/chunks/framework-cf60a09ccd051a10.js +33 -0
  131. sky/dashboard/out/_next/static/chunks/main-app-587214043926b3cc.js +1 -0
  132. sky/dashboard/out/_next/static/chunks/main-f15ccb73239a3bf1.js +1 -0
  133. sky/dashboard/out/_next/static/chunks/pages/_app-bde01e4a2beec258.js +34 -0
  134. sky/dashboard/out/_next/static/chunks/pages/_error-c66a4e8afc46f17b.js +1 -0
  135. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-c736ead69c2d86ec.js +16 -0
  136. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-a37d2063af475a1c.js +1 -0
  137. sky/dashboard/out/_next/static/chunks/pages/clusters-d44859594e6f8064.js +1 -0
  138. sky/dashboard/out/_next/static/chunks/pages/config-dfb9bf07b13045f4.js +1 -0
  139. sky/dashboard/out/_next/static/chunks/pages/index-444f1804401f04ea.js +1 -0
  140. sky/dashboard/out/_next/static/chunks/pages/infra/[context]-c0b5935149902e6f.js +1 -0
  141. sky/dashboard/out/_next/static/chunks/pages/infra-aed0ea19df7cf961.js +1 -0
  142. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-5796e8d6aea291a0.js +16 -0
  143. sky/dashboard/out/_next/static/chunks/pages/jobs/pools/[pool]-6edeb7d06032adfc.js +21 -0
  144. sky/dashboard/out/_next/static/chunks/pages/jobs-479dde13399cf270.js +1 -0
  145. sky/dashboard/out/_next/static/chunks/pages/users-5ab3b907622cf0fe.js +1 -0
  146. sky/dashboard/out/_next/static/chunks/pages/volumes-b84b948ff357c43e.js +1 -0
  147. sky/dashboard/out/_next/static/chunks/pages/workspace/new-3f88a1c7e86a3f86.js +1 -0
  148. sky/dashboard/out/_next/static/chunks/pages/workspaces/[name]-c5a3eeee1c218af1.js +1 -0
  149. sky/dashboard/out/_next/static/chunks/pages/workspaces-22b23febb3e89ce1.js +1 -0
  150. sky/dashboard/out/_next/static/chunks/webpack-2679be77fc08a2f8.js +1 -0
  151. sky/dashboard/out/_next/static/css/0748ce22df867032.css +3 -0
  152. sky/dashboard/out/_next/static/zB0ed6ge_W1MDszVHhijS/_buildManifest.js +1 -0
  153. sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
  154. sky/dashboard/out/clusters/[cluster].html +1 -1
  155. sky/dashboard/out/clusters.html +1 -1
  156. sky/dashboard/out/config.html +1 -0
  157. sky/dashboard/out/index.html +1 -1
  158. sky/dashboard/out/infra/[context].html +1 -0
  159. sky/dashboard/out/infra.html +1 -0
  160. sky/dashboard/out/jobs/[job].html +1 -1
  161. sky/dashboard/out/jobs/pools/[pool].html +1 -0
  162. sky/dashboard/out/jobs.html +1 -1
  163. sky/dashboard/out/users.html +1 -0
  164. sky/dashboard/out/volumes.html +1 -0
  165. sky/dashboard/out/workspace/new.html +1 -0
  166. sky/dashboard/out/workspaces/[name].html +1 -0
  167. sky/dashboard/out/workspaces.html +1 -0
  168. sky/data/data_utils.py +137 -1
  169. sky/data/mounting_utils.py +269 -84
  170. sky/data/storage.py +1451 -1807
  171. sky/data/storage_utils.py +43 -57
  172. sky/exceptions.py +132 -2
  173. sky/execution.py +206 -63
  174. sky/global_user_state.py +2374 -586
  175. sky/jobs/__init__.py +5 -0
  176. sky/jobs/client/sdk.py +242 -65
  177. sky/jobs/client/sdk_async.py +143 -0
  178. sky/jobs/constants.py +9 -8
  179. sky/jobs/controller.py +839 -277
  180. sky/jobs/file_content_utils.py +80 -0
  181. sky/jobs/log_gc.py +201 -0
  182. sky/jobs/recovery_strategy.py +398 -152
  183. sky/jobs/scheduler.py +315 -189
  184. sky/jobs/server/core.py +829 -255
  185. sky/jobs/server/server.py +156 -115
  186. sky/jobs/server/utils.py +136 -0
  187. sky/jobs/state.py +2092 -701
  188. sky/jobs/utils.py +1242 -160
  189. sky/logs/__init__.py +21 -0
  190. sky/logs/agent.py +108 -0
  191. sky/logs/aws.py +243 -0
  192. sky/logs/gcp.py +91 -0
  193. sky/metrics/__init__.py +0 -0
  194. sky/metrics/utils.py +443 -0
  195. sky/models.py +78 -1
  196. sky/optimizer.py +164 -70
  197. sky/provision/__init__.py +90 -4
  198. sky/provision/aws/config.py +147 -26
  199. sky/provision/aws/instance.py +135 -50
  200. sky/provision/azure/instance.py +10 -5
  201. sky/provision/common.py +13 -1
  202. sky/provision/cudo/cudo_machine_type.py +1 -1
  203. sky/provision/cudo/cudo_utils.py +14 -8
  204. sky/provision/cudo/cudo_wrapper.py +72 -71
  205. sky/provision/cudo/instance.py +10 -6
  206. sky/provision/do/instance.py +10 -6
  207. sky/provision/do/utils.py +4 -3
  208. sky/provision/docker_utils.py +114 -23
  209. sky/provision/fluidstack/instance.py +13 -8
  210. sky/provision/gcp/__init__.py +1 -0
  211. sky/provision/gcp/config.py +301 -19
  212. sky/provision/gcp/constants.py +218 -0
  213. sky/provision/gcp/instance.py +36 -8
  214. sky/provision/gcp/instance_utils.py +18 -4
  215. sky/provision/gcp/volume_utils.py +247 -0
  216. sky/provision/hyperbolic/__init__.py +12 -0
  217. sky/provision/hyperbolic/config.py +10 -0
  218. sky/provision/hyperbolic/instance.py +437 -0
  219. sky/provision/hyperbolic/utils.py +373 -0
  220. sky/provision/instance_setup.py +93 -14
  221. sky/provision/kubernetes/__init__.py +5 -0
  222. sky/provision/kubernetes/config.py +9 -52
  223. sky/provision/kubernetes/constants.py +17 -0
  224. sky/provision/kubernetes/instance.py +789 -247
  225. sky/provision/kubernetes/manifests/fusermount-server-daemonset.yaml +1 -2
  226. sky/provision/kubernetes/network.py +27 -17
  227. sky/provision/kubernetes/network_utils.py +40 -43
  228. sky/provision/kubernetes/utils.py +1192 -531
  229. sky/provision/kubernetes/volume.py +282 -0
  230. sky/provision/lambda_cloud/instance.py +22 -16
  231. sky/provision/nebius/constants.py +50 -0
  232. sky/provision/nebius/instance.py +19 -6
  233. sky/provision/nebius/utils.py +196 -91
  234. sky/provision/oci/instance.py +10 -5
  235. sky/provision/paperspace/instance.py +10 -7
  236. sky/provision/paperspace/utils.py +1 -1
  237. sky/provision/primeintellect/__init__.py +10 -0
  238. sky/provision/primeintellect/config.py +11 -0
  239. sky/provision/primeintellect/instance.py +454 -0
  240. sky/provision/primeintellect/utils.py +398 -0
  241. sky/provision/provisioner.py +110 -36
  242. sky/provision/runpod/__init__.py +5 -0
  243. sky/provision/runpod/instance.py +27 -6
  244. sky/provision/runpod/utils.py +51 -18
  245. sky/provision/runpod/volume.py +180 -0
  246. sky/provision/scp/__init__.py +15 -0
  247. sky/provision/scp/config.py +93 -0
  248. sky/provision/scp/instance.py +531 -0
  249. sky/provision/seeweb/__init__.py +11 -0
  250. sky/provision/seeweb/config.py +13 -0
  251. sky/provision/seeweb/instance.py +807 -0
  252. sky/provision/shadeform/__init__.py +11 -0
  253. sky/provision/shadeform/config.py +12 -0
  254. sky/provision/shadeform/instance.py +351 -0
  255. sky/provision/shadeform/shadeform_utils.py +83 -0
  256. sky/provision/ssh/__init__.py +18 -0
  257. sky/provision/vast/instance.py +13 -8
  258. sky/provision/vast/utils.py +10 -7
  259. sky/provision/vsphere/common/vim_utils.py +1 -2
  260. sky/provision/vsphere/instance.py +15 -10
  261. sky/provision/vsphere/vsphere_utils.py +9 -19
  262. sky/py.typed +0 -0
  263. sky/resources.py +844 -118
  264. sky/schemas/__init__.py +0 -0
  265. sky/schemas/api/__init__.py +0 -0
  266. sky/schemas/api/responses.py +225 -0
  267. sky/schemas/db/README +4 -0
  268. sky/schemas/db/env.py +90 -0
  269. sky/schemas/db/global_user_state/001_initial_schema.py +124 -0
  270. sky/schemas/db/global_user_state/002_add_workspace_to_cluster_history.py +35 -0
  271. sky/schemas/db/global_user_state/003_fix_initial_revision.py +61 -0
  272. sky/schemas/db/global_user_state/004_is_managed.py +34 -0
  273. sky/schemas/db/global_user_state/005_cluster_event.py +32 -0
  274. sky/schemas/db/global_user_state/006_provision_log.py +41 -0
  275. sky/schemas/db/global_user_state/007_cluster_event_request_id.py +34 -0
  276. sky/schemas/db/global_user_state/008_skylet_ssh_tunnel_metadata.py +34 -0
  277. sky/schemas/db/global_user_state/009_last_activity_and_launched_at.py +89 -0
  278. sky/schemas/db/global_user_state/010_save_ssh_key.py +66 -0
  279. sky/schemas/db/script.py.mako +28 -0
  280. sky/schemas/db/serve_state/001_initial_schema.py +67 -0
  281. sky/schemas/db/skypilot_config/001_initial_schema.py +30 -0
  282. sky/schemas/db/spot_jobs/001_initial_schema.py +97 -0
  283. sky/schemas/db/spot_jobs/002_cluster_pool.py +42 -0
  284. sky/schemas/db/spot_jobs/003_pool_hash.py +34 -0
  285. sky/schemas/db/spot_jobs/004_job_file_contents.py +42 -0
  286. sky/schemas/db/spot_jobs/005_logs_gc.py +38 -0
  287. sky/schemas/generated/__init__.py +0 -0
  288. sky/schemas/generated/autostopv1_pb2.py +36 -0
  289. sky/schemas/generated/autostopv1_pb2.pyi +43 -0
  290. sky/schemas/generated/autostopv1_pb2_grpc.py +146 -0
  291. sky/schemas/generated/jobsv1_pb2.py +86 -0
  292. sky/schemas/generated/jobsv1_pb2.pyi +254 -0
  293. sky/schemas/generated/jobsv1_pb2_grpc.py +542 -0
  294. sky/schemas/generated/managed_jobsv1_pb2.py +74 -0
  295. sky/schemas/generated/managed_jobsv1_pb2.pyi +278 -0
  296. sky/schemas/generated/managed_jobsv1_pb2_grpc.py +278 -0
  297. sky/schemas/generated/servev1_pb2.py +58 -0
  298. sky/schemas/generated/servev1_pb2.pyi +115 -0
  299. sky/schemas/generated/servev1_pb2_grpc.py +322 -0
  300. sky/serve/autoscalers.py +357 -5
  301. sky/serve/client/impl.py +310 -0
  302. sky/serve/client/sdk.py +47 -139
  303. sky/serve/client/sdk_async.py +130 -0
  304. sky/serve/constants.py +10 -8
  305. sky/serve/controller.py +64 -19
  306. sky/serve/load_balancer.py +106 -60
  307. sky/serve/load_balancing_policies.py +115 -1
  308. sky/serve/replica_managers.py +273 -162
  309. sky/serve/serve_rpc_utils.py +179 -0
  310. sky/serve/serve_state.py +554 -251
  311. sky/serve/serve_utils.py +733 -220
  312. sky/serve/server/core.py +66 -711
  313. sky/serve/server/impl.py +1093 -0
  314. sky/serve/server/server.py +21 -18
  315. sky/serve/service.py +133 -48
  316. sky/serve/service_spec.py +135 -16
  317. sky/serve/spot_placer.py +3 -0
  318. sky/server/auth/__init__.py +0 -0
  319. sky/server/auth/authn.py +50 -0
  320. sky/server/auth/loopback.py +38 -0
  321. sky/server/auth/oauth2_proxy.py +200 -0
  322. sky/server/common.py +475 -181
  323. sky/server/config.py +81 -23
  324. sky/server/constants.py +44 -6
  325. sky/server/daemons.py +229 -0
  326. sky/server/html/token_page.html +185 -0
  327. sky/server/metrics.py +160 -0
  328. sky/server/requests/executor.py +528 -138
  329. sky/server/requests/payloads.py +351 -17
  330. sky/server/requests/preconditions.py +21 -17
  331. sky/server/requests/process.py +112 -29
  332. sky/server/requests/request_names.py +120 -0
  333. sky/server/requests/requests.py +817 -224
  334. sky/server/requests/serializers/decoders.py +82 -31
  335. sky/server/requests/serializers/encoders.py +140 -22
  336. sky/server/requests/threads.py +106 -0
  337. sky/server/rest.py +417 -0
  338. sky/server/server.py +1290 -284
  339. sky/server/state.py +20 -0
  340. sky/server/stream_utils.py +345 -57
  341. sky/server/uvicorn.py +217 -3
  342. sky/server/versions.py +270 -0
  343. sky/setup_files/MANIFEST.in +5 -0
  344. sky/setup_files/alembic.ini +156 -0
  345. sky/setup_files/dependencies.py +136 -31
  346. sky/setup_files/setup.py +44 -42
  347. sky/sky_logging.py +102 -5
  348. sky/skylet/attempt_skylet.py +1 -0
  349. sky/skylet/autostop_lib.py +129 -8
  350. sky/skylet/configs.py +27 -20
  351. sky/skylet/constants.py +171 -19
  352. sky/skylet/events.py +105 -21
  353. sky/skylet/job_lib.py +335 -104
  354. sky/skylet/log_lib.py +297 -18
  355. sky/skylet/log_lib.pyi +44 -1
  356. sky/skylet/ray_patches/__init__.py +17 -3
  357. sky/skylet/ray_patches/autoscaler.py.diff +18 -0
  358. sky/skylet/ray_patches/cli.py.diff +19 -0
  359. sky/skylet/ray_patches/command_runner.py.diff +17 -0
  360. sky/skylet/ray_patches/log_monitor.py.diff +20 -0
  361. sky/skylet/ray_patches/resource_demand_scheduler.py.diff +32 -0
  362. sky/skylet/ray_patches/updater.py.diff +18 -0
  363. sky/skylet/ray_patches/worker.py.diff +41 -0
  364. sky/skylet/services.py +564 -0
  365. sky/skylet/skylet.py +63 -4
  366. sky/skylet/subprocess_daemon.py +103 -29
  367. sky/skypilot_config.py +506 -99
  368. sky/ssh_node_pools/__init__.py +1 -0
  369. sky/ssh_node_pools/core.py +135 -0
  370. sky/ssh_node_pools/server.py +233 -0
  371. sky/task.py +621 -137
  372. sky/templates/aws-ray.yml.j2 +10 -3
  373. sky/templates/azure-ray.yml.j2 +1 -1
  374. sky/templates/do-ray.yml.j2 +1 -1
  375. sky/templates/gcp-ray.yml.j2 +57 -0
  376. sky/templates/hyperbolic-ray.yml.j2 +67 -0
  377. sky/templates/jobs-controller.yaml.j2 +27 -24
  378. sky/templates/kubernetes-loadbalancer.yml.j2 +2 -0
  379. sky/templates/kubernetes-ray.yml.j2 +607 -51
  380. sky/templates/lambda-ray.yml.j2 +1 -1
  381. sky/templates/nebius-ray.yml.j2 +33 -12
  382. sky/templates/paperspace-ray.yml.j2 +1 -1
  383. sky/templates/primeintellect-ray.yml.j2 +71 -0
  384. sky/templates/runpod-ray.yml.j2 +9 -1
  385. sky/templates/scp-ray.yml.j2 +3 -50
  386. sky/templates/seeweb-ray.yml.j2 +108 -0
  387. sky/templates/shadeform-ray.yml.j2 +72 -0
  388. sky/templates/sky-serve-controller.yaml.j2 +22 -2
  389. sky/templates/websocket_proxy.py +178 -18
  390. sky/usage/usage_lib.py +18 -11
  391. sky/users/__init__.py +0 -0
  392. sky/users/model.conf +15 -0
  393. sky/users/permission.py +387 -0
  394. sky/users/rbac.py +121 -0
  395. sky/users/server.py +720 -0
  396. sky/users/token_service.py +218 -0
  397. sky/utils/accelerator_registry.py +34 -5
  398. sky/utils/admin_policy_utils.py +84 -38
  399. sky/utils/annotations.py +16 -5
  400. sky/utils/asyncio_utils.py +78 -0
  401. sky/utils/auth_utils.py +153 -0
  402. sky/utils/benchmark_utils.py +60 -0
  403. sky/utils/cli_utils/status_utils.py +159 -86
  404. sky/utils/cluster_utils.py +31 -9
  405. sky/utils/command_runner.py +354 -68
  406. sky/utils/command_runner.pyi +93 -3
  407. sky/utils/common.py +35 -8
  408. sky/utils/common_utils.py +310 -87
  409. sky/utils/config_utils.py +87 -5
  410. sky/utils/context.py +402 -0
  411. sky/utils/context_utils.py +222 -0
  412. sky/utils/controller_utils.py +264 -89
  413. sky/utils/dag_utils.py +31 -12
  414. sky/utils/db/__init__.py +0 -0
  415. sky/utils/db/db_utils.py +470 -0
  416. sky/utils/db/migration_utils.py +133 -0
  417. sky/utils/directory_utils.py +12 -0
  418. sky/utils/env_options.py +13 -0
  419. sky/utils/git.py +567 -0
  420. sky/utils/git_clone.sh +460 -0
  421. sky/utils/infra_utils.py +195 -0
  422. sky/utils/kubernetes/cleanup-tunnel.sh +62 -0
  423. sky/utils/kubernetes/config_map_utils.py +133 -0
  424. sky/utils/kubernetes/create_cluster.sh +13 -27
  425. sky/utils/kubernetes/delete_cluster.sh +10 -7
  426. sky/utils/kubernetes/deploy_remote_cluster.py +1299 -0
  427. sky/utils/kubernetes/exec_kubeconfig_converter.py +22 -31
  428. sky/utils/kubernetes/generate_kind_config.py +6 -66
  429. sky/utils/kubernetes/generate_kubeconfig.sh +4 -1
  430. sky/utils/kubernetes/gpu_labeler.py +5 -5
  431. sky/utils/kubernetes/kubernetes_deploy_utils.py +354 -47
  432. sky/utils/kubernetes/ssh-tunnel.sh +379 -0
  433. sky/utils/kubernetes/ssh_utils.py +221 -0
  434. sky/utils/kubernetes_enums.py +8 -15
  435. sky/utils/lock_events.py +94 -0
  436. sky/utils/locks.py +368 -0
  437. sky/utils/log_utils.py +300 -6
  438. sky/utils/perf_utils.py +22 -0
  439. sky/utils/resource_checker.py +298 -0
  440. sky/utils/resources_utils.py +249 -32
  441. sky/utils/rich_utils.py +213 -37
  442. sky/utils/schemas.py +905 -147
  443. sky/utils/serialize_utils.py +16 -0
  444. sky/utils/status_lib.py +10 -0
  445. sky/utils/subprocess_utils.py +38 -15
  446. sky/utils/tempstore.py +70 -0
  447. sky/utils/timeline.py +24 -52
  448. sky/utils/ux_utils.py +84 -15
  449. sky/utils/validator.py +11 -1
  450. sky/utils/volume.py +86 -0
  451. sky/utils/yaml_utils.py +111 -0
  452. sky/volumes/__init__.py +13 -0
  453. sky/volumes/client/__init__.py +0 -0
  454. sky/volumes/client/sdk.py +149 -0
  455. sky/volumes/server/__init__.py +0 -0
  456. sky/volumes/server/core.py +258 -0
  457. sky/volumes/server/server.py +122 -0
  458. sky/volumes/volume.py +212 -0
  459. sky/workspaces/__init__.py +0 -0
  460. sky/workspaces/core.py +655 -0
  461. sky/workspaces/server.py +101 -0
  462. sky/workspaces/utils.py +56 -0
  463. skypilot_nightly-1.0.0.dev20251107.dist-info/METADATA +675 -0
  464. skypilot_nightly-1.0.0.dev20251107.dist-info/RECORD +594 -0
  465. {skypilot_nightly-1.0.0.dev20250509.dist-info → skypilot_nightly-1.0.0.dev20251107.dist-info}/WHEEL +1 -1
  466. sky/benchmark/benchmark_state.py +0 -256
  467. sky/benchmark/benchmark_utils.py +0 -641
  468. sky/clouds/service_catalog/constants.py +0 -7
  469. sky/dashboard/out/_next/static/LksQgChY5izXjokL3LcEu/_buildManifest.js +0 -1
  470. sky/dashboard/out/_next/static/chunks/236-f49500b82ad5392d.js +0 -6
  471. sky/dashboard/out/_next/static/chunks/312-c3c8845990db8ffc.js +0 -15
  472. sky/dashboard/out/_next/static/chunks/37-0a572fe0dbb89c4d.js +0 -6
  473. sky/dashboard/out/_next/static/chunks/678-206dddca808e6d16.js +0 -59
  474. sky/dashboard/out/_next/static/chunks/845-0f8017370869e269.js +0 -1
  475. sky/dashboard/out/_next/static/chunks/979-7bf73a4c7cea0f5c.js +0 -1
  476. sky/dashboard/out/_next/static/chunks/fd9d1056-2821b0f0cabcd8bd.js +0 -1
  477. sky/dashboard/out/_next/static/chunks/framework-87d061ee6ed71b28.js +0 -33
  478. sky/dashboard/out/_next/static/chunks/main-app-241eb28595532291.js +0 -1
  479. sky/dashboard/out/_next/static/chunks/main-e0e2335212e72357.js +0 -1
  480. sky/dashboard/out/_next/static/chunks/pages/_app-e6b013bc3f77ad60.js +0 -1
  481. sky/dashboard/out/_next/static/chunks/pages/_error-1be831200e60c5c0.js +0 -1
  482. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-e15db85d0ea1fbe1.js +0 -1
  483. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-f383db7389368ea7.js +0 -1
  484. sky/dashboard/out/_next/static/chunks/pages/clusters-a93b93e10b8b074e.js +0 -1
  485. sky/dashboard/out/_next/static/chunks/pages/index-f9f039532ca8cbc4.js +0 -1
  486. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-03f279c6741fb48b.js +0 -1
  487. sky/dashboard/out/_next/static/chunks/pages/jobs-a75029b67aab6a2e.js +0 -1
  488. sky/dashboard/out/_next/static/chunks/webpack-830f59b8404e96b8.js +0 -1
  489. sky/dashboard/out/_next/static/css/c6933bbb2ce7f4dd.css +0 -3
  490. sky/jobs/dashboard/dashboard.py +0 -223
  491. sky/jobs/dashboard/static/favicon.ico +0 -0
  492. sky/jobs/dashboard/templates/index.html +0 -831
  493. sky/jobs/server/dashboard_utils.py +0 -69
  494. sky/skylet/providers/scp/__init__.py +0 -2
  495. sky/skylet/providers/scp/config.py +0 -149
  496. sky/skylet/providers/scp/node_provider.py +0 -578
  497. sky/templates/kubernetes-ssh-jump.yml.j2 +0 -94
  498. sky/utils/db_utils.py +0 -100
  499. sky/utils/kubernetes/deploy_remote_cluster.sh +0 -308
  500. sky/utils/kubernetes/ssh_jump_lifecycle_manager.py +0 -191
  501. skypilot_nightly-1.0.0.dev20250509.dist-info/METADATA +0 -361
  502. skypilot_nightly-1.0.0.dev20250509.dist-info/RECORD +0 -396
  503. /sky/{clouds/service_catalog → catalog}/config.py +0 -0
  504. /sky/{benchmark → catalog/data_fetchers}/__init__.py +0 -0
  505. /sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_azure.py +0 -0
  506. /sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_fluidstack.py +0 -0
  507. /sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_ibm.py +0 -0
  508. /sky/{clouds/service_catalog/data_fetchers → client/cli}/__init__.py +0 -0
  509. /sky/dashboard/out/_next/static/{LksQgChY5izXjokL3LcEu → zB0ed6ge_W1MDszVHhijS}/_ssgManifest.js +0 -0
  510. {skypilot_nightly-1.0.0.dev20250509.dist-info → skypilot_nightly-1.0.0.dev20251107.dist-info}/entry_points.txt +0 -0
  511. {skypilot_nightly-1.0.0.dev20250509.dist-info → skypilot_nightly-1.0.0.dev20251107.dist-info}/licenses/LICENSE +0 -0
  512. {skypilot_nightly-1.0.0.dev20250509.dist-info → skypilot_nightly-1.0.0.dev20251107.dist-info}/top_level.txt +0 -0
sky/skylet/job_lib.py CHANGED
@@ -3,6 +3,7 @@
3
3
  This is a remote utility module that provides job queue functionality.
4
4
  """
5
5
  import enum
6
+ import functools
6
7
  import getpass
7
8
  import json
8
9
  import os
@@ -10,9 +11,10 @@ import pathlib
10
11
  import shlex
11
12
  import signal
12
13
  import sqlite3
14
+ import threading
13
15
  import time
14
16
  import typing
15
- from typing import Any, Dict, List, Optional, Sequence
17
+ from typing import Any, Dict, List, Optional, Sequence, Tuple
16
18
 
17
19
  import colorama
18
20
  import filelock
@@ -22,15 +24,17 @@ from sky import sky_logging
22
24
  from sky.adaptors import common as adaptors_common
23
25
  from sky.skylet import constants
24
26
  from sky.utils import common_utils
25
- from sky.utils import db_utils
26
- from sky.utils import log_utils
27
27
  from sky.utils import message_utils
28
28
  from sky.utils import subprocess_utils
29
+ from sky.utils.db import db_utils
29
30
 
30
31
  if typing.TYPE_CHECKING:
31
32
  import psutil
33
+
34
+ from sky.schemas.generated import jobsv1_pb2
32
35
  else:
33
36
  psutil = adaptors_common.LazyImport('psutil')
37
+ jobsv1_pb2 = adaptors_common.LazyImport('sky.schemas.generated.jobsv1_pb2')
34
38
 
35
39
  logger = sky_logging.init_logger(__name__)
36
40
 
@@ -60,10 +64,8 @@ class JobInfoLoc(enum.IntEnum):
60
64
  END_AT = 7
61
65
  RESOURCES = 8
62
66
  PID = 9
63
-
64
-
65
- _DB_PATH = os.path.expanduser('~/.sky/jobs.db')
66
- os.makedirs(pathlib.Path(_DB_PATH).parents[0], exist_ok=True)
67
+ LOG_PATH = 10
68
+ METADATA = 11
67
69
 
68
70
 
69
71
  def create_table(cursor, conn):
@@ -103,7 +105,9 @@ def create_table(cursor, conn):
103
105
  start_at FLOAT DEFAULT -1,
104
106
  end_at FLOAT DEFAULT NULL,
105
107
  resources TEXT DEFAULT NULL,
106
- pid INTEGER DEFAULT -1)""")
108
+ pid INTEGER DEFAULT -1,
109
+ log_dir TEXT DEFAULT NULL,
110
+ metadata TEXT DEFAULT '{}')""")
107
111
 
108
112
  cursor.execute("""CREATE TABLE IF NOT EXISTS pending_jobs(
109
113
  job_id INTEGER,
@@ -116,12 +120,38 @@ def create_table(cursor, conn):
116
120
  db_utils.add_column_to_table(cursor, conn, 'jobs', 'resources', 'TEXT')
117
121
  db_utils.add_column_to_table(cursor, conn, 'jobs', 'pid',
118
122
  'INTEGER DEFAULT -1')
123
+ db_utils.add_column_to_table(cursor, conn, 'jobs', 'log_dir',
124
+ 'TEXT DEFAULT NULL')
125
+ db_utils.add_column_to_table(cursor,
126
+ conn,
127
+ 'jobs',
128
+ 'metadata',
129
+ 'TEXT DEFAULT \'{}\'',
130
+ value_to_replace_existing_entries='{}')
119
131
  conn.commit()
120
132
 
121
133
 
122
- _DB = db_utils.SQLiteConn(_DB_PATH, create_table)
123
- _CURSOR = _DB.cursor
124
- _CONN = _DB.conn
134
+ _DB = None
135
+ _db_init_lock = threading.Lock()
136
+
137
+
138
+ def init_db(func):
139
+ """Initialize the database."""
140
+
141
+ @functools.wraps(func)
142
+ def wrapper(*args, **kwargs):
143
+ global _DB
144
+ if _DB is not None:
145
+ return func(*args, **kwargs)
146
+
147
+ with _db_init_lock:
148
+ if _DB is None:
149
+ db_path = os.path.expanduser('~/.sky/jobs.db')
150
+ os.makedirs(pathlib.Path(db_path).parents[0], exist_ok=True)
151
+ _DB = db_utils.SQLiteConn(db_path, create_table)
152
+ return func(*args, **kwargs)
153
+
154
+ return wrapper
125
155
 
126
156
 
127
157
  class JobStatus(enum.Enum):
@@ -192,6 +222,45 @@ class JobStatus(enum.Enum):
192
222
  color = _JOB_STATUS_TO_COLOR[self]
193
223
  return f'{color}{self.value}{colorama.Style.RESET_ALL}'
194
224
 
225
+ @classmethod
226
+ def from_protobuf(
227
+ cls,
228
+ protobuf_value: 'jobsv1_pb2.JobStatus') -> Optional['JobStatus']:
229
+ """Convert protobuf JobStatus enum to Python enum value."""
230
+ protobuf_to_enum = {
231
+ jobsv1_pb2.JOB_STATUS_INIT: cls.INIT,
232
+ jobsv1_pb2.JOB_STATUS_PENDING: cls.PENDING,
233
+ jobsv1_pb2.JOB_STATUS_SETTING_UP: cls.SETTING_UP,
234
+ jobsv1_pb2.JOB_STATUS_RUNNING: cls.RUNNING,
235
+ jobsv1_pb2.JOB_STATUS_FAILED_DRIVER: cls.FAILED_DRIVER,
236
+ jobsv1_pb2.JOB_STATUS_SUCCEEDED: cls.SUCCEEDED,
237
+ jobsv1_pb2.JOB_STATUS_FAILED: cls.FAILED,
238
+ jobsv1_pb2.JOB_STATUS_FAILED_SETUP: cls.FAILED_SETUP,
239
+ jobsv1_pb2.JOB_STATUS_CANCELLED: cls.CANCELLED,
240
+ jobsv1_pb2.JOB_STATUS_UNSPECIFIED: None,
241
+ }
242
+ if protobuf_value not in protobuf_to_enum:
243
+ raise ValueError(
244
+ f'Unknown protobuf JobStatus value: {protobuf_value}')
245
+ return protobuf_to_enum[protobuf_value]
246
+
247
+ def to_protobuf(self) -> 'jobsv1_pb2.JobStatus':
248
+ """Convert this Python enum value to protobuf enum value."""
249
+ enum_to_protobuf = {
250
+ JobStatus.INIT: jobsv1_pb2.JOB_STATUS_INIT,
251
+ JobStatus.PENDING: jobsv1_pb2.JOB_STATUS_PENDING,
252
+ JobStatus.SETTING_UP: jobsv1_pb2.JOB_STATUS_SETTING_UP,
253
+ JobStatus.RUNNING: jobsv1_pb2.JOB_STATUS_RUNNING,
254
+ JobStatus.FAILED_DRIVER: jobsv1_pb2.JOB_STATUS_FAILED_DRIVER,
255
+ JobStatus.SUCCEEDED: jobsv1_pb2.JOB_STATUS_SUCCEEDED,
256
+ JobStatus.FAILED: jobsv1_pb2.JOB_STATUS_FAILED,
257
+ JobStatus.FAILED_SETUP: jobsv1_pb2.JOB_STATUS_FAILED_SETUP,
258
+ JobStatus.CANCELLED: jobsv1_pb2.JOB_STATUS_CANCELLED,
259
+ }
260
+ if self not in enum_to_protobuf:
261
+ raise ValueError(f'Unknown JobStatus value: {self}')
262
+ return enum_to_protobuf[self]
263
+
195
264
 
196
265
  # We have two steps for job submissions:
197
266
  # 1. Client reserve a job id from the job table by adding a INIT state job.
@@ -210,30 +279,37 @@ _PRE_RESOURCE_STATUSES = [JobStatus.PENDING]
210
279
  class JobScheduler:
211
280
  """Base class for job scheduler"""
212
281
 
282
@init_db
def queue(self, job_id: int, cmd: str) -> None:
    """Record a job as pending, mark it PENDING, and run a scheduling step.

    Args:
        job_id: ID of the job to enqueue.
        cmd: Command string stored with the pending job.
    """
    assert _DB is not None
    # Values are bound positionally; the third is a literal 0 and the fourth
    # is the current epoch time in whole seconds.
    # NOTE(review): presumably these land in (job_id, run_cmd, submit,
    # created_time) -- confirm against the pending_jobs schema.
    pending_row = (job_id, cmd, 0, int(time.time()))
    _DB.cursor.execute('INSERT INTO pending_jobs VALUES (?,?,?,?)',
                       pending_row)
    _DB.conn.commit()
    set_status(job_id, JobStatus.PENDING)
    self.schedule_step()
219
290
 
291
@init_db
def remove_job_no_lock(self, job_id: int) -> None:
    """Delete the job's row from ``pending_jobs`` and commit.

    As the name indicates, no lock is taken inside this method.

    Args:
        job_id: ID of the pending job to remove.
    """
    assert _DB is not None
    # Bind job_id as a parameter rather than interpolating it into the SQL
    # text, matching the other queries in this module.
    _DB.cursor.execute('DELETE FROM pending_jobs WHERE job_id=(?)',
                       (job_id,))
    _DB.conn.commit()
223
296
 
297
@init_db
def _run_job(self, job_id: int, run_cmd: str):
    """Launch a pending job and record its submit time and pid.

    Args:
        job_id: ID of the job to launch.
        run_cmd: Command passed to
            ``subprocess_utils.launch_new_process_tree``.
    """
    assert _DB is not None
    # Stamp the submit time (whole epoch seconds) before launching. Values
    # are bound as parameters rather than interpolated into the SQL text.
    _DB.cursor.execute('UPDATE pending_jobs SET submit=(?) WHERE job_id=(?)',
                       (int(time.time()), job_id))
    _DB.conn.commit()
    pid = subprocess_utils.launch_new_process_tree(run_cmd)
    # TODO(zhwu): Backward compatibility, remove this check after 0.10.0.
    # This is for the case where the job is submitted with SkyPilot older
    # than #4318, using ray job submit.
    if 'job submit' in run_cmd:
        pid = -1
    _DB.cursor.execute('UPDATE jobs SET pid=(?) WHERE job_id=(?)',
                       (pid, job_id))
    _DB.conn.commit()
237
313
 
238
314
  def schedule_step(self, force_update_jobs: bool = False) -> None:
239
315
  if force_update_jobs:
@@ -282,8 +358,10 @@ class JobScheduler:
282
358
  class FIFOScheduler(JobScheduler):
283
359
  """First in first out job scheduler"""
284
360
 
361
@init_db
def _get_pending_job_ids(self) -> List[int]:
    """Return every job_id in ``pending_jobs``, ordered by job_id."""
    assert _DB is not None
    cursor = _DB.cursor.execute(
        'SELECT job_id FROM pending_jobs ORDER BY job_id')
    pending_ids = []
    for record in cursor.fetchall():
        pending_ids.append(record[0])
    return pending_ids
289
367
 
@@ -308,26 +386,67 @@ def make_job_command_with_user_switching(username: str,
308
386
  return ['sudo', '-H', 'su', '--login', username, '-c', command]
309
387
 
310
388
 
311
@init_db
def add_job(job_name: str,
            username: str,
            run_timestamp: str,
            resources_str: str,
            metadata: str = '{}') -> Tuple[int, str]:
    """Atomically reserve the next available job id for the user.

    Inserts a row in INIT status, looks the new job id up again by
    ``run_timestamp``, and stores the job's log directory.

    Args:
        job_name: Name of the job; also used in the log directory name.
        username: Value stored in the username column.
        run_timestamp: Run timestamp string; used to look the inserted row
            up again.
        resources_str: Resources description stored with the job.
        metadata: String stored in the metadata column. Defaults to '{}'.

    Returns:
        A ``(job_id, log_dir)`` tuple, where ``log_dir`` is
        ``<SKY_LOGS_DIRECTORY>/<job_id>-<job_name>``.
    """
    assert _DB is not None
    job_submitted_at = time.time()
    # job_id will autoincrement with the null value
    _DB.cursor.execute(
        'INSERT INTO jobs VALUES (null, ?, ?, ?, ?, ?, ?, null, ?, 0, null, ?)',
        (job_name, username, job_submitted_at, JobStatus.INIT.value,
         run_timestamp, None, resources_str, metadata))
    _DB.conn.commit()
    rows = _DB.cursor.execute('SELECT job_id FROM jobs WHERE run_timestamp=(?)',
                              (run_timestamp,))
    # Initialize so that an empty result trips the assertion below instead of
    # raising UnboundLocalError.
    job_id = None
    for row in rows:
        job_id = row[0]
    assert job_id is not None
    log_dir = os.path.join(constants.SKY_LOGS_DIRECTORY, f'{job_id}-{job_name}')
    set_log_dir_no_lock(job_id, log_dir)
    return job_id, log_dir
412
+
413
+
414
@init_db
def set_log_dir_no_lock(job_id: int, log_dir: str) -> None:
    """Store the log directory of a job in the jobs table.

    The directory is persisted per job so that the rule used to generate log
    directories can change between versions.

    Args:
        job_id: The ID of the job.
        log_dir: The log directory for the job.
    """
    assert _DB is not None
    update_sql = 'UPDATE jobs SET log_dir=(?) WHERE job_id=(?)'
    _DB.cursor.execute(update_sql, (log_dir, job_id))
    _DB.conn.commit()
429
+
430
+
431
@init_db
def get_log_dir_for_job(job_id: int) -> Optional[str]:
    """Look up the stored log directory of a job.

    Args:
        job_id: The ID of the job.

    Returns:
        The log_dir column of the first matching row, or None when no row
        matches.
    """
    assert _DB is not None
    cursor = _DB.cursor.execute('SELECT log_dir FROM jobs WHERE job_id=(?)',
                                (job_id,))
    first = cursor.fetchone()
    if first is None:
        return None
    return first[0]
327
444
 
328
445
 
446
+ @init_db
329
447
  def _set_status_no_lock(job_id: int, status: JobStatus) -> None:
330
448
  """Setting the status of the job in the database."""
449
+ assert _DB is not None
331
450
  assert status != JobStatus.RUNNING, (
332
451
  'Please use set_job_started() to set job status to RUNNING')
333
452
  if status.is_terminal():
@@ -339,15 +458,15 @@ def _set_status_no_lock(job_id: int, status: JobStatus) -> None:
339
458
  check_end_at_str = ' AND end_at IS NULL'
340
459
  if status != JobStatus.FAILED_SETUP:
341
460
  check_end_at_str = ''
342
- _CURSOR.execute(
461
+ _DB.cursor.execute(
343
462
  'UPDATE jobs SET status=(?), end_at=(?) '
344
463
  f'WHERE job_id=(?) {check_end_at_str}',
345
464
  (status.value, end_at, job_id))
346
465
  else:
347
- _CURSOR.execute(
466
+ _DB.cursor.execute(
348
467
  'UPDATE jobs SET status=(?), end_at=NULL '
349
468
  'WHERE job_id=(?)', (status.value, job_id))
350
- _CONN.commit()
469
+ _DB.conn.commit()
351
470
 
352
471
 
353
472
  def set_status(job_id: int, status: JobStatus) -> None:
@@ -357,16 +476,19 @@ def set_status(job_id: int, status: JobStatus) -> None:
357
476
  _set_status_no_lock(job_id, status)
358
477
 
359
478
 
479
@init_db
def set_job_started(job_id: int) -> None:
    """Mark a job as RUNNING under its file lock.

    Sets status to RUNNING, start_at to the current time, and clears end_at.

    Args:
        job_id: ID of the job that started.
    """
    assert _DB is not None
    running_value = JobStatus.RUNNING.value
    # TODO(mraheja): remove pylint disabling when filelock version updated.
    # pylint: disable=abstract-class-instantiated
    with filelock.FileLock(_get_lock_path(job_id)):
        # The start time is sampled while the lock is held.
        params = (running_value, time.time(), job_id)
        _DB.cursor.execute(
            'UPDATE jobs SET status=(?), start_at=(?), end_at=NULL '
            'WHERE job_id=(?)', params)
        _DB.conn.commit()
368
489
 
369
490
 
491
+ @init_db
370
492
  def get_status_no_lock(job_id: int) -> Optional[JobStatus]:
371
493
  """Get the status of the job with the given id.
372
494
 
@@ -375,8 +497,9 @@ def get_status_no_lock(job_id: int) -> Optional[JobStatus]:
375
497
  the status in a while loop as in `log_lib._follow_job_logs`. Otherwise, use
376
498
  `get_status`.
377
499
  """
378
- rows = _CURSOR.execute('SELECT status FROM jobs WHERE job_id=(?)',
379
- (job_id,))
500
+ assert _DB is not None
501
+ rows = _DB.cursor.execute('SELECT status FROM jobs WHERE job_id=(?)',
502
+ (job_id,))
380
503
  for (status,) in rows:
381
504
  if status is None:
382
505
  return None
@@ -391,17 +514,65 @@ def get_status(job_id: int) -> Optional[JobStatus]:
391
514
  return get_status_no_lock(job_id)
392
515
 
393
516
 
517
@init_db
def get_statuses_payload(job_ids: List[Optional[int]]) -> str:
    """Return the result of ``get_statuses`` as an encoded payload.

    Args:
        job_ids: Job IDs to look up.

    Returns:
        The status mapping encoded with ``message_utils.encode_payload``.
    """
    statuses = get_statuses(job_ids)
    return message_utils.encode_payload(statuses)
520
+
521
+
522
@init_db
def get_statuses(job_ids: List[int]) -> Dict[int, Optional[str]]:
    """Fetch the stored status value for each of the given jobs.

    Args:
        job_ids: Job IDs to look up.

    Returns:
        A dict keyed by every requested job id. The value is the job's status
        column, or None when the jobs table has no row for that id.
    """
    assert _DB is not None
    # Per-job lock is not required here, since the staled job status will not
    # affect the caller.
    placeholders = ','.join(['?'] * len(job_ids))
    cursor = _DB.cursor.execute(
        f'SELECT job_id, status FROM jobs WHERE job_id IN ({placeholders})',
        job_ids)
    # Start with every requested id mapped to None, then fill in found rows.
    statuses: Dict[int, Optional[str]] = dict.fromkeys(job_ids)
    for found_id, found_status in cursor:
        statuses[found_id] = found_status
    return statuses
535
+
536
+
537
@init_db
def get_jobs_info(user_hash: Optional[str] = None,
                  all_jobs: bool = False) -> List['jobsv1_pb2.JobInfo']:
    """Get detailed job information as protobuf messages.

    Similar to dump_job_queue but returns structured protobuf objects instead
    of encoded strings.

    Args:
        user_hash: The user hash to show jobs for. Show all the users if None.
        all_jobs: Whether to show all jobs, not just the pending/running ones.

    Returns:
        One ``jobsv1_pb2.JobInfo`` per record returned by ``_get_jobs``, in
        the same order.
    """
    assert _DB is not None

    # None means "no status filter"; otherwise restrict to in-flight jobs.
    status_list: Optional[List[JobStatus]] = None
    if not all_jobs:
        status_list = [
            JobStatus.SETTING_UP, JobStatus.PENDING, JobStatus.RUNNING
        ]

    return [
        jobsv1_pb2.JobInfo(
            job_id=record['job_id'],
            job_name=record['job_name'],
            username=record['username'],
            submitted_at=record['submitted_at'],
            status=record['status'].to_protobuf(),
            run_timestamp=record['run_timestamp'],
            start_at=record['start_at'],
            end_at=record['end_at'],
            resources=record['resources'],
            pid=record['pid'],
            # The log path is derived from the run timestamp here.
            log_path=os.path.join(constants.SKY_LOGS_DIRECTORY,
                                  record['run_timestamp']),
            metadata=json.dumps(record['metadata']))
        for record in _get_jobs(user_hash, status_list=status_list)
    ]
405
576
 
406
577
 
407
578
  def load_statuses_payload(
@@ -419,14 +590,17 @@ def load_statuses_payload(
419
590
  return statuses
420
591
 
421
592
 
593
@init_db
def get_latest_job_id() -> Optional[int]:
    """Return the largest job_id in the jobs table, or None if it is empty."""
    assert _DB is not None
    cursor = _DB.cursor.execute(
        'SELECT job_id FROM jobs ORDER BY job_id DESC LIMIT 1')
    newest = cursor.fetchone()
    if newest is None:
        return None
    (job_id,) = newest
    return job_id
428
601
 
429
602
 
603
+ @init_db
430
604
  def get_job_submitted_or_ended_timestamp_payload(job_id: int,
431
605
  get_ended_time: bool) -> str:
432
606
  """Get the job submitted/ended timestamp.
@@ -437,15 +611,27 @@ def get_job_submitted_or_ended_timestamp_payload(job_id: int,
437
611
  PENDING state.
438
612
 
439
613
  The normal job duration will use `start_at` instead of `submitted_at` (in
440
- `format_job_queue()`), because the job may stay in PENDING if the cluster is
441
- busy.
614
+ `table_utils.format_job_queue()`), because the job may stay in PENDING if
615
+ the cluster is busy.
616
+ """
617
+ return message_utils.encode_payload(
618
+ get_job_submitted_or_ended_timestamp(job_id, get_ended_time))
619
+
620
+
621
@init_db
def get_job_submitted_or_ended_timestamp(
        job_id: int, get_ended_time: bool) -> Optional[float]:
    """Get the raw submitted or ended timestamp of a job.

    Args:
        job_id: ID of the job to look up.
        get_ended_time: If True, read ``end_at``; otherwise read
            ``submitted_at``.

    Returns:
        The stored column value of the first matching row, or None if the job
        doesn't exist.
    """
    assert _DB is not None
    if get_ended_time:
        field = 'end_at'
    else:
        field = 'submitted_at'
    # `field` is one of two fixed column names, never caller-supplied text.
    cursor = _DB.cursor.execute(f'SELECT {field} FROM jobs WHERE job_id=(?)',
                                (job_id,))
    record = cursor.fetchone()
    if record is None:
        return None
    (timestamp,) = record
    return timestamp
449
635
 
450
636
 
451
637
  def get_ray_port():
@@ -492,14 +678,17 @@ def _get_records_from_rows(rows) -> List[Dict[str, Any]]:
492
678
  'end_at': row[JobInfoLoc.END_AT.value],
493
679
  'resources': row[JobInfoLoc.RESOURCES.value],
494
680
  'pid': row[JobInfoLoc.PID.value],
681
+ 'metadata': json.loads(row[JobInfoLoc.METADATA.value]),
495
682
  })
496
683
  return records
497
684
 
498
685
 
686
+ @init_db
499
687
  def _get_jobs(
500
688
  user_hash: Optional[str],
501
689
  status_list: Optional[List[JobStatus]] = None) -> List[Dict[str, Any]]:
502
690
  """Returns jobs with the given fields, sorted by job_id, descending."""
691
+ assert _DB is not None
503
692
  if status_list is None:
504
693
  status_list = list(JobStatus)
505
694
  status_str_list = [repr(status.value) for status in status_list]
@@ -509,14 +698,16 @@ def _get_jobs(
509
698
  # We use the old username field for compatibility.
510
699
  filter_str += ' AND username=(?)'
511
700
  params.append(user_hash)
512
- rows = _CURSOR.execute(
701
+ rows = _DB.cursor.execute(
513
702
  f'SELECT * FROM jobs {filter_str} ORDER BY job_id DESC', params)
514
703
  records = _get_records_from_rows(rows)
515
704
  return records
516
705
 
517
706
 
707
+ @init_db
518
708
  def _get_jobs_by_ids(job_ids: List[int]) -> List[Dict[str, Any]]:
519
- rows = _CURSOR.execute(
709
+ assert _DB is not None
710
+ rows = _DB.cursor.execute(
520
711
  f"""\
521
712
  SELECT * FROM jobs
522
713
  WHERE job_id IN ({','.join(['?'] * len(job_ids))})
@@ -527,8 +718,10 @@ def _get_jobs_by_ids(job_ids: List[int]) -> List[Dict[str, Any]]:
527
718
  return records
528
719
 
529
720
 
721
+ @init_db
530
722
  def _get_pending_job(job_id: int) -> Optional[Dict[str, Any]]:
531
- rows = _CURSOR.execute(
723
+ assert _DB is not None
724
+ rows = _DB.cursor.execute(
532
725
  'SELECT created_time, submit, run_cmd FROM pending_jobs '
533
726
  f'WHERE job_id={job_id!r}')
534
727
  for row in rows:
@@ -698,19 +891,29 @@ def update_job_status(job_ids: List[int],
698
891
  return statuses
699
892
 
700
893
 
894
@init_db
def fail_all_jobs_in_progress() -> None:
    """Set every job in a non-terminal status to FAILED_DRIVER and commit."""
    assert _DB is not None
    in_progress_status = []
    for status in JobStatus.nonterminal_statuses():
        in_progress_status.append(status.value)
    # One '?' placeholder per non-terminal status value.
    placeholders = ','.join(['?'] * len(in_progress_status))
    _DB.cursor.execute(
        f"""\
        UPDATE jobs SET status=(?)
        WHERE status IN ({placeholders})
        """, (JobStatus.FAILED_DRIVER.value, *in_progress_status))
    _DB.conn.commit()
711
906
 
712
907
 
713
908
  def update_status() -> None:
909
+ # This signal file suggests that the controller is recovering from a
910
+ # failure. See sky/jobs/utils.py::update_managed_jobs_statuses for more
911
+ # details. When recovering, we should not update the job status to failed
912
+ # driver as they will be recovered later.
913
+ if os.path.exists(
914
+ os.path.expanduser(
915
+ constants.PERSISTENT_RUN_RESTARTING_SIGNAL_FILE)):
916
+ return
714
917
  # This will be called periodically by the skylet to update the status
715
918
  # of the jobs in the database, to avoid stale job status.
716
919
  nonterminal_jobs = _get_jobs(user_hash=None,
@@ -720,12 +923,14 @@ def update_status() -> None:
720
923
  update_job_status(nonterminal_job_ids)
721
924
 
722
925
 
926
+ @init_db
723
927
  def is_cluster_idle() -> bool:
724
928
  """Returns if the cluster is idle (no in-flight jobs)."""
929
+ assert _DB is not None
725
930
  in_progress_status = [
726
931
  status.value for status in JobStatus.nonterminal_statuses()
727
932
  ]
728
- rows = _CURSOR.execute(
933
+ rows = _DB.cursor.execute(
729
934
  f"""\
730
935
  SELECT COUNT(*) FROM jobs
731
936
  WHERE status IN ({','.join(['?'] * len(in_progress_status))})
@@ -735,34 +940,6 @@ def is_cluster_idle() -> bool:
735
940
  assert False, 'Should not reach here'
736
941
 
737
942
 
738
- def format_job_queue(jobs: List[Dict[str, Any]]):
739
- """Format the job queue for display.
740
-
741
- Usage:
742
- jobs = get_job_queue()
743
- print(format_job_queue(jobs))
744
- """
745
- job_table = log_utils.create_table([
746
- 'ID', 'NAME', 'USER', 'SUBMITTED', 'STARTED', 'DURATION', 'RESOURCES',
747
- 'STATUS', 'LOG'
748
- ])
749
- for job in jobs:
750
- job_table.add_row([
751
- job['job_id'],
752
- job['job_name'],
753
- job['username'],
754
- log_utils.readable_time_duration(job['submitted_at']),
755
- log_utils.readable_time_duration(job['start_at']),
756
- log_utils.readable_time_duration(job['start_at'],
757
- job['end_at'],
758
- absolute=True),
759
- job['resources'],
760
- job['status'].colored_str(),
761
- job['log_path'],
762
- ])
763
- return job_table
764
-
765
-
766
943
  def dump_job_queue(user_hash: Optional[str], all_jobs: bool) -> str:
767
944
  """Get the job queue in encoded json format.
768
945
 
@@ -794,7 +971,8 @@ def load_job_queue(payload: str) -> List[Dict[str, Any]]:
794
971
  for job in jobs:
795
972
  job['status'] = JobStatus(job['status'])
796
973
  job['user_hash'] = job['username']
797
- job['username'] = global_user_state.get_user(job['user_hash']).name
974
+ user = global_user_state.get_user(job['user_hash'])
975
+ job['username'] = user.name if user is not None else None
798
976
  return jobs
799
977
 
800
978
 
@@ -838,6 +1016,13 @@ def cancel_jobs_encoded_results(jobs: Optional[List[int]],
838
1016
  Encoded job IDs that are actually cancelled. Caller should use
839
1017
  message_utils.decode_payload() to parse.
840
1018
  """
1019
+ return message_utils.encode_payload(cancel_jobs(jobs, cancel_all,
1020
+ user_hash))
1021
+
1022
+
1023
+ def cancel_jobs(jobs: Optional[List[int]],
1024
+ cancel_all: bool = False,
1025
+ user_hash: Optional[str] = None) -> List[int]:
841
1026
  job_records = []
842
1027
  all_status = [JobStatus.PENDING, JobStatus.SETTING_UP, JobStatus.RUNNING]
843
1028
  if jobs is None and not cancel_all:
@@ -901,36 +1086,55 @@ def cancel_jobs_encoded_results(jobs: Optional[List[int]],
901
1086
  cancelled_ids.append(job['job_id'])
902
1087
 
903
1088
  scheduler.schedule_step()
904
- return message_utils.encode_payload(cancelled_ids)
1089
+ return cancelled_ids
905
1090
 
906
1091
 
1092
@init_db
def get_run_timestamp(job_id: Optional[int]) -> Optional[str]:
    """Return the run_timestamp column of a job.

    Args:
        job_id: ID of the job to look up.

    Returns:
        The value at ``JobInfoLoc.RUN_TIMESTAMP`` in the job's row, or None
        when no row matches.
    """
    assert _DB is not None
    _DB.cursor.execute(
        """\
        SELECT * FROM jobs
        WHERE job_id=(?)""", (job_id,))
    record = _DB.cursor.fetchone()
    if record is not None:
        return record[JobInfoLoc.RUN_TIMESTAMP.value]
    return None
918
1105
 
919
1106
 
920
@init_db
def get_log_dir_for_jobs(job_ids: List[Optional[str]]) -> str:
    """Return the log directories of the matching jobs as an encoded payload.

    Args:
        job_ids: Job id patterns, matched with globbing by
            ``get_job_log_dirs``.

    Returns:
        A mapping from str(job_id) to log directory, encoded with
        ``message_utils.encode_payload``.
    """
    # Keys are stringified before encoding.
    job_to_dir_str: Dict[str, str] = {
        str(job_id): log_dir
        for job_id, log_dir in get_job_log_dirs(job_ids).items()
    }
    return message_utils.encode_payload(job_to_dir_str)
1116
+
1117
+
1118
@init_db
def get_job_log_dirs(job_ids: List[int]) -> Dict[int, str]:
    """Return the log directories of the jobs matching the given ids.

    Each entry of ``job_ids`` is matched against job_id with SQL ``GLOB``.

    Args:
        job_ids: Job ids (or glob patterns) to look up.

    Returns:
        A dict from job id to log directory. The value at
        ``JobInfoLoc.LOG_PATH`` in the row is used when it is set; otherwise
        the directory is rebuilt from the run timestamp under
        ``constants.SKY_LOGS_DIRECTORY``. Empty when ``job_ids`` is empty.
    """
    assert _DB is not None
    if not job_ids:
        # With no patterns the WHERE clause would be empty, which is a SQL
        # syntax error; nothing can match anyway.
        return {}
    query_str = ' OR '.join(['job_id GLOB (?)'] * len(job_ids))
    _DB.cursor.execute(
        f"""\
        SELECT * FROM jobs
        WHERE {query_str}""", job_ids)
    rows = _DB.cursor.fetchall()
    job_to_dir: Dict[int, str] = {}
    for row in rows:
        job_id = row[JobInfoLoc.JOB_ID.value]
        stored_log_dir = row[JobInfoLoc.LOG_PATH.value]
        if stored_log_dir:
            job_to_dir[job_id] = stored_log_dir
        else:
            # No stored directory for this row: fall back to the
            # run-timestamp based location.
            run_timestamp = row[JobInfoLoc.RUN_TIMESTAMP.value]
            job_to_dir[job_id] = os.path.join(constants.SKY_LOGS_DIRECTORY,
                                              run_timestamp)
    return job_to_dir
934
1138
 
935
1139
 
936
1140
  class JobLibCodeGen:
@@ -951,7 +1155,7 @@ class JobLibCodeGen:
951
1155
 
952
1156
  @classmethod
953
1157
  def add_job(cls, job_name: Optional[str], username: str, run_timestamp: str,
954
- resources_str: str) -> str:
1158
+ resources_str: str, metadata: str) -> str:
955
1159
  if job_name is None:
956
1160
  job_name = '-'
957
1161
  code = [
@@ -962,12 +1166,25 @@ class JobLibCodeGen:
962
1166
  '\nif int(constants.SKYLET_VERSION) < 9: '
963
1167
  'raise RuntimeError("SkyPilot runtime is too old, which does not '
964
1168
  'support submitting jobs.")',
965
- '\njob_id = job_lib.add_job('
1169
+ '\nresult = None',
1170
+ '\nif int(constants.SKYLET_VERSION) < 15: '
1171
+ '\n result = job_lib.add_job('
966
1172
  f'{job_name!r},'
967
1173
  f'{username!r},'
968
1174
  f'{run_timestamp!r},'
969
1175
  f'{resources_str!r})',
970
- 'print("Job ID: " + str(job_id), flush=True)',
1176
+ '\nelse: '
1177
+ '\n result = job_lib.add_job('
1178
+ f'{job_name!r},'
1179
+ f'{username!r},'
1180
+ f'{run_timestamp!r},'
1181
+ f'{resources_str!r},'
1182
+ f'metadata={metadata!r})',
1183
+ ('\nif isinstance(result, tuple):'
1184
+ '\n print("Job ID: " + str(result[0]), flush=True)'
1185
+ '\n print("Log Dir: " + str(result[1]), flush=True)'
1186
+ '\nelse:'
1187
+ '\n print("Job ID: " + str(result), flush=True)'),
971
1188
  ]
972
1189
  return cls._build(code)
973
1190
 
@@ -1036,9 +1253,17 @@ class JobLibCodeGen:
1036
1253
  # We use != instead of is not because 1 is not None will print a warning:
1037
1254
  # <stdin>:1: SyntaxWarning: "is not" with a literal. Did you mean "!="?
1038
1255
  f'job_id = {job_id} if {job_id} != None else job_lib.get_latest_job_id()',
1039
- 'run_timestamp = job_lib.get_run_timestamp(job_id)',
1040
- f'log_dir = None if run_timestamp is None else os.path.join({constants.SKY_LOGS_DIRECTORY!r}, run_timestamp)',
1041
- f'tail_log_kwargs = {{"job_id": job_id, "log_dir": log_dir, "managed_job_id": {managed_job_id!r}, "follow": {follow}}}',
1256
+ # For backward compatibility, use the legacy generation rule for
1257
+ # jobs submitted before 0.11.0.
1258
+ ('log_dir = None\n'
1259
+ 'if hasattr(job_lib, "get_log_dir_for_job"):\n'
1260
+ ' log_dir = job_lib.get_log_dir_for_job(job_id)\n'
1261
+ 'if log_dir is None:\n'
1262
+ ' run_timestamp = job_lib.get_run_timestamp(job_id)\n'
1263
+ f' log_dir = None if run_timestamp is None else os.path.join({constants.SKY_LOGS_DIRECTORY!r}, run_timestamp)'
1264
+ ),
1265
+ # Add a newline to leave the if indent block above.
1266
+ f'\ntail_log_kwargs = {{"job_id": job_id, "log_dir": log_dir, "managed_job_id": {managed_job_id!r}, "follow": {follow}}}',
1042
1267
  f'{_LINUX_NEW_LINE}if getattr(constants, "SKYLET_LIB_VERSION", 1) > 1: tail_log_kwargs["tail"] = {tail}',
1043
1268
  f'{_LINUX_NEW_LINE}log_lib.tail_logs(**tail_log_kwargs)',
1044
1269
  # After tailing, check the job status and exit with appropriate code
@@ -1047,6 +1272,10 @@ class JobLibCodeGen:
1047
1272
  # and older did not have JobExitCode, so we use 0 for those versions
1048
1273
  # TODO: Remove this special handling after 0.10.0.
1049
1274
  'exit_code = exceptions.JobExitCode.from_job_status(job_status) if getattr(constants, "SKYLET_LIB_VERSION", 1) > 2 else 0',
1275
+ # Fix for dashboard: When follow=False and job is still running (NOT_FINISHED=101),
1276
+ # exit with success (0) since fetching current logs is a successful operation.
1277
+ # This prevents shell wrappers from printing "command terminated with exit code 101".
1278
+ f'exit_code = 0 if not {follow} and exit_code == 101 else exit_code',
1050
1279
  'sys.exit(exit_code)',
1051
1280
  ]
1052
1281
  return cls._build(code)
@@ -1078,12 +1307,14 @@ class JobLibCodeGen:
1078
1307
  return cls._build(code)
1079
1308
 
1080
1309
  @classmethod
1081
def get_log_dirs_for_jobs(cls, job_ids: Optional[List[str]]) -> str:
    """Build the code snippet that prints the log dirs of the given jobs.

    Args:
        job_ids: Job ids to embed in the snippet. When None, the generated
            code falls back to ``job_lib.get_latest_job_id()``.

    Returns:
        Whatever ``cls._build`` returns for the generated code lines.
    """
    resolve_job_ids = (f'job_ids = {job_ids} if {job_ids} is not None '
                       'else [job_lib.get_latest_job_id()]')
    # TODO(aylei): backward compatibility, remove after 0.12.0.
    fetch_log_dirs = ('log_dirs = job_lib.get_log_dir_for_jobs(job_ids) if '
                      'hasattr(job_lib, "get_log_dir_for_jobs") else '
                      'job_lib.run_timestamp_with_globbing_payload(job_ids)')
    emit_log_dirs = 'print(log_dirs, flush=True)'
    return cls._build([resolve_job_ids, fetch_log_dirs, emit_log_dirs])
  return cls._build(code)