skypilot-nightly 1.0.0.dev20250509__py3-none-any.whl → 1.0.0.dev20251107__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of skypilot-nightly might be problematic. Click here for more details.

Files changed (512)
  1. sky/__init__.py +22 -6
  2. sky/adaptors/aws.py +25 -7
  3. sky/adaptors/common.py +24 -1
  4. sky/adaptors/coreweave.py +278 -0
  5. sky/adaptors/do.py +8 -2
  6. sky/adaptors/hyperbolic.py +8 -0
  7. sky/adaptors/kubernetes.py +149 -18
  8. sky/adaptors/nebius.py +170 -17
  9. sky/adaptors/primeintellect.py +1 -0
  10. sky/adaptors/runpod.py +68 -0
  11. sky/adaptors/seeweb.py +167 -0
  12. sky/adaptors/shadeform.py +89 -0
  13. sky/admin_policy.py +187 -4
  14. sky/authentication.py +179 -225
  15. sky/backends/__init__.py +4 -2
  16. sky/backends/backend.py +22 -9
  17. sky/backends/backend_utils.py +1299 -380
  18. sky/backends/cloud_vm_ray_backend.py +1715 -518
  19. sky/backends/docker_utils.py +1 -1
  20. sky/backends/local_docker_backend.py +11 -6
  21. sky/backends/wheel_utils.py +37 -9
  22. sky/{clouds/service_catalog → catalog}/__init__.py +21 -19
  23. sky/{clouds/service_catalog → catalog}/aws_catalog.py +27 -8
  24. sky/{clouds/service_catalog → catalog}/azure_catalog.py +10 -7
  25. sky/{clouds/service_catalog → catalog}/common.py +89 -48
  26. sky/{clouds/service_catalog → catalog}/cudo_catalog.py +8 -5
  27. sky/{clouds/service_catalog → catalog}/data_fetchers/analyze.py +1 -1
  28. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_aws.py +30 -40
  29. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_cudo.py +38 -38
  30. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_gcp.py +42 -15
  31. sky/catalog/data_fetchers/fetch_hyperbolic.py +136 -0
  32. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_lambda_cloud.py +1 -0
  33. sky/catalog/data_fetchers/fetch_nebius.py +335 -0
  34. sky/catalog/data_fetchers/fetch_runpod.py +698 -0
  35. sky/catalog/data_fetchers/fetch_seeweb.py +329 -0
  36. sky/catalog/data_fetchers/fetch_shadeform.py +142 -0
  37. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_vast.py +1 -1
  38. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_vsphere.py +1 -1
  39. sky/{clouds/service_catalog → catalog}/do_catalog.py +5 -2
  40. sky/{clouds/service_catalog → catalog}/fluidstack_catalog.py +6 -3
  41. sky/{clouds/service_catalog → catalog}/gcp_catalog.py +41 -15
  42. sky/catalog/hyperbolic_catalog.py +136 -0
  43. sky/{clouds/service_catalog → catalog}/ibm_catalog.py +9 -6
  44. sky/{clouds/service_catalog → catalog}/kubernetes_catalog.py +36 -24
  45. sky/{clouds/service_catalog → catalog}/lambda_catalog.py +9 -6
  46. sky/{clouds/service_catalog → catalog}/nebius_catalog.py +9 -7
  47. sky/{clouds/service_catalog → catalog}/oci_catalog.py +9 -6
  48. sky/{clouds/service_catalog → catalog}/paperspace_catalog.py +5 -2
  49. sky/catalog/primeintellect_catalog.py +95 -0
  50. sky/{clouds/service_catalog → catalog}/runpod_catalog.py +11 -4
  51. sky/{clouds/service_catalog → catalog}/scp_catalog.py +9 -6
  52. sky/catalog/seeweb_catalog.py +184 -0
  53. sky/catalog/shadeform_catalog.py +165 -0
  54. sky/catalog/ssh_catalog.py +167 -0
  55. sky/{clouds/service_catalog → catalog}/vast_catalog.py +6 -3
  56. sky/{clouds/service_catalog → catalog}/vsphere_catalog.py +5 -2
  57. sky/check.py +491 -203
  58. sky/cli.py +5 -6005
  59. sky/client/{cli.py → cli/command.py} +2477 -1885
  60. sky/client/cli/deprecation_utils.py +99 -0
  61. sky/client/cli/flags.py +359 -0
  62. sky/client/cli/table_utils.py +320 -0
  63. sky/client/common.py +70 -32
  64. sky/client/oauth.py +82 -0
  65. sky/client/sdk.py +1203 -297
  66. sky/client/sdk_async.py +833 -0
  67. sky/client/service_account_auth.py +47 -0
  68. sky/cloud_stores.py +73 -0
  69. sky/clouds/__init__.py +13 -0
  70. sky/clouds/aws.py +358 -93
  71. sky/clouds/azure.py +105 -83
  72. sky/clouds/cloud.py +127 -36
  73. sky/clouds/cudo.py +68 -50
  74. sky/clouds/do.py +66 -48
  75. sky/clouds/fluidstack.py +63 -44
  76. sky/clouds/gcp.py +339 -110
  77. sky/clouds/hyperbolic.py +293 -0
  78. sky/clouds/ibm.py +70 -49
  79. sky/clouds/kubernetes.py +563 -162
  80. sky/clouds/lambda_cloud.py +74 -54
  81. sky/clouds/nebius.py +206 -80
  82. sky/clouds/oci.py +88 -66
  83. sky/clouds/paperspace.py +61 -44
  84. sky/clouds/primeintellect.py +317 -0
  85. sky/clouds/runpod.py +164 -74
  86. sky/clouds/scp.py +89 -83
  87. sky/clouds/seeweb.py +466 -0
  88. sky/clouds/shadeform.py +400 -0
  89. sky/clouds/ssh.py +263 -0
  90. sky/clouds/utils/aws_utils.py +10 -4
  91. sky/clouds/utils/gcp_utils.py +87 -11
  92. sky/clouds/utils/oci_utils.py +38 -14
  93. sky/clouds/utils/scp_utils.py +177 -124
  94. sky/clouds/vast.py +99 -77
  95. sky/clouds/vsphere.py +51 -40
  96. sky/core.py +349 -139
  97. sky/dag.py +15 -0
  98. sky/dashboard/out/404.html +1 -1
  99. sky/dashboard/out/_next/static/chunks/1141-e6aa9ab418717c59.js +11 -0
  100. sky/dashboard/out/_next/static/chunks/1272-1ef0bf0237faccdb.js +1 -0
  101. sky/dashboard/out/_next/static/chunks/1871-74503c8e80fd253b.js +6 -0
  102. sky/dashboard/out/_next/static/chunks/2260-7703229c33c5ebd5.js +1 -0
  103. sky/dashboard/out/_next/static/chunks/2350.fab69e61bac57b23.js +1 -0
  104. sky/dashboard/out/_next/static/chunks/2369.fc20f0c2c8ed9fe7.js +15 -0
  105. sky/dashboard/out/_next/static/chunks/2755.fff53c4a3fcae910.js +26 -0
  106. sky/dashboard/out/_next/static/chunks/3294.72362fa129305b19.js +1 -0
  107. sky/dashboard/out/_next/static/chunks/3785.ad6adaa2a0fa9768.js +1 -0
  108. sky/dashboard/out/_next/static/chunks/3850-ff4a9a69d978632b.js +1 -0
  109. sky/dashboard/out/_next/static/chunks/3937.210053269f121201.js +1 -0
  110. sky/dashboard/out/_next/static/chunks/4725.a830b5c9e7867c92.js +1 -0
  111. sky/dashboard/out/_next/static/chunks/4937.a2baa2df5572a276.js +15 -0
  112. sky/dashboard/out/_next/static/chunks/5739-d67458fcb1386c92.js +8 -0
  113. sky/dashboard/out/_next/static/chunks/6130-2be46d70a38f1e82.js +1 -0
  114. sky/dashboard/out/_next/static/chunks/616-3d59f75e2ccf9321.js +39 -0
  115. sky/dashboard/out/_next/static/chunks/6212-7bd06f60ba693125.js +13 -0
  116. sky/dashboard/out/_next/static/chunks/6601-06114c982db410b6.js +1 -0
  117. sky/dashboard/out/_next/static/chunks/6856-ef8ba11f96d8c4a3.js +1 -0
  118. sky/dashboard/out/_next/static/chunks/6989-01359c57e018caa4.js +1 -0
  119. sky/dashboard/out/_next/static/chunks/6990-32b6e2d3822301fa.js +1 -0
  120. sky/dashboard/out/_next/static/chunks/7359-c8d04e06886000b3.js +30 -0
  121. sky/dashboard/out/_next/static/chunks/7411-b15471acd2cba716.js +41 -0
  122. sky/dashboard/out/_next/static/chunks/7615-3301e838e5f25772.js +1 -0
  123. sky/dashboard/out/_next/static/chunks/8640.5b9475a2d18c5416.js +16 -0
  124. sky/dashboard/out/_next/static/chunks/8969-1e4613c651bf4051.js +1 -0
  125. sky/dashboard/out/_next/static/chunks/9025.fa408f3242e9028d.js +6 -0
  126. sky/dashboard/out/_next/static/chunks/9353-cff34f7e773b2e2b.js +1 -0
  127. sky/dashboard/out/_next/static/chunks/9360.7310982cf5a0dc79.js +31 -0
  128. sky/dashboard/out/_next/static/chunks/9847.3aaca6bb33455140.js +30 -0
  129. sky/dashboard/out/_next/static/chunks/fd9d1056-86323a29a8f7e46a.js +1 -0
  130. sky/dashboard/out/_next/static/chunks/framework-cf60a09ccd051a10.js +33 -0
  131. sky/dashboard/out/_next/static/chunks/main-app-587214043926b3cc.js +1 -0
  132. sky/dashboard/out/_next/static/chunks/main-f15ccb73239a3bf1.js +1 -0
  133. sky/dashboard/out/_next/static/chunks/pages/_app-bde01e4a2beec258.js +34 -0
  134. sky/dashboard/out/_next/static/chunks/pages/_error-c66a4e8afc46f17b.js +1 -0
  135. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-c736ead69c2d86ec.js +16 -0
  136. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-a37d2063af475a1c.js +1 -0
  137. sky/dashboard/out/_next/static/chunks/pages/clusters-d44859594e6f8064.js +1 -0
  138. sky/dashboard/out/_next/static/chunks/pages/config-dfb9bf07b13045f4.js +1 -0
  139. sky/dashboard/out/_next/static/chunks/pages/index-444f1804401f04ea.js +1 -0
  140. sky/dashboard/out/_next/static/chunks/pages/infra/[context]-c0b5935149902e6f.js +1 -0
  141. sky/dashboard/out/_next/static/chunks/pages/infra-aed0ea19df7cf961.js +1 -0
  142. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-5796e8d6aea291a0.js +16 -0
  143. sky/dashboard/out/_next/static/chunks/pages/jobs/pools/[pool]-6edeb7d06032adfc.js +21 -0
  144. sky/dashboard/out/_next/static/chunks/pages/jobs-479dde13399cf270.js +1 -0
  145. sky/dashboard/out/_next/static/chunks/pages/users-5ab3b907622cf0fe.js +1 -0
  146. sky/dashboard/out/_next/static/chunks/pages/volumes-b84b948ff357c43e.js +1 -0
  147. sky/dashboard/out/_next/static/chunks/pages/workspace/new-3f88a1c7e86a3f86.js +1 -0
  148. sky/dashboard/out/_next/static/chunks/pages/workspaces/[name]-c5a3eeee1c218af1.js +1 -0
  149. sky/dashboard/out/_next/static/chunks/pages/workspaces-22b23febb3e89ce1.js +1 -0
  150. sky/dashboard/out/_next/static/chunks/webpack-2679be77fc08a2f8.js +1 -0
  151. sky/dashboard/out/_next/static/css/0748ce22df867032.css +3 -0
  152. sky/dashboard/out/_next/static/zB0ed6ge_W1MDszVHhijS/_buildManifest.js +1 -0
  153. sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
  154. sky/dashboard/out/clusters/[cluster].html +1 -1
  155. sky/dashboard/out/clusters.html +1 -1
  156. sky/dashboard/out/config.html +1 -0
  157. sky/dashboard/out/index.html +1 -1
  158. sky/dashboard/out/infra/[context].html +1 -0
  159. sky/dashboard/out/infra.html +1 -0
  160. sky/dashboard/out/jobs/[job].html +1 -1
  161. sky/dashboard/out/jobs/pools/[pool].html +1 -0
  162. sky/dashboard/out/jobs.html +1 -1
  163. sky/dashboard/out/users.html +1 -0
  164. sky/dashboard/out/volumes.html +1 -0
  165. sky/dashboard/out/workspace/new.html +1 -0
  166. sky/dashboard/out/workspaces/[name].html +1 -0
  167. sky/dashboard/out/workspaces.html +1 -0
  168. sky/data/data_utils.py +137 -1
  169. sky/data/mounting_utils.py +269 -84
  170. sky/data/storage.py +1451 -1807
  171. sky/data/storage_utils.py +43 -57
  172. sky/exceptions.py +132 -2
  173. sky/execution.py +206 -63
  174. sky/global_user_state.py +2374 -586
  175. sky/jobs/__init__.py +5 -0
  176. sky/jobs/client/sdk.py +242 -65
  177. sky/jobs/client/sdk_async.py +143 -0
  178. sky/jobs/constants.py +9 -8
  179. sky/jobs/controller.py +839 -277
  180. sky/jobs/file_content_utils.py +80 -0
  181. sky/jobs/log_gc.py +201 -0
  182. sky/jobs/recovery_strategy.py +398 -152
  183. sky/jobs/scheduler.py +315 -189
  184. sky/jobs/server/core.py +829 -255
  185. sky/jobs/server/server.py +156 -115
  186. sky/jobs/server/utils.py +136 -0
  187. sky/jobs/state.py +2092 -701
  188. sky/jobs/utils.py +1242 -160
  189. sky/logs/__init__.py +21 -0
  190. sky/logs/agent.py +108 -0
  191. sky/logs/aws.py +243 -0
  192. sky/logs/gcp.py +91 -0
  193. sky/metrics/__init__.py +0 -0
  194. sky/metrics/utils.py +443 -0
  195. sky/models.py +78 -1
  196. sky/optimizer.py +164 -70
  197. sky/provision/__init__.py +90 -4
  198. sky/provision/aws/config.py +147 -26
  199. sky/provision/aws/instance.py +135 -50
  200. sky/provision/azure/instance.py +10 -5
  201. sky/provision/common.py +13 -1
  202. sky/provision/cudo/cudo_machine_type.py +1 -1
  203. sky/provision/cudo/cudo_utils.py +14 -8
  204. sky/provision/cudo/cudo_wrapper.py +72 -71
  205. sky/provision/cudo/instance.py +10 -6
  206. sky/provision/do/instance.py +10 -6
  207. sky/provision/do/utils.py +4 -3
  208. sky/provision/docker_utils.py +114 -23
  209. sky/provision/fluidstack/instance.py +13 -8
  210. sky/provision/gcp/__init__.py +1 -0
  211. sky/provision/gcp/config.py +301 -19
  212. sky/provision/gcp/constants.py +218 -0
  213. sky/provision/gcp/instance.py +36 -8
  214. sky/provision/gcp/instance_utils.py +18 -4
  215. sky/provision/gcp/volume_utils.py +247 -0
  216. sky/provision/hyperbolic/__init__.py +12 -0
  217. sky/provision/hyperbolic/config.py +10 -0
  218. sky/provision/hyperbolic/instance.py +437 -0
  219. sky/provision/hyperbolic/utils.py +373 -0
  220. sky/provision/instance_setup.py +93 -14
  221. sky/provision/kubernetes/__init__.py +5 -0
  222. sky/provision/kubernetes/config.py +9 -52
  223. sky/provision/kubernetes/constants.py +17 -0
  224. sky/provision/kubernetes/instance.py +789 -247
  225. sky/provision/kubernetes/manifests/fusermount-server-daemonset.yaml +1 -2
  226. sky/provision/kubernetes/network.py +27 -17
  227. sky/provision/kubernetes/network_utils.py +40 -43
  228. sky/provision/kubernetes/utils.py +1192 -531
  229. sky/provision/kubernetes/volume.py +282 -0
  230. sky/provision/lambda_cloud/instance.py +22 -16
  231. sky/provision/nebius/constants.py +50 -0
  232. sky/provision/nebius/instance.py +19 -6
  233. sky/provision/nebius/utils.py +196 -91
  234. sky/provision/oci/instance.py +10 -5
  235. sky/provision/paperspace/instance.py +10 -7
  236. sky/provision/paperspace/utils.py +1 -1
  237. sky/provision/primeintellect/__init__.py +10 -0
  238. sky/provision/primeintellect/config.py +11 -0
  239. sky/provision/primeintellect/instance.py +454 -0
  240. sky/provision/primeintellect/utils.py +398 -0
  241. sky/provision/provisioner.py +110 -36
  242. sky/provision/runpod/__init__.py +5 -0
  243. sky/provision/runpod/instance.py +27 -6
  244. sky/provision/runpod/utils.py +51 -18
  245. sky/provision/runpod/volume.py +180 -0
  246. sky/provision/scp/__init__.py +15 -0
  247. sky/provision/scp/config.py +93 -0
  248. sky/provision/scp/instance.py +531 -0
  249. sky/provision/seeweb/__init__.py +11 -0
  250. sky/provision/seeweb/config.py +13 -0
  251. sky/provision/seeweb/instance.py +807 -0
  252. sky/provision/shadeform/__init__.py +11 -0
  253. sky/provision/shadeform/config.py +12 -0
  254. sky/provision/shadeform/instance.py +351 -0
  255. sky/provision/shadeform/shadeform_utils.py +83 -0
  256. sky/provision/ssh/__init__.py +18 -0
  257. sky/provision/vast/instance.py +13 -8
  258. sky/provision/vast/utils.py +10 -7
  259. sky/provision/vsphere/common/vim_utils.py +1 -2
  260. sky/provision/vsphere/instance.py +15 -10
  261. sky/provision/vsphere/vsphere_utils.py +9 -19
  262. sky/py.typed +0 -0
  263. sky/resources.py +844 -118
  264. sky/schemas/__init__.py +0 -0
  265. sky/schemas/api/__init__.py +0 -0
  266. sky/schemas/api/responses.py +225 -0
  267. sky/schemas/db/README +4 -0
  268. sky/schemas/db/env.py +90 -0
  269. sky/schemas/db/global_user_state/001_initial_schema.py +124 -0
  270. sky/schemas/db/global_user_state/002_add_workspace_to_cluster_history.py +35 -0
  271. sky/schemas/db/global_user_state/003_fix_initial_revision.py +61 -0
  272. sky/schemas/db/global_user_state/004_is_managed.py +34 -0
  273. sky/schemas/db/global_user_state/005_cluster_event.py +32 -0
  274. sky/schemas/db/global_user_state/006_provision_log.py +41 -0
  275. sky/schemas/db/global_user_state/007_cluster_event_request_id.py +34 -0
  276. sky/schemas/db/global_user_state/008_skylet_ssh_tunnel_metadata.py +34 -0
  277. sky/schemas/db/global_user_state/009_last_activity_and_launched_at.py +89 -0
  278. sky/schemas/db/global_user_state/010_save_ssh_key.py +66 -0
  279. sky/schemas/db/script.py.mako +28 -0
  280. sky/schemas/db/serve_state/001_initial_schema.py +67 -0
  281. sky/schemas/db/skypilot_config/001_initial_schema.py +30 -0
  282. sky/schemas/db/spot_jobs/001_initial_schema.py +97 -0
  283. sky/schemas/db/spot_jobs/002_cluster_pool.py +42 -0
  284. sky/schemas/db/spot_jobs/003_pool_hash.py +34 -0
  285. sky/schemas/db/spot_jobs/004_job_file_contents.py +42 -0
  286. sky/schemas/db/spot_jobs/005_logs_gc.py +38 -0
  287. sky/schemas/generated/__init__.py +0 -0
  288. sky/schemas/generated/autostopv1_pb2.py +36 -0
  289. sky/schemas/generated/autostopv1_pb2.pyi +43 -0
  290. sky/schemas/generated/autostopv1_pb2_grpc.py +146 -0
  291. sky/schemas/generated/jobsv1_pb2.py +86 -0
  292. sky/schemas/generated/jobsv1_pb2.pyi +254 -0
  293. sky/schemas/generated/jobsv1_pb2_grpc.py +542 -0
  294. sky/schemas/generated/managed_jobsv1_pb2.py +74 -0
  295. sky/schemas/generated/managed_jobsv1_pb2.pyi +278 -0
  296. sky/schemas/generated/managed_jobsv1_pb2_grpc.py +278 -0
  297. sky/schemas/generated/servev1_pb2.py +58 -0
  298. sky/schemas/generated/servev1_pb2.pyi +115 -0
  299. sky/schemas/generated/servev1_pb2_grpc.py +322 -0
  300. sky/serve/autoscalers.py +357 -5
  301. sky/serve/client/impl.py +310 -0
  302. sky/serve/client/sdk.py +47 -139
  303. sky/serve/client/sdk_async.py +130 -0
  304. sky/serve/constants.py +10 -8
  305. sky/serve/controller.py +64 -19
  306. sky/serve/load_balancer.py +106 -60
  307. sky/serve/load_balancing_policies.py +115 -1
  308. sky/serve/replica_managers.py +273 -162
  309. sky/serve/serve_rpc_utils.py +179 -0
  310. sky/serve/serve_state.py +554 -251
  311. sky/serve/serve_utils.py +733 -220
  312. sky/serve/server/core.py +66 -711
  313. sky/serve/server/impl.py +1093 -0
  314. sky/serve/server/server.py +21 -18
  315. sky/serve/service.py +133 -48
  316. sky/serve/service_spec.py +135 -16
  317. sky/serve/spot_placer.py +3 -0
  318. sky/server/auth/__init__.py +0 -0
  319. sky/server/auth/authn.py +50 -0
  320. sky/server/auth/loopback.py +38 -0
  321. sky/server/auth/oauth2_proxy.py +200 -0
  322. sky/server/common.py +475 -181
  323. sky/server/config.py +81 -23
  324. sky/server/constants.py +44 -6
  325. sky/server/daemons.py +229 -0
  326. sky/server/html/token_page.html +185 -0
  327. sky/server/metrics.py +160 -0
  328. sky/server/requests/executor.py +528 -138
  329. sky/server/requests/payloads.py +351 -17
  330. sky/server/requests/preconditions.py +21 -17
  331. sky/server/requests/process.py +112 -29
  332. sky/server/requests/request_names.py +120 -0
  333. sky/server/requests/requests.py +817 -224
  334. sky/server/requests/serializers/decoders.py +82 -31
  335. sky/server/requests/serializers/encoders.py +140 -22
  336. sky/server/requests/threads.py +106 -0
  337. sky/server/rest.py +417 -0
  338. sky/server/server.py +1290 -284
  339. sky/server/state.py +20 -0
  340. sky/server/stream_utils.py +345 -57
  341. sky/server/uvicorn.py +217 -3
  342. sky/server/versions.py +270 -0
  343. sky/setup_files/MANIFEST.in +5 -0
  344. sky/setup_files/alembic.ini +156 -0
  345. sky/setup_files/dependencies.py +136 -31
  346. sky/setup_files/setup.py +44 -42
  347. sky/sky_logging.py +102 -5
  348. sky/skylet/attempt_skylet.py +1 -0
  349. sky/skylet/autostop_lib.py +129 -8
  350. sky/skylet/configs.py +27 -20
  351. sky/skylet/constants.py +171 -19
  352. sky/skylet/events.py +105 -21
  353. sky/skylet/job_lib.py +335 -104
  354. sky/skylet/log_lib.py +297 -18
  355. sky/skylet/log_lib.pyi +44 -1
  356. sky/skylet/ray_patches/__init__.py +17 -3
  357. sky/skylet/ray_patches/autoscaler.py.diff +18 -0
  358. sky/skylet/ray_patches/cli.py.diff +19 -0
  359. sky/skylet/ray_patches/command_runner.py.diff +17 -0
  360. sky/skylet/ray_patches/log_monitor.py.diff +20 -0
  361. sky/skylet/ray_patches/resource_demand_scheduler.py.diff +32 -0
  362. sky/skylet/ray_patches/updater.py.diff +18 -0
  363. sky/skylet/ray_patches/worker.py.diff +41 -0
  364. sky/skylet/services.py +564 -0
  365. sky/skylet/skylet.py +63 -4
  366. sky/skylet/subprocess_daemon.py +103 -29
  367. sky/skypilot_config.py +506 -99
  368. sky/ssh_node_pools/__init__.py +1 -0
  369. sky/ssh_node_pools/core.py +135 -0
  370. sky/ssh_node_pools/server.py +233 -0
  371. sky/task.py +621 -137
  372. sky/templates/aws-ray.yml.j2 +10 -3
  373. sky/templates/azure-ray.yml.j2 +1 -1
  374. sky/templates/do-ray.yml.j2 +1 -1
  375. sky/templates/gcp-ray.yml.j2 +57 -0
  376. sky/templates/hyperbolic-ray.yml.j2 +67 -0
  377. sky/templates/jobs-controller.yaml.j2 +27 -24
  378. sky/templates/kubernetes-loadbalancer.yml.j2 +2 -0
  379. sky/templates/kubernetes-ray.yml.j2 +607 -51
  380. sky/templates/lambda-ray.yml.j2 +1 -1
  381. sky/templates/nebius-ray.yml.j2 +33 -12
  382. sky/templates/paperspace-ray.yml.j2 +1 -1
  383. sky/templates/primeintellect-ray.yml.j2 +71 -0
  384. sky/templates/runpod-ray.yml.j2 +9 -1
  385. sky/templates/scp-ray.yml.j2 +3 -50
  386. sky/templates/seeweb-ray.yml.j2 +108 -0
  387. sky/templates/shadeform-ray.yml.j2 +72 -0
  388. sky/templates/sky-serve-controller.yaml.j2 +22 -2
  389. sky/templates/websocket_proxy.py +178 -18
  390. sky/usage/usage_lib.py +18 -11
  391. sky/users/__init__.py +0 -0
  392. sky/users/model.conf +15 -0
  393. sky/users/permission.py +387 -0
  394. sky/users/rbac.py +121 -0
  395. sky/users/server.py +720 -0
  396. sky/users/token_service.py +218 -0
  397. sky/utils/accelerator_registry.py +34 -5
  398. sky/utils/admin_policy_utils.py +84 -38
  399. sky/utils/annotations.py +16 -5
  400. sky/utils/asyncio_utils.py +78 -0
  401. sky/utils/auth_utils.py +153 -0
  402. sky/utils/benchmark_utils.py +60 -0
  403. sky/utils/cli_utils/status_utils.py +159 -86
  404. sky/utils/cluster_utils.py +31 -9
  405. sky/utils/command_runner.py +354 -68
  406. sky/utils/command_runner.pyi +93 -3
  407. sky/utils/common.py +35 -8
  408. sky/utils/common_utils.py +310 -87
  409. sky/utils/config_utils.py +87 -5
  410. sky/utils/context.py +402 -0
  411. sky/utils/context_utils.py +222 -0
  412. sky/utils/controller_utils.py +264 -89
  413. sky/utils/dag_utils.py +31 -12
  414. sky/utils/db/__init__.py +0 -0
  415. sky/utils/db/db_utils.py +470 -0
  416. sky/utils/db/migration_utils.py +133 -0
  417. sky/utils/directory_utils.py +12 -0
  418. sky/utils/env_options.py +13 -0
  419. sky/utils/git.py +567 -0
  420. sky/utils/git_clone.sh +460 -0
  421. sky/utils/infra_utils.py +195 -0
  422. sky/utils/kubernetes/cleanup-tunnel.sh +62 -0
  423. sky/utils/kubernetes/config_map_utils.py +133 -0
  424. sky/utils/kubernetes/create_cluster.sh +13 -27
  425. sky/utils/kubernetes/delete_cluster.sh +10 -7
  426. sky/utils/kubernetes/deploy_remote_cluster.py +1299 -0
  427. sky/utils/kubernetes/exec_kubeconfig_converter.py +22 -31
  428. sky/utils/kubernetes/generate_kind_config.py +6 -66
  429. sky/utils/kubernetes/generate_kubeconfig.sh +4 -1
  430. sky/utils/kubernetes/gpu_labeler.py +5 -5
  431. sky/utils/kubernetes/kubernetes_deploy_utils.py +354 -47
  432. sky/utils/kubernetes/ssh-tunnel.sh +379 -0
  433. sky/utils/kubernetes/ssh_utils.py +221 -0
  434. sky/utils/kubernetes_enums.py +8 -15
  435. sky/utils/lock_events.py +94 -0
  436. sky/utils/locks.py +368 -0
  437. sky/utils/log_utils.py +300 -6
  438. sky/utils/perf_utils.py +22 -0
  439. sky/utils/resource_checker.py +298 -0
  440. sky/utils/resources_utils.py +249 -32
  441. sky/utils/rich_utils.py +213 -37
  442. sky/utils/schemas.py +905 -147
  443. sky/utils/serialize_utils.py +16 -0
  444. sky/utils/status_lib.py +10 -0
  445. sky/utils/subprocess_utils.py +38 -15
  446. sky/utils/tempstore.py +70 -0
  447. sky/utils/timeline.py +24 -52
  448. sky/utils/ux_utils.py +84 -15
  449. sky/utils/validator.py +11 -1
  450. sky/utils/volume.py +86 -0
  451. sky/utils/yaml_utils.py +111 -0
  452. sky/volumes/__init__.py +13 -0
  453. sky/volumes/client/__init__.py +0 -0
  454. sky/volumes/client/sdk.py +149 -0
  455. sky/volumes/server/__init__.py +0 -0
  456. sky/volumes/server/core.py +258 -0
  457. sky/volumes/server/server.py +122 -0
  458. sky/volumes/volume.py +212 -0
  459. sky/workspaces/__init__.py +0 -0
  460. sky/workspaces/core.py +655 -0
  461. sky/workspaces/server.py +101 -0
  462. sky/workspaces/utils.py +56 -0
  463. skypilot_nightly-1.0.0.dev20251107.dist-info/METADATA +675 -0
  464. skypilot_nightly-1.0.0.dev20251107.dist-info/RECORD +594 -0
  465. {skypilot_nightly-1.0.0.dev20250509.dist-info → skypilot_nightly-1.0.0.dev20251107.dist-info}/WHEEL +1 -1
  466. sky/benchmark/benchmark_state.py +0 -256
  467. sky/benchmark/benchmark_utils.py +0 -641
  468. sky/clouds/service_catalog/constants.py +0 -7
  469. sky/dashboard/out/_next/static/LksQgChY5izXjokL3LcEu/_buildManifest.js +0 -1
  470. sky/dashboard/out/_next/static/chunks/236-f49500b82ad5392d.js +0 -6
  471. sky/dashboard/out/_next/static/chunks/312-c3c8845990db8ffc.js +0 -15
  472. sky/dashboard/out/_next/static/chunks/37-0a572fe0dbb89c4d.js +0 -6
  473. sky/dashboard/out/_next/static/chunks/678-206dddca808e6d16.js +0 -59
  474. sky/dashboard/out/_next/static/chunks/845-0f8017370869e269.js +0 -1
  475. sky/dashboard/out/_next/static/chunks/979-7bf73a4c7cea0f5c.js +0 -1
  476. sky/dashboard/out/_next/static/chunks/fd9d1056-2821b0f0cabcd8bd.js +0 -1
  477. sky/dashboard/out/_next/static/chunks/framework-87d061ee6ed71b28.js +0 -33
  478. sky/dashboard/out/_next/static/chunks/main-app-241eb28595532291.js +0 -1
  479. sky/dashboard/out/_next/static/chunks/main-e0e2335212e72357.js +0 -1
  480. sky/dashboard/out/_next/static/chunks/pages/_app-e6b013bc3f77ad60.js +0 -1
  481. sky/dashboard/out/_next/static/chunks/pages/_error-1be831200e60c5c0.js +0 -1
  482. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-e15db85d0ea1fbe1.js +0 -1
  483. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-f383db7389368ea7.js +0 -1
  484. sky/dashboard/out/_next/static/chunks/pages/clusters-a93b93e10b8b074e.js +0 -1
  485. sky/dashboard/out/_next/static/chunks/pages/index-f9f039532ca8cbc4.js +0 -1
  486. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-03f279c6741fb48b.js +0 -1
  487. sky/dashboard/out/_next/static/chunks/pages/jobs-a75029b67aab6a2e.js +0 -1
  488. sky/dashboard/out/_next/static/chunks/webpack-830f59b8404e96b8.js +0 -1
  489. sky/dashboard/out/_next/static/css/c6933bbb2ce7f4dd.css +0 -3
  490. sky/jobs/dashboard/dashboard.py +0 -223
  491. sky/jobs/dashboard/static/favicon.ico +0 -0
  492. sky/jobs/dashboard/templates/index.html +0 -831
  493. sky/jobs/server/dashboard_utils.py +0 -69
  494. sky/skylet/providers/scp/__init__.py +0 -2
  495. sky/skylet/providers/scp/config.py +0 -149
  496. sky/skylet/providers/scp/node_provider.py +0 -578
  497. sky/templates/kubernetes-ssh-jump.yml.j2 +0 -94
  498. sky/utils/db_utils.py +0 -100
  499. sky/utils/kubernetes/deploy_remote_cluster.sh +0 -308
  500. sky/utils/kubernetes/ssh_jump_lifecycle_manager.py +0 -191
  501. skypilot_nightly-1.0.0.dev20250509.dist-info/METADATA +0 -361
  502. skypilot_nightly-1.0.0.dev20250509.dist-info/RECORD +0 -396
  503. /sky/{clouds/service_catalog → catalog}/config.py +0 -0
  504. /sky/{benchmark → catalog/data_fetchers}/__init__.py +0 -0
  505. /sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_azure.py +0 -0
  506. /sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_fluidstack.py +0 -0
  507. /sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_ibm.py +0 -0
  508. /sky/{clouds/service_catalog/data_fetchers → client/cli}/__init__.py +0 -0
  509. /sky/dashboard/out/_next/static/{LksQgChY5izXjokL3LcEu → zB0ed6ge_W1MDszVHhijS}/_ssgManifest.js +0 -0
  510. {skypilot_nightly-1.0.0.dev20250509.dist-info → skypilot_nightly-1.0.0.dev20251107.dist-info}/entry_points.txt +0 -0
  511. {skypilot_nightly-1.0.0.dev20250509.dist-info → skypilot_nightly-1.0.0.dev20251107.dist-info}/licenses/LICENSE +0 -0
  512. {skypilot_nightly-1.0.0.dev20250509.dist-info → skypilot_nightly-1.0.0.dev20251107.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,41 @@
1
+ --- a/worker.py
2
+ +++ b/worker.py
3
+ @@ -1,3 +1,7 @@
4
+ +# Adapted from https://github.com/ray-project/ray/blob/ray-2.9.3/python/ray/_private/worker.py
5
+ +# Fixed the problem in ray's issue https://github.com/ray-project/ray/issues/9233
6
+ +# Tracked in PR https://github.com/ray-project/ray/pull/21977/files.
7
+ +
8
+ import atexit
9
+ import faulthandler
10
+ import functools
11
+ @@ -2020,6 +2024,14 @@
12
+ pid = data.get("pid")
13
+ lines = data.get("lines", [])
14
+
15
+ + def end_for(line: str) -> str:
16
+ + if sys.platform == "win32":
17
+ + return "\n"
18
+ + if line.endswith("\r"):
19
+ + return ""
20
+ + return "\n"
21
+ +
22
+ +
23
+ if data.get("ip") == data.get("localhost"):
24
+ for line in lines:
25
+ if RAY_TQDM_MAGIC in line:
26
+ @@ -2035,6 +2047,7 @@
27
+ message_for(data, line),
28
+ ),
29
+ file=print_file,
30
+ + end=end_for(line),
31
+ )
32
+ else:
33
+ for line in lines:
34
+ @@ -2052,6 +2065,7 @@
35
+ message_for(data, line),
36
+ ),
37
+ file=print_file,
38
+ + end=end_for(line),
39
+ )
40
+ # Restore once at end of batch to avoid excess hiding/unhiding of tqdm.
41
+ restore_tqdm()
sky/skylet/services.py ADDED
@@ -0,0 +1,564 @@
1
+ """gRPC service implementations for skylet."""
2
+
3
+ import os
4
+ from typing import List, Optional
5
+
6
+ import grpc
7
+
8
+ from sky import exceptions
9
+ from sky import sky_logging
10
+ from sky.jobs import state as managed_job_state
11
+ from sky.jobs import utils as managed_job_utils
12
+ from sky.schemas.generated import autostopv1_pb2
13
+ from sky.schemas.generated import autostopv1_pb2_grpc
14
+ from sky.schemas.generated import jobsv1_pb2
15
+ from sky.schemas.generated import jobsv1_pb2_grpc
16
+ from sky.schemas.generated import managed_jobsv1_pb2
17
+ from sky.schemas.generated import managed_jobsv1_pb2_grpc
18
+ from sky.schemas.generated import servev1_pb2
19
+ from sky.schemas.generated import servev1_pb2_grpc
20
+ from sky.serve import serve_rpc_utils
21
+ from sky.serve import serve_state
22
+ from sky.serve import serve_utils
23
+ from sky.skylet import autostop_lib
24
+ from sky.skylet import constants
25
+ from sky.skylet import job_lib
26
+ from sky.skylet import log_lib
27
+
28
+ logger = sky_logging.init_logger(__name__)
29
+
30
+ # In the worst case, flush the log buffer every 50ms,
31
+ # to ensure responsiveness.
32
+ DEFAULT_LOG_CHUNK_FLUSH_INTERVAL = 0.05
33
+
34
+
35
class AutostopServiceImpl(autostopv1_pb2_grpc.AutostopServiceServicer):
    """Servicer backing the AutostopService gRPC API.

    Both RPCs delegate to ``autostop_lib``. Any exception raised while
    handling a request aborts the call with ``StatusCode.INTERNAL`` and
    the exception text as details.
    """

    def SetAutostop(  # type: ignore[return]
            self, request: autostopv1_pb2.SetAutostopRequest,
            context: grpc.ServicerContext
    ) -> autostopv1_pb2.SetAutostopResponse:
        """Applies the autostop settings carried by ``request``.

        Args:
            request: Provides ``idle_minutes``, ``backend``, ``wait_for``
                and ``down``.
            context: gRPC context, used to abort the call on failure.

        Returns:
            An empty ``SetAutostopResponse`` on success.
        """
        try:
            wait_for = autostop_lib.AutostopWaitFor.from_protobuf(
                request.wait_for)
            if wait_for is None:
                # The conversion yielded nothing usable: fall back to the
                # library default.
                wait_for = autostop_lib.DEFAULT_AUTOSTOP_WAIT_FOR
            autostop_lib.set_autostop(idle_minutes=request.idle_minutes,
                                      backend=request.backend,
                                      wait_for=wait_for,
                                      down=request.down)
            return autostopv1_pb2.SetAutostopResponse()
        except Exception as err:  # pylint: disable=broad-except
            context.abort(grpc.StatusCode.INTERNAL, str(err))

    def IsAutostopping(  # type: ignore[return]
            self, request: autostopv1_pb2.IsAutostoppingRequest,
            context: grpc.ServicerContext
    ) -> autostopv1_pb2.IsAutostoppingResponse:
        """Reports the value of ``autostop_lib.get_is_autostopping()``.

        Args:
            request: Unused.
            context: gRPC context, used to abort the call on failure.

        Returns:
            An ``IsAutostoppingResponse`` carrying the flag.
        """
        try:
            return autostopv1_pb2.IsAutostoppingResponse(
                is_autostopping=autostop_lib.get_is_autostopping())
        except Exception as err:  # pylint: disable=broad-except
            context.abort(grpc.StatusCode.INTERNAL, str(err))
67
+
68
+
69
class ServeServiceImpl(servev1_pb2_grpc.ServeServiceServicer):
    """Servicer backing the ServeService gRPC API.

    Every RPC forwards to ``serve_utils`` / ``serve_state``. Any exception
    raised while handling a request aborts the call with
    ``StatusCode.INTERNAL`` and the exception text as details.
    """

    # NOTE (kyuds): this grpc service will run cluster-side,
    # thus guaranteeing that SERVE_VERSION is above 5.
    # Therefore, we removed some SERVE_VERSION checks
    # present in the original codegen.

    def GetServiceStatus(  # type: ignore[return]
            self, request: servev1_pb2.GetServiceStatusRequest,
            context: grpc.ServicerContext
    ) -> servev1_pb2.GetServiceStatusResponse:
        """Returns the status of the requested services.

        The request is decoded with ``GetServiceStatusRequestConverter``
        and the result of ``get_service_status_pickled`` is encoded with
        ``GetServiceStatusResponseConverter``.
        """
        try:
            request_converter = serve_rpc_utils.GetServiceStatusRequestConverter
            service_names, pool = request_converter.from_proto(request)
            pickled_statuses = serve_utils.get_service_status_pickled(
                service_names, pool)
            response_converter = (
                serve_rpc_utils.GetServiceStatusResponseConverter)
            return response_converter.to_proto(pickled_statuses)
        except Exception as err:  # pylint: disable=broad-except
            context.abort(grpc.StatusCode.INTERNAL, str(err))

    def AddVersion(  # type: ignore[return]
            self, request: servev1_pb2.AddVersionRequest,
            context: grpc.ServicerContext) -> servev1_pb2.AddVersionResponse:
        """Adds a version for ``request.service_name`` and returns it."""
        try:
            new_version = serve_state.add_version(request.service_name)
            return servev1_pb2.AddVersionResponse(version=new_version)
        except Exception as err:  # pylint: disable=broad-except
            context.abort(grpc.StatusCode.INTERNAL, str(err))

    def TerminateServices(  # type: ignore[return]
            self, request: servev1_pb2.TerminateServicesRequest,
            context: grpc.ServicerContext
    ) -> servev1_pb2.TerminateServicesResponse:
        """Terminates the services named in ``request``.

        Returns:
            A response carrying the message produced by
            ``serve_utils.terminate_services``.
        """
        try:
            request_converter = (
                serve_rpc_utils.TerminateServicesRequestConverter)
            service_names, purge, pool = request_converter.from_proto(request)
            return servev1_pb2.TerminateServicesResponse(
                message=serve_utils.terminate_services(service_names, purge,
                                                       pool))
        except Exception as err:  # pylint: disable=broad-except
            context.abort(grpc.StatusCode.INTERNAL, str(err))

    def TerminateReplica(  # type: ignore[return]
            self, request: servev1_pb2.TerminateReplicaRequest,
            context: grpc.ServicerContext
    ) -> servev1_pb2.TerminateReplicaResponse:
        """Terminates one replica of a service.

        Returns:
            A response carrying the message produced by
            ``serve_utils.terminate_replica``.
        """
        try:
            result_message = serve_utils.terminate_replica(
                request.service_name, request.replica_id, request.purge)
            return servev1_pb2.TerminateReplicaResponse(message=result_message)
        except Exception as err:  # pylint: disable=broad-except
            context.abort(grpc.StatusCode.INTERNAL, str(err))

    def WaitServiceRegistration(  # type: ignore[return]
            self, request: servev1_pb2.WaitServiceRegistrationRequest,
            context: grpc.ServicerContext
    ) -> servev1_pb2.WaitServiceRegistrationResponse:
        """Waits for a service to be registered and returns its LB port.

        The encoded result of ``wait_service_registration`` is decoded by
        ``load_service_initialization_result`` to obtain ``lb_port``.
        """
        try:
            encoded_result = serve_utils.wait_service_registration(
                request.service_name, request.job_id, request.pool)
            return servev1_pb2.WaitServiceRegistrationResponse(
                lb_port=serve_utils.load_service_initialization_result(
                    encoded_result))
        except Exception as err:  # pylint: disable=broad-except
            context.abort(grpc.StatusCode.INTERNAL, str(err))

    def UpdateService(  # type: ignore[return]
            self, request: servev1_pb2.UpdateServiceRequest,
            context: grpc.ServicerContext) -> servev1_pb2.UpdateServiceResponse:
        """Updates a service to the version and mode given in ``request``.

        Returns:
            An empty ``UpdateServiceResponse`` on success.
        """
        try:
            serve_utils.update_service_encoded(request.service_name,
                                               request.version, request.mode,
                                               request.pool)
            return servev1_pb2.UpdateServiceResponse()
        except Exception as err:  # pylint: disable=broad-except
            context.abort(grpc.StatusCode.INTERNAL, str(err))
161
+
162
+
163
class JobsServiceImpl(jobsv1_pb2_grpc.JobsServiceServicer):
    """Implementation of the JobsService gRPC service.

    Unless a method documents another status code, any exception raised
    while handling a request aborts the RPC with ``StatusCode.INTERNAL``
    and the exception text as details.
    """

    def AddJob(  # type: ignore[return]
            self, request: jobsv1_pb2.AddJobRequest,
            context: grpc.ServicerContext) -> jobsv1_pb2.AddJobResponse:
        """Registers a job through ``job_lib.add_job``.

        The job name defaults to ``'-'`` when ``job_name`` is not set on
        the request.

        Returns:
            An ``AddJobResponse`` with the new ``job_id`` and ``log_dir``.
        """
        try:
            job_name = request.job_name if request.HasField('job_name') else '-'
            job_id, log_dir = job_lib.add_job(job_name, request.username,
                                              request.run_timestamp,
                                              request.resources_str,
                                              request.metadata)
            return jobsv1_pb2.AddJobResponse(job_id=job_id, log_dir=log_dir)
        except Exception as e:  # pylint: disable=broad-except
            context.abort(grpc.StatusCode.INTERNAL, str(e))

    def QueueJob(  # type: ignore[return]
            self, request: jobsv1_pb2.QueueJobRequest,
            context: grpc.ServicerContext) -> jobsv1_pb2.QueueJobResponse:
        """Prepares the job's files and queues it with the job scheduler.

        Steps, in order:
          1. Create the log directory and an empty ``run.log`` in it.
          2. Create the script's parent directory and, if the request
             carries ``codegen``, write it to ``script_path``.
          3. Queue the shell command that runs the script with output
             redirected to ``run.log``.
          4. If the request carries ``managed_job``, record the managed
             job info and mark each of its tasks as pending.

        Returns:
            An empty ``QueueJobResponse`` on success.
        """
        try:
            job_id = request.job_id
            # Create log directory and file
            remote_log_dir = os.path.expanduser(request.remote_log_dir)
            os.makedirs(remote_log_dir, exist_ok=True)
            remote_log_path = os.path.join(remote_log_dir, 'run.log')
            open(remote_log_path, 'a').close()  # pylint: disable=unspecified-encoding

            script_path = os.path.expanduser(request.script_path)
            os.makedirs(os.path.dirname(script_path), exist_ok=True)

            # If `codegen` is not provided, assume script is already
            # uploaded to `script_path` via rsync.
            if request.HasField('codegen'):
                with open(script_path, 'w', encoding='utf-8') as f:
                    f.write(request.codegen)
                os.chmod(script_path, 0o755)

            cd = f'cd {constants.SKY_REMOTE_WORKDIR}'
            # NOTE(review): `script_path` and `remote_log_path` are
            # interpolated into the shell command unquoted; this relies on
            # callers sending paths without shell metacharacters.
            job_submit_cmd = (
                # JOB_CMD_IDENTIFIER is used for identifying the process
                # retrieved with pid is the same driver process.
                f'{job_lib.JOB_CMD_IDENTIFIER.format(job_id)} && '
                f'{cd} && {constants.SKY_PYTHON_CMD} -u {script_path}'
                # Do not use &>, which is not POSIX and may not work.
                # Note that the order of ">filename 2>&1" matters.
                f' > {remote_log_path} 2>&1')
            job_lib.scheduler.queue(job_id, job_submit_cmd)

            if request.HasField('managed_job'):
                managed_job = request.managed_job
                pool = managed_job.pool if managed_job.HasField(
                    'pool') else None
                pool_hash = None
                if pool is not None:
                    pool_hash = serve_state.get_service_hash(pool)
                # Add the managed job to job queue database.
                user_id = managed_job.user_id if managed_job.HasField(
                    'user_id') else None
                managed_job_state.set_job_info(job_id, managed_job.name,
                                               managed_job.workspace,
                                               managed_job.entrypoint, pool,
                                               pool_hash, user_id)
                # Set the managed job to PENDING state to make sure that
                # this managed job appears in the `sky jobs queue`, even
                # if it needs to wait to be submitted.
                # We cannot set the managed job to PENDING state in the
                # job template (jobs-controller.yaml.j2), as it may need
                # to wait for the run commands to be scheduled on the job
                # controller in high-load cases.
                for task in managed_job.tasks:
                    managed_job_state.set_pending(job_id, task.task_id,
                                                  task.name, task.resources_str,
                                                  task.metadata_json)
            return jobsv1_pb2.QueueJobResponse()
        except Exception as e:  # pylint: disable=broad-except
            context.abort(grpc.StatusCode.INTERNAL, str(e))

    def UpdateStatus(  # type: ignore[return]
            self, request: jobsv1_pb2.UpdateStatusRequest,
            context: grpc.ServicerContext) -> jobsv1_pb2.UpdateStatusResponse:
        """Calls ``job_lib.update_status()``; ``request`` is unused."""
        try:
            job_lib.update_status()
            return jobsv1_pb2.UpdateStatusResponse()
        except Exception as e:  # pylint: disable=broad-except
            context.abort(grpc.StatusCode.INTERNAL, str(e))

    def GetJobQueue(  # type: ignore[return]
            self, request: jobsv1_pb2.GetJobQueueRequest,
            context: grpc.ServicerContext) -> jobsv1_pb2.GetJobQueueResponse:
        """Returns job info from ``job_lib.get_jobs_info``.

        ``user_hash`` is passed as ``None`` when it is not set on the
        request.
        """
        try:
            user_hash = request.user_hash if request.HasField(
                'user_hash') else None
            all_jobs = request.all_jobs
            jobs_info = job_lib.get_jobs_info(user_hash=user_hash,
                                              all_jobs=all_jobs)
            return jobsv1_pb2.GetJobQueueResponse(jobs=jobs_info)
        except Exception as e:  # pylint: disable=broad-except
            context.abort(grpc.StatusCode.INTERNAL, str(e))

    def CancelJobs(  # type: ignore[return]
            self, request: jobsv1_pb2.CancelJobsRequest,
            context: grpc.ServicerContext) -> jobsv1_pb2.CancelJobsResponse:
        """Cancels jobs through ``job_lib.cancel_jobs``.

        Returns:
            A ``CancelJobsResponse`` listing the cancelled job ids.
        """
        try:
            job_ids = list(request.job_ids) if request.job_ids else []
            user_hash = request.user_hash if request.HasField(
                'user_hash') else None
            cancelled_job_ids = job_lib.cancel_jobs(job_ids, request.cancel_all,
                                                    user_hash)
            return jobsv1_pb2.CancelJobsResponse(
                cancelled_job_ids=cancelled_job_ids)
        except Exception as e:  # pylint: disable=broad-except
            context.abort(grpc.StatusCode.INTERNAL, str(e))

    def FailAllInProgressJobs(  # type: ignore[return]
            self, _: jobsv1_pb2.FailAllInProgressJobsRequest,
            context: grpc.ServicerContext
    ) -> jobsv1_pb2.FailAllInProgressJobsResponse:
        """Calls ``job_lib.fail_all_jobs_in_progress()``."""
        try:
            job_lib.fail_all_jobs_in_progress()
            return jobsv1_pb2.FailAllInProgressJobsResponse()
        except Exception as e:  # pylint: disable=broad-except
            context.abort(grpc.StatusCode.INTERNAL, str(e))

    def TailLogs(
            self,
            request: jobsv1_pb2.TailLogsRequest,  # type: ignore[return]
            context: grpc.ServicerContext):
        """Streams a job's log lines, then a final exit-code message.

        Uses the latest job when ``job_id`` is not set on the request.

        Yields:
            ``TailLogsResponse(log_line=...)`` for each buffered chunk,
            followed by one ``TailLogsResponse(exit_code=...)``.
        """
        buffer = log_lib.LogBuffer()
        try:
            job_id = request.job_id if request.HasField(
                'job_id') else job_lib.get_latest_job_id()
            managed_job_id = request.managed_job_id if request.HasField(
                'managed_job_id') else None
            log_dir = job_lib.get_log_dir_for_job(job_id)
            if log_dir is None:
                # Fall back to deriving the directory from the run
                # timestamp when no log dir is recorded for the job.
                run_timestamp = job_lib.get_run_timestamp(job_id)
                log_dir = None if run_timestamp is None else os.path.join(
                    constants.SKY_LOGS_DIRECTORY, run_timestamp)

            for line in log_lib.buffered_iter_with_timeout(
                    buffer,
                    log_lib.tail_logs_iter(job_id, log_dir, managed_job_id,
                                           request.follow, request.tail),
                    DEFAULT_LOG_CHUNK_FLUSH_INTERVAL):
                yield jobsv1_pb2.TailLogsResponse(log_line=line)

            job_status = job_lib.get_status(job_id)
            exit_code = exceptions.JobExitCode.from_job_status(job_status)
            # Fix for dashboard: When follow=False and job is still running
            # (NOT_FINISHED=101), exit with success (0) since fetching current
            # logs is a successful operation.
            # This prevents shell wrappers from printing "command terminated
            # with exit code 101".
            exit_code_int = 0 if not request.follow and int(
                exit_code) == 101 else int(exit_code)
            yield jobsv1_pb2.TailLogsResponse(exit_code=exit_code_int)
        except Exception as e:  # pylint: disable=broad-except
            context.abort(grpc.StatusCode.INTERNAL, str(e))
        finally:
            # Always release the buffer, including when the stream is
            # closed early.
            buffer.close()

    def GetJobStatus(  # type: ignore[return]
            self, request: jobsv1_pb2.GetJobStatusRequest,
            context: grpc.ServicerContext) -> jobsv1_pb2.GetJobStatusResponse:
        """Returns the protobuf status of the requested jobs.

        When no job ids are given, the latest job (if any) is used. A job
        whose status is ``None`` is reported as
        ``JOB_STATUS_UNSPECIFIED``.
        """
        try:
            if request.job_ids:
                job_ids = list(request.job_ids)
            else:
                latest_job_id = job_lib.get_latest_job_id()
                job_ids = [latest_job_id] if latest_job_id is not None else []
            # Build a fresh mapping instead of rewriting the returned dict
            # while iterating over it.
            job_statuses = {
                job_id: (job_lib.JobStatus(status).to_protobuf()
                         if status is not None else
                         jobsv1_pb2.JOB_STATUS_UNSPECIFIED)
                for job_id, status in job_lib.get_statuses(job_ids).items()
            }
            return jobsv1_pb2.GetJobStatusResponse(job_statuses=job_statuses)
        except Exception as e:  # pylint: disable=broad-except
            context.abort(grpc.StatusCode.INTERNAL, str(e))

    def GetJobSubmittedTimestamp(  # type: ignore[return]
            self, request: jobsv1_pb2.GetJobSubmittedTimestampRequest,
            context: grpc.ServicerContext
    ) -> jobsv1_pb2.GetJobSubmittedTimestampResponse:
        """Returns the submission timestamp of a job.

        Uses the latest job when ``job_id`` is not set on the request.
        Aborts with ``StatusCode.NOT_FOUND`` when no timestamp is
        available, and with ``StatusCode.INTERNAL`` on any other failure.
        """
        try:
            job_id = request.job_id if request.HasField(
                'job_id') else job_lib.get_latest_job_id()
            timestamp = job_lib.get_job_submitted_or_ended_timestamp(
                job_id, False)
            if timestamp is not None:
                return jobsv1_pb2.GetJobSubmittedTimestampResponse(
                    timestamp=timestamp)
        except Exception as e:  # pylint: disable=broad-except
            context.abort(grpc.StatusCode.INTERNAL, str(e))
        # context.abort() terminates the RPC by raising an exception, so it
        # must be called outside the `try` above; otherwise the broad
        # handler would catch it and re-abort the call as INTERNAL.
        context.abort(grpc.StatusCode.NOT_FOUND, f'Job {job_id} not found')

    def GetJobEndedTimestamp(  # type: ignore[return]
            self, request: jobsv1_pb2.GetJobEndedTimestampRequest,
            context: grpc.ServicerContext
    ) -> jobsv1_pb2.GetJobEndedTimestampResponse:
        """Returns the end timestamp of a job.

        Uses the latest job when ``job_id`` is not set on the request.
        Aborts with ``StatusCode.NOT_FOUND`` when no timestamp is
        available, and with ``StatusCode.INTERNAL`` on any other failure.
        """
        try:
            job_id = request.job_id if request.HasField(
                'job_id') else job_lib.get_latest_job_id()
            timestamp = job_lib.get_job_submitted_or_ended_timestamp(
                job_id, True)
            if timestamp is not None:
                return jobsv1_pb2.GetJobEndedTimestampResponse(
                    timestamp=timestamp)
        except Exception as e:  # pylint: disable=broad-except
            context.abort(grpc.StatusCode.INTERNAL, str(e))
        # See GetJobSubmittedTimestamp: abort outside the `try` so the
        # NOT_FOUND status is not replaced by INTERNAL.
        context.abort(grpc.StatusCode.NOT_FOUND,
                      f'Job {job_id} not found or not ended')

    def GetLogDirsForJobs(  # type: ignore[return]
            self, request: jobsv1_pb2.GetLogDirsForJobsRequest,
            context: grpc.ServicerContext
    ) -> jobsv1_pb2.GetLogDirsForJobsResponse:
        """Returns the log directories of the requested jobs.

        When no job ids are given, the latest job (if any) is used.
        """
        try:
            if request.job_ids:
                job_ids = list(request.job_ids)
            else:
                latest_job_id = job_lib.get_latest_job_id()
                job_ids = [latest_job_id] if latest_job_id is not None else []
            job_log_dirs = job_lib.get_job_log_dirs(job_ids)
            return jobsv1_pb2.GetLogDirsForJobsResponse(
                job_log_dirs=job_log_dirs)
        except Exception as e:  # pylint: disable=broad-except
            context.abort(grpc.StatusCode.INTERNAL, str(e))
389
+
390
+
391
class ManagedJobsServiceImpl(managed_jobsv1_pb2_grpc.ManagedJobsServiceServicer
                            ):
    """Implementation of the ManagedJobsService gRPC service.

    Unless a method documents another status code, any exception raised
    while handling a request aborts the RPC with ``StatusCode.INTERNAL``
    and the exception text as details.
    """

    def GetVersion(  # type: ignore[return]
            self, request: managed_jobsv1_pb2.GetVersionRequest,
            context: grpc.ServicerContext
    ) -> managed_jobsv1_pb2.GetVersionResponse:
        """Returns ``constants.SKYLET_VERSION`` as the controller version."""
        try:
            return managed_jobsv1_pb2.GetVersionResponse(
                controller_version=constants.SKYLET_VERSION)
        except Exception as e:  # pylint: disable=broad-except
            context.abort(grpc.StatusCode.INTERNAL, str(e))

    def GetJobTable(  # type: ignore[return]
            self, request: managed_jobsv1_pb2.GetJobTableRequest,
            context: grpc.ServicerContext
    ) -> managed_jobsv1_pb2.GetJobTableResponse:
        """Returns the managed job queue, filtered as the request asks.

        Optional request fields that are not set are passed to
        ``get_managed_job_queue`` as ``None``.

        Returns:
            A ``GetJobTableResponse`` with the converted jobs plus the
            ``total``, ``total_no_filter`` and ``status_counts`` values
            reported by the queue lookup.
        """
        try:
            accessible_workspaces = (
                list(request.accessible_workspaces.workspaces)
                if request.HasField('accessible_workspaces') else None)
            job_ids = (list(request.job_ids.ids)
                       if request.HasField('job_ids') else None)
            user_hashes: Optional[List[Optional[str]]] = None
            if request.HasField('user_hashes'):
                user_hashes = list(request.user_hashes.hashes)
                # For backwards compatibility, we show jobs that do not have a
                # user_hash. TODO: Remove before 0.12.0.
                if request.show_jobs_without_user_hash:
                    user_hashes.append(None)
            statuses = (list(request.statuses.statuses)
                        if request.HasField('statuses') else None)
            fields = (list(request.fields.fields)
                      if request.HasField('fields') else None)
            job_queue = managed_job_utils.get_managed_job_queue(
                skip_finished=request.skip_finished,
                accessible_workspaces=accessible_workspaces,
                job_ids=job_ids,
                workspace_match=request.workspace_match
                if request.HasField('workspace_match') else None,
                name_match=request.name_match
                if request.HasField('name_match') else None,
                pool_match=request.pool_match
                if request.HasField('pool_match') else None,
                page=request.page if request.HasField('page') else None,
                limit=request.limit if request.HasField('limit') else None,
                user_hashes=user_hashes,
                statuses=statuses,
                fields=fields,
            )
            jobs = job_queue['jobs']
            total = job_queue['total']
            total_no_filter = job_queue['total_no_filter']
            status_counts = job_queue['status_counts']

            jobs_info = []
            for job in jobs:
                # Drop metadata entries whose value is None before handing
                # the mapping to the protobuf message.
                converted_metadata = None
                metadata = job.get('metadata')
                if metadata:
                    converted_metadata = {
                        k: v for k, v in metadata.items() if v is not None
                    }
                job_info = managed_jobsv1_pb2.ManagedJobInfo(
                    # The `spot.job_id`, which can be used to identify
                    # different tasks for the same job
                    _job_id=job.get('_job_id'),
                    job_id=job.get('job_id'),
                    task_id=job.get('task_id'),
                    job_name=job.get('job_name'),
                    task_name=job.get('task_name'),
                    job_duration=job.get('job_duration'),
                    workspace=job.get('workspace'),
                    status=managed_job_state.ManagedJobStatus(
                        job.get('status')).to_protobuf(),
                    schedule_state=managed_job_state.ManagedJobScheduleState(
                        job.get('schedule_state')).to_protobuf(),
                    resources=job.get('resources'),
                    cluster_resources=job.get('cluster_resources'),
                    cluster_resources_full=job.get('cluster_resources_full'),
                    cloud=job.get('cloud'),
                    region=job.get('region'),
                    infra=job.get('infra'),
                    accelerators=job.get('accelerators'),
                    recovery_count=job.get('recovery_count'),
                    details=job.get('details'),
                    failure_reason=job.get('failure_reason'),
                    user_name=job.get('user_name'),
                    user_hash=job.get('user_hash'),
                    submitted_at=job.get('submitted_at'),
                    start_at=job.get('start_at'),
                    end_at=job.get('end_at'),
                    user_yaml=job.get('user_yaml'),
                    entrypoint=job.get('entrypoint'),
                    metadata=converted_metadata,
                    pool=job.get('pool'),
                    pool_hash=job.get('pool_hash'))
                jobs_info.append(job_info)

            return managed_jobsv1_pb2.GetJobTableResponse(
                jobs=jobs_info,
                total=total,
                total_no_filter=total_no_filter,
                status_counts=status_counts)
        except Exception as e:  # pylint: disable=broad-except
            context.abort(grpc.StatusCode.INTERNAL, str(e))

    def GetAllJobIdsByName(  # type: ignore[return]
            self, request: managed_jobsv1_pb2.GetAllJobIdsByNameRequest,
            context: grpc.ServicerContext
    ) -> managed_jobsv1_pb2.GetAllJobIdsByNameResponse:
        """Returns the ids from ``get_all_job_ids_by_name``.

        ``job_name`` is passed as ``None`` when it is not set on the
        request.
        """
        try:
            job_name = request.job_name if request.HasField(
                'job_name') else None
            job_ids = managed_job_state.get_all_job_ids_by_name(job_name)
            return managed_jobsv1_pb2.GetAllJobIdsByNameResponse(
                job_ids=job_ids)
        except Exception as e:  # pylint: disable=broad-except
            context.abort(grpc.StatusCode.INTERNAL, str(e))

    def CancelJobs(  # type: ignore[return]
            self, request: managed_jobsv1_pb2.CancelJobsRequest,
            context: grpc.ServicerContext
    ) -> managed_jobsv1_pb2.CancelJobsResponse:
        """Cancels managed jobs selected by the request's oneof criteria.

        The ``cancellation_criteria`` oneof selects the target:
        ``all_users``, ``job_ids``, ``job_name`` or ``pool_name``.

        Aborts with ``StatusCode.INVALID_ARGUMENT`` when no criteria is
        set, when the criteria is not recognised, or when ``all_users`` is
        false and no ``user_hash`` is given. Aborts with
        ``StatusCode.INTERNAL`` on any other failure.

        Returns:
            A ``CancelJobsResponse`` carrying the cancellation message.
        """
        # Validation failures are recorded here and reported after the
        # `try`: context.abort() terminates the RPC by raising an
        # exception, so calling it inside the `try` would let the broad
        # handler re-abort the call as INTERNAL.
        invalid_argument: Optional[str] = None
        try:
            message = None
            cancellation_criteria = request.WhichOneof('cancellation_criteria')
            if cancellation_criteria is None:
                invalid_argument = (
                    'exactly one cancellation criteria must be specified.')
            elif cancellation_criteria == 'all_users':
                user_hash = request.user_hash if request.HasField(
                    'user_hash') else None
                all_users = request.all_users
                if not all_users and user_hash is None:
                    invalid_argument = (
                        'user_hash is required when all_users is False')
                else:
                    message = managed_job_utils.cancel_jobs_by_id(
                        job_ids=None,
                        all_users=all_users,
                        current_workspace=request.current_workspace,
                        user_hash=user_hash)
            elif cancellation_criteria == 'job_ids':
                job_ids = list(request.job_ids.ids)
                message = managed_job_utils.cancel_jobs_by_id(
                    job_ids=job_ids,
                    current_workspace=request.current_workspace)
            elif cancellation_criteria == 'job_name':
                message = managed_job_utils.cancel_job_by_name(
                    job_name=request.job_name,
                    current_workspace=request.current_workspace)
            elif cancellation_criteria == 'pool_name':
                message = managed_job_utils.cancel_jobs_by_pool(
                    pool_name=request.pool_name,
                    current_workspace=request.current_workspace)
            else:
                invalid_argument = (
                    f'invalid cancellation criteria: {cancellation_criteria}')
            if invalid_argument is None:
                return managed_jobsv1_pb2.CancelJobsResponse(message=message)
        except Exception as e:  # pylint: disable=broad-except
            context.abort(grpc.StatusCode.INTERNAL, str(e))
        context.abort(grpc.StatusCode.INVALID_ARGUMENT, invalid_argument)

    def StreamLogs(
            self,
            request: managed_jobsv1_pb2.
        StreamLogsRequest,  # type: ignore[return]
            context: grpc.ServicerContext):
        """Not implemented yet; always aborts with ``UNIMPLEMENTED``."""
        # TODO(kevin): implement this
        context.abort(grpc.StatusCode.UNIMPLEMENTED,
                      'StreamLogs is not implemented')