skypilot-nightly 1.0.0.dev20250502__py3-none-any.whl → 1.0.0.dev20251203__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (546)
  1. sky/__init__.py +22 -6
  2. sky/adaptors/aws.py +81 -16
  3. sky/adaptors/common.py +25 -2
  4. sky/adaptors/coreweave.py +278 -0
  5. sky/adaptors/do.py +8 -2
  6. sky/adaptors/gcp.py +11 -0
  7. sky/adaptors/hyperbolic.py +8 -0
  8. sky/adaptors/ibm.py +5 -2
  9. sky/adaptors/kubernetes.py +149 -18
  10. sky/adaptors/nebius.py +173 -30
  11. sky/adaptors/primeintellect.py +1 -0
  12. sky/adaptors/runpod.py +68 -0
  13. sky/adaptors/seeweb.py +183 -0
  14. sky/adaptors/shadeform.py +89 -0
  15. sky/admin_policy.py +187 -4
  16. sky/authentication.py +179 -225
  17. sky/backends/__init__.py +4 -2
  18. sky/backends/backend.py +22 -9
  19. sky/backends/backend_utils.py +1323 -397
  20. sky/backends/cloud_vm_ray_backend.py +1749 -1029
  21. sky/backends/docker_utils.py +1 -1
  22. sky/backends/local_docker_backend.py +11 -6
  23. sky/backends/task_codegen.py +633 -0
  24. sky/backends/wheel_utils.py +55 -9
  25. sky/{clouds/service_catalog → catalog}/__init__.py +21 -19
  26. sky/{clouds/service_catalog → catalog}/aws_catalog.py +27 -8
  27. sky/{clouds/service_catalog → catalog}/azure_catalog.py +10 -7
  28. sky/{clouds/service_catalog → catalog}/common.py +90 -49
  29. sky/{clouds/service_catalog → catalog}/cudo_catalog.py +8 -5
  30. sky/{clouds/service_catalog → catalog}/data_fetchers/analyze.py +1 -1
  31. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_aws.py +116 -80
  32. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_cudo.py +38 -38
  33. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_gcp.py +70 -16
  34. sky/catalog/data_fetchers/fetch_hyperbolic.py +136 -0
  35. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_lambda_cloud.py +1 -0
  36. sky/catalog/data_fetchers/fetch_nebius.py +338 -0
  37. sky/catalog/data_fetchers/fetch_runpod.py +698 -0
  38. sky/catalog/data_fetchers/fetch_seeweb.py +329 -0
  39. sky/catalog/data_fetchers/fetch_shadeform.py +142 -0
  40. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_vast.py +1 -1
  41. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_vsphere.py +1 -1
  42. sky/{clouds/service_catalog → catalog}/do_catalog.py +5 -2
  43. sky/{clouds/service_catalog → catalog}/fluidstack_catalog.py +6 -3
  44. sky/{clouds/service_catalog → catalog}/gcp_catalog.py +41 -15
  45. sky/catalog/hyperbolic_catalog.py +136 -0
  46. sky/{clouds/service_catalog → catalog}/ibm_catalog.py +9 -6
  47. sky/{clouds/service_catalog → catalog}/kubernetes_catalog.py +36 -24
  48. sky/{clouds/service_catalog → catalog}/lambda_catalog.py +9 -6
  49. sky/{clouds/service_catalog → catalog}/nebius_catalog.py +9 -7
  50. sky/{clouds/service_catalog → catalog}/oci_catalog.py +9 -6
  51. sky/{clouds/service_catalog → catalog}/paperspace_catalog.py +5 -2
  52. sky/catalog/primeintellect_catalog.py +95 -0
  53. sky/{clouds/service_catalog → catalog}/runpod_catalog.py +11 -4
  54. sky/{clouds/service_catalog → catalog}/scp_catalog.py +9 -6
  55. sky/catalog/seeweb_catalog.py +184 -0
  56. sky/catalog/shadeform_catalog.py +165 -0
  57. sky/catalog/ssh_catalog.py +167 -0
  58. sky/{clouds/service_catalog → catalog}/vast_catalog.py +6 -3
  59. sky/{clouds/service_catalog → catalog}/vsphere_catalog.py +5 -2
  60. sky/check.py +533 -185
  61. sky/cli.py +5 -5975
  62. sky/client/{cli.py → cli/command.py} +2591 -1956
  63. sky/client/cli/deprecation_utils.py +99 -0
  64. sky/client/cli/flags.py +359 -0
  65. sky/client/cli/table_utils.py +322 -0
  66. sky/client/cli/utils.py +79 -0
  67. sky/client/common.py +78 -32
  68. sky/client/oauth.py +82 -0
  69. sky/client/sdk.py +1219 -319
  70. sky/client/sdk_async.py +827 -0
  71. sky/client/service_account_auth.py +47 -0
  72. sky/cloud_stores.py +82 -3
  73. sky/clouds/__init__.py +13 -0
  74. sky/clouds/aws.py +564 -164
  75. sky/clouds/azure.py +105 -83
  76. sky/clouds/cloud.py +140 -40
  77. sky/clouds/cudo.py +68 -50
  78. sky/clouds/do.py +66 -48
  79. sky/clouds/fluidstack.py +63 -44
  80. sky/clouds/gcp.py +339 -110
  81. sky/clouds/hyperbolic.py +293 -0
  82. sky/clouds/ibm.py +70 -49
  83. sky/clouds/kubernetes.py +570 -162
  84. sky/clouds/lambda_cloud.py +74 -54
  85. sky/clouds/nebius.py +210 -81
  86. sky/clouds/oci.py +88 -66
  87. sky/clouds/paperspace.py +61 -44
  88. sky/clouds/primeintellect.py +317 -0
  89. sky/clouds/runpod.py +164 -74
  90. sky/clouds/scp.py +89 -86
  91. sky/clouds/seeweb.py +477 -0
  92. sky/clouds/shadeform.py +400 -0
  93. sky/clouds/ssh.py +263 -0
  94. sky/clouds/utils/aws_utils.py +10 -4
  95. sky/clouds/utils/gcp_utils.py +87 -11
  96. sky/clouds/utils/oci_utils.py +38 -14
  97. sky/clouds/utils/scp_utils.py +231 -167
  98. sky/clouds/vast.py +99 -77
  99. sky/clouds/vsphere.py +51 -40
  100. sky/core.py +375 -173
  101. sky/dag.py +15 -0
  102. sky/dashboard/out/404.html +1 -1
  103. sky/dashboard/out/_next/static/96_E2yl3QAiIJGOYCkSpB/_buildManifest.js +1 -0
  104. sky/dashboard/out/_next/static/chunks/1141-e6aa9ab418717c59.js +11 -0
  105. sky/dashboard/out/_next/static/chunks/1272-1ef0bf0237faccdb.js +1 -0
  106. sky/dashboard/out/_next/static/chunks/1871-7e202677c42f43fe.js +6 -0
  107. sky/dashboard/out/_next/static/chunks/2260-7703229c33c5ebd5.js +1 -0
  108. sky/dashboard/out/_next/static/chunks/2350.fab69e61bac57b23.js +1 -0
  109. sky/dashboard/out/_next/static/chunks/2369.fc20f0c2c8ed9fe7.js +15 -0
  110. sky/dashboard/out/_next/static/chunks/2755.edd818326d489a1d.js +26 -0
  111. sky/dashboard/out/_next/static/chunks/3294.20a8540fe697d5ee.js +1 -0
  112. sky/dashboard/out/_next/static/chunks/3785.7e245f318f9d1121.js +1 -0
  113. sky/dashboard/out/_next/static/chunks/3800-7b45f9fbb6308557.js +1 -0
  114. sky/dashboard/out/_next/static/chunks/3850-ff4a9a69d978632b.js +1 -0
  115. sky/dashboard/out/_next/static/chunks/3937.210053269f121201.js +1 -0
  116. sky/dashboard/out/_next/static/chunks/4725.172ede95d1b21022.js +1 -0
  117. sky/dashboard/out/_next/static/chunks/4937.a2baa2df5572a276.js +15 -0
  118. sky/dashboard/out/_next/static/chunks/5739-d67458fcb1386c92.js +8 -0
  119. sky/dashboard/out/_next/static/chunks/6130-2be46d70a38f1e82.js +1 -0
  120. sky/dashboard/out/_next/static/chunks/616-3d59f75e2ccf9321.js +39 -0
  121. sky/dashboard/out/_next/static/chunks/6212-7bd06f60ba693125.js +13 -0
  122. sky/dashboard/out/_next/static/chunks/6856-8f27d1c10c98def8.js +1 -0
  123. sky/dashboard/out/_next/static/chunks/6989-01359c57e018caa4.js +1 -0
  124. sky/dashboard/out/_next/static/chunks/6990-9146207c4567fdfd.js +1 -0
  125. sky/dashboard/out/_next/static/chunks/7359-c8d04e06886000b3.js +30 -0
  126. sky/dashboard/out/_next/static/chunks/7411-b15471acd2cba716.js +41 -0
  127. sky/dashboard/out/_next/static/chunks/7615-019513abc55b3b47.js +1 -0
  128. sky/dashboard/out/_next/static/chunks/8640.5b9475a2d18c5416.js +16 -0
  129. sky/dashboard/out/_next/static/chunks/8969-452f9d5cbdd2dc73.js +1 -0
  130. sky/dashboard/out/_next/static/chunks/9025.fa408f3242e9028d.js +6 -0
  131. sky/dashboard/out/_next/static/chunks/9353-cff34f7e773b2e2b.js +1 -0
  132. sky/dashboard/out/_next/static/chunks/9360.a536cf6b1fa42355.js +31 -0
  133. sky/dashboard/out/_next/static/chunks/9847.3aaca6bb33455140.js +30 -0
  134. sky/dashboard/out/_next/static/chunks/fd9d1056-86323a29a8f7e46a.js +1 -0
  135. sky/dashboard/out/_next/static/chunks/framework-cf60a09ccd051a10.js +33 -0
  136. sky/dashboard/out/_next/static/chunks/main-app-587214043926b3cc.js +1 -0
  137. sky/dashboard/out/_next/static/chunks/main-f15ccb73239a3bf1.js +1 -0
  138. sky/dashboard/out/_next/static/chunks/pages/_app-bde01e4a2beec258.js +34 -0
  139. sky/dashboard/out/_next/static/chunks/pages/_error-c66a4e8afc46f17b.js +1 -0
  140. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-792db96d918c98c9.js +16 -0
  141. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-abfcac9c137aa543.js +1 -0
  142. sky/dashboard/out/_next/static/chunks/pages/clusters-ee39056f9851a3ff.js +1 -0
  143. sky/dashboard/out/_next/static/chunks/pages/config-dfb9bf07b13045f4.js +1 -0
  144. sky/dashboard/out/_next/static/chunks/pages/index-444f1804401f04ea.js +1 -0
  145. sky/dashboard/out/_next/static/chunks/pages/infra/[context]-c0b5935149902e6f.js +1 -0
  146. sky/dashboard/out/_next/static/chunks/pages/infra-aed0ea19df7cf961.js +1 -0
  147. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-d66997e2bfc837cf.js +16 -0
  148. sky/dashboard/out/_next/static/chunks/pages/jobs/pools/[pool]-9faf940b253e3e06.js +21 -0
  149. sky/dashboard/out/_next/static/chunks/pages/jobs-2072b48b617989c9.js +1 -0
  150. sky/dashboard/out/_next/static/chunks/pages/users-f42674164aa73423.js +1 -0
  151. sky/dashboard/out/_next/static/chunks/pages/volumes-b84b948ff357c43e.js +1 -0
  152. sky/dashboard/out/_next/static/chunks/pages/workspace/new-3f88a1c7e86a3f86.js +1 -0
  153. sky/dashboard/out/_next/static/chunks/pages/workspaces/[name]-84a40f8c7c627fe4.js +1 -0
  154. sky/dashboard/out/_next/static/chunks/pages/workspaces-531b2f8c4bf89f82.js +1 -0
  155. sky/dashboard/out/_next/static/chunks/webpack-64e05f17bf2cf8ce.js +1 -0
  156. sky/dashboard/out/_next/static/css/0748ce22df867032.css +3 -0
  157. sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
  158. sky/dashboard/out/clusters/[cluster].html +1 -1
  159. sky/dashboard/out/clusters.html +1 -1
  160. sky/dashboard/out/config.html +1 -0
  161. sky/dashboard/out/index.html +1 -1
  162. sky/dashboard/out/infra/[context].html +1 -0
  163. sky/dashboard/out/infra.html +1 -0
  164. sky/dashboard/out/jobs/[job].html +1 -1
  165. sky/dashboard/out/jobs/pools/[pool].html +1 -0
  166. sky/dashboard/out/jobs.html +1 -1
  167. sky/dashboard/out/users.html +1 -0
  168. sky/dashboard/out/volumes.html +1 -0
  169. sky/dashboard/out/workspace/new.html +1 -0
  170. sky/dashboard/out/workspaces/[name].html +1 -0
  171. sky/dashboard/out/workspaces.html +1 -0
  172. sky/data/data_utils.py +137 -1
  173. sky/data/mounting_utils.py +269 -84
  174. sky/data/storage.py +1460 -1807
  175. sky/data/storage_utils.py +43 -57
  176. sky/exceptions.py +126 -2
  177. sky/execution.py +216 -63
  178. sky/global_user_state.py +2390 -586
  179. sky/jobs/__init__.py +7 -0
  180. sky/jobs/client/sdk.py +300 -58
  181. sky/jobs/client/sdk_async.py +161 -0
  182. sky/jobs/constants.py +15 -8
  183. sky/jobs/controller.py +848 -275
  184. sky/jobs/file_content_utils.py +128 -0
  185. sky/jobs/log_gc.py +193 -0
  186. sky/jobs/recovery_strategy.py +402 -152
  187. sky/jobs/scheduler.py +314 -189
  188. sky/jobs/server/core.py +836 -255
  189. sky/jobs/server/server.py +156 -115
  190. sky/jobs/server/utils.py +136 -0
  191. sky/jobs/state.py +2109 -706
  192. sky/jobs/utils.py +1306 -215
  193. sky/logs/__init__.py +21 -0
  194. sky/logs/agent.py +108 -0
  195. sky/logs/aws.py +243 -0
  196. sky/logs/gcp.py +91 -0
  197. sky/metrics/__init__.py +0 -0
  198. sky/metrics/utils.py +453 -0
  199. sky/models.py +78 -1
  200. sky/optimizer.py +164 -70
  201. sky/provision/__init__.py +90 -4
  202. sky/provision/aws/config.py +147 -26
  203. sky/provision/aws/instance.py +136 -50
  204. sky/provision/azure/instance.py +11 -6
  205. sky/provision/common.py +13 -1
  206. sky/provision/cudo/cudo_machine_type.py +1 -1
  207. sky/provision/cudo/cudo_utils.py +14 -8
  208. sky/provision/cudo/cudo_wrapper.py +72 -71
  209. sky/provision/cudo/instance.py +10 -6
  210. sky/provision/do/instance.py +10 -6
  211. sky/provision/do/utils.py +4 -3
  212. sky/provision/docker_utils.py +140 -33
  213. sky/provision/fluidstack/instance.py +13 -8
  214. sky/provision/gcp/__init__.py +1 -0
  215. sky/provision/gcp/config.py +301 -19
  216. sky/provision/gcp/constants.py +218 -0
  217. sky/provision/gcp/instance.py +36 -8
  218. sky/provision/gcp/instance_utils.py +18 -4
  219. sky/provision/gcp/volume_utils.py +247 -0
  220. sky/provision/hyperbolic/__init__.py +12 -0
  221. sky/provision/hyperbolic/config.py +10 -0
  222. sky/provision/hyperbolic/instance.py +437 -0
  223. sky/provision/hyperbolic/utils.py +373 -0
  224. sky/provision/instance_setup.py +101 -20
  225. sky/provision/kubernetes/__init__.py +5 -0
  226. sky/provision/kubernetes/config.py +9 -52
  227. sky/provision/kubernetes/constants.py +17 -0
  228. sky/provision/kubernetes/instance.py +919 -280
  229. sky/provision/kubernetes/manifests/fusermount-server-daemonset.yaml +1 -2
  230. sky/provision/kubernetes/network.py +27 -17
  231. sky/provision/kubernetes/network_utils.py +44 -43
  232. sky/provision/kubernetes/utils.py +1221 -534
  233. sky/provision/kubernetes/volume.py +343 -0
  234. sky/provision/lambda_cloud/instance.py +22 -16
  235. sky/provision/nebius/constants.py +50 -0
  236. sky/provision/nebius/instance.py +19 -6
  237. sky/provision/nebius/utils.py +237 -137
  238. sky/provision/oci/instance.py +10 -5
  239. sky/provision/paperspace/instance.py +10 -7
  240. sky/provision/paperspace/utils.py +1 -1
  241. sky/provision/primeintellect/__init__.py +10 -0
  242. sky/provision/primeintellect/config.py +11 -0
  243. sky/provision/primeintellect/instance.py +454 -0
  244. sky/provision/primeintellect/utils.py +398 -0
  245. sky/provision/provisioner.py +117 -36
  246. sky/provision/runpod/__init__.py +5 -0
  247. sky/provision/runpod/instance.py +27 -6
  248. sky/provision/runpod/utils.py +51 -18
  249. sky/provision/runpod/volume.py +214 -0
  250. sky/provision/scp/__init__.py +15 -0
  251. sky/provision/scp/config.py +93 -0
  252. sky/provision/scp/instance.py +707 -0
  253. sky/provision/seeweb/__init__.py +11 -0
  254. sky/provision/seeweb/config.py +13 -0
  255. sky/provision/seeweb/instance.py +812 -0
  256. sky/provision/shadeform/__init__.py +11 -0
  257. sky/provision/shadeform/config.py +12 -0
  258. sky/provision/shadeform/instance.py +351 -0
  259. sky/provision/shadeform/shadeform_utils.py +83 -0
  260. sky/provision/ssh/__init__.py +18 -0
  261. sky/provision/vast/instance.py +13 -8
  262. sky/provision/vast/utils.py +10 -7
  263. sky/provision/volume.py +164 -0
  264. sky/provision/vsphere/common/ssl_helper.py +1 -1
  265. sky/provision/vsphere/common/vapiconnect.py +2 -1
  266. sky/provision/vsphere/common/vim_utils.py +4 -4
  267. sky/provision/vsphere/instance.py +15 -10
  268. sky/provision/vsphere/vsphere_utils.py +17 -20
  269. sky/py.typed +0 -0
  270. sky/resources.py +845 -119
  271. sky/schemas/__init__.py +0 -0
  272. sky/schemas/api/__init__.py +0 -0
  273. sky/schemas/api/responses.py +227 -0
  274. sky/schemas/db/README +4 -0
  275. sky/schemas/db/env.py +90 -0
  276. sky/schemas/db/global_user_state/001_initial_schema.py +124 -0
  277. sky/schemas/db/global_user_state/002_add_workspace_to_cluster_history.py +35 -0
  278. sky/schemas/db/global_user_state/003_fix_initial_revision.py +61 -0
  279. sky/schemas/db/global_user_state/004_is_managed.py +34 -0
  280. sky/schemas/db/global_user_state/005_cluster_event.py +32 -0
  281. sky/schemas/db/global_user_state/006_provision_log.py +41 -0
  282. sky/schemas/db/global_user_state/007_cluster_event_request_id.py +34 -0
  283. sky/schemas/db/global_user_state/008_skylet_ssh_tunnel_metadata.py +34 -0
  284. sky/schemas/db/global_user_state/009_last_activity_and_launched_at.py +89 -0
  285. sky/schemas/db/global_user_state/010_save_ssh_key.py +66 -0
  286. sky/schemas/db/global_user_state/011_is_ephemeral.py +34 -0
  287. sky/schemas/db/kv_cache/001_initial_schema.py +29 -0
  288. sky/schemas/db/script.py.mako +28 -0
  289. sky/schemas/db/serve_state/001_initial_schema.py +67 -0
  290. sky/schemas/db/serve_state/002_yaml_content.py +34 -0
  291. sky/schemas/db/skypilot_config/001_initial_schema.py +30 -0
  292. sky/schemas/db/spot_jobs/001_initial_schema.py +97 -0
  293. sky/schemas/db/spot_jobs/002_cluster_pool.py +42 -0
  294. sky/schemas/db/spot_jobs/003_pool_hash.py +34 -0
  295. sky/schemas/db/spot_jobs/004_job_file_contents.py +42 -0
  296. sky/schemas/db/spot_jobs/005_logs_gc.py +38 -0
  297. sky/schemas/db/spot_jobs/006_controller_pid_started_at.py +34 -0
  298. sky/schemas/db/spot_jobs/007_config_file_content.py +34 -0
  299. sky/schemas/generated/__init__.py +0 -0
  300. sky/schemas/generated/autostopv1_pb2.py +36 -0
  301. sky/schemas/generated/autostopv1_pb2.pyi +43 -0
  302. sky/schemas/generated/autostopv1_pb2_grpc.py +146 -0
  303. sky/schemas/generated/jobsv1_pb2.py +86 -0
  304. sky/schemas/generated/jobsv1_pb2.pyi +254 -0
  305. sky/schemas/generated/jobsv1_pb2_grpc.py +542 -0
  306. sky/schemas/generated/managed_jobsv1_pb2.py +76 -0
  307. sky/schemas/generated/managed_jobsv1_pb2.pyi +278 -0
  308. sky/schemas/generated/managed_jobsv1_pb2_grpc.py +278 -0
  309. sky/schemas/generated/servev1_pb2.py +58 -0
  310. sky/schemas/generated/servev1_pb2.pyi +115 -0
  311. sky/schemas/generated/servev1_pb2_grpc.py +322 -0
  312. sky/serve/autoscalers.py +357 -5
  313. sky/serve/client/impl.py +310 -0
  314. sky/serve/client/sdk.py +47 -139
  315. sky/serve/client/sdk_async.py +130 -0
  316. sky/serve/constants.py +12 -9
  317. sky/serve/controller.py +68 -17
  318. sky/serve/load_balancer.py +106 -60
  319. sky/serve/load_balancing_policies.py +116 -2
  320. sky/serve/replica_managers.py +434 -249
  321. sky/serve/serve_rpc_utils.py +179 -0
  322. sky/serve/serve_state.py +569 -257
  323. sky/serve/serve_utils.py +775 -265
  324. sky/serve/server/core.py +66 -711
  325. sky/serve/server/impl.py +1093 -0
  326. sky/serve/server/server.py +21 -18
  327. sky/serve/service.py +192 -89
  328. sky/serve/service_spec.py +144 -20
  329. sky/serve/spot_placer.py +3 -0
  330. sky/server/auth/__init__.py +0 -0
  331. sky/server/auth/authn.py +50 -0
  332. sky/server/auth/loopback.py +38 -0
  333. sky/server/auth/oauth2_proxy.py +202 -0
  334. sky/server/common.py +478 -182
  335. sky/server/config.py +85 -23
  336. sky/server/constants.py +44 -6
  337. sky/server/daemons.py +295 -0
  338. sky/server/html/token_page.html +185 -0
  339. sky/server/metrics.py +160 -0
  340. sky/server/middleware_utils.py +166 -0
  341. sky/server/requests/executor.py +558 -138
  342. sky/server/requests/payloads.py +364 -24
  343. sky/server/requests/preconditions.py +21 -17
  344. sky/server/requests/process.py +112 -29
  345. sky/server/requests/request_names.py +121 -0
  346. sky/server/requests/requests.py +822 -226
  347. sky/server/requests/serializers/decoders.py +82 -31
  348. sky/server/requests/serializers/encoders.py +140 -22
  349. sky/server/requests/threads.py +117 -0
  350. sky/server/rest.py +455 -0
  351. sky/server/server.py +1309 -285
  352. sky/server/state.py +20 -0
  353. sky/server/stream_utils.py +327 -61
  354. sky/server/uvicorn.py +217 -3
  355. sky/server/versions.py +270 -0
  356. sky/setup_files/MANIFEST.in +11 -1
  357. sky/setup_files/alembic.ini +160 -0
  358. sky/setup_files/dependencies.py +139 -31
  359. sky/setup_files/setup.py +44 -42
  360. sky/sky_logging.py +114 -7
  361. sky/skylet/attempt_skylet.py +106 -24
  362. sky/skylet/autostop_lib.py +129 -8
  363. sky/skylet/configs.py +29 -20
  364. sky/skylet/constants.py +216 -25
  365. sky/skylet/events.py +101 -21
  366. sky/skylet/job_lib.py +345 -164
  367. sky/skylet/log_lib.py +297 -18
  368. sky/skylet/log_lib.pyi +44 -1
  369. sky/skylet/providers/ibm/node_provider.py +12 -8
  370. sky/skylet/providers/ibm/vpc_provider.py +13 -12
  371. sky/skylet/ray_patches/__init__.py +17 -3
  372. sky/skylet/ray_patches/autoscaler.py.diff +18 -0
  373. sky/skylet/ray_patches/cli.py.diff +19 -0
  374. sky/skylet/ray_patches/command_runner.py.diff +17 -0
  375. sky/skylet/ray_patches/log_monitor.py.diff +20 -0
  376. sky/skylet/ray_patches/resource_demand_scheduler.py.diff +32 -0
  377. sky/skylet/ray_patches/updater.py.diff +18 -0
  378. sky/skylet/ray_patches/worker.py.diff +41 -0
  379. sky/skylet/runtime_utils.py +21 -0
  380. sky/skylet/services.py +568 -0
  381. sky/skylet/skylet.py +72 -4
  382. sky/skylet/subprocess_daemon.py +104 -29
  383. sky/skypilot_config.py +506 -99
  384. sky/ssh_node_pools/__init__.py +1 -0
  385. sky/ssh_node_pools/core.py +135 -0
  386. sky/ssh_node_pools/server.py +233 -0
  387. sky/task.py +685 -163
  388. sky/templates/aws-ray.yml.j2 +11 -3
  389. sky/templates/azure-ray.yml.j2 +2 -1
  390. sky/templates/cudo-ray.yml.j2 +1 -0
  391. sky/templates/do-ray.yml.j2 +2 -1
  392. sky/templates/fluidstack-ray.yml.j2 +1 -0
  393. sky/templates/gcp-ray.yml.j2 +62 -1
  394. sky/templates/hyperbolic-ray.yml.j2 +68 -0
  395. sky/templates/ibm-ray.yml.j2 +2 -1
  396. sky/templates/jobs-controller.yaml.j2 +27 -24
  397. sky/templates/kubernetes-loadbalancer.yml.j2 +2 -0
  398. sky/templates/kubernetes-ray.yml.j2 +611 -50
  399. sky/templates/lambda-ray.yml.j2 +2 -1
  400. sky/templates/nebius-ray.yml.j2 +34 -12
  401. sky/templates/oci-ray.yml.j2 +1 -0
  402. sky/templates/paperspace-ray.yml.j2 +2 -1
  403. sky/templates/primeintellect-ray.yml.j2 +72 -0
  404. sky/templates/runpod-ray.yml.j2 +10 -1
  405. sky/templates/scp-ray.yml.j2 +4 -50
  406. sky/templates/seeweb-ray.yml.j2 +171 -0
  407. sky/templates/shadeform-ray.yml.j2 +73 -0
  408. sky/templates/sky-serve-controller.yaml.j2 +22 -2
  409. sky/templates/vast-ray.yml.j2 +1 -0
  410. sky/templates/vsphere-ray.yml.j2 +1 -0
  411. sky/templates/websocket_proxy.py +212 -37
  412. sky/usage/usage_lib.py +31 -15
  413. sky/users/__init__.py +0 -0
  414. sky/users/model.conf +15 -0
  415. sky/users/permission.py +397 -0
  416. sky/users/rbac.py +121 -0
  417. sky/users/server.py +720 -0
  418. sky/users/token_service.py +218 -0
  419. sky/utils/accelerator_registry.py +35 -5
  420. sky/utils/admin_policy_utils.py +84 -38
  421. sky/utils/annotations.py +38 -5
  422. sky/utils/asyncio_utils.py +78 -0
  423. sky/utils/atomic.py +1 -1
  424. sky/utils/auth_utils.py +153 -0
  425. sky/utils/benchmark_utils.py +60 -0
  426. sky/utils/cli_utils/status_utils.py +159 -86
  427. sky/utils/cluster_utils.py +31 -9
  428. sky/utils/command_runner.py +354 -68
  429. sky/utils/command_runner.pyi +93 -3
  430. sky/utils/common.py +35 -8
  431. sky/utils/common_utils.py +314 -91
  432. sky/utils/config_utils.py +74 -5
  433. sky/utils/context.py +403 -0
  434. sky/utils/context_utils.py +242 -0
  435. sky/utils/controller_utils.py +383 -89
  436. sky/utils/dag_utils.py +31 -12
  437. sky/utils/db/__init__.py +0 -0
  438. sky/utils/db/db_utils.py +485 -0
  439. sky/utils/db/kv_cache.py +149 -0
  440. sky/utils/db/migration_utils.py +137 -0
  441. sky/utils/directory_utils.py +12 -0
  442. sky/utils/env_options.py +13 -0
  443. sky/utils/git.py +567 -0
  444. sky/utils/git_clone.sh +460 -0
  445. sky/utils/infra_utils.py +195 -0
  446. sky/utils/kubernetes/cleanup-tunnel.sh +62 -0
  447. sky/utils/kubernetes/config_map_utils.py +133 -0
  448. sky/utils/kubernetes/create_cluster.sh +15 -29
  449. sky/utils/kubernetes/delete_cluster.sh +10 -7
  450. sky/utils/kubernetes/deploy_ssh_node_pools.py +1177 -0
  451. sky/utils/kubernetes/exec_kubeconfig_converter.py +22 -31
  452. sky/utils/kubernetes/generate_kind_config.py +6 -66
  453. sky/utils/kubernetes/generate_kubeconfig.sh +4 -1
  454. sky/utils/kubernetes/gpu_labeler.py +18 -8
  455. sky/utils/kubernetes/k8s_gpu_labeler_job.yaml +2 -1
  456. sky/utils/kubernetes/k8s_gpu_labeler_setup.yaml +16 -16
  457. sky/utils/kubernetes/kubernetes_deploy_utils.py +284 -114
  458. sky/utils/kubernetes/rsync_helper.sh +11 -3
  459. sky/utils/kubernetes/ssh-tunnel.sh +379 -0
  460. sky/utils/kubernetes/ssh_utils.py +221 -0
  461. sky/utils/kubernetes_enums.py +8 -15
  462. sky/utils/lock_events.py +94 -0
  463. sky/utils/locks.py +416 -0
  464. sky/utils/log_utils.py +82 -107
  465. sky/utils/perf_utils.py +22 -0
  466. sky/utils/resource_checker.py +298 -0
  467. sky/utils/resources_utils.py +249 -32
  468. sky/utils/rich_utils.py +217 -39
  469. sky/utils/schemas.py +955 -160
  470. sky/utils/serialize_utils.py +16 -0
  471. sky/utils/status_lib.py +10 -0
  472. sky/utils/subprocess_utils.py +29 -15
  473. sky/utils/tempstore.py +70 -0
  474. sky/utils/thread_utils.py +91 -0
  475. sky/utils/timeline.py +26 -53
  476. sky/utils/ux_utils.py +84 -15
  477. sky/utils/validator.py +11 -1
  478. sky/utils/volume.py +165 -0
  479. sky/utils/yaml_utils.py +111 -0
  480. sky/volumes/__init__.py +13 -0
  481. sky/volumes/client/__init__.py +0 -0
  482. sky/volumes/client/sdk.py +150 -0
  483. sky/volumes/server/__init__.py +0 -0
  484. sky/volumes/server/core.py +270 -0
  485. sky/volumes/server/server.py +124 -0
  486. sky/volumes/volume.py +215 -0
  487. sky/workspaces/__init__.py +0 -0
  488. sky/workspaces/core.py +655 -0
  489. sky/workspaces/server.py +101 -0
  490. sky/workspaces/utils.py +56 -0
  491. sky_templates/README.md +3 -0
  492. sky_templates/__init__.py +3 -0
  493. sky_templates/ray/__init__.py +0 -0
  494. sky_templates/ray/start_cluster +183 -0
  495. sky_templates/ray/stop_cluster +75 -0
  496. skypilot_nightly-1.0.0.dev20251203.dist-info/METADATA +676 -0
  497. skypilot_nightly-1.0.0.dev20251203.dist-info/RECORD +611 -0
  498. {skypilot_nightly-1.0.0.dev20250502.dist-info → skypilot_nightly-1.0.0.dev20251203.dist-info}/WHEEL +1 -1
  499. skypilot_nightly-1.0.0.dev20251203.dist-info/top_level.txt +2 -0
  500. sky/benchmark/benchmark_state.py +0 -256
  501. sky/benchmark/benchmark_utils.py +0 -641
  502. sky/clouds/service_catalog/constants.py +0 -7
  503. sky/dashboard/out/_next/static/GWvVBSCS7FmUiVmjaL1a7/_buildManifest.js +0 -1
  504. sky/dashboard/out/_next/static/chunks/236-2db3ee3fba33dd9e.js +0 -6
  505. sky/dashboard/out/_next/static/chunks/312-c3c8845990db8ffc.js +0 -15
  506. sky/dashboard/out/_next/static/chunks/37-0a572fe0dbb89c4d.js +0 -6
  507. sky/dashboard/out/_next/static/chunks/678-206dddca808e6d16.js +0 -59
  508. sky/dashboard/out/_next/static/chunks/845-9e60713e0c441abc.js +0 -1
  509. sky/dashboard/out/_next/static/chunks/979-7bf73a4c7cea0f5c.js +0 -1
  510. sky/dashboard/out/_next/static/chunks/fd9d1056-2821b0f0cabcd8bd.js +0 -1
  511. sky/dashboard/out/_next/static/chunks/framework-87d061ee6ed71b28.js +0 -33
  512. sky/dashboard/out/_next/static/chunks/main-app-241eb28595532291.js +0 -1
  513. sky/dashboard/out/_next/static/chunks/main-e0e2335212e72357.js +0 -1
  514. sky/dashboard/out/_next/static/chunks/pages/_app-e6b013bc3f77ad60.js +0 -1
  515. sky/dashboard/out/_next/static/chunks/pages/_error-1be831200e60c5c0.js +0 -1
  516. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-6ac338bc2239cb45.js +0 -1
  517. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-f383db7389368ea7.js +0 -1
  518. sky/dashboard/out/_next/static/chunks/pages/clusters-a93b93e10b8b074e.js +0 -1
  519. sky/dashboard/out/_next/static/chunks/pages/index-f9f039532ca8cbc4.js +0 -1
  520. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-1c519e1afc523dc9.js +0 -1
  521. sky/dashboard/out/_next/static/chunks/pages/jobs-a75029b67aab6a2e.js +0 -1
  522. sky/dashboard/out/_next/static/chunks/webpack-830f59b8404e96b8.js +0 -1
  523. sky/dashboard/out/_next/static/css/c6933bbb2ce7f4dd.css +0 -3
  524. sky/jobs/dashboard/dashboard.py +0 -223
  525. sky/jobs/dashboard/static/favicon.ico +0 -0
  526. sky/jobs/dashboard/templates/index.html +0 -831
  527. sky/jobs/server/dashboard_utils.py +0 -69
  528. sky/skylet/providers/scp/__init__.py +0 -2
  529. sky/skylet/providers/scp/config.py +0 -149
  530. sky/skylet/providers/scp/node_provider.py +0 -578
  531. sky/templates/kubernetes-ssh-jump.yml.j2 +0 -94
  532. sky/utils/db_utils.py +0 -100
  533. sky/utils/kubernetes/deploy_remote_cluster.sh +0 -308
  534. sky/utils/kubernetes/ssh_jump_lifecycle_manager.py +0 -191
  535. skypilot_nightly-1.0.0.dev20250502.dist-info/METADATA +0 -361
  536. skypilot_nightly-1.0.0.dev20250502.dist-info/RECORD +0 -396
  537. skypilot_nightly-1.0.0.dev20250502.dist-info/top_level.txt +0 -1
  538. /sky/{clouds/service_catalog → catalog}/config.py +0 -0
  539. /sky/{benchmark → catalog/data_fetchers}/__init__.py +0 -0
  540. /sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_azure.py +0 -0
  541. /sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_fluidstack.py +0 -0
  542. /sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_ibm.py +0 -0
  543. /sky/{clouds/service_catalog/data_fetchers → client/cli}/__init__.py +0 -0
  544. /sky/dashboard/out/_next/static/{GWvVBSCS7FmUiVmjaL1a7 → 96_E2yl3QAiIJGOYCkSpB}/_ssgManifest.js +0 -0
  545. {skypilot_nightly-1.0.0.dev20250502.dist-info → skypilot_nightly-1.0.0.dev20251203.dist-info}/entry_points.txt +0 -0
  546. {skypilot_nightly-1.0.0.dev20250502.dist-info → skypilot_nightly-1.0.0.dev20251203.dist-info}/licenses/LICENSE +0 -0
sky/skylet/services.py ADDED
@@ -0,0 +1,568 @@
1
+ """gRPC service implementations for skylet."""
2
+
3
+ import os
4
+ from typing import List, Optional
5
+
6
+ import grpc
7
+
8
+ from sky import exceptions
9
+ from sky import sky_logging
10
+ from sky.jobs import state as managed_job_state
11
+ from sky.jobs import utils as managed_job_utils
12
+ from sky.schemas.generated import autostopv1_pb2
13
+ from sky.schemas.generated import autostopv1_pb2_grpc
14
+ from sky.schemas.generated import jobsv1_pb2
15
+ from sky.schemas.generated import jobsv1_pb2_grpc
16
+ from sky.schemas.generated import managed_jobsv1_pb2
17
+ from sky.schemas.generated import managed_jobsv1_pb2_grpc
18
+ from sky.schemas.generated import servev1_pb2
19
+ from sky.schemas.generated import servev1_pb2_grpc
20
+ from sky.serve import serve_rpc_utils
21
+ from sky.serve import serve_state
22
+ from sky.serve import serve_utils
23
+ from sky.skylet import autostop_lib
24
+ from sky.skylet import constants
25
+ from sky.skylet import job_lib
26
+ from sky.skylet import log_lib
27
+
28
+ logger = sky_logging.init_logger(__name__)
29
+
30
+ # In the worst case, flush the log buffer every 50ms,
31
+ # to ensure responsiveness.
32
+ DEFAULT_LOG_CHUNK_FLUSH_INTERVAL = 0.05
33
+
34
+
35
class AutostopServiceImpl(autostopv1_pb2_grpc.AutostopServiceServicer):
    """Skylet-side servicer for the AutostopService gRPC API.

    Each handler reports any exception it hits by aborting the RPC with
    ``grpc.StatusCode.INTERNAL`` and the stringified exception as details.
    """

    def SetAutostop(  # type: ignore[return]
        self, request: autostopv1_pb2.SetAutostopRequest,
        context: grpc.ServicerContext
    ) -> autostopv1_pb2.SetAutostopResponse:
        """Applies the autostop settings carried by the request."""
        try:
            requested_wait_for = autostop_lib.AutostopWaitFor.from_protobuf(
                request.wait_for)
            # Fall back to the library default when the request does not
            # translate to a wait-for mode.
            if requested_wait_for is None:
                requested_wait_for = autostop_lib.DEFAULT_AUTOSTOP_WAIT_FOR
            autostop_lib.set_autostop(idle_minutes=request.idle_minutes,
                                      backend=request.backend,
                                      wait_for=requested_wait_for,
                                      down=request.down)
            return autostopv1_pb2.SetAutostopResponse()
        except Exception as e:  # pylint: disable=broad-except
            context.abort(grpc.StatusCode.INTERNAL, str(e))

    def IsAutostopping(  # type: ignore[return]
        self, request: autostopv1_pb2.IsAutostoppingRequest,
        context: grpc.ServicerContext
    ) -> autostopv1_pb2.IsAutostoppingResponse:
        """Reports the value of ``autostop_lib.get_is_autostopping()``."""
        try:
            return autostopv1_pb2.IsAutostoppingResponse(
                is_autostopping=autostop_lib.get_is_autostopping())
        except Exception as e:  # pylint: disable=broad-except
            context.abort(grpc.StatusCode.INTERNAL, str(e))
67
+
68
+
69
class ServeServiceImpl(servev1_pb2_grpc.ServeServiceServicer):
    """Skylet-side servicer for the ServeService gRPC API.

    Each handler reports any exception it hits by aborting the RPC with
    ``grpc.StatusCode.INTERNAL`` and the stringified exception as details.
    """

    # NOTE (kyuds): this grpc service will run cluster-side,
    # thus guaranteeing that SERVE_VERSION is above 5.
    # Therefore, we removed some SERVE_VERSION checks
    # present in the original codegen.

    def GetServiceStatus(  # type: ignore[return]
        self, request: servev1_pb2.GetServiceStatusRequest,
        context: grpc.ServicerContext
    ) -> servev1_pb2.GetServiceStatusResponse:
        """Returns the status of the requested services."""
        try:
            request_converter = serve_rpc_utils.GetServiceStatusRequestConverter
            names, pool = request_converter.from_proto(request)
            pickled_statuses = serve_utils.get_service_status_pickled(
                names, pool)
            response_converter = (
                serve_rpc_utils.GetServiceStatusResponseConverter)
            return response_converter.to_proto(pickled_statuses)
        except Exception as e:  # pylint: disable=broad-except
            context.abort(grpc.StatusCode.INTERNAL, str(e))

    def AddVersion(  # type: ignore[return]
            self, request: servev1_pb2.AddVersionRequest,
            context: grpc.ServicerContext) -> servev1_pb2.AddVersionResponse:
        """Adds a version for the named service and returns it."""
        try:
            new_version = serve_state.add_version(request.service_name)
            return servev1_pb2.AddVersionResponse(version=new_version)
        except Exception as e:  # pylint: disable=broad-except
            context.abort(grpc.StatusCode.INTERNAL, str(e))

    def TerminateServices(  # type: ignore[return]
        self, request: servev1_pb2.TerminateServicesRequest,
        context: grpc.ServicerContext
    ) -> servev1_pb2.TerminateServicesResponse:
        """Terminates the requested services and returns the result message."""
        try:
            request_converter = (
                serve_rpc_utils.TerminateServicesRequestConverter)
            names, purge, pool = request_converter.from_proto(request)
            result = serve_utils.terminate_services(names, purge, pool)
            return servev1_pb2.TerminateServicesResponse(message=result)
        except Exception as e:  # pylint: disable=broad-except
            context.abort(grpc.StatusCode.INTERNAL, str(e))

    def TerminateReplica(  # type: ignore[return]
        self, request: servev1_pb2.TerminateReplicaRequest,
        context: grpc.ServicerContext
    ) -> servev1_pb2.TerminateReplicaResponse:
        """Terminates one replica of a service and returns the result message."""
        try:
            result = serve_utils.terminate_replica(request.service_name,
                                                   request.replica_id,
                                                   request.purge)
            return servev1_pb2.TerminateReplicaResponse(message=result)
        except Exception as e:  # pylint: disable=broad-except
            context.abort(grpc.StatusCode.INTERNAL, str(e))

    def WaitServiceRegistration(  # type: ignore[return]
        self, request: servev1_pb2.WaitServiceRegistrationRequest,
        context: grpc.ServicerContext
    ) -> servev1_pb2.WaitServiceRegistrationResponse:
        """Waits for the service to register and returns its LB port."""
        try:
            encoded_result = serve_utils.wait_service_registration(
                request.service_name, request.job_id, request.pool)
            # The wait call returns an encoded payload; decode it to obtain
            # the load balancer port.
            port = serve_utils.load_service_initialization_result(
                encoded_result)
            return servev1_pb2.WaitServiceRegistrationResponse(lb_port=port)
        except Exception as e:  # pylint: disable=broad-except
            context.abort(grpc.StatusCode.INTERNAL, str(e))

    def UpdateService(  # type: ignore[return]
            self, request: servev1_pb2.UpdateServiceRequest,
            context: grpc.ServicerContext) -> servev1_pb2.UpdateServiceResponse:
        """Updates the named service to the requested version and mode."""
        try:
            serve_utils.update_service_encoded(request.service_name,
                                               request.version, request.mode,
                                               request.pool)
            return servev1_pb2.UpdateServiceResponse()
        except Exception as e:  # pylint: disable=broad-except
            context.abort(grpc.StatusCode.INTERNAL, str(e))
161
+
162
+
163
class JobsServiceImpl(jobsv1_pb2_grpc.JobsServiceServicer):
    """Implementation of the JobsService gRPC service.

    Unless noted otherwise, a handler that hits an unexpected exception
    aborts the RPC with ``grpc.StatusCode.INTERNAL`` and the stringified
    exception as details.

    NOTE: ``context.abort()`` always raises. It must therefore never be
    called for a non-INTERNAL status from inside a ``try`` block guarded by
    ``except Exception``: the handler would catch the abort and replace the
    intended status with INTERNAL and empty details.
    """

    def AddJob(  # type: ignore[return]
            self, request: jobsv1_pb2.AddJobRequest,
            context: grpc.ServicerContext) -> jobsv1_pb2.AddJobResponse:
        """Registers a job via ``job_lib.add_job``.

        Returns:
            The job id and log directory returned by ``job_lib.add_job``.
        """
        try:
            # '-' is used as the job name when the request has none.
            job_name = request.job_name if request.HasField('job_name') else '-'
            job_id, log_dir = job_lib.add_job(job_name, request.username,
                                              request.run_timestamp,
                                              request.resources_str,
                                              request.metadata)
            return jobsv1_pb2.AddJobResponse(job_id=job_id, log_dir=log_dir)
        except Exception as e:  # pylint: disable=broad-except
            context.abort(grpc.StatusCode.INTERNAL, str(e))

    def QueueJob(  # type: ignore[return]
            self, request: jobsv1_pb2.QueueJobRequest,
            context: grpc.ServicerContext) -> jobsv1_pb2.QueueJobResponse:
        """Prepares the job's log file and script, then queues the job.

        When the request carries a ``managed_job`` payload, the managed job
        and each of its tasks are also recorded in the managed job state.
        """
        try:
            job_id = request.job_id
            # Create log directory and file
            remote_log_dir = os.path.expanduser(request.remote_log_dir)
            os.makedirs(remote_log_dir, exist_ok=True)
            remote_log_path = os.path.join(remote_log_dir, 'run.log')
            # Touch the log file so that it exists before the job starts.
            with open(remote_log_path, 'a', encoding='utf-8'):
                pass

            script_path = os.path.expanduser(request.script_path)
            os.makedirs(os.path.dirname(script_path), exist_ok=True)

            # If `codegen` is not provided, assume script is already
            # uploaded to `script_path` via rsync.
            if request.HasField('codegen'):
                with open(script_path, 'w', encoding='utf-8') as f:
                    f.write(request.codegen)
                os.chmod(script_path, 0o755)

            cd = f'cd {constants.SKY_REMOTE_WORKDIR}'
            job_submit_cmd = (
                # JOB_CMD_IDENTIFIER is used for identifying the process
                # retrieved with pid is the same driver process.
                f'{job_lib.JOB_CMD_IDENTIFIER.format(job_id)} && '
                f'{cd} && {constants.SKY_PYTHON_CMD} -u {script_path}'
                # Do not use &>, which is not POSIX and may not work.
                # Note that the order of ">filename 2>&1" matters.
                f' > {remote_log_path} 2>&1')
            job_lib.scheduler.queue(job_id, job_submit_cmd)

            if request.HasField('managed_job'):
                managed_job = request.managed_job
                pool = managed_job.pool if managed_job.HasField(
                    'pool') else None
                pool_hash = None
                if pool is not None:
                    pool_hash = serve_state.get_service_hash(pool)
                # Add the managed job to job queue database.
                user_id = managed_job.user_id if managed_job.HasField(
                    'user_id') else None
                managed_job_state.set_job_info(job_id, managed_job.name,
                                               managed_job.workspace,
                                               managed_job.entrypoint, pool,
                                               pool_hash, user_id)
                # Set the managed job to PENDING state to make sure that
                # this managed job appears in the `sky jobs queue`, even
                # if it needs to wait to be submitted.
                # We cannot set the managed job to PENDING state in the
                # job template (jobs-controller.yaml.j2), as it may need
                # to wait for the run commands to be scheduled on the job
                # controller in high-load cases.
                for task in managed_job.tasks:
                    managed_job_state.set_pending(job_id, task.task_id,
                                                  task.name, task.resources_str,
                                                  task.metadata_json)
            return jobsv1_pb2.QueueJobResponse()
        except Exception as e:  # pylint: disable=broad-except
            context.abort(grpc.StatusCode.INTERNAL, str(e))

    def UpdateStatus(  # type: ignore[return]
            self, request: jobsv1_pb2.UpdateStatusRequest,
            context: grpc.ServicerContext) -> jobsv1_pb2.UpdateStatusResponse:
        """Triggers ``job_lib.update_status()``."""
        try:
            job_lib.update_status()
            return jobsv1_pb2.UpdateStatusResponse()
        except Exception as e:  # pylint: disable=broad-except
            context.abort(grpc.StatusCode.INTERNAL, str(e))

    def GetJobQueue(  # type: ignore[return]
            self, request: jobsv1_pb2.GetJobQueueRequest,
            context: grpc.ServicerContext) -> jobsv1_pb2.GetJobQueueResponse:
        """Returns job info, optionally filtered by ``user_hash``."""
        try:
            user_hash = request.user_hash if request.HasField(
                'user_hash') else None
            jobs_info = job_lib.get_jobs_info(user_hash=user_hash,
                                              all_jobs=request.all_jobs)
            return jobsv1_pb2.GetJobQueueResponse(jobs=jobs_info)
        except Exception as e:  # pylint: disable=broad-except
            context.abort(grpc.StatusCode.INTERNAL, str(e))

    def CancelJobs(  # type: ignore[return]
            self, request: jobsv1_pb2.CancelJobsRequest,
            context: grpc.ServicerContext) -> jobsv1_pb2.CancelJobsResponse:
        """Cancels jobs and returns the ids reported as cancelled."""
        try:
            job_ids = list(request.job_ids) if request.job_ids else []
            user_hash = request.user_hash if request.HasField(
                'user_hash') else None
            cancelled_job_ids = job_lib.cancel_jobs(job_ids, request.cancel_all,
                                                    user_hash)
            return jobsv1_pb2.CancelJobsResponse(
                cancelled_job_ids=cancelled_job_ids)
        except Exception as e:  # pylint: disable=broad-except
            context.abort(grpc.StatusCode.INTERNAL, str(e))

    def FailAllInProgressJobs(  # type: ignore[return]
        self, _: jobsv1_pb2.FailAllInProgressJobsRequest,
        context: grpc.ServicerContext
    ) -> jobsv1_pb2.FailAllInProgressJobsResponse:
        """Calls ``job_lib.fail_all_jobs_in_progress()``."""
        try:
            job_lib.fail_all_jobs_in_progress()
            return jobsv1_pb2.FailAllInProgressJobsResponse()
        except Exception as e:  # pylint: disable=broad-except
            context.abort(grpc.StatusCode.INTERNAL, str(e))

    def TailLogs(
            self,
            request: jobsv1_pb2.TailLogsRequest,  # type: ignore[return]
            context: grpc.ServicerContext):
        """Streams a job's log lines, followed by one exit-code message.

        Yields:
            ``TailLogsResponse`` messages carrying ``log_line``, then a final
            message carrying ``exit_code``.
        """
        buffer = log_lib.LogBuffer()
        try:
            # Default to the latest job when no job id is given.
            job_id = request.job_id if request.HasField(
                'job_id') else job_lib.get_latest_job_id()
            managed_job_id = request.managed_job_id if request.HasField(
                'managed_job_id') else None
            log_dir = job_lib.get_log_dir_for_job(job_id)
            if log_dir is None:
                # Fall back to a directory derived from the run timestamp.
                run_timestamp = job_lib.get_run_timestamp(job_id)
                log_dir = None if run_timestamp is None else os.path.join(
                    constants.SKY_LOGS_DIRECTORY, run_timestamp)

            for line in log_lib.buffered_iter_with_timeout(
                    buffer,
                    log_lib.tail_logs_iter(job_id, log_dir, managed_job_id,
                                           request.follow, request.tail),
                    DEFAULT_LOG_CHUNK_FLUSH_INTERVAL):
                yield jobsv1_pb2.TailLogsResponse(log_line=line)

            job_status = job_lib.get_status(job_id)
            exit_code = exceptions.JobExitCode.from_job_status(job_status)
            # Fix for dashboard: When follow=False and job is still running
            # (NOT_FINISHED=101), exit with success (0) since fetching current
            # logs is a successful operation.
            # This prevents shell wrappers from printing "command terminated
            # with exit code 101".
            exit_code_int = 0 if not request.follow and int(
                exit_code) == 101 else int(exit_code)
            yield jobsv1_pb2.TailLogsResponse(exit_code=exit_code_int)
        except Exception as e:  # pylint: disable=broad-except
            context.abort(grpc.StatusCode.INTERNAL, str(e))
        finally:
            buffer.close()

    def GetJobStatus(  # type: ignore[return]
            self, request: jobsv1_pb2.GetJobStatusRequest,
            context: grpc.ServicerContext) -> jobsv1_pb2.GetJobStatusResponse:
        """Returns the protobuf status of the requested jobs.

        When no job ids are given, the latest job (if any) is used. A job
        whose status is ``None`` is reported as ``JOB_STATUS_UNSPECIFIED``.
        """
        try:
            if request.job_ids:
                job_ids = list(request.job_ids)
            else:
                latest_job_id = job_lib.get_latest_job_id()
                job_ids = [latest_job_id] if latest_job_id is not None else []
            # Build a new mapping instead of rewriting the dict being
            # iterated.
            job_statuses = {
                job_id: (job_lib.JobStatus(status).to_protobuf()
                         if status is not None else
                         jobsv1_pb2.JOB_STATUS_UNSPECIFIED)
                for job_id, status in job_lib.get_statuses(job_ids).items()
            }
            return jobsv1_pb2.GetJobStatusResponse(job_statuses=job_statuses)
        except Exception as e:  # pylint: disable=broad-except
            context.abort(grpc.StatusCode.INTERNAL, str(e))

    def GetJobSubmittedTimestamp(  # type: ignore[return]
        self, request: jobsv1_pb2.GetJobSubmittedTimestampRequest,
        context: grpc.ServicerContext
    ) -> jobsv1_pb2.GetJobSubmittedTimestampResponse:
        """Returns the submission timestamp of a job.

        Defaults to the latest job when the request has no ``job_id``.
        Aborts with ``NOT_FOUND`` when no timestamp is available.
        """
        job_id = None
        try:
            job_id = request.job_id if request.HasField(
                'job_id') else job_lib.get_latest_job_id()
            timestamp = job_lib.get_job_submitted_or_ended_timestamp(
                job_id, False)
            if timestamp is not None:
                return jobsv1_pb2.GetJobSubmittedTimestampResponse(
                    timestamp=timestamp)
        except Exception as e:  # pylint: disable=broad-except
            context.abort(grpc.StatusCode.INTERNAL, str(e))
        # Reached only when the lookup succeeded without a timestamp. This
        # abort is deliberately outside the try block: abort() raises, and
        # the broad handler above would otherwise turn NOT_FOUND into
        # INTERNAL with empty details.
        context.abort(grpc.StatusCode.NOT_FOUND, f'Job {job_id} not found')

    def GetJobEndedTimestamp(  # type: ignore[return]
        self, request: jobsv1_pb2.GetJobEndedTimestampRequest,
        context: grpc.ServicerContext
    ) -> jobsv1_pb2.GetJobEndedTimestampResponse:
        """Returns the end timestamp of a job.

        Defaults to the latest job when the request has no ``job_id``.
        Aborts with ``NOT_FOUND`` when no timestamp is available.
        """
        job_id = None
        try:
            job_id = request.job_id if request.HasField(
                'job_id') else job_lib.get_latest_job_id()
            timestamp = job_lib.get_job_submitted_or_ended_timestamp(
                job_id, True)
            if timestamp is not None:
                return jobsv1_pb2.GetJobEndedTimestampResponse(
                    timestamp=timestamp)
        except Exception as e:  # pylint: disable=broad-except
            context.abort(grpc.StatusCode.INTERNAL, str(e))
        # Kept outside the try block so the NOT_FOUND status is not caught
        # and replaced by the INTERNAL handler above (abort() raises).
        context.abort(grpc.StatusCode.NOT_FOUND,
                      f'Job {job_id} not found or not ended')

    def GetLogDirsForJobs(  # type: ignore[return]
        self, request: jobsv1_pb2.GetLogDirsForJobsRequest,
        context: grpc.ServicerContext
    ) -> jobsv1_pb2.GetLogDirsForJobsResponse:
        """Returns the log directories of the requested jobs.

        When no job ids are given, the latest job (if any) is used.
        """
        try:
            if request.job_ids:
                job_ids = list(request.job_ids)
            else:
                latest_job_id = job_lib.get_latest_job_id()
                job_ids = [latest_job_id] if latest_job_id is not None else []
            job_log_dirs = job_lib.get_job_log_dirs(job_ids)
            return jobsv1_pb2.GetLogDirsForJobsResponse(
                job_log_dirs=job_log_dirs)
        except Exception as e:  # pylint: disable=broad-except
            context.abort(grpc.StatusCode.INTERNAL, str(e))
389
+
390
+
391
class ManagedJobsServiceImpl(managed_jobsv1_pb2_grpc.ManagedJobsServiceServicer
                            ):
    """Implementation of the ManagedJobsService gRPC service.

    Unless noted otherwise, a handler that hits an unexpected exception
    aborts the RPC with ``grpc.StatusCode.INTERNAL`` and the stringified
    exception as details.

    NOTE: ``context.abort()`` always raises. It must therefore never be
    called for a non-INTERNAL status from inside a ``try`` block guarded by
    ``except Exception``: the handler would catch the abort and replace the
    intended status with INTERNAL and empty details.
    """

    def GetVersion(  # type: ignore[return]
        self, request: managed_jobsv1_pb2.GetVersionRequest,
        context: grpc.ServicerContext
    ) -> managed_jobsv1_pb2.GetVersionResponse:
        """Returns ``constants.SKYLET_VERSION`` as the controller version."""
        try:
            return managed_jobsv1_pb2.GetVersionResponse(
                controller_version=constants.SKYLET_VERSION)
        except Exception as e:  # pylint: disable=broad-except
            context.abort(grpc.StatusCode.INTERNAL, str(e))

    def GetJobTable(  # type: ignore[return]
        self, request: managed_jobsv1_pb2.GetJobTableRequest,
        context: grpc.ServicerContext
    ) -> managed_jobsv1_pb2.GetJobTableResponse:
        """Returns the managed job queue matching the request's filters.

        Optional request fields that are not set are passed to
        ``managed_job_utils.get_managed_job_queue`` as ``None``.
        """
        try:
            accessible_workspaces = (
                list(request.accessible_workspaces.workspaces)
                if request.HasField('accessible_workspaces') else None)
            job_ids = (list(request.job_ids.ids)
                       if request.HasField('job_ids') else None)
            user_hashes: Optional[List[Optional[str]]] = None
            if request.HasField('user_hashes'):
                user_hashes = list(request.user_hashes.hashes)
                # For backwards compatibility, we show jobs that do not have a
                # user_hash. TODO: Remove before 0.12.0.
                if request.show_jobs_without_user_hash:
                    user_hashes.append(None)
            statuses = (list(request.statuses.statuses)
                        if request.HasField('statuses') else None)
            fields = (list(request.fields.fields)
                      if request.HasField('fields') else None)
            job_queue = managed_job_utils.get_managed_job_queue(
                skip_finished=request.skip_finished,
                accessible_workspaces=accessible_workspaces,
                job_ids=job_ids,
                workspace_match=request.workspace_match
                if request.HasField('workspace_match') else None,
                name_match=request.name_match
                if request.HasField('name_match') else None,
                pool_match=request.pool_match
                if request.HasField('pool_match') else None,
                page=request.page if request.HasField('page') else None,
                limit=request.limit if request.HasField('limit') else None,
                user_hashes=user_hashes,
                statuses=statuses,
                fields=fields,
            )
            jobs = job_queue['jobs']
            total = job_queue['total']
            total_no_filter = job_queue['total_no_filter']
            status_counts = job_queue['status_counts']

            jobs_info = []
            for job in jobs:
                converted_metadata = None
                metadata = job.get('metadata')
                if metadata:
                    # Drop entries whose value is None before building the
                    # protobuf message.
                    converted_metadata = {
                        k: v for k, v in metadata.items() if v is not None
                    }
                schedule_state = job.get('schedule_state')
                if schedule_state is not None:
                    schedule_state = managed_job_state.ManagedJobScheduleState(
                        schedule_state).to_protobuf()
                job_info = managed_jobsv1_pb2.ManagedJobInfo(
                    # The `spot.job_id`, which can be used to identify
                    # different tasks for the same job
                    _job_id=job.get('_job_id'),
                    job_id=job.get('job_id'),
                    task_id=job.get('task_id'),
                    job_name=job.get('job_name'),
                    task_name=job.get('task_name'),
                    job_duration=job.get('job_duration'),
                    workspace=job.get('workspace'),
                    status=managed_job_state.ManagedJobStatus(
                        job.get('status')).to_protobuf(),
                    schedule_state=schedule_state,
                    resources=job.get('resources'),
                    cluster_resources=job.get('cluster_resources'),
                    cluster_resources_full=job.get('cluster_resources_full'),
                    cloud=job.get('cloud'),
                    region=job.get('region'),
                    infra=job.get('infra'),
                    accelerators=job.get('accelerators'),
                    recovery_count=job.get('recovery_count'),
                    details=job.get('details'),
                    failure_reason=job.get('failure_reason'),
                    user_name=job.get('user_name'),
                    user_hash=job.get('user_hash'),
                    submitted_at=job.get('submitted_at'),
                    start_at=job.get('start_at'),
                    end_at=job.get('end_at'),
                    user_yaml=job.get('user_yaml'),
                    entrypoint=job.get('entrypoint'),
                    metadata=converted_metadata,
                    pool=job.get('pool'),
                    pool_hash=job.get('pool_hash'))
                jobs_info.append(job_info)

            return managed_jobsv1_pb2.GetJobTableResponse(
                jobs=jobs_info,
                total=total,
                total_no_filter=total_no_filter,
                status_counts=status_counts)
        except Exception as e:  # pylint: disable=broad-except
            logger.error(e, exc_info=True)
            context.abort(grpc.StatusCode.INTERNAL, str(e))

    def GetAllJobIdsByName(  # type: ignore[return]
        self, request: managed_jobsv1_pb2.GetAllJobIdsByNameRequest,
        context: grpc.ServicerContext
    ) -> managed_jobsv1_pb2.GetAllJobIdsByNameResponse:
        """Returns the job ids from ``get_all_job_ids_by_name``.

        ``None`` is passed as the name when the request has no ``job_name``.
        """
        try:
            job_name = request.job_name if request.HasField(
                'job_name') else None
            job_ids = managed_job_state.get_all_job_ids_by_name(job_name)
            return managed_jobsv1_pb2.GetAllJobIdsByNameResponse(
                job_ids=job_ids)
        except Exception as e:  # pylint: disable=broad-except
            context.abort(grpc.StatusCode.INTERNAL, str(e))

    def CancelJobs(  # type: ignore[return]
        self, request: managed_jobsv1_pb2.CancelJobsRequest,
        context: grpc.ServicerContext
    ) -> managed_jobsv1_pb2.CancelJobsResponse:
        """Cancels managed jobs selected by the request's oneof criteria.

        Aborts with ``INVALID_ARGUMENT`` when no criteria is set, when the
        criteria is unknown, or when ``all_users`` is False and no
        ``user_hash`` is given.
        """
        # Validation failures are recorded here and reported after the try
        # block: abort() raises, so calling it inside the try would let the
        # broad handler replace INVALID_ARGUMENT with INTERNAL and empty
        # details.
        invalid_argument: Optional[str] = None
        message = ''
        try:
            cancellation_criteria = request.WhichOneof('cancellation_criteria')
            if cancellation_criteria is None:
                invalid_argument = (
                    'exactly one cancellation criteria must be specified.')
            elif cancellation_criteria == 'all_users':
                user_hash = request.user_hash if request.HasField(
                    'user_hash') else None
                all_users = request.all_users
                if not all_users and user_hash is None:
                    invalid_argument = (
                        'user_hash is required when all_users is False')
                else:
                    message = managed_job_utils.cancel_jobs_by_id(
                        job_ids=None,
                        all_users=all_users,
                        current_workspace=request.current_workspace,
                        user_hash=user_hash)
            elif cancellation_criteria == 'job_ids':
                job_ids = list(request.job_ids.ids)
                message = managed_job_utils.cancel_jobs_by_id(
                    job_ids=job_ids,
                    current_workspace=request.current_workspace)
            elif cancellation_criteria == 'job_name':
                message = managed_job_utils.cancel_job_by_name(
                    job_name=request.job_name,
                    current_workspace=request.current_workspace)
            elif cancellation_criteria == 'pool_name':
                message = managed_job_utils.cancel_jobs_by_pool(
                    pool_name=request.pool_name,
                    current_workspace=request.current_workspace)
            else:
                invalid_argument = (
                    f'invalid cancellation criteria: {cancellation_criteria}')
            if invalid_argument is None:
                return managed_jobsv1_pb2.CancelJobsResponse(message=message)
        except Exception as e:  # pylint: disable=broad-except
            context.abort(grpc.StatusCode.INTERNAL, str(e))
        # Reached only when request validation failed.
        context.abort(grpc.StatusCode.INVALID_ARGUMENT, invalid_argument)

    def StreamLogs(
            self,
            request: managed_jobsv1_pb2.
        StreamLogsRequest,  # type: ignore[return]
            context: grpc.ServicerContext):
        """Not implemented yet; always aborts with ``UNIMPLEMENTED``."""
        # TODO(kevin): implement this
        context.abort(grpc.StatusCode.UNIMPLEMENTED,
                      'StreamLogs is not implemented')
sky/skylet/skylet.py CHANGED
@@ -1,11 +1,21 @@
1
1
  """skylet: a daemon running on the head node of a cluster."""
2
2
 
3
+ import argparse
4
+ import concurrent.futures
5
+ import os
3
6
  import time
4
7
 
8
+ import grpc
9
+
5
10
  import sky
6
11
  from sky import sky_logging
12
+ from sky.schemas.generated import autostopv1_pb2_grpc
13
+ from sky.schemas.generated import jobsv1_pb2_grpc
14
+ from sky.schemas.generated import managed_jobsv1_pb2_grpc
15
+ from sky.schemas.generated import servev1_pb2_grpc
7
16
  from sky.skylet import constants
8
17
  from sky.skylet import events
18
+ from sky.skylet import services
9
19
 
10
20
  # Use the explicit logger name so that the logger is under the
11
21
  # `sky.skylet.skylet` namespace when executed directly, so as
@@ -24,12 +34,70 @@ EVENTS = [
24
34
  # This is for monitoring controller job status. If it becomes
25
35
  # unhealthy, this event will correctly update the controller
26
36
  # status to CONTROLLER_FAILED.
27
- events.ServiceUpdateEvent(),
37
+ events.ServiceUpdateEvent(pool=False),
38
+ # Status refresh for pool.
39
+ events.ServiceUpdateEvent(pool=True),
28
40
  # Report usage heartbeat every 10 minutes.
29
41
  events.UsageHeartbeatReportEvent(),
30
42
  ]
31
43
 
32
- while True:
33
- time.sleep(events.EVENT_CHECKING_INTERVAL_SECONDS)
44
+
45
def start_grpc_server(port: int = constants.SKYLET_GRPC_PORT) -> grpc.Server:
    """Build the skylet gRPC server, register all servicers and start it.

    Args:
        port: Port to listen on; the server binds to 127.0.0.1 only.

    Returns:
        The started server.
    """
    # This is the default value in Python 3.8 - 3.12,
    # putting it here for visibility.
    # TODO(kevin): Determine the optimal max number of threads.
    cpu_count = os.cpu_count() or 1
    thread_pool = concurrent.futures.ThreadPoolExecutor(
        max_workers=min(32, cpu_count + 4))
    server = grpc.server(thread_pool)

    # (registration function, servicer class) pairs, in registration order.
    registrations = (
        (autostopv1_pb2_grpc.add_AutostopServiceServicer_to_server,
         services.AutostopServiceImpl),
        (jobsv1_pb2_grpc.add_JobsServiceServicer_to_server,
         services.JobsServiceImpl),
        (servev1_pb2_grpc.add_ServeServiceServicer_to_server,
         services.ServeServiceImpl),
        (managed_jobsv1_pb2_grpc.add_ManagedJobsServiceServicer_to_server,
         services.ManagedJobsServiceImpl),
    )
    for register, servicer_cls in registrations:
        register(servicer_cls(), server)

    listen_addr = f'127.0.0.1:{port}'
    server.add_insecure_port(listen_addr)
    server.start()
    logger.info(f'gRPC server started on {listen_addr}')
    return server
70
+
71
+
72
def run_event_loop():
    """Start every skylet event, then run them all periodically, forever.

    Sleeps ``events.EVENT_CHECKING_INTERVAL_SECONDS`` before each round and
    never returns.
    """
    for skylet_event in EVENTS:
        skylet_event.start()

    while True:
        time.sleep(events.EVENT_CHECKING_INTERVAL_SECONDS)
        for skylet_event in EVENTS:
            skylet_event.run()
82
+
83
+
84
def main():
    """Parse CLI arguments, start the gRPC server and run the event loop.

    The gRPC server is stopped with a 5 second grace period when the event
    loop exits, including on ``KeyboardInterrupt``.
    """
    default_port = constants.SKYLET_GRPC_PORT
    parser = argparse.ArgumentParser(description='Start skylet daemon')
    parser.add_argument(
        '--port',
        type=int,
        default=default_port,
        help=f'gRPC port to listen on (default: {default_port})')
    cli_args = parser.parse_args()

    grpc_server = start_grpc_server(port=cli_args.port)
    try:
        run_event_loop()
    except KeyboardInterrupt:
        logger.info('Shutting down skylet...')
    finally:
        grpc_server.stop(grace=5)
100
+
101
+
102
+ if __name__ == '__main__':
103
+ main()