skypilot-nightly 1.0.0.dev20250502__py3-none-any.whl → 1.0.0.dev20251203__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (546) hide show
  1. sky/__init__.py +22 -6
  2. sky/adaptors/aws.py +81 -16
  3. sky/adaptors/common.py +25 -2
  4. sky/adaptors/coreweave.py +278 -0
  5. sky/adaptors/do.py +8 -2
  6. sky/adaptors/gcp.py +11 -0
  7. sky/adaptors/hyperbolic.py +8 -0
  8. sky/adaptors/ibm.py +5 -2
  9. sky/adaptors/kubernetes.py +149 -18
  10. sky/adaptors/nebius.py +173 -30
  11. sky/adaptors/primeintellect.py +1 -0
  12. sky/adaptors/runpod.py +68 -0
  13. sky/adaptors/seeweb.py +183 -0
  14. sky/adaptors/shadeform.py +89 -0
  15. sky/admin_policy.py +187 -4
  16. sky/authentication.py +179 -225
  17. sky/backends/__init__.py +4 -2
  18. sky/backends/backend.py +22 -9
  19. sky/backends/backend_utils.py +1323 -397
  20. sky/backends/cloud_vm_ray_backend.py +1749 -1029
  21. sky/backends/docker_utils.py +1 -1
  22. sky/backends/local_docker_backend.py +11 -6
  23. sky/backends/task_codegen.py +633 -0
  24. sky/backends/wheel_utils.py +55 -9
  25. sky/{clouds/service_catalog → catalog}/__init__.py +21 -19
  26. sky/{clouds/service_catalog → catalog}/aws_catalog.py +27 -8
  27. sky/{clouds/service_catalog → catalog}/azure_catalog.py +10 -7
  28. sky/{clouds/service_catalog → catalog}/common.py +90 -49
  29. sky/{clouds/service_catalog → catalog}/cudo_catalog.py +8 -5
  30. sky/{clouds/service_catalog → catalog}/data_fetchers/analyze.py +1 -1
  31. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_aws.py +116 -80
  32. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_cudo.py +38 -38
  33. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_gcp.py +70 -16
  34. sky/catalog/data_fetchers/fetch_hyperbolic.py +136 -0
  35. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_lambda_cloud.py +1 -0
  36. sky/catalog/data_fetchers/fetch_nebius.py +338 -0
  37. sky/catalog/data_fetchers/fetch_runpod.py +698 -0
  38. sky/catalog/data_fetchers/fetch_seeweb.py +329 -0
  39. sky/catalog/data_fetchers/fetch_shadeform.py +142 -0
  40. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_vast.py +1 -1
  41. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_vsphere.py +1 -1
  42. sky/{clouds/service_catalog → catalog}/do_catalog.py +5 -2
  43. sky/{clouds/service_catalog → catalog}/fluidstack_catalog.py +6 -3
  44. sky/{clouds/service_catalog → catalog}/gcp_catalog.py +41 -15
  45. sky/catalog/hyperbolic_catalog.py +136 -0
  46. sky/{clouds/service_catalog → catalog}/ibm_catalog.py +9 -6
  47. sky/{clouds/service_catalog → catalog}/kubernetes_catalog.py +36 -24
  48. sky/{clouds/service_catalog → catalog}/lambda_catalog.py +9 -6
  49. sky/{clouds/service_catalog → catalog}/nebius_catalog.py +9 -7
  50. sky/{clouds/service_catalog → catalog}/oci_catalog.py +9 -6
  51. sky/{clouds/service_catalog → catalog}/paperspace_catalog.py +5 -2
  52. sky/catalog/primeintellect_catalog.py +95 -0
  53. sky/{clouds/service_catalog → catalog}/runpod_catalog.py +11 -4
  54. sky/{clouds/service_catalog → catalog}/scp_catalog.py +9 -6
  55. sky/catalog/seeweb_catalog.py +184 -0
  56. sky/catalog/shadeform_catalog.py +165 -0
  57. sky/catalog/ssh_catalog.py +167 -0
  58. sky/{clouds/service_catalog → catalog}/vast_catalog.py +6 -3
  59. sky/{clouds/service_catalog → catalog}/vsphere_catalog.py +5 -2
  60. sky/check.py +533 -185
  61. sky/cli.py +5 -5975
  62. sky/client/{cli.py → cli/command.py} +2591 -1956
  63. sky/client/cli/deprecation_utils.py +99 -0
  64. sky/client/cli/flags.py +359 -0
  65. sky/client/cli/table_utils.py +322 -0
  66. sky/client/cli/utils.py +79 -0
  67. sky/client/common.py +78 -32
  68. sky/client/oauth.py +82 -0
  69. sky/client/sdk.py +1219 -319
  70. sky/client/sdk_async.py +827 -0
  71. sky/client/service_account_auth.py +47 -0
  72. sky/cloud_stores.py +82 -3
  73. sky/clouds/__init__.py +13 -0
  74. sky/clouds/aws.py +564 -164
  75. sky/clouds/azure.py +105 -83
  76. sky/clouds/cloud.py +140 -40
  77. sky/clouds/cudo.py +68 -50
  78. sky/clouds/do.py +66 -48
  79. sky/clouds/fluidstack.py +63 -44
  80. sky/clouds/gcp.py +339 -110
  81. sky/clouds/hyperbolic.py +293 -0
  82. sky/clouds/ibm.py +70 -49
  83. sky/clouds/kubernetes.py +570 -162
  84. sky/clouds/lambda_cloud.py +74 -54
  85. sky/clouds/nebius.py +210 -81
  86. sky/clouds/oci.py +88 -66
  87. sky/clouds/paperspace.py +61 -44
  88. sky/clouds/primeintellect.py +317 -0
  89. sky/clouds/runpod.py +164 -74
  90. sky/clouds/scp.py +89 -86
  91. sky/clouds/seeweb.py +477 -0
  92. sky/clouds/shadeform.py +400 -0
  93. sky/clouds/ssh.py +263 -0
  94. sky/clouds/utils/aws_utils.py +10 -4
  95. sky/clouds/utils/gcp_utils.py +87 -11
  96. sky/clouds/utils/oci_utils.py +38 -14
  97. sky/clouds/utils/scp_utils.py +231 -167
  98. sky/clouds/vast.py +99 -77
  99. sky/clouds/vsphere.py +51 -40
  100. sky/core.py +375 -173
  101. sky/dag.py +15 -0
  102. sky/dashboard/out/404.html +1 -1
  103. sky/dashboard/out/_next/static/96_E2yl3QAiIJGOYCkSpB/_buildManifest.js +1 -0
  104. sky/dashboard/out/_next/static/chunks/1141-e6aa9ab418717c59.js +11 -0
  105. sky/dashboard/out/_next/static/chunks/1272-1ef0bf0237faccdb.js +1 -0
  106. sky/dashboard/out/_next/static/chunks/1871-7e202677c42f43fe.js +6 -0
  107. sky/dashboard/out/_next/static/chunks/2260-7703229c33c5ebd5.js +1 -0
  108. sky/dashboard/out/_next/static/chunks/2350.fab69e61bac57b23.js +1 -0
  109. sky/dashboard/out/_next/static/chunks/2369.fc20f0c2c8ed9fe7.js +15 -0
  110. sky/dashboard/out/_next/static/chunks/2755.edd818326d489a1d.js +26 -0
  111. sky/dashboard/out/_next/static/chunks/3294.20a8540fe697d5ee.js +1 -0
  112. sky/dashboard/out/_next/static/chunks/3785.7e245f318f9d1121.js +1 -0
  113. sky/dashboard/out/_next/static/chunks/3800-7b45f9fbb6308557.js +1 -0
  114. sky/dashboard/out/_next/static/chunks/3850-ff4a9a69d978632b.js +1 -0
  115. sky/dashboard/out/_next/static/chunks/3937.210053269f121201.js +1 -0
  116. sky/dashboard/out/_next/static/chunks/4725.172ede95d1b21022.js +1 -0
  117. sky/dashboard/out/_next/static/chunks/4937.a2baa2df5572a276.js +15 -0
  118. sky/dashboard/out/_next/static/chunks/5739-d67458fcb1386c92.js +8 -0
  119. sky/dashboard/out/_next/static/chunks/6130-2be46d70a38f1e82.js +1 -0
  120. sky/dashboard/out/_next/static/chunks/616-3d59f75e2ccf9321.js +39 -0
  121. sky/dashboard/out/_next/static/chunks/6212-7bd06f60ba693125.js +13 -0
  122. sky/dashboard/out/_next/static/chunks/6856-8f27d1c10c98def8.js +1 -0
  123. sky/dashboard/out/_next/static/chunks/6989-01359c57e018caa4.js +1 -0
  124. sky/dashboard/out/_next/static/chunks/6990-9146207c4567fdfd.js +1 -0
  125. sky/dashboard/out/_next/static/chunks/7359-c8d04e06886000b3.js +30 -0
  126. sky/dashboard/out/_next/static/chunks/7411-b15471acd2cba716.js +41 -0
  127. sky/dashboard/out/_next/static/chunks/7615-019513abc55b3b47.js +1 -0
  128. sky/dashboard/out/_next/static/chunks/8640.5b9475a2d18c5416.js +16 -0
  129. sky/dashboard/out/_next/static/chunks/8969-452f9d5cbdd2dc73.js +1 -0
  130. sky/dashboard/out/_next/static/chunks/9025.fa408f3242e9028d.js +6 -0
  131. sky/dashboard/out/_next/static/chunks/9353-cff34f7e773b2e2b.js +1 -0
  132. sky/dashboard/out/_next/static/chunks/9360.a536cf6b1fa42355.js +31 -0
  133. sky/dashboard/out/_next/static/chunks/9847.3aaca6bb33455140.js +30 -0
  134. sky/dashboard/out/_next/static/chunks/fd9d1056-86323a29a8f7e46a.js +1 -0
  135. sky/dashboard/out/_next/static/chunks/framework-cf60a09ccd051a10.js +33 -0
  136. sky/dashboard/out/_next/static/chunks/main-app-587214043926b3cc.js +1 -0
  137. sky/dashboard/out/_next/static/chunks/main-f15ccb73239a3bf1.js +1 -0
  138. sky/dashboard/out/_next/static/chunks/pages/_app-bde01e4a2beec258.js +34 -0
  139. sky/dashboard/out/_next/static/chunks/pages/_error-c66a4e8afc46f17b.js +1 -0
  140. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-792db96d918c98c9.js +16 -0
  141. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-abfcac9c137aa543.js +1 -0
  142. sky/dashboard/out/_next/static/chunks/pages/clusters-ee39056f9851a3ff.js +1 -0
  143. sky/dashboard/out/_next/static/chunks/pages/config-dfb9bf07b13045f4.js +1 -0
  144. sky/dashboard/out/_next/static/chunks/pages/index-444f1804401f04ea.js +1 -0
  145. sky/dashboard/out/_next/static/chunks/pages/infra/[context]-c0b5935149902e6f.js +1 -0
  146. sky/dashboard/out/_next/static/chunks/pages/infra-aed0ea19df7cf961.js +1 -0
  147. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-d66997e2bfc837cf.js +16 -0
  148. sky/dashboard/out/_next/static/chunks/pages/jobs/pools/[pool]-9faf940b253e3e06.js +21 -0
  149. sky/dashboard/out/_next/static/chunks/pages/jobs-2072b48b617989c9.js +1 -0
  150. sky/dashboard/out/_next/static/chunks/pages/users-f42674164aa73423.js +1 -0
  151. sky/dashboard/out/_next/static/chunks/pages/volumes-b84b948ff357c43e.js +1 -0
  152. sky/dashboard/out/_next/static/chunks/pages/workspace/new-3f88a1c7e86a3f86.js +1 -0
  153. sky/dashboard/out/_next/static/chunks/pages/workspaces/[name]-84a40f8c7c627fe4.js +1 -0
  154. sky/dashboard/out/_next/static/chunks/pages/workspaces-531b2f8c4bf89f82.js +1 -0
  155. sky/dashboard/out/_next/static/chunks/webpack-64e05f17bf2cf8ce.js +1 -0
  156. sky/dashboard/out/_next/static/css/0748ce22df867032.css +3 -0
  157. sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
  158. sky/dashboard/out/clusters/[cluster].html +1 -1
  159. sky/dashboard/out/clusters.html +1 -1
  160. sky/dashboard/out/config.html +1 -0
  161. sky/dashboard/out/index.html +1 -1
  162. sky/dashboard/out/infra/[context].html +1 -0
  163. sky/dashboard/out/infra.html +1 -0
  164. sky/dashboard/out/jobs/[job].html +1 -1
  165. sky/dashboard/out/jobs/pools/[pool].html +1 -0
  166. sky/dashboard/out/jobs.html +1 -1
  167. sky/dashboard/out/users.html +1 -0
  168. sky/dashboard/out/volumes.html +1 -0
  169. sky/dashboard/out/workspace/new.html +1 -0
  170. sky/dashboard/out/workspaces/[name].html +1 -0
  171. sky/dashboard/out/workspaces.html +1 -0
  172. sky/data/data_utils.py +137 -1
  173. sky/data/mounting_utils.py +269 -84
  174. sky/data/storage.py +1460 -1807
  175. sky/data/storage_utils.py +43 -57
  176. sky/exceptions.py +126 -2
  177. sky/execution.py +216 -63
  178. sky/global_user_state.py +2390 -586
  179. sky/jobs/__init__.py +7 -0
  180. sky/jobs/client/sdk.py +300 -58
  181. sky/jobs/client/sdk_async.py +161 -0
  182. sky/jobs/constants.py +15 -8
  183. sky/jobs/controller.py +848 -275
  184. sky/jobs/file_content_utils.py +128 -0
  185. sky/jobs/log_gc.py +193 -0
  186. sky/jobs/recovery_strategy.py +402 -152
  187. sky/jobs/scheduler.py +314 -189
  188. sky/jobs/server/core.py +836 -255
  189. sky/jobs/server/server.py +156 -115
  190. sky/jobs/server/utils.py +136 -0
  191. sky/jobs/state.py +2109 -706
  192. sky/jobs/utils.py +1306 -215
  193. sky/logs/__init__.py +21 -0
  194. sky/logs/agent.py +108 -0
  195. sky/logs/aws.py +243 -0
  196. sky/logs/gcp.py +91 -0
  197. sky/metrics/__init__.py +0 -0
  198. sky/metrics/utils.py +453 -0
  199. sky/models.py +78 -1
  200. sky/optimizer.py +164 -70
  201. sky/provision/__init__.py +90 -4
  202. sky/provision/aws/config.py +147 -26
  203. sky/provision/aws/instance.py +136 -50
  204. sky/provision/azure/instance.py +11 -6
  205. sky/provision/common.py +13 -1
  206. sky/provision/cudo/cudo_machine_type.py +1 -1
  207. sky/provision/cudo/cudo_utils.py +14 -8
  208. sky/provision/cudo/cudo_wrapper.py +72 -71
  209. sky/provision/cudo/instance.py +10 -6
  210. sky/provision/do/instance.py +10 -6
  211. sky/provision/do/utils.py +4 -3
  212. sky/provision/docker_utils.py +140 -33
  213. sky/provision/fluidstack/instance.py +13 -8
  214. sky/provision/gcp/__init__.py +1 -0
  215. sky/provision/gcp/config.py +301 -19
  216. sky/provision/gcp/constants.py +218 -0
  217. sky/provision/gcp/instance.py +36 -8
  218. sky/provision/gcp/instance_utils.py +18 -4
  219. sky/provision/gcp/volume_utils.py +247 -0
  220. sky/provision/hyperbolic/__init__.py +12 -0
  221. sky/provision/hyperbolic/config.py +10 -0
  222. sky/provision/hyperbolic/instance.py +437 -0
  223. sky/provision/hyperbolic/utils.py +373 -0
  224. sky/provision/instance_setup.py +101 -20
  225. sky/provision/kubernetes/__init__.py +5 -0
  226. sky/provision/kubernetes/config.py +9 -52
  227. sky/provision/kubernetes/constants.py +17 -0
  228. sky/provision/kubernetes/instance.py +919 -280
  229. sky/provision/kubernetes/manifests/fusermount-server-daemonset.yaml +1 -2
  230. sky/provision/kubernetes/network.py +27 -17
  231. sky/provision/kubernetes/network_utils.py +44 -43
  232. sky/provision/kubernetes/utils.py +1221 -534
  233. sky/provision/kubernetes/volume.py +343 -0
  234. sky/provision/lambda_cloud/instance.py +22 -16
  235. sky/provision/nebius/constants.py +50 -0
  236. sky/provision/nebius/instance.py +19 -6
  237. sky/provision/nebius/utils.py +237 -137
  238. sky/provision/oci/instance.py +10 -5
  239. sky/provision/paperspace/instance.py +10 -7
  240. sky/provision/paperspace/utils.py +1 -1
  241. sky/provision/primeintellect/__init__.py +10 -0
  242. sky/provision/primeintellect/config.py +11 -0
  243. sky/provision/primeintellect/instance.py +454 -0
  244. sky/provision/primeintellect/utils.py +398 -0
  245. sky/provision/provisioner.py +117 -36
  246. sky/provision/runpod/__init__.py +5 -0
  247. sky/provision/runpod/instance.py +27 -6
  248. sky/provision/runpod/utils.py +51 -18
  249. sky/provision/runpod/volume.py +214 -0
  250. sky/provision/scp/__init__.py +15 -0
  251. sky/provision/scp/config.py +93 -0
  252. sky/provision/scp/instance.py +707 -0
  253. sky/provision/seeweb/__init__.py +11 -0
  254. sky/provision/seeweb/config.py +13 -0
  255. sky/provision/seeweb/instance.py +812 -0
  256. sky/provision/shadeform/__init__.py +11 -0
  257. sky/provision/shadeform/config.py +12 -0
  258. sky/provision/shadeform/instance.py +351 -0
  259. sky/provision/shadeform/shadeform_utils.py +83 -0
  260. sky/provision/ssh/__init__.py +18 -0
  261. sky/provision/vast/instance.py +13 -8
  262. sky/provision/vast/utils.py +10 -7
  263. sky/provision/volume.py +164 -0
  264. sky/provision/vsphere/common/ssl_helper.py +1 -1
  265. sky/provision/vsphere/common/vapiconnect.py +2 -1
  266. sky/provision/vsphere/common/vim_utils.py +4 -4
  267. sky/provision/vsphere/instance.py +15 -10
  268. sky/provision/vsphere/vsphere_utils.py +17 -20
  269. sky/py.typed +0 -0
  270. sky/resources.py +845 -119
  271. sky/schemas/__init__.py +0 -0
  272. sky/schemas/api/__init__.py +0 -0
  273. sky/schemas/api/responses.py +227 -0
  274. sky/schemas/db/README +4 -0
  275. sky/schemas/db/env.py +90 -0
  276. sky/schemas/db/global_user_state/001_initial_schema.py +124 -0
  277. sky/schemas/db/global_user_state/002_add_workspace_to_cluster_history.py +35 -0
  278. sky/schemas/db/global_user_state/003_fix_initial_revision.py +61 -0
  279. sky/schemas/db/global_user_state/004_is_managed.py +34 -0
  280. sky/schemas/db/global_user_state/005_cluster_event.py +32 -0
  281. sky/schemas/db/global_user_state/006_provision_log.py +41 -0
  282. sky/schemas/db/global_user_state/007_cluster_event_request_id.py +34 -0
  283. sky/schemas/db/global_user_state/008_skylet_ssh_tunnel_metadata.py +34 -0
  284. sky/schemas/db/global_user_state/009_last_activity_and_launched_at.py +89 -0
  285. sky/schemas/db/global_user_state/010_save_ssh_key.py +66 -0
  286. sky/schemas/db/global_user_state/011_is_ephemeral.py +34 -0
  287. sky/schemas/db/kv_cache/001_initial_schema.py +29 -0
  288. sky/schemas/db/script.py.mako +28 -0
  289. sky/schemas/db/serve_state/001_initial_schema.py +67 -0
  290. sky/schemas/db/serve_state/002_yaml_content.py +34 -0
  291. sky/schemas/db/skypilot_config/001_initial_schema.py +30 -0
  292. sky/schemas/db/spot_jobs/001_initial_schema.py +97 -0
  293. sky/schemas/db/spot_jobs/002_cluster_pool.py +42 -0
  294. sky/schemas/db/spot_jobs/003_pool_hash.py +34 -0
  295. sky/schemas/db/spot_jobs/004_job_file_contents.py +42 -0
  296. sky/schemas/db/spot_jobs/005_logs_gc.py +38 -0
  297. sky/schemas/db/spot_jobs/006_controller_pid_started_at.py +34 -0
  298. sky/schemas/db/spot_jobs/007_config_file_content.py +34 -0
  299. sky/schemas/generated/__init__.py +0 -0
  300. sky/schemas/generated/autostopv1_pb2.py +36 -0
  301. sky/schemas/generated/autostopv1_pb2.pyi +43 -0
  302. sky/schemas/generated/autostopv1_pb2_grpc.py +146 -0
  303. sky/schemas/generated/jobsv1_pb2.py +86 -0
  304. sky/schemas/generated/jobsv1_pb2.pyi +254 -0
  305. sky/schemas/generated/jobsv1_pb2_grpc.py +542 -0
  306. sky/schemas/generated/managed_jobsv1_pb2.py +76 -0
  307. sky/schemas/generated/managed_jobsv1_pb2.pyi +278 -0
  308. sky/schemas/generated/managed_jobsv1_pb2_grpc.py +278 -0
  309. sky/schemas/generated/servev1_pb2.py +58 -0
  310. sky/schemas/generated/servev1_pb2.pyi +115 -0
  311. sky/schemas/generated/servev1_pb2_grpc.py +322 -0
  312. sky/serve/autoscalers.py +357 -5
  313. sky/serve/client/impl.py +310 -0
  314. sky/serve/client/sdk.py +47 -139
  315. sky/serve/client/sdk_async.py +130 -0
  316. sky/serve/constants.py +12 -9
  317. sky/serve/controller.py +68 -17
  318. sky/serve/load_balancer.py +106 -60
  319. sky/serve/load_balancing_policies.py +116 -2
  320. sky/serve/replica_managers.py +434 -249
  321. sky/serve/serve_rpc_utils.py +179 -0
  322. sky/serve/serve_state.py +569 -257
  323. sky/serve/serve_utils.py +775 -265
  324. sky/serve/server/core.py +66 -711
  325. sky/serve/server/impl.py +1093 -0
  326. sky/serve/server/server.py +21 -18
  327. sky/serve/service.py +192 -89
  328. sky/serve/service_spec.py +144 -20
  329. sky/serve/spot_placer.py +3 -0
  330. sky/server/auth/__init__.py +0 -0
  331. sky/server/auth/authn.py +50 -0
  332. sky/server/auth/loopback.py +38 -0
  333. sky/server/auth/oauth2_proxy.py +202 -0
  334. sky/server/common.py +478 -182
  335. sky/server/config.py +85 -23
  336. sky/server/constants.py +44 -6
  337. sky/server/daemons.py +295 -0
  338. sky/server/html/token_page.html +185 -0
  339. sky/server/metrics.py +160 -0
  340. sky/server/middleware_utils.py +166 -0
  341. sky/server/requests/executor.py +558 -138
  342. sky/server/requests/payloads.py +364 -24
  343. sky/server/requests/preconditions.py +21 -17
  344. sky/server/requests/process.py +112 -29
  345. sky/server/requests/request_names.py +121 -0
  346. sky/server/requests/requests.py +822 -226
  347. sky/server/requests/serializers/decoders.py +82 -31
  348. sky/server/requests/serializers/encoders.py +140 -22
  349. sky/server/requests/threads.py +117 -0
  350. sky/server/rest.py +455 -0
  351. sky/server/server.py +1309 -285
  352. sky/server/state.py +20 -0
  353. sky/server/stream_utils.py +327 -61
  354. sky/server/uvicorn.py +217 -3
  355. sky/server/versions.py +270 -0
  356. sky/setup_files/MANIFEST.in +11 -1
  357. sky/setup_files/alembic.ini +160 -0
  358. sky/setup_files/dependencies.py +139 -31
  359. sky/setup_files/setup.py +44 -42
  360. sky/sky_logging.py +114 -7
  361. sky/skylet/attempt_skylet.py +106 -24
  362. sky/skylet/autostop_lib.py +129 -8
  363. sky/skylet/configs.py +29 -20
  364. sky/skylet/constants.py +216 -25
  365. sky/skylet/events.py +101 -21
  366. sky/skylet/job_lib.py +345 -164
  367. sky/skylet/log_lib.py +297 -18
  368. sky/skylet/log_lib.pyi +44 -1
  369. sky/skylet/providers/ibm/node_provider.py +12 -8
  370. sky/skylet/providers/ibm/vpc_provider.py +13 -12
  371. sky/skylet/ray_patches/__init__.py +17 -3
  372. sky/skylet/ray_patches/autoscaler.py.diff +18 -0
  373. sky/skylet/ray_patches/cli.py.diff +19 -0
  374. sky/skylet/ray_patches/command_runner.py.diff +17 -0
  375. sky/skylet/ray_patches/log_monitor.py.diff +20 -0
  376. sky/skylet/ray_patches/resource_demand_scheduler.py.diff +32 -0
  377. sky/skylet/ray_patches/updater.py.diff +18 -0
  378. sky/skylet/ray_patches/worker.py.diff +41 -0
  379. sky/skylet/runtime_utils.py +21 -0
  380. sky/skylet/services.py +568 -0
  381. sky/skylet/skylet.py +72 -4
  382. sky/skylet/subprocess_daemon.py +104 -29
  383. sky/skypilot_config.py +506 -99
  384. sky/ssh_node_pools/__init__.py +1 -0
  385. sky/ssh_node_pools/core.py +135 -0
  386. sky/ssh_node_pools/server.py +233 -0
  387. sky/task.py +685 -163
  388. sky/templates/aws-ray.yml.j2 +11 -3
  389. sky/templates/azure-ray.yml.j2 +2 -1
  390. sky/templates/cudo-ray.yml.j2 +1 -0
  391. sky/templates/do-ray.yml.j2 +2 -1
  392. sky/templates/fluidstack-ray.yml.j2 +1 -0
  393. sky/templates/gcp-ray.yml.j2 +62 -1
  394. sky/templates/hyperbolic-ray.yml.j2 +68 -0
  395. sky/templates/ibm-ray.yml.j2 +2 -1
  396. sky/templates/jobs-controller.yaml.j2 +27 -24
  397. sky/templates/kubernetes-loadbalancer.yml.j2 +2 -0
  398. sky/templates/kubernetes-ray.yml.j2 +611 -50
  399. sky/templates/lambda-ray.yml.j2 +2 -1
  400. sky/templates/nebius-ray.yml.j2 +34 -12
  401. sky/templates/oci-ray.yml.j2 +1 -0
  402. sky/templates/paperspace-ray.yml.j2 +2 -1
  403. sky/templates/primeintellect-ray.yml.j2 +72 -0
  404. sky/templates/runpod-ray.yml.j2 +10 -1
  405. sky/templates/scp-ray.yml.j2 +4 -50
  406. sky/templates/seeweb-ray.yml.j2 +171 -0
  407. sky/templates/shadeform-ray.yml.j2 +73 -0
  408. sky/templates/sky-serve-controller.yaml.j2 +22 -2
  409. sky/templates/vast-ray.yml.j2 +1 -0
  410. sky/templates/vsphere-ray.yml.j2 +1 -0
  411. sky/templates/websocket_proxy.py +212 -37
  412. sky/usage/usage_lib.py +31 -15
  413. sky/users/__init__.py +0 -0
  414. sky/users/model.conf +15 -0
  415. sky/users/permission.py +397 -0
  416. sky/users/rbac.py +121 -0
  417. sky/users/server.py +720 -0
  418. sky/users/token_service.py +218 -0
  419. sky/utils/accelerator_registry.py +35 -5
  420. sky/utils/admin_policy_utils.py +84 -38
  421. sky/utils/annotations.py +38 -5
  422. sky/utils/asyncio_utils.py +78 -0
  423. sky/utils/atomic.py +1 -1
  424. sky/utils/auth_utils.py +153 -0
  425. sky/utils/benchmark_utils.py +60 -0
  426. sky/utils/cli_utils/status_utils.py +159 -86
  427. sky/utils/cluster_utils.py +31 -9
  428. sky/utils/command_runner.py +354 -68
  429. sky/utils/command_runner.pyi +93 -3
  430. sky/utils/common.py +35 -8
  431. sky/utils/common_utils.py +314 -91
  432. sky/utils/config_utils.py +74 -5
  433. sky/utils/context.py +403 -0
  434. sky/utils/context_utils.py +242 -0
  435. sky/utils/controller_utils.py +383 -89
  436. sky/utils/dag_utils.py +31 -12
  437. sky/utils/db/__init__.py +0 -0
  438. sky/utils/db/db_utils.py +485 -0
  439. sky/utils/db/kv_cache.py +149 -0
  440. sky/utils/db/migration_utils.py +137 -0
  441. sky/utils/directory_utils.py +12 -0
  442. sky/utils/env_options.py +13 -0
  443. sky/utils/git.py +567 -0
  444. sky/utils/git_clone.sh +460 -0
  445. sky/utils/infra_utils.py +195 -0
  446. sky/utils/kubernetes/cleanup-tunnel.sh +62 -0
  447. sky/utils/kubernetes/config_map_utils.py +133 -0
  448. sky/utils/kubernetes/create_cluster.sh +15 -29
  449. sky/utils/kubernetes/delete_cluster.sh +10 -7
  450. sky/utils/kubernetes/deploy_ssh_node_pools.py +1177 -0
  451. sky/utils/kubernetes/exec_kubeconfig_converter.py +22 -31
  452. sky/utils/kubernetes/generate_kind_config.py +6 -66
  453. sky/utils/kubernetes/generate_kubeconfig.sh +4 -1
  454. sky/utils/kubernetes/gpu_labeler.py +18 -8
  455. sky/utils/kubernetes/k8s_gpu_labeler_job.yaml +2 -1
  456. sky/utils/kubernetes/k8s_gpu_labeler_setup.yaml +16 -16
  457. sky/utils/kubernetes/kubernetes_deploy_utils.py +284 -114
  458. sky/utils/kubernetes/rsync_helper.sh +11 -3
  459. sky/utils/kubernetes/ssh-tunnel.sh +379 -0
  460. sky/utils/kubernetes/ssh_utils.py +221 -0
  461. sky/utils/kubernetes_enums.py +8 -15
  462. sky/utils/lock_events.py +94 -0
  463. sky/utils/locks.py +416 -0
  464. sky/utils/log_utils.py +82 -107
  465. sky/utils/perf_utils.py +22 -0
  466. sky/utils/resource_checker.py +298 -0
  467. sky/utils/resources_utils.py +249 -32
  468. sky/utils/rich_utils.py +217 -39
  469. sky/utils/schemas.py +955 -160
  470. sky/utils/serialize_utils.py +16 -0
  471. sky/utils/status_lib.py +10 -0
  472. sky/utils/subprocess_utils.py +29 -15
  473. sky/utils/tempstore.py +70 -0
  474. sky/utils/thread_utils.py +91 -0
  475. sky/utils/timeline.py +26 -53
  476. sky/utils/ux_utils.py +84 -15
  477. sky/utils/validator.py +11 -1
  478. sky/utils/volume.py +165 -0
  479. sky/utils/yaml_utils.py +111 -0
  480. sky/volumes/__init__.py +13 -0
  481. sky/volumes/client/__init__.py +0 -0
  482. sky/volumes/client/sdk.py +150 -0
  483. sky/volumes/server/__init__.py +0 -0
  484. sky/volumes/server/core.py +270 -0
  485. sky/volumes/server/server.py +124 -0
  486. sky/volumes/volume.py +215 -0
  487. sky/workspaces/__init__.py +0 -0
  488. sky/workspaces/core.py +655 -0
  489. sky/workspaces/server.py +101 -0
  490. sky/workspaces/utils.py +56 -0
  491. sky_templates/README.md +3 -0
  492. sky_templates/__init__.py +3 -0
  493. sky_templates/ray/__init__.py +0 -0
  494. sky_templates/ray/start_cluster +183 -0
  495. sky_templates/ray/stop_cluster +75 -0
  496. skypilot_nightly-1.0.0.dev20251203.dist-info/METADATA +676 -0
  497. skypilot_nightly-1.0.0.dev20251203.dist-info/RECORD +611 -0
  498. {skypilot_nightly-1.0.0.dev20250502.dist-info → skypilot_nightly-1.0.0.dev20251203.dist-info}/WHEEL +1 -1
  499. skypilot_nightly-1.0.0.dev20251203.dist-info/top_level.txt +2 -0
  500. sky/benchmark/benchmark_state.py +0 -256
  501. sky/benchmark/benchmark_utils.py +0 -641
  502. sky/clouds/service_catalog/constants.py +0 -7
  503. sky/dashboard/out/_next/static/GWvVBSCS7FmUiVmjaL1a7/_buildManifest.js +0 -1
  504. sky/dashboard/out/_next/static/chunks/236-2db3ee3fba33dd9e.js +0 -6
  505. sky/dashboard/out/_next/static/chunks/312-c3c8845990db8ffc.js +0 -15
  506. sky/dashboard/out/_next/static/chunks/37-0a572fe0dbb89c4d.js +0 -6
  507. sky/dashboard/out/_next/static/chunks/678-206dddca808e6d16.js +0 -59
  508. sky/dashboard/out/_next/static/chunks/845-9e60713e0c441abc.js +0 -1
  509. sky/dashboard/out/_next/static/chunks/979-7bf73a4c7cea0f5c.js +0 -1
  510. sky/dashboard/out/_next/static/chunks/fd9d1056-2821b0f0cabcd8bd.js +0 -1
  511. sky/dashboard/out/_next/static/chunks/framework-87d061ee6ed71b28.js +0 -33
  512. sky/dashboard/out/_next/static/chunks/main-app-241eb28595532291.js +0 -1
  513. sky/dashboard/out/_next/static/chunks/main-e0e2335212e72357.js +0 -1
  514. sky/dashboard/out/_next/static/chunks/pages/_app-e6b013bc3f77ad60.js +0 -1
  515. sky/dashboard/out/_next/static/chunks/pages/_error-1be831200e60c5c0.js +0 -1
  516. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-6ac338bc2239cb45.js +0 -1
  517. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-f383db7389368ea7.js +0 -1
  518. sky/dashboard/out/_next/static/chunks/pages/clusters-a93b93e10b8b074e.js +0 -1
  519. sky/dashboard/out/_next/static/chunks/pages/index-f9f039532ca8cbc4.js +0 -1
  520. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-1c519e1afc523dc9.js +0 -1
  521. sky/dashboard/out/_next/static/chunks/pages/jobs-a75029b67aab6a2e.js +0 -1
  522. sky/dashboard/out/_next/static/chunks/webpack-830f59b8404e96b8.js +0 -1
  523. sky/dashboard/out/_next/static/css/c6933bbb2ce7f4dd.css +0 -3
  524. sky/jobs/dashboard/dashboard.py +0 -223
  525. sky/jobs/dashboard/static/favicon.ico +0 -0
  526. sky/jobs/dashboard/templates/index.html +0 -831
  527. sky/jobs/server/dashboard_utils.py +0 -69
  528. sky/skylet/providers/scp/__init__.py +0 -2
  529. sky/skylet/providers/scp/config.py +0 -149
  530. sky/skylet/providers/scp/node_provider.py +0 -578
  531. sky/templates/kubernetes-ssh-jump.yml.j2 +0 -94
  532. sky/utils/db_utils.py +0 -100
  533. sky/utils/kubernetes/deploy_remote_cluster.sh +0 -308
  534. sky/utils/kubernetes/ssh_jump_lifecycle_manager.py +0 -191
  535. skypilot_nightly-1.0.0.dev20250502.dist-info/METADATA +0 -361
  536. skypilot_nightly-1.0.0.dev20250502.dist-info/RECORD +0 -396
  537. skypilot_nightly-1.0.0.dev20250502.dist-info/top_level.txt +0 -1
  538. /sky/{clouds/service_catalog → catalog}/config.py +0 -0
  539. /sky/{benchmark → catalog/data_fetchers}/__init__.py +0 -0
  540. /sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_azure.py +0 -0
  541. /sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_fluidstack.py +0 -0
  542. /sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_ibm.py +0 -0
  543. /sky/{clouds/service_catalog/data_fetchers → client/cli}/__init__.py +0 -0
  544. /sky/dashboard/out/_next/static/{GWvVBSCS7FmUiVmjaL1a7 → 96_E2yl3QAiIJGOYCkSpB}/_ssgManifest.js +0 -0
  545. {skypilot_nightly-1.0.0.dev20250502.dist-info → skypilot_nightly-1.0.0.dev20251203.dist-info}/entry_points.txt +0 -0
  546. {skypilot_nightly-1.0.0.dev20250502.dist-info → skypilot_nightly-1.0.0.dev20251203.dist-info}/licenses/LICENSE +0 -0
@@ -0,0 +1,185 @@
1
+ <!DOCTYPE html>
2
+ <html lang="en">
3
+ <head>
4
+ <meta charset="UTF-8">
5
+ <meta name="viewport" content="width=device-width, initial-scale=1.0">
6
+ <title>SkyPilot API Server Login</title>
7
+ <style>
8
+ body {
9
+ font-family: -apple-system, BlinkMacSystemFont, "Segoe UI", Roboto, Helvetica, Arial, sans-serif;
10
+ display: flex;
11
+ flex-direction: column;
12
+ align-items: center;
13
+ justify-content: center;
14
+ min-height: 100vh;
15
+ margin: 0;
16
+ background-color: #f8f9fa;
17
+ color: #202124;
18
+ padding: 20px;
19
+ box-sizing: border-box;
20
+ }
21
+ .container {
22
+ background-color: #ffffff;
23
+ padding: 48px;
24
+ border-radius: 8px;
25
+ box-shadow: 0 1px 3px rgba(0,0,0,0.12), 0 1px 2px rgba(0,0,0,0.24);
26
+ text-align: center;
27
+ max-width: 600px;
28
+ width: 100%;
29
+ }
30
+ .logo {
31
+ width: 64px;
32
+ height: 64px;
33
+ margin-bottom: 20px;
34
+ display: inline-block;
35
+ }
36
+ .logo svg {
37
+ width: 100%;
38
+ height: 100%;
39
+ }
40
+ h1 {
41
+ font-size: 24px;
42
+ font-weight: 500;
43
+ margin-bottom: 20px;
44
+ color: #202124;
45
+ }
46
+ p {
47
+ font-size: 14px;
48
+ line-height: 1.5;
49
+ margin-bottom: 20px;
50
+ color: #5f6368;
51
+ }
52
+ .user-identifier {
53
+ font-size: 12px; /* Smaller font size */
54
+ color: #80868b; /* Lighter color */
55
+ margin-bottom: 8px; /* Adjusted margin */
56
+ }
57
+ .code-block {
58
+ background-color: #f1f3f4;
59
+ border: 1px solid #dadce0;
60
+ border-radius: 4px;
61
+ padding: 16px;
62
+ margin-top: 24px;
63
+ margin-bottom: 24px;
64
+ margin-left: auto;
65
+ margin-right: auto;
66
+ text-align: left;
67
+ word-break: break-all;
68
+ white-space: pre-wrap;
69
+ font-family: "SFMono-Regular", Consolas, "Liberation Mono", Menlo, Courier, monospace;
70
+ font-size: 13px;
71
+ line-height: 1.4;
72
+ max-width: 480px;
73
+ }
74
+ #token-box { /* Specifically for the token */
75
+ height: auto;
76
+ min-height: 6em; /* Ensure it's a reasonable size */
77
+ max-height: 15em; /* Prevent it from getting too large */
78
+ overflow-y: auto;
79
+ }
80
+ .copy-button {
81
+ background-color: #1a73e8;
82
+ color: white;
83
+ border: none;
84
+ border-radius: 4px;
85
+ padding: 10px 24px;
86
+ font-size: 14px;
87
+ font-weight: 500;
88
+ cursor: pointer;
89
+ transition: background-color 0.3s;
90
+ margin-top: 10px;
91
+ }
92
+ .copy-button:hover {
93
+ background-color: #287ae6;
94
+ }
95
+ .copy-button:active {
96
+ background-color: #1b66c9;
97
+ }
98
+ .footer-text {
99
+ font-size: 12px;
100
+ color: #5f6368;
101
+ margin-top: 30px;
102
+ }
103
+ .local-port-info {
104
+ display: none;
105
+ }
106
+ </style>
107
+ </head>
108
+ <body>
109
+ <div class="container">
110
+ <div class="logo">
111
+ <!-- SkyPilot Logo Icon -->
112
+ <svg viewBox="0 0 50 50" fill="none" xmlns="http://www.w3.org/2000/svg">
113
+ <path d="M25.1258 30.8274L19.2842 31.6783L33.8316 46.2268L31.492 37.1925L25.1258 30.8274Z" fill="#372F8A"/>
114
+ <path d="M46.9433 0.000976562L0.719727 13.1148L15.2661 27.6601L16.633 21.3925L10.3728 15.1323L40.183 6.74118C40.183 6.74118 46.102 0.855027 46.9444 0.00203721L46.9433 0.000976562Z" fill="#372F8A"/>
115
+ <path d="M40.1821 6.74021L31.4922 37.1925L33.8318 46.2257L46.9445 0C46.1022 0.85299 40.1831 6.73915 40.1831 6.73915L40.1821 6.74021Z" fill="#372F8A"/>
116
+ <path d="M21.3356 25.6089L19.2842 31.6783L25.1258 30.8275L30.3741 16.6011L30.3275 16.617L21.3356 25.6089Z" fill="#195D7F"/>
117
+ <path d="M16.632 21.3918L15.2651 27.6605L21.3357 25.6091L30.3276 16.6172L16.632 21.3918Z" fill="#39A4DD"/>
118
+ </svg>
119
+ </div>
120
+ <h1 class="no-local-port">Sign in to SkyPilot CLI</h1>
121
+ <h1 class="local-port-info">Successfully signed into SkyPilot CLI</h1>
122
+ <p class="user-identifier">USER_PLACEHOLDER</p>
123
+ <!-- display token info by default -->
124
+ <p class="no-local-port">You are seeing this page because a SkyPilot command requires authentication.</p>
125
+ <p class="no-local-port">Please copy the following token and paste it into your SkyPilot CLI prompt:</p>
126
+ <div id="token-box" class="code-block no-local-port">SKYPILOT_API_SERVER_USER_TOKEN_PLACEHOLDER</div>
127
+ <button id="copy-btn" class="copy-button no-local-port">Copy Token</button>
128
+ <p class="footer-text no-local-port">You can close this tab after copying the token.</p>
129
+
130
+ <!-- don't display local port info unless successful -->
131
+ <p class="local-port-info">You can now close this tab.</p>
132
+ </div>
133
+
134
+ <script>
135
+ const tokenBox = document.getElementById('token-box');
136
+ const copyBtn = document.getElementById('copy-btn');
137
+
138
// Highlight the entire contents of the token box so the token is ready
// to be copied (either manually or via the copy button).
function selectToken() {
    // A <pre>/<div> has no .select() method, so a Range spanning the
    // element's contents is installed as the window's only selection.
    const selection = window.getSelection();
    const tokenRange = document.createRange();
    tokenRange.selectNodeContents(tokenBox);
    selection.removeAllRanges();
    selection.addRange(tokenRange);
}
146
+
147
+ // Optional: Select the token when the page loads or when token box is clicked
148
+ tokenBox.addEventListener('click', selectToken);
149
+ window.addEventListener('load', selectToken);
150
+
151
// Copy the token to the clipboard when the button is clicked and briefly
// report the outcome on the button label.
copyBtn.addEventListener('click', () => {
    selectToken(); // execCommand('copy') copies the current selection
    try {
        // execCommand reports most failures (unsupported or disabled
        // command) by returning false instead of throwing, so the return
        // value must be checked or the UI would claim success wrongly.
        if (!document.execCommand('copy')) {
            throw new Error("document.execCommand('copy') returned false");
        }
        copyBtn.textContent = 'Copied!';
    } catch (err) {
        copyBtn.textContent = 'Error!';
        console.error('Failed to copy text: ', err);
    }
    // Restore the original label after two seconds.
    setTimeout(() => {
        copyBtn.textContent = 'Copy Token';
    }, 2000);
});
164
+
165
// Switch the page from the "copy this token" view to the "signed in" view:
// hide every element of the manual-token flow and reveal the elements that
// are hidden by the `.local-port-info { display: none; }` rule.
function hideTokenInfo() {
    for (const tokenElem of document.querySelectorAll('.no-local-port')) {
        tokenElem.style.display = 'none';
    }
    // querySelectorAll returns a static list, so dropping the class while
    // iterating does not disturb the iteration.
    for (const successElem of document.querySelectorAll('.local-port-info')) {
        successElem.classList.remove('local-port-info');
    }
}
175
+
176
// If the page was opened with ?local_port=<port>, POST the token to that
// port on localhost and, once the request succeeds, replace the manual
// copy/paste instructions with the "signed in" message.
//
// The port is read with URLSearchParams (so additional query parameters
// are not dragged into the URL) and must be a plain 1-65535 integer before
// it is interpolated into the request target.
const localPortParam =
    new URLSearchParams(window.location.search).get('local_port');
if (localPortParam !== null) {
    const portNumber = Number(localPortParam);
    const isValidPort = /^\d{1,5}$/.test(localPortParam) &&
        portNumber >= 1 && portNumber <= 65535;
    if (isValidPort) {
        // NOTE(review): the HTTP status of the response is not inspected,
        // matching the previous behavior — confirm whether the receiving
        // side can answer with a non-2xx status on failure.
        fetch(`http://localhost:${portNumber}`, {
            method: 'POST',
            body: 'SKYPILOT_API_SERVER_USER_TOKEN_PLACEHOLDER'
        }).then(() => {
            hideTokenInfo();
        }).catch((err) => {
            // Leave the manual token instructions visible as a fallback.
            console.error('Failed to send token to local port: ', err);
        });
    } else {
        // Leave the manual token instructions visible as a fallback.
        console.error('Ignoring invalid local_port value: ', localPortParam);
    }
}
183
+ </script>
184
+ </body>
185
+ </html>
sky/server/metrics.py ADDED
@@ -0,0 +1,160 @@
1
+ """Instrumentation for the API server."""
2
+
3
+ import asyncio
4
+ import multiprocessing
5
+ import os
6
+ import threading
7
+ import time
8
+ from typing import List
9
+
10
+ import fastapi
11
+ from prometheus_client import generate_latest
12
+ from prometheus_client import multiprocess
13
+ import prometheus_client as prom
14
+ import psutil
15
+ import starlette.middleware.base
16
+ import uvicorn
17
+
18
+ from sky import core
19
+ from sky import sky_logging
20
+ from sky.metrics import utils as metrics_utils
21
+
22
+ logger = sky_logging.init_logger(__name__)
23
+
24
+ metrics_app = fastapi.FastAPI()
25
+
26
+
27
# Serve /metrics in dedicated thread to avoid blocking the event loop
# of metrics server.
@metrics_app.get('/metrics')
def metrics() -> fastapi.Response:
    """Render the current Prometheus metrics in the text exposition format.

    When the PROMETHEUS_MULTIPROC_DIR environment variable is set, samples
    are aggregated across processes through a fresh registry backed by a
    MultiProcessCollector; otherwise the default registry is rendered.

    Returns:
        A response carrying the rendered metrics, marked as non-cacheable.
    """
    if not os.environ.get('PROMETHEUS_MULTIPROC_DIR'):
        # Single-process mode: the default registry already holds everything.
        payload = generate_latest()
    else:
        # Multiprocess mode: collect the samples written by all processes.
        aggregate_registry = prom.CollectorRegistry()
        multiprocess.MultiProcessCollector(aggregate_registry)
        payload = generate_latest(aggregate_registry)

    no_cache_headers = {'Cache-Control': 'no-cache'}
    return fastapi.Response(content=payload,
                            media_type=prom.CONTENT_TYPE_LATEST,
                            headers=no_cache_headers)
42
+
43
+
44
@metrics_app.get('/gpu-metrics')
async def gpu_metrics() -> fastapi.Response:
    """Gets the GPU metrics from multiple external k8s clusters.

    Metrics are fetched concurrently for every context except 'in-cluster'.
    A context whose fetch fails is logged and skipped, so the response holds
    whatever could be collected from the remaining contexts.

    Returns:
        A plain-text response with the per-context metrics joined by blank
        lines.

    Raises:
        BaseException: A non-Exception BaseException (e.g. a cancellation)
            raised by one of the fetches is re-raised unchanged.
    """
    # Filter first so that results[i] always corresponds to
    # remote_contexts[i]; indexing the unfiltered list would name the wrong
    # context in the error log whenever 'in-cluster' precedes the failure.
    remote_contexts = [
        context for context in core.get_all_contexts()
        if context != 'in-cluster'
    ]

    results = await asyncio.gather(
        *(metrics_utils.get_metrics_for_context(context)
          for context in remote_contexts),
        return_exceptions=True)

    all_metrics: List[str] = []
    for context, result in zip(remote_contexts, results):
        if isinstance(result, Exception):
            logger.error(
                f'Failed to get metrics for context {context}: {result}')
        elif isinstance(result, BaseException):
            # Avoid changing behavior for non-Exception BaseExceptions
            # like KeyboardInterrupt/SystemExit: re-raise them.
            raise result
        else:
            all_metrics.append(result)

    combined_metrics = '\n\n'.join(all_metrics)

    # Return as plain text for Prometheus compatibility
    return fastapi.Response(
        content=combined_metrics,
        media_type='text/plain; version=0.0.4; charset=utf-8')
78
+
79
+
80
def build_metrics_server(host: str, port: int) -> uvicorn.Server:
    """Create (without starting) a uvicorn server for the metrics app.

    Args:
        host: Address the server should bind to.
        port: Port the server should listen on.

    Returns:
        A uvicorn.Server configured to serve `metrics_app` with one worker.
    """
    return uvicorn.Server(
        uvicorn.Config(
            'sky.server.metrics:metrics_app',
            host=host,
            port=port,
            workers=1,
        ))
89
+
90
+
91
+ def _get_status_code_group(status_code: int) -> str:
92
+ """Group status codes into classes (2xx, 5xx) to reduce cardinality."""
93
+ return f'{status_code // 100}xx'
94
+
95
+
96
+ def _is_streaming_api(path: str) -> bool:
97
+ """Check if the path is a streaming API."""
98
+ path = path.rstrip('/')
99
+ return path.endswith('/logs') or path.endswith('/api/stream')
100
+
101
+
102
class PrometheusMiddleware(starlette.middleware.base.BaseHTTPMiddleware):
    """Middleware to collect Prometheus metrics for HTTP requests."""

    async def dispatch(self, request: fastapi.Request, call_next):
        """Count the request and, for non-streaming APIs, time it.

        Args:
            request: The incoming HTTP request.
            call_next: Callable that forwards the request down the stack and
                returns the response.

        Returns:
            The response produced by `call_next`, unmodified.

        Raises:
            Exception: Anything raised by `call_next` is re-raised after the
                request has been recorded with status '5xx'.
        """
        path = request.url.path
        logger.debug(f'PROM Middleware Request: {request}, {request.url.path}')
        streaming = _is_streaming_api(path)
        method = request.method
        # Stays empty if call_next is interrupted by something that is not an
        # Exception (e.g. a cancellation), since no status is known then.
        status_code_group = ''
        # Use a monotonic high-resolution clock: wall-clock time can jump
        # (NTP, manual changes) and would yield bogus or negative durations.
        start_time = time.perf_counter()

        try:
            response = await call_next(request)
            status_code_group = _get_status_code_group(response.status_code)
        except Exception:  # pylint: disable=broad-except
            status_code_group = '5xx'
            raise
        finally:
            metrics_utils.SKY_APISERVER_REQUESTS_TOTAL.labels(
                path=path, method=method, status=status_code_group).inc()
            if not streaming:
                # Exclude streaming APIs, the duration is not meaningful.
                # TODO(aylei): measure the duration of async execution
                # instead.
                duration = time.perf_counter() - start_time
                metrics_utils.SKY_APISERVER_REQUEST_DURATION_SECONDS.labels(
                    path=path, method=method,
                    status=status_code_group).observe(duration)

        return response
132
+
133
+
134
+ peak_rss_bytes = 0
135
+
136
+
137
def process_monitor(process_type: str, stop: threading.Event) -> None:
    """Export this process's memory and CPU usage until `stop` is set.

    Roughly once per second the current process is sampled and:
      * the highest RSS seen in the current 30-second bucket is published
        (and mirrored into the module-level `peak_rss_bytes`);
      * the cumulative user and system CPU times are published.

    Args:
        process_type: Value used for the `type` label of the metrics.
        stop: Event that ends the monitoring loop once set.
    """
    global peak_rss_bytes
    pid = multiprocessing.current_process().pid
    proc = psutil.Process(pid)
    # A monotonic clock keeps bucket boundaries immune to wall-clock jumps.
    bucket_start = time.monotonic()
    bucket_peak = 0
    while not stop.is_set():
        now = time.monotonic()
        if now - bucket_start >= 30:
            # Reset peak RSS for the next time bucket.
            bucket_start = now
            bucket_peak = 0
        # Fold the sample into the running peak; without carrying the peak
        # forward, the published value would just be the latest sample.
        bucket_peak = max(bucket_peak, proc.memory_info().rss)
        peak_rss_bytes = bucket_peak
        metrics_utils.SKY_APISERVER_PROCESS_PEAK_RSS.labels(
            pid=pid, type=process_type).set(peak_rss_bytes)

        ctimes = proc.cpu_times()
        metrics_utils.SKY_APISERVER_PROCESS_CPU_TOTAL.labels(
            pid=pid, type=process_type, mode='user').set(ctimes.user)
        metrics_utils.SKY_APISERVER_PROCESS_CPU_TOTAL.labels(
            pid=pid, type=process_type, mode='system').set(ctimes.system)

        # Wait on the event instead of sleeping so that a stop request ends
        # the loop immediately rather than after the rest of the interval.
        stop.wait(1)
@@ -0,0 +1,166 @@
1
+ """Utilities for building middlewares."""
2
+ import enum
3
+ import http
4
+ from typing import Type
5
+
6
+ import fastapi
7
+ import starlette.middleware.base
8
+ import starlette.types
9
+
10
+ from sky import sky_logging
11
+
12
+ logger = sky_logging.init_logger(__name__)
13
+
14
+
15
class WebSocketDecision(enum.Enum):
    """Outcome of running an HTTP middleware against a WebSocket handshake."""

    # The middleware let the request through; the connection may proceed.
    ACCEPT = 'accept'
    # The middleware answered 401; the connection is refused.
    UNAUTHORIZED = 'unauthorized'
    # The middleware answered 403; the connection is refused.
    FORBIDDEN = 'forbidden'
    # Any other rejection, or the middleware raised.
    ERROR = 'error'
20
+
21
+
22
def websocket_aware(
        middleware_cls: Type[starlette.middleware.base.BaseHTTPMiddleware]):
    """Decorator to adapt BaseHTTPMiddleware to handle WebSockets.

    For a websocket scope, an HTTP-style request resembling the upgrade
    request of the websocket handshake is assembled and run through the
    wrapped middleware's dispatch(). The connection is handed to the app
    only if the middleware called call_next() and returned a 2xx/3xx
    status; otherwise it is closed. Every other scope type is delegated to
    the wrapped middleware untouched.

    Note: for websocket connection, the mutation made by the underlying HTTP
    middleware on the request and response will be discarded. The only
    exception is the scope's 'state' dict, which is shared between the
    synthetic HTTP scope and the websocket scope.

    Args:
        middleware_cls: The BaseHTTPMiddleware subclass to wrap.

    Returns:
        An ASGI middleware class carrying the name, qualname, module and
        docstring of `middleware_cls`.
    """

    class WebSocketAwareMiddleware:
        """WebSocket-aware middleware wrapper."""

        def __init__(self, app: starlette.types.ASGIApp, *args, **kwargs):
            self.app = app
            self.middleware = middleware_cls(app, *args, **kwargs)

        async def __call__(self, scope: starlette.types.Scope,
                           receive: starlette.types.Receive,
                           send: starlette.types.Send):
            if scope.get('type') != 'websocket':
                # Delegate other scopes to the underlying HTTP middleware.
                await self.middleware(scope, receive, send)
                return
            await self._handle_websocket(scope, receive, send)

        async def dispatch(
                self, request: fastapi.Request,
                call_next: starlette.middleware.base.RequestResponseEndpoint):
            """Implement dispatch method to keep compatibility."""
            return await self.middleware.dispatch(request, call_next)

        async def _handle_websocket(self, scope: starlette.types.Scope,
                                    receive: starlette.types.Receive,
                                    send: starlette.types.Send):
            """Admit the websocket or close it, per the HTTP middleware."""
            decision = await self._run_websocket_dispatch(scope)
            if decision == WebSocketDecision.ACCEPT:
                await self.app(scope, receive, send)
                return

            # Close code and reason for each kind of rejection; anything not
            # listed is reported as an internal error.
            close_frames = {
                WebSocketDecision.UNAUTHORIZED: (4401, 'Unauthorized'),
                WebSocketDecision.FORBIDDEN: (4403, 'Forbidden'),
            }
            code, reason = close_frames.get(decision,
                                            (1011, 'Internal Server Error'))
            await send({
                'type': 'websocket.close',
                'code': code,
                'reason': reason,
            })

        async def _run_websocket_dispatch(
                self, scope: starlette.types.Scope) -> WebSocketDecision:
            """Run dispatch() on a synthetic request and classify the result."""
            request = fastapi.Request(self._build_http_scope(scope),
                                      receive=self._http_receive_adapter())
            passthrough = fastapi.Response(status_code=http.HTTPStatus.OK)
            forwarded = False

            async def call_next(req):
                # Capture whether call_next() is called in the underlying
                # HTTP middleware to determine if we can proceed with current
                # websocket connection.
                nonlocal forwarded
                del req
                forwarded = True
                return passthrough

            try:
                response = await self.dispatch(request, call_next)
            except Exception as e:  # pylint: disable=broad-except
                logger.error('Exception occurred in middleware dispatch for '
                             f'WebSocket scope: {e}')
                return WebSocketDecision.ERROR

            # A middleware that returns nothing is treated as returning the
            # stub 200 response.
            effective = passthrough if response is None else response
            status_code = effective.status_code

            if forwarded and 200 <= status_code < 400:
                return WebSocketDecision.ACCEPT
            if status_code == http.HTTPStatus.UNAUTHORIZED:
                return WebSocketDecision.UNAUTHORIZED
            if status_code == http.HTTPStatus.FORBIDDEN:
                return WebSocketDecision.FORBIDDEN
            return WebSocketDecision.ERROR

        @staticmethod
        def _build_http_scope(
                scope: starlette.types.Scope) -> starlette.types.Scope:
            """Derive an HTTP GET scope from a websocket scope."""
            # Create 'state' on the websocket scope if missing, so both
            # scopes refer to the same dict.
            shared_state = scope.setdefault('state', {})
            ws_scheme = scope.get('scheme', 'ws')
            # ws -> http, wss -> https; unknown schemes pass through as-is.
            http_scheme = {'ws': 'http', 'wss': 'https'}.get(ws_scheme,
                                                             ws_scheme)
            return {
                **scope,
                'type': 'http',
                'scheme': http_scheme,
                'method': 'GET',
                'http_version': scope.get('http_version', '1.1'),
                'state': shared_state,
            }

        @staticmethod
        def _http_receive_adapter() -> starlette.types.Receive:
            """Adapter that mimics the sequence produced by Starlette for an
            HTTP request: a single http.request event followed by
            http.disconnect events.
            """
            pending = iter([{
                'type': 'http.request',
                'body': b'',
                'more_body': False,
            }])

            async def receive():
                # First call yields the empty request body; every later call
                # reports a disconnect.
                return next(pending, {
                    'type': 'http.disconnect',
                })

            return receive

    # Make the wrapper present itself as the wrapped middleware class.
    for attr in ('__name__', '__qualname__', '__module__', '__doc__'):
        setattr(WebSocketAwareMiddleware, attr, getattr(middleware_cls, attr))
    return WebSocketAwareMiddleware