crfm-helm 0.5.1__py3-none-any.whl → 0.5.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of crfm-helm might be problematic. Click here for more details.

Files changed (236) hide show
  1. {crfm_helm-0.5.1.dist-info → crfm_helm-0.5.3.dist-info}/METADATA +41 -57
  2. {crfm_helm-0.5.1.dist-info → crfm_helm-0.5.3.dist-info}/RECORD +197 -152
  3. {crfm_helm-0.5.1.dist-info → crfm_helm-0.5.3.dist-info}/WHEEL +1 -1
  4. helm/benchmark/adaptation/adapter_spec.py +32 -31
  5. helm/benchmark/adaptation/adapters/multiple_choice_joint_adapter.py +12 -5
  6. helm/benchmark/adaptation/adapters/test_generation_adapter.py +12 -12
  7. helm/benchmark/adaptation/adapters/test_language_modeling_adapter.py +8 -8
  8. helm/benchmark/adaptation/adapters/test_multiple_choice_joint_adapter.py +77 -9
  9. helm/benchmark/adaptation/common_adapter_specs.py +2 -0
  10. helm/benchmark/annotation/air_bench_annotator.py +64 -0
  11. helm/benchmark/annotation/annotator_factory.py +6 -0
  12. helm/benchmark/annotation/anthropic_red_team_annotator.py +70 -0
  13. helm/benchmark/annotation/call_center_annotator.py +247 -0
  14. helm/benchmark/annotation/financebench_annotator.py +79 -0
  15. helm/benchmark/annotation/harm_bench_annotator.py +68 -0
  16. helm/benchmark/annotation/{image2structure → image2struct}/latex_compiler_annotator.py +2 -2
  17. helm/benchmark/annotation/{image2structure → image2struct}/lilypond_compiler_annotator.py +5 -3
  18. helm/benchmark/annotation/{image2structure → image2struct}/webpage_compiler_annotator.py +5 -5
  19. helm/benchmark/annotation/live_qa_annotator.py +71 -0
  20. helm/benchmark/annotation/medication_qa_annotator.py +68 -0
  21. helm/benchmark/annotation/model_as_judge.py +45 -0
  22. helm/benchmark/annotation/simple_safety_tests_annotator.py +64 -0
  23. helm/benchmark/annotation/xstest_annotator.py +110 -0
  24. helm/benchmark/augmentations/translate_perturbation.py +1 -0
  25. helm/benchmark/huggingface_registration.py +16 -6
  26. helm/benchmark/metrics/air_bench_metrics.py +56 -0
  27. helm/benchmark/metrics/annotation_metrics.py +108 -0
  28. helm/benchmark/metrics/bhasa_metrics.py +188 -0
  29. helm/benchmark/metrics/bhasa_metrics_specs.py +10 -0
  30. helm/benchmark/metrics/code_metrics_helper.py +11 -1
  31. helm/benchmark/metrics/fin_qa_metrics.py +60 -0
  32. helm/benchmark/metrics/fin_qa_metrics_helper.py +398 -0
  33. helm/benchmark/metrics/gpt4v_originality_critique_metrics.py +126 -0
  34. helm/benchmark/metrics/instruction_following_critique_metrics.py +1 -0
  35. helm/benchmark/metrics/live_qa_metrics.py +23 -0
  36. helm/benchmark/metrics/medication_qa_metrics.py +23 -0
  37. helm/benchmark/metrics/prometheus_vision_critique_metrics.py +185 -0
  38. helm/benchmark/metrics/reka_vibe_critique_metrics.py +158 -0
  39. helm/benchmark/metrics/safety_metrics.py +57 -0
  40. helm/benchmark/metrics/summac/model_summac.py +3 -3
  41. helm/benchmark/metrics/tokens/test_ai21_token_cost_estimator.py +2 -2
  42. helm/benchmark/metrics/tokens/test_openai_token_cost_estimator.py +4 -4
  43. helm/benchmark/metrics/unitxt_metrics.py +20 -10
  44. helm/benchmark/metrics/vision_language/emd_utils.py +4 -0
  45. helm/benchmark/metrics/vision_language/image_metrics.py +30 -72
  46. helm/benchmark/metrics/vision_language/image_utils.py +1 -1
  47. helm/benchmark/model_metadata_registry.py +3 -3
  48. helm/benchmark/presentation/schema.py +54 -4
  49. helm/benchmark/presentation/test_run_entry.py +1 -0
  50. helm/benchmark/presentation/test_schema.py +11 -0
  51. helm/benchmark/run.py +31 -2
  52. helm/benchmark/run_expander.py +113 -10
  53. helm/benchmark/run_spec_factory.py +4 -0
  54. helm/benchmark/run_specs/air_bench_run_specs.py +40 -0
  55. helm/benchmark/run_specs/bhasa_run_specs.py +638 -0
  56. helm/benchmark/run_specs/call_center_run_specs.py +152 -0
  57. helm/benchmark/run_specs/classic_run_specs.py +15 -11
  58. helm/benchmark/run_specs/decodingtrust_run_specs.py +11 -9
  59. helm/benchmark/run_specs/experimental_run_specs.py +85 -0
  60. helm/benchmark/run_specs/finance_run_specs.py +110 -0
  61. helm/benchmark/run_specs/safety_run_specs.py +154 -0
  62. helm/benchmark/run_specs/vlm_run_specs.py +251 -57
  63. helm/benchmark/scenarios/air_bench_scenario.py +50 -0
  64. helm/benchmark/scenarios/anthropic_red_team_scenario.py +71 -0
  65. helm/benchmark/scenarios/banking77_scenario.py +51 -0
  66. helm/benchmark/scenarios/bhasa_scenario.py +1798 -0
  67. helm/benchmark/scenarios/call_center_scenario.py +84 -0
  68. helm/benchmark/scenarios/ci_mcqa_scenario.py +80 -0
  69. helm/benchmark/scenarios/decodingtrust_stereotype_bias_scenario.py +2 -1
  70. helm/benchmark/scenarios/entity_data_imputation_scenario.py +8 -2
  71. helm/benchmark/scenarios/ewok_scenario.py +116 -0
  72. helm/benchmark/scenarios/fin_qa_scenario.py +119 -0
  73. helm/benchmark/scenarios/financebench_scenario.py +53 -0
  74. helm/benchmark/scenarios/harm_bench_scenario.py +59 -0
  75. helm/benchmark/scenarios/scenario.py +1 -1
  76. helm/benchmark/scenarios/simple_safety_tests_scenario.py +33 -0
  77. helm/benchmark/scenarios/test_air_bench_scenario.py +27 -0
  78. helm/benchmark/scenarios/test_commonsense_scenario.py +21 -0
  79. helm/benchmark/scenarios/test_ewok_scenario.py +25 -0
  80. helm/benchmark/scenarios/test_financebench_scenario.py +26 -0
  81. helm/benchmark/scenarios/test_gsm_scenario.py +31 -0
  82. helm/benchmark/scenarios/test_legalbench_scenario.py +30 -0
  83. helm/benchmark/scenarios/test_math_scenario.py +2 -8
  84. helm/benchmark/scenarios/test_med_qa_scenario.py +30 -0
  85. helm/benchmark/scenarios/test_mmlu_scenario.py +33 -0
  86. helm/benchmark/scenarios/test_narrativeqa_scenario.py +73 -0
  87. helm/benchmark/scenarios/thai_exam_scenario.py +4 -4
  88. helm/benchmark/scenarios/vision_language/a_okvqa_scenario.py +1 -1
  89. helm/benchmark/scenarios/vision_language/bingo_scenario.py +5 -5
  90. helm/benchmark/scenarios/vision_language/crossmodal_3600_scenario.py +2 -1
  91. helm/benchmark/scenarios/vision_language/exams_v_scenario.py +104 -0
  92. helm/benchmark/scenarios/vision_language/fair_face_scenario.py +136 -0
  93. helm/benchmark/scenarios/vision_language/flickr30k_scenario.py +1 -1
  94. helm/benchmark/scenarios/vision_language/gqa_scenario.py +2 -2
  95. helm/benchmark/scenarios/vision_language/hateful_memes_scenario.py +1 -1
  96. helm/benchmark/scenarios/vision_language/{image2structure → image2struct}/chart2csv_scenario.py +1 -1
  97. helm/benchmark/scenarios/vision_language/{image2structure/image2structure_scenario.py → image2struct/image2struct_scenario.py} +13 -2
  98. helm/benchmark/scenarios/vision_language/{image2structure → image2struct}/latex_scenario.py +3 -7
  99. helm/benchmark/scenarios/vision_language/{image2structure → image2struct}/musicsheet_scenario.py +1 -5
  100. helm/benchmark/scenarios/vision_language/{image2structure → image2struct}/utils_latex.py +31 -39
  101. helm/benchmark/scenarios/vision_language/{image2structure → image2struct}/webpage/driver.py +1 -1
  102. helm/benchmark/scenarios/vision_language/{image2structure → image2struct}/webpage/utils.py +1 -1
  103. helm/benchmark/scenarios/vision_language/{image2structure → image2struct}/webpage_scenario.py +44 -13
  104. helm/benchmark/scenarios/vision_language/math_vista_scenario.py +1 -1
  105. helm/benchmark/scenarios/vision_language/mementos_scenario.py +3 -3
  106. helm/benchmark/scenarios/vision_language/mm_safety_bench_scenario.py +2 -2
  107. helm/benchmark/scenarios/vision_language/mme_scenario.py +21 -18
  108. helm/benchmark/scenarios/vision_language/mmmu_scenario.py +1 -1
  109. helm/benchmark/scenarios/vision_language/pairs_scenario.py +7 -6
  110. helm/benchmark/scenarios/vision_language/pope_scenario.py +2 -1
  111. helm/benchmark/scenarios/vision_language/real_world_qa_scenario.py +57 -0
  112. helm/benchmark/scenarios/vision_language/seed_bench_scenario.py +7 -5
  113. helm/benchmark/scenarios/vision_language/unicorn_scenario.py +5 -5
  114. helm/benchmark/scenarios/vision_language/vibe_eval_scenario.py +98 -0
  115. helm/benchmark/scenarios/vision_language/viz_wiz_scenario.py +1 -1
  116. helm/benchmark/scenarios/vision_language/vqa_scenario.py +3 -1
  117. helm/benchmark/scenarios/xstest_scenario.py +35 -0
  118. helm/benchmark/server.py +1 -6
  119. helm/benchmark/static/schema_air_bench.yaml +3149 -0
  120. helm/benchmark/static/schema_bhasa.yaml +709 -0
  121. helm/benchmark/static/schema_call_center.yaml +232 -0
  122. helm/benchmark/static/schema_classic.yaml +3 -59
  123. helm/benchmark/static/schema_cleva.yaml +768 -0
  124. helm/benchmark/static/schema_decodingtrust.yaml +444 -0
  125. helm/benchmark/static/schema_ewok.yaml +367 -0
  126. helm/benchmark/static/schema_finance.yaml +189 -0
  127. helm/benchmark/static/schema_image2struct.yaml +588 -0
  128. helm/benchmark/static/schema_instruction_following.yaml +3 -52
  129. helm/benchmark/static/schema_lite.yaml +3 -61
  130. helm/benchmark/static/schema_medical.yaml +255 -0
  131. helm/benchmark/static/schema_mmlu.yaml +3 -61
  132. helm/benchmark/static/schema_safety.yaml +247 -0
  133. helm/benchmark/static/schema_tables.yaml +317 -0
  134. helm/benchmark/static/schema_thai.yaml +244 -0
  135. helm/benchmark/static/schema_unitxt.yaml +3 -61
  136. helm/benchmark/static/{schema_vlm.yaml → schema_vhelm.yaml} +304 -298
  137. helm/benchmark/static/schema_vhelm_lite.yaml +4 -59
  138. helm/benchmark/static_build/assets/accenture-6f97eeda.png +0 -0
  139. helm/benchmark/static_build/assets/air-overview-d2e6c49f.png +0 -0
  140. helm/benchmark/static_build/assets/aisingapore-6dfc9acf.png +0 -0
  141. helm/benchmark/static_build/assets/cresta-9e22b983.png +0 -0
  142. helm/benchmark/static_build/assets/cuhk-8c5631e9.png +0 -0
  143. helm/benchmark/static_build/assets/index-05c76bb1.css +1 -0
  144. helm/benchmark/static_build/assets/index-58f97dcd.js +10 -0
  145. helm/benchmark/static_build/assets/overview-74aea3d8.png +0 -0
  146. helm/benchmark/static_build/assets/process-flow-bd2eba96.png +0 -0
  147. helm/benchmark/static_build/assets/scb10x-204bd786.png +0 -0
  148. helm/benchmark/static_build/assets/wellsfargo-a86a6c4a.png +0 -0
  149. helm/benchmark/static_build/index.html +2 -2
  150. helm/benchmark/window_services/test_openai_window_service.py +8 -8
  151. helm/clients/ai21_client.py +71 -1
  152. helm/clients/anthropic_client.py +50 -28
  153. helm/clients/auto_client.py +11 -0
  154. helm/clients/client.py +24 -7
  155. helm/clients/cohere_client.py +98 -3
  156. helm/clients/huggingface_client.py +79 -19
  157. helm/clients/nvidia_nim_client.py +35 -0
  158. helm/clients/openai_client.py +11 -5
  159. helm/clients/palmyra_client.py +25 -0
  160. helm/clients/perspective_api_client.py +11 -6
  161. helm/clients/reka_client.py +189 -0
  162. helm/clients/test_client.py +7 -9
  163. helm/clients/test_huggingface_client.py +19 -3
  164. helm/clients/test_together_client.py +72 -2
  165. helm/clients/together_client.py +129 -23
  166. helm/clients/vertexai_client.py +62 -18
  167. helm/clients/vision_language/huggingface_vlm_client.py +1 -0
  168. helm/clients/vision_language/open_flamingo_client.py +1 -2
  169. helm/clients/vision_language/paligemma_client.py +146 -0
  170. helm/clients/vision_language/palmyra_vision_client.py +99 -0
  171. helm/clients/yi_client.py +31 -0
  172. helm/common/critique_request.py +10 -1
  173. helm/common/images_utils.py +25 -0
  174. helm/common/mongo_key_value_store.py +2 -1
  175. helm/common/request.py +16 -0
  176. helm/config/model_deployments.yaml +740 -363
  177. helm/config/model_metadata.yaml +824 -128
  178. helm/config/tokenizer_configs.yaml +207 -10
  179. helm/proxy/critique/model_critique_client.py +32 -4
  180. helm/proxy/example_queries.py +14 -21
  181. helm/proxy/services/server_service.py +2 -3
  182. helm/proxy/token_counters/test_auto_token_counter.py +2 -2
  183. helm/tokenizers/ai21_tokenizer.py +51 -59
  184. helm/tokenizers/auto_tokenizer.py +1 -1
  185. helm/tokenizers/cohere_tokenizer.py +29 -62
  186. helm/tokenizers/huggingface_tokenizer.py +35 -13
  187. helm/tokenizers/test_ai21_tokenizer.py +48 -0
  188. helm/tokenizers/test_cohere_tokenizer.py +39 -0
  189. helm/tokenizers/test_huggingface_tokenizer.py +5 -1
  190. helm/benchmark/static/benchmarking.css +0 -156
  191. helm/benchmark/static/benchmarking.js +0 -1705
  192. helm/benchmark/static/config.js +0 -3
  193. helm/benchmark/static/general.js +0 -122
  194. helm/benchmark/static/images/crfm-logo.png +0 -0
  195. helm/benchmark/static/images/helm-logo-simple.png +0 -0
  196. helm/benchmark/static/images/helm-logo.png +0 -0
  197. helm/benchmark/static/images/language-model-helm.png +0 -0
  198. helm/benchmark/static/images/organizations/ai21.png +0 -0
  199. helm/benchmark/static/images/organizations/anthropic.png +0 -0
  200. helm/benchmark/static/images/organizations/bigscience.png +0 -0
  201. helm/benchmark/static/images/organizations/cohere.png +0 -0
  202. helm/benchmark/static/images/organizations/eleutherai.png +0 -0
  203. helm/benchmark/static/images/organizations/google.png +0 -0
  204. helm/benchmark/static/images/organizations/meta.png +0 -0
  205. helm/benchmark/static/images/organizations/microsoft.png +0 -0
  206. helm/benchmark/static/images/organizations/nvidia.png +0 -0
  207. helm/benchmark/static/images/organizations/openai.png +0 -0
  208. helm/benchmark/static/images/organizations/together.png +0 -0
  209. helm/benchmark/static/images/organizations/tsinghua-keg.png +0 -0
  210. helm/benchmark/static/images/organizations/yandex.png +0 -0
  211. helm/benchmark/static/images/scenarios-by-metrics.png +0 -0
  212. helm/benchmark/static/images/taxonomy-scenarios.png +0 -0
  213. helm/benchmark/static/index.html +0 -68
  214. helm/benchmark/static/info-icon.png +0 -0
  215. helm/benchmark/static/json-urls.js +0 -69
  216. helm/benchmark/static/plot-captions.js +0 -27
  217. helm/benchmark/static/schema_image2structure.yaml +0 -304
  218. helm/benchmark/static/utils.js +0 -285
  219. helm/benchmark/static_build/assets/index-737eef9e.js +0 -10
  220. helm/benchmark/static_build/assets/index-878a1094.css +0 -1
  221. helm/benchmark/window_services/ai21_window_service.py +0 -247
  222. helm/benchmark/window_services/cohere_window_service.py +0 -101
  223. helm/benchmark/window_services/test_ai21_window_service.py +0 -163
  224. helm/benchmark/window_services/test_cohere_window_service.py +0 -75
  225. helm/benchmark/window_services/test_cohere_window_service_utils.py +0 -8328
  226. helm/benchmark/window_services/test_ice_window_service.py +0 -327
  227. helm/tokenizers/ice_tokenizer.py +0 -30
  228. helm/tokenizers/test_ice_tokenizer.py +0 -57
  229. {crfm_helm-0.5.1.dist-info → crfm_helm-0.5.3.dist-info}/LICENSE +0 -0
  230. {crfm_helm-0.5.1.dist-info → crfm_helm-0.5.3.dist-info}/entry_points.txt +0 -0
  231. {crfm_helm-0.5.1.dist-info → crfm_helm-0.5.3.dist-info}/top_level.txt +0 -0
  232. /helm/benchmark/annotation/{image2structure → image2struct}/__init__.py +0 -0
  233. /helm/benchmark/annotation/{image2structure → image2struct}/image_compiler_annotator.py +0 -0
  234. /helm/benchmark/scenarios/vision_language/{image2structure → image2struct}/__init__.py +0 -0
  235. /helm/benchmark/scenarios/vision_language/{image2structure → image2struct}/webpage/__init__.py +0 -0
  236. /helm/benchmark/scenarios/vision_language/{image2structure → image2struct}/webpage/jekyll_server.py +0 -0
@@ -0,0 +1,10 @@
1
+ import{r as l,a as Es,L as f,O as ys,d as Ms,u as ke,f as Ce,H as Rs,h as Is,i as D,R as Ss}from"./react-d4a0b69b.js";import{g as Y,b as X,m as ie,s as Te,a as Ls,d as ye,y as ks,c as Me,e as de,l as me}from"./tremor-54a99cc4.js";import"./recharts-6d337683.js";(function(){const t=document.createElement("link").relList;if(t&&t.supports&&t.supports("modulepreload"))return;for(const a of document.querySelectorAll('link[rel="modulepreload"]'))r(a);new MutationObserver(a=>{for(const i of a)if(i.type==="childList")for(const c of i.addedNodes)c.tagName==="LINK"&&c.rel==="modulepreload"&&r(c)}).observe(document,{childList:!0,subtree:!0});function n(a){const i={};return a.integrity&&(i.integrity=a.integrity),a.referrerPolicy&&(i.referrerPolicy=a.referrerPolicy),a.crossOrigin==="use-credentials"?i.credentials="include":a.crossOrigin==="anonymous"?i.credentials="omit":i.credentials="same-origin",i}function r(a){if(a.ep)return;a.ep=!0;const i=n(a);fetch(a.href,i)}})();var Pe={exports:{}},re={};/**
2
+ * @license React
3
+ * react-jsx-runtime.production.min.js
4
+ *
5
+ * Copyright (c) Facebook, Inc. and its affiliates.
6
+ *
7
+ * This source code is licensed under the MIT license found in the
8
+ * LICENSE file in the root directory of this source tree.
9
+ */var Cs=l,Ts=Symbol.for("react.element"),Ps=Symbol.for("react.fragment"),Bs=Object.prototype.hasOwnProperty,Ds=Cs.__SECRET_INTERNALS_DO_NOT_USE_OR_YOU_WILL_BE_FIRED.ReactCurrentOwner,Us={key:!0,ref:!0,__self:!0,__source:!0};function Be(s,t,n){var r,a={},i=null,c=null;n!==void 0&&(i=""+n),t.key!==void 0&&(i=""+t.key),t.ref!==void 0&&(c=t.ref);for(r in t)Bs.call(t,r)&&!Us.hasOwnProperty(r)&&(a[r]=t[r]);if(s&&s.defaultProps)for(r in t=s.defaultProps,t)a[r]===void 0&&(a[r]=t[r]);return{$$typeof:Ts,type:s,key:i,ref:c,props:a,_owner:Ds.current}}re.Fragment=Ps;re.jsx=Be;re.jsxs=Be;Pe.exports=re;var e=Pe.exports,ce={},Re=Es;ce.createRoot=Re.createRoot,ce.hydrateRoot=Re.hydrateRoot;function Hs({title:s,titleId:t,...n},r){return l.createElement("svg",Object.assign({xmlns:"http://www.w3.org/2000/svg",fill:"none",viewBox:"0 0 24 24",strokeWidth:1.5,stroke:"currentColor","aria-hidden":"true",ref:r,"aria-labelledby":t},n),s?l.createElement("title",{id:t},s):null,l.createElement("path",{strokeLinecap:"round",strokeLinejoin:"round",d:"M3.75 6.75h16.5M3.75 12h16.5m-16.5 5.25h16.5"}))}const Os=l.forwardRef(Hs),De=Os;function Fs({title:s,titleId:t,...n},r){return l.createElement("svg",Object.assign({xmlns:"http://www.w3.org/2000/svg",fill:"none",viewBox:"0 0 24 24",strokeWidth:1.5,stroke:"currentColor","aria-hidden":"true",ref:r,"aria-labelledby":t},n),s?l.createElement("title",{id:t},s):null,l.createElement("path",{strokeLinecap:"round",strokeLinejoin:"round",d:"M9 12.75L11.25 15 15 9.75M21 12a9 9 0 11-18 0 9 9 0 0118 0z"}))}const _s=l.forwardRef(Fs),zs=_s;function Vs({title:s,titleId:t,...n},r){return l.createElement("svg",Object.assign({xmlns:"http://www.w3.org/2000/svg",fill:"none",viewBox:"0 0 24 24",strokeWidth:1.5,stroke:"currentColor","aria-hidden":"true",ref:r,"aria-labelledby":t},n),s?l.createElement("title",{id:t},s):null,l.createElement("path",{strokeLinecap:"round",strokeLinejoin:"round",d:"M9.75 9.75l4.5 4.5m0-4.5l-4.5 4.5M21 12a9 9 0 11-18 0 9 9 0 0118 0z"}))}const 
Ws=l.forwardRef(Vs),qs=Ws,Ue=""+new URL("crfm-logo-74391ab8.png",import.meta.url).href,He=""+new URL("helm-logo-simple-2ed5400b.png",import.meta.url).href;function Qs({title:s,titleId:t,...n},r){return l.createElement("svg",Object.assign({xmlns:"http://www.w3.org/2000/svg",viewBox:"0 0 24 24",fill:"currentColor","aria-hidden":"true",ref:r,"aria-labelledby":t},n),s?l.createElement("title",{id:t},s):null,l.createElement("path",{fillRule:"evenodd",d:"M12 2.25a.75.75 0 01.75.75v11.69l3.22-3.22a.75.75 0 111.06 1.06l-4.5 4.5a.75.75 0 01-1.06 0l-4.5-4.5a.75.75 0 111.06-1.06l3.22 3.22V3a.75.75 0 01.75-.75zm-9 13.5a.75.75 0 01.75.75v2.25a1.5 1.5 0 001.5 1.5h13.5a1.5 1.5 0 001.5-1.5V16.5a.75.75 0 011.5 0v2.25a3 3 0 01-3 3H5.25a3 3 0 01-3-3V16.5a.75.75 0 01.75-.75z",clipRule:"evenodd"}))}const Gs=l.forwardRef(Qs),Ks=Gs;function Js({title:s,titleId:t,...n},r){return l.createElement("svg",Object.assign({xmlns:"http://www.w3.org/2000/svg",viewBox:"0 0 24 24",fill:"currentColor","aria-hidden":"true",ref:r,"aria-labelledby":t},n),s?l.createElement("title",{id:t},s):null,l.createElement("path",{fillRule:"evenodd",d:"M15.75 2.25H21a.75.75 0 01.75.75v5.25a.75.75 0 01-1.5 0V4.81L8.03 17.03a.75.75 0 01-1.06-1.06L19.19 3.75h-3.44a.75.75 0 010-1.5zm-10.5 4.5a1.5 1.5 0 00-1.5 1.5v10.5a1.5 1.5 0 001.5 1.5h10.5a1.5 1.5 0 001.5-1.5V10.5a.75.75 0 011.5 0v8.25a3 3 0 01-3 3H5.25a3 3 0 01-3-3V8.25a3 3 0 013-3h8.25a.75.75 0 010 1.5H5.25z",clipRule:"evenodd"}))}const Xs=l.forwardRef(Js),Ys=Xs;function $s({title:s,titleId:t,...n},r){return l.createElement("svg",Object.assign({xmlns:"http://www.w3.org/2000/svg",viewBox:"0 0 24 24",fill:"currentColor","aria-hidden":"true",ref:r,"aria-labelledby":t},n),s?l.createElement("title",{id:t},s):null,l.createElement("path",{fillRule:"evenodd",d:"M15 3.75a.75.75 0 01.75-.75h4.5a.75.75 0 01.75.75v4.5a.75.75 0 01-1.5 0V5.56l-3.97 3.97a.75.75 0 11-1.06-1.06l3.97-3.97h-2.69a.75.75 0 01-.75-.75zm-12 0A.75.75 0 013.75 3h4.5a.75.75 0 010 1.5H5.56l3.97 3.97a.75.75 0 
01-1.06 1.06L4.5 5.56v2.69a.75.75 0 01-1.5 0v-4.5zm11.47 11.78a.75.75 0 111.06-1.06l3.97 3.97v-2.69a.75.75 0 011.5 0v4.5a.75.75 0 01-.75.75h-4.5a.75.75 0 010-1.5h2.69l-3.97-3.97zm-4.94-1.06a.75.75 0 010 1.06L5.56 19.5h2.69a.75.75 0 010 1.5h-4.5a.75.75 0 01-.75-.75v-4.5a.75.75 0 011.5 0v2.69l3.97-3.97a.75.75 0 011.06 0z",clipRule:"evenodd"}))}const Zs=l.forwardRef($s),et=Zs;function st({title:s,titleId:t,...n},r){return l.createElement("svg",Object.assign({xmlns:"http://www.w3.org/2000/svg",viewBox:"0 0 24 24",fill:"currentColor","aria-hidden":"true",ref:r,"aria-labelledby":t},n),s?l.createElement("title",{id:t},s):null,l.createElement("path",{fillRule:"evenodd",d:"M12.53 16.28a.75.75 0 01-1.06 0l-7.5-7.5a.75.75 0 011.06-1.06L12 14.69l6.97-6.97a.75.75 0 111.06 1.06l-7.5 7.5z",clipRule:"evenodd"}))}const tt=l.forwardRef(st),Oe=tt;function nt({title:s,titleId:t,...n},r){return l.createElement("svg",Object.assign({xmlns:"http://www.w3.org/2000/svg",viewBox:"0 0 24 24",fill:"currentColor","aria-hidden":"true",ref:r,"aria-labelledby":t},n),s?l.createElement("title",{id:t},s):null,l.createElement("path",{fillRule:"evenodd",d:"M11.47 4.72a.75.75 0 011.06 0l3.75 3.75a.75.75 0 01-1.06 1.06L12 6.31 8.78 9.53a.75.75 0 01-1.06-1.06l3.75-3.75zm-3.75 9.75a.75.75 0 011.06 0L12 17.69l3.22-3.22a.75.75 0 111.06 1.06l-3.75 3.75a.75.75 0 01-1.06 0l-3.75-3.75a.75.75 0 010-1.06z",clipRule:"evenodd"}))}const rt=l.forwardRef(nt),at=rt;function lt({title:s,titleId:t,...n},r){return l.createElement("svg",Object.assign({xmlns:"http://www.w3.org/2000/svg",viewBox:"0 0 24 24",fill:"currentColor","aria-hidden":"true",ref:r,"aria-labelledby":t},n),s?l.createElement("title",{id:t},s):null,l.createElement("path",{fillRule:"evenodd",d:"M10.5 3.75a6.75 6.75 0 100 13.5 6.75 6.75 0 000-13.5zM2.25 10.5a8.25 8.25 0 1114.59 5.28l4.69 4.69a.75.75 0 11-1.06 1.06l-4.69-4.69A8.25 8.25 0 012.25 10.5z",clipRule:"evenodd"}))}const it=l.forwardRef(lt),ct=it;function he(s,t){return 
t?t==="home"?"https://crfm.stanford.edu/helm/":s?`https://crfm.stanford.edu/helm/${t}/${s}/`:`https://crfm.stanford.edu/helm/${t}/latest/`:"#"}function Fe(){const[s,t]=l.useState([]),[n,r]=l.useState();return l.useEffect(()=>{fetch("https://raw.githubusercontent.com/stanford-crfm/helm/main/helm-frontend/project_metadata.json").then(a=>a.json()).then(a=>{if(t(a),window.PROJECT_ID){const i=a.find(c=>c.id===window.PROJECT_ID);r(i)}else{const i=a.find(c=>c.id==="lite");r(i)}}).catch(a=>{console.error("Error fetching JSON:",a)})},[]),n===void 0||n.title===void 0?null:e.jsxs("div",{className:"dropdown z-50",children:[e.jsxs("div",{tabIndex:0,role:"button",className:"btn normal-case bg-white font-bold p-2 border-0 text-lg block whitespace-nowrap z-40","aria-haspopup":"true","aria-controls":"menu",children:[n.title," ",e.jsx(Oe,{fill:"black",color:"black",className:"text w-4 h-4 inline"})]}),e.jsx("ul",{tabIndex:0,className:"-translate-x-36 dropdown-content z-[1] menu p-1 shadow-lg bg-base-100 rounded-box w-max text-base",role:"menu",children:s.map((a,i)=>e.jsx("li",{className:"z-40",children:e.jsxs("a",{href:he(void 0,a.id),className:"block",role:"menuitem",children:[e.jsx("strong",{className:n.title===a.title?"underline":"",children:a.title}),": ",a.description]})},i))})]})}function A(s){return`${window.BENCHMARK_OUTPUT_BASE_URL.replace(/\/$/,"")}/${s.replace(/^\//,"")}`}function $(){return window.RELEASE?`/releases/${window.RELEASE}`:`/runs/${window.SUITE}`}async function ot(s){try{return await(await fetch(A(`${$()}/summary.json`),{signal:s})).json()}catch(t){return console.log(t),{release:void 0,suites:void 0,suite:void 0,date:""}}}function dt(){const[s,t]=l.useState({release:void 0,suites:void 0,suite:void 0,date:""}),[n,r]=l.useState();l.useEffect(()=>{fetch("https://raw.githubusercontent.com/stanford-crfm/helm/main/helm-frontend/project_metadata.json").then(o=>o.json()).then(o=>{if(window.PROJECT_ID){const m=o.find(p=>p.id===window.PROJECT_ID);r(m)}else{const 
m=o.find(p=>p.id==="lite");r(m)}}).catch(o=>{console.error("Error fetching JSON:",o)})},[]);function a(){return n!==void 0&&n.releases!==void 0?n.releases:["v1.0.0"]}l.useEffect(()=>{const o=new AbortController;async function m(){const p=await ot(o.signal);t(p)}return m(),()=>o.abort()},[]);const i=a();if(!s.release&&!s.suite)return null;const c=`Release ${s.release||s.suite} (${s.date})`;return i.length<=1?e.jsx("div",{children:c}):e.jsxs("div",{className:"dropdown",children:[e.jsxs("div",{tabIndex:0,role:"button",className:"normal-case bg-white border-0 block whitespace-nowrap","aria-haspopup":"true","aria-controls":"menu",children:[c," ",e.jsx(Oe,{fill:"black",color:"black",className:"inline text w-4 h-4"})]}),e.jsx("ul",{tabIndex:0,className:"dropdown-content z-[1] menu p-1 shadow-lg bg-base-100 rounded-box w-max text-base",role:"menu",children:i.map(o=>e.jsx("li",{children:e.jsx("a",{href:he(o,n?n.id:"lite"),className:"block",role:"menuitem",children:o})},o))})]})}function mt(){return e.jsxs("nav",{className:"navbar h-24 px-8 md:px-12 bg-base-100 max-w[1500]px",children:[e.jsx("div",{children:e.jsxs("div",{className:"dropdown md:hidden mr-4",children:[e.jsx("label",{tabIndex:0,className:"btn btn-ghost hover:bg-transparent btn-lg px-0",children:e.jsx(De,{className:"w-16 h-16"})}),e.jsxs("ul",{tabIndex:0,className:"menu menu-lg dropdown-content mt-3 z-[1] p-2 bg-base-100 shadow",children:[e.jsx("li",{children:e.jsx(f,{to:"leaderboard",children:"Leaderboard"})}),e.jsx("li",{children:e.jsx(f,{to:"models",children:"Models"})}),e.jsx("li",{children:e.jsx(f,{to:"scenarios",children:"Scenarios"})}),e.jsx("li",{children:e.jsx(f,{to:"runs",className:"whitespace-nowrap",children:"Predictions"})}),e.jsx("li",{children:e.jsx(f,{to:"https://github.com/stanford-crfm/helm",children:"GitHub"})})]})]})}),e.jsxs("div",{className:"flex-1 
items-center",children:[e.jsx("a",{href:"https://crfm.stanford.edu/",className:"w-24",children:e.jsx("img",{src:Ue,className:"object-contain"})}),e.jsx(f,{to:"/",className:"mx-2 w-32",children:e.jsx("img",{src:He,className:"object-contain"})}),e.jsx(Fe,{})]}),e.jsx("div",{className:"flex-none hidden md:block",children:e.jsxs("ul",{className:"flex flex-row gap-6 px-1",children:[e.jsx("li",{children:e.jsx(f,{to:"leaderboard",children:"Leaderboard"})}),e.jsx("li",{children:e.jsx(f,{to:"models",children:"Models"})}),e.jsx("li",{children:e.jsx(f,{to:"scenarios",children:"Scenarios"})}),e.jsx("li",{children:e.jsx(f,{to:"runs",children:"Predictions"})}),e.jsx("li",{children:e.jsx(f,{to:"https://github.com/stanford-crfm/helm",children:"GitHub"})}),e.jsx("li",{className:"hidden lg:flex",children:e.jsx(dt,{})})]})})]})}function ht(){return e.jsxs("nav",{className:"navbar h-24 px-8 md:px-12 bg-base-100 max-w[1500]px",children:[e.jsx("div",{children:e.jsx("div",{className:"dropdown md:hidden mr-4",children:e.jsx("label",{tabIndex:0,className:"btn btn-ghost hover:bg-transparent btn-lg px-0",children:e.jsx(De,{className:"w-16 h-16"})})})}),e.jsxs("div",{className:"flex-1 items-center",children:[e.jsx(f,{to:"https://crfm.stanford.edu/",className:"w-24",children:e.jsx("img",{src:Ue,className:"object-contain"})}),e.jsx(f,{to:"/",className:"mx-2 w-32",children:e.jsx("img",{src:He,className:"object-contain"})}),e.jsx(Fe,{})]})]})}function xt(){return e.jsxs(e.Fragment,{children:[window.PROJECT_ID==="home"?e.jsx(ht,{}):e.jsx(mt,{}),e.jsx("main",{className:"p-8 pt-0",children:e.jsx("div",{className:"mx-auto max-w-[1500]px",children:e.jsx(ys,{})})})]})}async function L(s){try{return await(await fetch(A(`${$()}/schema.json`),{signal:s})).json()}catch(t){return t instanceof Error&&t.name!=="AbortError"&&console.log(t),{adapter:[],metric_groups:[],metrics:[],models:[],perturbations:[],run_groups:[]}}}function ut({href:s,children:t}){return e.jsx("a",{href:s,className:"link link-primary 
link-hover",target:"_blank",rel:"noreferrer",children:t})}function G({value:s}){return e.jsx("span",{children:e.jsx(Ms,{components:{a:ut},children:s})})}function k({title:s,subtitle:t,markdown:n=!1}){return e.jsxs("header",{className:"m-4 ml-0",children:[e.jsx("h1",{className:"text-4xl",children:s}),n&&t!==void 0?e.jsx("h2",{className:"mt-2 text-neutral",children:e.jsx(G,{value:t})}):t!==void 0&&e.jsx("h2",{className:"mt-2 text-neutral",children:t})]})}const ft={open:"green",limited:"yellow",closed:"red"},gt={open:"Open",limited:"Limited",closed:"Closed"};function pt({level:s}){return e.jsx(Y,{color:ft[s],children:gt[s]})}function U(){return e.jsx("div",{className:"w-full",children:e.jsx("div",{className:"block mx-auto my-24 loading loading-spinner loading-lg"})})}function jt(){const[s,t]=l.useState([]);l.useEffect(()=>{const c=new AbortController;async function o(){const m=await L(c.signal);t(m.models)}return o(),()=>c.abort()},[]);const[n,r,a]=s.reduce((c,o)=>{switch(o.access){case"open":c[0]+=1;break;case"limited":c[1]+=1;break;case"closed":c[2]+=1;break}return c},[0,0,0]),i=Object.values(s.reduce((c,o)=>{const m=o.creator_organization;return c[m]===void 0?(c[m]={name:m,models:1},c):(c[m].models+=1,c)},{}));return s.length===0?e.jsx(U,{}):e.jsxs(e.Fragment,{children:[e.jsx(k,{title:"Models"}),e.jsxs("div",{className:"overflow-x-auto 
mt-12",children:[e.jsxs("table",{className:"table",children:[e.jsx("thead",{children:e.jsxs("tr",{children:[e.jsx("th",{children:"Creator"}),e.jsx("th",{children:"Model"}),e.jsx("th",{children:"Description"}),e.jsx("th",{children:"Access"})]})}),e.jsx("tbody",{children:s.map(c=>e.jsxs("tr",{children:[e.jsx("td",{className:"text-lg",children:c.creator_organization}),e.jsxs("td",{children:[e.jsx("span",{className:"text-xl",children:c.display_name}),e.jsx("br",{}),e.jsx("span",{children:c.name})]}),e.jsx("td",{children:e.jsx(G,{value:c.description})}),e.jsx("td",{children:e.jsx(pt,{level:c.access})})]}))})]}),e.jsx(k,{title:"Analysis"}),e.jsxs("div",{className:"grid md:grid-cols-3 grid-cols-1 gap-8",children:[e.jsxs(X,{className:"flex flex-col justify-between",children:[e.jsx(ie,{children:"Models"}),e.jsx(Te,{className:"text-6xl md:!text-[96px]",children:s.length}),e.jsx(Ls,{values:[n,r,a],colors:["green","yellow","red"]}),e.jsx(ye,{categories:["Open","Limited","Closed"],colors:["green","yellow","red"]})]}),e.jsxs(X,{className:"md:col-span-2",children:[e.jsx(ie,{children:"Creator Organizations"}),e.jsxs("div",{className:"flex justify-between mt-4",children:[e.jsx(ks,{data:i,category:"models",index:"name",variant:"pie",className:"basis-5/12"}),e.jsx(ye,{categories:i.map(c=>c.name),className:"basis-7/12"})]})]})]})]})]})}function te({to:s,children:t,inTable:n=!1,title:r=""}){return n?e.jsx(f,{className:"link link-hover",to:s,title:r,children:t}):e.jsx(f,{className:"link link-primary link-hover",to:s,children:t})}function bt(){const[s,t]=l.useState([]);l.useEffect(()=>{const r=new AbortController;async function a(){const i=await L(r.signal);t(i.run_groups.filter(c=>!c.todo&&c.taxonomy&&!c.display_name.includes("CLEVA")))}return a(),()=>r.abort()},[]);const n=Object.values(s.reduce((r,a)=>{var c;const i=((c=a.taxonomy)==null?void 0:c.task)||"Unknown";return r[i]===void 0?(r[i]={name:i,value:1},r):(r[i].value+=1,r)},{}));return 
s.length===0?e.jsx(U,{}):(console.log(s),e.jsxs(e.Fragment,{children:[e.jsx(k,{title:"Scenarios",subtitle:"A scenario represents a use case and consists of a dataset of instances."}),e.jsxs("div",{className:"overflow-x-auto mt-12",children:[e.jsxs("table",{className:"table",children:[e.jsx("thead",{children:e.jsxs("tr",{children:[e.jsx("th",{children:"Scenario"}),e.jsx("th",{children:"Task"}),e.jsx("th",{children:"What"}),e.jsx("th",{children:"Who"}),e.jsx("th",{children:"When"}),e.jsx("th",{children:"Language"}),e.jsx("th",{children:"Description"})]})}),e.jsx("tbody",{children:s.map(r=>{var a,i,c,o,m;return e.jsxs("tr",{children:[e.jsxs("td",{children:[e.jsx(te,{to:`/groups/${r.name}`,children:e.jsx("span",{className:"text-lg",children:r.display_name})}),e.jsx("span",{className:"block",children:r.name})]}),e.jsx("td",{children:((a=r.taxonomy)==null?void 0:a.task)||""}),e.jsx("td",{children:((i=r.taxonomy)==null?void 0:i.what)||""}),e.jsx("td",{children:((c=r.taxonomy)==null?void 0:c.who)||""}),e.jsx("td",{children:((o=r.taxonomy)==null?void 0:o.when)||""}),e.jsx("td",{children:((m=r.taxonomy)==null?void 0:m.language)||""}),e.jsx("td",{children:e.jsx(G,{value:r.description})})]})})})]}),e.jsx(k,{title:"Analysis"}),e.jsxs("div",{className:"grid md:grid-cols-4 gap-8",children:[e.jsxs(X,{className:"flex flex-col",children:[e.jsx(ie,{children:"Total scenarios"}),e.jsx(Te,{className:"mx-auto my-6 md:mt-16 !text-[72px] md:!text-[96px]",children:s.length})]}),e.jsx(X,{className:"col-span-3",children:e.jsxs("div",{className:"grid md:grid-cols-2 gap-x-12",children:[e.jsx(Me,{data:n.slice(0,Math.floor(n.length/2))}),e.jsx(Me,{data:n.slice(Math.ceil(n.length/2))})]})})]})]})]}))}function wt(){return A(`${$()}/groups.json`)}async function _e(s){try{return await(await fetch(wt(),{signal:s})).json()}catch(t){return t instanceof Error&&t.name!=="AbortError"&&console.log(t),[]}}function xe({children:s}){return e.jsx("div",{role:"navigation",className:"tabs flex-nowrap border-b-2 
border-gray-2 overflow-x-auto overflow-y-hidden",children:s})}function ne({active:s=!1,onClick:t=()=>{},size:n="md",children:r}){return e.jsx("div",{onClick:t,className:`whitespace-nowrap text-${n} mb-[-2px] text-md tab tab-bordered${s?" border-2 border-grey-500 rounded":" border-none"}`,children:r})}function vt({title:s,titleId:t,...n},r){return l.createElement("svg",Object.assign({xmlns:"http://www.w3.org/2000/svg",viewBox:"0 0 20 20",fill:"currentColor","aria-hidden":"true",ref:r,"aria-labelledby":t},n),s?l.createElement("title",{id:t},s):null,l.createElement("path",{fillRule:"evenodd",d:"M4.25 5.5a.75.75 0 00-.75.75v8.5c0 .414.336.75.75.75h8.5a.75.75 0 00.75-.75v-4a.75.75 0 011.5 0v4A2.25 2.25 0 0112.75 17h-8.5A2.25 2.25 0 012 14.75v-8.5A2.25 2.25 0 014.25 4h5a.75.75 0 010 1.5h-5z",clipRule:"evenodd"}),l.createElement("path",{fillRule:"evenodd",d:"M6.194 12.753a.75.75 0 001.06.053L16.5 4.44v2.81a.75.75 0 001.5 0v-4.5a.75.75 0 00-.75-.75h-4.5a.75.75 0 000 1.5h2.553l-9.056 8.194a.75.75 0 00-.053 1.06z",clipRule:"evenodd"}))}const Nt=l.forwardRef(vt),Ie=Nt;function Q(s){return Number.isNaN(Number(s))?String(s):String(Math.round(Number(s)*1e3)/1e3)}function Se({value:s,title:t,hideIcon:n}){if(typeof s.value=="string"&&s.value.includes("⚠")&&(s.value=s.value.replace("⚠","")),s.value===void 0)return"-";if(s.run_spec_names){const r=(()=>{if(s.run_spec_names.length==1)return"/runs/"+s.run_spec_names[0];if(s.run_spec_names.length>1){const a="/runs/?q="+s.run_spec_names.map(c=>`^${c}$`).join("|");return encodeURI(a)}})();return r?e.jsx(te,{to:r,inTable:!0,title:t,children:e.jsxs("div",{className:"flex items-center ",children:[Q(s.value),!n&&e.jsx(Ie,{className:"w-3 h-3 ml-1",stroke:"#cbcbcb",fill:"#cbcbcb"})]})}):t?e.jsx("a",{title:t,children:Q(s.value)}):e.jsx(e.Fragment,{children:Q(s.value)})}return s.href?e.jsx(te,{to:s.href,inTable:!0,title:t,children:e.jsxs("div",{className:"flex items-center",children:[Q(s.value),!n&&e.jsx(Ie,{className:"w-3 h-3 
ml-1",stroke:"#cbcbcb",fill:"#cbcbcb"})]})}):s.markdown?e.jsx(G,{value:String(s.value)}):t?e.jsx("a",{title:t,children:Q(s.value)}):e.jsx(e.Fragment,{children:Q(s.value)})}function ue({schema:s,groupTable:t,numRowsToDisplay:n,sortColumnIndex:r=1,sortable:a=!0,displayColumnIndexes:i=void 0,miniStyle:c=!1}){const[o,m]=l.useState(1),[p,E]=l.useState(r);function v(h){return h.length>30?h.substring(0,27)+"...":h}const y=h=>{const g=["AIRBench 2024 -","-book"];if(h.value==="Model/adapter")return"Model";if(g.some(d=>h.value.includes(d))){let d=h.value;return g.forEach(b=>{d=d.replace(b,"")}),v(d)}else return v(h.value)},I=h=>{if(s){const g=s.models.find(d=>d.display_name===h);if(g){let d=g.description;return d.includes("/")&&(d=d.replace("/","_")),d}}return""},C=h=>{m(h===p?o*-1:h===0?-1:1),E(h)},u=h=>{if(s){const g=s.models.find(d=>d.display_name===h);if(g){let d=g.name;return d.includes("/")&&(d=d.replace("/","_")),d}}return""},H=()=>{const h=t.header[p].lower_is_better,g=o*(h?1:-1),d=t.rows.slice();return d.sort((b,_)=>{var Z,O;const T=(Z=b[p])==null?void 0:Z.value,S=(O=_[p])==null?void 0:O.value;return T!==void 0&&S===void 0?-1:S!==void 0&&T===void 0?1:typeof T=="number"&&typeof S=="number"?(T-S)*g:typeof T=="string"&&typeof S=="string"?g===1?T.localeCompare(S):S.localeCompare(T):0}),n>0?d.slice(0,n):d};function M(h){const g=h.lastIndexOf(" - ");return g===-1?h:h.substring(0,g)+"*"+h.substring(g+1)}const K=h=>{const d=M(h).split("*")[0].trim();if(s){const b=s.run_groups.find(_=>_.display_name===d||_.short_display_name===d);if(b)return b.name}return""};return e.jsxs("table",{className:c?"table w-full":"rounded-lg shadow-md table",children:[e.jsx("thead",{children:e.jsx("tr",{children:t.header.filter((h,g)=>i===void 0||i.includes(g)).map((h,g)=>e.jsx("th",{className:`${g===p?"bg-gray-100":"bg-white"} ${g===0?"left-0 z-40":""} ${h.description?"underline decoration-dashed decoration-gray-300 ":""} whitespace-nowrap px-4 sticky 
top-0`,title:h.description?h.description:"",children:e.jsxs("div",{className:c?"flex gap-2 items-center":"z-20 flex justify-between items-center min-w-48 w-48 max-w-48 text-wrap",children:[e.jsx("span",{className:"inline-block w-full break-words",children:y(h)}),a?e.jsx("button",{className:"link",onClick:()=>C(g),children:e.jsx(at,{className:"w-6 h-6"})}):null]})},`$${g}`))})}),e.jsx("tbody",{children:H().map((h,g)=>e.jsx("tr",{children:h.filter((d,b)=>i===void 0||i.includes(b)).map((d,b)=>e.jsx("td",{className:`${b===0?"z-20 text-lg sticky left-0":"z-0"} ${g%2===0?"bg-gray-50":"bg-white"}`,children:b==1?e.jsx("div",{className:`${d&&d.style&&d.style["font-weight"]&&d.style["font-weight"]==="bold"?"font-bold":""}`,children:e.jsx(Se,{value:{...d,href:"/runs/?q="+u(String(h[0].value))},title:`Click value to see all predictions for: ${u(String(h[0].value))}`})}):e.jsx("div",{className:`${d&&d.style&&d.style["font-weight"]&&d.style["font-weight"]==="bold"?"font-bold":""} ${b===0?"underline decoration-dashed decoration-gray-300 z-10":"z-0"}`,children:e.jsx(Se,{value:{...d},title:String(h[0].value)===d.value?I(String(h[0].value)):`Click value to see predictions for ${String(h[0].value)} for ${K(y(t.header[b]))}: ${u(String(h[0].value))}`})})},`${b}`))},`$${h[0].value}`))})]})}function At(){const[s,t]=l.useState(0),[n,r]=l.useState(),[a,i]=l.useState();return l.useEffect(()=>{const c=new AbortController;async function o(){const m=L(c.signal),p=_e(c.signal),E=await m;i(E);const v=await p;r(v)}return o(),()=>c.abort()},[]),n===void 0||a===void 0?e.jsx(U,{}):n.length===0?e.jsxs("div",{children:[e.jsx(k,{title:"Results",subtitle:"Groupings of the processes, methods, and metrics involved in evaluating models, particularly in the context of natural language understanding and question answering.",className:"mb-16"}),e.jsx("div",{children:"No groups found."})]}):e.jsxs("div",{children:[e.jsx(k,{title:"Results",subtitle:"Groupings of the processes, methods, and metrics involved in 
evaluating models, particularly in the context of natural language understanding and question answering.",className:"mb-16"}),e.jsxs("div",{children:[n.length>1?e.jsx(xe,{children:n.map((c,o)=>e.jsx(ne,{active:o===s,onClick:()=>t(o),children:c.title},o))}):null,e.jsx(ue,{schema:a,groupTable:n[s],numRowsToDisplay:-1,sortColumnIndex:1,sortable:!0},`${s}`)]})]})}async function ze(s,t){try{return await(await fetch(A(`${$()}/groups/${s}.json`),{signal:t})).json()}catch(n){return n instanceof Error&&n.name!=="AbortError"&&console.log(n),[]}}function Ve({schema:s,runGroupName:t,numRowsToDisplay:n=-1}){const[r,a]=l.useState(),[i,c]=l.useState(0);return l.useEffect(()=>{const o=new AbortController;async function m(){const p=await ze(t,o.signal);a(p)}return m(),()=>o.abort()},[s,t]),r===void 0||r.length===0?e.jsx(U,{}):r.length===0?e.jsx("div",{children:"Group currently has no tables."}):e.jsxs("div",{children:[r.length>1?e.jsx(xe,{children:r.map((o,m)=>e.jsx(ne,{active:m===i,onClick:()=>c(m),children:o.title},m))}):null,e.jsx(ue,{schema:s,groupTable:r[i],numRowsToDisplay:n,sortColumnIndex:1},`${t}-${i}`)]})}function Et(){const{groupName:s}=ke(),[t,n]=l.useState(void 0);l.useEffect(()=>{const i=new AbortController;async function c(){const m=await L(i.signal);n(m)}return c(),()=>i.abort()},[]);const a=(()=>{if(t!==void 0){for(const i of t.run_groups)if(i.name===s)return i}})();return t===void 0?e.jsx(U,{}):a===void 0?e.jsxs("div",{children:['Group "',s,'" not found.']}):e.jsxs(e.Fragment,{children:[e.jsx(k,{title:a.display_name,subtitle:a.description,markdown:!0,className:"mr-8"}),e.jsx(Ve,{schema:t,runGroupName:a.name},a.name)]})}async function We(s){try{return await(await fetch(A(`${$()}/run_specs.json`),{signal:s})).json()}catch(t){return t instanceof Error&&t.name!=="AbortError"&&console.log(t),[]}}function oe({currentPage:s,totalPages:t,onNextPage:n,onPrevPage:r,className:a}){let i="join";return a!==void 0&&(i=`join 
${a}`),e.jsxs("div",{className:i,children:[e.jsx("button",{onClick:r,className:"join-item btn",children:"«"}),e.jsxs("button",{className:"join-item btn",children:["Page ",s," of ",t]}),e.jsx("button",{onClick:n,className:"join-item btn",children:"»"})]})}const le=100;function yt(){const[s,t]=Ce(),[n,r]=l.useState(),[a,i]=l.useState(Number(s.get("page")||1)),[c,o]=l.useState(!0),[m,p]=l.useState(s.get("q")||"");l.useEffect(()=>{const u=new AbortController;async function H(){const M=await We(u.signal);r(M)}return H(),()=>u.abort()},[]);const E=u=>{u.preventDefault();const M=u.target.q.value;p(M),t({q:M,page:"1"})};if(n===void 0)return e.jsx(U,{});const v=c?new RegExp(m):null,y=n.filter(u=>v?v.test(u.name):u.name.includes(m)),I=y.slice((a-1)*le,a*le),C=Math.ceil(y.length/le);return e.jsxs(e.Fragment,{children:[e.jsx(k,{title:"Predictions",subtitle:"All benchmark predictions"}),e.jsxs("form",{className:"flex mb-8",onSubmit:E,children:[e.jsxs("div",{className:"form-control",children:[e.jsx("input",{type:"text",name:"q",placeholder:"Search",className:"input input-bordered",value:m,onChange:u=>p(u.target.value)}),e.jsxs("label",{className:"label",children:[e.jsxs("span",{className:"label-text-alt flex item-center",children:[e.jsx("input",{type:"checkbox",className:"toggle toggle-xs",checked:c,onChange:()=>o(!c)}),e.jsx("span",{className:"ml-2",children:"Regex"})]}),e.jsx("span",{className:"label-text-alt",children:`${y.length} results`})]})]}),e.jsx("div",{className:"form-control ml-4",children:e.jsx("button",{className:"btn",children:e.jsx(ct,{className:"w-6 h-6"})})})]}),e.jsx("div",{className:"overflow-x-auto",children:e.jsxs("table",{className:"table",children:[e.jsx("thead",{children:e.jsxs("tr",{children:[e.jsx("th",{children:"Run"}),e.jsx("th",{children:"Model"}),e.jsx("th",{children:"Groups"}),e.jsx("th",{children:"Adapter method"}),e.jsx("th",{children:"Subject / 
Task"})]})}),e.jsx("tbody",{children:I.map((u,H)=>e.jsxs("tr",{children:[e.jsx("td",{children:e.jsx(te,{to:`/runs/${u.name}`,children:u.name})}),e.jsx("td",{children:u.adapter_spec.model}),e.jsx("td",{children:u.groups.join(", ")}),e.jsx("td",{children:u.adapter_spec.method}),e.jsx("td",{children:u.scenario_spec.args.subject||u.scenario_spec.args.task||"-"})]},`${u.name}-${H}`))})]})}),C>0?e.jsx(oe,{className:"flex justify-center my-8",onNextPage:()=>{const u=Math.min(a+1,C);i(u),s.set("page",String(u)),t(s)},onPrevPage:()=>{const u=Math.max(a-1,1);i(u),s.set("page",String(u)),t(s)},currentPage:a,totalPages:C}):e.jsx("div",{className:"my-8 text-center",children:"No results"})]})}function V(){return window.SUITE!==void 0?window.SUITE:void 0}async function Mt(s,t,n){try{return await(await fetch(A(`/runs/${n||V()}/${s}/instances.json`),{signal:t})).json()}catch(r){return r instanceof Error&&r.name!=="AbortError"&&console.log(r),[]}}async function Rt(s,t,n){try{return await(await fetch(A(`/runs/${n||V()}/${s}/stats.json`),{signal:t})).json()}catch(r){return r instanceof Error&&r.name!=="AbortError"&&console.log(r),[]}}async function It(s,t,n){try{return await(await fetch(A(`/runs/${n||V()}/${s}/display_requests.json`),{signal:t})).json()}catch(r){return r instanceof Error&&r.name!=="AbortError"&&console.log(r),[]}}async function St(s,t,n){try{return await(await fetch(A(`/runs/${n||V()}/${s}/display_predictions.json`),{signal:t})).json()}catch(r){return r instanceof Error&&r.name==="AbortError"&&console.log(r),[]}}async function Lt(s,t,n){try{return await(await fetch(A(`/runs/${n||V()}/${s}/scenario.json`),{signal:t})).json()}catch(r){r instanceof Error&&r.name!=="AbortError"&&console.log(r);return}}function kt(s,t){return A(`/runs/${t||V()}/${s}/run_spec.json`)}function Ct(s,t){return A(`/runs/${t||V()}/${s}/scenario_state.json`)}function Tt(s){const n={quasi_exact_match:!1,toxic_frac:!0,safety_score:!1,exact_match:!1},r=Object.keys(s);for(const a of r)if(s[a]!==void 
0&&n[a]!==void 0)return n[a]?s[a]<.5?[a,!0]:[a,!1]:s[a]>=.5?[a,!0]:[a,!1];return["",!1]}function Pt(s){const[t,n]=Tt(s.stats);return t===""?null:n?e.jsx(Bt,{value:`${t.replace(/_/g," ")}: ${s.stats[t]}`}):e.jsx(Dt,{value:`${t.replace(/_/g," ")}: ${s.stats[t]}`})}function Bt({value:s}){return e.jsx(Y,{icon:zs,color:"green",children:s})}function Dt({value:s}){return e.jsx(Y,{icon:qs,color:"red",children:s})}function z({value:s}){const[t,n]=l.useState(!1),[r,a]=l.useState(!1);return e.jsxs(e.Fragment,{children:[e.jsxs("div",{onMouseOver:()=>n(!0),onMouseOut:()=>n(!1),className:"relative",children:[e.jsx("div",{className:"bg-base-200 p-2 block overflow-auto w-full max-h-[36rem] mb-2 whitespace-pre-wrap",children:s}),t?e.jsx("button",{className:"bg-white absolute p-2 leading-none height-fit min-h-none right-1 bottom-1 shadow",onClick:()=>a(!0),children:e.jsx(et,{fill:"black",color:"black",className:"text w-4 h-4"})}):null]}),e.jsx("dialog",{open:r,className:"modal p-16 bg-opacity-80 bg-white",onClick:()=>a(!1),children:e.jsx("div",{className:"modal-box max-w-none p-4 whitespace-pre-wrap bg-base-200",children:s})})]})}function qe({mediaObject:s}){if(s.content_type.includes("image")){if(s.location===void 0)return null;const t=A(s.location.replace("benchmark_output/","").replace("prod_env/","../"));return e.jsxs("div",{children:[e.jsx("img",{src:t}),e.jsx("br",{})]})}else return s.text&&s.content_type&&s.content_type==="text/plain"&&s.text.length>1?e.jsxs("div",{children:[s.text,e.jsx("br",{}),e.jsx("br",{})]}):e.jsx("div",{})}function Qe({multimediaObject:s}){return e.jsx("div",{children:s.media_objects.map(t=>e.jsx(qe,{mediaObject:t}))})}function Ut(s){return Array.isArray(s)?s.length==0?"[]":`[${s.map(t=>String(t).replace(/\n/,"\\n")).join(", ")}]`:String(s)}function Ht({request:s}){return e.jsxs("div",{children:[s.request.prompt.length>0?e.jsxs("div",{children:[e.jsxs("h3",{className:"block text text-gray-400",children:["Prompt (",s.request.prompt.length," 
/**
 * Render a list of structured annotation entries; each entry may carry an
 * error message, a text payload, and/or a media object.
 */
function Ot(annotationList) {
  return e.jsx("div", {
    children: annotationList.map((entry, idx) =>
      e.jsxs(
        "div",
        {
          children: [
            entry.error &&
              e.jsxs("div", {
                children: [
                  e.jsx("h3", { className: "ml-1", children: "Error" }),
                  e.jsx(z, { value: entry.error }),
                  " ",
                ],
              }),
            entry.text &&
              e.jsxs("div", {
                children: [
                  e.jsx("h3", { className: "ml-1", children: "Text" }),
                  e.jsx(z, { value: entry.text }),
                  " ",
                ],
              }),
            entry.media_object && e.jsx(qe, { mediaObject: entry.media_object }),
          ],
        },
        idx
      )
    ),
  });
}

/** Render a key/value annotation map as headed text blocks. */
function Ft(annotationMap) {
  return e.jsx("div", {
    children: Object.entries(annotationMap).map(([key, value]) =>
      e.jsxs("div", {
        children: [
          e.jsx("h3", { className: "ml-1", children: key }),
          e.jsx(z, { value: value.toString() }),
        ],
      })
    ),
  });
}

/**
 * Collapsible per-annotator sections for a prediction's annotations.
 * Array-valued annotations render via Ot, object-valued via Ft.
 */
function _t({ predictionAnnotations: annotations }) {
  return e.jsx("div", {
    children:
      annotations && annotations !== void 0
        ? Object.entries(annotations).map(([annotator, data]) =>
            e.jsxs(
              "details",
              {
                className: "collapse collapse-arrow border rounded-md bg-white my-2",
                children: [
                  e.jsx("summary", {
                    className: "collapse-title",
                    children: e.jsx(e.Fragment, {
                      children: "View " + annotator + " annotations",
                    }),
                  }),
                  e.jsx("div", {
                    className: "collapse-content",
                    children: Array.isArray(data) ? Ot(data) : Ft(data),
                  }),
                ],
              },
              annotator
            )
          )
        : null,
  });
}
image"}),r.base64_images.map(i=>e.jsx("img",{src:"data:image;base64,"+i,alt:"Base64 Image"}))]}):e.jsxs(e.Fragment,{children:[e.jsxs("h3",{children:[e.jsx("span",{className:"mr-4",children:"Prediction raw text"}),e.jsx(Pt,{stats:r.stats})]}),e.jsx(z,{value:r.predicted_text}),r.mapped_output?e.jsxs(e.Fragment,{children:[e.jsx("h3",{children:"Prediction mapped output"}),e.jsx(z,{value:String(r.mapped_output)})]}):null]})}),e.jsx(_t,{predictionAnnotations:r.annotations}),e.jsxs("div",{className:"mx-1",children:[e.jsx("h3",{children:"Metrics"}),e.jsx(de,{children:Object.keys(r.stats).map((i,c)=>e.jsxs(me,{children:[n[i]?e.jsx("span",{title:n[i].description,children:n[i].display_name}):e.jsx("span",{children:i}),e.jsx("span",{children:String(r.stats[i])})]},c))})]}),e.jsxs("details",{className:"collapse collapse-arrow border rounded-md bg-white",children:[e.jsx("summary",{className:"collapse-title",children:"Request details"}),e.jsx("div",{className:"collapse-content",children:e.jsx(Ht,{request:t[a]})})]})]},a))})})}const Vt="correct";function Wt({references:s}){return e.jsxs("span",{children:[e.jsx("h3",{children:"References"}),e.jsx("ul",{children:s.map((t,n)=>e.jsxs("li",{className:"bg-base-200 p-2 block overflow-auto w-full max-h-72 mb-2 whitespace-pre-wrap",children:[t.output.text,t.tags.map(r=>e.jsx(Y,{className:"mx-2",color:r===Vt?"green":void 0,children:r}))]},n))})]})}function qt({instance:s,requests:t,predictions:n,metricFieldMap:r}){const a=i=>i.perturbation===void 0?`Instance id: ${i.id} [split: ${i.split}]`:`Instance id: ${i.id} [split: ${i.split}][perturbation: ${i.perturbation.name}]`;return e.jsxs("div",{className:"border p-4",children:[e.jsx("h3",{className:"text-xl mb-4",children:a(s)}),e.jsx("h3",{children:"Input"}),s.input.multimedia_content!==void 0?e.jsx(Qe,{multimediaObject:s.input.multimedia_content}):s.input.text.includes('<br><img 
/**
 * Row label for a metric stat: bold display name (with description tooltip
 * when the metric is known) followed by split/sub-split/perturbation context.
 */
function Qt({ stat, metricFieldMap }) {
  const statName = stat.name;
  const context =
    `${statName.split !== void 0 ? ` on ${statName.split}` : ""}` +
    `${statName.sub_split !== void 0 ? `/${statName.sub_split}` : ""}` +
    `${statName.perturbation !== void 0 ? ` with ${statName.perturbation.name}` : " original"}`;
  const field = metricFieldMap[statName.name];
  if (field) {
    return e.jsxs("span", {
      title: field.description,
      children: [e.jsx("strong", { children: field.display_name || statName.name }), context],
    });
  }
  return e.jsxs("span", {
    children: [e.jsx("strong", { children: statName.name }), context],
  });
}

/** Release name injected on `window` by the deployment, if any. */
function Ge() {
  return window.RELEASE !== void 0 ? window.RELEASE : void 0;
}

/** Fetch the release's run-name -> suite mapping; {} on error or abort. */
async function Gt(signal) {
  try {
    return await (
      await fetch(A(`/releases/${Ge()}/runs_to_run_suites.json`), { signal })
    ).json();
  } catch (err) {
    // Aborts are expected on unmount; only log real failures.
    if (err instanceof Error && err.name !== "AbortError") console.log(err);
    return {};
  }
}

/**
 * Resolve the suite containing a run: from the release mapping when a
 * release is configured, otherwise the globally configured suite.
 */
function Kt(runsToSuites, runName) {
  return Ge() ? runsToSuites[runName] : window.SUITE;
}

// Page sizes: instances per page / metric rows per page.
const ee = 10, se = 50;
Ee=Math.floor(Ne.length/se),As=Number(t.get("metricsPage")||1);T(Ee),b(Math.max(Math.min(As,Ee),1));const W={};vs.forEach(w=>{var J;const j=w.instance_id,B=((J=w.perturbation)==null?void 0:J.name)||"";W[j]===void 0&&(W[j]={}),W[j][B]===void 0&&(W[j][B]=[]),W[j][B].push(w)}),H(W);const q={};ws.forEach(w=>{var J;const j=w.instance_id,B=((J=w.perturbation)==null?void 0:J.name)||"";q[j]===void 0&&(q[j]={}),q[j][B]===void 0&&(q[j][B]=[]),q[j][B].push(w)}),C(q),fs(ae.metrics.reduce((w,j)=>(w[j.name]=j,w),{})),us(ae.adapter.reduce((w,j)=>(w[j.name]=j,w),{})),Z(ae.models.find(w=>{var j;return w.name===((j=we.find(B=>B.name===s))==null?void 0:j.adapter_spec.model)}))}return F(),()=>x.abort()},[s,t]),i===void 0||I===void 0||u===void 0||O===void 0)return e.jsx(U,{});const ps=p.slice((M-1)*ee,(M-1)*ee+ee),js=v.slice((d-1)*se,(d-1)*se+se);return e.jsxs(e.Fragment,{children:[e.jsx("div",{className:"flex justify-between gap-8 mb-12",children:e.jsxs("div",{children:[e.jsxs("h1",{className:"text-3xl flex items-center",children:[O.name,e.jsx("a",{href:"/#/groups/"+O.name,children:e.jsx(Ys,{className:"w-6 h-6 ml-2"})})]}),e.jsx("h3",{className:"text-xl",children:e.jsx(G,{value:O.description})}),e.jsx("h1",{className:"text-3xl mt-2",children:i.adapter_spec.model}),e.jsx("h3",{className:"text-xl",children:e.jsx(G,{value:(S==null?void 0:S.description)||""})}),e.jsx("div",{className:"mt-2 flex gap-2",children:O.tags.map(x=>e.jsx(Y,{size:"xs",color:"gray",children:e.jsx("span",{className:"text text-md",children:x})}))})]})}),e.jsxs(X,{children:[e.jsxs("div",{className:"flex justify-between",children:[e.jsx("h3",{className:"text-lg mb-1",children:"Adapter Specification"}),e.jsxs("div",{className:"flex gap-2",children:[e.jsx(Ks,{className:"w-6 h-6 mr-1 text text-primary"}),e.jsx("a",{className:"link link-primary link-hover",href:kt(i.name,o),download:"true",target:"_blank",children:"Spec JSON"}),e.jsx("a",{className:"link link-primary 
link-hover",href:Ct(i.name,o),download:"true",target:"_blank",children:"Full JSON"})]})]}),e.jsx("div",{children:e.jsx(de,{className:"grid md:grid-cols-2 lg:grid-cols-3 gap-x-8",children:Object.entries(i.adapter_spec).map(([x,F],N)=>e.jsxs(me,{className:N<3?"!border-0":"",children:[e.jsx("strong",{className:"mr-1",title:pe[x]?pe[x].description:void 0,children:`${x}: `}),e.jsx("span",{className:"overflow-x-auto",children:F})]}))})})]}),e.jsx("div",{className:"mt-16 mb-8",children:e.jsxs(xe,{children:[e.jsx(ne,{size:"lg",active:r===0,onClick:()=>a(0),children:"Instances + Predictions"}),e.jsx(ne,{size:"lg",active:r===1,onClick:()=>a(1),children:"All metrics"})]})}),r===0?e.jsxs(e.Fragment,{children:[e.jsx("div",{className:"grid gap-8",children:ps.map((x,F)=>{var N,P;return e.jsx(qt,{instance:x,requests:u[x.id][((N=x.perturbation)==null?void 0:N.name)||""],predictions:I[x.id][((P=x.perturbation)==null?void 0:P.name)||""],metricFieldMap:je},`${x.id}-${F}`)})}),e.jsx(oe,{className:"flex justify-center my-8",onNextPage:()=>{const x=Math.min(M+1,h);K(x),t.set("instancesPage",String(x)),n(t)},onPrevPage:()=>{const x=Math.max(M-1,1);K(x),t.set("instancesPage",String(x)),n(t)},currentPage:M,totalPages:h})]}):e.jsxs("div",{children:[e.jsx("div",{className:"flex justify-start my-4",children:e.jsx("input",{type:"text",className:"input input-bordered w-full max-w-xs",placeholder:"Search for a metric",onChange:x=>gs(x.target.value)})}),e.jsx("div",{className:"overflow-x-auto",children:e.jsxs("table",{className:"table",children:[e.jsx("thead",{children:e.jsx("tr",{children:Object.keys(v[0]).map(x=>e.jsx("th",{children:x},x))})}),e.jsx("tbody",{children:js.filter(x=>!be||x.name.name.toLowerCase().includes(be.toLowerCase())).map(x=>e.jsx("tr",{children:Object.entries(x).map(([F,N])=>F==="name"?e.jsx("td",{children:e.jsx(Qt,{stat:x,metricFieldMap:je})},F):e.jsx("td",{children:N}))}))})]})}),e.jsx(oe,{className:"flex justify-center my-8",onNextPage:()=>{const 
x=Math.min(d+1,_);b(x),t.set("metricsPage",String(x)),n(t)},onPrevPage:()=>{const x=Math.max(d-1,1);b(x),t.set("metricsPage",String(x)),n(t)},currentPage:d,totalPages:_})]})]})}function Xt(){const[s,t]=l.useState(void 0),[n,r]=l.useState(void 0),[a,i]=l.useState(void 0);if(l.useEffect(()=>{const o=new AbortController;async function m(){const p=L(o.signal),E=_e(o.signal),v=await p;t(v);const y=await E,I=[];y.forEach(C=>{C.rows.forEach(u=>{I.push({title:String(u[0].value),name:u[0].href.replace("?group=","")})})}),r(I)}return m(),()=>o.abort()},[]),s===void 0||n===void 0)return e.jsx(U,{});if(n.length===0)return e.jsxs(e.Fragment,{children:[e.jsx(k,{title:"HELM Leaderboard",subtitle:"The HELM leaderboard shows how the various models perform across different scenarios and metrics.",markdown:!0}),e.jsx("div",{className:"divider"}),e.jsx("p",{className:"text-center mt-8",children:"Group currently has no results."})]});const c=a!==void 0?a:n[0].name;return e.jsxs(e.Fragment,{children:[e.jsxs("div",{className:"flex flex-row justify-between",children:[e.jsx(k,{title:"HELM Leaderboard",subtitle:"The HELM leaderboard shows how the various models perform across different scenarios and metrics.",markdown:!0}),e.jsxs("div",{className:"w-64 pt-8",children:[e.jsx("label",{htmlFor:"group",className:"block text-sm font-medium text-gray-700",children:"Select a group:"}),e.jsx("select",{id:"group",name:"group",onChange:o=>i(o.target.value),className:"mt-1 block w-full pl-3 pr-10 py-2 text-base border-gray-300 focus:outline-none focus:ring focus:border-blue-300 rounded-md",children:n.map((o,m)=>e.jsx("option",{value:o.name,children:o.title},m))})]})]}),e.jsx(Ve,{schema:s,runGroupName:c},c)]})}const Yt=""+new URL("instruct-flowchart-48854f7c.svg",import.meta.url).href,$t=""+new URL("instruct-graph-0a57d7d2.svg",import.meta.url).href;function Zt(){return e.jsxs("div",{className:"container mx-auto px-16",children:[e.jsx("h1",{className:"text-3xl mt-16 font-bold 
text-center",children:"HELM Instruct: A Multidimensional Instruction Following Evaluation Framework with Absolute Ratings"}),e.jsxs("div",{className:"flex flex-col sm:flex-row justify-center gap-2 sm:gap-8 md:gap-32 my-8",children:[e.jsx("a",{className:"px-10 btn rounded-md",href:"https://crfm.stanford.edu/2024/02/18/helm-instruct.html",children:"Blog Post"}),e.jsx("a",{className:"px-10 btn rounded-md",href:"https://github.com/stanford-crfm/helm",children:"Github"})]}),e.jsxs("p",{children:["We introduce ",e.jsx("em",{children:"HELM Instruct"}),", a multidimensional evaluation framework for instruction-following LLMs with absolute ratings. The framework takes an instruction, a model, an evaluator, and a criterion to generate a score. In our study, we use HELM Instruct to compare 4 instruction-following models on 7 scenarios based on 4 Human/LM evaluators and 5 criteria. Check out the blog post for more details."]}),e.jsxs("div",{className:"grid my-16 grid-cols-1 md:mx-32 md:grid-cols-2 md:gap-2",children:[e.jsx("img",{src:Yt,alt:"Evaluation flowchart",className:"mx-auto block",sizes:"100vw"}),e.jsx("img",{src:$t,alt:"7 scenarios, 4 models, 4 evaluators and 5 criteria",className:"mx-auto block",sizes:"100vw"})]}),e.jsxs("table",{className:"rounded-lg shadow-md 
/** List of all models, with "todo" entries greyed out and unlinked. */
function fe({ models }) {
  return e.jsxs("section", {
    children: [
      e.jsxs("h3", { className: "text-3xl", children: [models.length, " models"] }),
      e.jsx("ul", {
        children: models.map((model, idx) =>
          model.todo
            ? e.jsxs(
                "li",
                {
                  className: "text-slate-300 mt-1",
                  children: [model.creator_organization, " / ", model.display_name],
                },
                idx
              )
            : e.jsx(f, {
                to: "models",
                children: e.jsxs(
                  "li",
                  {
                    className: "text-black mt-1",
                    children: [model.creator_organization, " / ", model.display_name],
                  },
                  idx
                ),
              })
        ),
      }),
    ],
  });
}

/**
 * Scenario listing: groups each parent run group with its leaf subgroups.
 * A leaf is a run group that has metric_groups and no subgroups of its own;
 * the heading counts distinct leaves reached from any parent.
 */
function ge({ runGroups }) {
  const leafByName = new Map(
    runGroups
      .filter(
        (g) => g.metric_groups !== void 0 && (g.subgroups === void 0 || g.subgroups.length === 0)
      )
      .map((g) => [g.name, g])
  );
  const seenLeafNames = new Set();
  const parentsWithLeaves = [];
  runGroups.forEach((group) => {
    const subgroupNames = group.subgroups ? group.subgroups : [];
    const leaves = [];
    subgroupNames.forEach((name) => {
      const leaf = leafByName.get(name);
      if (leaf) {
        leaves.push(leaf);
        seenLeafNames.add(leaf.name);
      }
    });
    if (leaves.length > 0) parentsWithLeaves.push([group, leaves]);
  });
  return e.jsxs("section", {
    children: [
      e.jsxs("h3", { className: "text-3xl", children: [seenLeafNames.size, " scenarios"] }),
      e.jsx("ul", {
        children: parentsWithLeaves.map(([group, leaves]) =>
          e.jsxs(
            "li",
            {
              className: "my-3",
              children: [
                e.jsx(f, {
                  className: "text-black",
                  to: "groups/" + group.name,
                  children: e.jsx("h2", { children: group.display_name }),
                }),
                e.jsx("ul", {
                  className: "list-disc list-inside",
                  children: leaves.map((leaf) =>
                    leaf.todo
                      ? e.jsx(
                          "li",
                          {
                            className: `${leaf.todo ? "ml-4 text-slate-300" : "ml-4"}`,
                            children: leaf.display_name,
                          },
                          leaf.name
                        )
                      : e.jsx(f, {
                          className: "text-black",
                          to: "groups/" + leaf.name,
                          children: e.jsx(
                            "li",
                            {
                              className: `${leaf.todo ? "ml-4 text-slate-300" : "ml-4"}`,
                              children: leaf.display_name,
                            },
                            leaf.name
                          ),
                        })
                  ),
                }),
              ],
            },
            group.name
          )
        ),
      }),
    ],
  });
}

// Landing-page hero image asset.
const Ke = "" + new URL("helmhero-28e90f4d.png", import.meta.url).href;

/**
 * Mini leaderboard preview: loads the schema and the requested group's
 * tables, then renders one table compactly (two columns, not sortable).
 * Shows a spinner until both the schema and the table have loaded.
 */
function R({
  runGroupName: groupName = void 0,
  tableIndexToDisplay: tableIndex = 0,
  numRowsToDisplay: numRows = 10,
  sortColumnIndex: sortColumn = 1,
}) {
  const [schema, setSchema] = l.useState(void 0);
  const [table, setTable] = l.useState(void 0);
  l.useEffect(() => {
    const controller = new AbortController();
    async function load() {
      const loadedSchema = await L(controller.signal);
      setSchema(loadedSchema);
      const groups = loadedSchema.run_groups;
      if (groups.length === 0) return;
      // Default to the first run group when none was requested.
      const selectedGroup = groupName || groups[0].name;
      const tables = await ze(selectedGroup, controller.signal);
      setTable(tables[tableIndex]);
    }
    load();
    return () => controller.abort();
  }, [groupName, tableIndex]);
  if (schema === void 0 || table === void 0) return e.jsx(U, {});
  return e.jsx("div", {
    className: "rounded-2xl overflow-hidden border-2 bg-white p-1 mx-2 my-0 overflow-x-auto",
    style: { overflow: "auto", justifyContent: "space-between" },
    children: e.jsx(ue, {
      schema,
      groupTable: table,
      numRowsToDisplay: numRows,
      sortColumnIndex: sortColumn,
      displayColumnIndexes: [0, 1],
      sortable: !1,
      miniStyle: !0,
    }),
  });
}
lg:justify-center",style:{height:"525px",transform:"scale(0.9)"},children:[e.jsx("div",{className:"w-full lg:w-1/2 flex justify-center mb-4 lg:mb-0 h-full py-10",children:e.jsx("img",{src:Ke,alt:"HELM Hero",className:"object-cover h-full",style:{maxWidth:"100%"}})}),e.jsx("div",{className:"w-full lg:w-1/2 flex justify-center h-full py-10",children:e.jsxs("div",{className:"py-2 pb-6 rounded-3xl bg-gray-100 h-full",style:{maxWidth:"100%"},children:[e.jsx(R,{}),e.jsx("div",{className:"flex justify-end",children:e.jsx(f,{to:"leaderboard",children:e.jsx("button",{className:"px-4 mx-3 mt-1 btn bg-white rounded-md",children:e.jsx("span",{children:"See More"})})})})]})})]})]})}const Je=""+new URL("ai21-0eb91ec3.png",import.meta.url).href,Xe=""+new URL("aleph-alpha-7ce10034.png",import.meta.url).href,Ye=""+new URL("anthropic-70d8bc39.png",import.meta.url).href,$e=""+new URL("bigscience-7f0400c0.png",import.meta.url).href,Ze=""+new URL("cohere-3550c6cb.png",import.meta.url).href,es=""+new URL("eleutherai-b9451114.png",import.meta.url).href,ss=""+new URL("google-06d997ad.png",import.meta.url).href,ts=""+new URL("meta-5580e9f1.png",import.meta.url).href,ns=""+new URL("microsoft-f5ee5016.png",import.meta.url).href,rs=""+new URL("mistral-18e1be23.png",import.meta.url).href,as=""+new URL("nvidia-86fa75c1.png",import.meta.url).href,ls=""+new URL("openai-3f8653e4.png",import.meta.url).href,is=""+new URL("tii-24de195c.png",import.meta.url).href,cs=""+new URL("together-a665a35b.png",import.meta.url).href,os=""+new 
URL("tsinghua-keg-97d4b395.png",import.meta.url).href,ds="data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAASYAAABfCAYAAABFnmpnAAAACXBIWXMAAAsTAAALEwEAmpwYAAAAAXNSR0IArs4c6QAAAARnQU1BAACxjwv8YQUAAAlhSURBVHgB7d3xddtGEgbw7+np/8gVeFLBqYOwg/gqOKaC+CqIU0GcCqxUcOnAvAruUoEnFZxcwUUjEDYMA8QCmN0dgN/vPUggAHFpejGcXSyWQON/T8v/HZa3SOdR3vvEsg5O5dnyZuD5xfH5lywfzu/FL0/L8fx61njj+Lr63js9d87lPYZFfb1DDpVf0wd8rpOvMLNO3p5/Pz4td1jvm8TjPMoygvIU8ch5OXS2nZ6Wn8+/iUoTfK6Tr8/bHtDUSZ3645vz78kDE71IPM4rMKUS+FFswwHNJ9Y71AngRH1HNJmUZVEXY0AbmP6Ej5eJxwl8iPNxKR6xLUc0AUpAFINlUP/BhTrpnTGlZkKeGVNqMPSi2B4BgxPFIrhQJ70DkyQet+Wm3NYyppagqQil33uiMfK0/GtoRxuYPE+2lIov8PNtwjFeWdV/sW2Cpn1PFMU9Bq50e2dMJuXKnOentqCcrWZLXUd8efWOqLYf0TuPc2RMJTOYVAIfin34CURxWKLyurshR8YkCceUzphSjknhdfWytgPY10Sx/AOdOnnT2aHwced0TKqp7MuzrD005VpHEMVh5+mr9kE3MHmddKU7v6d4BibFfnwPoli+a1e6gekP+EjpPyrZlJvaP8eeMqZ7EMVyaFdyZExTt6UIfAnK2fpwgS77cGBwokgE56QlRx/TVMYk8DWVfQn87CljMgxMFI3Yj1p9TJ5KBSbF/jAwUTTPdbLGVTmBv0tZmlegVOxP6jQ1RKU8B6bbzgaFj7vzMpaBCcryCkwfUc7v+Pr9a/uEBH6YMVGqUnXy+cMyR2Bqn3wsMOVo6tlo87HBj16jzBXl/IrxCd4e0AxG81BjkOU/Z5R7hN+/9Tc0712KEn2JD0/Lv7EdrzF8jgmaG3G9PuTEftz2NnrNZGlX5sYCRY6TQS7s21tTziqIjUHy+HfVCExzrmwe4EcRazZPC0oP2D59Wn5AM7+SB7EfNwOFuD35iBz3yd0t3DeHIgb78Jhzcl9SIzDR/lh9VDjKFZhKBIrU5xT4iDRUwGswrCl9QzXtk1ez9KtxTMarg1cW7ltq7OTyDIIfEIeCKBbX4UZ7yZjGRpsL/ESaWSBS9kZkXOtkrsA0Nj4mV59G7vIURFRMPzB5RT2ZuT1XeXsNTKUn9iMq6WU/MHld7ZGR7bkypruZ2+cqObgyBZtyFI3CUa6MaSwgCPLIXZ4iFl7mp2gEfj4OBSaP4JQ7gxnyMmN5ilj2erWRyDzeDG2Ej5yBYsiLxNewhCIWZkwUjWudHApMCh9DgUKQjwxs83qzovXpCPxE6z+jbfobfDyfa0OByWu8jgxsyznK+C5x2xLRmjtelcCwI5083MOH2o+cTbkIE8YJfEQaXCnwu7nV6yosXbcDnFsnOZtyOQNFankeb1a0jMLzyyqZLZEHr6lpzPN9oDkzJknc5qXfTPSK4JGyCgtKR/jxvBmYro+dY7/At04+n2+3Yzsc9G8Tyd2063e2C3zU6By2ibeGZgv0fg9PIEpzGtiWo06OBiavjKkfKHIHplwZk6K8HP/hQ06gWgQ+fYV2IpdokgvyU1wITAof/UAhyOtu4vFSe+2HOYF9TDX9BJ/+QsF+/h9P7crNyAGK9XIFitTyBD4U+/QbiGL5VCfHApNHBJaJx976zZ8tN+VyU+xjvmnaD2vCndoHY4HJ62pNjkBxSbfDXeBjj82dn0EUy9+7D3JmTKYbKEoEphcjZa+xt0GI9tVQDyCKwz4otbshZx+T6U5CVmLSe+ms73Fw5VrWhn8NojgsKL3pb8ydMUlnvUTG
5N10VOyHZUpHEMVgMca+/PTN0M7bkT9S+OgGB0F+MrK+1B4yphOaT6UTiGI4ofmSTB07oGRgYsZU3gMG2u9EFVmW9Dsm6mTuply3X6nkVTmBj0izCixxRDNlyzuUyViJpti9dZN18lJg8ghO7VUyQRlteRzD9KUjmu+WfwWiGI5Py3uMzON0c+EPPQJTmzEJymjLY2D6mr0ndnPwPYhiEDTBSfo7LgUmj/E7awLEksDYlifwscfBlRacSjSriVJYXXzX33gpMHlM97EmUCwZfS69ctdS7I+AY5kolgN6My1cCkyK9dr71wTzLc3Y2jLX8upnW8KuWjyguayaY+T59yCa5wGf66TC3xczLdxeOFDhw66ULQkUel4E83wDn6acoh7LaLpXBAXNQDSvKUzvz0uOoEf79EPvscC3Th7QxInROb9bXtnCt1gemJa8BivP4/aXSP1LiuYqxq/wcwDRcoqmTp7g59CulMiYLCgtCRQWGKyfae5VpD33L719Wn6ED8+vgKL5TvCpY7W/F9AG8B7gw85168YoFpiWBIulfTxL+7T6Ig6uVCxr3g7hsIG67IbqB2yfZ3fApwRmqinnNWGcYD7FsuAo2PftKF5NTA4ZIA9WHxU+pF25wXSha1kUnHsSPGJdxsTANE1AFIu0K1OBSbHe0mbc0vJdv0OdiMqbCkwe/SwHzKe937nLu/QaiKiwEk25JRnTR8fyl2LGRFRJiabcEnr+XWv0NQceElVUImNa4nFkvUb5RFTYVGCqlTnoyHqN8omosKgZk3bWawx03PrMlUSbFrWP6bHya2BTjqiiqcBkFOVpZ52d30TX4dO5nhKYanc+M3vJg+8rRTMrMCnK0t7jGtnLNWRMDEwUVkpgKt0RrL3HHC5AdGUiNuX688soylIQUVVbaMqNbStZPhEVFDEwPSZuy6X2jIBEV48Z09c4VICosoh9TDqwrWQWw45vospSA1PJk3WoLEU5CiKq6jbxOAsWHtPVptDEbbkwY6IS7EtHBX7eYkd1NzUwWb+LoIzand8fQJTfq/Pi5R12FJhSmnKmVB+PztyeA2cWIKosNTApytCZ20uVT0QFRQtMY5lZqRRVQUTVpQamCIHh0j4vHFxJFEC0jOlx4T4vCiKqbksZ0x+oWz4RFbKVzm/DjInoSqQGJqPI7zFw+URUyJzAVDtjKVE+B1cSBTAnMJXo46mdMV3T4EpmhxSNtiuRMiZduX+taztRGZgorEh9TDqxP/eJxHmYiIKI1sc0VX7O18DBlURBzAlMuTMKTTgmZ2BSEFEIkTKmlI7nnMGRfS5EQWypj8nkbG6xj4koiNSJ4lon5JswThOOseDxHfJYmzEpyvjTqayp+xIV6ynW8Xod7XOtpdgOz/duisLHp8TjLzZXn0YvVQJMAAAAAElFTkSuQmCC",ms=""+new URL("yandex-38e09d70.png",import.meta.url).href,hs=""+new URL("01-694cb9b7.png",import.meta.url).href,sn=[Je,Xe,Ye,$e,Ze,es,ss,ts,ns,rs,as,ls,is,cs,os,ds,ms,hs];function Le(){const[s,t]=l.useState(void 0);return l.useEffect(()=>{const n=new AbortController;async function r(){const a=await L(n.signal);t(a)}return r(),()=>n.abort()},[]),s?e.jsxs(e.Fragment,{children:[e.jsx(en,{}),e.jsxs("div",{className:"mx-auto text-lg px-16",children:[e.jsxs("div",{className:"container mb-12 mx-auto text-lg px-16",children:[e.jsxs("div",{className:"flex flex-col sm:flex-row 
justify-center mt-10 mb-10 flex gap-2 sm:gap-8 md:gap-32",children:[" ",e.jsx("h1",{className:"text-4xl mx-4 mt-40",children:e.jsx("strong",{children:"Our Partners"})})]}),e.jsx("ol",{className:"my-8 flex flex-col gap-32",children:e.jsx("li",{children:e.jsx("div",{className:"flex flex-wrap justify-center max-w-[1100px] mx-auto w-auto",children:sn.map((n,r)=>e.jsx("div",{className:"w-24 h-24 flex items-center m-6",children:e.jsx("img",{src:n,alt:"Logo",className:"mx-auto block",sizes:"100vw"})},r))})})})]}),e.jsx("div",{className:"container mx-auto",children:e.jsxs("div",{className:"grid grid-cols-1 sm:grid-cols-2 gap-8",children:[e.jsx(fe,{models:s.models}),e.jsx(ge,{runGroups:s.run_groups})]})})]})]}):null}function tn(){return e.jsxs("div",{className:"container mx-auto px-16",children:[e.jsx("h1",{className:"text-3xl mt-16 my-8 font-bold text-center",children:"Massive Multitask Language Understanding (MMLU) on HELM"}),e.jsxs("div",{className:"flex flex-col lg:flex-row items-center gap-8",children:[e.jsxs("div",{className:"flex-1 text-xl",children:[e.jsxs("p",{children:[e.jsx("strong",{children:"Massive Multitask Language Understanding (MMLU)"})," ",e.jsx("a",{href:"https://arxiv.org/pdf/2009.03300.pdf",className:"link",children:"(Hendrycks et al, 2020)"})," ","is a multiple-choice question answering test that covers 57 tasks including elementary mathematics, US history, computer science, law, and more. We publish evaluation results from evaluating various models on MMLU using HELM. 
Our evaluation results include the following:"]}),e.jsxs("ul",{className:"my-2 list-disc list-inside",children:[e.jsx("li",{children:"Simple, standardized prompts"}),e.jsx("li",{children:"Accuracy breakdown for each of the 57 subjects"}),e.jsx("li",{children:"Full transparency of all raw prompts and predictions"})]}),e.jsxs("div",{className:"flex flex-row justify-center mt-4",children:[e.jsx("a",{className:"px-10 btn rounded-md mx-4",href:"https://crfm.stanford.edu/2024/05/01/helm-mmlu.html",children:"Blog Post"}),e.jsx("a",{className:"px-10 btn rounded-md mx-4",href:"#/leaderboard",children:"Full Leaderboard"})]})]}),e.jsx("div",{className:"flex-1",children:e.jsx(R,{})})]})]})}const nn=""+new URL("air-overview-d2e6c49f.png",import.meta.url).href;function rn(){const s={fontVariant:"small-caps",fontWeight:"bold"},t=e.jsx("span",{style:s,children:"AIR-Bench 2024"});return e.jsxs("div",{className:"container mx-auto px-16",children:[e.jsx("h1",{className:"text-3xl mt-16 my-8 font-bold text-center",children:t}),e.jsxs("div",{className:"flex flex-col lg:flex-row gap-8",children:[e.jsxs("div",{className:"flex-1 text-l",children:[e.jsx("img",{src:nn,alt:"AIR 2024 Categories",className:"mx-auto my-4 block w-3/4",sizes:"100vw"}),e.jsxs("p",{children:["We introduce ",t,", the first AI safety benchmark aligned with emerging government regulations and company policies, following the regulation-based safety categories grounded in our AI Risks study, AIR 2024. AIR 2024 decomposes 8 government regulations and 16 company policies into a four-tiered safety taxonomy with 314 granular risk categories in the lowest tier. ",t," contains 5,694 diverse prompts spanning these categories, with manual curation and human auditing to ensure quality. We evaluate leading language models on ",t,", uncovering insights into their alignment with specified safety concerns. 
By bridging the gap between public benchmarks and practical AI risks, ",t," ","provides a foundation for assessing model safety across jurisdictions, fostering the development of safer and more responsible AI systems."]}),e.jsxs("div",{className:"flex flex-row justify-center mt-4",children:[e.jsx("a",{className:"px-10 btn rounded-md mx-4",href:"https://arxiv.org/abs/2407.17436",children:"Paper"}),e.jsx("a",{className:"px-10 btn rounded-md mx-4",href:"#/leaderboard",children:"Full Leaderboard"})]})]}),e.jsxs("div",{className:"py-2 pb-6 rounded-3xl bg-gray-100 h-full",style:{maxWidth:"100%"},children:[e.jsx(R,{}),e.jsx("div",{className:"flex justify-end",children:e.jsx(f,{to:"leaderboard",children:e.jsx("button",{className:"px-4 mx-3 mt-1 btn bg-white rounded-md",children:e.jsx("span",{children:"See More"})})})})]})]})]})}const an=""+new URL("scb10x-204bd786.png",import.meta.url).href,ln=""+new URL("scbx-71e53e72.jpg",import.meta.url).href;function cn(){return e.jsxs("div",{className:"container mx-auto px-16",children:[e.jsx("h1",{className:"text-3xl my-8 font-bold text-center",children:"ThaiExam"}),e.jsxs("div",{className:"flex flex-col lg:flex-row gap-8",children:[e.jsxs("div",{className:"flex-1 text-l",children:[e.jsxs("div",{className:"text-center",children:[e.jsx("a",{href:"https://scbx.com/",children:e.jsx("img",{src:ln,alt:"Logo",className:"inline h-32 mx-4 my-4"})}),e.jsx("a",{href:"https://scb10x.com/",children:e.jsx("img",{src:an,alt:"Logo",className:"inline h-32 mx-4 my-4"})})]}),e.jsxs("p",{children:["In collaboration with"," ",e.jsx("a",{href:"https://www.scbx.com/",className:"font-bold underline text-blue-600 hover:text-blue-800 visited:text-purple-600",children:"SCBX"})," ","and"," ",e.jsx("a",{href:"https://www.scb10x.com/",className:"font-bold underline text-blue-600 hover:text-blue-800 visited:text-purple-600",children:"SCB 10X"}),", we introduce the ThaiExam HELM leaderboard. 
ThaiExam is a Thai language benchmark based on examinations for high school students and investment professionals in Thailand. The ThaiExam leaderboard is the first public leaderboard for large language models on Thai language scenarios, and features evaluations of leading language models. Like all other HELM leaderboards, the ThaiExam leaderboard provides full prompt-level transparency, and the results can be fully reproduced using the HELM framework. We hope that this leaderboard will encourage further work in multilingual language model evaluation."]}),e.jsx("div",{className:"flex flex-row justify-center my-4",children:e.jsx(f,{to:"leaderboard",className:"px-10 btn rounded-md mx-4",children:"Full Leaderboard"})})]}),e.jsxs("div",{className:"py-2 pb-6 rounded-3xl bg-gray-100 h-full",style:{maxWidth:"100%"},children:[e.jsx(R,{}),e.jsx("div",{className:"flex justify-end",children:e.jsx(f,{to:"leaderboard",children:e.jsx("button",{className:"px-4 mx-3 mt-1 btn bg-white rounded-md",children:e.jsx("span",{children:"See More"})})})})]})]})]})}const on=""+new URL("wellsfargo-a86a6c4a.png",import.meta.url).href;function dn(){return e.jsxs("div",{className:"container mx-auto px-16",children:[e.jsx("h1",{className:"text-3xl my-8 font-bold text-center",children:"HELM Finance"}),e.jsxs("div",{className:"flex flex-col lg:flex-row gap-8",children:[e.jsxs("div",{className:"flex-1 text-l",children:[e.jsx("div",{children:e.jsx("a",{href:"https://wellsfargo.com/",children:e.jsx("img",{src:on,alt:"Logo",className:"mx-auto block my-4 w-48"})})}),e.jsxs("p",{children:["In collaboration with"," ",e.jsx("a",{href:"https://www.wellsfargo.com/",className:"font-bold underline text-blue-600 hover:text-blue-800 visited:text-purple-600",children:"Wells Fargo"}),", we introduce the ",e.jsx("span",{className:"font-bold",children:"HELM Finance"})," ","leaderboard for ecologically-valid evaluations of leading language models in the financial domain. 
// HEIM logo asset.
const mn = "" + new URL("heim-logo-3e5e3aa4.png", import.meta.url).href;

/**
 * Metric listing for the HEIM page: counts distinct metrics that resolve in
 * the field map and lists them under their metric-group headings. Groups
 * with no resolvable metrics are omitted.
 */
function hn({ metricFieldMap, metricGroups }) {
  const seenMetricNames = new Set();
  const groupsWithFields = [];
  metricGroups.forEach((group) => {
    const resolvedFields = [];
    group.metrics.forEach((metric) => {
      const field = metricFieldMap[metric.name];
      if (field) {
        resolvedFields.push(field);
        seenMetricNames.add(field.name);
      }
    });
    if (resolvedFields.length > 0) groupsWithFields.push([group, resolvedFields]);
  });
  return e.jsxs("section", {
    children: [
      e.jsxs("h3", { className: "text-3xl", children: [seenMetricNames.size, " metrics"] }),
      e.jsx("ul", {
        children: groupsWithFields.map(([group, fields]) =>
          e.jsxs(
            "li",
            {
              className: "my-3",
              children: [
                e.jsx("h4", { children: group.display_name }),
                e.jsx("ul", {
                  className: "list-disc list-inside",
                  children: fields.map((field) =>
                    e.jsx("li", { className: "ml-4", children: field.display_name }, field.name)
                  ),
                }),
              ],
            },
            group.name
          )
        ),
      }),
    ],
  });
}
rounded-md",href:"https://arxiv.org/abs/2311.04287",children:"Paper"}),e.jsx("a",{className:"px-10 btn rounded-md",href:"https://github.com/stanford-crfm/helm",children:"Github"})]}),e.jsxs("p",{className:"my-2",children:["Significant effort has recently been made in developing text-to-image generation models, which take textual prompts as input and generate images. As these models are widely used in real-world applications, there is an urgent need to comprehensively understand their capabilities and risks. However, existing evaluations primarily focus on image-text alignment and image quality. To address this limitation, we introduce a new benchmark,"," ",e.jsx("strong",{children:"Holistic Evaluation of Text-To-Image Models (HEIM)"}),"."]}),e.jsx("p",{className:"my-2",children:"We identify 12 different aspects that are important in real-world model deployment, including:"}),e.jsxs("ul",{className:"my-2 list-disc list-inside unreset",children:[e.jsx("li",{children:"image-text alignment"}),e.jsx("li",{children:"image quality"}),e.jsx("li",{children:"aesthetics"}),e.jsx("li",{children:"originality"}),e.jsx("li",{children:"reasoning"}),e.jsx("li",{children:"knowledge"}),e.jsx("li",{children:"bias"}),e.jsx("li",{children:"toxicity"}),e.jsx("li",{children:"fairness"}),e.jsx("li",{children:"robustness"}),e.jsx("li",{children:"multilinguality"}),e.jsx("li",{children:"efficiency"})]}),e.jsx("p",{className:"my-2",children:"By curating scenarios encompassing these aspects, we evaluate state-of-the-art text-to-image models using this benchmark. Unlike previous evaluations that focused on alignment and quality, HEIM significantly improves coverage by evaluating all models across all aspects. 
Our results reveal that no single model excels in all aspects, with different models demonstrating strengths in different aspects."}),e.jsx("p",{className:"my-2",children:"For full transparency, this website contains all the prompts, generated images and the results for the automated and human evaluation metrics."}),e.jsx("p",{className:"my-2",children:"Inspired by HELM, we decompose the model evaluation into four key components: aspect, scenario, adaptation, and metric:"}),e.jsx("div",{className:"container max-w-screen-lg mx-auto my-8",children:e.jsx("img",{src:"https://crfm.stanford.edu/heim/latest/images/heim-main.png",alt:"HEIM scenarios, prompts, images and metrics"})}),s&&n?e.jsxs("div",{className:"grid grid-cols-1 md:grid-cols-3 gap-8",children:[e.jsx(fe,{models:s.models}),e.jsx(ge,{runGroups:s.run_groups}),e.jsx(hn,{metricFieldMap:n,metricGroups:s.metric_groups})]}):null]})}const un=""+new URL("vhelm-framework-cde7618a.png",import.meta.url).href,fn=""+new URL("vhelm-model-6d812526.png",import.meta.url).href;function gn(){const[s,t]=l.useState(void 0);return l.useEffect(()=>{const n=new AbortController;async function r(){const a=await L(n.signal);t(a)}return r(),()=>n.abort()},[]),e.jsxs("div",{className:"container mx-auto px-16",children:[e.jsx("h1",{className:"text-3xl mt-16 my-8 font-bold text-center",children:"The First Steps to Holistic Evaluation of Vision-Language Models"}),e.jsxs("p",{className:"my-4",children:["To better understand VLMs, we introduce the first version of"," ",e.jsx("em",{children:"Holistic Evaluation of Vision-Language Models (VHELM)"})," by extending the ",e.jsx("a",{href:"https://arxiv.org/abs/2211.09110",children:"HELM"})," ","framework with the necessary adaptation methods to assess the performance of 6 prominent VLMs on 3 standard VLM benchmarks."]}),e.jsx("p",{className:"my-4 font-bold",children:"This is ongoing work to achieve holistic evaluation for vision-language models, so please stay 
tuned!"}),e.jsxs("div",{className:"my-16 flex flex-col lg:flex-row items-center gap-8",children:[e.jsxs("div",{className:"flex-1 text-xl",children:[e.jsx("img",{src:un,alt:"An image of a helm and the text 'This helm is a' is sent to a Vision-Language Model, which produces the text 'wheel for steering a ship...'",className:""}),e.jsx("img",{src:fn,alt:"An example of an evaluation for an Aspect (Knowledge) - a Scenario (MMMU) undergoes Adaptation (multimodal multiple choice) for a Model (GPT-4 Vision), then Metrics (Exact match) are computed",className:""})]}),e.jsxs("div",{className:"flex-1",children:[e.jsx(R,{}),e.jsx(f,{to:"leaderboard",className:"px-4 mx-3 mt-1 btn bg-white rounded-md",children:e.jsx("span",{children:"See More"})})]})]}),s===void 0?null:e.jsxs("div",{className:"grid grid-cols-1 sm:grid-cols-2 gap-8",children:[e.jsx(fe,{models:s.models}),e.jsx(ge,{runGroups:s.run_groups})]})]})}const pn=""+new URL("accenture-6f97eeda.png",import.meta.url).href,jn=""+new URL("cresta-9e22b983.png",import.meta.url).href;function bn(){return e.jsxs("div",{className:"container mx-auto px-16",children:[e.jsx("h1",{className:"text-3xl my-8 font-bold text-center",children:"HELM Call Center"}),e.jsxs("div",{className:"flex flex-col lg:flex-row gap-8",children:[e.jsxs("div",{className:"flex-1 text-l",children:[e.jsxs("div",{className:"text-center",children:[e.jsx("a",{href:"https://www.accenture.com/",children:e.jsx("img",{src:pn,alt:"Logo",className:"inline h-12 mx-4 my-4"})}),e.jsx("a",{href:"https://www.cresta.com/",children:e.jsx("img",{src:jn,alt:"Logo",className:"inline h-8 mx-4 my-4"})})]}),e.jsxs("p",{children:["In collaboration with"," ",e.jsx("a",{href:"https://www.accenture.com/",className:"font-bold underline text-blue-600 hover:text-blue-800 visited:text-purple-600",children:"Accenture"})," ","and"," ",e.jsx("a",{href:"https://www.cresta.com/",className:"font-bold underline text-blue-600 hover:text-blue-800 visited:text-purple-600",children:"Cresta"}),", we 
introduce the HELM"," ",e.jsx("span",{className:"font-bold",children:"Call Center"})," leaderboard. HELM Call Center is a leaderboard consisting of evaluations of leading language models on scenarios with realistic tasks from the call center context."]}),e.jsx("div",{className:"flex flex-row justify-center my-4",children:e.jsx(f,{to:"leaderboard",className:"px-10 btn rounded-md mx-4",children:"Full Leaderboard"})})]}),e.jsxs("div",{className:"py-2 pb-6 rounded-3xl bg-gray-100 h-full",style:{maxWidth:"100%"},children:[e.jsx(R,{}),e.jsx("div",{className:"flex justify-end",children:e.jsx(f,{to:"leaderboard",children:e.jsx("button",{className:"px-4 mx-3 mt-1 btn bg-white rounded-md",children:e.jsx("span",{children:"See More"})})})})]})]})]})}const wn=""+new URL("cuhk-8c5631e9.png",import.meta.url).href;function vn(){return e.jsxs("div",{className:"container mx-auto px-16",children:[e.jsx("h1",{className:"text-3xl my-8 font-bold text-center",children:"Chinese Language Models EVAluation Platform (CLEVA)"}),e.jsxs("div",{className:"flex flex-col lg:flex-row gap-8",children:[e.jsxs("div",{className:"flex-1 text-l",children:[e.jsx("div",{className:"text-center",children:e.jsx("a",{href:"https://www.cuhk.edu.hk/",children:e.jsx("img",{src:wn,alt:"Logo",className:"inline h-12 mx-4 my-4"})})}),e.jsxs("p",{children:["In collaboration with the"," ",e.jsx("a",{href:"https://lwwangcse.github.io/",className:"font-bold underline text-blue-600 hover:text-blue-800 visited:text-purple-600",children:"LaVi Lab"})," ","team from"," ",e.jsx("a",{href:"https://www.cuhk.edu.hk/",className:"font-bold underline text-blue-600 hover:text-blue-800 visited:text-purple-600",children:"The Chinese University of Hong Kong (CUHK)"}),", we introduce the"," ",e.jsx("strong",{className:"font-bold",children:"Chinese Language Models EVAluation Platform (CLEVA)"})," ","leaderboard on HELM. 
CLEVA is a comprehensive Chinese-language benchmark for holistic evaluation of Chinese-language LLMs, and employs a standardized workflow to assess LLMs' performance across various dimensions."]}),e.jsxs("div",{className:"flex flex-row justify-center mt-4",children:[e.jsx("a",{className:"px-10 btn rounded-md mx-4",href:"https://arxiv.org/abs/2308.04813",children:"Paper"}),e.jsx("a",{className:"px-10 btn rounded-md mx-4",href:"#/leaderboard",children:"Full Leaderboard"})]})]}),e.jsxs("div",{className:"py-2 pb-6 rounded-3xl bg-gray-100 h-full",style:{maxWidth:"100%"},children:[e.jsx(R,{}),e.jsx("div",{className:"flex justify-end",children:e.jsx(f,{to:"leaderboard",children:e.jsx("button",{className:"px-4 mx-3 mt-1 btn bg-white rounded-md",children:e.jsx("span",{children:"See More"})})})})]})]})]})}const Nn="data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAAoAAAAEBCAMAAADfF+TxAAAABGdBTUEAALGPC/xhBQAAACBjSFJNAAB6JgAAgIQAAPoAAACA6AAAdTAAAOpgAAA6mAAAF3CculE8AAAC0FBMVEUAAAAgcMIfcMEfcMEfccEfcMEfcMEfb8Efb8AfccAfcMEfbsEfbMEAAP8gcMIggL8fcMEfcMEfcMAfcMEeccMecMEecMIfcMIfcMAeccEndsQfbsEfcMAfb8EfccAeb8EXdLkgccEfcMEfcMEecMEhccEAgP8fb8EAVaoeb8IfcMIeccEaZswebsIfcMEgcMEac78fccIeacMfcMEfcMEecMEcccYrgNUfcMIeb8EfcMIfccEfcMEeb8AfcMIjdMUfcMEfcMEjcsEcccYecMAeccEfb8IfcMEgccIfcMEkbcgfb8IfccEfcMEdbb0fcMIkbcIfcMEba7wfcMIfcMEfcMEfcMEfb8IgccAfccAgcMEfcMEgcMEfb8EfccAfcMEfccAeb8MgcMEgc78ecMIfccEfcMEhb8IgccEfb8IfcMEecMEfcMIdcb8fcMEdbMQfcMEfcMEVar8fb8IfcMEgccIfcMAfcMEgcL8gcMEfcMEfcMEgccIicsMfcMIeb8AecMEfcMIfccEeb8AfcMIgccEgcMAgcL8gcL8fb8EfcMEfb8EfccEfcMEAgL8fcMEgb8Ifb8Igdb8gccIfcMIfcMEfb8Egb8EcccEfb8EeccAfcMEgccEfcMEdb8AgcMEgb8EzZswkbbYecMAgcr8hccIgcMEfccEgcMIgcMEfb8Edbr8fccEdb8Efb8Eccb0fb8AfcMEgccIfcMEfcMAhb8EecMEfcMEfcMAfcMIfcMAgcMEfcMEecMAfcMEid7sfcsAecb8ib8QfcMEfcMEfb8EfcMEeb78ecMEfcMEfcMAhb7wfcMAfcsEfccAfccEfcMEeb8AfcMAccb8fcMEdccAgcMIgcL8bbb8fb8EfcMEfcMEeb8EfccEgcMEecMAeb8EecMAeb8AebsMgccIeccQgcb8gcMEhb8QfcMIfccEecMEebcIfb8EfcMH////Dl82sAAAA7nRSTlMAcM/y7uPRvqeKa0ohAYkI8ciSWyLiVOCbVg06q/ezXguR9vCgR
gLnA1zcbwpD6oAUmhH59Y8SBmR3S7zZVaIW+7YdCW1mU8Sq/g51zJ0jwxX6E31itN23YVGy5bG1o8t6TMooO9X8LoiF5tq7NP0ai/QMbN9YOaQgkO3kPyaWZbAyc36/eFlAEJXvjIN7BOlHnhhPpvPXVy2uTYRf1jWZZwUHnzg2ocV50qUsrE7HG47oaMCCPpjhco1JwttdlA9BRB5CdHy9PMnT2BevMWqcKUXQJOw9UDAcY63rh96phpd2bjNxK0i6JxlaqCrOBCMOsQAAAAFiS0dE77iw4qEAAAAHdElNRQfoBhEVHhJsM9kZAAAK9ElEQVR42u3d+Z/XRR3A8UEOS/gK2QqILCIL666AEATLEYfIsV/ABBQhFYSVgCXkKpCIU6UAFREKCwrkMlSSzMoslIRKDeUyCcUOOuyev6EUEWH3+92d2fl85j3T6/Wz85mZz/e5DxXmO6u0x+qpC7tIS6h+g4aNLv7Yxy9p3CSjHFQvlUVfqtzWtFkqy1YAzEvxE5d9siAMgJdn3AJsrgEopBYtr2glH6C+0i3A1gAUVGGbq9pKB3i1U3+t2gFQVkXtOxSLBnhNiUuApRqA4rq2YyfBAHVnlwDbAFBi13XpKhfgpxz669YdgDL7dI+eUgE2KHMHsJcGoNQa9C6TCVD3cQewLwAF95l+MgH2d+ZvwEAAiu76QRIB1r/BFcDBGoCyKxoiEKAe6grgMACKr7ynPIDljvwNygJQfg07iwNYONwNwBEagAGU7SINoC51A3AkAMPoxjJhANs48fdZDcBAummULICjx7gAeHOaAFt47JYqWx97a2CNq8VfzY1PcT2fcwHwthQXrIiIiIiIiIiIiIiIiIiIiIiIiIgo/G73WK8qqwnuQOr53VHtKx6f+jo62GqYkPpSOZLvtGo/+XqpL2Oipb+CbOpLBaDT7syIAGh7RcIkDcDAmywCoG5uB7ARAEMHWCEDYGsrf3dpAIYOUE8RAdDuioQuAAwfYLkIgHZXJHwegOEDnNpEBECbKxKmaQCGD1BPFwHQ5oqESgDGAHBGiQSAupc5wC8AMAaAVf8w2gvAmcb+7tYAjAJgfxEAZ802BTgHgHEAnDtPAkD9RUN/mS8BMA6Aer4IgDcZAlygARgJwHtEAMwuNAP4ZQDGAnBRiQSAeoSRv+KvADAWgBf+5l5PAEcaAVzs6V2pgR4bHCnAShEA9RITgEt9ASQiIiIiIiIiIiIiIiIiIiIiIiKi/4uWVVi0PKnVrLBZTcU8m6nuHWnbfaZT3T8yoe7sWdPUK5OaeuRXHX3kVpcmtUgK4NesDt6uspmqh/U539WmU61J7MhxTVckFDdM8SsXAEwJ4ANyAPatYeYEvwwCQG8AH5QDsKYrEpYCMEKAD8kBqAfnnXjtwwCMEOAQQQCH5Z14nQZghAAnCwKY/4qERwAYI8AVggDmvSKhbBEAYwS4XhLAfFckbNAAjBBgYYkkgPmuSPg6AGMEeJESBfDmnNN2+gYAYwQ4URbAjTmnfVQDMEaAK2UB1N/MNe23ABglwE3CAG7OMevwdgCMEWB2gDCAt2aqn/XbGoAxAvyOEgZQb6l+1usBGCXAreIAVn86p+toAEYJsFQcwNuLq5v0MR0GwLYDLBqTFMBtNqsZkLGZqlM3u9aaT7WqW7JlXO6vtm1XREREREREREREREREREREREREJKgNOyyak9RqlmyV207rXfVIY3nnLo3blcrr6OzoI4/hRHQqbbQ/gWl1Ijpr+s8XnJ3u8aSn4ki+h2ZNUekC/K7pgEkfzNa0yPRHaw0A5QNcrVIGuNv0Yo0nPphtqPHlHgCUD3DGqrQB1nvSdMRTZ2bbYzpuCQDFA8xOVqkD/J7dFQmtTC9EGKkAKB7gpSp9gMbXS565IuEq04lGAFA8wKe3eQCobrS6IsH0QoTsQgBKB3jdXuUDYAfTId9XFhciDFMAFA5w1jrlBWDJM6b/p1Si1A9M5xkMQOkA6/yqbT/g1aZjF
iv1rOlP12wACgf4Q+UL4I9Mx/xYzZ5qOKSvAqBsgFuVN4CZ5wzHLCr7iek0ywAoG+Ada/0BVM+bDvppheGA0WMAKBrg2KbKI8BdpoN+ZnquYIICoGSAT25TPgGqfUlv8AUASgb4YrHyC7Ay4Q0W7gegYIDTHe3K/gOelvAO9yjvABf+3KKdSQHs9pKgdjn7sbKZ/cD7Qw8mu8WFdV0fERERERERERERERERERERERERCelgpUW/SGo1BdMFNanM0a5+afOOrzw3vqTS/eZecri+OsWJ6Nz9ap6bXRXV9cRxX/d7e9nl+gCYUK/0lAFwmfOdNSwGYAAA9auzRQDs1t31xp5XAAwBoP71IQkA1VjX+9oFwDAA6idWSQD4guNdPZcBYCAA9f3bBQA0vnfX6AunAJQMUL9W4h+g+c3j+XsdgOEA1JcJANjP6Y6uVQAMCKAe7x9g02YuN/QgAIMCWPi6d4DqHpcbegqAQQHU+/Z7B7jJ4XY2KgCGBVAvz/gGuG2Ru93cB8DQAOrDvgGqI+42cxSAwQGcW+Ab4DFnezmuABgcQD3ON0Dj3xqSs90ADBDgmctsPQJUL7rayRsADBHgbw55Bvimo41UKFEAp5VadCIpgKP6yW2J9a6usHnHd134lOLmbvbx24TWR0RERERERERERERERERERERERD7bctii0qRWM9tmNYetbjJdMMSuk+ZTvWWzqWnVP+vQEPsap7A+42I4EW11gVUPywPFFieBXZ44blpkfxZ6dwrrA2DSAN/2C1DtsPaXbQLACACeyvgF2MEa4DAFwAgAVvlad8oAS66xXfglAIwCYKlfgOoyy3UPPATAKAC+4xngAst1X68AGAXA2zwDVPtcf7MegEEB/J1vgJutll04HIBxAMw29QzwqNWyf68AGAdAXeAZoDpu87wTAIwF4Mu+Afa2eNya7QCMBWA/3wAHZc0f11IBMBaAHX0DVDeZP+4gAKMBeJ93gIeNnzajBIDRAPyDd4BdR5s+7XkFwGgA9vAOUE0wfdrdAIwH4B/9Ayw1fNg+JRbglNMWPZYUwEE2qzlt9fssO/e2rbHpVH+y2dSf8z1xnuGS16W8PiIiIiIiIiIiIiIiIiIiIiIiInLaBo9V/fbssX4xtOX8TRWk9/6qdKgWy+1QCyaNk9qD0h6remL2Ih1Dfzl/U/XSe39VGt6u5scsrQXAoqT2AMAE+qscgOptN9+iB2BAXZMRBLB5jU95LgPAuAC2V4IA1nxjea2+QgrAgHpUEkDVvqanPAXAuADObSsK4LoaHnJcATAugBOVKIAlz+R/yGkARgbwXVkAa7ixPLsQgHEBfHitMID5byx/VgEwLoCVShjA/DeWDwFgXABHnxQHMN+N5d3HADAugP2VOID5bixfrgAYF8At8gCqRrkf0Q+AcQGcqQQCXJnzCc1uAGBUALM7JQI8kPPG8h0KgFEBLFcSAea+sfwYAKMCOPWoTICHczyg6p9Zpg/wao8trrLPjtOD7q1qP7zF6b2/HHXNsd75tT44/05Se+A7CURERERERERERERERERERERERG56w2P7q6zmZEFoDa/5Fe9P7/3lrdVHln3AkEmTpPbAiei6Na4WAP2fiD7TR7+e2doQIEfyZXZ1iQoHoHr13NChAIwA4MDDtfrwxAB86MOR9W8AYPgAT9XyOxliABZ8+PXMxxUAgwf4t70qMIDqlbMj3wVg6ADbdcyo4ACeXcmpbQAMHGDF380/dv8A/zH1zMClCoBBA6y/cpsKEaC6+MzANwEYMsBZ/ddb/YtPAMBl74+7vBiAAQN8eqflf3kJANit+3vj5igABgvwlRNKhQtQtXlv3AoABgowO3OTxV+3SwL4z/8N26cAGCTAwpZ3WZ33kASw7VytuwAwQIBTh3XsanngSBJA9S+t/w3A0ADOGtZxvbJOFMAN+j8KgEEBvPXI/FF1OnIpCmDZohEADAXgrH2vbd5wqM5nfkUBVEcWygK4x2N9q
uzz/BtSl2416slyo8bOzNWe8kce2H3LsSXb3Rw675Pe+6tFe6328HhSe/gvDuj5ccZcNDsAAAAldEVYdGRhdGU6Y3JlYXRlADIwMjQtMDYtMTdUMjE6MzA6MTgrMDA6MDDt4fgHAAAAJXRFWHRkYXRlOm1vZGlmeQAyMDI0LTA2LTE3VDIxOjMwOjE4KzAwOjAwnLxAuwAAAABJRU5ErkJggg==";function An(){return e.jsxs("div",{className:"container mx-auto px-16",children:[e.jsx("h1",{className:"text-3xl my-8 font-bold text-center",children:"HELM Tables"}),e.jsxs("div",{className:"flex flex-col lg:flex-row gap-8",children:[e.jsxs("div",{className:"flex-1 text-l",children:[e.jsx("div",{className:"text-center",children:e.jsx("a",{href:"https://www.ibm.com/",children:e.jsx("img",{src:Nn,alt:"Logo",className:"inline h-12 mx-4 my-4"})})}),e.jsxs("p",{children:["In collaboration with"," ",e.jsx("a",{href:"https://research.ibm.com/",className:"font-bold underline text-blue-600 hover:text-blue-800 visited:text-purple-600",children:"IBM Research"}),", we introduce the"," ",e.jsx("strong",{className:"font-bold",children:"HELM Tables"})," leaderboard on HELM. ",e.jsx("strong",{className:"font-bold",children:"HELM Tables"})," is a holistic evaluation of leading language models that tests their capability to understand, process and analyze structured tabular input data."]}),e.jsx("div",{className:"flex flex-row justify-center my-4",children:e.jsx(f,{to:"leaderboard",className:"px-10 btn rounded-md mx-4",children:"Full Leaderboard"})})]}),e.jsxs("div",{className:"py-2 pb-6 rounded-3xl bg-gray-100 h-full",style:{maxWidth:"100%"},children:[e.jsx(R,{}),e.jsx("div",{className:"flex justify-end",children:e.jsx(f,{to:"leaderboard",children:e.jsx("button",{className:"px-4 mx-3 mt-1 btn bg-white rounded-md",children:e.jsx("span",{children:"See More"})})})})]})]})]})}const En=({id:s,title:t,text:n})=>((t==="Classic"||t==="Lite"||t==="Instruct")&&(t="HELM "+t),e.jsx("div",{className:"max-w-sm rounded overflow-hidden bg-gray-100 hover:scale-105 transition-transform duration-300",children:e.jsx("a",{href:he(void 0,s),children:e.jsxs("div",{className:"px-6 
py-4",children:[e.jsxs("div",{className:"font-bold text-xl mb-2",children:[e.jsx("div",{className:"py-3",children:e.jsx("svg",{fill:"#000000",width:"20px",height:"20px",viewBox:"0 0 24 24",xmlns:"http://www.w3.org/2000/svg",children:e.jsx("path",{d:"M22,7H16.333V4a1,1,0,0,0-1-1H8.667a1,1,0,0,0-1,1v7H2a1,1,0,0,0-1,1v8a1,1,0,0,0,1,1H22a1,1,0,0,0,1-1V8A1,1,0,0,0,22,7ZM7.667,19H3V13H7.667Zm6.666,0H9.667V5h4.666ZM21,19H16.333V9H21Z"})})}),t+" →"]}),e.jsx("p",{className:"text-gray-700 text-base",children:n})]})})}));function yn(){const[s,t]=l.useState();return l.useEffect(()=>{fetch("https://raw.githubusercontent.com/stanford-crfm/helm/main/helm-frontend/project_metadata.json").then(n=>n.json()).then(n=>{t(n)}).catch(n=>{console.error("Error fetching JSON:",n)})},[]),e.jsx("div",{className:"p-10 mb-20",children:e.jsx("div",{className:"grid grid-cols-2 lg:grid-cols-3 gap-4",children:s&&s.map((n,r)=>n.id==="home"?null:e.jsx(En,{id:n.id,title:n.title,text:n.description},r))})})}function Mn(){return e.jsxs("div",{className:"flex flex-col md:flex-row px-6 py-32",children:[e.jsxs("div",{className:"flex-1 p-4 flex flex-col justify-center",children:[e.jsx("div",{className:"flex justify-start",children:e.jsxs("div",{children:[e.jsx("h1",{className:"text-4xl mb-4 mx-4 mt-2",children:e.jsx("strong",{children:"A reproducible and transparent framework for evaluating foundation models."})}),e.jsx("h3",{className:`text-xl
10
+ mb-4 mx-4 mt-2`,children:"Find leaderboards with many scenarios, metrics, and models with support for multimodality and model-graded evaluation."})]})}),e.jsxs("div",{className:"flex flex-col md:flex-row justify-start mt-6 ml-4",children:[e.jsx("button",{className:"px-6 btn btn-grey rounded-md mb-4 md:mb-0",onClick:()=>window.scrollTo({top:760,behavior:"smooth"}),children:e.jsx("div",{children:"Leaderboards ↓"})}),e.jsx("button",{className:"px-6 btn btn-grey rounded-md md:ml-4",children:e.jsx("a",{href:"https://github.com/stanford-crfm/helm",children:"Github"})})]})]}),e.jsx("div",{className:"mx-4 mt-6 md:mt-0 md:w-1/3",children:e.jsx("img",{src:Ke,alt:"HELM Hero",className:"object-cover w-full h-full"})})]})}const Rn=""+new URL("aisingapore-6dfc9acf.png",import.meta.url).href,In=[Je,Rn,Xe,Ye,$e,Ze,es,ss,ts,ns,rs,as,ls,is,cs,os,ds,ms,hs];function Sn(){const[s,t]=l.useState(void 0);return l.useEffect(()=>{const n=new AbortController;async function r(){const a=await L(n.signal);t(a)}return r(),()=>n.abort()},[]),s?e.jsxs(e.Fragment,{children:[e.jsx(Mn,{}),e.jsx("div",{className:"container py-5 mx-auto text-lg",children:e.jsx("div",{className:"flex flex-col sm:flex-row justify-center mb-10 flex sm:gap-8 md:gap-32",children:e.jsx("h1",{className:"text-4xl mx-4 ",children:e.jsx("strong",{children:"HELM Leaderboards"})})})}),e.jsx(yn,{}),e.jsx("div",{className:"mx-auto text-lg px-16",children:e.jsxs("div",{className:"container mb-12 mx-auto text-lg px-16",children:[e.jsxs("div",{className:"flex flex-col sm:flex-row justify-center mt-10 mb-10 flex gap-2 sm:gap-8 md:gap-32",children:[" ",e.jsx("h1",{className:"text-4xl mx-4 mt-40",children:e.jsx("strong",{children:"Our Partners"})})]}),e.jsx("ol",{className:"my-8 flex flex-col gap-32",children:e.jsx("li",{children:e.jsx("div",{className:"flex flex-wrap justify-center max-w-[1100px] mx-auto w-auto",children:In.map((n,r)=>e.jsx("div",{className:"w-24 h-24 flex items-center 
m-6",children:e.jsx("img",{src:n,alt:"Logo",className:"mx-auto block",sizes:"100vw"})},r))})})})]})})]}):null}const Ln=""+new URL("overview-74aea3d8.png",import.meta.url).href,kn=""+new URL("process-flow-bd2eba96.png",import.meta.url).href;function Cn(){return e.jsxs("div",{className:"container mx-auto px-16",children:[e.jsx("h1",{className:"text-3xl mt-16 my-8 font-bold text-center",children:"Image2Struct: A Benchmark for Evaluating Vision-Language Models in Extracting Structured Information from Images"}),e.jsxs("div",{className:"flex flex-col sm:flex-row justify-center gap-2 sm:gap-2 md:gap-8 my-8",children:[e.jsx("a",{className:"px-10 btn rounded-md",href:"TODO",children:"Paper"}),e.jsx("a",{className:"px-10 btn rounded-md",href:"https://github.com/stanford-crfm/helm",children:"Github"}),e.jsx("a",{className:"px-10 btn rounded-md",href:"https://huggingface.co/datasets/stanford-crfm/i2s-latex",children:"Latex dataset"}),e.jsx("a",{className:"px-5 btn rounded-md",href:"https://huggingface.co/datasets/stanford-crfm/i2s-webpage",children:"Webpage dataset"}),e.jsx("a",{className:"px-10 btn rounded-md",href:"https://huggingface.co/datasets/stanford-crfm/i2s-musicsheet",children:"Music sheet dataset"})]}),e.jsxs("div",{className:"flex flex-col lg:flex-row items-center gap-8",children:[e.jsxs("div",{className:"flex-1 text-xl",children:[e.jsxs("p",{children:[e.jsx("strong",{children:"Image2Struct"})," is a benchmark for evaluating vision-language models in practical tasks of extracting structured information from images."]}),e.jsx("br",{}),e.jsx("p",{children:"In our tasks, VLMs are prompted to generate the underlying structured information (i.e., code) from an input image. The code can be compiled, and the output image is evaluated against the input image to produce a score. This round-trip evaluation allows us to quantitatively evaluate VLMs on complex tasks with multiple correct answers. 
We create a pipeline that downloads fresh, user-submitted data from active online communities upon execution, evaluates the VLMs shortly, and produces a leaderboard."}),e.jsx("br",{}),e.jsx("img",{src:Ln,alt:"Evaluation flowchart",className:"mx-auto block w-full",sizes:"100vw"}),e.jsx("br",{}),e.jsx("p",{children:"We introduce 3 tasks:"}),e.jsxs("ul",{className:"my-2 list-disc list-inside",children:[e.jsx("li",{children:"LaTex: equations, tables, plots and algorithms form ArXiV papers"}),e.jsx("li",{children:"Webpages: pages from GitHub written in HTML, CSS and Javascript, ..."}),e.jsx("li",{children:"Music sheets: crops of measures from music sheets from IMSLP"})]}),e.jsx("div",{className:"flex flex-row justify-center mt-8",children:e.jsx("a",{className:"px-10 btn rounded-md",href:"#/leaderboard",children:"Full Leaderboard"})})]}),e.jsx("div",{className:"flex-1",children:e.jsx(R,{numRowsToDisplay:12})})]}),e.jsx("br",{}),e.jsxs("div",{className:"flex flex-col lg:flex-row items-center gap-8",children:[e.jsxs("div",{className:"flex-1 text-xl",children:[e.jsx("p",{children:"We provide an automated process for collecting new fresh data from online communities, evaluating the models and producing a leaderboard. The pipeline is designed to be executed on a regular basis to keep the leaderboard up-to-date."}),e.jsx("br",{}),e.jsxs("p",{children:["In addition to the automated data collection, we also provide a"," ",e.jsx("i",{children:"wild"})," subset for the LaTeX and webpage tasks that are collected from Wikipedia and various popular websites. 
These instances do not have a corresponding code, and the evaluation is done by our proposed metric: block EMD (Earth Mover Distance)."]})]}),e.jsx("div",{className:"flex-1",children:e.jsx("img",{src:kn,alt:"7 scenarios, 4 models, 4 evaluators and 5 criteria",className:"mx-auto block w-full",sizes:"200vw"})})]})]})}function Tn(){return e.jsxs("div",{className:"container mx-auto px-16",children:[e.jsx("h1",{className:"text-3xl my-8 font-bold text-center",children:"Elements of World Knowledge (EWoK)"}),e.jsxs("div",{className:"flex flex-col lg:flex-row gap-8",children:[e.jsxs("div",{className:"flex-1 text-l",children:[e.jsxs("p",{children:["We present the"," ",e.jsx("a",{className:"font-bold underline text-blue-600 hover:text-blue-800 visited:text-purple-600",href:"https://arxiv.org/abs/2405.09605",children:"Elements of World Knowledge (EWoK)"})," ","leaderboard in collaboration with the EWoK team. EWoK is a benchmark for evaluating world modeling in language models by testing their ability to use knowledge of a concept to match a target text with a plausible/implausible context. 
EWoK targets specific concepts from multiple knowledge domains known to be vital for world modeling in humans, including social interactions and spatial relations."]}),e.jsxs("div",{className:"flex flex-row justify-center mt-4",children:[e.jsx("a",{className:"px-10 btn rounded-md mx-4",href:"https://arxiv.org/abs/2405.09605",children:"Paper"}),e.jsx("a",{className:"px-10 btn rounded-md mx-4",href:"#/leaderboard",children:"Full Leaderboard"})]})]}),e.jsxs("div",{className:"py-2 pb-6 rounded-3xl bg-gray-100 h-full",style:{maxWidth:"100%"},children:[e.jsx(R,{}),e.jsx("div",{className:"flex justify-end",children:e.jsx(f,{to:"leaderboard",children:e.jsx("button",{className:"px-4 mx-3 mt-1 btn bg-white rounded-md",children:e.jsx("span",{children:"See More"})})})})]})]})]})}function Pn(){return e.jsxs("div",{className:"container mx-auto px-16",children:[e.jsx("h1",{className:"text-3xl my-8 font-bold text-center",children:"HELM Medical"}),e.jsxs("div",{className:"flex flex-col lg:flex-row gap-8",children:[e.jsxs("div",{className:"flex-1 text-l",children:[e.jsx("p",{className:"my-2",children:"With the increasing scale and impact of language models, there has also been interest interest in using language models in the medical domain. However, the capabilities and risks of these models are not well-understood, and there is significant potential for harm in the medical setting."}),e.jsxs("p",{className:"my-2",children:["To address this, we present the"," ",e.jsx("a",{className:"font-bold",href:"https://arxiv.org/abs/2405.09605",children:"HELM Medical"})," ","leaderboard for evaluation of language models in the medical domain. The HELM Medical leaderboard presents evaluations of leading general-purpose language models as well as language models fine-tuned on the medical domain. 
These models are evaluated on a range of medical tasks based on the benchmarks used in"," ",e.jsx("a",{className:"underline text-blue-600 hover:text-blue-800 visited:text-purple-600",href:"https://arxiv.org/abs/2212.13138",children:"Singhal et al. 2022"}),". We hope that this leaderboard encourages further work in evaluating language models on tasks from the medical domain."]}),e.jsx("div",{className:"flex flex-row justify-center my-4",children:e.jsx(f,{to:"leaderboard",className:"px-10 btn rounded-md mx-4",children:"Full Leaderboard"})})]}),e.jsxs("div",{className:"py-2 pb-6 rounded-3xl bg-gray-100 h-full",style:{maxWidth:"100%"},children:[e.jsx(R,{}),e.jsx("div",{className:"flex justify-end",children:e.jsx(f,{to:"leaderboard",children:e.jsx("button",{className:"px-4 mx-3 mt-1 btn bg-white rounded-md",children:e.jsx("span",{children:"See More"})})})})]})]})]})}function Bn(){return window.PROJECT_ID==="lite"?e.jsx(Le,{}):window.PROJECT_ID==="instruct"?e.jsx(Zt,{}):window.PROJECT_ID==="image2struct"?e.jsx(Cn,{}):window.PROJECT_ID==="heim"?e.jsx(xn,{}):window.PROJECT_ID==="mmlu"?e.jsx(tn,{}):window.PROJECT_ID==="vhelm"?e.jsx(gn,{}):window.PROJECT_ID==="air-bench"?e.jsx(rn,{}):window.PROJECT_ID==="thaiexam"?e.jsx(cn,{}):window.PROJECT_ID==="finance"?e.jsx(dn,{}):window.PROJECT_ID==="call-center"?e.jsx(bn,{}):window.PROJECT_ID==="cleva"?e.jsx(vn,{}):window.PROJECT_ID==="tables"?e.jsx(An,{}):window.PROJECT_ID==="ewok"?e.jsx(Tn,{}):window.PROJECT_ID==="medical"?e.jsx(Pn,{}):window.PROJECT_ID==="home"?e.jsx(Sn,{}):e.jsx(Le,{})}function Dn(){return 
e.jsx(Rs,{children:e.jsx(Is,{children:e.jsxs(D,{path:"/",element:e.jsx(xt,{}),children:[e.jsx(D,{index:!0,element:e.jsx(Bn,{})}),e.jsx(D,{path:"leaderboard",element:e.jsx(Xt,{})}),e.jsx(D,{path:"models",element:e.jsx(jt,{})}),e.jsx(D,{path:"scenarios",element:e.jsx(bt,{})}),e.jsx(D,{path:"groups",element:e.jsx(At,{})}),e.jsx(D,{path:"groups/:groupName",element:e.jsx(Et,{})}),e.jsx(D,{path:"runs",element:e.jsx(yt,{})}),e.jsx(D,{path:"runs/:runName",element:e.jsx(Jt,{})})]})})})}ce.createRoot(document.getElementById("root")).render(e.jsx(Ss.StrictMode,{children:e.jsx(Dn,{})}));
@@ -7,11 +7,11 @@
7
7
  <title>Holistic Evaluation of Language Models (HELM)</title>
8
8
  <meta name="description" content="The Holistic Evaluation of Language Models (HELM) serves as a living benchmark for transparency in language models. Providing broad coverage and recognizing incompleteness, multi-metric measurements, and standardization. All data and analysis are freely accessible on the website for exploration and study." />
9
9
  <script type="text/javascript" src="./config.js"></script>
10
- <script type="module" crossorigin src="./assets/index-737eef9e.js"></script>
10
+ <script type="module" crossorigin src="./assets/index-58f97dcd.js"></script>
11
11
  <link rel="modulepreload" crossorigin href="./assets/react-d4a0b69b.js">
12
12
  <link rel="modulepreload" crossorigin href="./assets/recharts-6d337683.js">
13
13
  <link rel="modulepreload" crossorigin href="./assets/tremor-54a99cc4.js">
14
- <link rel="stylesheet" href="./assets/index-878a1094.css">
14
+ <link rel="stylesheet" href="./assets/index-05c76bb1.css">
15
15
  </head>
16
16
  <body class="block">
17
17
  <div id="root"></div>
@@ -11,13 +11,13 @@ class TestOpenAIWindowService:
11
11
  def setup_method(self):
12
12
  self.path: str = tempfile.mkdtemp()
13
13
  service: TokenizerService = get_tokenizer_service(self.path, BlackHoleCacheBackendConfig())
14
- self.window_service = WindowServiceFactory.get_window_service("openai/davinci", service)
14
+ self.window_service = WindowServiceFactory.get_window_service("huggingface/gpt2", service)
15
15
 
16
16
  def teardown_method(self, method):
17
17
  shutil.rmtree(self.path)
18
18
 
19
19
  def test_max_request_length(self):
20
- assert self.window_service.max_request_length == 2049
20
+ assert self.window_service.max_request_length == 1025
21
21
 
22
22
  def test_encode(self):
23
23
  assert self.window_service.encode(TEST_PROMPT).token_values == GPT2_TEST_TOKEN_IDS
@@ -30,19 +30,19 @@ class TestOpenAIWindowService:
30
30
 
31
31
  def test_fits_within_context_window(self):
32
32
  # Should fit in the context window since we subtracted the number of tokens of the test prompt
33
- # from the max request length of 2049
34
- assert self.window_service.fits_within_context_window(TEST_PROMPT, 2049 - 51)
33
+ # from the max request length of 1025
34
+ assert self.window_service.fits_within_context_window(TEST_PROMPT, 1025 - 51)
35
35
  # Should not fit within the max request length because we're expecting one more extra token in the completion
36
- assert not self.window_service.fits_within_context_window(TEST_PROMPT, 2049 - 51 + 1)
36
+ assert not self.window_service.fits_within_context_window(TEST_PROMPT, 1025 - 51 + 1)
37
37
 
38
38
  def test_truncate_from_right(self):
39
- # Create a prompt that exceed max context length: 51 * 41 = 2091 tokens
40
- long_prompt: str = TEST_PROMPT * 41
39
+ # Create a prompt that exceed max context length: 51 * 21 = 1071 tokens
40
+ long_prompt: str = TEST_PROMPT * 21
41
41
  assert not self.window_service.fits_within_context_window(long_prompt)
42
42
 
43
43
  # Truncate and ensure it fits within the context window
44
44
  truncated_long_prompt: str = self.window_service.truncate_from_right(long_prompt)
45
- assert self.window_service.get_num_tokens(truncated_long_prompt) == 2049
45
+ assert self.window_service.get_num_tokens(truncated_long_prompt) == 1025
46
46
  assert self.window_service.fits_within_context_window(truncated_long_prompt)
47
47
 
48
48
  def test_tokenize_and_count(self):
@@ -1,7 +1,8 @@
1
- from typing import Dict, Optional
1
+ from typing import Dict, List, Optional, TypedDict
2
2
  import requests
3
3
 
4
4
  from helm.common.cache import CacheConfig
5
+ from helm.common.optional_dependencies import handle_module_not_found_error
5
6
  from helm.common.request import (
6
7
  wrap_request_time,
7
8
  EMBEDDING_UNAVAILABLE_REQUEST_RESULT,
@@ -13,6 +14,12 @@ from helm.common.request import (
13
14
  from .client import CachingClient, truncate_sequence, cleanup_str
14
15
  from .ai21_utils import AI21RequestError, handle_failed_request
15
16
 
17
+ try:
18
+ from ai21 import AI21Client as AISDKClient
19
+ from ai21.models.chat import ChatMessage as SDKChatMessage, ChatCompletionResponse
20
+ except ModuleNotFoundError as e:
21
+ handle_module_not_found_error(e, ["ai21"])
22
+
16
23
 
17
24
  class AI21Client(CachingClient):
18
25
  """
@@ -126,3 +133,66 @@ class AI21Client(CachingClient):
126
133
  completions=completions,
127
134
  embedding=[],
128
135
  )
136
+
137
+
138
+ class AI21ChatRequest(TypedDict):
139
+ """Data passed between make_request and _send_request. Used as the cache key."""
140
+
141
+ model: str
142
+ messages: List[Dict[str, str]]
143
+ max_tokens: int
144
+ temperature: float
145
+ stop: List[str]
146
+ n: int
147
+ top_p: float
148
+
149
+
150
+ class AI21ChatClient(CachingClient):
151
+ def __init__(self, api_key: str, cache_config: CacheConfig):
152
+ super().__init__(cache_config=cache_config)
153
+ self.client = AISDKClient(api_key=api_key)
154
+
155
+ def make_request(self, request: Request) -> RequestResult:
156
+ if request.embedding:
157
+ return EMBEDDING_UNAVAILABLE_REQUEST_RESULT
158
+ # TODO: Support messages
159
+ assert not request.messages, "AI21ChatClient currently does not support the messages API"
160
+
161
+ raw_request: AI21ChatRequest = {
162
+ "model": request.model_engine,
163
+ "messages": [{"role": "user", "content": request.prompt}],
164
+ "max_tokens": request.max_tokens,
165
+ "temperature": request.temperature,
166
+ "stop": request.stop_sequences,
167
+ "n": request.num_completions,
168
+ "top_p": request.top_p,
169
+ }
170
+
171
+ def do_it():
172
+ chat_completion_response: ChatCompletionResponse = self.client.chat.completions.create(
173
+ model=raw_request["model"],
174
+ messages=[SDKChatMessage.from_dict(m) for m in raw_request["messages"]],
175
+ max_tokens=raw_request["max_tokens"],
176
+ temperature=raw_request["temperature"],
177
+ stop=raw_request["stop"],
178
+ n=raw_request["n"],
179
+ top_p=raw_request["top_p"],
180
+ )
181
+ return chat_completion_response.to_dict()
182
+
183
+ cache_key = CachingClient.make_cache_key(raw_request, request)
184
+ response, cached = self.cache.get(cache_key, wrap_request_time(do_it))
185
+
186
+ completions: List[GeneratedOutput] = []
187
+
188
+ for choice in response["choices"]:
189
+ completions.append(GeneratedOutput(text=choice["message"]["content"], logprob=0.0, tokens=[]))
190
+
191
+ return RequestResult(
192
+ success=True,
193
+ cached=cached,
194
+ request_time=response["request_time"],
195
+ request_datetime=response["request_datetime"],
196
+ completions=completions,
197
+ embedding=[],
198
+ )
@@ -1,5 +1,6 @@
1
1
  from typing import Any, Dict, List, Optional, TypedDict, Union, cast
2
2
  import json
3
+ import os
3
4
  import requests
4
5
  import tempfile
5
6
  import time
@@ -244,6 +245,8 @@ class AnthropicMessagesClient(CachingClient):
244
245
  # Source: https://docs.anthropic.com/claude/docs/models-overview
245
246
  MAX_OUTPUT_TOKENS: int = 4096
246
247
 
248
+ MAX_IMAGE_SIZE_BYTES: int = 5242880 # 5MB
249
+
247
250
  def __init__(
248
251
  self, tokenizer: Tokenizer, tokenizer_name: str, cache_config: CacheConfig, api_key: Optional[str] = None
249
252
  ):
@@ -263,31 +266,25 @@ class AnthropicMessagesClient(CachingClient):
263
266
  system_message: Optional[MessageParam] = None
264
267
 
265
268
  if request.messages is not None:
266
- # TODO(#2439): Refactor out Request validation
267
- if request.multimodal_prompt is not None or request.prompt:
268
- raise AnthropicMessagesRequestError(
269
- "Exactly one of Request.messages, Request.prompt or Request.multimodel_prompt should be set"
270
- )
269
+ request.validate()
271
270
  messages = cast(List[MessageParam], request.messages)
272
271
  if messages[0]["role"] == "system":
273
272
  system_message = messages[0]
274
273
  messages = messages[1:]
275
274
 
276
275
  elif request.multimodal_prompt is not None:
277
- # TODO(#2439): Refactor out Request validation
278
- if request.messages is not None or request.prompt:
279
- raise AnthropicMessagesRequestError(
280
- "Exactly one of Request.messages, Request.prompt or Request.multimodal_prompt should be set"
281
- )
276
+ request.validate()
282
277
  blocks: List[Union[TextBlockParam, ImageBlockParam]] = []
283
278
  for media_object in request.multimodal_prompt.media_objects:
284
279
  if media_object.is_type(IMAGE_TYPE):
285
- # TODO(#2439): Refactor out Request validation
286
- if not media_object.location:
287
- raise Exception("MediaObject of image type has missing location field value")
288
-
289
- from helm.common.images_utils import encode_base64, get_dimensions, copy_image
280
+ from helm.common.images_utils import (
281
+ encode_base64,
282
+ get_dimensions,
283
+ copy_image,
284
+ resize_image_to_max_file_size,
285
+ )
290
286
 
287
+ assert media_object.location
291
288
  image_location: str = media_object.location
292
289
  base64_image: str
293
290
 
@@ -310,6 +307,21 @@ class AnthropicMessagesClient(CachingClient):
310
307
  height=min(image_height, AnthropicClient.MAX_IMAGE_DIMENSION),
311
308
  )
312
309
  base64_image = encode_base64(temp_file.name, format="JPEG")
310
+
311
+ elif os.path.getsize(image_location) > AnthropicMessagesClient.MAX_IMAGE_SIZE_BYTES:
312
+ hlog(
313
+ f"WARNING: Image {image_location} exceeds max allowed size: "
314
+ f"{AnthropicMessagesClient.MAX_IMAGE_SIZE_BYTES} bytes"
315
+ )
316
+ # Resize the image so it is smaller than the max allowed size
317
+ with tempfile.NamedTemporaryFile(suffix=".jpg") as temp_file:
318
+ hlog(f"Resizing image to temporary path: {temp_file.name}")
319
+ resize_image_to_max_file_size(
320
+ src=image_location,
321
+ dest=temp_file.name,
322
+ max_size_in_bytes=AnthropicMessagesClient.MAX_IMAGE_SIZE_BYTES,
323
+ )
324
+ base64_image = encode_base64(temp_file.name, format="JPEG")
313
325
  else:
314
326
  base64_image = encode_base64(image_location, format="JPEG")
315
327
 
@@ -323,16 +335,15 @@ class AnthropicMessagesClient(CachingClient):
323
335
  }
324
336
  blocks.append(image_block)
325
337
  if media_object.is_type(TEXT_TYPE):
326
- # TODO(#2439): Refactor out Request validation
327
- if media_object.text is None:
328
- raise ValueError("MediaObject of text type has missing text field value")
338
+ assert media_object.text
329
339
  text_block: TextBlockParam = {
330
340
  "type": "text",
331
341
  "text": media_object.text,
332
342
  }
333
343
  # Anthropic does not support empty text blocks
334
- if media_object.text.strip():
335
- blocks.append(text_block)
344
+ if media_object.text:
345
+ if media_object.text.strip():
346
+ blocks.append(text_block)
336
347
  messages = [{"role": "user", "content": blocks}]
337
348
 
338
349
  else:
@@ -368,14 +379,25 @@ class AnthropicMessagesClient(CachingClient):
368
379
  return response
369
380
  raise
370
381
 
371
- cache_key = CachingClient.make_cache_key(
372
- {
373
- "completion_index": completion_index,
374
- **raw_request,
375
- },
376
- request,
377
- )
378
- response, cached = self.cache.get(cache_key, wrap_request_time(do_it))
382
+ try:
383
+ cache_key = CachingClient.make_cache_key(
384
+ {
385
+ "completion_index": completion_index,
386
+ **raw_request,
387
+ },
388
+ request,
389
+ )
390
+ response, cached = self.cache.get(cache_key, wrap_request_time(do_it))
391
+ except AnthropicMessagesResponseError:
392
+ hlog("WARNING: Response has empty content")
393
+ return RequestResult(
394
+ success=False,
395
+ cached=False,
396
+ error="Anthropic response has empty content",
397
+ completions=[],
398
+ embedding=[],
399
+ error_flags=ErrorFlags(is_retriable=False, is_fatal=False),
400
+ )
379
401
 
380
402
  if _is_content_moderation_failure(response):
381
403
  hlog(
@@ -5,6 +5,7 @@ from typing import Any, Dict, Mapping, Optional
5
5
  from retrying import Attempt, RetryError
6
6
 
7
7
  from helm.benchmark.model_deployment_registry import ModelDeployment, get_model_deployment
8
+ from helm.benchmark.tokenizer_config_registry import get_tokenizer_config
8
9
  from helm.common.file_caches.file_cache import FileCache
9
10
  from helm.common.file_caches.local_file_cache import LocalFileCache
10
11
  from helm.common.credentials_utils import provide_api_key
@@ -88,6 +89,10 @@ class AutoClient(Client):
88
89
  "location": lambda: self.credentials.get(host_organization + "Location", None), # VertexAI
89
90
  "hf_auth_token": lambda: self.credentials.get("huggingfaceAuthToken", None), # HuggingFace
90
91
  "file_cache": lambda: self._get_file_cache(host_organization), # Text-to-image models
92
+ "endpoint": lambda: self.credentials.get(host_organization + "Endpoint", None), # Palmyra
93
+ "end_of_text_token": lambda: self._get_end_of_text_token(
94
+ tokenizer_name=model_deployment.tokenizer_name or model_deployment.name
95
+ ),
91
96
  },
92
97
  )
93
98
  client = create_object(client_spec)
@@ -213,3 +218,9 @@ class AutoClient(Client):
213
218
  # Initialize `FileCache` for text-to-image model APIs
214
219
  local_file_cache_path: str = os.path.join(self.file_storage_path, "output", host_organization)
215
220
  return LocalFileCache(local_file_cache_path, file_extension="png")
221
+
222
+ def _get_end_of_text_token(self, tokenizer_name: str) -> Optional[str]:
223
+ tokenizer_config = get_tokenizer_config(tokenizer_name)
224
+ if tokenizer_config is None:
225
+ raise ValueError(f"Could not find tokenizer_config for tokenizer {tokenizer_name}")
226
+ return tokenizer_config.end_of_text_token