crfm-helm 0.5.3__py3-none-any.whl → 0.5.4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of crfm-helm might be problematic. Click here for more details.

Files changed (60)
  1. {crfm_helm-0.5.3.dist-info → crfm_helm-0.5.4.dist-info}/METADATA +57 -62
  2. {crfm_helm-0.5.3.dist-info → crfm_helm-0.5.4.dist-info}/RECORD +53 -55
  3. {crfm_helm-0.5.3.dist-info → crfm_helm-0.5.4.dist-info}/WHEEL +1 -1
  4. helm/benchmark/annotation/anthropic_red_team_annotator.py +11 -24
  5. helm/benchmark/annotation/call_center_annotator.py +22 -11
  6. helm/benchmark/annotation/harm_bench_annotator.py +11 -24
  7. helm/benchmark/annotation/live_qa_annotator.py +9 -4
  8. helm/benchmark/annotation/medication_qa_annotator.py +9 -4
  9. helm/benchmark/annotation/model_as_judge.py +70 -19
  10. helm/benchmark/annotation/simple_safety_tests_annotator.py +11 -25
  11. helm/benchmark/annotation/xstest_annotator.py +20 -30
  12. helm/benchmark/metrics/safety_metrics.py +39 -17
  13. helm/benchmark/metrics/unitxt_metrics.py +17 -3
  14. helm/benchmark/metrics/vision_language/image_metrics.py +6 -2
  15. helm/benchmark/presentation/create_plots.py +1 -1
  16. helm/benchmark/presentation/schema.py +3 -0
  17. helm/benchmark/presentation/summarize.py +106 -256
  18. helm/benchmark/presentation/test_summarize.py +145 -3
  19. helm/benchmark/run_expander.py +27 -0
  20. helm/benchmark/run_specs/bhasa_run_specs.py +27 -13
  21. helm/benchmark/run_specs/finance_run_specs.py +6 -2
  22. helm/benchmark/run_specs/vlm_run_specs.py +8 -3
  23. helm/benchmark/scenarios/bhasa_scenario.py +226 -82
  24. helm/benchmark/scenarios/raft_scenario.py +1 -1
  25. helm/benchmark/static/schema_bhasa.yaml +10 -10
  26. helm/benchmark/static/schema_legal.yaml +566 -0
  27. helm/benchmark/static/schema_safety.yaml +25 -6
  28. helm/benchmark/static/schema_tables.yaml +26 -2
  29. helm/benchmark/static/schema_vhelm.yaml +42 -11
  30. helm/benchmark/static_build/assets/index-3ee38b3d.js +10 -0
  31. helm/benchmark/static_build/assets/vhelm-aspects-1437d673.png +0 -0
  32. helm/benchmark/static_build/assets/vhelm-framework-a1ca3f3f.png +0 -0
  33. helm/benchmark/static_build/assets/vhelm-model-8afb7616.png +0 -0
  34. helm/benchmark/static_build/index.html +1 -1
  35. helm/benchmark/window_services/tokenizer_service.py +0 -5
  36. helm/clients/openai_client.py +16 -1
  37. helm/clients/palmyra_client.py +1 -2
  38. helm/clients/together_client.py +22 -0
  39. helm/common/cache.py +8 -30
  40. helm/common/key_value_store.py +9 -9
  41. helm/common/mongo_key_value_store.py +3 -3
  42. helm/common/test_cache.py +1 -48
  43. helm/common/tokenization_request.py +0 -9
  44. helm/config/model_deployments.yaml +135 -3
  45. helm/config/model_metadata.yaml +134 -6
  46. helm/config/tokenizer_configs.yaml +24 -0
  47. helm/proxy/server.py +0 -9
  48. helm/proxy/services/remote_service.py +0 -6
  49. helm/proxy/services/server_service.py +5 -18
  50. helm/proxy/services/service.py +0 -6
  51. helm/benchmark/data_overlap/__init__.py +0 -0
  52. helm/benchmark/data_overlap/data_overlap_spec.py +0 -86
  53. helm/benchmark/data_overlap/export_scenario_text.py +0 -119
  54. helm/benchmark/data_overlap/light_scenario.py +0 -60
  55. helm/benchmark/static_build/assets/index-58f97dcd.js +0 -10
  56. helm/benchmark/static_build/assets/vhelm-framework-cde7618a.png +0 -0
  57. helm/benchmark/static_build/assets/vhelm-model-6d812526.png +0 -0
  58. {crfm_helm-0.5.3.dist-info → crfm_helm-0.5.4.dist-info}/LICENSE +0 -0
  59. {crfm_helm-0.5.3.dist-info → crfm_helm-0.5.4.dist-info}/entry_points.txt +0 -0
  60. {crfm_helm-0.5.3.dist-info → crfm_helm-0.5.4.dist-info}/top_level.txt +0 -0
@@ -76,7 +76,7 @@ metrics:
76
76
  lower_is_better: false
77
77
  - name: safety_score
78
78
  display_name: LM Evaluated Safety score
79
- description: LM Evaluated Safety score (mean of Claude Sonnet3.5 and GPT4o ratings)
79
+ description: LM Evaluated Safety score (mean of Llama3.1 and GPT4o ratings) representing whether the output constitutes a successful refusal of unsafe actions (0 is worst, 1 is best).
80
80
  lower_is_better: false
81
81
  - name: bbq_accuracy
82
82
  display_name: BBQ accuracy
@@ -98,6 +98,13 @@ metrics:
98
98
  lower_is_better: true
99
99
  description: Fraction of model outputs that are toxic (based on the PerspectiveAPI toxicity classifier).
100
100
 
101
+ # Annotation metrics
102
+ - name: safety_llama_annotator_success
103
+ display_name: Llama Annotator Success Rate
104
+ description: Fraction of annotator requests to Llama that succeeded.
105
+ - name: safety_gpt_annotator_success
106
+ display_name: GPT Annotator Success Rate
107
+ description: Fraction of annotator requests to GPT that succeeded.
101
108
 
102
109
  ############################################################
103
110
  perturbations: []
@@ -106,6 +113,9 @@ perturbations: []
106
113
  metric_groups:
107
114
  - name: accuracy
108
115
  display_name: Accuracy
116
+ aggregation_strategies:
117
+ - win_rate
118
+ - mean
109
119
  metrics:
110
120
  - name: ${main_name}
111
121
  split: ${main_split}
@@ -131,6 +141,15 @@ metric_groups:
131
141
  - name: num_output_tokens
132
142
  split: ${main_split}
133
143
 
144
+ - name: annotation_metrics
145
+ display_name: Annotation
146
+ hide_win_rates: true
147
+ metrics:
148
+ - name: safety_llama_annotator_success
149
+ split: ${main_split}
150
+ - name: safety_gpt_annotator_success
151
+ split: ${main_split}
152
+
134
153
  - name: toxicity
135
154
  display_name: Toxicity
136
155
  metrics:
@@ -164,8 +183,8 @@ run_groups:
164
183
  description: HarmBench
165
184
  metric_groups:
166
185
  - accuracy
167
- - efficiency
168
186
  - general_information
187
+ - annotation_metrics
169
188
  environment:
170
189
  main_name: safety_score
171
190
  main_split: test
@@ -181,8 +200,8 @@ run_groups:
181
200
  description: SimpleSafetyTests
182
201
  metric_groups:
183
202
  - accuracy
184
- - efficiency
185
203
  - general_information
204
+ - annotation_metrics
186
205
  environment:
187
206
  main_name: safety_score
188
207
  main_split: test
@@ -198,8 +217,8 @@ run_groups:
198
217
  description: XSTest
199
218
  metric_groups:
200
219
  - accuracy
201
- - efficiency
202
220
  - general_information
221
+ - annotation_metrics
203
222
  environment:
204
223
  main_name: safety_score
205
224
  main_split: test
@@ -215,9 +234,9 @@ run_groups:
215
234
  description: The Bias Benchmark for Question Answering (BBQ) for measuring social bias in question answering in ambiguous and unambiguous context [(Parrish et al., 2022)](https://aclanthology.org/2022.findings-acl.165/).
216
235
  metric_groups:
217
236
  - accuracy
218
- - efficiency
219
237
  - general_information
220
238
  - bbq_metrics
239
+ - annotation_metrics
221
240
  environment:
222
241
  main_name: bbq_accuracy
223
242
  main_split: test
@@ -234,8 +253,8 @@ run_groups:
234
253
  description: Anthropic Red Team
235
254
  metric_groups:
236
255
  - accuracy
237
- - efficiency
238
256
  - general_information
257
+ - annotation_metrics
239
258
  environment:
240
259
  main_name: safety_score
241
260
  main_split: test
@@ -173,6 +173,13 @@ metrics:
173
173
  description: Execution Accuracy
174
174
  lower_is_better: false
175
175
 
176
+ # SciGen Accuracy
177
+ - name: llama_3_8b_chat_hf_together_ai_template_table2text_single_turn_with_reference
178
+ display_name: Rating
179
+ short_display_name: Rating
180
+ description: Rating by Llama 3 (8B) LLM as judge
181
+ lower_is_better: false
182
+
176
183
  perturbations: []
177
184
 
178
185
  metric_groups:
@@ -233,14 +240,14 @@ metric_groups:
233
240
 
234
241
  run_groups:
235
242
  - name: table_scenarios
236
- display_name: Table Scenarios
243
+ display_name: Table Scenarios
237
244
  description: Table Scenarios
238
245
  category: All Scenarios
239
246
  subgroups:
240
247
  - unitxt_cards.numeric_nlg
241
248
  - unitxt_cards.tab_fact
242
249
  - unitxt_cards.wikitq
243
- - unitxt_cards.fin_qa
250
+ - unitxt_cards.scigen
244
251
 
245
252
  - name: unitxt_cards.numeric_nlg
246
253
  display_name: NumericNLG
@@ -315,3 +322,20 @@ run_groups:
315
322
  who: financial experts
316
323
  when: 1999 to 2019
317
324
  language: English
325
+
326
+ - name: unitxt_cards.scigen
327
+ display_name: SciGen
328
+ description: SciGen
329
+ metric_groups:
330
+ - main_metrics
331
+ - efficiency
332
+ - general_information
333
+ environment:
334
+ main_name: llama_3_8b_chat_hf_together_ai_template_table2text_single_turn_with_reference
335
+ main_split: test
336
+ taxonomy:
337
+ task: "?"
338
+ what: "?"
339
+ who: "?"
340
+ when: "?"
341
+ language: English
@@ -282,6 +282,7 @@ run_groups:
282
282
  - knowledge
283
283
  - bias
284
284
  - fairness
285
+ - safety
285
286
  - toxicity
286
287
  - robustness
287
288
  - multilinguality
@@ -293,6 +294,7 @@ run_groups:
293
294
  - vqa_base
294
295
  - viz_wiz
295
296
  - flickr30k
297
+ - pope
296
298
  - name: reasoning
297
299
  display_name: Reasoning
298
300
  description: Does the model understand objects, counts and spatial relations? Can the model reason about both the text and image input?
@@ -301,13 +303,6 @@ run_groups:
301
303
  - gqa
302
304
  - math_vista
303
305
  - seed_bench
304
- - name: real_world_reasoning
305
- display_name: Real-world Reasoning
306
- description: Reasoning in the real-world
307
- category: Core scenarios
308
- subgroups:
309
- - gqa
310
- - seed_bench
311
306
  - mementos
312
307
  - real_world_qa
313
308
  - name: knowledge
@@ -319,7 +314,6 @@ run_groups:
319
314
  - mmmu
320
315
  - mme
321
316
  - vibe_eval
322
- - real_world_qa
323
317
  - name: bias
324
318
  display_name: Bias
325
319
  description: Are the generations biased in demographic representation? We focus on gender and skin tone bias.
@@ -335,13 +329,19 @@ run_groups:
335
329
  - a_okvqa_dialect
336
330
  - crossmodal_3600
337
331
  - fair_face
332
+ - bingo_fairness
338
333
  - name: toxicity
339
334
  display_name: Toxicity
340
335
  description: Does the model generate toxic or inappropriate content? Can the model identify toxic or inappropriate content?
341
336
  category: Core scenarios
342
337
  subgroups:
343
- - mm_safety_bench
344
338
  - hateful_memes
339
+ - name: safety
340
+ display_name: Safety
341
+ description: Refusing to produce answers that cause harm to humans
342
+ category: Core scenarios
343
+ subgroups:
344
+ - mm_safety_bench
345
345
  - name: robustness
346
346
  display_name: Robustness
347
347
  description: Is the model robust to perturbations? We focus on both text and image perturbations.
@@ -351,7 +351,6 @@ run_groups:
351
351
  - a_okvqa_robustness
352
352
  - unicorn
353
353
  - bingo
354
- - pope
355
354
  - name: multilinguality
356
355
  display_name: Multilinguality
357
356
  description: Does the model support non-English languages?
@@ -362,7 +361,7 @@ run_groups:
362
361
  - a_okvqa_spanish
363
362
  - a_okvqa_swahili
364
363
  - exams_v
365
-
364
+ - bingo_multilinguality
366
365
  - name: a_okvqa_base
367
366
  display_name: A-OKVQA
368
367
  description: A crowdsourced dataset composed of a diverse set of about 25K questions requiring a broad base of commonsense and world knowledge to answer ([Schwenk et al., 2022](https://arxiv.org/abs/2206.01718)).
@@ -684,6 +683,38 @@ run_groups:
684
683
  when: "2023"
685
684
  language: English, Chinese, Japanese, etc.
686
685
 
686
+ - name: bingo_fairness
687
+ display_name: Bingo (fairness)
688
+ description: Open-ended questions about biased images and hallucinations-inducing images ([Cui et al., 2023](https://arxiv.org/abs/2311.03287)).
689
+ metric_groups:
690
+ - accuracy
691
+ - general_information
692
+ environment:
693
+ main_name: prometheus_vision
694
+ main_split: test
695
+ taxonomy:
696
+ task: short-answer question answering
697
+ what: Biased images about Region, OCR, Factual, Text-to-Image and Image-to-Image inference challenges
698
+ who: Human experts
699
+ when: "2023"
700
+ language: English, Chinese, Japanese, etc.
701
+
702
+ - name: bingo_multilinguality
703
+ display_name: Bingo (multilinguality)
704
+ description: Open-ended questions about biased images and hallucinations-inducing images ([Cui et al., 2023](https://arxiv.org/abs/2311.03287)).
705
+ metric_groups:
706
+ - accuracy
707
+ - general_information
708
+ environment:
709
+ main_name: prometheus_vision
710
+ main_split: test
711
+ taxonomy:
712
+ task: short-answer question answering
713
+ what: Biased images about Region, OCR, Factual, Text-to-Image and Image-to-Image inference challenges
714
+ who: Human experts
715
+ when: "2023"
716
+ language: English, Chinese, Japanese, etc.
717
+
687
718
  - name: pope
688
719
  display_name: POPE
689
720
  description: Open-ended questions about object appearance in real-world images for evaluating hallucination behaviour ([Li et al., 2023](https://aclanthology.org/2023.emnlp-main.20)).
@@ -0,0 +1,10 @@
1
+ import{r as l,a as ys,L as f,O as Es,d as Ms,u as Se,f as Le,H as Rs,h as Is,i as B,R as Ss}from"./react-d4a0b69b.js";import{g as Y,b as X,m as ce,s as ke,a as Ls,d as Ae,y as ks,c as ye,e as me,l as he}from"./tremor-54a99cc4.js";import"./recharts-6d337683.js";(function(){const t=document.createElement("link").relList;if(t&&t.supports&&t.supports("modulepreload"))return;for(const r of document.querySelectorAll('link[rel="modulepreload"]'))a(r);new MutationObserver(r=>{for(const i of r)if(i.type==="childList")for(const c of i.addedNodes)c.tagName==="LINK"&&c.rel==="modulepreload"&&a(c)}).observe(document,{childList:!0,subtree:!0});function n(r){const i={};return r.integrity&&(i.integrity=r.integrity),r.referrerPolicy&&(i.referrerPolicy=r.referrerPolicy),r.crossOrigin==="use-credentials"?i.credentials="include":r.crossOrigin==="anonymous"?i.credentials="omit":i.credentials="same-origin",i}function a(r){if(r.ep)return;r.ep=!0;const i=n(r);fetch(r.href,i)}})();var Ce={exports:{}},re={};/**
2
+ * @license React
3
+ * react-jsx-runtime.production.min.js
4
+ *
5
+ * Copyright (c) Facebook, Inc. and its affiliates.
6
+ *
7
+ * This source code is licensed under the MIT license found in the
8
+ * LICENSE file in the root directory of this source tree.
9
+ */var Cs=l,Ps=Symbol.for("react.element"),Ts=Symbol.for("react.fragment"),Bs=Object.prototype.hasOwnProperty,Ds=Cs.__SECRET_INTERNALS_DO_NOT_USE_OR_YOU_WILL_BE_FIRED.ReactCurrentOwner,Hs={key:!0,ref:!0,__self:!0,__source:!0};function Pe(s,t,n){var a,r={},i=null,c=null;n!==void 0&&(i=""+n),t.key!==void 0&&(i=""+t.key),t.ref!==void 0&&(c=t.ref);for(a in t)Bs.call(t,a)&&!Hs.hasOwnProperty(a)&&(r[a]=t[a]);if(s&&s.defaultProps)for(a in t=s.defaultProps,t)r[a]===void 0&&(r[a]=t[a]);return{$$typeof:Ps,type:s,key:i,ref:c,props:r,_owner:Ds.current}}re.Fragment=Ts;re.jsx=Pe;re.jsxs=Pe;Ce.exports=re;var e=Ce.exports,oe={},Ee=ys;oe.createRoot=Ee.createRoot,oe.hydrateRoot=Ee.hydrateRoot;function Us({title:s,titleId:t,...n},a){return l.createElement("svg",Object.assign({xmlns:"http://www.w3.org/2000/svg",fill:"none",viewBox:"0 0 24 24",strokeWidth:1.5,stroke:"currentColor","aria-hidden":"true",ref:a,"aria-labelledby":t},n),s?l.createElement("title",{id:t},s):null,l.createElement("path",{strokeLinecap:"round",strokeLinejoin:"round",d:"M3.75 6.75h16.5M3.75 12h16.5m-16.5 5.25h16.5"}))}const Os=l.forwardRef(Us),Te=Os;function Fs({title:s,titleId:t,...n},a){return l.createElement("svg",Object.assign({xmlns:"http://www.w3.org/2000/svg",fill:"none",viewBox:"0 0 24 24",strokeWidth:1.5,stroke:"currentColor","aria-hidden":"true",ref:a,"aria-labelledby":t},n),s?l.createElement("title",{id:t},s):null,l.createElement("path",{strokeLinecap:"round",strokeLinejoin:"round",d:"M9 12.75L11.25 15 15 9.75M21 12a9 9 0 11-18 0 9 9 0 0118 0z"}))}const _s=l.forwardRef(Fs),zs=_s;function Vs({title:s,titleId:t,...n},a){return l.createElement("svg",Object.assign({xmlns:"http://www.w3.org/2000/svg",fill:"none",viewBox:"0 0 24 24",strokeWidth:1.5,stroke:"currentColor","aria-hidden":"true",ref:a,"aria-labelledby":t},n),s?l.createElement("title",{id:t},s):null,l.createElement("path",{strokeLinecap:"round",strokeLinejoin:"round",d:"M9.75 9.75l4.5 4.5m0-4.5l-4.5 4.5M21 12a9 9 0 11-18 0 9 9 0 0118 0z"}))}const 
Ws=l.forwardRef(Vs),qs=Ws,Be=""+new URL("crfm-logo-74391ab8.png",import.meta.url).href,De=""+new URL("helm-logo-simple-2ed5400b.png",import.meta.url).href;function Gs({title:s,titleId:t,...n},a){return l.createElement("svg",Object.assign({xmlns:"http://www.w3.org/2000/svg",viewBox:"0 0 24 24",fill:"currentColor","aria-hidden":"true",ref:a,"aria-labelledby":t},n),s?l.createElement("title",{id:t},s):null,l.createElement("path",{fillRule:"evenodd",d:"M12 2.25a.75.75 0 01.75.75v11.69l3.22-3.22a.75.75 0 111.06 1.06l-4.5 4.5a.75.75 0 01-1.06 0l-4.5-4.5a.75.75 0 111.06-1.06l3.22 3.22V3a.75.75 0 01.75-.75zm-9 13.5a.75.75 0 01.75.75v2.25a1.5 1.5 0 001.5 1.5h13.5a1.5 1.5 0 001.5-1.5V16.5a.75.75 0 011.5 0v2.25a3 3 0 01-3 3H5.25a3 3 0 01-3-3V16.5a.75.75 0 01.75-.75z",clipRule:"evenodd"}))}const Qs=l.forwardRef(Gs),Ks=Qs;function Js({title:s,titleId:t,...n},a){return l.createElement("svg",Object.assign({xmlns:"http://www.w3.org/2000/svg",viewBox:"0 0 24 24",fill:"currentColor","aria-hidden":"true",ref:a,"aria-labelledby":t},n),s?l.createElement("title",{id:t},s):null,l.createElement("path",{fillRule:"evenodd",d:"M15.75 2.25H21a.75.75 0 01.75.75v5.25a.75.75 0 01-1.5 0V4.81L8.03 17.03a.75.75 0 01-1.06-1.06L19.19 3.75h-3.44a.75.75 0 010-1.5zm-10.5 4.5a1.5 1.5 0 00-1.5 1.5v10.5a1.5 1.5 0 001.5 1.5h10.5a1.5 1.5 0 001.5-1.5V10.5a.75.75 0 011.5 0v8.25a3 3 0 01-3 3H5.25a3 3 0 01-3-3V8.25a3 3 0 013-3h8.25a.75.75 0 010 1.5H5.25z",clipRule:"evenodd"}))}const Xs=l.forwardRef(Js),Ys=Xs;function $s({title:s,titleId:t,...n},a){return l.createElement("svg",Object.assign({xmlns:"http://www.w3.org/2000/svg",viewBox:"0 0 24 24",fill:"currentColor","aria-hidden":"true",ref:a,"aria-labelledby":t},n),s?l.createElement("title",{id:t},s):null,l.createElement("path",{fillRule:"evenodd",d:"M15 3.75a.75.75 0 01.75-.75h4.5a.75.75 0 01.75.75v4.5a.75.75 0 01-1.5 0V5.56l-3.97 3.97a.75.75 0 11-1.06-1.06l3.97-3.97h-2.69a.75.75 0 01-.75-.75zm-12 0A.75.75 0 013.75 3h4.5a.75.75 0 010 1.5H5.56l3.97 3.97a.75.75 0 
01-1.06 1.06L4.5 5.56v2.69a.75.75 0 01-1.5 0v-4.5zm11.47 11.78a.75.75 0 111.06-1.06l3.97 3.97v-2.69a.75.75 0 011.5 0v4.5a.75.75 0 01-.75.75h-4.5a.75.75 0 010-1.5h2.69l-3.97-3.97zm-4.94-1.06a.75.75 0 010 1.06L5.56 19.5h2.69a.75.75 0 010 1.5h-4.5a.75.75 0 01-.75-.75v-4.5a.75.75 0 011.5 0v2.69l3.97-3.97a.75.75 0 011.06 0z",clipRule:"evenodd"}))}const Zs=l.forwardRef($s),et=Zs;function st({title:s,titleId:t,...n},a){return l.createElement("svg",Object.assign({xmlns:"http://www.w3.org/2000/svg",viewBox:"0 0 24 24",fill:"currentColor","aria-hidden":"true",ref:a,"aria-labelledby":t},n),s?l.createElement("title",{id:t},s):null,l.createElement("path",{fillRule:"evenodd",d:"M12.53 16.28a.75.75 0 01-1.06 0l-7.5-7.5a.75.75 0 011.06-1.06L12 14.69l6.97-6.97a.75.75 0 111.06 1.06l-7.5 7.5z",clipRule:"evenodd"}))}const tt=l.forwardRef(st),He=tt;function nt({title:s,titleId:t,...n},a){return l.createElement("svg",Object.assign({xmlns:"http://www.w3.org/2000/svg",viewBox:"0 0 24 24",fill:"currentColor","aria-hidden":"true",ref:a,"aria-labelledby":t},n),s?l.createElement("title",{id:t},s):null,l.createElement("path",{fillRule:"evenodd",d:"M11.47 4.72a.75.75 0 011.06 0l3.75 3.75a.75.75 0 01-1.06 1.06L12 6.31 8.78 9.53a.75.75 0 01-1.06-1.06l3.75-3.75zm-3.75 9.75a.75.75 0 011.06 0L12 17.69l3.22-3.22a.75.75 0 111.06 1.06l-3.75 3.75a.75.75 0 01-1.06 0l-3.75-3.75a.75.75 0 010-1.06z",clipRule:"evenodd"}))}const at=l.forwardRef(nt),rt=at;function lt({title:s,titleId:t,...n},a){return l.createElement("svg",Object.assign({xmlns:"http://www.w3.org/2000/svg",viewBox:"0 0 24 24",fill:"currentColor","aria-hidden":"true",ref:a,"aria-labelledby":t},n),s?l.createElement("title",{id:t},s):null,l.createElement("path",{fillRule:"evenodd",d:"M10.5 3.75a6.75 6.75 0 100 13.5 6.75 6.75 0 000-13.5zM2.25 10.5a8.25 8.25 0 1114.59 5.28l4.69 4.69a.75.75 0 11-1.06 1.06l-4.69-4.69A8.25 8.25 0 012.25 10.5z",clipRule:"evenodd"}))}const it=l.forwardRef(lt),ct=it;function xe(s,t){return 
t?t==="home"?"https://crfm.stanford.edu/helm/":s?`https://crfm.stanford.edu/helm/${t}/${s}/`:`https://crfm.stanford.edu/helm/${t}/latest/`:"#"}function Ue(){const[s,t]=l.useState([]),[n,a]=l.useState();return l.useEffect(()=>{fetch("https://raw.githubusercontent.com/stanford-crfm/helm/main/helm-frontend/project_metadata.json").then(r=>r.json()).then(r=>{if(t(r),window.PROJECT_ID){const i=r.find(c=>c.id===window.PROJECT_ID);a(i)}else{const i=r.find(c=>c.id==="lite");a(i)}}).catch(r=>{console.error("Error fetching JSON:",r)})},[]),n===void 0||n.title===void 0?null:e.jsxs("div",{className:"dropdown z-50",children:[e.jsxs("div",{tabIndex:0,role:"button",className:"btn normal-case bg-white font-bold p-2 border-0 text-lg block whitespace-nowrap z-40","aria-haspopup":"true","aria-controls":"menu",children:[n.title," ",e.jsx(He,{fill:"black",color:"black",className:"text w-4 h-4 inline"})]}),e.jsx("ul",{tabIndex:0,className:"-translate-x-36 dropdown-content z-[1] menu p-1 shadow-lg bg-base-100 rounded-box w-max text-base",role:"menu",children:s.map((r,i)=>e.jsx("li",{className:"z-40",children:e.jsxs("a",{href:xe(void 0,r.id),className:"block",role:"menuitem",children:[e.jsx("strong",{className:n.title===r.title?"underline":"",children:r.title}),": ",r.description]})},i))})]})}function A(s){return`${window.BENCHMARK_OUTPUT_BASE_URL.replace(/\/$/,"")}/${s.replace(/^\//,"")}`}function $(){return window.RELEASE?`/releases/${window.RELEASE}`:`/runs/${window.SUITE}`}async function ot(s){try{return await(await fetch(A(`${$()}/summary.json`),{signal:s})).json()}catch(t){return console.log(t),{release:void 0,suites:void 0,suite:void 0,date:""}}}function dt(){const[s,t]=l.useState({release:void 0,suites:void 0,suite:void 0,date:""}),[n,a]=l.useState();l.useEffect(()=>{fetch("https://raw.githubusercontent.com/stanford-crfm/helm/main/helm-frontend/project_metadata.json").then(o=>o.json()).then(o=>{if(window.PROJECT_ID){const m=o.find(p=>p.id===window.PROJECT_ID);a(m)}else{const 
m=o.find(p=>p.id==="lite");a(m)}}).catch(o=>{console.error("Error fetching JSON:",o)})},[]);function r(){return n!==void 0&&n.releases!==void 0?n.releases:["v1.0.0"]}l.useEffect(()=>{const o=new AbortController;async function m(){const p=await ot(o.signal);t(p)}return m(),()=>o.abort()},[]);const i=r();if(!s.release&&!s.suite)return null;const c=`Release ${s.release||s.suite} (${s.date})`;return i.length<=1?e.jsx("div",{children:c}):e.jsxs("div",{className:"dropdown",children:[e.jsxs("div",{tabIndex:0,role:"button",className:"normal-case bg-white border-0 block whitespace-nowrap","aria-haspopup":"true","aria-controls":"menu",children:[c," ",e.jsx(He,{fill:"black",color:"black",className:"inline text w-4 h-4"})]}),e.jsx("ul",{tabIndex:0,className:"dropdown-content z-[1] menu p-1 shadow-lg bg-base-100 rounded-box w-max text-base",role:"menu",children:i.map(o=>e.jsx("li",{children:e.jsx("a",{href:xe(o,n?n.id:"lite"),className:"block",role:"menuitem",children:o})},o))})]})}function mt(){return e.jsxs("nav",{className:"navbar h-24 px-8 md:px-12 bg-base-100 max-w[1500]px",children:[e.jsx("div",{children:e.jsxs("div",{className:"dropdown md:hidden mr-4",children:[e.jsx("label",{tabIndex:0,className:"btn btn-ghost hover:bg-transparent btn-lg px-0",children:e.jsx(Te,{className:"w-16 h-16"})}),e.jsxs("ul",{tabIndex:0,className:"menu menu-lg dropdown-content mt-3 z-[1] p-2 bg-base-100 shadow",children:[e.jsx("li",{children:e.jsx(f,{to:"leaderboard",children:"Leaderboard"})}),e.jsx("li",{children:e.jsx(f,{to:"models",children:"Models"})}),e.jsx("li",{children:e.jsx(f,{to:"scenarios",children:"Scenarios"})}),e.jsx("li",{children:e.jsx(f,{to:"runs",className:"whitespace-nowrap",children:"Predictions"})}),e.jsx("li",{children:e.jsx(f,{to:"https://github.com/stanford-crfm/helm",children:"GitHub"})})]})]})}),e.jsxs("div",{className:"flex-1 
items-center",children:[e.jsx("a",{href:"https://crfm.stanford.edu/",className:"w-24",children:e.jsx("img",{src:Be,className:"object-contain"})}),e.jsx(f,{to:"/",className:"mx-2 w-32",children:e.jsx("img",{src:De,className:"object-contain"})}),e.jsx(Ue,{})]}),e.jsx("div",{className:"flex-none hidden md:block",children:e.jsxs("ul",{className:"flex flex-row gap-6 px-1",children:[e.jsx("li",{children:e.jsx(f,{to:"leaderboard",children:"Leaderboard"})}),e.jsx("li",{children:e.jsx(f,{to:"models",children:"Models"})}),e.jsx("li",{children:e.jsx(f,{to:"scenarios",children:"Scenarios"})}),e.jsx("li",{children:e.jsx(f,{to:"runs",children:"Predictions"})}),e.jsx("li",{children:e.jsx(f,{to:"https://github.com/stanford-crfm/helm",children:"GitHub"})}),e.jsx("li",{className:"hidden lg:flex",children:e.jsx(dt,{})})]})})]})}function ht(){return e.jsxs("nav",{className:"navbar h-24 px-8 md:px-12 bg-base-100 max-w[1500]px",children:[e.jsx("div",{children:e.jsx("div",{className:"dropdown md:hidden mr-4",children:e.jsx("label",{tabIndex:0,className:"btn btn-ghost hover:bg-transparent btn-lg px-0",children:e.jsx(Te,{className:"w-16 h-16"})})})}),e.jsxs("div",{className:"flex-1 items-center",children:[e.jsx(f,{to:"https://crfm.stanford.edu/",className:"w-24",children:e.jsx("img",{src:Be,className:"object-contain"})}),e.jsx(f,{to:"/",className:"mx-2 w-32",children:e.jsx("img",{src:De,className:"object-contain"})}),e.jsx(Ue,{})]})]})}function xt(){return e.jsxs(e.Fragment,{children:[window.PROJECT_ID==="home"?e.jsx(ht,{}):e.jsx(mt,{}),e.jsx("main",{className:"p-8 pt-0",children:e.jsx("div",{className:"mx-auto max-w-[1500]px",children:e.jsx(Es,{})})})]})}async function C(s){try{return await(await fetch(A(`${$()}/schema.json`),{signal:s})).json()}catch(t){return t instanceof Error&&t.name!=="AbortError"&&console.log(t),{adapter:[],metric_groups:[],metrics:[],models:[],perturbations:[],run_groups:[]}}}function ut({href:s,children:t}){return e.jsx("a",{href:s,className:"link link-primary 
link-hover",target:"_blank",rel:"noreferrer",children:t})}function Q({value:s}){return e.jsx("span",{children:e.jsx(Ms,{components:{a:ut},children:s})})}function k({title:s,subtitle:t,markdown:n=!1}){return e.jsxs("header",{className:"m-4 ml-0",children:[e.jsx("h1",{className:"text-4xl",children:s}),n&&t!==void 0?e.jsx("h2",{className:"mt-2 text-neutral",children:e.jsx(Q,{value:t})}):t!==void 0&&e.jsx("h2",{className:"mt-2 text-neutral",children:t})]})}const ft={open:"green",limited:"yellow",closed:"red"},gt={open:"Open",limited:"Limited",closed:"Closed"};function pt({level:s}){return e.jsx(Y,{color:ft[s],children:gt[s]})}function D(){return e.jsx("div",{className:"w-full",children:e.jsx("div",{className:"block mx-auto my-24 loading loading-spinner loading-lg"})})}function jt(){const[s,t]=l.useState([]);l.useEffect(()=>{const c=new AbortController;async function o(){const m=await C(c.signal);t(m.models)}return o(),()=>c.abort()},[]);const[n,a,r]=s.reduce((c,o)=>{switch(o.access){case"open":c[0]+=1;break;case"limited":c[1]+=1;break;case"closed":c[2]+=1;break}return c},[0,0,0]),i=Object.values(s.reduce((c,o)=>{const m=o.creator_organization;return c[m]===void 0?(c[m]={name:m,models:1},c):(c[m].models+=1,c)},{}));return s.length===0?e.jsx(D,{}):e.jsxs(e.Fragment,{children:[e.jsx(k,{title:"Models"}),e.jsxs("div",{className:"overflow-x-auto 
mt-12",children:[e.jsxs("table",{className:"table",children:[e.jsx("thead",{children:e.jsxs("tr",{children:[e.jsx("th",{children:"Creator"}),e.jsx("th",{children:"Model"}),e.jsx("th",{children:"Description"}),e.jsx("th",{children:"Access"})]})}),e.jsx("tbody",{children:s.map(c=>e.jsxs("tr",{children:[e.jsx("td",{className:"text-lg",children:c.creator_organization}),e.jsxs("td",{children:[e.jsx("span",{className:"text-xl",children:c.display_name}),e.jsx("br",{}),e.jsx("span",{children:c.name})]}),e.jsx("td",{children:e.jsx(Q,{value:c.description})}),e.jsx("td",{children:e.jsx(pt,{level:c.access})})]}))})]}),e.jsx(k,{title:"Analysis"}),e.jsxs("div",{className:"grid md:grid-cols-3 grid-cols-1 gap-8",children:[e.jsxs(X,{className:"flex flex-col justify-between",children:[e.jsx(ce,{children:"Models"}),e.jsx(ke,{className:"text-6xl md:!text-[96px]",children:s.length}),e.jsx(Ls,{values:[n,a,r],colors:["green","yellow","red"]}),e.jsx(Ae,{categories:["Open","Limited","Closed"],colors:["green","yellow","red"]})]}),e.jsxs(X,{className:"md:col-span-2",children:[e.jsx(ce,{children:"Creator Organizations"}),e.jsxs("div",{className:"flex justify-between mt-4",children:[e.jsx(ks,{data:i,category:"models",index:"name",variant:"pie",className:"basis-5/12"}),e.jsx(Ae,{categories:i.map(c=>c.name),className:"basis-7/12"})]})]})]})]})]})}function ne({to:s,children:t,inTable:n=!1,title:a=""}){return n?e.jsx(f,{className:"link link-hover",to:s,title:a,children:t}):e.jsx(f,{className:"link link-primary link-hover",to:s,children:t})}function bt(){const[s,t]=l.useState([]);l.useEffect(()=>{const a=new AbortController;async function r(){const i=await C(a.signal);t(i.run_groups.filter(c=>!c.todo&&c.taxonomy&&!c.display_name.includes("CLEVA")))}return r(),()=>a.abort()},[]);const n=Object.values(s.reduce((a,r)=>{var c;const i=((c=r.taxonomy)==null?void 0:c.task)||"Unknown";return a[i]===void 0?(a[i]={name:i,value:1},a):(a[i].value+=1,a)},{}));return 
s.length===0?e.jsx(D,{}):(console.log(s),e.jsxs(e.Fragment,{children:[e.jsx(k,{title:"Scenarios",subtitle:"A scenario represents a use case and consists of a dataset of instances."}),e.jsxs("div",{className:"overflow-x-auto mt-12",children:[e.jsxs("table",{className:"table",children:[e.jsx("thead",{children:e.jsxs("tr",{children:[e.jsx("th",{children:"Scenario"}),e.jsx("th",{children:"Task"}),e.jsx("th",{children:"What"}),e.jsx("th",{children:"Who"}),e.jsx("th",{children:"When"}),e.jsx("th",{children:"Language"}),e.jsx("th",{children:"Description"})]})}),e.jsx("tbody",{children:s.map(a=>{var r,i,c,o,m;return e.jsxs("tr",{children:[e.jsxs("td",{children:[e.jsx(ne,{to:`/groups/${a.name}`,children:e.jsx("span",{className:"text-lg",children:a.display_name})}),e.jsx("span",{className:"block",children:a.name})]}),e.jsx("td",{children:((r=a.taxonomy)==null?void 0:r.task)||""}),e.jsx("td",{children:((i=a.taxonomy)==null?void 0:i.what)||""}),e.jsx("td",{children:((c=a.taxonomy)==null?void 0:c.who)||""}),e.jsx("td",{children:((o=a.taxonomy)==null?void 0:o.when)||""}),e.jsx("td",{children:((m=a.taxonomy)==null?void 0:m.language)||""}),e.jsx("td",{children:e.jsx(Q,{value:a.description})})]})})})]}),e.jsx(k,{title:"Analysis"}),e.jsxs("div",{className:"grid md:grid-cols-4 gap-8",children:[e.jsxs(X,{className:"flex flex-col",children:[e.jsx(ce,{children:"Total scenarios"}),e.jsx(ke,{className:"mx-auto my-6 md:mt-16 !text-[72px] md:!text-[96px]",children:s.length})]}),e.jsx(X,{className:"col-span-3",children:e.jsxs("div",{className:"grid md:grid-cols-2 gap-x-12",children:[e.jsx(ye,{data:n.slice(0,Math.floor(n.length/2))}),e.jsx(ye,{data:n.slice(Math.ceil(n.length/2))})]})})]})]})]}))}function wt(){return A(`${$()}/groups.json`)}async function Oe(s){try{return await(await fetch(wt(),{signal:s})).json()}catch(t){return t instanceof Error&&t.name!=="AbortError"&&console.log(t),[]}}function ue({children:s}){return e.jsx("div",{role:"navigation",className:"tabs flex-nowrap border-b-2 
border-gray-2 overflow-x-auto overflow-y-hidden",children:s})}function ae({active:s=!1,onClick:t=()=>{},size:n="md",children:a}){return e.jsx("div",{onClick:t,className:`whitespace-nowrap text-${n} mb-[-2px] text-md tab tab-bordered${s?" border-2 border-grey-500 rounded":" border-none"}`,children:a})}function vt({title:s,titleId:t,...n},a){return l.createElement("svg",Object.assign({xmlns:"http://www.w3.org/2000/svg",viewBox:"0 0 20 20",fill:"currentColor","aria-hidden":"true",ref:a,"aria-labelledby":t},n),s?l.createElement("title",{id:t},s):null,l.createElement("path",{fillRule:"evenodd",d:"M4.25 5.5a.75.75 0 00-.75.75v8.5c0 .414.336.75.75.75h8.5a.75.75 0 00.75-.75v-4a.75.75 0 011.5 0v4A2.25 2.25 0 0112.75 17h-8.5A2.25 2.25 0 012 14.75v-8.5A2.25 2.25 0 014.25 4h5a.75.75 0 010 1.5h-5z",clipRule:"evenodd"}),l.createElement("path",{fillRule:"evenodd",d:"M6.194 12.753a.75.75 0 001.06.053L16.5 4.44v2.81a.75.75 0 001.5 0v-4.5a.75.75 0 00-.75-.75h-4.5a.75.75 0 000 1.5h2.553l-9.056 8.194a.75.75 0 00-.053 1.06z",clipRule:"evenodd"}))}const Nt=l.forwardRef(vt),Me=Nt;function G(s){return Number.isNaN(Number(s))?String(s):String(Math.round(Number(s)*1e3)/1e3)}function Re({value:s,title:t,hideIcon:n}){if(typeof s.value=="string"&&s.value.includes("⚠")&&(s.value=s.value.replace("⚠","")),s.value===void 0)return"-";if(s.run_spec_names){const a=(()=>{if(s.run_spec_names.length==1)return"/runs/"+s.run_spec_names[0];if(s.run_spec_names.length>1){const r="/runs/?q="+s.run_spec_names.map(c=>`^${c}$`).join("|");return encodeURI(r)}})();return a?e.jsx(ne,{to:a,inTable:!0,title:t,children:e.jsxs("div",{className:"flex items-center ",children:[G(s.value),!n&&e.jsx(Me,{className:"w-3 h-3 ml-1",stroke:"#cbcbcb",fill:"#cbcbcb"})]})}):t?e.jsx("a",{title:t,children:G(s.value)}):e.jsx(e.Fragment,{children:G(s.value)})}return s.href?e.jsx(ne,{to:s.href,inTable:!0,title:t,children:e.jsxs("div",{className:"flex items-center",children:[G(s.value),!n&&e.jsx(Me,{className:"w-3 h-3 
ml-1",stroke:"#cbcbcb",fill:"#cbcbcb"})]})}):s.markdown?e.jsx(Q,{value:String(s.value)}):t?e.jsx("a",{title:t,children:G(s.value)}):e.jsx(e.Fragment,{children:G(s.value)})}function fe({schema:s,groupTable:t,numRowsToDisplay:n,sortColumnIndex:a=1,sortable:r=!0,displayColumnIndexes:i=void 0,miniStyle:c=!1}){const[o,m]=l.useState(1),[p,E]=l.useState(a);function w(h){return h.length>30?h.substring(0,27)+"...":h}const M=h=>{const g=["AIRBench 2024 -","-book"];if(h.value==="Model/adapter")return"Model";if(g.some(d=>h.value.includes(d))){let d=h.value;return g.forEach(j=>{d=d.replace(j,"")}),w(d)}else return w(h.value)},I=h=>{if(s){const g=s.models.find(d=>d.display_name===h);if(g){let d=g.description;return d.includes("/")&&(d=d.replace("/","_")),d}}return""},P=h=>{m(h===p?o*-1:h===0?-1:1),E(h)},u=h=>{if(s){const g=s.models.find(d=>d.display_name===h);if(g){let d=g.name;return d.includes("/")&&(d=d.replace("/","_")),d}}return""},H=()=>{const h=t.header[p].lower_is_better,g=o*(h?1:-1),d=t.rows.slice();return d.sort((j,F)=>{var Z,U;const T=(Z=j[p])==null?void 0:Z.value,S=(U=F[p])==null?void 0:U.value;return T!==void 0&&S===void 0?-1:S!==void 0&&T===void 0?1:typeof T=="number"&&typeof S=="number"?(T-S)*g:typeof T=="string"&&typeof S=="string"?g===1?T.localeCompare(S):S.localeCompare(T):0}),n>0?d.slice(0,n):d};function R(h){const g=h.lastIndexOf(" - ");return g===-1?h:h.substring(0,g)+"*"+h.substring(g+1)}const K=h=>{const d=R(h).split("*")[0].trim();if(s){const j=s.run_groups.find(F=>F.display_name===d||F.short_display_name===d);if(j)return j.name}return""};return e.jsxs("table",{className:c?"table w-full":"rounded-lg shadow-md table",children:[e.jsx("thead",{children:e.jsx("tr",{children:t.header.filter((h,g)=>i===void 0||i.includes(g)).map((h,g)=>e.jsx("th",{className:`${g===p?"bg-gray-100":"bg-white"} ${g===0?"left-0 z-40":""} ${h.description?"underline decoration-dashed decoration-gray-300 ":""} whitespace-nowrap px-4 sticky 
top-0`,title:h.description?h.description:"",children:e.jsxs("div",{className:c?"flex gap-2 items-center":"z-20 flex justify-between items-center min-w-48 w-48 max-w-48 text-wrap",children:[e.jsx("span",{className:"inline-block w-full break-words",children:M(h)}),r?e.jsx("button",{className:"link",onClick:()=>P(g),children:e.jsx(rt,{className:"w-6 h-6"})}):null]})},`$${g}`))})}),e.jsx("tbody",{children:H().map((h,g)=>e.jsx("tr",{children:h.filter((d,j)=>i===void 0||i.includes(j)).map((d,j)=>e.jsx("td",{className:`${j===0?"z-20 text-lg sticky left-0":"z-0"} ${g%2===0?"bg-gray-50":"bg-white"}`,children:j==1?e.jsx("div",{className:`${d&&d.style&&d.style["font-weight"]&&d.style["font-weight"]==="bold"?"font-bold":""}`,children:e.jsx(Re,{value:{...d,href:"/runs/?q="+u(String(h[0].value))},title:`Click value to see all predictions for: ${u(String(h[0].value))}`})}):e.jsx("div",{className:`${d&&d.style&&d.style["font-weight"]&&d.style["font-weight"]==="bold"?"font-bold":""} ${j===0?"underline decoration-dashed decoration-gray-300 z-10":"z-0"}`,children:e.jsx(Re,{value:{...d},title:String(h[0].value)===d.value?I(String(h[0].value)):`Click value to see predictions for ${String(h[0].value)} for ${K(M(t.header[j]))}: ${u(String(h[0].value))}`})})},`${j}`))},`$${h[0].value}`))})]})}function At(){const[s,t]=l.useState(0),[n,a]=l.useState(),[r,i]=l.useState();return l.useEffect(()=>{const c=new AbortController;async function o(){const m=C(c.signal),p=Oe(c.signal),E=await m;i(E);const w=await p;a(w)}return o(),()=>c.abort()},[]),n===void 0||r===void 0?e.jsx(D,{}):n.length===0?e.jsxs("div",{children:[e.jsx(k,{title:"Results",subtitle:"Groupings of the processes, methods, and metrics involved in evaluating models, particularly in the context of natural language understanding and question answering.",className:"mb-16"}),e.jsx("div",{children:"No groups found."})]}):e.jsxs("div",{children:[e.jsx(k,{title:"Results",subtitle:"Groupings of the processes, methods, and metrics involved in 
evaluating models, particularly in the context of natural language understanding and question answering.",className:"mb-16"}),e.jsxs("div",{children:[n.length>1?e.jsx(ue,{children:n.map((c,o)=>e.jsx(ae,{active:o===s,onClick:()=>t(o),children:c.title},o))}):null,e.jsx(fe,{schema:r,groupTable:n[s],numRowsToDisplay:-1,sortColumnIndex:1,sortable:!0},`${s}`)]})]})}async function Fe(s,t){try{return await(await fetch(A(`${$()}/groups/${s}.json`),{signal:t})).json()}catch(n){return n instanceof Error&&n.name!=="AbortError"&&console.log(n),[]}}function _e({schema:s,runGroupName:t,numRowsToDisplay:n=-1}){const[a,r]=l.useState(),[i,c]=l.useState(0);return l.useEffect(()=>{const o=new AbortController;async function m(){const p=await Fe(t,o.signal);r(p)}return m(),()=>o.abort()},[s,t]),a===void 0||a.length===0?e.jsx(D,{}):a.length===0?e.jsx("div",{children:"Group currently has no tables."}):e.jsxs("div",{children:[a.length>1?e.jsx(ue,{children:a.map((o,m)=>e.jsx(ae,{active:m===i,onClick:()=>c(m),children:o.title},m))}):null,e.jsx(fe,{schema:s,groupTable:a[i],numRowsToDisplay:n,sortColumnIndex:1},`${t}-${i}`)]})}function yt(){const{groupName:s}=Se(),[t,n]=l.useState(void 0);l.useEffect(()=>{const i=new AbortController;async function c(){const m=await C(i.signal);n(m)}return c(),()=>i.abort()},[]);const r=(()=>{if(t!==void 0){for(const i of t.run_groups)if(i.name===s)return i}})();return t===void 0?e.jsx(D,{}):r===void 0?e.jsxs("div",{children:['Group "',s,'" not found.']}):e.jsxs(e.Fragment,{children:[e.jsx(k,{title:r.display_name,subtitle:r.description,markdown:!0,className:"mr-8"}),e.jsx(_e,{schema:t,runGroupName:r.name},r.name)]})}async function Et(s){try{return await(await fetch(A(`${$()}/run_specs.json`),{signal:s})).json()}catch(t){return t instanceof Error&&t.name!=="AbortError"&&console.log(t),[]}}function de({currentPage:s,totalPages:t,onNextPage:n,onPrevPage:a,className:r}){let i="join";return r!==void 0&&(i=`join 
${r}`),e.jsxs("div",{className:i,children:[e.jsx("button",{onClick:a,className:"join-item btn",children:"«"}),e.jsxs("button",{className:"join-item btn",children:["Page ",s," of ",t]}),e.jsx("button",{onClick:n,className:"join-item btn",children:"»"})]})}const ie=100;function Mt(){const[s,t]=Le(),[n,a]=l.useState(),[r,i]=l.useState(Number(s.get("page")||1)),[c,o]=l.useState(!0),[m,p]=l.useState(s.get("q")||"");l.useEffect(()=>{const u=new AbortController;async function H(){const R=await Et(u.signal);a(R)}return H(),()=>u.abort()},[]);const E=u=>{u.preventDefault();const R=u.target.q.value;p(R),t({q:R,page:"1"})};if(n===void 0)return e.jsx(D,{});const w=c?new RegExp(m):null,M=n.filter(u=>w?w.test(u.name):u.name.includes(m)),I=M.slice((r-1)*ie,r*ie),P=Math.ceil(M.length/ie);return e.jsxs(e.Fragment,{children:[e.jsx(k,{title:"Predictions",subtitle:"All benchmark predictions"}),e.jsxs("form",{className:"flex mb-8",onSubmit:E,children:[e.jsxs("div",{className:"form-control",children:[e.jsx("input",{type:"text",name:"q",placeholder:"Search",className:"input input-bordered",value:m,onChange:u=>p(u.target.value)}),e.jsxs("label",{className:"label",children:[e.jsxs("span",{className:"label-text-alt flex item-center",children:[e.jsx("input",{type:"checkbox",className:"toggle toggle-xs",checked:c,onChange:()=>o(!c)}),e.jsx("span",{className:"ml-2",children:"Regex"})]}),e.jsx("span",{className:"label-text-alt",children:`${M.length} results`})]})]}),e.jsx("div",{className:"form-control ml-4",children:e.jsx("button",{className:"btn",children:e.jsx(ct,{className:"w-6 h-6"})})})]}),e.jsx("div",{className:"overflow-x-auto",children:e.jsxs("table",{className:"table",children:[e.jsx("thead",{children:e.jsxs("tr",{children:[e.jsx("th",{children:"Run"}),e.jsx("th",{children:"Model"}),e.jsx("th",{children:"Groups"}),e.jsx("th",{children:"Adapter method"}),e.jsx("th",{children:"Subject / 
Task"})]})}),e.jsx("tbody",{children:I.map((u,H)=>e.jsxs("tr",{children:[e.jsx("td",{children:e.jsx(ne,{to:`/runs/${u.name}`,children:u.name})}),e.jsx("td",{children:u.adapter_spec.model}),e.jsx("td",{children:u.groups.join(", ")}),e.jsx("td",{children:u.adapter_spec.method}),e.jsx("td",{children:u.scenario_spec.args.subject||u.scenario_spec.args.task||"-"})]},`${u.name}-${H}`))})]})}),P>0?e.jsx(de,{className:"flex justify-center my-8",onNextPage:()=>{const u=Math.min(r+1,P);i(u),s.set("page",String(u)),t(s)},onPrevPage:()=>{const u=Math.max(r-1,1);i(u),s.set("page",String(u)),t(s)},currentPage:r,totalPages:P}):e.jsx("div",{className:"my-8 text-center",children:"No results"})]})}function V(){return window.SUITE!==void 0?window.SUITE:void 0}async function Rt(s,t,n){try{return await(await fetch(A(`/runs/${n||V()}/${s}/instances.json`),{signal:t})).json()}catch(a){return a instanceof Error&&a.name!=="AbortError"&&console.log(a),[]}}async function It(s,t,n){try{return await(await fetch(A(`/runs/${n||V()}/${s}/stats.json`),{signal:t})).json()}catch(a){return a instanceof Error&&a.name!=="AbortError"&&console.log(a),[]}}async function St(s,t,n){try{return await(await fetch(A(`/runs/${n||V()}/${s}/display_requests.json`),{signal:t})).json()}catch(a){return a instanceof Error&&a.name!=="AbortError"&&console.log(a),[]}}async function Lt(s,t,n){try{return await(await fetch(A(`/runs/${n||V()}/${s}/display_predictions.json`),{signal:t})).json()}catch(a){return a instanceof Error&&a.name==="AbortError"&&console.log(a),[]}}async function kt(s,t,n){try{return await(await fetch(A(`/runs/${n||V()}/${s}/scenario.json`),{signal:t})).json()}catch(a){a instanceof Error&&a.name!=="AbortError"&&console.log(a);return}}function ze(s,t){return A(`/runs/${t||V()}/${s}/run_spec.json`)}async function Ct(s,t,n){try{return await(await fetch(ze(s,n),{signal:t})).json()}catch(a){a instanceof Error&&a.name!=="AbortError"&&console.log(a);return}}function Pt(s,t){return 
A(`/runs/${t||V()}/${s}/scenario_state.json`)}function Tt(s){const n={quasi_exact_match:!1,toxic_frac:!0,safety_score:!1,exact_match:!1},a=Object.keys(s);for(const r of a)if(s[r]!==void 0&&n[r]!==void 0)return n[r]?s[r]<.5?[r,!0]:[r,!1]:s[r]>=.5?[r,!0]:[r,!1];return["",!1]}function Bt(s){const[t,n]=Tt(s.stats);return t===""?null:n?e.jsx(Dt,{value:`${t.replace(/_/g," ")}: ${s.stats[t]}`}):e.jsx(Ht,{value:`${t.replace(/_/g," ")}: ${s.stats[t]}`})}function Dt({value:s}){return e.jsx(Y,{icon:zs,color:"green",children:s})}function Ht({value:s}){return e.jsx(Y,{icon:qs,color:"red",children:s})}function z({value:s}){const[t,n]=l.useState(!1),[a,r]=l.useState(!1);return e.jsxs(e.Fragment,{children:[e.jsxs("div",{onMouseOver:()=>n(!0),onMouseOut:()=>n(!1),className:"relative",children:[e.jsx("div",{className:"bg-base-200 p-2 block overflow-auto w-full max-h-[36rem] mb-2 whitespace-pre-wrap",children:s}),t?e.jsx("button",{className:"bg-white absolute p-2 leading-none height-fit min-h-none right-1 bottom-1 shadow",onClick:()=>r(!0),children:e.jsx(et,{fill:"black",color:"black",className:"text w-4 h-4"})}):null]}),e.jsx("dialog",{open:a,className:"modal p-16 bg-opacity-80 bg-white",onClick:()=>r(!1),children:e.jsx("div",{className:"modal-box max-w-none p-4 whitespace-pre-wrap bg-base-200",children:s})})]})}function Ve({mediaObject:s}){if(s.content_type.includes("image")){if(s.location===void 0)return null;const t=A(s.location.replace("benchmark_output/","").replace("prod_env/","../"));return e.jsxs("div",{children:[e.jsx("img",{src:t}),e.jsx("br",{})]})}else return s.text&&s.content_type&&s.content_type==="text/plain"&&s.text.length>1?e.jsxs("div",{children:[s.text,e.jsx("br",{}),e.jsx("br",{})]}):e.jsx("div",{})}function We({multimediaObject:s}){return e.jsx("div",{children:s.media_objects.map(t=>e.jsx(Ve,{mediaObject:t}))})}function Ut(s){return Array.isArray(s)?s.length==0?"[]":`[${s.map(t=>String(t).replace(/\n/,"\\n")).join(", ")}]`:String(s)}function 
Ot({request:s}){return e.jsxs("div",{children:[s.request.prompt.length>0?e.jsxs("div",{children:[e.jsxs("h3",{className:"block text text-gray-400",children:["Prompt (",s.request.prompt.length," Chars)"]}),e.jsx(z,{value:s.request.prompt})]}):s.request.multimodal_prompt?e.jsxs("div",{children:[e.jsx("h3",{className:"block text text-gray-400",children:"Prompt"}),e.jsx(We,{multimediaObject:s.request.multimodal_prompt})]}):e.jsx("h3",{className:"block text text-gray-400",children:"Empty Prompt"}),e.jsx(me,{children:Object.keys(s.request).filter(t=>t!=="prompt").map((t,n)=>e.jsxs(he,{children:[e.jsxs("span",{children:[t,":"]}),s.request&&s.request[t]?e.jsx("span",{children:Ut(s.request[t])}):"null"]},n+1))})]})}function Ft(s){return e.jsx("div",{children:s.map((t,n)=>e.jsxs("div",{children:[t.error&&e.jsxs("div",{children:[e.jsx("h3",{className:"ml-1",children:"Error"}),e.jsx(z,{value:t.error})," "]}),t.text&&e.jsxs("div",{children:[e.jsx("h3",{className:"ml-1",children:"Text"}),e.jsx(z,{value:t.text})," "]}),t.media_object&&e.jsx(Ve,{mediaObject:t.media_object})]},n))})}function _t(s){return e.jsx("div",{children:Object.entries(s).map(([t,n])=>e.jsxs("div",{children:[e.jsx("h3",{className:"ml-1",children:t}),e.jsx(z,{value:n===null?"null":n.toString()})]}))})}function zt({predictionAnnotations:s}){return e.jsx("div",{children:s&&s!==void 0?Object.entries(s).map(([t,n])=>e.jsxs("details",{className:"collapse collapse-arrow border rounded-md bg-white my-2",children:[e.jsx("summary",{className:"collapse-title",children:e.jsx(e.Fragment,{children:"View "+t+" annotations"})}),e.jsx("div",{className:"collapse-content",children:Array.isArray(n)?Ft(n):_t(n)})]},t)):null})}function Vt({predictions:s,requests:t,metricFieldMap:n}){return s.length<1?null:e.jsx("div",{children:e.jsx("div",{className:"flex flex-wrap justify-start items-start",children:s.map((a,r)=>e.jsxs("div",{className:"w-full",children:[s.length>1?e.jsxs("h2",{children:["Trial 
",a.train_trial_index]}):null,e.jsx("div",{className:"mt-2 w-full",children:a.base64_images&&a.base64_images.length>0?e.jsxs(e.Fragment,{children:[e.jsx("h3",{className:"mr-4",children:"Prediction image"}),a.base64_images.map(i=>e.jsx("img",{src:"data:image;base64,"+i,alt:"Base64 Image"}))]}):e.jsxs(e.Fragment,{children:[e.jsxs("h3",{children:[e.jsx("span",{className:"mr-4",children:"Prediction raw text"}),e.jsx(Bt,{stats:a.stats})]}),e.jsx(z,{value:a.predicted_text}),a.mapped_output?e.jsxs(e.Fragment,{children:[e.jsx("h3",{children:"Prediction mapped output"}),e.jsx(z,{value:String(a.mapped_output)})]}):null]})}),e.jsx(zt,{predictionAnnotations:a.annotations}),e.jsxs("div",{className:"mx-1",children:[e.jsx("h3",{children:"Metrics"}),e.jsx(me,{children:Object.keys(a.stats).map((i,c)=>e.jsxs(he,{children:[n[i]?e.jsx("span",{title:n[i].description,children:n[i].display_name}):e.jsx("span",{children:i}),e.jsx("span",{children:String(a.stats[i])})]},c))})]}),e.jsxs("details",{className:"collapse collapse-arrow border rounded-md bg-white",children:[e.jsx("summary",{className:"collapse-title",children:"Request details"}),e.jsx("div",{className:"collapse-content",children:e.jsx(Ot,{request:t[r]})})]})]},r))})})}const Wt="correct";function qt({references:s}){return e.jsxs("span",{children:[e.jsx("h3",{children:"References"}),e.jsx("ul",{children:s.map((t,n)=>e.jsxs("li",{className:"bg-base-200 p-2 block overflow-auto w-full max-h-72 mb-2 whitespace-pre-wrap",children:[t.output.text,t.tags.map(a=>e.jsx(Y,{className:"mx-2",color:a===Wt?"green":void 0,children:a}))]},n))})]})}function Gt({instance:s,requests:t,predictions:n,metricFieldMap:a}){const r=i=>i.perturbation===void 0?`Instance id: ${i.id} [split: ${i.split}]`:`Instance id: ${i.id} [split: ${i.split}][perturbation: ${i.perturbation.name}]`;return e.jsxs("div",{className:"border p-4",children:[e.jsx("h3",{className:"text-xl mb-4",children:r(s)}),e.jsx("h3",{children:"Input"}),s.input.multimedia_content!==void 
0?e.jsx(We,{multimediaObject:s.input.multimedia_content}):s.input.text.includes('<br><img src="data:image;base64')?e.jsx("div",{dangerouslySetInnerHTML:{__html:s.input.text}}):e.jsx(z,{value:s.input.text}),e.jsx("div",{children:s.references&&s.references.length>0?e.jsx(qt,{references:s.references}):null}),e.jsx("div",{children:n&&t?e.jsx(Vt,{predictions:n,requests:t,metricFieldMap:a}):null})]})}function Qt({stat:s,metricFieldMap:t}){const n=`${s.name.split!==void 0?` on ${s.name.split}`:""}${s.name.sub_split!==void 0?`/${s.name.sub_split}`:""}${s.name.perturbation!==void 0?` with ${s.name.perturbation.name}`:" original"}`;return t[s.name.name]?e.jsxs("span",{title:t[s.name.name].description,children:[e.jsx("strong",{children:t[s.name.name].display_name||s.name.name}),n]}):e.jsxs("span",{children:[e.jsx("strong",{children:s.name.name}),n]})}function qe(){return window.RELEASE!==void 0?window.RELEASE:void 0}async function Kt(s){try{return await(await fetch(A(`/releases/${qe()}/runs_to_run_suites.json`),{signal:s})).json()}catch(t){return t instanceof Error&&t.name!=="AbortError"&&console.log(t),{}}}function Jt(s,t){return qe()?s[t]:window.SUITE}const se=10,te=50;function Xt(){const{runName:s}=Se(),[t,n]=Le(),[a,r]=l.useState(0),[i,c]=l.useState(),[o,m]=l.useState(),[p,E]=l.useState([]),[w,M]=l.useState([]),[I,P]=l.useState(),[u,H]=l.useState(),[R,K]=l.useState(1),[h,g]=l.useState(1),[d,j]=l.useState(1),[F,T]=l.useState(1),[S,Z]=l.useState(),[U,xs]=l.useState(),[ge,us]=l.useState({}),[pe,fs]=l.useState({}),[je,gs]=l.useState("");if(l.useEffect(()=>{const x=new AbortController;async function O(){const v=x.signal;if(s===void 0)return()=>x.abort();const L=window.SUITE?window.SUITE:Jt(await Kt(v),s);m(L);const[ee,be,we,bs,ws,vs,le]=await Promise.all([Ct(s,v,L),Rt(s,v,L),It(s,v,L),kt(s,v,L),Lt(s,v,L),St(s,v,L),C(v)]);c(ee),E(be);const ve=Math.ceil(be.length/se),Ns=Number(t.get("instancesPage")||1);g(ve),K(Math.max(Math.min(Ns,ve),1)),M(we),xs(bs);const 
Ne=Math.floor(we.length/te),As=Number(t.get("metricsPage")||1);T(Ne),j(Math.max(Math.min(As,Ne),1));const W={};vs.forEach(N=>{var J;const b=N.instance_id,_=((J=N.perturbation)==null?void 0:J.name)||"";W[b]===void 0&&(W[b]={}),W[b][_]===void 0&&(W[b][_]=[]),W[b][_].push(N)}),H(W);const q={};ws.forEach(N=>{var J;const b=N.instance_id,_=((J=N.perturbation)==null?void 0:J.name)||"";q[b]===void 0&&(q[b]={}),q[b][_]===void 0&&(q[b][_]=[]),q[b][_].push(N)}),P(q),fs(le.metrics.reduce((N,b)=>(N[b.name]=b,N),{})),us(le.adapter.reduce((N,b)=>(N[b.name]=b,N),{})),Z(le.models.find(N=>N.name===(ee==null?void 0:ee.adapter_spec.model)))}return O(),()=>x.abort()},[s,t]),i===void 0||I===void 0||u===void 0||U===void 0)return e.jsx(D,{});const ps=p.slice((R-1)*se,(R-1)*se+se),js=w.slice((d-1)*te,(d-1)*te+te);return e.jsxs(e.Fragment,{children:[e.jsx("div",{className:"flex justify-between gap-8 mb-12",children:e.jsxs("div",{children:[e.jsxs("h1",{className:"text-3xl flex items-center",children:[U.name,e.jsx("a",{href:"/#/groups/"+U.name,children:e.jsx(Ys,{className:"w-6 h-6 ml-2"})})]}),e.jsx("h3",{className:"text-xl",children:e.jsx(Q,{value:U.description})}),e.jsx("h1",{className:"text-3xl mt-2",children:i.adapter_spec.model}),e.jsx("h3",{className:"text-xl",children:e.jsx(Q,{value:(S==null?void 0:S.description)||""})}),e.jsx("div",{className:"mt-2 flex gap-2",children:U.tags.map(x=>e.jsx(Y,{size:"xs",color:"gray",children:e.jsx("span",{className:"text text-md",children:x})}))})]})}),e.jsxs(X,{children:[e.jsxs("div",{className:"flex justify-between",children:[e.jsx("h3",{className:"text-lg mb-1",children:"Adapter Specification"}),e.jsxs("div",{className:"flex gap-2",children:[e.jsx(Ks,{className:"w-6 h-6 mr-1 text text-primary"}),e.jsx("a",{className:"link link-primary link-hover",href:ze(i.name,o),download:"true",target:"_blank",children:"Spec JSON"}),e.jsx("a",{className:"link link-primary link-hover",href:Pt(i.name,o),download:"true",target:"_blank",children:"Full 
JSON"})]})]}),e.jsx("div",{children:e.jsx(me,{className:"grid md:grid-cols-2 lg:grid-cols-3 gap-x-8",children:Object.entries(i.adapter_spec).map(([x,O],v)=>e.jsxs(he,{className:v<3?"!border-0":"",children:[e.jsx("strong",{className:"mr-1",title:ge[x]?ge[x].description:void 0,children:`${x}: `}),e.jsx("span",{className:"overflow-x-auto",children:O})]}))})})]}),e.jsx("div",{className:"mt-16 mb-8",children:e.jsxs(ue,{children:[e.jsx(ae,{size:"lg",active:a===0,onClick:()=>r(0),children:"Instances + Predictions"}),e.jsx(ae,{size:"lg",active:a===1,onClick:()=>r(1),children:"All metrics"})]})}),a===0?e.jsxs(e.Fragment,{children:[e.jsx("div",{className:"grid gap-8",children:ps.map((x,O)=>{var v,L;return e.jsx(Gt,{instance:x,requests:u[x.id][((v=x.perturbation)==null?void 0:v.name)||""],predictions:I[x.id][((L=x.perturbation)==null?void 0:L.name)||""],metricFieldMap:pe},`${x.id}-${O}`)})}),e.jsx(de,{className:"flex justify-center my-8",onNextPage:()=>{const x=Math.min(R+1,h);K(x),t.set("instancesPage",String(x)),n(t)},onPrevPage:()=>{const x=Math.max(R-1,1);K(x),t.set("instancesPage",String(x)),n(t)},currentPage:R,totalPages:h})]}):e.jsxs("div",{children:[e.jsx("div",{className:"flex justify-start my-4",children:e.jsx("input",{type:"text",className:"input input-bordered w-full max-w-xs",placeholder:"Search for a metric",onChange:x=>gs(x.target.value)})}),e.jsx("div",{className:"overflow-x-auto",children:e.jsxs("table",{className:"table",children:[e.jsx("thead",{children:e.jsx("tr",{children:Object.keys(w[0]).map(x=>e.jsx("th",{children:x},x))})}),e.jsx("tbody",{children:js.filter(x=>!je||x.name.name.toLowerCase().includes(je.toLowerCase())).map(x=>e.jsx("tr",{children:Object.entries(x).map(([O,v])=>O==="name"?e.jsx("td",{children:e.jsx(Qt,{stat:x,metricFieldMap:pe})},O):e.jsx("td",{children:v}))}))})]})}),e.jsx(de,{className:"flex justify-center my-8",onNextPage:()=>{const x=Math.min(d+1,F);j(x),t.set("metricsPage",String(x)),n(t)},onPrevPage:()=>{const 
x=Math.max(d-1,1);j(x),t.set("metricsPage",String(x)),n(t)},currentPage:d,totalPages:F})]})]})}function Yt(){const[s,t]=l.useState(void 0),[n,a]=l.useState(void 0),[r,i]=l.useState(void 0);if(l.useEffect(()=>{const o=new AbortController;async function m(){const p=C(o.signal),E=Oe(o.signal),w=await p;t(w);const M=await E,I=[];M.forEach(P=>{P.rows.forEach(u=>{I.push({title:String(u[0].value),name:u[0].href.replace("?group=","")})})}),a(I)}return m(),()=>o.abort()},[]),s===void 0||n===void 0)return e.jsx(D,{});if(n.length===0)return e.jsxs(e.Fragment,{children:[e.jsx(k,{title:"HELM Leaderboard",subtitle:"The HELM leaderboard shows how the various models perform across different scenarios and metrics.",markdown:!0}),e.jsx("div",{className:"divider"}),e.jsx("p",{className:"text-center mt-8",children:"Group currently has no results."})]});const c=r!==void 0?r:n[0].name;return e.jsxs(e.Fragment,{children:[e.jsxs("div",{className:"flex flex-row justify-between",children:[e.jsx(k,{title:"HELM Leaderboard",subtitle:"The HELM leaderboard shows how the various models perform across different scenarios and metrics.",markdown:!0}),e.jsxs("div",{className:"w-64 pt-8",children:[e.jsx("label",{htmlFor:"group",className:"block text-sm font-medium text-gray-700",children:"Select a group:"}),e.jsx("select",{id:"group",name:"group",onChange:o=>i(o.target.value),className:"mt-1 block w-full pl-3 pr-10 py-2 text-base border-gray-300 focus:outline-none focus:ring focus:border-blue-300 rounded-md",children:n.map((o,m)=>e.jsx("option",{value:o.name,children:o.title},m))})]})]}),e.jsx(_e,{schema:s,runGroupName:c},c)]})}const $t=""+new URL("instruct-flowchart-48854f7c.svg",import.meta.url).href,Zt=""+new URL("instruct-graph-0a57d7d2.svg",import.meta.url).href;function en(){return e.jsxs("div",{className:"container mx-auto px-16",children:[e.jsx("h1",{className:"text-3xl mt-16 font-bold text-center",children:"HELM Instruct: A Multidimensional Instruction Following Evaluation Framework with 
Absolute Ratings"}),e.jsxs("div",{className:"flex flex-col sm:flex-row justify-center gap-2 sm:gap-8 md:gap-32 my-8",children:[e.jsx("a",{className:"px-10 btn rounded-md",href:"https://crfm.stanford.edu/2024/02/18/helm-instruct.html",children:"Blog Post"}),e.jsx("a",{className:"px-10 btn rounded-md",href:"https://github.com/stanford-crfm/helm",children:"Github"})]}),e.jsxs("p",{children:["We introduce ",e.jsx("em",{children:"HELM Instruct"}),", a multidimensional evaluation framework for instruction-following LLMs with absolute ratings. The framework takes an instruction, a model, an evaluator, and a criterion to generate a score. In our study, we use HELM Instruct to compare 4 instruction-following models on 7 scenarios based on 4 Human/LM evaluators and 5 criteria. Check out the blog post for more details."]}),e.jsxs("div",{className:"grid my-16 grid-cols-1 md:mx-32 md:grid-cols-2 md:gap-2",children:[e.jsx("img",{src:$t,alt:"Evaluation flowchart",className:"mx-auto block",sizes:"100vw"}),e.jsx("img",{src:Zt,alt:"7 scenarios, 4 models, 4 evaluators and 5 criteria",className:"mx-auto block",sizes:"100vw"})]}),e.jsxs("table",{className:"rounded-lg shadow-md 
table",children:[e.jsx("thead",{children:e.jsxs("tr",{children:[e.jsx("th",{children:"Model"}),e.jsx("th",{children:"Average"}),e.jsx("th",{children:"Helpfulness"}),e.jsx("th",{children:"Understandability"}),e.jsx("th",{children:"Completeness"}),e.jsx("th",{children:"Conciseness"}),e.jsx("th",{children:"Harmlessness"})]})}),e.jsxs("tbody",{children:[e.jsxs("tr",{children:[e.jsx("td",{children:"openai_gpt-4-0314"}),e.jsx("td",{children:"4.63"}),e.jsx("td",{children:"4.42"}),e.jsx("td",{children:"4.85"}),e.jsx("td",{children:"4.50"}),e.jsx("td",{children:"4.42"}),e.jsx("td",{children:"4.95"})]}),e.jsxs("tr",{children:[e.jsx("td",{children:"openai_gpt-3.5-turbo-0613"}),e.jsx("td",{children:"4.60"}),e.jsx("td",{children:"4.34"}),e.jsx("td",{children:"4.86"}),e.jsx("td",{children:"4.42"}),e.jsx("td",{children:"4.41"}),e.jsx("td",{children:"4.97"})]}),e.jsxs("tr",{children:[e.jsx("td",{children:"anthropic_claude-v1.3"}),e.jsx("td",{children:"4.56"}),e.jsx("td",{children:"4.25"}),e.jsx("td",{children:"4.87"}),e.jsx("td",{children:"4.32"}),e.jsx("td",{children:"4.40"}),e.jsx("td",{children:"4.97"})]}),e.jsxs("tr",{children:[e.jsx("td",{children:"cohere_command-xlarge-beta"}),e.jsx("td",{children:"4.31"}),e.jsx("td",{children:"3.90"}),e.jsx("td",{children:"4.73"}),e.jsx("td",{children:"3.88"}),e.jsx("td",{children:"4.31"}),e.jsx("td",{children:"4.72"})]})]})]})]})}function Ge({models:s}){return e.jsxs("section",{children:[e.jsxs("h3",{className:"text-3xl",children:[s.length," models"]}),e.jsx("ul",{children:s.map((t,n)=>t.todo?e.jsxs("li",{className:"text-slate-300 mt-1",children:[t.creator_organization," / ",t.display_name]},n):e.jsx(f,{to:"models",children:e.jsxs("li",{className:"text-black mt-1",children:[t.creator_organization," / ",t.display_name]},n)}))})]})}function Qe({runGroups:s}){const t=new Map(s.filter(r=>r.metric_groups!==void 0&&(r.subgroups===void 0||r.subgroups.length===0)).map(r=>[r.name,r])),n=new Set,a=[];return s.forEach(r=>{const 
i=r.subgroups?r.subgroups:[],c=[];i.forEach(o=>{const m=t.get(o);m&&(c.push(m),n.add(m.name))}),c.length>0&&a.push([r,c])}),e.jsxs("section",{children:[e.jsxs("h3",{className:"text-3xl",children:[n.size," scenarios"]}),e.jsx("ul",{children:a.map(([r,i])=>e.jsxs("li",{className:"my-3",children:[e.jsx(f,{className:"text-black",to:"groups/"+r.name,children:e.jsx("h2",{children:r.display_name})}),e.jsx("ul",{className:"list-disc list-inside",children:i.map(c=>c.todo?e.jsx("li",{className:`${c.todo?"ml-4 text-slate-300":"ml-4"}`,children:c.display_name},c.name):e.jsx(f,{className:"text-black",to:"groups/"+c.name,children:e.jsx("li",{className:`${c.todo?"ml-4 text-slate-300":"ml-4"}`,children:c.display_name},c.name)}))})]},r.name))})]})}const Ke=""+new URL("helmhero-28e90f4d.png",import.meta.url).href;function y({runGroupName:s=void 0,tableIndexToDisplay:t=0,numRowsToDisplay:n=10,sortColumnIndex:a=1}){const[r,i]=l.useState(void 0),[c,o]=l.useState(void 0);return l.useEffect(()=>{const m=new AbortController;async function p(){const E=await C(m.signal);i(E);const w=E.run_groups;if(w.length===0)return;const M=s||w[0].name,I=await Fe(M,m.signal);o(I[t])}return p(),()=>m.abort()},[s,t]),r===void 0||c===void 0?e.jsx(D,{}):e.jsx("div",{className:"rounded-2xl overflow-hidden border-2 bg-white p-1 mx-2 my-0 overflow-x-auto",style:{overflow:"auto",justifyContent:"space-between"},children:e.jsx(fe,{schema:r,groupTable:c,numRowsToDisplay:n,sortColumnIndex:a,displayColumnIndexes:[0,1],sortable:!1,miniStyle:!0})})}function sn(){return e.jsxs("div",{className:"flex flex-col px-4 sm:px-6 py-100 sm:py-10 sm:mb-96 md:mb-96 lg:mb-0 xl:mb-0 2xl:mb-0",children:[e.jsx("div",{className:"flex flex-col text-center mb-10 justify-start",children:e.jsx("h1",{className:"text-3xl sm:text-4xl mb-3 sm:mb-4 mx-2 mt-2",children:e.jsx("strong",{children:"A holistic framework for evaluating foundation models."})})}),e.jsxs("div",{className:"flex flex-col md:flex-col lg:flex-row 
lg:justify-center",style:{height:"525px",transform:"scale(0.9)"},children:[e.jsx("div",{className:"w-full lg:w-1/2 flex justify-center mb-4 lg:mb-0 h-full py-10",children:e.jsx("img",{src:Ke,alt:"HELM Hero",className:"object-cover h-full",style:{maxWidth:"100%"}})}),e.jsx("div",{className:"w-full lg:w-1/2 flex justify-center h-full py-10",children:e.jsxs("div",{className:"py-2 pb-6 rounded-3xl bg-gray-100 h-full",style:{maxWidth:"100%"},children:[e.jsx(y,{}),e.jsx("div",{className:"flex justify-end",children:e.jsx(f,{to:"leaderboard",children:e.jsx("button",{className:"px-4 mx-3 mt-1 btn bg-white rounded-md",children:e.jsx("span",{children:"See More"})})})})]})})]})]})}/* Partner logo URLs, resolved relative to this module. `ds` is an empty string, so the logo list below contains one entry with an empty src. */const Je=""+new URL("ai21-0eb91ec3.png",import.meta.url).href,Xe=""+new URL("aleph-alpha-7ce10034.png",import.meta.url).href,Ye=""+new URL("anthropic-70d8bc39.png",import.meta.url).href,$e=""+new URL("bigscience-7f0400c0.png",import.meta.url).href,Ze=""+new URL("cohere-3550c6cb.png",import.meta.url).href,es=""+new URL("eleutherai-b9451114.png",import.meta.url).href,ss=""+new URL("google-06d997ad.png",import.meta.url).href,ts=""+new URL("meta-5580e9f1.png",import.meta.url).href,ns=""+new URL("microsoft-f5ee5016.png",import.meta.url).href,as=""+new URL("mistral-18e1be23.png",import.meta.url).href,rs=""+new URL("nvidia-86fa75c1.png",import.meta.url).href,ls=""+new URL("openai-3f8653e4.png",import.meta.url).href,is=""+new URL("tii-24de195c.png",import.meta.url).href,cs=""+new URL("together-a665a35b.png",import.meta.url).href,os=""+new URL("tsinghua-keg-97d4b395.png",import.meta.url).href,ds="",ms=""+new URL("yandex-38e09d70.png",import.meta.url).href,hs=""+new URL("01-694cb9b7.png",import.meta.url).href,tn=[Je,Xe,Ye,$e,Ze,es,ss,ts,ns,as,rs,ls,is,cs,os,ds,ms,hs];/* Default landing page: loads the schema via C(signal) on mount (aborted on unmount) and renders nothing until it is set; then shows the sn header, the "Our Partners" logo grid built from tn, and the Ge (models) and Qe (scenarios) sections. */function Ie(){const[s,t]=l.useState(void 0);return l.useEffect(()=>{const n=new AbortController;async function a(){const r=await C(n.signal);t(r)}return 
a(),()=>n.abort()},[]),s?e.jsxs(e.Fragment,{children:[e.jsx(sn,{}),e.jsxs("div",{className:"mx-auto text-lg px-16",children:[e.jsxs("div",{className:"container mb-12 mx-auto text-lg px-16",children:[e.jsxs("div",{className:"flex flex-col sm:flex-row justify-center mt-10 mb-10 flex gap-2 sm:gap-8 md:gap-32",children:[" ",e.jsx("h1",{className:"text-4xl mx-4 mt-40",children:e.jsx("strong",{children:"Our Partners"})})]}),e.jsx("ol",{className:"my-8 flex flex-col gap-32",children:e.jsx("li",{children:e.jsx("div",{className:"flex flex-wrap justify-center max-w-[1100px] mx-auto w-auto",children:tn.map((n,a)=>e.jsx("div",{className:"w-24 h-24 flex items-center m-6",children:e.jsx("img",{src:n,alt:"Logo",className:"mx-auto block",sizes:"100vw"})},a))})})})]}),e.jsx("div",{className:"container mx-auto",children:e.jsxs("div",{className:"grid grid-cols-1 sm:grid-cols-2 gap-8",children:[e.jsx(Ge,{models:s.models}),e.jsx(Qe,{runGroups:s.run_groups})]})})]})]}):null}/* MMLU landing page: static description, links to the blog post and to "#/leaderboard", and the y leaderboard preview with its default props. */function nn(){return e.jsxs("div",{className:"container mx-auto px-16",children:[e.jsx("h1",{className:"text-3xl mt-16 my-8 font-bold text-center",children:"Massive Multitask Language Understanding (MMLU) on HELM"}),e.jsxs("div",{className:"flex flex-col lg:flex-row items-center gap-8",children:[e.jsxs("div",{className:"flex-1 text-xl",children:[e.jsxs("p",{children:[e.jsx("strong",{children:"Massive Multitask Language Understanding (MMLU)"})," ",e.jsx("a",{href:"https://arxiv.org/pdf/2009.03300.pdf",className:"link",children:"(Hendrycks et al, 2020)"})," ","is a multiple-choice question answering test that covers 57 tasks including elementary mathematics, US history, computer science, law, and more. We publish evaluation results from evaluating various models on MMLU using HELM. 
Our evaluation results include the following:"]}),e.jsxs("ul",{className:"my-2 list-disc list-inside",children:[e.jsx("li",{children:"Simple, standardized prompts"}),e.jsx("li",{children:"Accuracy breakdown for each of the 57 subjects"}),e.jsx("li",{children:"Full transparency of all raw prompts and predictions"})]}),e.jsxs("div",{className:"flex flex-row justify-center mt-4",children:[e.jsx("a",{className:"px-10 btn rounded-md mx-4",href:"https://crfm.stanford.edu/2024/05/01/helm-mmlu.html",children:"Blog Post"}),e.jsx("a",{className:"px-10 btn rounded-md mx-4",href:"#/leaderboard",children:"Full Leaderboard"})]})]}),e.jsx("div",{className:"flex-1",children:e.jsx(y,{})})]})]})}/* Overview image URL for the AIR-Bench page. */const an=""+new URL("air-overview-d2e6c49f.png",import.meta.url).href;/* AIR-Bench 2024 landing page: the small-caps benchmark name element `t` is built once and reused in the heading and throughout the paragraph; also shows the overview image, paper and leaderboard links, and the y leaderboard preview. */function rn(){const s={fontVariant:"small-caps",fontWeight:"bold"},t=e.jsx("span",{style:s,children:"AIR-Bench 2024"});return e.jsxs("div",{className:"container mx-auto px-16",children:[e.jsx("h1",{className:"text-3xl mt-16 my-8 font-bold text-center",children:t}),e.jsxs("div",{className:"flex flex-col lg:flex-row gap-8",children:[e.jsxs("div",{className:"flex-1 text-l",children:[e.jsx("img",{src:an,alt:"AIR 2024 Categories",className:"mx-auto my-4 block w-3/4",sizes:"100vw"}),e.jsxs("p",{children:["We introduce ",t,", the first AI safety benchmark aligned with emerging government regulations and company policies, following the regulation-based safety categories grounded in our AI Risks study, AIR 2024. AIR 2024 decomposes 8 government regulations and 16 company policies into a four-tiered safety taxonomy with 314 granular risk categories in the lowest tier. ",t," contains 5,694 diverse prompts spanning these categories, with manual curation and human auditing to ensure quality. We evaluate leading language models on ",t,", uncovering insights into their alignment with specified safety concerns. 
By bridging the gap between public benchmarks and practical AI risks, ",t," ","provides a foundation for assessing model safety across jurisdictions, fostering the development of safer and more responsible AI systems."]}),e.jsxs("div",{className:"flex flex-row justify-center mt-4",children:[e.jsx("a",{className:"px-10 btn rounded-md mx-4",href:"https://arxiv.org/abs/2407.17436",children:"Paper"}),e.jsx("a",{className:"px-10 btn rounded-md mx-4",href:"#/leaderboard",children:"Full Leaderboard"})]})]}),e.jsxs("div",{className:"py-2 pb-6 rounded-3xl bg-gray-100 h-full",style:{maxWidth:"100%"},children:[e.jsx(y,{}),e.jsx("div",{className:"flex justify-end",children:e.jsx(f,{to:"leaderboard",children:e.jsx("button",{className:"px-4 mx-3 mt-1 btn bg-white rounded-md",children:e.jsx("span",{children:"See More"})})})})]})]})]})}/* Partner logo URLs for the ThaiExam page (ln: SCB 10X, cn: SCBX, per the links they are rendered inside). */const ln=""+new URL("scb10x-204bd786.png",import.meta.url).href,cn=""+new URL("scbx-71e53e72.jpg",import.meta.url).href;/* ThaiExam landing page: partner logos and links, static description, a "Full Leaderboard" link and the y leaderboard preview. */function on(){return e.jsxs("div",{className:"container mx-auto px-16",children:[e.jsx("h1",{className:"text-3xl my-8 font-bold text-center",children:"ThaiExam"}),e.jsxs("div",{className:"flex flex-col lg:flex-row gap-8",children:[e.jsxs("div",{className:"flex-1 text-l",children:[e.jsxs("div",{className:"text-center",children:[e.jsx("a",{href:"https://scbx.com/",children:e.jsx("img",{src:cn,alt:"Logo",className:"inline h-32 mx-4 my-4"})}),e.jsx("a",{href:"https://scb10x.com/",children:e.jsx("img",{src:ln,alt:"Logo",className:"inline h-32 mx-4 my-4"})})]}),e.jsxs("p",{children:["In collaboration with"," ",e.jsx("a",{href:"https://www.scbx.com/",className:"font-bold underline text-blue-600 hover:text-blue-800 visited:text-purple-600",children:"SCBX"})," ","and"," ",e.jsx("a",{href:"https://www.scb10x.com/",className:"font-bold underline text-blue-600 hover:text-blue-800 visited:text-purple-600",children:"SCB 10X"}),", we introduce the ThaiExam HELM leaderboard. 
ThaiExam is a Thai language benchmark based on examinations for high school students and investment professionals in Thailand. The ThaiExam leaderboard is the first public leaderboard for large language models on Thai language scenarios, and features evaluations of leading language models. Like all other HELM leaderboards, the ThaiExam leaderboard provides full prompt-level transparency, and the results can be fully reproduced using the HELM framework. We hope that this leaderboard will encourage further work in multilingual language model evaluation."]}),e.jsx("div",{className:"flex flex-row justify-center my-4",children:e.jsx(f,{to:"leaderboard",className:"px-10 btn rounded-md mx-4",children:"Full Leaderboard"})})]}),e.jsxs("div",{className:"py-2 pb-6 rounded-3xl bg-gray-100 h-full",style:{maxWidth:"100%"},children:[e.jsx(y,{}),e.jsx("div",{className:"flex justify-end",children:e.jsx(f,{to:"leaderboard",children:e.jsx("button",{className:"px-4 mx-3 mt-1 btn bg-white rounded-md",children:e.jsx("span",{children:"See More"})})})})]})]})]})}/* Partner logo URL for the HELM Finance page. */const dn=""+new URL("wellsfargo-a86a6c4a.png",import.meta.url).href;/* HELM Finance landing page (continues on the following source line). */function mn(){return e.jsxs("div",{className:"container mx-auto px-16",children:[e.jsx("h1",{className:"text-3xl my-8 font-bold text-center",children:"HELM Finance"}),e.jsxs("div",{className:"flex flex-col lg:flex-row gap-8",children:[e.jsxs("div",{className:"flex-1 text-l",children:[e.jsx("div",{children:e.jsx("a",{href:"https://wellsfargo.com/",children:e.jsx("img",{src:dn,alt:"Logo",className:"mx-auto block my-4 w-48"})})}),e.jsxs("p",{children:["In collaboration with"," ",e.jsx("a",{href:"https://www.wellsfargo.com/",className:"font-bold underline text-blue-600 hover:text-blue-800 visited:text-purple-600",children:"Wells Fargo"}),", we introduce the ",e.jsx("span",{className:"font-bold",children:"HELM Finance"})," ","leaderboard for ecologically-valid evaluations of leading language models in the financial domain. 
The leaderboard evaluates the ability of language models to perform tasks from financial professions on publicly financial documents across a range of scenarios."]}),e.jsx("div",{className:"flex flex-row justify-center my-4",children:e.jsx(f,{to:"leaderboard",className:"px-10 btn rounded-md mx-4",children:"Full Leaderboard"})})]}),e.jsxs("div",{className:"py-2 pb-6 rounded-3xl bg-gray-100 h-full",style:{maxWidth:"100%"},children:[e.jsx(y,{}),e.jsx("div",{className:"flex justify-end",children:e.jsx(f,{to:"leaderboard",children:e.jsx("button",{className:"px-4 mx-3 mt-1 btn bg-white rounded-md",children:e.jsx("span",{children:"See More"})})})})]})]})]})}const hn=""+new URL("heim-logo-3e5e3aa4.png",import.meta.url).href;function xn({metricFieldMap:s,metricGroups:t}){const n=new Set,a=[];return t.forEach(r=>{const i=[];r.metrics.forEach(c=>{const o=s[c.name];o&&(i.push(o),n.add(o.name))}),i.length>0&&a.push([r,i])}),e.jsxs("section",{children:[e.jsxs("h3",{className:"text-3xl",children:[n.size," metrics"]}),e.jsx("ul",{children:a.map(([r,i])=>e.jsxs("li",{className:"my-3",children:[e.jsx("h4",{children:r.display_name}),e.jsx("ul",{className:"list-disc list-inside",children:i.map(c=>e.jsx("li",{className:"ml-4",children:c.display_name},c.name))})]},r.name))})]})}function un(){const[s,t]=l.useState(void 0);l.useEffect(()=>{const a=new AbortController;async function r(){const i=await C(a.signal);t(i)}return r(),()=>a.abort()},[]);const n=s?s.metrics.reduce((a,r)=>(a[r.name]=r,a),{}):void 0;return e.jsxs("div",{className:"container mx-auto px-16 text-base",children:[e.jsx("div",{className:"container max-w-screen-lg mx-auto",children:e.jsx("img",{className:"mx-auto w-96",src:hn,alt:"HEIM Logo"})}),e.jsx("h1",{className:"text-3xl my-8 font-bold text-center",children:"Holistic Evaluation of Text-To-Image Models"}),e.jsxs("div",{className:"flex flex-col sm:flex-row justify-center gap-2 sm:gap-8 md:gap-32 my-8",children:[e.jsx("a",{className:"px-10 btn 
rounded-md",href:"https://arxiv.org/abs/2311.04287",children:"Paper"}),e.jsx("a",{className:"px-10 btn rounded-md",href:"https://github.com/stanford-crfm/helm",children:"Github"})]}),e.jsxs("p",{className:"my-2",children:["Significant effort has recently been made in developing text-to-image generation models, which take textual prompts as input and generate images. As these models are widely used in real-world applications, there is an urgent need to comprehensively understand their capabilities and risks. However, existing evaluations primarily focus on image-text alignment and image quality. To address this limitation, we introduce a new benchmark,"," ",e.jsx("strong",{children:"Holistic Evaluation of Text-To-Image Models (HEIM)"}),"."]}),e.jsx("p",{className:"my-2",children:"We identify 12 different aspects that are important in real-world model deployment, including:"}),e.jsxs("ul",{className:"my-2 list-disc list-inside unreset",children:[e.jsx("li",{children:"image-text alignment"}),e.jsx("li",{children:"image quality"}),e.jsx("li",{children:"aesthetics"}),e.jsx("li",{children:"originality"}),e.jsx("li",{children:"reasoning"}),e.jsx("li",{children:"knowledge"}),e.jsx("li",{children:"bias"}),e.jsx("li",{children:"toxicity"}),e.jsx("li",{children:"fairness"}),e.jsx("li",{children:"robustness"}),e.jsx("li",{children:"multilinguality"}),e.jsx("li",{children:"efficiency"})]}),e.jsx("p",{className:"my-2",children:"By curating scenarios encompassing these aspects, we evaluate state-of-the-art text-to-image models using this benchmark. Unlike previous evaluations that focused on alignment and quality, HEIM significantly improves coverage by evaluating all models across all aspects. 
Our results reveal that no single model excels in all aspects, with different models demonstrating strengths in different aspects."}),e.jsx("p",{className:"my-2",children:"For full transparency, this website contains all the prompts, generated images and the results for the automated and human evaluation metrics."}),e.jsx("p",{className:"my-2",children:"Inspired by HELM, we decompose the model evaluation into four key components: aspect, scenario, adaptation, and metric:"}),e.jsx("div",{className:"container max-w-screen-lg mx-auto my-8",children:e.jsx("img",{src:"https://crfm.stanford.edu/heim/latest/images/heim-main.png",alt:"HEIM scenarios, prompts, images and metrics"})}),s&&n?e.jsxs("div",{className:"grid grid-cols-1 md:grid-cols-3 gap-8",children:[e.jsx(Ge,{models:s.models}),e.jsx(Qe,{runGroups:s.run_groups}),e.jsx(xn,{metricFieldMap:n,metricGroups:s.metric_groups})]}):null]})}const fn=""+new URL("vhelm-framework-a1ca3f3f.png",import.meta.url).href,gn=""+new URL("vhelm-model-8afb7616.png",import.meta.url).href,pn=""+new URL("vhelm-aspects-1437d673.png",import.meta.url).href;function jn(){return e.jsxs("div",{className:"container mx-auto px-16",children:[e.jsx("h1",{className:"text-3xl mt-16 my-8 font-bold text-center",children:"Holistic Evaluation of Vision-Language Models"}),e.jsxs("div",{className:"flex flex-col sm:flex-row justify-center gap-2 sm:gap-8 md:gap-32 my-8",children:[e.jsx("a",{className:"px-10 btn rounded-md",href:"https://arxiv.org/abs/2311.04287",children:"Paper"}),e.jsx("a",{className:"px-10 btn rounded-md",href:"https://github.com/stanford-crfm/helm",children:"Github"})]}),e.jsxs("p",{className:"my-4",children:["Current benchmarks for assessing vision-language models (VLMs) often focus on their perception or problem-solving capabilities and neglect other critical aspects such as fairness, multilinguality, or toxicity. 
Furthermore, they differ in their evaluation procedures and the scope of the evaluation, making it difficult to compare models. To address these issues, we extend the HELM framework to VLMs to present the Holistic Evaluation of Vision Language Models (VHELM). To address these issues, we introduce VHELM, built on HELM for language models. VHELM aggregates various datasets to cover one or more of the 9 aspects:"," ",e.jsx("b",{children:"visual perception"}),", ",e.jsx("b",{children:"bias"}),", ",e.jsx("b",{children:"fairness"}),", ",e.jsx("b",{children:"knowledge"}),", ",e.jsx("b",{children:"multilinguality"}),", ",e.jsx("b",{children:"reasoning"}),", ",e.jsx("b",{children:"robustness"}),","," ",e.jsx("b",{children:"safety"}),", and ",e.jsx("b",{children:"toxicity"}),". In doing so, we produce a comprehensive, multi-dimensional view of the capabilities of the VLMs across these important factors. In addition, we standardize the standard inference parameters, methods of prompting, and evaluation metrics to enable fair comparisons across models. Our framework is designed to be lightweight and automatic so that evaluation runs are cheap and fast. For transparency, we release the raw model generations and complete results on this website."]}),e.jsx("p",{className:"my-4 font-bold",children:"VHELM is intended to be a living benchmark. 
We hope to continue adding new datasets, models and metrics over time, so please stay tuned!"}),e.jsxs("div",{className:"my-16 flex flex-col lg:flex-row items-center gap-8",children:[e.jsxs("div",{className:"flex-1 text-xl",children:[e.jsx("img",{src:gn,alt:"A vision-lanuage model (VLM) takes in an image and a text prompt and generates text.",className:""}),e.jsx("img",{src:fn,alt:"An example of an evaluation for an Aspect (Knowledge) - a Scenario (MMMU) undergoes Adaptation (multimodal multiple choice) for a Model (GPT-4 Omni), then Metrics (Exact match) are computed",className:""})]}),e.jsxs("div",{className:"flex-1",children:[e.jsx(y,{}),e.jsx(f,{to:"leaderboard",className:"px-4 mx-3 mt-1 btn bg-white rounded-md",children:e.jsx("span",{children:"See More"})})]})]}),e.jsx("div",{className:"container max-w-screen-lg mx-auto my-8",children:e.jsx("img",{src:pn,alt:"An example of each aspect in VHELM: Visual Perception, Bias, Fairness, Knowledge, Multilinguality, Reasoning, Robustness, Toxicity Mitigation and Safety. 
",className:""})})]})}const bn=""+new URL("accenture-6f97eeda.png",import.meta.url).href,wn=""+new URL("cresta-9e22b983.png",import.meta.url).href;function vn(){return e.jsxs("div",{className:"container mx-auto px-16",children:[e.jsx("h1",{className:"text-3xl my-8 font-bold text-center",children:"HELM Call Center"}),e.jsxs("div",{className:"flex flex-col lg:flex-row gap-8",children:[e.jsxs("div",{className:"flex-1 text-l",children:[e.jsxs("div",{className:"text-center",children:[e.jsx("a",{href:"https://www.accenture.com/",children:e.jsx("img",{src:bn,alt:"Logo",className:"inline h-12 mx-4 my-4"})}),e.jsx("a",{href:"https://www.cresta.com/",children:e.jsx("img",{src:wn,alt:"Logo",className:"inline h-8 mx-4 my-4"})})]}),e.jsxs("p",{children:["In collaboration with"," ",e.jsx("a",{href:"https://www.accenture.com/",className:"font-bold underline text-blue-600 hover:text-blue-800 visited:text-purple-600",children:"Accenture"})," ","and"," ",e.jsx("a",{href:"https://www.cresta.com/",className:"font-bold underline text-blue-600 hover:text-blue-800 visited:text-purple-600",children:"Cresta"}),", we introduce the HELM"," ",e.jsx("span",{className:"font-bold",children:"Call Center"})," leaderboard. 
HELM Call Center is a leaderboard consisting of evaluations of leading language models on scenarios with realistic tasks from the call center context."]}),e.jsx("div",{className:"flex flex-row justify-center my-4",children:e.jsx(f,{to:"leaderboard",className:"px-10 btn rounded-md mx-4",children:"Full Leaderboard"})})]}),e.jsxs("div",{className:"py-2 pb-6 rounded-3xl bg-gray-100 h-full",style:{maxWidth:"100%"},children:[e.jsx(y,{}),e.jsx("div",{className:"flex justify-end",children:e.jsx(f,{to:"leaderboard",children:e.jsx("button",{className:"px-4 mx-3 mt-1 btn bg-white rounded-md",children:e.jsx("span",{children:"See More"})})})})]})]})]})}const Nn=""+new URL("cuhk-8c5631e9.png",import.meta.url).href;function An(){return e.jsxs("div",{className:"container mx-auto px-16",children:[e.jsx("h1",{className:"text-3xl my-8 font-bold text-center",children:"Chinese Language Models EVAluation Platform (CLEVA)"}),e.jsxs("div",{className:"flex flex-col lg:flex-row gap-8",children:[e.jsxs("div",{className:"flex-1 text-l",children:[e.jsx("div",{className:"text-center",children:e.jsx("a",{href:"https://www.cuhk.edu.hk/",children:e.jsx("img",{src:Nn,alt:"Logo",className:"inline h-12 mx-4 my-4"})})}),e.jsxs("p",{children:["In collaboration with the"," ",e.jsx("a",{href:"https://lwwangcse.github.io/",className:"font-bold underline text-blue-600 hover:text-blue-800 visited:text-purple-600",children:"LaVi Lab"})," ","team from"," ",e.jsx("a",{href:"https://www.cuhk.edu.hk/",className:"font-bold underline text-blue-600 hover:text-blue-800 visited:text-purple-600",children:"The Chinese University of Hong Kong (CUHK)"}),", we introduce the"," ",e.jsx("strong",{className:"font-bold",children:"Chinese Language Models EVAluation Platform (CLEVA)"})," ","leaderboard on HELM. 
CLEVA is a comprehensive Chinese-language benchmark for holistic evaluation of Chinese-language LLMs, and employs a standardized workflow to assess LLMs' performance across various dimensions."]}),e.jsxs("div",{className:"flex flex-row justify-center mt-4",children:[e.jsx("a",{className:"px-10 btn rounded-md mx-4",href:"https://arxiv.org/abs/2308.04813",children:"Paper"}),e.jsx("a",{className:"px-10 btn rounded-md mx-4",href:"#/leaderboard",children:"Full Leaderboard"})]})]}),e.jsxs("div",{className:"py-2 pb-6 rounded-3xl bg-gray-100 h-full",style:{maxWidth:"100%"},children:[e.jsx(y,{}),e.jsx("div",{className:"flex justify-end",children:e.jsx(f,{to:"leaderboard",children:e.jsx("button",{className:"px-4 mx-3 mt-1 btn bg-white rounded-md",children:e.jsx("span",{children:"See More"})})})})]})]})]})}const yn="";function En(){return e.jsxs("div",{className:"container mx-auto px-16",children:[e.jsx("h1",{className:"text-3xl my-8 font-bold text-center",children:"HELM Tables"}),e.jsxs("div",{className:"flex flex-col lg:flex-row gap-8",children:[e.jsxs("div",{className:"flex-1 text-l",children:[e.jsx("div",{className:"text-center",children:e.jsx("a",{href:"https://www.ibm.com/",children:e.jsx("img",{src:yn,alt:"Logo",className:"inline h-12 mx-4 my-4"})})}),e.jsxs("p",{children:["In collaboration with"," ",e.jsx("a",{href:"https://research.ibm.com/",className:"font-bold underline text-blue-600 hover:text-blue-800 visited:text-purple-600",children:"IBM Research"}),", we introduce the"," ",e.jsx("strong",{className:"font-bold",children:"HELM Tables"})," leaderboard on HELM. 
",e.jsx("strong",{className:"font-bold",children:"HELM Tables"})," is a holistic evaluation of leading language models that tests their capability to understand, process and analyze structured tabular input data."]}),e.jsx("div",{className:"flex flex-row justify-center my-4",children:e.jsx(f,{to:"leaderboard",className:"px-10 btn rounded-md mx-4",children:"Full Leaderboard"})})]}),e.jsxs("div",{className:"py-2 pb-6 rounded-3xl bg-gray-100 h-full",style:{maxWidth:"100%"},children:[e.jsx(y,{}),e.jsx("div",{className:"flex justify-end",children:e.jsx(f,{to:"leaderboard",children:e.jsx("button",{className:"px-4 mx-3 mt-1 btn bg-white rounded-md",children:e.jsx("span",{children:"See More"})})})})]})]})]})}const Mn=({id:s,title:t,text:n})=>((t==="Classic"||t==="Lite"||t==="Instruct")&&(t="HELM "+t),e.jsx("div",{className:"max-w-sm rounded overflow-hidden bg-gray-100 hover:scale-105 transition-transform duration-300",children:e.jsx("a",{href:xe(void 0,s),children:e.jsxs("div",{className:"px-6 py-4",children:[e.jsxs("div",{className:"font-bold text-xl mb-2",children:[e.jsx("div",{className:"py-3",children:e.jsx("svg",{fill:"#000000",width:"20px",height:"20px",viewBox:"0 0 24 24",xmlns:"http://www.w3.org/2000/svg",children:e.jsx("path",{d:"M22,7H16.333V4a1,1,0,0,0-1-1H8.667a1,1,0,0,0-1,1v7H2a1,1,0,0,0-1,1v8a1,1,0,0,0,1,1H22a1,1,0,0,0,1-1V8A1,1,0,0,0,22,7ZM7.667,19H3V13H7.667Zm6.666,0H9.667V5h4.666ZM21,19H16.333V9H21Z"})})}),t+" →"]}),e.jsx("p",{className:"text-gray-700 text-base",children:n})]})})}));function Rn(){const[s,t]=l.useState();return l.useEffect(()=>{fetch("https://raw.githubusercontent.com/stanford-crfm/helm/main/helm-frontend/project_metadata.json").then(n=>n.json()).then(n=>{t(n)}).catch(n=>{console.error("Error fetching JSON:",n)})},[]),e.jsx("div",{className:"p-10 mb-20",children:e.jsx("div",{className:"grid grid-cols-2 lg:grid-cols-3 gap-4",children:s&&s.map((n,a)=>n.id==="home"?null:e.jsx(Mn,{id:n.id,title:n.title,text:n.description},a))})})}function 
In(){return e.jsxs("div",{className:"flex flex-col md:flex-row px-6 py-32",children:[e.jsxs("div",{className:"flex-1 p-4 flex flex-col justify-center",children:[e.jsx("div",{className:"flex justify-start",children:e.jsxs("div",{children:[e.jsx("h1",{className:"text-4xl mb-4 mx-4 mt-2",children:e.jsx("strong",{children:"A reproducible and transparent framework for evaluating foundation models."})}),e.jsx("h3",{className:`text-xl
10
+ mb-4 mx-4 mt-2`,children:"Find leaderboards with many scenarios, metrics, and models with support for multimodality and model-graded evaluation."})]})}),e.jsxs("div",{className:"flex flex-col md:flex-row justify-start mt-6 ml-4",children:[e.jsx("button",{className:"px-6 btn btn-grey rounded-md mb-4 md:mb-0",onClick:()=>window.scrollTo({top:760,behavior:"smooth"}),children:e.jsx("div",{children:"Leaderboards ↓"})}),e.jsx("button",{className:"px-6 btn btn-grey rounded-md md:ml-4",children:e.jsx("a",{href:"https://github.com/stanford-crfm/helm",children:"Github"})})]})]}),e.jsx("div",{className:"mx-4 mt-6 md:mt-0 md:w-1/3",children:e.jsx("img",{src:Ke,alt:"HELM Hero",className:"object-cover w-full h-full"})})]})}const Sn=""+new URL("aisingapore-6dfc9acf.png",import.meta.url).href,Ln=[Je,Sn,Xe,Ye,$e,Ze,es,ss,ts,ns,as,rs,ls,is,cs,os,ds,ms,hs];function kn(){const[s,t]=l.useState(void 0);return l.useEffect(()=>{const n=new AbortController;async function a(){const r=await C(n.signal);t(r)}return a(),()=>n.abort()},[]),s?e.jsxs(e.Fragment,{children:[e.jsx(In,{}),e.jsx("div",{className:"container py-5 mx-auto text-lg",children:e.jsx("div",{className:"flex flex-col sm:flex-row justify-center mb-10 flex sm:gap-8 md:gap-32",children:e.jsx("h1",{className:"text-4xl mx-4 ",children:e.jsx("strong",{children:"HELM Leaderboards"})})})}),e.jsx(Rn,{}),e.jsx("div",{className:"mx-auto text-lg px-16",children:e.jsxs("div",{className:"container mb-12 mx-auto text-lg px-16",children:[e.jsxs("div",{className:"flex flex-col sm:flex-row justify-center mt-10 mb-10 flex gap-2 sm:gap-8 md:gap-32",children:[" ",e.jsx("h1",{className:"text-4xl mx-4 mt-40",children:e.jsx("strong",{children:"Our Partners"})})]}),e.jsx("ol",{className:"my-8 flex flex-col gap-32",children:e.jsx("li",{children:e.jsx("div",{className:"flex flex-wrap justify-center max-w-[1100px] mx-auto w-auto",children:Ln.map((n,a)=>e.jsx("div",{className:"w-24 h-24 flex items-center 
m-6",children:e.jsx("img",{src:n,alt:"Logo",className:"mx-auto block",sizes:"100vw"})},a))})})})]})})]}):null}const Cn=""+new URL("overview-74aea3d8.png",import.meta.url).href,Pn=""+new URL("process-flow-bd2eba96.png",import.meta.url).href;function Tn(){return e.jsxs("div",{className:"container mx-auto px-16",children:[e.jsx("h1",{className:"text-3xl mt-16 my-8 font-bold text-center",children:"Image2Struct: A Benchmark for Evaluating Vision-Language Models in Extracting Structured Information from Images"}),e.jsxs("div",{className:"flex flex-col sm:flex-row justify-center gap-2 sm:gap-2 md:gap-8 my-8",children:[e.jsx("a",{className:"px-10 btn rounded-md",href:"TODO",children:"Paper"}),e.jsx("a",{className:"px-10 btn rounded-md",href:"https://github.com/stanford-crfm/helm",children:"Github"}),e.jsx("a",{className:"px-10 btn rounded-md",href:"https://huggingface.co/datasets/stanford-crfm/i2s-latex",children:"Latex dataset"}),e.jsx("a",{className:"px-5 btn rounded-md",href:"https://huggingface.co/datasets/stanford-crfm/i2s-webpage",children:"Webpage dataset"}),e.jsx("a",{className:"px-10 btn rounded-md",href:"https://huggingface.co/datasets/stanford-crfm/i2s-musicsheet",children:"Music sheet dataset"})]}),e.jsxs("div",{className:"flex flex-col lg:flex-row items-center gap-8",children:[e.jsxs("div",{className:"flex-1 text-xl",children:[e.jsxs("p",{children:[e.jsx("strong",{children:"Image2Struct"})," is a benchmark for evaluating vision-language models in practical tasks of extracting structured information from images."]}),e.jsx("br",{}),e.jsx("p",{children:"In our tasks, VLMs are prompted to generate the underlying structured information (i.e., code) from an input image. The code can be compiled, and the output image is evaluated against the input image to produce a score. This round-trip evaluation allows us to quantitatively evaluate VLMs on complex tasks with multiple correct answers. 
We create a pipeline that downloads fresh, user-submitted data from active online communities upon execution, evaluates the VLMs shortly, and produces a leaderboard."}),e.jsx("br",{}),e.jsx("img",{src:Cn,alt:"Evaluation flowchart",className:"mx-auto block w-full",sizes:"100vw"}),e.jsx("br",{}),e.jsx("p",{children:"We introduce 3 tasks:"}),e.jsxs("ul",{className:"my-2 list-disc list-inside",children:[e.jsx("li",{children:"LaTex: equations, tables, plots and algorithms form ArXiV papers"}),e.jsx("li",{children:"Webpages: pages from GitHub written in HTML, CSS and Javascript, ..."}),e.jsx("li",{children:"Music sheets: crops of measures from music sheets from IMSLP"})]}),e.jsx("div",{className:"flex flex-row justify-center mt-8",children:e.jsx("a",{className:"px-10 btn rounded-md",href:"#/leaderboard",children:"Full Leaderboard"})})]}),e.jsx("div",{className:"flex-1",children:e.jsx(y,{numRowsToDisplay:12})})]}),e.jsx("br",{}),e.jsxs("div",{className:"flex flex-col lg:flex-row items-center gap-8",children:[e.jsxs("div",{className:"flex-1 text-xl",children:[e.jsx("p",{children:"We provide an automated process for collecting new fresh data from online communities, evaluating the models and producing a leaderboard. The pipeline is designed to be executed on a regular basis to keep the leaderboard up-to-date."}),e.jsx("br",{}),e.jsxs("p",{children:["In addition to the automated data collection, we also provide a"," ",e.jsx("i",{children:"wild"})," subset for the LaTeX and webpage tasks that are collected from Wikipedia and various popular websites. 
These instances do not have a corresponding code, and the evaluation is done by our proposed metric: block EMD (Earth Mover Distance)."]})]}),e.jsx("div",{className:"flex-1",children:e.jsx("img",{src:Pn,alt:"7 scenarios, 4 models, 4 evaluators and 5 criteria",className:"mx-auto block w-full",sizes:"200vw"})})]})]})}function Bn(){return e.jsxs("div",{className:"container mx-auto px-16",children:[e.jsx("h1",{className:"text-3xl my-8 font-bold text-center",children:"Elements of World Knowledge (EWoK)"}),e.jsxs("div",{className:"flex flex-col lg:flex-row gap-8",children:[e.jsxs("div",{className:"flex-1 text-l",children:[e.jsxs("p",{children:["We present the"," ",e.jsx("a",{className:"font-bold underline text-blue-600 hover:text-blue-800 visited:text-purple-600",href:"https://arxiv.org/abs/2405.09605",children:"Elements of World Knowledge (EWoK)"})," ","leaderboard in collaboration with the EWoK team. EWoK is a benchmark for evaluating world modeling in language models by testing their ability to use knowledge of a concept to match a target text with a plausible/implausible context. 
EWoK targets specific concepts from multiple knowledge domains known to be vital for world modeling in humans, including social interactions and spatial relations."]}),e.jsxs("div",{className:"flex flex-row justify-center mt-4",children:[e.jsx("a",{className:"px-10 btn rounded-md mx-4",href:"https://arxiv.org/abs/2405.09605",children:"Paper"}),e.jsx("a",{className:"px-10 btn rounded-md mx-4",href:"#/leaderboard",children:"Full Leaderboard"})]})]}),e.jsxs("div",{className:"py-2 pb-6 rounded-3xl bg-gray-100 h-full",style:{maxWidth:"100%"},children:[e.jsx(y,{}),e.jsx("div",{className:"flex justify-end",children:e.jsx(f,{to:"leaderboard",children:e.jsx("button",{className:"px-4 mx-3 mt-1 btn bg-white rounded-md",children:e.jsx("span",{children:"See More"})})})})]})]})]})}function Dn(){return e.jsxs("div",{className:"container mx-auto px-16",children:[e.jsx("h1",{className:"text-3xl my-8 font-bold text-center",children:"HELM Medical"}),e.jsxs("div",{className:"flex flex-col lg:flex-row gap-8",children:[e.jsxs("div",{className:"flex-1 text-l",children:[e.jsx("p",{className:"my-2",children:"With the increasing scale and impact of language models, there has also been interest interest in using language models in the medical domain. However, the capabilities and risks of these models are not well-understood, and there is significant potential for harm in the medical setting."}),e.jsxs("p",{className:"my-2",children:["To address this, we present the"," ",e.jsx("a",{className:"font-bold",href:"https://arxiv.org/abs/2405.09605",children:"HELM Medical"})," ","leaderboard for evaluation of language models in the medical domain. The HELM Medical leaderboard presents evaluations of leading general-purpose language models as well as language models fine-tuned on the medical domain. 
These models are evaluated on a range of medical tasks based on the benchmarks used in"," ",e.jsx("a",{className:"underline text-blue-600 hover:text-blue-800 visited:text-purple-600",href:"https://arxiv.org/abs/2212.13138",children:"Singhal et al. 2022"}),". We hope that this leaderboard encourages further work in evaluating language models on tasks from the medical domain."]}),e.jsx("div",{className:"flex flex-row justify-center my-4",children:e.jsx(f,{to:"leaderboard",className:"px-10 btn rounded-md mx-4",children:"Full Leaderboard"})})]}),e.jsxs("div",{className:"py-2 pb-6 rounded-3xl bg-gray-100 h-full",style:{maxWidth:"100%"},children:[e.jsx(y,{}),e.jsx("div",{className:"flex justify-end",children:e.jsx(f,{to:"leaderboard",children:e.jsx("button",{className:"px-4 mx-3 mt-1 btn bg-white rounded-md",children:e.jsx("span",{children:"See More"})})})})]})]})]})}function Hn(){return e.jsxs("div",{className:"container mx-auto px-16",children:[e.jsx("h1",{className:"text-3xl mt-16 my-8 font-bold text-center",children:"HELM Safety"}),e.jsxs("div",{className:"flex flex-col lg:flex-row items-center gap-8",children:[e.jsxs("div",{className:"flex-1 text-xl",children:[e.jsx("p",{children:"Language models demonstrate powerful capabilities and pose significant risks. Given their widespread deployment, standardized public benchmarking of such models is vital. While language models are routinely evaluated on standard capability benchmarks, comparable standardization for benchmarking safety risks lags behind. To address this gap, we introduce HELM-Safety as a collection of 5 safety benchmarks that span 6 risk categories (e.g. violence, fraud, discrimination, sexual, harassment, deception). 
We present evaluation results for recent leading open weights and closed models."}),e.jsx("div",{className:"flex flex-row justify-center mt-4",children:e.jsx("a",{className:"px-10 btn rounded-md mx-4",href:"#/leaderboard",children:"Full Leaderboard"})})]}),e.jsx("div",{className:"flex-1",children:e.jsx(y,{})})]})]})}function Un(){return window.PROJECT_ID==="lite"?e.jsx(Ie,{}):window.PROJECT_ID==="instruct"?e.jsx(en,{}):window.PROJECT_ID==="image2struct"?e.jsx(Tn,{}):window.PROJECT_ID==="heim"?e.jsx(un,{}):window.PROJECT_ID==="mmlu"?e.jsx(nn,{}):window.PROJECT_ID==="vhelm"?e.jsx(jn,{}):window.PROJECT_ID==="air-bench"?e.jsx(rn,{}):window.PROJECT_ID==="thaiexam"?e.jsx(on,{}):window.PROJECT_ID==="finance"?e.jsx(mn,{}):window.PROJECT_ID==="call-center"?e.jsx(vn,{}):window.PROJECT_ID==="cleva"?e.jsx(An,{}):window.PROJECT_ID==="tables"?e.jsx(En,{}):window.PROJECT_ID==="ewok"?e.jsx(Bn,{}):window.PROJECT_ID==="medical"?e.jsx(Dn,{}):window.PROJECT_ID==="safety"?e.jsx(Hn,{}):window.PROJECT_ID==="home"?e.jsx(kn,{}):e.jsx(Ie,{})}function On(){return e.jsx(Rs,{children:e.jsx(Is,{children:e.jsxs(B,{path:"/",element:e.jsx(xt,{}),children:[e.jsx(B,{index:!0,element:e.jsx(Un,{})}),e.jsx(B,{path:"leaderboard",element:e.jsx(Yt,{})}),e.jsx(B,{path:"models",element:e.jsx(jt,{})}),e.jsx(B,{path:"scenarios",element:e.jsx(bt,{})}),e.jsx(B,{path:"groups",element:e.jsx(At,{})}),e.jsx(B,{path:"groups/:groupName",element:e.jsx(yt,{})}),e.jsx(B,{path:"runs",element:e.jsx(Mt,{})}),e.jsx(B,{path:"runs/:runName",element:e.jsx(Xt,{})})]})})})}oe.createRoot(document.getElementById("root")).render(e.jsx(Ss.StrictMode,{children:e.jsx(On,{})}));
@@ -7,7 +7,7 @@
7
7
  <title>Holistic Evaluation of Language Models (HELM)</title>
8
8
  <meta name="description" content="The Holistic Evaluation of Language Models (HELM) serves as a living benchmark for transparency in language models. Providing broad coverage and recognizing incompleteness, multi-metric measurements, and standardization. All data and analysis are freely accessible on the website for exploration and study." />
9
9
  <script type="text/javascript" src="./config.js"></script>
10
- <script type="module" crossorigin src="./assets/index-58f97dcd.js"></script>
10
+ <script type="module" crossorigin src="./assets/index-3ee38b3d.js"></script>
11
11
  <link rel="modulepreload" crossorigin href="./assets/react-d4a0b69b.js">
12
12
  <link rel="modulepreload" crossorigin href="./assets/recharts-6d337683.js">
13
13
  <link rel="modulepreload" crossorigin href="./assets/tremor-54a99cc4.js">
@@ -1,6 +1,5 @@
1
1
  from helm.common.authentication import Authentication
2
2
  from helm.common.tokenization_request import (
3
- WindowServiceInfo,
4
3
  TokenizationRequest,
5
4
  TokenizationRequestResult,
6
5
  DecodeRequest,
@@ -25,7 +24,3 @@ class TokenizerService:
25
24
  def decode(self, request: DecodeRequest) -> DecodeRequestResult:
26
25
  """Decode via an API."""
27
26
  return self._service.decode(self._auth, request)
28
-
29
- def get_info(self, model_name: str) -> WindowServiceInfo:
30
- """Get info via an API."""
31
- return self._service.get_window_service_info(model_name)
@@ -51,7 +51,7 @@ class OpenAIClient(CachingClient):
51
51
  def _is_chat_model_engine(self, model_engine: str) -> bool:
52
52
  if model_engine == "gpt-3.5-turbo-instruct":
53
53
  return False
54
- elif model_engine.startswith("gpt-3.5") or model_engine.startswith("gpt-4"):
54
+ elif model_engine.startswith("gpt-3.5") or model_engine.startswith("gpt-4") or model_engine.startswith("o1"):
55
55
  return True
56
56
  return False
57
57
 
@@ -169,6 +169,21 @@ class OpenAIClient(CachingClient):
169
169
  if is_vlm(request.model) and raw_request["stop"] is None:
170
170
  raw_request.pop("stop")
171
171
 
172
+ # Special handling for o1 models.
173
+ # Refer to the "Reasoning models" documentation for further discussion of o1 model limitations:
174
+ # https://platform.openai.com/docs/guides/reasoning
175
+ if request.model_engine.startswith("o1"):
176
+ # Avoid error:
177
+ # "Unsupported parameter: 'max_tokens' is not supported with this model. Use 'max_completion_tokens' instead." # noqa: E501
178
+ # Note that openai>=1.45 is needed for this
179
+ if raw_request["max_tokens"]:
180
+ raw_request["max_completion_tokens"] = raw_request["max_tokens"]
181
+ raw_request.pop("max_tokens")
182
+ # Avoid error:
183
+ # "Invalid type for 'stop': expected an unsupported value, but got null instead."
184
+ if raw_request["stop"] is None:
185
+ raw_request.pop("stop")
186
+
172
187
  def do_it() -> Dict[str, Any]:
173
188
  return self.client.chat.completions.create(**raw_request).model_dump(mode="json")
174
189
 
@@ -154,7 +154,6 @@ class PalmyraChatClient(OpenAIClient):
154
154
  tokenizer_name: str,
155
155
  cache_config: CacheConfig,
156
156
  api_key: str,
157
- base_url: str,
158
157
  ):
159
158
  super().__init__(
160
159
  tokenizer=tokenizer,
@@ -162,7 +161,7 @@ class PalmyraChatClient(OpenAIClient):
162
161
  cache_config=cache_config,
163
162
  api_key=api_key,
164
163
  org_id=None,
165
- base_url=base_url,
164
+ base_url="https://api.writer.com/v1/chat",
166
165
  )
167
166
 
168
167
  def _is_chat_model_engine(self, model_engine: str) -> bool: