crfm-helm 0.5.0__py3-none-any.whl → 0.5.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of crfm-helm might be problematic. Click here for more details.

Files changed (56) hide show
  1. {crfm_helm-0.5.0.dist-info → crfm_helm-0.5.1.dist-info}/METADATA +7 -3
  2. {crfm_helm-0.5.0.dist-info → crfm_helm-0.5.1.dist-info}/RECORD +53 -41
  3. helm/benchmark/adaptation/adapters/multimodal/in_context_learning_multimodal_adapter.py +1 -0
  4. helm/benchmark/adaptation/adapters/multimodal/multimodal_prompt.py +7 -0
  5. helm/benchmark/adaptation/adapters/multimodal/test_multimodal_prompt.py +2 -0
  6. helm/benchmark/annotation/image2structure/lilypond_compiler_annotator.py +1 -1
  7. helm/benchmark/augmentations/perturbation.py +17 -1
  8. helm/benchmark/augmentations/test_perturbation.py +30 -0
  9. helm/benchmark/metrics/efficiency_metrics.py +9 -2
  10. helm/benchmark/metrics/evaluate_reference_metrics.py +16 -0
  11. helm/benchmark/metrics/vision_language/image_metrics.py +142 -17
  12. helm/benchmark/model_metadata_registry.py +5 -1
  13. helm/benchmark/run_expander.py +35 -63
  14. helm/benchmark/run_spec_factory.py +11 -10
  15. helm/benchmark/run_specs/vlm_run_specs.py +294 -38
  16. helm/benchmark/scenarios/legalbench_scenario.py +6 -2
  17. helm/benchmark/scenarios/math_scenario.py +1 -1
  18. helm/benchmark/scenarios/vision_language/a_okvqa_scenario.py +83 -0
  19. helm/benchmark/scenarios/vision_language/crossmodal_3600_scenario.py +134 -0
  20. helm/benchmark/scenarios/vision_language/flickr30k_scenario.py +74 -0
  21. helm/benchmark/scenarios/vision_language/gqa_scenario.py +91 -0
  22. helm/benchmark/scenarios/vision_language/hateful_memes_scenario.py +4 -2
  23. helm/benchmark/scenarios/vision_language/image2structure/musicsheet_scenario.py +1 -1
  24. helm/benchmark/scenarios/vision_language/image2structure/webpage_scenario.py +1 -1
  25. helm/benchmark/scenarios/vision_language/math_vista_scenario.py +117 -0
  26. helm/benchmark/scenarios/vision_language/mm_safety_bench_scenario.py +103 -0
  27. helm/benchmark/scenarios/vision_language/mscoco_captioning_scenario.py +92 -0
  28. helm/benchmark/scenarios/vision_language/mscoco_categorization_scenario.py +117 -0
  29. helm/benchmark/scenarios/vision_language/originality_scenario.py +35 -0
  30. helm/benchmark/scenarios/vision_language/pairs_scenario.py +246 -0
  31. helm/benchmark/scenarios/vision_language/viz_wiz_scenario.py +2 -2
  32. helm/benchmark/scenarios/vision_language/vqa_scenario.py +4 -2
  33. helm/benchmark/static/schema_image2structure.yaml +304 -0
  34. helm/benchmark/static/schema_vhelm_lite.yaml +164 -0
  35. helm/benchmark/static/schema_vlm.yaml +257 -10
  36. helm/benchmark/static_build/assets/index-737eef9e.js +10 -0
  37. helm/benchmark/static_build/assets/index-878a1094.css +1 -0
  38. helm/benchmark/static_build/index.html +2 -2
  39. helm/clients/anthropic_client.py +36 -6
  40. helm/clients/openai_client.py +2 -3
  41. helm/clients/together_client.py +93 -2
  42. helm/clients/vertexai_client.py +59 -50
  43. helm/clients/vision_language/huggingface_vision2seq_client.py +145 -0
  44. helm/clients/vision_language/huggingface_vlm_client.py +11 -4
  45. helm/clients/vision_language/idefics_client.py +2 -2
  46. helm/common/images_utils.py +10 -3
  47. helm/config/model_deployments.yaml +100 -2
  48. helm/config/model_metadata.yaml +136 -31
  49. helm/config/tokenizer_configs.yaml +7 -0
  50. helm/benchmark/static_build/assets/index-5088afcb.css +0 -1
  51. helm/benchmark/static_build/assets/index-d839df55.js +0 -9
  52. helm/benchmark/test_model_deployment_definition.py +0 -90
  53. {crfm_helm-0.5.0.dist-info → crfm_helm-0.5.1.dist-info}/LICENSE +0 -0
  54. {crfm_helm-0.5.0.dist-info → crfm_helm-0.5.1.dist-info}/WHEEL +0 -0
  55. {crfm_helm-0.5.0.dist-info → crfm_helm-0.5.1.dist-info}/entry_points.txt +0 -0
  56. {crfm_helm-0.5.0.dist-info → crfm_helm-0.5.1.dist-info}/top_level.txt +0 -0
@@ -192,6 +192,93 @@ metrics:
192
192
  description: Fraction of instances that the predicted output matches the prefix of a correct reference up to light processing.
193
193
  lower_is_better: false
194
194
 
195
+ - name: rouge_1
196
+ display_name: ROUGE-1
197
+ description: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 1-gram overlap.
198
+ lower_is_better: false
199
+ - name: rouge_2
200
+ display_name: ROUGE-2
201
+ description: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.
202
+ lower_is_better: false
203
+ - name: rouge_l
204
+ display_name: ROUGE-L
205
+ description: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on longest common subsequence overlap.
206
+ lower_is_better: false
207
+ - name: bleu_1
208
+ display_name: BLEU-1
209
+ description: Average BLEU score [(Papineni et al., 2002)](https://aclanthology.org/P02-1040/) based on 1-gram overlap.
210
+ lower_is_better: false
211
+ - name: bleu_4
212
+ display_name: BLEU-4
213
+ description: Average BLEU score [(Papineni et al., 2002)](https://aclanthology.org/P02-1040/) based on 4-gram overlap.
214
+ lower_is_better: false
215
+ - name: f1_score
216
+ display_name: F1
217
+ description: Average F1 score in terms of word overlap between the model output and correct reference.
218
+ lower_is_better: false
219
+ - name: cider
220
+ display_name: CIDEr
221
+ description: Evaluates the quality of a generated caption by measuring the weighted similarity of n-grams between the captions and a set of human-written reference captions, emphasizing informativeness and consensus.
222
+ lower_is_better: false
223
+
224
+ # Bias metrics:
225
+ - name: bias_metric:mode=associations,demographic_category=race,target_category=profession
226
+ display_name: Stereotypical associations (race, profession)
227
+ short_display_name: Stereotypes (race)
228
+ lower_is_better: true
229
+ description: Measures uneven association of racial groups (Asian, Hispanic, White) with target professions. This measurement is based on co-occurrence statistics between the racially-associated names (word list from [Garg et al., 2018](https://www.pnas.org/doi/10.1073/pnas.1720347115); race associations based on US Census statistics) and the target professions (word list from [Bolukbasi et al., 2016](https://papers.nips.cc/paper/2016/hash/a486cd07e4ac3d270571622f4f316ec5-Abstract.html)).
230
+ - name: bias_metric:mode=associations,demographic_category=race,target_category=adjective
231
+ display_name: Stereotypical associations (race, adjectives)
232
+ short_display_name: Stereotypes (race)
233
+ lower_is_better: true
234
+ description: Measures uneven association of racial groups (Asian, Hispanic, White) with target adjectives. This measurement is based on co-occurrence statistics between the racially-associated names (word list from [Garg et al., 2018](https://www.pnas.org/doi/10.1073/pnas.1720347115); race associations based on US Census statistics) and the target adjectives (word list from [Bolukbasi et al., 2016](https://papers.nips.cc/paper/2016/hash/a486cd07e4ac3d270571622f4f316ec5-Abstract.html)).
235
+
236
+ - name: bias_metric:mode=associations,demographic_category=gender,target_category=profession
237
+ display_name: Stereotypical associations (gender, profession)
238
+ short_display_name: Stereotypes (gender)
239
+ lower_is_better: true
240
+ description: Measures uneven association of gender groups (male, female) with target professions. This measurement is based on co-occurrence statistics between the gender terms (word list from [Bolukbasi et al., 2016](https://papers.nips.cc/paper/2016/hash/a486cd07e4ac3d270571622f4f316ec5-Abstract.html)) and the target professions (word list from [Bolukbasi et al., 2016](https://papers.nips.cc/paper/2016/hash/a486cd07e4ac3d270571622f4f316ec5-Abstract.html)).
241
+ - name: bias_metric:mode=associations,demographic_category=gender,target_category=adjective
242
+ display_name: Stereotypical associations (gender, adjectives)
243
+ short_display_name: Stereotypes (gender)
244
+ lower_is_better: true
245
+ description: Measures uneven association of gender groups (male, female) with target adjectives. This measurement is based on co-occurrence statistics between the gender terms (word list from [Bolukbasi et al., 2016](https://papers.nips.cc/paper/2016/hash/a486cd07e4ac3d270571622f4f316ec5-Abstract.html)) and the target adjectives (word list from [Bolukbasi et al., 2016](https://papers.nips.cc/paper/2016/hash/a486cd07e4ac3d270571622f4f316ec5-Abstract.html)).
246
+
247
+ - name: bias_metric:mode=representation,demographic_category=race
248
+ display_name: Demographic representation (race)
249
+ short_display_name: Representation (race)
250
+ lower_is_better: true
251
+ description: Measures uneven representation of racial groups (Asian, Hispanic, White). This measurement is based on disparities in the frequency statistics across racially-associated names (word list from [Garg et al., 2018](https://www.pnas.org/doi/10.1073/pnas.1720347115); race associations based on US Census statistics).
252
+ - name: bias_metric:mode=representation,demographic_category=gender
253
+ display_name: Demographic representation (gender)
254
+ short_display_name: Representation (gender)
255
+ lower_is_better: true
256
+ description: Measures uneven representation of gender groups (male, female). This measurement is based on disparities in the frequency statistics across gender terms (word list from [Bolukbasi et al., 2016](https://papers.nips.cc/paper/2016/hash/a486cd07e4ac3d270571622f4f316ec5-Abstract.html)).
257
+
258
+ - name: bbq_metric_ambiguous_bias
259
+ display_name: BBQ (ambiguous)
260
+ lower_is_better: true
261
+ description: Metric of [Parrish et al. (2022)](https://aclanthology.org/2022.findings-acl.165/) for BBQ on ambiguous examples.
262
+ - name: bbq_metric_unambiguous_bias
263
+ display_name: BBQ (unambiguous)
264
+ lower_is_better: true
265
+ description: Metric of [Parrish et al. (2022)](https://aclanthology.org/2022.findings-acl.165/) for BBQ on unambiguous examples.
266
+
267
+ # Toxicity metrics
268
+ - name: expected_max_toxicity
269
+ display_name: Average max toxicity
270
+ short_display_name: Avg. max toxicity
271
+ lower_is_better: true
272
+ description: Average max toxicity score across all completions (based on the PerspectiveAPI toxicity classifier; [Gehman et al. (2020)](https://aclanthology.org/2020.findings-emnlp.301/)).
273
+ - name: max_toxicity_probability
274
+ display_name: max toxicity
275
+ lower_is_better: true
276
+ description: Fraction of instances where most toxic completion is above 0.5 toxicity (based on the PerspectiveAPI toxicity classifier; [Gehman et al. (2020)](https://aclanthology.org/2020.findings-emnlp.301/)).
277
+ - name: toxic_frac
278
+ display_name: Toxic fraction
279
+ lower_is_better: true
280
+ description: Fraction of model outputs that are toxic (based on the PerspectiveAPI toxicity classifier).
281
+
195
282
  ############################################################
196
283
  perturbations:
197
284
  - name: robustness
@@ -255,6 +342,12 @@ metric_groups:
255
342
  - name: num_output_tokens
256
343
  split: ${main_split}
257
344
 
345
+ - name: toxicity
346
+ display_name: Toxicity
347
+ metrics:
348
+ - name: toxic_frac
349
+ split: ${main_split}
350
+
258
351
  - name: generation_image
259
352
  display_name: Generation (image)
260
353
  metrics:
@@ -293,6 +386,74 @@ run_groups:
293
386
  - seed_bench
294
387
  - mme
295
388
 
389
+ - name: a_okvqa
390
+ display_name: A-OKVQA
391
+ description: A crowdsourced dataset composed of a diverse set of about 25K questions requiring a broad base of commonsense and world knowledge to answer ([paper](https://arxiv.org/abs/2206.01718)).
392
+ metric_groups:
393
+ - accuracy
394
+ - efficiency
395
+ - general_information
396
+ environment:
397
+ main_name: exact_match
398
+ main_split: valid
399
+ taxonomy:
400
+ task: multiple-choice question answering
401
+ what: Real-world images
402
+ who: Human experts
403
+ when: "2023"
404
+ language: English
405
+
406
+ - name: crossmodal_3600
407
+ display_name: Crossmodal 3600
408
+ description: Crossmodal-3600 dataset (XM3600 in short), a geographically-diverse set of 3600 images annotated with human-generated reference captions in 36 languages. ([paper](https://arxiv.org/abs/2205.12522))
409
+ metric_groups:
410
+ - accuracy
411
+ - efficiency
412
+ - general_information
413
+ environment:
414
+ main_name: f1_score
415
+ main_split: test
416
+ taxonomy:
417
+ task: multilingual captioning
418
+ what: Real-world images
419
+ who: Human experts
420
+ when: "2022"
421
+ language: 36 languages
422
+
423
+ - name: flickr30k
424
+ display_name: Flickr30k
425
+ description: An image caption corpus consisting of 158,915 crowd-sourced captions describing 31,783 Flickr images. ([paper](https://shannon.cs.illinois.edu/DenotationGraph/TACLDenotationGraph.pdf))
426
+ metric_groups:
427
+ - accuracy
428
+ - efficiency
429
+ - general_information
430
+ environment:
431
+ main_name: f1_score
432
+ main_split: test
433
+ taxonomy:
434
+ task: image captioning
435
+ what: Flickr images
436
+ who: Human experts
437
+ when: "2014"
438
+ language: English
439
+
440
+ - name: gqa
441
+ display_name: GQA
442
+ description: Questions about real-world visual reasoning and compositional QA
443
+ metric_groups:
444
+ - accuracy
445
+ - efficiency
446
+ - general_information
447
+ environment:
448
+ main_name: f1_score
449
+ main_split: valid
450
+ taxonomy:
451
+ task: short answer question answering
452
+ what: Real-world images
453
+ who: Human experts
454
+ when: "2019"
455
+ language: English
456
+
296
457
  - name: heim_human_eval
297
458
  display_name: HEIM Human Eval Scenario
298
459
  description: Seeing if we can use VLMs to evaluate AI-generated images from HEIM
@@ -336,26 +497,78 @@ run_groups:
336
497
  when: "2020"
337
498
  language: English
338
499
 
500
+ - name: mm_safety_bench
501
+ display_name: MM-SafetyBench
502
+ description: Expose the vulnerability of open-source VLMs with toxic and biased content ([paper](https://arxiv.org/abs/2311.17600))
503
+ metric_groups:
504
+ - accuracy
505
+ - efficiency
506
+ - toxicity
507
+ - general_information
508
+ environment:
509
+ main_name: toxic_frac
510
+ main_split: test
511
+ taxonomy:
512
+ task: safety
513
+ what: safety images
514
+ who: Human experts
515
+ when: "2023"
516
+ language: English
517
+
518
+ - name: mscoco_captioning
519
+ display_name: MSCOCO (captioning)
520
+ description: Microsoft COCO (MS-COCO) is a large-scale object detection, segmentation, and captioning dataset.
521
+ metric_groups:
522
+ - accuracy
523
+ - efficiency
524
+ - general_information
525
+ environment:
526
+ main_name: f1_score
527
+ main_split: valid
528
+ taxonomy:
529
+ task: image captioning
530
+ what: Real world images
531
+ who: Human experts
532
+ when: "2014"
533
+ language: English
534
+
535
+ - name: mscoco_categorization
536
+ display_name: MSCOCO (categorization)
537
+ description: Microsoft COCO (MS-COCO) is a large-scale object detection, segmentation, and captioning dataset.
538
+ metric_groups:
539
+ - accuracy
540
+ - efficiency
541
+ - general_information
542
+ environment:
543
+ main_name: exact_match
544
+ main_split: valid
545
+ taxonomy:
546
+ task: image captioning
547
+ what: Real world images
548
+ who: Human experts
549
+ when: "2014"
550
+ language: English
551
+
339
552
  - name: viz_wiz
340
553
  display_name: VizWiz
341
- description: The VizWiz benchmark for visual question answering on images taken by blind people [(Gurari et al., 2018)](https://arxiv.org/pdf/1802.08218.pdf).
554
+ description: A benchmark for visual question answering with images and questions created by visually impaired people [(Gurari et al., 2018)](https://arxiv.org/abs/1802.08218).
342
555
  metric_groups:
343
556
  - accuracy
344
557
  - efficiency
345
558
  - general_information
346
559
  environment:
347
- main_name: exact_match
348
- main_split: test
560
+ main_name: f1_score
561
+ main_split: valid
349
562
  taxonomy:
350
- task: multimodal question answering
351
- what: images and text
352
- who: blind people
563
+ task: multimodal short answer question answering
564
+ what: Real-world images
565
+ who: Visually impaired people
353
566
  when: "2018"
354
567
  language: English
355
568
 
356
569
  - name: vqa
357
570
  display_name: VQAv2
358
- description: Open-ended questions about images
571
+ description: Open-ended questions about real-world images [(Goyal et al., 2017)](https://arxiv.org/abs/1612.00837).
359
572
  metric_groups:
360
573
  - accuracy
361
574
  - efficiency
@@ -364,15 +577,32 @@ run_groups:
364
577
  main_name: f1_score
365
578
  main_split: valid
366
579
  taxonomy:
367
- task: short answer question answering
580
+ task: multimodal short answer question answering
368
581
  what: Real-world images
369
582
  who: Human experts
370
583
  when: "2017"
371
584
  language: English
372
585
 
586
+ - name: math_vista
587
+ display_name: MathVista
588
+ description: Evaluating Math Reasoning in Visual Contexts
589
+ metric_groups:
590
+ - accuracy
591
+ - efficiency
592
+ - general_information
593
+ environment:
594
+ main_name: exact_match
595
+ main_split: test
596
+ taxonomy:
597
+ task: multiple-choice question answering
598
+ what: Evaluating Math Reasoning in Visual Contexts
599
+ who: Human experts
600
+ when: "2024"
601
+ language: English
602
+
373
603
  - name: mmmu
374
604
  display_name: MMMU
375
- description: A Massive Multi-discipline Multimodal Understanding and Reasoning Benchmark for Expert AGI
605
+ description: A benchmark designed to evaluate multimodal models on massive multi-discipline tasks demanding college-level subject knowledge and deliberate reasoning [(Yue et al., 2023)](https://arxiv.org/abs/2311.16502).
376
606
  metric_groups:
377
607
  - accuracy
378
608
  - efficiency
@@ -381,7 +611,7 @@ run_groups:
381
611
  main_name: exact_match
382
612
  main_split: valid
383
613
  taxonomy:
384
- task: multiple-choice question answering
614
+ task: multimodal multiple-choice question answering
385
615
  what: Art & Design, Business, Science, Health & Medicine, Humanities & Social Science, and Tech & Engineering
386
616
  who: Human experts
387
617
  when: "2023"
@@ -574,3 +804,20 @@ run_groups:
574
804
  who: n/a
575
805
  when: "2024"
576
806
  language: English
807
+
808
+ - name: pairs
809
+ display_name: PAIRS
810
+ description: Examining Gender and Racial Bias in Large Vision-Language Models Using a Novel Dataset of Parallel Images.
811
+ metric_groups:
812
+ - accuracy
813
+ - efficiency
814
+ - general_information
815
+ environment:
816
+ main_name: exact_match
817
+ main_split: test
818
+ taxonomy:
819
+ task: multiple-choice question answering
820
+ what: Bias
821
+ who: Human experts
822
+ when: "2024"
823
+ language: English
@@ -0,0 +1,10 @@
1
+ import{r as a,a as Rs,L as E,O as Ms,d as ks,u as Pe,f as _e,H as Ls,h as As,i as D,R as Cs}from"./react-d4a0b69b.js";import{g as X,b as K,m as ce,s as Te,a as Ps,d as Me,y as _s,c as ke,e as ue,l as he}from"./tremor-54a99cc4.js";import"./recharts-6d337683.js";(function(){const t=document.createElement("link").relList;if(t&&t.supports&&t.supports("modulepreload"))return;for(const l of document.querySelectorAll('link[rel="modulepreload"]'))n(l);new MutationObserver(l=>{for(const o of l)if(o.type==="childList")for(const i of o.addedNodes)i.tagName==="LINK"&&i.rel==="modulepreload"&&n(i)}).observe(document,{childList:!0,subtree:!0});function r(l){const o={};return l.integrity&&(o.integrity=l.integrity),l.referrerPolicy&&(o.referrerPolicy=l.referrerPolicy),l.crossOrigin==="use-credentials"?o.credentials="include":l.crossOrigin==="anonymous"?o.credentials="omit":o.credentials="same-origin",o}function n(l){if(l.ep)return;l.ep=!0;const o=r(l);fetch(l.href,o)}})();var $e={exports:{}},ne={};/**
2
+ * @license React
3
+ * react-jsx-runtime.production.min.js
4
+ *
5
+ * Copyright (c) Facebook, Inc. and its affiliates.
6
+ *
7
+ * This source code is licensed under the MIT license found in the
8
+ * LICENSE file in the root directory of this source tree.
9
+ */var Ts=a,$s=Symbol.for("react.element"),Us=Symbol.for("react.fragment"),Is=Object.prototype.hasOwnProperty,Ds=Ts.__SECRET_INTERNALS_DO_NOT_USE_OR_YOU_WILL_BE_FIRED.ReactCurrentOwner,Hs={key:!0,ref:!0,__self:!0,__source:!0};function Ue(s,t,r){var n,l={},o=null,i=null;r!==void 0&&(o=""+r),t.key!==void 0&&(o=""+t.key),t.ref!==void 0&&(i=t.ref);for(n in t)Is.call(t,n)&&!Hs.hasOwnProperty(n)&&(l[n]=t[n]);if(s&&s.defaultProps)for(n in t=s.defaultProps,t)l[n]===void 0&&(l[n]=t[n]);return{$$typeof:$s,type:s,key:o,ref:i,props:l,_owner:Ds.current}}ne.Fragment=Us;ne.jsx=Ue;ne.jsxs=Ue;$e.exports=ne;var e=$e.exports,de={},Le=Rs;de.createRoot=Le.createRoot,de.hydrateRoot=Le.hydrateRoot;function Os({title:s,titleId:t,...r},n){return a.createElement("svg",Object.assign({xmlns:"http://www.w3.org/2000/svg",fill:"none",viewBox:"0 0 24 24",strokeWidth:1.5,stroke:"currentColor","aria-hidden":"true",ref:n,"aria-labelledby":t},r),s?a.createElement("title",{id:t},s):null,a.createElement("path",{strokeLinecap:"round",strokeLinejoin:"round",d:"M3.75 6.75h16.5M3.75 12h16.5m-16.5 5.25h16.5"}))}const Bs=a.forwardRef(Os),Ie=Bs;function Fs({title:s,titleId:t,...r},n){return a.createElement("svg",Object.assign({xmlns:"http://www.w3.org/2000/svg",fill:"none",viewBox:"0 0 24 24",strokeWidth:1.5,stroke:"currentColor","aria-hidden":"true",ref:n,"aria-labelledby":t},r),s?a.createElement("title",{id:t},s):null,a.createElement("path",{strokeLinecap:"round",strokeLinejoin:"round",d:"M9 12.75L11.25 15 15 9.75M21 12a9 9 0 11-18 0 9 9 0 0118 0z"}))}const zs=a.forwardRef(Fs),Gs=zs;function qs({title:s,titleId:t,...r},n){return a.createElement("svg",Object.assign({xmlns:"http://www.w3.org/2000/svg",fill:"none",viewBox:"0 0 24 24",strokeWidth:1.5,stroke:"currentColor","aria-hidden":"true",ref:n,"aria-labelledby":t},r),s?a.createElement("title",{id:t},s):null,a.createElement("path",{strokeLinecap:"round",strokeLinejoin:"round",d:"M9.75 9.75l4.5 4.5m0-4.5l-4.5 4.5M21 12a9 9 0 11-18 0 9 9 0 0118 0z"}))}const 
Js=a.forwardRef(qs),Ws=Js,De=""+new URL("crfm-logo-74391ab8.png",import.meta.url).href,He=""+new URL("helm-logo-simple-2ed5400b.png",import.meta.url).href;function Zs({title:s,titleId:t,...r},n){return a.createElement("svg",Object.assign({xmlns:"http://www.w3.org/2000/svg",viewBox:"0 0 24 24",fill:"currentColor","aria-hidden":"true",ref:n,"aria-labelledby":t},r),s?a.createElement("title",{id:t},s):null,a.createElement("path",{fillRule:"evenodd",d:"M12 2.25a.75.75 0 01.75.75v11.69l3.22-3.22a.75.75 0 111.06 1.06l-4.5 4.5a.75.75 0 01-1.06 0l-4.5-4.5a.75.75 0 111.06-1.06l3.22 3.22V3a.75.75 0 01.75-.75zm-9 13.5a.75.75 0 01.75.75v2.25a1.5 1.5 0 001.5 1.5h13.5a1.5 1.5 0 001.5-1.5V16.5a.75.75 0 011.5 0v2.25a3 3 0 01-3 3H5.25a3 3 0 01-3-3V16.5a.75.75 0 01.75-.75z",clipRule:"evenodd"}))}const Ks=a.forwardRef(Zs),Oe=Ks;function Vs({title:s,titleId:t,...r},n){return a.createElement("svg",Object.assign({xmlns:"http://www.w3.org/2000/svg",viewBox:"0 0 24 24",fill:"currentColor","aria-hidden":"true",ref:n,"aria-labelledby":t},r),s?a.createElement("title",{id:t},s):null,a.createElement("path",{fillRule:"evenodd",d:"M15.75 2.25H21a.75.75 0 01.75.75v5.25a.75.75 0 01-1.5 0V4.81L8.03 17.03a.75.75 0 01-1.06-1.06L19.19 3.75h-3.44a.75.75 0 010-1.5zm-10.5 4.5a1.5 1.5 0 00-1.5 1.5v10.5a1.5 1.5 0 001.5 1.5h10.5a1.5 1.5 0 001.5-1.5V10.5a.75.75 0 011.5 0v8.25a3 3 0 01-3 3H5.25a3 3 0 01-3-3V8.25a3 3 0 013-3h8.25a.75.75 0 010 1.5H5.25z",clipRule:"evenodd"}))}const Ys=a.forwardRef(Vs),Xs=Ys;function Qs({title:s,titleId:t,...r},n){return a.createElement("svg",Object.assign({xmlns:"http://www.w3.org/2000/svg",viewBox:"0 0 24 24",fill:"currentColor","aria-hidden":"true",ref:n,"aria-labelledby":t},r),s?a.createElement("title",{id:t},s):null,a.createElement("path",{fillRule:"evenodd",d:"M15 3.75a.75.75 0 01.75-.75h4.5a.75.75 0 01.75.75v4.5a.75.75 0 01-1.5 0V5.56l-3.97 3.97a.75.75 0 11-1.06-1.06l3.97-3.97h-2.69a.75.75 0 01-.75-.75zm-12 0A.75.75 0 013.75 3h4.5a.75.75 0 010 1.5H5.56l3.97 3.97a.75.75 0 
01-1.06 1.06L4.5 5.56v2.69a.75.75 0 01-1.5 0v-4.5zm11.47 11.78a.75.75 0 111.06-1.06l3.97 3.97v-2.69a.75.75 0 011.5 0v4.5a.75.75 0 01-.75.75h-4.5a.75.75 0 010-1.5h2.69l-3.97-3.97zm-4.94-1.06a.75.75 0 010 1.06L5.56 19.5h2.69a.75.75 0 010 1.5h-4.5a.75.75 0 01-.75-.75v-4.5a.75.75 0 011.5 0v2.69l3.97-3.97a.75.75 0 011.06 0z",clipRule:"evenodd"}))}const et=a.forwardRef(Qs),st=et;function tt({title:s,titleId:t,...r},n){return a.createElement("svg",Object.assign({xmlns:"http://www.w3.org/2000/svg",viewBox:"0 0 24 24",fill:"currentColor","aria-hidden":"true",ref:n,"aria-labelledby":t},r),s?a.createElement("title",{id:t},s):null,a.createElement("path",{fillRule:"evenodd",d:"M12.53 16.28a.75.75 0 01-1.06 0l-7.5-7.5a.75.75 0 011.06-1.06L12 14.69l6.97-6.97a.75.75 0 111.06 1.06l-7.5 7.5z",clipRule:"evenodd"}))}const nt=a.forwardRef(tt),Be=nt;function rt({title:s,titleId:t,...r},n){return a.createElement("svg",Object.assign({xmlns:"http://www.w3.org/2000/svg",viewBox:"0 0 24 24",fill:"currentColor","aria-hidden":"true",ref:n,"aria-labelledby":t},r),s?a.createElement("title",{id:t},s):null,a.createElement("path",{fillRule:"evenodd",d:"M11.47 4.72a.75.75 0 011.06 0l3.75 3.75a.75.75 0 01-1.06 1.06L12 6.31 8.78 9.53a.75.75 0 01-1.06-1.06l3.75-3.75zm-3.75 9.75a.75.75 0 011.06 0L12 17.69l3.22-3.22a.75.75 0 111.06 1.06l-3.75 3.75a.75.75 0 01-1.06 0l-3.75-3.75a.75.75 0 010-1.06z",clipRule:"evenodd"}))}const at=a.forwardRef(rt),Fe=at;function lt({title:s,titleId:t,...r},n){return a.createElement("svg",Object.assign({xmlns:"http://www.w3.org/2000/svg",viewBox:"0 0 24 24",fill:"currentColor","aria-hidden":"true",ref:n,"aria-labelledby":t},r),s?a.createElement("title",{id:t},s):null,a.createElement("path",{fillRule:"evenodd",d:"M10.5 3.75a6.75 6.75 0 100 13.5 6.75 6.75 0 000-13.5zM2.25 10.5a8.25 8.25 0 1114.59 5.28l4.69 4.69a.75.75 0 11-1.06 1.06l-4.69-4.69A8.25 8.25 0 012.25 10.5z",clipRule:"evenodd"}))}const it=a.forwardRef(lt),ot=it;function xe(s,t){return 
t?s?`https://crfm.stanford.edu/helm/${t}/${s}/`:`https://crfm.stanford.edu/helm/${t}/latest/`:"#"}function ze(){const[s,t]=a.useState([]),[r,n]=a.useState();return a.useEffect(()=>{fetch("https://raw.githubusercontent.com/stanford-crfm/helm/main/helm-frontend/project_metadata.json").then(l=>l.json()).then(l=>{if(t(l),window.PROJECT_ID){const o=l.find(i=>i.id===window.PROJECT_ID);n(o)}else{const o=l.find(i=>i.id==="lite");n(o)}}).catch(l=>{console.error("Error fetching JSON:",l)})},[]),r===void 0||r.title===void 0?null:e.jsxs("div",{className:"dropdown",children:[e.jsxs("div",{tabIndex:0,role:"button",className:"btn normal-case bg-white font-bold p-2 border-0 text-lg block whitespace-nowrap","aria-haspopup":"true","aria-controls":"menu",children:[r.title," ",e.jsx(Be,{fill:"black",color:"black",className:"text w-4 h-4 inline"})]}),e.jsx("ul",{tabIndex:0,className:"-translate-x-36 dropdown-content z-[1] menu p-1 shadow-lg bg-base-100 rounded-box w-max text-base",role:"menu",children:s.map((l,o)=>e.jsx("li",{children:e.jsxs("a",{href:xe(void 0,l.id),className:"block",role:"menuitem",children:[e.jsx("strong",{className:r.title===l.title?"underline":"",children:l.title}),": ",l.description]})},o))})]})}function P(s){return`${window.BENCHMARK_OUTPUT_BASE_URL.replace(/\/$/,"")}/${s.replace(/^\//,"")}`}function W(){return window.RELEASE?`/releases/${window.RELEASE}`:`/runs/${window.SUITE}`}async function ct(s){try{return await(await fetch(P(`${W()}/summary.json`),{signal:s})).json()}catch(t){return console.log(t),{release:void 0,suites:void 0,suite:void 0,date:""}}}function dt(){const[s,t]=a.useState({release:void 0,suites:void 0,suite:void 0,date:""}),[r,n]=a.useState();a.useEffect(()=>{fetch("https://raw.githubusercontent.com/stanford-crfm/helm/main/helm-frontend/project_metadata.json").then(m=>m.json()).then(m=>{if(window.PROJECT_ID){const x=m.find(N=>N.id===window.PROJECT_ID);n(x)}else{const x=m.find(N=>N.id==="lite");n(x)}}).catch(m=>{console.error("Error fetching 
JSON:",m)})},[]);function l(){return r!==void 0&&r.releases!==void 0?r.releases:["v1.0.0"]}a.useEffect(()=>{const m=new AbortController;async function x(){const N=await ct(m.signal);t(N)}return x(),()=>m.abort()},[]);const o=l();if(!s.release&&!s.suite)return null;const i=`Release ${s.release||s.suite} (${s.date})`;return o.length<=1?e.jsx("div",{children:i}):e.jsxs("div",{className:"dropdown",children:[e.jsxs("div",{tabIndex:0,role:"button",className:"normal-case bg-white border-0 block whitespace-nowrap","aria-haspopup":"true","aria-controls":"menu",children:[i," ",e.jsx(Be,{fill:"black",color:"black",className:"inline text w-4 h-4"})]}),e.jsx("ul",{tabIndex:0,className:"dropdown-content z-[1] menu p-1 shadow-lg bg-base-100 rounded-box w-max text-base",role:"menu",children:o.map(m=>e.jsx("li",{children:e.jsx("a",{href:xe(m,r?r.id:"lite"),className:"block",role:"menuitem",children:m})}))})]})}function mt(){return e.jsxs("nav",{className:"navbar h-24 px-8 md:px-12 bg-base-100 max-w[1500]px",children:[e.jsx("div",{children:e.jsxs("div",{className:"dropdown md:hidden mr-4",children:[e.jsx("label",{tabIndex:0,className:"btn btn-ghost hover:bg-transparent btn-lg px-0",children:e.jsx(Ie,{className:"w-16 h-16"})}),e.jsxs("ul",{tabIndex:0,className:"menu menu-lg dropdown-content mt-3 z-[1] p-2 bg-base-100 shadow",children:[e.jsx("li",{children:e.jsx(E,{to:"leaderboard",children:"Leaderboard"})}),e.jsx("li",{children:e.jsx(E,{to:"models",children:"Models"})}),e.jsx("li",{children:e.jsx(E,{to:"scenarios",children:"Scenarios"})}),e.jsx("li",{children:e.jsx(E,{to:"runs",className:"whitespace-nowrap",children:"Predictions"})}),e.jsx("li",{children:e.jsx(E,{to:"https://github.com/stanford-crfm/helm",children:"GitHub"})})]})]})}),e.jsxs("div",{className:"flex-1 items-center",children:[e.jsx(E,{to:"https://crfm.stanford.edu/",className:"w-24",children:e.jsx("img",{src:De,className:"object-contain"})}),e.jsx(E,{to:"/",className:"mx-2 
w-32",children:e.jsx("img",{src:He,className:"object-contain"})}),e.jsx(ze,{})]}),e.jsx("div",{className:"flex-none hidden md:block",children:e.jsxs("ul",{className:"flex flex-row gap-6 px-1",children:[e.jsx("li",{children:e.jsx(E,{to:"leaderboard",children:"Leaderboard"})}),e.jsx("li",{children:e.jsx(E,{to:"models",children:"Models"})}),e.jsx("li",{children:e.jsx(E,{to:"scenarios",children:"Scenarios"})}),e.jsx("li",{children:e.jsx(E,{to:"runs",children:"Predictions"})}),e.jsx("li",{children:e.jsx(E,{to:"https://github.com/stanford-crfm/helm",children:"GitHub"})}),e.jsx("li",{className:"hidden lg:flex",children:e.jsx(dt,{})})]})})]})}function ut(){return e.jsxs("nav",{className:"navbar h-24 px-8 md:px-12 bg-base-100 max-w[1500]px",children:[e.jsx("div",{children:e.jsx("div",{className:"dropdown md:hidden mr-4",children:e.jsx("label",{tabIndex:0,className:"btn btn-ghost hover:bg-transparent btn-lg px-0",children:e.jsx(Ie,{className:"w-16 h-16"})})})}),e.jsxs("div",{className:"flex-1 items-center",children:[e.jsx(E,{to:"https://crfm.stanford.edu/",className:"w-24",children:e.jsx("img",{src:De,className:"object-contain"})}),e.jsx(E,{to:"/",className:"mx-2 w-32",children:e.jsx("img",{src:He,className:"object-contain"})}),e.jsx(ze,{})]})]})}function ht(){return e.jsxs(e.Fragment,{children:[window.PROJECT_ID==="global"?e.jsx(ut,{}):e.jsx(mt,{}),e.jsx("main",{className:"p-8 pt-0",children:e.jsx("div",{className:"mx-auto max-w-[1500]px",children:e.jsx(Ms,{})})})]})}async function H(s){try{return await(await fetch(P(`${W()}/schema.json`),{signal:s})).json()}catch(t){return console.log(t),{adapter:[],metric_groups:[],metrics:[],models:[],perturbations:[],run_groups:[]}}}function xt({href:s,children:t}){return e.jsx("a",{href:s,className:"link link-primary link-hover",target:"_blank",rel:"noreferrer",children:t})}function q({value:s}){return e.jsx("span",{children:e.jsx(ks,{components:{a:xt},children:s})})}function I({title:s,subtitle:t,markdown:r=!1}){return 
e.jsxs("header",{className:"m-4 ml-0",children:[e.jsx("h1",{className:"text-4xl",children:s}),r&&t!==void 0?e.jsx("h2",{className:"mt-2 text-neutral",children:e.jsx(q,{value:t})}):t!==void 0&&e.jsx("h2",{className:"mt-2 text-neutral",children:t})]})}const ft={open:"green",limited:"yellow",closed:"red"},pt={open:"Open",limited:"Limited",closed:"Closed"};function jt({level:s}){return e.jsx(X,{color:ft[s],children:pt[s]})}function B(){return e.jsx("div",{className:"w-full",children:e.jsx("div",{className:"block mx-auto my-24 loading loading-spinner loading-lg"})})}function gt(){const[s,t]=a.useState([]);a.useEffect(()=>{const i=new AbortController;async function m(){const x=await H(i.signal);t(x.models)}return m(),()=>i.abort()},[]);const[r,n,l]=s.reduce((i,m)=>{switch(m.access){case"open":i[0]+=1;break;case"limited":i[1]+=1;break;case"closed":i[2]+=1;break}return i},[0,0,0]),o=Object.values(s.reduce((i,m)=>{const x=m.creator_organization;return i[x]===void 0?(i[x]={name:x,models:1},i):(i[x].models+=1,i)},{}));return s.length===0?e.jsx(B,{}):e.jsxs(e.Fragment,{children:[e.jsx(I,{title:"Models"}),e.jsxs("div",{className:"overflow-x-auto mt-12",children:[e.jsxs("table",{className:"table",children:[e.jsx("thead",{children:e.jsxs("tr",{children:[e.jsx("th",{children:"Creator"}),e.jsx("th",{children:"Model"}),e.jsx("th",{children:"Description"}),e.jsx("th",{children:"Access"})]})}),e.jsx("tbody",{children:s.map(i=>e.jsxs("tr",{children:[e.jsx("td",{className:"text-lg",children:i.creator_organization}),e.jsxs("td",{children:[e.jsx("span",{className:"text-xl",children:i.display_name}),e.jsx("br",{}),e.jsx("span",{children:i.name})]}),e.jsx("td",{children:e.jsx(q,{value:i.description})}),e.jsx("td",{children:e.jsx(jt,{level:i.access})})]}))})]}),e.jsx(I,{title:"Analysis"}),e.jsxs("div",{className:"grid md:grid-cols-3 grid-cols-1 gap-8",children:[e.jsxs(K,{className:"flex flex-col justify-between",children:[e.jsx(ce,{children:"Models"}),e.jsx(Te,{className:"text-6xl 
md:!text-[96px]",children:s.length}),e.jsx(Ps,{values:[r,n,l],colors:["green","yellow","red"]}),e.jsx(Me,{categories:["Open","Limited","Closed"],colors:["green","yellow","red"]})]}),e.jsxs(K,{className:"md:col-span-2",children:[e.jsx(ce,{children:"Creator Organizations"}),e.jsxs("div",{className:"flex justify-between mt-4",children:[e.jsx(_s,{data:o,category:"models",index:"name",variant:"pie",className:"basis-5/12"}),e.jsx(Me,{categories:o.map(i=>i.name),className:"basis-7/12"})]})]})]})]})]})}function te({to:s,children:t,inTable:r=!1,title:n=""}){return r?e.jsx(E,{className:"link link-hover",to:s,title:n,children:t}):e.jsx(E,{className:"link link-primary link-hover",to:s,children:t})}function vt(){const[s,t]=a.useState([]);a.useEffect(()=>{const n=new AbortController;async function l(){const o=await H(n.signal);t(o.run_groups.filter(i=>!i.todo&&i.taxonomy&&!i.display_name.includes("CLEVA")))}return l(),()=>n.abort()},[]);const r=Object.values(s.reduce((n,l)=>{var i;const o=((i=l.taxonomy)==null?void 0:i.task)||"Unknown";return n[o]===void 0?(n[o]={name:o,value:1},n):(n[o].value+=1,n)},{}));return s.length===0?e.jsx(B,{}):(console.log(s),e.jsxs(e.Fragment,{children:[e.jsx(I,{title:"Scenarios",subtitle:"A scenario represents a use case and consists of a dataset of instances."}),e.jsxs("div",{className:"overflow-x-auto mt-12",children:[e.jsxs("table",{className:"table",children:[e.jsx("thead",{children:e.jsxs("tr",{children:[e.jsx("th",{children:"Scenario"}),e.jsx("th",{children:"Task"}),e.jsx("th",{children:"What"}),e.jsx("th",{children:"Who"}),e.jsx("th",{children:"When"}),e.jsx("th",{children:"Language"}),e.jsx("th",{children:"Description"})]})}),e.jsx("tbody",{children:s.map(n=>{var l,o,i,m,x;return e.jsxs("tr",{children:[e.jsxs("td",{children:[e.jsx(te,{to:`/groups/${n.name}`,children:e.jsx("span",{className:"text-lg",children:n.display_name})}),e.jsx("span",{className:"block",children:n.name})]}),e.jsx("td",{children:((l=n.taxonomy)==null?void 
0:l.task)||""}),e.jsx("td",{children:((o=n.taxonomy)==null?void 0:o.what)||""}),e.jsx("td",{children:((i=n.taxonomy)==null?void 0:i.who)||""}),e.jsx("td",{children:((m=n.taxonomy)==null?void 0:m.when)||""}),e.jsx("td",{children:((x=n.taxonomy)==null?void 0:x.language)||""}),e.jsx("td",{children:e.jsx(q,{value:n.description})})]})})})]}),e.jsx(I,{title:"Analysis"}),e.jsxs("div",{className:"grid md:grid-cols-4 gap-8",children:[e.jsxs(K,{className:"flex flex-col",children:[e.jsx(ce,{children:"Total scenarios"}),e.jsx(Te,{className:"mx-auto my-6 md:mt-16 !text-[72px] md:!text-[96px]",children:s.length})]}),e.jsx(K,{className:"col-span-3",children:e.jsxs("div",{className:"grid md:grid-cols-2 gap-x-12",children:[e.jsx(ke,{data:r.slice(0,Math.floor(r.length/2))}),e.jsx(ke,{data:r.slice(Math.ceil(r.length/2))})]})})]})]})]}))}function Ge(){return P(`${W()}/groups.json`)}async function fe(s){try{return await(await fetch(Ge(),{signal:s})).json()}catch(t){return console.log(t),[]}}function re({children:s}){return e.jsx("div",{role:"navigation",className:"tabs flex-nowrap border-b-2 border-gray-2 overflow-x-auto overflow-y-hidden",children:s})}function V({active:s=!1,onClick:t=()=>{},size:r="md",children:n}){return e.jsx("div",{onClick:t,className:`whitespace-nowrap text-${r} mb-[-2px] text-md tab tab-bordered${s?" 
border-2 border-grey-500 rounded":" border-none"}`,children:n})}function bt({title:s,titleId:t,...r},n){return a.createElement("svg",Object.assign({xmlns:"http://www.w3.org/2000/svg",viewBox:"0 0 20 20",fill:"currentColor","aria-hidden":"true",ref:n,"aria-labelledby":t},r),s?a.createElement("title",{id:t},s):null,a.createElement("path",{fillRule:"evenodd",d:"M4.25 5.5a.75.75 0 00-.75.75v8.5c0 .414.336.75.75.75h8.5a.75.75 0 00.75-.75v-4a.75.75 0 011.5 0v4A2.25 2.25 0 0112.75 17h-8.5A2.25 2.25 0 012 14.75v-8.5A2.25 2.25 0 014.25 4h5a.75.75 0 010 1.5h-5z",clipRule:"evenodd"}),a.createElement("path",{fillRule:"evenodd",d:"M6.194 12.753a.75.75 0 001.06.053L16.5 4.44v2.81a.75.75 0 001.5 0v-4.5a.75.75 0 00-.75-.75h-4.5a.75.75 0 000 1.5h2.553l-9.056 8.194a.75.75 0 00-.053 1.06z",clipRule:"evenodd"}))}const wt=a.forwardRef(bt),Ae=wt;function G(s){return Number.isNaN(Number(s))?String(s):String(Math.round(Number(s)*1e3)/1e3)}function Y({value:s,title:t,hideIcon:r}){if(typeof s.value=="string"&&s.value.includes("⚠")&&(s.value=s.value.replace("⚠","")),s.value===void 0)return"-";if(s.run_spec_names){const n=(()=>{if(s.run_spec_names.length==1)return"/runs/"+s.run_spec_names[0];if(s.run_spec_names.length>1){const l="/runs/?q="+s.run_spec_names.map(i=>`^${i}$`).join("|");return encodeURI(l)}})();return n?e.jsx(te,{to:n,inTable:!0,title:t,children:e.jsxs("div",{className:"flex items-center",children:[G(s.value),!r&&e.jsx(Ae,{className:"w-3 h-3 ml-1 opacity-30"})]})}):t?e.jsx("a",{title:t,children:G(s.value)}):e.jsx(e.Fragment,{children:G(s.value)})}return s.href?e.jsx(te,{to:s.href,inTable:!0,title:t,children:e.jsxs("div",{className:"flex items-center",children:[G(s.value),!r&&e.jsx(Ae,{className:"w-3 h-3 ml-1 opacity-30"})]})}):s.markdown?e.jsx(q,{value:String(s.value)}):t?e.jsx("a",{title:t,children:G(s.value)}):e.jsx(e.Fragment,{children:G(s.value)})}function qe({groupsTables:s,activeGroup:t,ignoreHref:r=!1,sortable:n=!0,sortFirstMetric:l=!0}){const[o,i]=a.useState(l?1:void 
0),[m,x]=a.useState({...s[t]}),[N,S]=a.useState(1);a.useEffect(()=>{x({...s[t]})},[t,s]);const k=b=>{let w=N;o===b?w=w*-1:w=1,i(b),S(w),x(f=>{const v={...f};return v.rows.sort((R,u)=>{var p,c;const y=(p=R[b])==null?void 0:p.value,d=(c=u[b])==null?void 0:c.value;return y!==void 0&&d===void 0?-1:d!==void 0&&y===void 0?1:typeof y=="number"&&typeof d=="number"?(y-d)*w:typeof y=="string"&&typeof d=="string"?w===1?y.localeCompare(d):d.localeCompare(y):0}),v})};return a.useEffect(()=>{l&&o&&k(o)},[l,o]),e.jsx("div",{children:e.jsxs("table",{className:"rounded-lg shadow-md table",children:[e.jsx("thead",{children:e.jsx("tr",{children:m.header.map((b,w)=>e.jsx("th",{className:`${w===o?"bg-gray-100 ":"bg-white"} ${w===0?"left-0 z-10":""} whitespace-nowrap sticky top-0`,children:e.jsxs("div",{className:"flex gap-2 items-center",children:[e.jsx("span",{children:b.value}),n?e.jsx("button",{className:"link",onClick:()=>k(w),children:e.jsx(Fe,{className:"w-6 h-6"})}):null]})},`${t}-${w}`))})}),e.jsx("tbody",{children:m.rows.map((b,w)=>e.jsx("tr",{children:b.map((f,v)=>e.jsx("td",{className:`${v==0?"text-lg sticky left-0":""} ${o===v?"bg-gray-100":"bg-white"}`,children:e.jsx("div",{className:f&&f.style&&f.style["font-weight"]&&f.style["font-weight"]==="bold"?"font-bold":"",children:e.jsx(Y,{ignoreHref:r&&v===0,value:f})})},`${t}-${v}`))},`${t}-${w}`))})]})})}function yt(){const[s,t]=a.useState(0),[r,n]=a.useState([]),[l,o]=a.useState([]);return a.useEffect(()=>{const i=new AbortController;async function m(){const x=await fe(i.signal);o(x),n(x.map(N=>N.title))}return m(),()=>i.abort()},[]),l.length===0?e.jsx(B,{}):e.jsxs(e.Fragment,{children:[e.jsxs("div",{className:"flex justify-between",children:[e.jsx(I,{title:"Results",subtitle:"Groupings of the processes, methods, and metrics involved in evaluating models, particularly in the context of natural language understanding and question answering.",className:"mb-16"}),e.jsxs("a",{className:"flex link-primary space-between 
items-center self-end link link-hover block",href:Ge(),download:"true",target:"_blank",children:[e.jsx(Oe,{className:"w-6 h-6 mr-2"})," JSON"]})]}),e.jsx("div",{children:e.jsx(re,{children:r.map((i,m)=>e.jsx(V,{onClick:()=>t(m),active:s===m,size:"lg",children:i},m))})}),e.jsx("div",{className:"mt-8",children:e.jsx(qe,{sortable:!1,groupsTables:l,activeGroup:s})})]})}async function pe(s,t){try{return await(await fetch(P(`${W()}/groups/${s}.json`),{signal:t})).json()}catch(r){return console.log(r),[]}}async function je(s){try{return await(await fetch(P(`${W()}/groups_metadata.json`),{signal:s})).json()}catch(t){return console.log(t),{}}}function Nt(){const{groupName:s}=Pe(),[t,r]=a.useState([]),[n,l]=a.useState(),[o,i]=a.useState(!0),[m,x]=a.useState(0);return a.useEffect(()=>{const N=new AbortController;async function S(){if(s===void 0)return;const[k,b]=await Promise.all([pe(s,N.signal),je(N.signal)]);r(k),l(b[s]),i(!1)}return S(),()=>N.abort()},[s]),o||n===void 0?e.jsx(B,{}):t.length===0?e.jsxs(e.Fragment,{children:[e.jsx(I,{title:n.display_name,subtitle:n.description,markdown:!0,className:"mr-8"}),e.jsx("div",{className:"divider"}),e.jsx("p",{className:"text-center mt-8",children:"Group currently has no results."})]}):e.jsxs(e.Fragment,{children:[e.jsx("div",{className:"flex flex-row justify-between",children:e.jsx(I,{title:n.display_name,subtitle:n.description,markdown:!0,className:"mr-8 mb-16"})}),e.jsx("div",{className:"overflow-x-auto",children:t.length>1?e.jsx(re,{children:t.map((N,S)=>e.jsx(V,{active:S===m,onClick:()=>x(S),children:N.title},S))}):null}),e.jsx(qe,{groupsTables:t,activeGroup:m,ignoreHref:!0})]})}async function Je(s){try{return await(await fetch(P(`${W()}/run_specs.json`),{signal:s})).json()}catch(t){return console.log(t),[]}}function me({currentPage:s,totalPages:t,onNextPage:r,onPrevPage:n,className:l}){let o="join";return l!==void 0&&(o=`join ${l}`),e.jsxs("div",{className:o,children:[e.jsx("button",{onClick:n,className:"join-item 
btn",children:"«"}),e.jsxs("button",{className:"join-item btn",children:["Page ",s," of ",t]}),e.jsx("button",{onClick:r,className:"join-item btn",children:"»"})]})}const le=100;function St(){const[s,t]=_e(),[r,n]=a.useState([]),[l,o]=a.useState(Number(s.get("page")||1)),[i,m]=a.useState(1),[x,N]=a.useState([]),[S,k]=a.useState(!0),[b,w]=a.useState(s.get("q")||"");a.useEffect(()=>{const u=new AbortController;async function y(){const d=await Je(u.signal);n(d),f(b,d)}return y(),()=>u.abort()},[b]),a.useEffect(()=>{f(b,r)},[r,b]);function f(u,y){const d=S?new RegExp(u):null,p=y.filter(c=>d?d.test(c.name):c.name.includes(u));N(p),m(Math.ceil(p.length/le))}const v=u=>{u.preventDefault();const d=u.target.q.value;w(d),t({q:d,page:"1"}),f(d,r)},R=x.slice((l-1)*le,l*le);return r.length===0?e.jsx(B,{}):e.jsxs(e.Fragment,{children:[e.jsx(I,{title:"Predictions",subtitle:"All benchmark predictions"}),e.jsxs("form",{className:"flex mb-8",onSubmit:v,children:[e.jsxs("div",{className:"form-control",children:[e.jsx("input",{type:"text",name:"q",placeholder:"Search",className:"input input-bordered",value:b,onChange:u=>w(u.target.value)}),e.jsxs("label",{className:"label",children:[e.jsxs("span",{className:"label-text-alt flex item-center",children:[e.jsx("input",{type:"checkbox",className:"toggle toggle-xs",checked:S,onChange:()=>k(!S)}),e.jsx("span",{className:"ml-2",children:"Regex"})]}),e.jsx("span",{className:"label-text-alt",children:`${x.length} results`})]})]}),e.jsx("div",{className:"form-control ml-4",children:e.jsx("button",{className:"btn",children:e.jsx(ot,{className:"w-6 h-6"})})})]}),e.jsx("div",{className:"overflow-x-auto",children:e.jsxs("table",{className:"table",children:[e.jsx("thead",{children:e.jsxs("tr",{children:[e.jsx("th",{children:"Run"}),e.jsx("th",{children:"Model"}),e.jsx("th",{children:"Groups"}),e.jsx("th",{children:"Adapter method"}),e.jsx("th",{children:"Subject / 
Task"})]})}),e.jsx("tbody",{children:R.map((u,y)=>e.jsxs("tr",{children:[e.jsx("td",{children:e.jsx(te,{to:`/runs/${u.name}`,children:u.name})}),e.jsx("td",{children:u.adapter_spec.model}),e.jsx("td",{children:u.groups.join(", ")}),e.jsx("td",{children:u.adapter_spec.method}),e.jsx("td",{children:u.scenario_spec.args.subject||u.scenario_spec.args.task||"-"})]},`${u.name}-${y}`))})]})}),i>0?e.jsx(me,{className:"flex justify-center my-8",onNextPage:()=>{const u=Math.min(l+1,i);o(u),s.set("page",String(u)),t(s)},onPrevPage:()=>{const u=Math.max(l-1,1);o(u),s.set("page",String(u)),t(s)},currentPage:l,totalPages:i}):e.jsx("div",{className:"my-8 text-center",children:"No results"})]})}function F(){return window.SUITE!==void 0?window.SUITE:void 0}async function Et(s,t,r){try{return await(await fetch(P(`/runs/${r||F()}/${s}/instances.json`),{signal:t})).json()}catch(n){return console.log(n),[]}}async function Rt(s,t,r){try{return await(await fetch(P(`/runs/${r||F()}/${s}/stats.json`),{signal:t})).json()}catch(n){return console.log(n),[]}}async function Mt(s,t,r){try{return await(await fetch(P(`/runs/${r||F()}/${s}/display_requests.json`),{signal:t})).json()}catch(n){return console.log(n),[]}}async function kt(s,t,r){try{return await(await fetch(P(`/runs/${r||F()}/${s}/display_predictions.json`),{signal:t})).json()}catch(n){return console.log(n),[]}}async function Lt(s,t,r){try{return await(await fetch(P(`/runs/${r||F()}/${s}/scenario.json`),{signal:t})).json()}catch(n){console.log(n);return}}function At(s,t){return P(`/runs/${t||F()}/${s}/run_spec.json`)}function Ct(s,t){return P(`/runs/${t||F()}/${s}/scenario_state.json`)}function ie({value:s}){return e.jsx(X,{icon:Gs,color:"green",children:s})}function oe({value:s}){return e.jsx(X,{icon:Ws,color:"red",children:s})}function Pt({stats:s}){if(s.quasi_exact_match!==void 0)return s.quasi_exact_match?e.jsx(ie,{value:`quasi exact match: ${s.quasi_exact_match}`}):e.jsx(oe,{value:`quasi exact match: 
${s.quasi_exact_match}`});if(s.toxic_frac!==void 0)return s.toxic_frac>0?e.jsx(oe,{value:`toxic frac: ${s.toxic_frac}`}):e.jsx(ie,{value:`toxic frac: ${s.toxic_frac}`});if(s.exact_match!==void 0)return s.exact_match>0?e.jsx(ie,{value:`exact match: ${s.exact_match}`}):e.jsx(oe,{value:`exact match: ${s.exact_match}`})}function J({value:s}){const[t,r]=a.useState(!1),[n,l]=a.useState(!1);return e.jsxs(e.Fragment,{children:[e.jsxs("div",{onMouseOver:()=>r(!0),onMouseOut:()=>r(!1),className:"relative",children:[e.jsx("div",{className:"bg-base-200 p-2 block overflow-auto w-full max-h-[36rem] mb-2 whitespace-pre-wrap",children:s}),t?e.jsx("button",{className:"bg-white absolute p-2 leading-none height-fit min-h-none right-1 bottom-1 shadow",onClick:()=>l(!0),children:e.jsx(st,{fill:"black",color:"black",className:"text w-4 h-4"})}):null]}),e.jsx("dialog",{open:n,className:"modal p-16 bg-opacity-80 bg-white",onClick:()=>l(!1),children:e.jsx("div",{className:"modal-box max-w-none p-4 whitespace-pre-wrap bg-base-200",children:s})})]})}function We({mediaObject:s}){if(s.content_type.includes("image")){if(s.location===void 0)return null;const t=P(s.location.replace("benchmark_output/","").replace("prod_env/","../"));return e.jsxs("div",{children:[e.jsx("img",{src:t}),e.jsx("br",{})]})}else return s.text&&s.content_type&&s.content_type==="text/plain"&&s.text.length>1?e.jsxs("div",{children:[s.text,e.jsx("br",{}),e.jsx("br",{})]}):e.jsx("div",{})}function Ze({multimediaObject:s}){return e.jsx("div",{children:s.media_objects.map(t=>e.jsx(We,{mediaObject:t}))})}function _t(s){return Array.isArray(s)?s.length==0?"[]":`[${s.map(t=>String(t).replace(/\n/,"\\n")).join(", ")}]`:String(s)}function Tt({request:s}){return e.jsxs("div",{children:[s.request.prompt.length>0?e.jsxs("div",{children:[e.jsxs("h3",{className:"block text text-gray-400",children:["Prompt (",s.request.prompt.length," 
Chars)"]}),e.jsx(J,{value:s.request.prompt})]}):s.request.multimodal_prompt?e.jsxs("div",{children:[e.jsx("h3",{className:"block text text-gray-400",children:"Prompt"}),e.jsx(Ze,{multimediaObject:s.request.multimodal_prompt})]}):e.jsx("h3",{className:"block text text-gray-400",children:"Empty Prompt"}),e.jsx(ue,{children:Object.keys(s.request).filter(t=>t!=="prompt").map((t,r)=>e.jsxs(he,{children:[e.jsxs("span",{children:[t,":"]}),s.request&&s.request[t]?e.jsx("span",{children:_t(s.request[t])}):"null"]},r+1))})]})}function $t({predictionAnnotations:s}){return e.jsx("div",{children:s&&s!==void 0?Object.entries(s).map(([t,r])=>e.jsxs("div",{children:[e.jsx("h3",{children:e.jsx("strong",{children:t})}),r.map((n,l)=>e.jsxs("div",{children:[n.error&&e.jsxs("div",{children:[e.jsx("h3",{className:"ml-1",children:"Error"}),e.jsx(J,{value:n.error})," "]}),n.text&&e.jsxs("div",{children:[e.jsx("h3",{className:"ml-1",children:"Text"}),e.jsx(J,{value:n.text})," "]}),n.media_object&&e.jsx(We,{mediaObject:n.media_object})]},l))]},t)):null})}function Ut({predictions:s,requests:t,metricFieldMap:r}){return s.length<1?null:e.jsx("div",{children:e.jsx("div",{className:"flex flex-wrap justify-start items-start",children:s.map((n,l)=>e.jsxs("div",{className:"w-full",children:[s.length>1?e.jsxs("h2",{children:["Trial ",l]}):null,e.jsx("div",{className:"mt-2 w-full",children:n.base64_images&&n.base64_images.length>0?e.jsxs(e.Fragment,{children:[e.jsx("h3",{className:"mr-4",children:"Prediction image"}),n.base64_images.map(o=>e.jsx("img",{src:"data:image;base64,"+o,alt:"Base64 Image"}))]}):e.jsxs(e.Fragment,{children:[e.jsxs("h3",{children:[e.jsx("span",{className:"mr-4",children:"Prediction raw text"}),e.jsx(Pt,{stats:n.stats})]}),e.jsx(J,{value:n.predicted_text}),n.mapped_output?e.jsxs(e.Fragment,{children:[e.jsx("h3",{children:"Prediction mapped 
output"}),e.jsx(J,{value:String(n.mapped_output)})]}):null]})}),e.jsx($t,{predictionAnnotations:n.annotations}),e.jsx("h3",{children:"Metrics"}),e.jsx(ue,{children:Object.keys(n.stats).map((o,i)=>e.jsxs(he,{children:[r[o]?e.jsx("span",{title:r[o].description,children:r[o].display_name}):e.jsx("span",{children:o}),e.jsx("span",{children:String(n.stats[o])})]},i))}),e.jsxs("details",{className:"collapse collapse-arrow border rounded-md bg-white",children:[e.jsx("summary",{className:"collapse-title",children:"Request details"}),e.jsx("div",{className:"collapse-content",children:e.jsx(Tt,{request:t[l]})})]})]},l))})})}const It="correct";function Dt({references:s}){return e.jsxs("span",{children:[e.jsx("h3",{children:"References"}),e.jsx("ul",{children:s.map((t,r)=>e.jsxs("li",{className:"bg-base-200 p-2 block overflow-auto w-full max-h-72 mb-2 whitespace-pre-wrap",children:[t.output.text,t.tags.map(n=>e.jsx(X,{className:"mx-2",color:n===It?"green":void 0,children:n}))]},r))})]})}function Ht({instance:s,requests:t,predictions:r,metricFieldMap:n}){return e.jsxs("div",{className:"border p-4",children:[e.jsx("h3",{className:"text-xl mb-4",children:`Instance id: ${s.id} [split: ${s.split}]`}),e.jsx("h3",{children:"Input"}),s.input.multimedia_content!==void 0?e.jsx(Ze,{multimediaObject:s.input.multimedia_content}):s.input.text.includes('<br><img src="data:image;base64')?e.jsx("div",{dangerouslySetInnerHTML:{__html:s.input.text}}):e.jsx(J,{value:s.input.text}),e.jsx("div",{children:s.references&&s.references.length>0?e.jsx(Dt,{references:s.references}):null}),e.jsx("div",{children:r&&t?e.jsx(Ut,{predictions:r,requests:t,metricFieldMap:n}):null})]})}function Ot({stat:s,metricFieldMap:t}){const r=`${s.name.split!==void 0?` on ${s.name.split}`:""}${s.name.sub_split!==void 0?`/${s.name.sub_split}`:""}${s.name.perturbation!==void 0?` with ${s.name.perturbation.name}`:" original"}`;return 
t[s.name.name]?e.jsxs("span",{title:t[s.name.name].description,children:[e.jsx("strong",{children:t[s.name.name].display_name||s.name.name}),r]}):e.jsxs("span",{children:[e.jsx("strong",{children:s.name.name}),r]})}function Ke(){return window.RELEASE!==void 0?window.RELEASE:void 0}async function Bt(s){try{return await(await fetch(P(`/releases/${Ke()}/runs_to_run_suites.json`),{signal:s})).json()}catch(t){return console.log(t),{}}}function Ft(s,t){return Ke()?s[t]:window.SUITE}const ee=10,se=50;function zt(){const{runName:s}=Pe(),[t,r]=_e(),[n,l]=a.useState(0),[o,i]=a.useState(),[m,x]=a.useState(),[N,S]=a.useState([]),[k,b]=a.useState([]),[w,f]=a.useState(),[v,R]=a.useState(),[u,y]=a.useState(1),[d,p]=a.useState(1),[c,h]=a.useState(1),[j,_]=a.useState(1),[A,T]=a.useState(),[C,$]=a.useState(),[Z,Q]=a.useState({}),[be,ps]=a.useState({}),[we,js]=a.useState("");if(a.useEffect(()=>{const g=new AbortController;async function O(){const U=g.signal;if(s===void 0)return()=>g.abort();const z=window.SUITE?window.SUITE:Ft(await Bt(U),s);x(z);const[ye,Ne,Se,bs,ws,ys]=await Promise.all([Je(U),Et(s,U,z),Rt(s,U,z),Lt(s,U,z),kt(s,U,z),Mt(s,U,z)]);i(ye.find(M=>M.name===s)),S(Ne);const Ee=Math.ceil(Ne.length/ee),Ns=Number(t.get("instancesPage")||1);p(Ee),y(Math.max(Math.min(Ns,Ee),1)),b(Se),$(bs);const Re=Math.floor(Se.length/se),Ss=Number(t.get("metricsPage")||1);_(Re),h(Math.max(Math.min(Ss,Re),1)),f(ws.reduce((M,L)=>(M[L.instance_id]===void 0&&(M[L.instance_id]=[]),M[L.instance_id].push(L),M),{})),R(ys.reduce((M,L)=>(M[L.instance_id]===void 0&&(M[L.instance_id]=[]),M[L.instance_id].push(L),M),{}));const ae=await H(U);ps(ae.metrics.reduce((M,L)=>(M[L.name]=L,M),{})),Q(ae.adapter.reduce((M,L)=>(M[L.name]=L,M),{})),T(ae.models.find(M=>{var L;return M.name===((L=ye.find(Es=>Es.name===s))==null?void 0:L.adapter_spec.model)}))}return O(),()=>g.abort()},[s,t]),o===void 0||w===void 0||v===void 0||C===void 0)return e.jsx(B,{});const 
gs=N.slice((u-1)*ee,(u-1)*ee+ee),vs=k.slice((c-1)*se,(c-1)*se+se);return e.jsxs(e.Fragment,{children:[e.jsx("div",{className:"flex justify-between gap-8 mb-12",children:e.jsxs("div",{children:[e.jsxs("h1",{className:"text-3xl flex items-center",children:[C.name,e.jsx("a",{href:"/#/groups/"+C.name,children:e.jsx(Xs,{className:"w-6 h-6 ml-2"})})]}),e.jsx("h3",{className:"text-xl",children:e.jsx(q,{value:C.description})}),e.jsx("h1",{className:"text-3xl mt-2",children:o.adapter_spec.model}),e.jsx("h3",{className:"text-xl",children:e.jsx(q,{value:(A==null?void 0:A.description)||""})}),e.jsx("div",{className:"mt-2 flex gap-2",children:C.tags.map(g=>e.jsx(X,{size:"xs",color:"gray",children:e.jsx("span",{className:"text text-md",children:g})}))})]})}),e.jsxs(K,{children:[e.jsxs("div",{className:"flex justify-between",children:[e.jsx("h3",{className:"text-lg mb-1",children:"Adapter Specification"}),e.jsxs("div",{className:"flex gap-2",children:[e.jsx(Oe,{className:"w-6 h-6 mr-1 text text-primary"}),e.jsx("a",{className:"link link-primary link-hover",href:At(o.name,m),download:"true",target:"_blank",children:"Spec JSON"}),e.jsx("a",{className:"link link-primary link-hover",href:Ct(o.name,m),download:"true",target:"_blank",children:"Full JSON"})]})]}),e.jsx("div",{children:e.jsx(ue,{className:"grid md:grid-cols-2 lg:grid-cols-3 gap-x-8",children:Object.entries(o.adapter_spec).map(([g,O],U)=>e.jsxs(he,{className:U<3?"!border-0":"",children:[e.jsx("strong",{className:"mr-1",title:Z[g]?Z[g].description:void 0,children:`${g}: `}),e.jsx("span",{className:"overflow-x-auto",children:O})]}))})})]}),e.jsx("div",{className:"mt-16 mb-8",children:e.jsxs(re,{children:[e.jsx(V,{size:"lg",active:n===0,onClick:()=>l(0),children:"Instances + Predictions"}),e.jsx(V,{size:"lg",active:n===1,onClick:()=>l(1),children:"All metrics"})]})}),n===0?e.jsxs(e.Fragment,{children:[e.jsx("div",{className:"grid 
gap-8",children:gs.map((g,O)=>e.jsx(Ht,{instance:g,requests:v[g.id],predictions:w[g.id],metricFieldMap:be},`${g.id}-${O}`))}),e.jsx(me,{className:"flex justify-center my-8",onNextPage:()=>{const g=Math.min(u+1,d);y(g),t.set("instancesPage",String(g)),r(t)},onPrevPage:()=>{const g=Math.max(u-1,1);y(g),t.set("instancesPage",String(g)),r(t)},currentPage:u,totalPages:d})]}):e.jsxs("div",{children:[e.jsx("div",{className:"flex justify-start my-4",children:e.jsx("input",{type:"text",className:"input input-bordered w-full max-w-xs",placeholder:"Search for a metric",onChange:g=>js(g.target.value)})}),e.jsx("div",{className:"overflow-x-auto",children:e.jsxs("table",{className:"table",children:[e.jsx("thead",{children:e.jsx("tr",{children:Object.keys(k[0]).map(g=>e.jsx("th",{children:g},g))})}),e.jsx("tbody",{children:vs.filter(g=>!we||g.name.name.toLowerCase().includes(we.toLowerCase())).map(g=>e.jsx("tr",{children:Object.entries(g).map(([O,U])=>O==="name"?e.jsx("td",{children:e.jsx(Ot,{stat:g,metricFieldMap:be})},O):e.jsx("td",{children:U}))}))})]})}),e.jsx(me,{className:"flex justify-center my-8",onNextPage:()=>{const g=Math.min(c+1,j);h(g),t.set("metricsPage",String(g)),r(t)},onPrevPage:()=>{const g=Math.max(c-1,1);h(g),t.set("metricsPage",String(g)),r(t)},currentPage:c,totalPages:j})]})]})}function Gt({groupsTables:s,activeGroup:t,sortable:r=!0,sortFirstMetric:n=!0}){const[l,o]=a.useState(n?1:void 0),[i,m]=a.useState({...s[t]}),[x,N]=a.useState(1);function S(d){return d.length>30?d.substring(0,27)+"...":d}const k=d=>d.value==="Model/adapter"?"Model":d.value.includes("-book")?S(d.value.replace("-book","")):S(d.value),[b,w]=a.useState(void 0);a.useEffect(()=>{const d=new AbortController;async function p(){const c=await H(d.signal);w(c)}return p(),()=>d.abort()},[]);const f=d=>{if(b){const p=b.models.find(c=>c.display_name===d);if(p){let c=p.description;return c.includes("/")&&(c=c.replace("/","_")),c}}return""},v=d=>{if(b){const 
p=b.models.find(c=>c.display_name===d);if(p){let c=p.name;return c.includes("/")&&(c=c.replace("/","_")),c}}return""};function R(d){const p=d.lastIndexOf(" - ");return p===-1?d:d.substring(0,p)+"*"+d.substring(p+1)}const u=d=>{const c=R(d).split("*")[0].trim();if(b){const h=b.run_groups.find(j=>j.display_name===c||j.short_display_name===c);if(h)return h.name}return""};a.useEffect(()=>{m({...s[t]})},[t,s]);const y=d=>{let p=x;l===d?p=p*-1:p=1,o(d),N(p),m(c=>{const h={...c};return h.rows.sort((j,_)=>{var C,$;const A=(C=j[d])==null?void 0:C.value,T=($=_[d])==null?void 0:$.value;return A!==void 0&&T===void 0?-1:T!==void 0&&A===void 0?1:typeof A=="number"&&typeof T=="number"?(A-T)*p:typeof A=="string"&&typeof T=="string"?p===1?A.localeCompare(T):T.localeCompare(A):0}),h})};return a.useEffect(()=>{n&&l&&y(l)},[n,l]),e.jsx(e.Fragment,{children:e.jsx("div",{children:e.jsx("div",{children:e.jsxs("table",{className:"rounded-lg shadow-md table",children:[e.jsx("thead",{children:e.jsx("tr",{children:i.header.map((d,p)=>e.jsx("th",{className:`${p===l?"bg-gray-100":"bg-white"} ${p===0?"left-0 z-10":""} ${d.description?"underline decoration-dashed decoration-gray-300 ":""} whitespace-nowrap px-4 sticky top-0`,title:d.description?d.description:"",children:e.jsxs("div",{className:"flex justify-between items-center min-w-48 w-48 max-w-48 text-wrap",children:[e.jsx("span",{className:"inline-block w-full break-words",children:k(d)}),r?e.jsx("button",{className:"link",onClick:()=>y(p),children:e.jsx(Fe,{className:"w-6 h-6"})}):null]})},`${t}-${p}`))})}),e.jsx("tbody",{children:i.rows.map((d,p)=>e.jsx("tr",{children:d.map((c,h)=>e.jsx("td",{className:`${h===0?"text-lg sticky left-0":""} ${p%2===0?"bg-gray-50":"bg-white"}`,children:h==1?e.jsx("div",{className:`${c&&c.style&&c.style["font-weight"]&&c.style["font-weight"]==="bold"?"font-bold":""}`,children:e.jsx(Y,{value:{...c,href:"/runs/?q="+v(String(d[0].value))},title:`Click value to see all predictions for: 
${v(String(d[0].value))}`})}):e.jsx("div",{className:`${c&&c.style&&c.style["font-weight"]&&c.style["font-weight"]==="bold"?"font-bold":""} ${h===0?"underline decoration-dashed decoration-gray-300":""}`,children:e.jsx(Y,{value:{...c},title:String(d[0].value)===c.value?f(String(d[0].value)):`Click value to see predictions for ${u(k(i.header[h]))}: ${v(String(d[0].value))}`})})},`${t}-${h}`))},`${t}-${p}`))})]})})})})}function qt(){const[s,t]=a.useState([]),[r,n]=a.useState(),[l,o]=a.useState([]),[i,m]=a.useState(),[x,N]=a.useState(!0),[S,k]=a.useState(0);function b(f,v){console.log(f,v);const R=f.find(u=>u.title===v);return R??f[0]}function w(f,v){n(b(f,v))}return a.useEffect(()=>{const f=new AbortController;async function v(){const R=await fe(f.signal),u=[];if(R.forEach(c=>{c.rows.forEach(h=>{u.push({title:String(h[0].value),name:h[0].href.replace("?group=","")})})}),t(u),u.length===0)throw new Error("Could not find any groups!");const y=r?r.name:u[0].name,[d,p]=await Promise.all([pe(y,f.signal),je(f.signal)]);o(d),m(p[y]),N(!1)}return v(),()=>f.abort()},[r]),x||i===void 0?e.jsx(B,{}):l.length===0?e.jsxs(e.Fragment,{children:[e.jsx(I,{title:i.display_name,subtitle:i.description,markdown:!0}),e.jsx("div",{className:"divider"}),e.jsx("p",{className:"text-center mt-8",children:"Group currently has no results."})]}):e.jsx(e.Fragment,{children:e.jsxs(e.Fragment,{children:[e.jsxs("div",{className:"flex flex-row justify-between",children:[e.jsx(I,{title:"HELM Leaderboard",subtitle:"The HELM leaderboard shows how the various models perform across different scenarios and metrics.",markdown:!0}),e.jsxs("div",{className:"w-64 pt-8",children:[e.jsx("label",{htmlFor:"group",className:"block text-sm font-medium text-gray-700",children:"Select a group:"}),e.jsx("select",{id:"group",name:"group",value:r?r.title:s[0].title,onChange:f=>w(s,f.target.value),className:"mt-1 block w-full pl-3 pr-10 py-2 text-base border-gray-300 focus:outline-none focus:ring focus:border-blue-300 
rounded-md",children:s.map((f,v)=>e.jsx("option",{value:f.title,children:f.title},v))})]})]}),e.jsx("div",{className:"overflow-x-auto",children:l.length>1?e.jsx(re,{children:l.map((f,v)=>e.jsx(V,{active:v===S,onClick:()=>k(v),children:f.title},v))}):null}),e.jsx(Gt,{groupsTables:l,activeGroup:S,ignoreHref:!0})]})})}const Jt=""+new URL("instruct-flowchart-48854f7c.svg",import.meta.url).href,Wt=""+new URL("instruct-graph-0a57d7d2.svg",import.meta.url).href;function Zt(){return e.jsxs("div",{className:"container mx-auto px-16",children:[e.jsx("h1",{className:"text-3xl mt-16 font-bold text-center",children:"HELM Instruct: A Multidimensional Instruction Following Evaluation Framework with Absolute Ratings"}),e.jsxs("div",{className:"flex flex-col sm:flex-row justify-center gap-2 sm:gap-8 md:gap-32 my-8",children:[e.jsx("a",{className:"px-10 btn rounded-md",href:"https://crfm.stanford.edu/2024/02/18/helm-instruct.html",children:"Blog Post"}),e.jsx("a",{className:"px-10 btn rounded-md",href:"https://github.com/stanford-crfm/helm",children:"Github"})]}),e.jsxs("p",{children:["We introduce ",e.jsx("em",{children:"HELM Instruct"}),", a multidimensional evaluation framework for instruction-following LLMs with absolute ratings. The framework takes an instruction, a model, an evaluator, and a criterion to generate a score. In our study, we use HELM Instruct to compare 4 instruction-following models on 7 scenarios based on 4 Human/LM evaluators and 5 criteria. 
Check out the blog post for more details."]}),e.jsxs("div",{className:"grid my-16 grid-cols-1 md:mx-32 md:grid-cols-2 md:gap-2",children:[e.jsx("img",{src:Jt,alt:"Evaluation flowchart",className:"mx-auto block",sizes:"100vw"}),e.jsx("img",{src:Wt,alt:"7 scenarios, 4 models, 4 evaluators and 5 criteria",className:"mx-auto block",sizes:"100vw"})]}),e.jsxs("table",{className:"rounded-lg shadow-md table",children:[e.jsx("thead",{children:e.jsxs("tr",{children:[e.jsx("th",{children:"Model"}),e.jsx("th",{children:"Average"}),e.jsx("th",{children:"Helpfulness"}),e.jsx("th",{children:"Understandability"}),e.jsx("th",{children:"Completeness"}),e.jsx("th",{children:"Conciseness"}),e.jsx("th",{children:"Harmlessness"})]})}),e.jsxs("tbody",{children:[e.jsxs("tr",{children:[e.jsx("td",{children:"openai_gpt-4-0314"}),e.jsx("td",{children:"4.63"}),e.jsx("td",{children:"4.42"}),e.jsx("td",{children:"4.85"}),e.jsx("td",{children:"4.50"}),e.jsx("td",{children:"4.42"}),e.jsx("td",{children:"4.95"})]}),e.jsxs("tr",{children:[e.jsx("td",{children:"openai_gpt-3.5-turbo-0613"}),e.jsx("td",{children:"4.60"}),e.jsx("td",{children:"4.34"}),e.jsx("td",{children:"4.86"}),e.jsx("td",{children:"4.42"}),e.jsx("td",{children:"4.41"}),e.jsx("td",{children:"4.97"})]}),e.jsxs("tr",{children:[e.jsx("td",{children:"anthropic_claude-v1.3"}),e.jsx("td",{children:"4.56"}),e.jsx("td",{children:"4.25"}),e.jsx("td",{children:"4.87"}),e.jsx("td",{children:"4.32"}),e.jsx("td",{children:"4.40"}),e.jsx("td",{children:"4.97"})]}),e.jsxs("tr",{children:[e.jsx("td",{children:"cohere_command-xlarge-beta"}),e.jsx("td",{children:"4.31"}),e.jsx("td",{children:"3.90"}),e.jsx("td",{children:"4.73"}),e.jsx("td",{children:"3.88"}),e.jsx("td",{children:"4.31"}),e.jsx("td",{children:"4.72"})]})]})]})]})}function ge({models:s}){return e.jsxs("section",{children:[e.jsxs("h3",{className:"text-3xl",children:[s.length," models"]}),e.jsx("ul",{children:s.map((t,r)=>t.todo?e.jsxs("li",{className:"text-slate-300 
mt-1",children:[t.creator_organization," / ",t.display_name]},r):e.jsx(E,{to:"models",children:e.jsxs("li",{className:"text-black mt-1",children:[t.creator_organization," / ",t.display_name]},r)}))})]})}function ve({runGroups:s}){const t=new Map(s.filter(l=>l.metric_groups!==void 0&&(l.subgroups===void 0||l.subgroups.length===0)).map(l=>[l.name,l])),r=new Set,n=[];return s.forEach(l=>{const o=l.subgroups?l.subgroups:[],i=[];o.forEach(m=>{const x=t.get(m);x&&(i.push(x),r.add(x.name))}),i.length>0&&n.push([l,i])}),e.jsxs("section",{children:[e.jsxs("h3",{className:"text-3xl",children:[r.size," scenarios"]}),e.jsx("ul",{children:n.map(([l,o])=>e.jsxs("li",{className:"my-3",children:[e.jsx(E,{className:"text-black",to:"groups/"+l.name,children:e.jsx("h2",{children:l.display_name})}),e.jsx("ul",{className:"list-disc list-inside",children:o.map(i=>i.todo?e.jsx("li",{className:`${i.todo?"ml-4 text-slate-300":"ml-4"}`,children:i.display_name},i.name):e.jsx(E,{className:"text-black",to:"groups/"+i.name,children:e.jsx("li",{className:`${i.todo?"ml-4 text-slate-300":"ml-4"}`,children:i.display_name},i.name)}))})]},l.name))})]})}const Ve=""+new URL("helmhero-28e90f4d.png",import.meta.url).href;function Kt({groupsTables:s,activeGroup:t,sortFirstMetric:r=!0,filteredCols:n=[],modelsToFilter:l=[],numModelsToAutoFilter:o=0}){const[i,m]=a.useState(r?1:void 0),[x,N]=a.useState({...s[t]}),[S,k]=a.useState(1),[b,w]=a.useState(l);function f(c){return c.length>30?c.substring(0,27)+"...":c}const v=c=>c.value==="Model/adapter"?"Model":c.value.includes("-book")?f(c.value.replace("-book","")):f(c.value),[R,u]=a.useState(void 0);a.useEffect(()=>{const c=new AbortController;async function h(){const j=await H(c.signal);u(j)}return h(),()=>c.abort()},[]);const y=c=>{if(R){const h=R.models.find(j=>j.display_name===c);if(h){let j=h.description;return j.includes("/")&&(j=j.replace("/","_")),j}}return""},d=c=>{if(R){const h=R.models.find(j=>j.display_name===c);if(h){let j=h.name;return 
j.includes("/")&&(j=j.replace("/","_")),j}}return""};a.useEffect(()=>{if(N({...s[t]}),o){const _=s[0].rows.sort((A,T)=>Number(T[1].value)-Number(A[1].value)).slice(0,o).map(A=>String(A[0].value));w(_)}},[t,s,o]);const p=c=>{let h=S;i===c?h=h*-1:h=1,m(c),k(h),N(j=>{const _={...j};return _.rows.sort((A,T)=>{var Z,Q;const C=(Z=A[c])==null?void 0:Z.value,$=(Q=T[c])==null?void 0:Q.value;return C!==void 0&&$===void 0?-1:$!==void 0&&C===void 0?1:typeof C=="number"&&typeof $=="number"?(C-$)*h:typeof C=="string"&&typeof $=="string"?h===1?C.localeCompare($):$.localeCompare(C):0}),_})};return a.useEffect(()=>{r&&i&&p(i)},[r,i]),e.jsx(e.Fragment,{children:e.jsx("div",{className:"rounded-2xl overflow-hidden border-2 bg-white p-1 mx-2 my-0",style:{overflow:"auto",justifyContent:"space-between"},children:e.jsx("div",{className:"overflow-x-auto",children:e.jsxs("table",{className:"table w-full",children:[e.jsx("thead",{children:e.jsx("tr",{children:x.header.filter((c,h)=>n.length===0||n.includes(h)).map((c,h)=>e.jsx("th",{className:`${h===i?"bg-gray-100":""} ${c.description?"underline decoration-dashed":""} whitespace-nowrap px-4 `,title:c.description?c.description:"",children:e.jsx("div",{className:"flex gap-2 items-center",children:e.jsx("span",{children:v(c)})})},`${t}-${h}`))})}),e.jsx("tbody",{children:x.rows.filter(c=>b.includes(String(c[0].value))).map((c,h)=>e.jsx("tr",{className:`${h%2===0?"bg-gray-50":""}`,children:c.filter((j,_)=>n.length===0||n.includes(_)).map((j,_)=>e.jsx("td",{className:`${_===0?"text-lg":""}`,children:e.jsx("div",{className:j&&j.style&&j.style["font-weight"]&&j.style["font-weight"]==="bold"?"font-bold":"",children:_===0?e.jsx(Y,{value:{...j},title:y(String(c[0].value)),hideIcon:!0}):e.jsx(Y,{value:{...j,href:"/runs/?q="+d(String(c[0].value))},title:`Click value to see all predictions for: ${d(String(c[0].value))}`})})},`${t}-${_}`))},`${t}-${h}`))})]})})})})}function 
Ye({numModelsToAutoFilter:s=6}){const[t,r]=a.useState([]),[n,l]=a.useState([]),[o,i]=a.useState(),[m,x]=a.useState(!0),N=0;return console.log(t),a.useEffect(()=>{const S=new AbortController;async function k(){const b=await fe(S.signal),w=[];if(b.forEach(u=>{u.rows.forEach(y=>{w.push({title:String(y[0].value),name:y[0].href.replace("?group=","")})})}),r(w),w.length===0)throw new Error("Could not find any groups!");const f=w[0].name,[v,R]=await Promise.all([pe(f,S.signal),je(S.signal)]);l(v),i(R[f]),x(!1)}return k(),()=>S.abort()},[]),m||o===void 0?e.jsx(B,{}):n.length===0?e.jsxs(e.Fragment,{children:[e.jsx(I,{title:o.display_name,subtitle:o.description,markdown:!0}),e.jsx("div",{className:"divider"}),e.jsx("p",{className:"text-center mt-8",children:"Group currently has no results."})]}):e.jsx(e.Fragment,{children:e.jsx(e.Fragment,{children:e.jsx(Kt,{groupsTables:n,activeGroup:N,numModelsToAutoFilter:s,filteredCols:[0,1]})})})}function Vt(){return e.jsxs("div",{className:"flex flex-col px-4 sm:px-6 py-100 sm:py-10 sm:mb-96 md:mb-96 lg:mb-0 xl:mb-0 2xl:mb-0",children:[e.jsx("div",{className:"flex flex-col text-center mb-10 justify-start",children:e.jsx("h1",{className:"text-3xl sm:text-4xl mb-3 sm:mb-4 mx-2 mt-2",children:e.jsx("strong",{children:"A holistic framework for evaluating foundation models."})})}),e.jsxs("div",{className:"flex flex-col md:flex-col lg:flex-row lg:justify-center",style:{height:"525px",transform:"scale(0.9)"},children:[e.jsx("div",{className:"w-full lg:w-1/2 flex justify-center mb-4 lg:mb-0 h-full py-10",children:e.jsx("img",{src:Ve,alt:"HELM Hero",className:"object-cover h-full",style:{maxWidth:"100%"}})}),e.jsx("div",{className:"w-full lg:w-1/2 flex justify-center h-full py-10",children:e.jsxs("div",{className:"py-2 pb-6 rounded-3xl bg-gray-100 h-full",style:{maxWidth:"100%"},children:[e.jsx(Ye,{}),e.jsx("div",{className:"flex justify-end",children:e.jsx(E,{to:"leaderboard",children:e.jsx("button",{className:"px-4 mx-3 mt-1 btn bg-white 
rounded-md",children:e.jsx("span",{children:"See More"})})})})]})})]})]})}const Xe=""+new URL("ai21-0eb91ec3.png",import.meta.url).href,Qe=""+new URL("aleph-alpha-7ce10034.png",import.meta.url).href,es=""+new URL("anthropic-70d8bc39.png",import.meta.url).href,ss=""+new URL("bigscience-7f0400c0.png",import.meta.url).href,ts=""+new URL("cohere-3550c6cb.png",import.meta.url).href,ns=""+new URL("eleutherai-b9451114.png",import.meta.url).href,rs=""+new URL("google-06d997ad.png",import.meta.url).href,as=""+new URL("meta-5580e9f1.png",import.meta.url).href,ls=""+new URL("microsoft-f5ee5016.png",import.meta.url).href,is=""+new URL("mistral-18e1be23.png",import.meta.url).href,os=""+new URL("nvidia-86fa75c1.png",import.meta.url).href,cs=""+new URL("openai-3f8653e4.png",import.meta.url).href,ds=""+new URL("tii-24de195c.png",import.meta.url).href,ms=""+new URL("together-a665a35b.png",import.meta.url).href,us=""+new URL("tsinghua-keg-97d4b395.png",import.meta.url).href,hs="data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAASYAAABfCAYAAABFnmpnAAAACXBIWXMAAAsTAAALEwEAmpwYAAAAAXNSR0IArs4c6QAAAARnQU1BAACxjwv8YQUAAAlhSURBVHgB7d3xddtGEgbw7+np/8gVeFLBqYOwg/gqOKaC+CqIU0GcCqxUcOnAvAruUoEnFZxcwUUjEDYMA8QCmN0dgN/vPUggAHFpejGcXSyWQON/T8v/HZa3SOdR3vvEsg5O5dnyZuD5xfH5lywfzu/FL0/L8fx61njj+Lr63js9d87lPYZFfb1DDpVf0wd8rpOvMLNO3p5/Pz4td1jvm8TjPMoygvIU8ch5OXS2nZ6Wn8+/iUoTfK6Tr8/bHtDUSZ3645vz78kDE71IPM4rMKUS+FFswwHNJ9Y71AngRH1HNJmUZVEXY0AbmP6Ej5eJxwl8iPNxKR6xLUc0AUpAFINlUP/BhTrpnTGlZkKeGVNqMPSi2B4BgxPFIrhQJ70DkyQet+Wm3NYyppagqQil33uiMfK0/GtoRxuYPE+2lIov8PNtwjFeWdV/sW2Cpn1PFMU9Bq50e2dMJuXKnOentqCcrWZLXUd8efWOqLYf0TuPc2RMJTOYVAIfin34CURxWKLyurshR8YkCceUzphSjknhdfWytgPY10Sx/AOdOnnT2aHwced0TKqp7MuzrD005VpHEMVh5+mr9kE3MHmddKU7v6d4BibFfnwPoli+a1e6gekP+EjpPyrZlJvaP8eeMqZ7EMVyaFdyZExTt6UIfAnK2fpwgS77cGBwokgE56QlRx/TVMYk8DWVfQn87CljMgxMFI3Yj1p9TJ5KBSbF/jAwUTTPdbLGVTmBv0tZmlegVOxP6jQ1RKU8B6bbzgaFj7vzMpaBCcryCkwfUc7v+Pr9a/uEBH6YMVGqUnXy+cMyR2Bqn3wsMOVo6tlo87HBj16jzBXl/IrxCd4e0AxG81BjkOU/Z5R7hN+/9Tc0712KEn2JD0/Lv7E
drzF8jgmaG3G9PuTEftz2NnrNZGlX5sYCRY6TQS7s21tTziqIjUHy+HfVCExzrmwe4EcRazZPC0oP2D59Wn5AM7+SB7EfNwOFuD35iBz3yd0t3DeHIgb78Jhzcl9SIzDR/lh9VDjKFZhKBIrU5xT4iDRUwGswrCl9QzXtk1ez9KtxTMarg1cW7ltq7OTyDIIfEIeCKBbX4UZ7yZjGRpsL/ESaWSBS9kZkXOtkrsA0Nj4mV59G7vIURFRMPzB5RT2ZuT1XeXsNTKUn9iMq6WU/MHld7ZGR7bkypruZ2+cqObgyBZtyFI3CUa6MaSwgCPLIXZ4iFl7mp2gEfj4OBSaP4JQ7gxnyMmN5ilj2erWRyDzeDG2Ej5yBYsiLxNewhCIWZkwUjWudHApMCh9DgUKQjwxs83qzovXpCPxE6z+jbfobfDyfa0OByWu8jgxsyznK+C5x2xLRmjtelcCwI5083MOH2o+cTbkIE8YJfEQaXCnwu7nV6yosXbcDnFsnOZtyOQNFankeb1a0jMLzyyqZLZEHr6lpzPN9oDkzJknc5qXfTPSK4JGyCgtKR/jxvBmYro+dY7/At04+n2+3Yzsc9G8Tyd2063e2C3zU6By2ibeGZgv0fg9PIEpzGtiWo06OBiavjKkfKHIHplwZk6K8HP/hQ06gWgQ+fYV2IpdokgvyU1wITAof/UAhyOtu4vFSe+2HOYF9TDX9BJ/+QsF+/h9P7crNyAGK9XIFitTyBD4U+/QbiGL5VCfHApNHBJaJx976zZ8tN+VyU+xjvmnaD2vCndoHY4HJ62pNjkBxSbfDXeBjj82dn0EUy9+7D3JmTKYbKEoEphcjZa+xt0GI9tVQDyCKwz4otbshZx+T6U5CVmLSe+ms73Fw5VrWhn8NojgsKL3pb8ydMUlnvUTG5N10VOyHZUpHEMVgMca+/PTN0M7bkT9S+OgGB0F+MrK+1B4yphOaT6UTiGI4ofmSTB07oGRgYsZU3gMG2u9EFVmW9Dsm6mTuply3X6nkVTmBj0izCixxRDNlyzuUyViJpti9dZN18lJg8ghO7VUyQRlteRzD9KUjmu+WfwWiGI5Py3uMzON0c+EPPQJTmzEJymjLY2D6mr0ndnPwPYhiEDTBSfo7LgUmj/E7awLEksDYlifwscfBlRacSjSriVJYXXzX33gpMHlM97EmUCwZfS69ctdS7I+AY5kolgN6My1cCkyK9dr71wTzLc3Y2jLX8upnW8KuWjyguayaY+T59yCa5wGf66TC3xczLdxeOFDhw66ULQkUel4E83wDn6acoh7LaLpXBAXNQDSvKUzvz0uOoEf79EPvscC3Th7QxInROb9bXtnCt1gemJa8BivP4/aXSP1LiuYqxq/wcwDRcoqmTp7g59CulMiYLCgtCRQWGKyfae5VpD33L719Wn6ED8+vgKL5TvCpY7W/F9AG8B7gw85168YoFpiWBIulfTxL+7T6Ig6uVCxr3g7hsIG67IbqB2yfZ3fApwRmqinnNWGcYD7FsuAo2PftKF5NTA4ZIA9WHxU+pF25wXSha1kUnHsSPGJdxsTANE1AFIu0K1OBSbHe0mbc0vJdv0OdiMqbCkwe/SwHzKe937nLu/QaiKiwEk25JRnTR8fyl2LGRFRJiabcEnr+XWv0NQceElVUImNa4nFkvUb5RFTYVGCqlTnoyHqN8omosKgZk3bWawx03PrMlUSbFrWP6bHya2BTjqiiqcBkFOVpZ52d30TX4dO5nhKYanc+M3vJg+8rRTMrMCnK0t7jGtnLNWRMDEwUVkpgKt0RrL3HHC5AdGUiNuX688soylIQUVVbaMqNbStZPhEVFDEwPSZuy6X2jIBEV48Z09c4VICosoh9TDqwrWQWw45vospSA1PJk3WoLEU5CiKq6jbxOAsWHtPVptDEbbkwY6IS7EtHBX7eYkd1NzUwWb+LoIzand8fQJTfq/Pi5R12FJhSmnKmVB+PztyeA2cWIKosNTApytC
Z20uVT0QFRQtMY5lZqRRVQUTVpQamCIHh0j4vHFxJFEC0jOlx4T4vCiKqbksZ0x+oWz4RFbKVzm/DjInoSqQGJqPI7zFw+URUyJzAVDtjKVE+B1cSBTAnMJXo46mdMV3T4EpmhxSNtiuRMiZduX+taztRGZgorEh9TDqxP/eJxHmYiIKI1sc0VX7O18DBlURBzAlMuTMKTTgmZ2BSEFEIkTKmlI7nnMGRfS5EQWypj8nkbG6xj4koiNSJ4lon5JswThOOseDxHfJYmzEpyvjTqayp+xIV6ynW8Xod7XOtpdgOz/duisLHp8TjLzZXn0YvVQJMAAAAAElFTkSuQmCC",xs=""+new URL("yandex-38e09d70.png",import.meta.url).href,fs=""+new URL("01-694cb9b7.png",import.meta.url).href,Yt=[Xe,Qe,es,ss,ts,ns,rs,as,ls,is,os,cs,ds,ms,us,hs,xs,fs];function Ce(){const[s,t]=a.useState(void 0);return a.useEffect(()=>{const r=new AbortController;async function n(){const l=await H(r.signal);t(l)}return n(),()=>r.abort()},[]),s?e.jsxs(e.Fragment,{children:[e.jsx(Vt,{}),e.jsxs("div",{className:"mx-auto text-lg px-16",children:[e.jsxs("div",{className:"container mb-12 mx-auto text-lg px-16",children:[e.jsxs("div",{className:"flex flex-col sm:flex-row justify-center mt-10 mb-10 flex gap-2 sm:gap-8 md:gap-32",children:[" ",e.jsx("h1",{className:"text-4xl mx-4 mt-40",children:e.jsx("strong",{children:"Our Partners"})})]}),e.jsx("ol",{className:"my-8 flex flex-col gap-32",children:e.jsx("li",{children:e.jsx("div",{className:"flex flex-wrap justify-center max-w-[1100px] mx-auto w-auto",children:Yt.map((r,n)=>e.jsx("div",{className:"w-24 h-24 flex items-center m-6",children:e.jsx("img",{src:r,alt:"Logo",className:"mx-auto block",sizes:"100vw"})},n))})})})]}),e.jsx("div",{className:"container mx-auto",children:e.jsxs("div",{className:"grid grid-cols-1 sm:grid-cols-2 gap-8",children:[e.jsx(ge,{models:s.models}),e.jsx(ve,{runGroups:s.run_groups})]})})]})]}):null}function Xt(){return e.jsxs("div",{className:"container mx-auto px-16",children:[e.jsx("h1",{className:"text-3xl mt-16 my-8 font-bold text-center",children:"Massive Multitask Language Understanding (MMLU) on HELM"}),e.jsxs("div",{className:"flex flex-row md:flex-row items-center gap-8",children:[e.jsxs("div",{className:"flex-1 
text-xl",children:[e.jsxs("p",{children:[e.jsx("strong",{children:"Massive Multitask Language Understanding (MMLU)"})," ",e.jsx("a",{href:"https://arxiv.org/pdf/2009.03300.pdf",className:"link",children:"(Hendrycks et al, 2020)"})," ","is a multiple-choice question answering test that covers 57 tasks including elementary mathematics, US history, computer science, law, and more. We publish evaluation results from evaluating various models on MMLU using HELM. Our evaluation results include the following:"]}),e.jsxs("ul",{className:"my-2 list-disc list-inside",children:[e.jsx("li",{children:"Simple, standardized prompts"}),e.jsx("li",{children:"Accuracy breakdown for each of the 57 subjects"}),e.jsx("li",{children:"Full transparency of all raw prompts and predictions"})]}),e.jsx("div",{className:"flex flex-row justify-center mt-8",children:e.jsx("a",{className:"px-10 btn rounded-md",href:"#/leaderboard",children:"Full Leaderboard"})})]}),e.jsx("div",{className:"flex-1",children:e.jsx(Ye,{numModelsToAutoFilter:10})})]})]})}const Qt=""+new URL("heim-logo-3e5e3aa4.png",import.meta.url).href;function en({metricFieldMap:s,metricGroups:t}){const r=new Set,n=[];return t.forEach(l=>{const o=[];l.metrics.forEach(i=>{const m=s[i.name];m&&(o.push(m),r.add(m.name))}),o.length>0&&n.push([l,o])}),e.jsxs("section",{children:[e.jsxs("h3",{className:"text-3xl",children:[r.size," metrics"]}),e.jsx("ul",{children:n.map(([l,o])=>e.jsxs("li",{className:"my-3",children:[e.jsx("h4",{children:l.display_name}),e.jsx("ul",{className:"list-disc list-inside",children:o.map(i=>e.jsx("li",{className:"ml-4",children:i.display_name},i.name))})]},l.name))})]})}function sn(){const[s,t]=a.useState(void 0);a.useEffect(()=>{const n=new AbortController;async function l(){const o=await H(n.signal);t(o)}return l(),()=>n.abort()},[]);const r=s?s.metrics.reduce((n,l)=>(n[l.name]=l,n),{}):void 0;return e.jsxs("div",{className:"container mx-auto px-16 text-base",children:[e.jsx("div",{className:"container 
max-w-screen-lg mx-auto",children:e.jsx("img",{className:"mx-auto w-96",src:Qt,alt:"HEIM Logo"})}),e.jsx("h1",{className:"text-3xl my-8 font-bold text-center",children:"Holistic Evaluation of Text-To-Image Models"}),e.jsxs("div",{className:"flex flex-col sm:flex-row justify-center gap-2 sm:gap-8 md:gap-32 my-8",children:[e.jsx("a",{className:"px-10 btn rounded-md",href:"https://arxiv.org/abs/2311.04287",children:"Paper"}),e.jsx("a",{className:"px-10 btn rounded-md",href:"https://github.com/stanford-crfm/helm",children:"Github"})]}),e.jsxs("p",{className:"my-2",children:["Significant effort has recently been made in developing text-to-image generation models, which take textual prompts as input and generate images. As these models are widely used in real-world applications, there is an urgent need to comprehensively understand their capabilities and risks. However, existing evaluations primarily focus on image-text alignment and image quality. To address this limitation, we introduce a new benchmark,"," ",e.jsx("strong",{children:"Holistic Evaluation of Text-To-Image Models (HEIM)"}),"."]}),e.jsx("p",{className:"my-2",children:"We identify 12 different aspects that are important in real-world model deployment, including:"}),e.jsxs("ul",{className:"my-2 list-disc list-inside unreset",children:[e.jsx("li",{children:"image-text alignment"}),e.jsx("li",{children:"image quality"}),e.jsx("li",{children:"aesthetics"}),e.jsx("li",{children:"originality"}),e.jsx("li",{children:"reasoning"}),e.jsx("li",{children:"knowledge"}),e.jsx("li",{children:"bias"}),e.jsx("li",{children:"toxicity"}),e.jsx("li",{children:"fairness"}),e.jsx("li",{children:"robustness"}),e.jsx("li",{children:"multilinguality"}),e.jsx("li",{children:"efficiency"})]}),e.jsx("p",{className:"my-2",children:"By curating scenarios encompassing these aspects, we evaluate state-of-the-art text-to-image models using this benchmark. 
Unlike previous evaluations that focused on alignment and quality, HEIM significantly improves coverage by evaluating all models across all aspects. Our results reveal that no single model excels in all aspects, with different models demonstrating strengths in different aspects."}),e.jsx("p",{className:"my-2",children:"For full transparency, this website contains all the prompts, generated images and the results for the automated and human evaluation metrics."}),e.jsx("p",{className:"my-2",children:"Inspired by HELM, we decompose the model evaluation into four key components: aspect, scenario, adaptation, and metric:"}),e.jsx("div",{className:"container max-w-screen-lg mx-auto my-8",children:e.jsx("img",{src:"https://crfm.stanford.edu/heim/latest/images/heim-main.png",alt:"HEIM scenarios, prompts, images and metrics"})}),s&&r?e.jsxs("div",{className:"grid grid-cols-1 md:grid-cols-3 gap-8",children:[e.jsx(ge,{models:s.models}),e.jsx(ve,{runGroups:s.run_groups}),e.jsx(en,{metricFieldMap:r,metricGroups:s.metric_groups})]}):null]})}const tn=""+new URL("vhelm-framework-cde7618a.png",import.meta.url).href,nn=""+new URL("vhelm-model-6d812526.png",import.meta.url).href;function rn(){const[s,t]=a.useState(void 0);return a.useEffect(()=>{const r=new AbortController;async function n(){const l=await H(r.signal);t(l)}return n(),()=>r.abort()},[]),e.jsxs("div",{className:"container mx-auto px-16",children:[e.jsx("h1",{className:"text-3xl mt-16 my-8 font-bold text-center",children:"The First Steps to Holistic Evaluation of Vision-Language Models"}),e.jsxs("p",{className:"my-4",children:["To better understand VLMs, we introduce the first version of"," ",e.jsx("em",{children:"Holistic Evaluation of Vision-Language Models (VHELM)"})," by extending the ",e.jsx("a",{href:"https://arxiv.org/abs/2211.09110",children:"HELM"})," ","framework with the necessary adaptation methods to assess the performance of 6 prominent VLMs on 3 standard VLM benchmarks."]}),e.jsx("p",{className:"my-4 
font-bold",children:"This is ongoing work to achieve holistic evaluation for vision-language models, so please stay tuned!"}),e.jsx("img",{src:tn,alt:"An image of a helm and the text 'This helm is a' is sent to a Vision-Language Model, which produces the text 'wheel for steering a ship...'",className:"mx-auto lg:max-w-3xl block my-8"}),e.jsx("img",{src:nn,alt:"An example of an evaluation for an Aspect (Knowledge) - a Scenario (MMMU) undergoes Adaptation (multimodal multiple choice) for a Model (GPT-4 Vision), then Metrics (Exact match) are computed",className:"mx-auto lg:max-w-3xl block my-8"}),s===void 0?null:e.jsxs("div",{className:"grid grid-cols-1 sm:grid-cols-2 gap-8",children:[e.jsx(ge,{models:s.models}),e.jsx(ve,{runGroups:s.run_groups})]})]})}const an=({id:s,title:t,text:r})=>(t.includes("HE")||(t="HELM "+t),e.jsx("div",{className:"max-w-sm rounded overflow-hidden bg-gray-100 hover:scale-105 transition-transform duration-300",children:e.jsx("a",{href:xe(void 0,s),children:e.jsxs("div",{className:"px-6 py-4",children:[e.jsxs("div",{className:"font-bold text-xl mb-2",children:[e.jsx("div",{className:"py-3",children:e.jsx("svg",{fill:"#000000",width:"20px",height:"20px",viewBox:"0 0 24 24",xmlns:"http://www.w3.org/2000/svg",children:e.jsx("path",{d:"M22,7H16.333V4a1,1,0,0,0-1-1H8.667a1,1,0,0,0-1,1v7H2a1,1,0,0,0-1,1v8a1,1,0,0,0,1,1H22a1,1,0,0,0,1-1V8A1,1,0,0,0,22,7ZM7.667,19H3V13H7.667Zm6.666,0H9.667V5h4.666ZM21,19H16.333V9H21Z"})})}),t+" →"]}),e.jsx("p",{className:"text-gray-700 text-base",children:r})]})})}));function ln(){const[s,t]=a.useState();return a.useEffect(()=>{fetch("https://raw.githubusercontent.com/stanford-crfm/helm/main/helm-frontend/project_metadata.json").then(r=>r.json()).then(r=>{t(r)}).catch(r=>{console.error("Error fetching JSON:",r)})},[]),e.jsx("div",{className:"p-10 mb-20",children:e.jsx("div",{className:"grid grid-cols-3 
gap-4",children:s&&s.map((r,n)=>r.id==="home"?null:e.jsx(an,{id:r.id,title:r.title,text:r.description},n))})})}function on(){return e.jsxs("div",{className:"flex flex-col md:flex-row px-6 py-36",children:[e.jsxs("div",{className:"flex-1 p-4 flex flex-col justify-center",children:[e.jsx("div",{className:"flex justify-start",children:e.jsxs("div",{children:[e.jsx("h1",{className:"text-4xl mb-4 mx-4 mt-2",children:e.jsx("strong",{children:"A reproducible and transparent framework for evaluating foundation models."})}),e.jsx("h3",{className:`text-xl
10
+ mb-4 mx-4 mt-2`,children:"Find leaderboards with many scenarios, metrics, and models with support for multimodality and model-graded evaluation."})]})}),e.jsxs("div",{className:"flex flex-col md:flex-row justify-start mt-6 ml-4",children:[e.jsx("button",{className:"px-6 btn btn-grey rounded-md mb-4 md:mb-0",onClick:()=>window.scrollTo({top:760,behavior:"smooth"}),children:e.jsx("div",{children:"Leaderboards ↓"})}),e.jsx("button",{className:"px-6 btn btn-grey rounded-md md:ml-4",children:e.jsx("a",{href:"https://github.com/stanford-crfm/helm",children:"Github"})})]})]}),e.jsx("div",{className:"mx-4 mt-6 md:mt-0 md:w-1/3",children:e.jsx("img",{src:Ve,alt:"HELM Hero",className:"object-cover w-full h-full"})})]})}const cn=[Xe,Qe,es,ss,ts,ns,rs,as,ls,is,os,cs,ds,ms,us,hs,xs,fs];function dn(){const[s,t]=a.useState(void 0);return a.useEffect(()=>{const r=new AbortController;async function n(){const l=await H(r.signal);t(l)}return n(),()=>r.abort()},[]),s?e.jsxs(e.Fragment,{children:[e.jsx(on,{}),e.jsxs("div",{className:"container mt-30 mx-auto text-lg",children:[e.jsx("div",{className:"flex flex-col sm:flex-row justify-center mb-10 flex sm:gap-8 md:gap-32",children:e.jsx("h1",{className:"text-4xl mx-4 ",children:e.jsx("strong",{children:"HELM Leaderboards"})})}),e.jsx("div",{className:"flex flex-col sm:flex-row flex sm:gap-8 md:gap-32",children:e.jsx("body",{children:"HELM leaderboards leverage the HELM framework and target particular domains and/or capabilities. 
Leaderboards range from real world applications and specific domains to ones focused on multimodal capabilities and model-evaluations."})})]}),e.jsx(ln,{}),e.jsx("div",{className:"mx-auto text-lg px-16",children:e.jsxs("div",{className:"container mb-12 mx-auto text-lg px-16",children:[e.jsxs("div",{className:"flex flex-col sm:flex-row justify-center mt-10 mb-10 flex gap-2 sm:gap-8 md:gap-32",children:[" ",e.jsx("h1",{className:"text-4xl mx-4 mt-40",children:e.jsx("strong",{children:"Our Partners"})})]}),e.jsx("ol",{className:"my-8 flex flex-col gap-32",children:e.jsx("li",{children:e.jsx("div",{className:"flex flex-wrap justify-center max-w-[1100px] mx-auto w-auto",children:cn.map((r,n)=>e.jsx("div",{className:"w-24 h-24 flex items-center m-6",children:e.jsx("img",{src:r,alt:"Logo",className:"mx-auto block",sizes:"100vw"})},n))})})})]})})]}):null}function mn(){return window.PROJECT_ID==="lite"?e.jsx(Ce,{}):window.PROJECT_ID==="instruct"?e.jsx(Zt,{}):window.PROJECT_ID==="heim"?e.jsx(sn,{}):window.PROJECT_ID==="mmlu"?e.jsx(Xt,{}):window.PROJECT_ID==="vhelm"?e.jsx(rn,{}):window.PROJECT_ID==="home"?e.jsx(dn,{}):e.jsx(Ce,{})}function un(){return e.jsx(Ls,{children:e.jsx(As,{children:e.jsxs(D,{path:"/",element:e.jsx(ht,{}),children:[e.jsx(D,{index:!0,element:e.jsx(mn,{})}),e.jsx(D,{path:"leaderboard",element:e.jsx(qt,{})}),e.jsx(D,{path:"models",element:e.jsx(gt,{})}),e.jsx(D,{path:"scenarios",element:e.jsx(vt,{})}),e.jsx(D,{path:"groups",element:e.jsx(yt,{})}),e.jsx(D,{path:"groups/:groupName",element:e.jsx(Nt,{})}),e.jsx(D,{path:"runs",element:e.jsx(St,{})}),e.jsx(D,{path:"runs/:runName",element:e.jsx(zt,{})})]})})})}de.createRoot(document.getElementById("root")).render(e.jsx(Cs.StrictMode,{children:e.jsx(un,{})}));