crfm-helm 0.5.0__py3-none-any.whl → 0.5.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of crfm-helm might be problematic. Click here for more details.
- {crfm_helm-0.5.0.dist-info → crfm_helm-0.5.1.dist-info}/METADATA +7 -3
- {crfm_helm-0.5.0.dist-info → crfm_helm-0.5.1.dist-info}/RECORD +53 -41
- helm/benchmark/adaptation/adapters/multimodal/in_context_learning_multimodal_adapter.py +1 -0
- helm/benchmark/adaptation/adapters/multimodal/multimodal_prompt.py +7 -0
- helm/benchmark/adaptation/adapters/multimodal/test_multimodal_prompt.py +2 -0
- helm/benchmark/annotation/image2structure/lilypond_compiler_annotator.py +1 -1
- helm/benchmark/augmentations/perturbation.py +17 -1
- helm/benchmark/augmentations/test_perturbation.py +30 -0
- helm/benchmark/metrics/efficiency_metrics.py +9 -2
- helm/benchmark/metrics/evaluate_reference_metrics.py +16 -0
- helm/benchmark/metrics/vision_language/image_metrics.py +142 -17
- helm/benchmark/model_metadata_registry.py +5 -1
- helm/benchmark/run_expander.py +35 -63
- helm/benchmark/run_spec_factory.py +11 -10
- helm/benchmark/run_specs/vlm_run_specs.py +294 -38
- helm/benchmark/scenarios/legalbench_scenario.py +6 -2
- helm/benchmark/scenarios/math_scenario.py +1 -1
- helm/benchmark/scenarios/vision_language/a_okvqa_scenario.py +83 -0
- helm/benchmark/scenarios/vision_language/crossmodal_3600_scenario.py +134 -0
- helm/benchmark/scenarios/vision_language/flickr30k_scenario.py +74 -0
- helm/benchmark/scenarios/vision_language/gqa_scenario.py +91 -0
- helm/benchmark/scenarios/vision_language/hateful_memes_scenario.py +4 -2
- helm/benchmark/scenarios/vision_language/image2structure/musicsheet_scenario.py +1 -1
- helm/benchmark/scenarios/vision_language/image2structure/webpage_scenario.py +1 -1
- helm/benchmark/scenarios/vision_language/math_vista_scenario.py +117 -0
- helm/benchmark/scenarios/vision_language/mm_safety_bench_scenario.py +103 -0
- helm/benchmark/scenarios/vision_language/mscoco_captioning_scenario.py +92 -0
- helm/benchmark/scenarios/vision_language/mscoco_categorization_scenario.py +117 -0
- helm/benchmark/scenarios/vision_language/originality_scenario.py +35 -0
- helm/benchmark/scenarios/vision_language/pairs_scenario.py +246 -0
- helm/benchmark/scenarios/vision_language/viz_wiz_scenario.py +2 -2
- helm/benchmark/scenarios/vision_language/vqa_scenario.py +4 -2
- helm/benchmark/static/schema_image2structure.yaml +304 -0
- helm/benchmark/static/schema_vhelm_lite.yaml +164 -0
- helm/benchmark/static/schema_vlm.yaml +257 -10
- helm/benchmark/static_build/assets/index-737eef9e.js +10 -0
- helm/benchmark/static_build/assets/index-878a1094.css +1 -0
- helm/benchmark/static_build/index.html +2 -2
- helm/clients/anthropic_client.py +36 -6
- helm/clients/openai_client.py +2 -3
- helm/clients/together_client.py +93 -2
- helm/clients/vertexai_client.py +59 -50
- helm/clients/vision_language/huggingface_vision2seq_client.py +145 -0
- helm/clients/vision_language/huggingface_vlm_client.py +11 -4
- helm/clients/vision_language/idefics_client.py +2 -2
- helm/common/images_utils.py +10 -3
- helm/config/model_deployments.yaml +100 -2
- helm/config/model_metadata.yaml +136 -31
- helm/config/tokenizer_configs.yaml +7 -0
- helm/benchmark/static_build/assets/index-5088afcb.css +0 -1
- helm/benchmark/static_build/assets/index-d839df55.js +0 -9
- helm/benchmark/test_model_deployment_definition.py +0 -90
- {crfm_helm-0.5.0.dist-info → crfm_helm-0.5.1.dist-info}/LICENSE +0 -0
- {crfm_helm-0.5.0.dist-info → crfm_helm-0.5.1.dist-info}/WHEEL +0 -0
- {crfm_helm-0.5.0.dist-info → crfm_helm-0.5.1.dist-info}/entry_points.txt +0 -0
- {crfm_helm-0.5.0.dist-info → crfm_helm-0.5.1.dist-info}/top_level.txt +0 -0
|
@@ -192,6 +192,93 @@ metrics:
|
|
|
192
192
|
description: Fraction of instances that the predicted output matches the prefix of a correct reference up to light processing.
|
|
193
193
|
lower_is_better: false
|
|
194
194
|
|
|
195
|
+
- name: rouge_1
|
|
196
|
+
display_name: ROUGE-1
|
|
197
|
+
description: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 1-gram overlap.
|
|
198
|
+
lower_is_better: false
|
|
199
|
+
- name: rouge_2
|
|
200
|
+
display_name: ROUGE-2
|
|
201
|
+
description: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.
|
|
202
|
+
lower_is_better: false
|
|
203
|
+
- name: rouge_l
|
|
204
|
+
display_name: ROUGE-L
|
|
205
|
+
description: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on longest common subsequence overlap.
|
|
206
|
+
lower_is_better: false
|
|
207
|
+
- name: bleu_1
|
|
208
|
+
display_name: BLEU-1
|
|
209
|
+
description: Average BLEU score [(Papineni et al., 2002)](https://aclanthology.org/P02-1040/) based on 1-gram overlap.
|
|
210
|
+
lower_is_better: false
|
|
211
|
+
- name: bleu_4
|
|
212
|
+
display_name: BLEU-4
|
|
213
|
+
description: Average BLEU score [(Papineni et al., 2002)](https://aclanthology.org/P02-1040/) based on 4-gram overlap.
|
|
214
|
+
lower_is_better: false
|
|
215
|
+
- name: f1_score
|
|
216
|
+
display_name: F1
|
|
217
|
+
description: Average F1 score in terms of word overlap between the model output and correct reference.
|
|
218
|
+
lower_is_better: false
|
|
219
|
+
- name: cider
|
|
220
|
+
display_name: CIDEr
|
|
221
|
+
description: Evaluates the quality of a generated caption by measuring the weighted similarity of n-grams between the caption and a set of human-written reference captions, emphasizing informativeness and consensus.
|
|
222
|
+
lower_is_better: false
|
|
223
|
+
|
|
224
|
+
# Bias metrics:
|
|
225
|
+
- name: bias_metric:mode=associations,demographic_category=race,target_category=profession
|
|
226
|
+
display_name: Stereotypical associations (race, profession)
|
|
227
|
+
short_display_name: Stereotypes (race)
|
|
228
|
+
lower_is_better: true
|
|
229
|
+
description: Measures uneven association of racial groups (Asian, Hispanic, White) with target professions. This measurement is based on co-occurrence statistics between the racially-associated names (word list from [Garg et al., 2018](https://www.pnas.org/doi/10.1073/pnas.1720347115); race associations based on US Census statistics) and the target professions (word list from [Bolukbasi et al., 2016](https://papers.nips.cc/paper/2016/hash/a486cd07e4ac3d270571622f4f316ec5-Abstract.html)).
|
|
230
|
+
- name: bias_metric:mode=associations,demographic_category=race,target_category=adjective
|
|
231
|
+
display_name: Stereotypical associations (race, adjectives)
|
|
232
|
+
short_display_name: Stereotypes (race)
|
|
233
|
+
lower_is_better: true
|
|
234
|
+
description: Measures uneven association of racial groups (Asian, Hispanic, White) with target adjectives. This measurement is based on co-occurrence statistics between the racially-associated names (word list from [Garg et al., 2018](https://www.pnas.org/doi/10.1073/pnas.1720347115); race associations based on US Census statistics) and the target adjectives (word list from [Bolukbasi et al., 2016](https://papers.nips.cc/paper/2016/hash/a486cd07e4ac3d270571622f4f316ec5-Abstract.html)).
|
|
235
|
+
|
|
236
|
+
- name: bias_metric:mode=associations,demographic_category=gender,target_category=profession
|
|
237
|
+
display_name: Stereotypical associations (gender, profession)
|
|
238
|
+
short_display_name: Stereotypes (gender)
|
|
239
|
+
lower_is_better: true
|
|
240
|
+
description: Measures uneven association of gender groups (male, female) with target professions. This measurement is based on co-occurrence statistics between the gender terms (word list from [Bolukbasi et al., 2016](https://papers.nips.cc/paper/2016/hash/a486cd07e4ac3d270571622f4f316ec5-Abstract.html)) and the target professions (word list from [Bolukbasi et al., 2016](https://papers.nips.cc/paper/2016/hash/a486cd07e4ac3d270571622f4f316ec5-Abstract.html)).
|
|
241
|
+
- name: bias_metric:mode=associations,demographic_category=gender,target_category=adjective
|
|
242
|
+
display_name: Stereotypical associations (gender, adjectives)
|
|
243
|
+
short_display_name: Stereotypes (gender)
|
|
244
|
+
lower_is_better: true
|
|
245
|
+
description: Measures uneven association of gender groups (male, female) with target adjectives. This measurement is based on co-occurrence statistics between the gender terms (word list from [Bolukbasi et al., 2016](https://papers.nips.cc/paper/2016/hash/a486cd07e4ac3d270571622f4f316ec5-Abstract.html)) and the target adjectives (word list from [Bolukbasi et al., 2016](https://papers.nips.cc/paper/2016/hash/a486cd07e4ac3d270571622f4f316ec5-Abstract.html)).
|
|
246
|
+
|
|
247
|
+
- name: bias_metric:mode=representation,demographic_category=race
|
|
248
|
+
display_name: Demographic representation (race)
|
|
249
|
+
short_display_name: Representation (race)
|
|
250
|
+
lower_is_better: true
|
|
251
|
+
description: Measures uneven representation of racial groups (Asian, Hispanic, White). This measurement is based on disparities in the frequency statistics across racially-associated names (word list from [Garg et al., 2018](https://www.pnas.org/doi/10.1073/pnas.1720347115); race associations based on US Census statistics).
|
|
252
|
+
- name: bias_metric:mode=representation,demographic_category=gender
|
|
253
|
+
display_name: Demographic representation (gender)
|
|
254
|
+
short_display_name: Representation (gender)
|
|
255
|
+
lower_is_better: true
|
|
256
|
+
description: Measures uneven representation of gender groups (male, female). This measurement is based on disparities in the frequency statistics across gender terms (word list from [Bolukbasi et al., 2016](https://papers.nips.cc/paper/2016/hash/a486cd07e4ac3d270571622f4f316ec5-Abstract.html)).
|
|
257
|
+
|
|
258
|
+
- name: bbq_metric_ambiguous_bias
|
|
259
|
+
display_name: BBQ (ambiguous)
|
|
260
|
+
lower_is_better: true
|
|
261
|
+
description: Metric of [Parrish et al. (2022)](https://aclanthology.org/2022.findings-acl.165/) for BBQ on ambiguous examples.
|
|
262
|
+
- name: bbq_metric_unambiguous_bias
|
|
263
|
+
display_name: BBQ (unambiguous)
|
|
264
|
+
lower_is_better: true
|
|
265
|
+
description: Metric of [Parrish et al. (2022)](https://aclanthology.org/2022.findings-acl.165/) for BBQ on unambiguous examples.
|
|
266
|
+
|
|
267
|
+
# Toxicity metrics
|
|
268
|
+
- name: expected_max_toxicity
|
|
269
|
+
display_name: Average max toxicity
|
|
270
|
+
short_display_name: Avg. max toxicity
|
|
271
|
+
lower_is_better: true
|
|
272
|
+
description: Average max toxicity score across all completions (based on the PerspectiveAPI toxicity classifier; [Gehman et al. (2020)](https://aclanthology.org/2020.findings-emnlp.301/)).
|
|
273
|
+
- name: max_toxicity_probability
|
|
274
|
+
display_name: max toxicity
|
|
275
|
+
lower_is_better: true
|
|
276
|
+
description: Fraction of instances where most toxic completion is above 0.5 toxicity (based on the PerspectiveAPI toxicity classifier; [Gehman et al. (2020)](https://aclanthology.org/2020.findings-emnlp.301/)).
|
|
277
|
+
- name: toxic_frac
|
|
278
|
+
display_name: Toxic fraction
|
|
279
|
+
lower_is_better: true
|
|
280
|
+
description: Fraction of model outputs that are toxic (based on the PerspectiveAPI toxicity classifier).
|
|
281
|
+
|
|
195
282
|
############################################################
|
|
196
283
|
perturbations:
|
|
197
284
|
- name: robustness
|
|
@@ -255,6 +342,12 @@ metric_groups:
|
|
|
255
342
|
- name: num_output_tokens
|
|
256
343
|
split: ${main_split}
|
|
257
344
|
|
|
345
|
+
- name: toxicity
|
|
346
|
+
display_name: Toxicity
|
|
347
|
+
metrics:
|
|
348
|
+
- name: toxic_frac
|
|
349
|
+
split: ${main_split}
|
|
350
|
+
|
|
258
351
|
- name: generation_image
|
|
259
352
|
display_name: Generation (image)
|
|
260
353
|
metrics:
|
|
@@ -293,6 +386,74 @@ run_groups:
|
|
|
293
386
|
- seed_bench
|
|
294
387
|
- mme
|
|
295
388
|
|
|
389
|
+
- name: a_okvqa
|
|
390
|
+
display_name: A-OKVQA
|
|
391
|
+
description: A crowdsourced dataset composed of a diverse set of about 25K questions requiring a broad base of commonsense and world knowledge to answer ([paper](https://arxiv.org/abs/2206.01718)).
|
|
392
|
+
metric_groups:
|
|
393
|
+
- accuracy
|
|
394
|
+
- efficiency
|
|
395
|
+
- general_information
|
|
396
|
+
environment:
|
|
397
|
+
main_name: exact_match
|
|
398
|
+
main_split: valid
|
|
399
|
+
taxonomy:
|
|
400
|
+
task: multiple-choice question answering
|
|
401
|
+
what: Real-world images
|
|
402
|
+
who: Human experts
|
|
403
|
+
when: "2023"
|
|
404
|
+
language: English
|
|
405
|
+
|
|
406
|
+
- name: crossmodal_3600
|
|
407
|
+
display_name: Crossmodal 3600
|
|
408
|
+
description: Crossmodal-3600 dataset (XM3600 in short), a geographically-diverse set of 3600 images annotated with human-generated reference captions in 36 languages. ([paper](https://arxiv.org/abs/2205.12522))
|
|
409
|
+
metric_groups:
|
|
410
|
+
- accuracy
|
|
411
|
+
- efficiency
|
|
412
|
+
- general_information
|
|
413
|
+
environment:
|
|
414
|
+
main_name: f1_score
|
|
415
|
+
main_split: test
|
|
416
|
+
taxonomy:
|
|
417
|
+
task: multilingual captioning
|
|
418
|
+
what: Real-world images
|
|
419
|
+
who: Human experts
|
|
420
|
+
when: "2022"
|
|
421
|
+
language: 36 languages
|
|
422
|
+
|
|
423
|
+
- name: flickr30k
|
|
424
|
+
display_name: Flickr30k
|
|
425
|
+
description: An image caption corpus consisting of 158,915 crowd-sourced captions describing 31,783 Flickr images. ([paper](https://shannon.cs.illinois.edu/DenotationGraph/TACLDenotationGraph.pdf))
|
|
426
|
+
metric_groups:
|
|
427
|
+
- accuracy
|
|
428
|
+
- efficiency
|
|
429
|
+
- general_information
|
|
430
|
+
environment:
|
|
431
|
+
main_name: f1_score
|
|
432
|
+
main_split: test
|
|
433
|
+
taxonomy:
|
|
434
|
+
task: image captioning
|
|
435
|
+
what: Flickr images
|
|
436
|
+
who: Human experts
|
|
437
|
+
when: "2014"
|
|
438
|
+
language: English
|
|
439
|
+
|
|
440
|
+
- name: gqa
|
|
441
|
+
display_name: GQA
|
|
442
|
+
description: Questions about real-world visual reasoning and compositional QA
|
|
443
|
+
metric_groups:
|
|
444
|
+
- accuracy
|
|
445
|
+
- efficiency
|
|
446
|
+
- general_information
|
|
447
|
+
environment:
|
|
448
|
+
main_name: f1_score
|
|
449
|
+
main_split: valid
|
|
450
|
+
taxonomy:
|
|
451
|
+
task: short answer question answering
|
|
452
|
+
what: Real-world images
|
|
453
|
+
who: Human experts
|
|
454
|
+
when: "2019"
|
|
455
|
+
language: English
|
|
456
|
+
|
|
296
457
|
- name: heim_human_eval
|
|
297
458
|
display_name: HEIM Human Eval Scenario
|
|
298
459
|
description: Seeing if we can use VLMs to evaluate AI-generated images from HEIM
|
|
@@ -336,26 +497,78 @@ run_groups:
|
|
|
336
497
|
when: "2020"
|
|
337
498
|
language: English
|
|
338
499
|
|
|
500
|
+
- name: mm_safety_bench
|
|
501
|
+
display_name: MM-SafetyBench
|
|
502
|
+
description: Expose the vulnerability of open-source VLMs with toxic and biased content ([paper](https://arxiv.org/abs/2311.17600))
|
|
503
|
+
metric_groups:
|
|
504
|
+
- accuracy
|
|
505
|
+
- efficiency
|
|
506
|
+
- toxicity
|
|
507
|
+
- general_information
|
|
508
|
+
environment:
|
|
509
|
+
main_name: toxic_frac
|
|
510
|
+
main_split: test
|
|
511
|
+
taxonomy:
|
|
512
|
+
task: safety
|
|
513
|
+
what: safety images
|
|
514
|
+
who: Human experts
|
|
515
|
+
when: "2023"
|
|
516
|
+
language: English
|
|
517
|
+
|
|
518
|
+
- name: mscoco_captioning
|
|
519
|
+
display_name: MSCOCO (captioning)
|
|
520
|
+
description: Microsoft COCO (MS-COCO) is a large-scale object detection, segmentation, and captioning dataset.
|
|
521
|
+
metric_groups:
|
|
522
|
+
- accuracy
|
|
523
|
+
- efficiency
|
|
524
|
+
- general_information
|
|
525
|
+
environment:
|
|
526
|
+
main_name: f1_score
|
|
527
|
+
main_split: valid
|
|
528
|
+
taxonomy:
|
|
529
|
+
task: image captioning
|
|
530
|
+
what: Real world images
|
|
531
|
+
who: Human experts
|
|
532
|
+
when: "2014"
|
|
533
|
+
language: English
|
|
534
|
+
|
|
535
|
+
- name: mscoco_categorization
|
|
536
|
+
display_name: MSCOCO (categorization)
|
|
537
|
+
description: Microsoft COCO (MS-COCO) is a large-scale object detection, segmentation, and captioning dataset.
|
|
538
|
+
metric_groups:
|
|
539
|
+
- accuracy
|
|
540
|
+
- efficiency
|
|
541
|
+
- general_information
|
|
542
|
+
environment:
|
|
543
|
+
main_name: exact_match
|
|
544
|
+
main_split: valid
|
|
545
|
+
taxonomy:
|
|
546
|
+
task: image captioning
|
|
547
|
+
what: Real world images
|
|
548
|
+
who: Human experts
|
|
549
|
+
when: "2014"
|
|
550
|
+
language: English
|
|
551
|
+
|
|
339
552
|
- name: viz_wiz
|
|
340
553
|
display_name: VizWiz
|
|
341
|
-
description:
|
|
554
|
+
description: A benchmark for visual question answering with images and questions created by visually impaired people [(Gurari et al., 2018)](https://arxiv.org/abs/1802.08218).
|
|
342
555
|
metric_groups:
|
|
343
556
|
- accuracy
|
|
344
557
|
- efficiency
|
|
345
558
|
- general_information
|
|
346
559
|
environment:
|
|
347
|
-
main_name:
|
|
348
|
-
main_split:
|
|
560
|
+
main_name: f1_score
|
|
561
|
+
main_split: valid
|
|
349
562
|
taxonomy:
|
|
350
|
-
task: multimodal question answering
|
|
351
|
-
what: images
|
|
352
|
-
who:
|
|
563
|
+
task: multimodal short answer question answering
|
|
564
|
+
what: Real-world images
|
|
565
|
+
who: Visually impaired people
|
|
353
566
|
when: "2018"
|
|
354
567
|
language: English
|
|
355
568
|
|
|
356
569
|
- name: vqa
|
|
357
570
|
display_name: VQAv2
|
|
358
|
-
description: Open-ended questions about images
|
|
571
|
+
description: Open-ended questions about real-world images [(Goyal et al., 2017)](https://arxiv.org/abs/1612.00837).
|
|
359
572
|
metric_groups:
|
|
360
573
|
- accuracy
|
|
361
574
|
- efficiency
|
|
@@ -364,15 +577,32 @@ run_groups:
|
|
|
364
577
|
main_name: f1_score
|
|
365
578
|
main_split: valid
|
|
366
579
|
taxonomy:
|
|
367
|
-
task: short answer question answering
|
|
580
|
+
task: multimodal short answer question answering
|
|
368
581
|
what: Real-world images
|
|
369
582
|
who: Human experts
|
|
370
583
|
when: "2017"
|
|
371
584
|
language: English
|
|
372
585
|
|
|
586
|
+
- name: math_vista
|
|
587
|
+
display_name: MathVista
|
|
588
|
+
description: Evaluating Math Reasoning in Visual Contexts
|
|
589
|
+
metric_groups:
|
|
590
|
+
- accuracy
|
|
591
|
+
- efficiency
|
|
592
|
+
- general_information
|
|
593
|
+
environment:
|
|
594
|
+
main_name: exact_match
|
|
595
|
+
main_split: test
|
|
596
|
+
taxonomy:
|
|
597
|
+
task: multiple-choice question answering
|
|
598
|
+
what: Evaluating Math Reasoning in Visual Contexts
|
|
599
|
+
who: Human experts
|
|
600
|
+
when: "2024"
|
|
601
|
+
language: English
|
|
602
|
+
|
|
373
603
|
- name: mmmu
|
|
374
604
|
display_name: MMMU
|
|
375
|
-
description: A
|
|
605
|
+
description: A benchmark designed to evaluate multimodal models on massive multi-discipline tasks demanding college-level subject knowledge and deliberate reasoning [(Yue et al., 2023)](https://arxiv.org/abs/2311.16502).
|
|
376
606
|
metric_groups:
|
|
377
607
|
- accuracy
|
|
378
608
|
- efficiency
|
|
@@ -381,7 +611,7 @@ run_groups:
|
|
|
381
611
|
main_name: exact_match
|
|
382
612
|
main_split: valid
|
|
383
613
|
taxonomy:
|
|
384
|
-
task: multiple-choice question answering
|
|
614
|
+
task: multimodal multiple-choice question answering
|
|
385
615
|
what: Art & Design, Business, Science, Health & Medicine, Humanities & Social Science, and Tech & Engineering
|
|
386
616
|
who: Human experts
|
|
387
617
|
when: "2023"
|
|
@@ -574,3 +804,20 @@ run_groups:
|
|
|
574
804
|
who: n/a
|
|
575
805
|
when: "2024"
|
|
576
806
|
language: English
|
|
807
|
+
|
|
808
|
+
- name: pairs
|
|
809
|
+
display_name: PAIRS
|
|
810
|
+
description: Examining Gender and Racial Bias in Large Vision-Language Models Using a Novel Dataset of Parallel Images.
|
|
811
|
+
metric_groups:
|
|
812
|
+
- accuracy
|
|
813
|
+
- efficiency
|
|
814
|
+
- general_information
|
|
815
|
+
environment:
|
|
816
|
+
main_name: exact_match
|
|
817
|
+
main_split: test
|
|
818
|
+
taxonomy:
|
|
819
|
+
task: multiple-choice question answering
|
|
820
|
+
what: Bias
|
|
821
|
+
who: Human experts
|
|
822
|
+
when: "2024"
|
|
823
|
+
language: English
|
|
@@ -0,0 +1,10 @@
|
|
|
1
|
+
import{r as a,a as Rs,L as E,O as Ms,d as ks,u as Pe,f as _e,H as Ls,h as As,i as D,R as Cs}from"./react-d4a0b69b.js";import{g as X,b as K,m as ce,s as Te,a as Ps,d as Me,y as _s,c as ke,e as ue,l as he}from"./tremor-54a99cc4.js";import"./recharts-6d337683.js";(function(){const t=document.createElement("link").relList;if(t&&t.supports&&t.supports("modulepreload"))return;for(const l of document.querySelectorAll('link[rel="modulepreload"]'))n(l);new MutationObserver(l=>{for(const o of l)if(o.type==="childList")for(const i of o.addedNodes)i.tagName==="LINK"&&i.rel==="modulepreload"&&n(i)}).observe(document,{childList:!0,subtree:!0});function r(l){const o={};return l.integrity&&(o.integrity=l.integrity),l.referrerPolicy&&(o.referrerPolicy=l.referrerPolicy),l.crossOrigin==="use-credentials"?o.credentials="include":l.crossOrigin==="anonymous"?o.credentials="omit":o.credentials="same-origin",o}function n(l){if(l.ep)return;l.ep=!0;const o=r(l);fetch(l.href,o)}})();var $e={exports:{}},ne={};/**
|
|
2
|
+
* @license React
|
|
3
|
+
* react-jsx-runtime.production.min.js
|
|
4
|
+
*
|
|
5
|
+
* Copyright (c) Facebook, Inc. and its affiliates.
|
|
6
|
+
*
|
|
7
|
+
* This source code is licensed under the MIT license found in the
|
|
8
|
+
* LICENSE file in the root directory of this source tree.
|
|
9
|
+
*/var Ts=a,$s=Symbol.for("react.element"),Us=Symbol.for("react.fragment"),Is=Object.prototype.hasOwnProperty,Ds=Ts.__SECRET_INTERNALS_DO_NOT_USE_OR_YOU_WILL_BE_FIRED.ReactCurrentOwner,Hs={key:!0,ref:!0,__self:!0,__source:!0};function Ue(s,t,r){var n,l={},o=null,i=null;r!==void 0&&(o=""+r),t.key!==void 0&&(o=""+t.key),t.ref!==void 0&&(i=t.ref);for(n in t)Is.call(t,n)&&!Hs.hasOwnProperty(n)&&(l[n]=t[n]);if(s&&s.defaultProps)for(n in t=s.defaultProps,t)l[n]===void 0&&(l[n]=t[n]);return{$$typeof:$s,type:s,key:o,ref:i,props:l,_owner:Ds.current}}ne.Fragment=Us;ne.jsx=Ue;ne.jsxs=Ue;$e.exports=ne;var e=$e.exports,de={},Le=Rs;de.createRoot=Le.createRoot,de.hydrateRoot=Le.hydrateRoot;function Os({title:s,titleId:t,...r},n){return a.createElement("svg",Object.assign({xmlns:"http://www.w3.org/2000/svg",fill:"none",viewBox:"0 0 24 24",strokeWidth:1.5,stroke:"currentColor","aria-hidden":"true",ref:n,"aria-labelledby":t},r),s?a.createElement("title",{id:t},s):null,a.createElement("path",{strokeLinecap:"round",strokeLinejoin:"round",d:"M3.75 6.75h16.5M3.75 12h16.5m-16.5 5.25h16.5"}))}const Bs=a.forwardRef(Os),Ie=Bs;function Fs({title:s,titleId:t,...r},n){return a.createElement("svg",Object.assign({xmlns:"http://www.w3.org/2000/svg",fill:"none",viewBox:"0 0 24 24",strokeWidth:1.5,stroke:"currentColor","aria-hidden":"true",ref:n,"aria-labelledby":t},r),s?a.createElement("title",{id:t},s):null,a.createElement("path",{strokeLinecap:"round",strokeLinejoin:"round",d:"M9 12.75L11.25 15 15 9.75M21 12a9 9 0 11-18 0 9 9 0 0118 0z"}))}const zs=a.forwardRef(Fs),Gs=zs;function qs({title:s,titleId:t,...r},n){return a.createElement("svg",Object.assign({xmlns:"http://www.w3.org/2000/svg",fill:"none",viewBox:"0 0 24 24",strokeWidth:1.5,stroke:"currentColor","aria-hidden":"true",ref:n,"aria-labelledby":t},r),s?a.createElement("title",{id:t},s):null,a.createElement("path",{strokeLinecap:"round",strokeLinejoin:"round",d:"M9.75 9.75l4.5 4.5m0-4.5l-4.5 4.5M21 12a9 9 0 11-18 0 9 9 0 0118 0z"}))}const 
Js=a.forwardRef(qs),Ws=Js,De=""+new URL("crfm-logo-74391ab8.png",import.meta.url).href,He=""+new URL("helm-logo-simple-2ed5400b.png",import.meta.url).href;function Zs({title:s,titleId:t,...r},n){return a.createElement("svg",Object.assign({xmlns:"http://www.w3.org/2000/svg",viewBox:"0 0 24 24",fill:"currentColor","aria-hidden":"true",ref:n,"aria-labelledby":t},r),s?a.createElement("title",{id:t},s):null,a.createElement("path",{fillRule:"evenodd",d:"M12 2.25a.75.75 0 01.75.75v11.69l3.22-3.22a.75.75 0 111.06 1.06l-4.5 4.5a.75.75 0 01-1.06 0l-4.5-4.5a.75.75 0 111.06-1.06l3.22 3.22V3a.75.75 0 01.75-.75zm-9 13.5a.75.75 0 01.75.75v2.25a1.5 1.5 0 001.5 1.5h13.5a1.5 1.5 0 001.5-1.5V16.5a.75.75 0 011.5 0v2.25a3 3 0 01-3 3H5.25a3 3 0 01-3-3V16.5a.75.75 0 01.75-.75z",clipRule:"evenodd"}))}const Ks=a.forwardRef(Zs),Oe=Ks;function Vs({title:s,titleId:t,...r},n){return a.createElement("svg",Object.assign({xmlns:"http://www.w3.org/2000/svg",viewBox:"0 0 24 24",fill:"currentColor","aria-hidden":"true",ref:n,"aria-labelledby":t},r),s?a.createElement("title",{id:t},s):null,a.createElement("path",{fillRule:"evenodd",d:"M15.75 2.25H21a.75.75 0 01.75.75v5.25a.75.75 0 01-1.5 0V4.81L8.03 17.03a.75.75 0 01-1.06-1.06L19.19 3.75h-3.44a.75.75 0 010-1.5zm-10.5 4.5a1.5 1.5 0 00-1.5 1.5v10.5a1.5 1.5 0 001.5 1.5h10.5a1.5 1.5 0 001.5-1.5V10.5a.75.75 0 011.5 0v8.25a3 3 0 01-3 3H5.25a3 3 0 01-3-3V8.25a3 3 0 013-3h8.25a.75.75 0 010 1.5H5.25z",clipRule:"evenodd"}))}const Ys=a.forwardRef(Vs),Xs=Ys;function Qs({title:s,titleId:t,...r},n){return a.createElement("svg",Object.assign({xmlns:"http://www.w3.org/2000/svg",viewBox:"0 0 24 24",fill:"currentColor","aria-hidden":"true",ref:n,"aria-labelledby":t},r),s?a.createElement("title",{id:t},s):null,a.createElement("path",{fillRule:"evenodd",d:"M15 3.75a.75.75 0 01.75-.75h4.5a.75.75 0 01.75.75v4.5a.75.75 0 01-1.5 0V5.56l-3.97 3.97a.75.75 0 11-1.06-1.06l3.97-3.97h-2.69a.75.75 0 01-.75-.75zm-12 0A.75.75 0 013.75 3h4.5a.75.75 0 010 1.5H5.56l3.97 3.97a.75.75 0 
01-1.06 1.06L4.5 5.56v2.69a.75.75 0 01-1.5 0v-4.5zm11.47 11.78a.75.75 0 111.06-1.06l3.97 3.97v-2.69a.75.75 0 011.5 0v4.5a.75.75 0 01-.75.75h-4.5a.75.75 0 010-1.5h2.69l-3.97-3.97zm-4.94-1.06a.75.75 0 010 1.06L5.56 19.5h2.69a.75.75 0 010 1.5h-4.5a.75.75 0 01-.75-.75v-4.5a.75.75 0 011.5 0v2.69l3.97-3.97a.75.75 0 011.06 0z",clipRule:"evenodd"}))}const et=a.forwardRef(Qs),st=et;function tt({title:s,titleId:t,...r},n){return a.createElement("svg",Object.assign({xmlns:"http://www.w3.org/2000/svg",viewBox:"0 0 24 24",fill:"currentColor","aria-hidden":"true",ref:n,"aria-labelledby":t},r),s?a.createElement("title",{id:t},s):null,a.createElement("path",{fillRule:"evenodd",d:"M12.53 16.28a.75.75 0 01-1.06 0l-7.5-7.5a.75.75 0 011.06-1.06L12 14.69l6.97-6.97a.75.75 0 111.06 1.06l-7.5 7.5z",clipRule:"evenodd"}))}const nt=a.forwardRef(tt),Be=nt;function rt({title:s,titleId:t,...r},n){return a.createElement("svg",Object.assign({xmlns:"http://www.w3.org/2000/svg",viewBox:"0 0 24 24",fill:"currentColor","aria-hidden":"true",ref:n,"aria-labelledby":t},r),s?a.createElement("title",{id:t},s):null,a.createElement("path",{fillRule:"evenodd",d:"M11.47 4.72a.75.75 0 011.06 0l3.75 3.75a.75.75 0 01-1.06 1.06L12 6.31 8.78 9.53a.75.75 0 01-1.06-1.06l3.75-3.75zm-3.75 9.75a.75.75 0 011.06 0L12 17.69l3.22-3.22a.75.75 0 111.06 1.06l-3.75 3.75a.75.75 0 01-1.06 0l-3.75-3.75a.75.75 0 010-1.06z",clipRule:"evenodd"}))}const at=a.forwardRef(rt),Fe=at;function lt({title:s,titleId:t,...r},n){return a.createElement("svg",Object.assign({xmlns:"http://www.w3.org/2000/svg",viewBox:"0 0 24 24",fill:"currentColor","aria-hidden":"true",ref:n,"aria-labelledby":t},r),s?a.createElement("title",{id:t},s):null,a.createElement("path",{fillRule:"evenodd",d:"M10.5 3.75a6.75 6.75 0 100 13.5 6.75 6.75 0 000-13.5zM2.25 10.5a8.25 8.25 0 1114.59 5.28l4.69 4.69a.75.75 0 11-1.06 1.06l-4.69-4.69A8.25 8.25 0 012.25 10.5z",clipRule:"evenodd"}))}const it=a.forwardRef(lt),ot=it;function xe(s,t){return 
t?s?`https://crfm.stanford.edu/helm/${t}/${s}/`:`https://crfm.stanford.edu/helm/${t}/latest/`:"#"}function ze(){const[s,t]=a.useState([]),[r,n]=a.useState();return a.useEffect(()=>{fetch("https://raw.githubusercontent.com/stanford-crfm/helm/main/helm-frontend/project_metadata.json").then(l=>l.json()).then(l=>{if(t(l),window.PROJECT_ID){const o=l.find(i=>i.id===window.PROJECT_ID);n(o)}else{const o=l.find(i=>i.id==="lite");n(o)}}).catch(l=>{console.error("Error fetching JSON:",l)})},[]),r===void 0||r.title===void 0?null:e.jsxs("div",{className:"dropdown",children:[e.jsxs("div",{tabIndex:0,role:"button",className:"btn normal-case bg-white font-bold p-2 border-0 text-lg block whitespace-nowrap","aria-haspopup":"true","aria-controls":"menu",children:[r.title," ",e.jsx(Be,{fill:"black",color:"black",className:"text w-4 h-4 inline"})]}),e.jsx("ul",{tabIndex:0,className:"-translate-x-36 dropdown-content z-[1] menu p-1 shadow-lg bg-base-100 rounded-box w-max text-base",role:"menu",children:s.map((l,o)=>e.jsx("li",{children:e.jsxs("a",{href:xe(void 0,l.id),className:"block",role:"menuitem",children:[e.jsx("strong",{className:r.title===l.title?"underline":"",children:l.title}),": ",l.description]})},o))})]})}function P(s){return`${window.BENCHMARK_OUTPUT_BASE_URL.replace(/\/$/,"")}/${s.replace(/^\//,"")}`}function W(){return window.RELEASE?`/releases/${window.RELEASE}`:`/runs/${window.SUITE}`}async function ct(s){try{return await(await fetch(P(`${W()}/summary.json`),{signal:s})).json()}catch(t){return console.log(t),{release:void 0,suites:void 0,suite:void 0,date:""}}}function dt(){const[s,t]=a.useState({release:void 0,suites:void 0,suite:void 0,date:""}),[r,n]=a.useState();a.useEffect(()=>{fetch("https://raw.githubusercontent.com/stanford-crfm/helm/main/helm-frontend/project_metadata.json").then(m=>m.json()).then(m=>{if(window.PROJECT_ID){const x=m.find(N=>N.id===window.PROJECT_ID);n(x)}else{const x=m.find(N=>N.id==="lite");n(x)}}).catch(m=>{console.error("Error fetching 
JSON:",m)})},[]);function l(){return r!==void 0&&r.releases!==void 0?r.releases:["v1.0.0"]}a.useEffect(()=>{const m=new AbortController;async function x(){const N=await ct(m.signal);t(N)}return x(),()=>m.abort()},[]);const o=l();if(!s.release&&!s.suite)return null;const i=`Release ${s.release||s.suite} (${s.date})`;return o.length<=1?e.jsx("div",{children:i}):e.jsxs("div",{className:"dropdown",children:[e.jsxs("div",{tabIndex:0,role:"button",className:"normal-case bg-white border-0 block whitespace-nowrap","aria-haspopup":"true","aria-controls":"menu",children:[i," ",e.jsx(Be,{fill:"black",color:"black",className:"inline text w-4 h-4"})]}),e.jsx("ul",{tabIndex:0,className:"dropdown-content z-[1] menu p-1 shadow-lg bg-base-100 rounded-box w-max text-base",role:"menu",children:o.map(m=>e.jsx("li",{children:e.jsx("a",{href:xe(m,r?r.id:"lite"),className:"block",role:"menuitem",children:m})}))})]})}function mt(){return e.jsxs("nav",{className:"navbar h-24 px-8 md:px-12 bg-base-100 max-w[1500]px",children:[e.jsx("div",{children:e.jsxs("div",{className:"dropdown md:hidden mr-4",children:[e.jsx("label",{tabIndex:0,className:"btn btn-ghost hover:bg-transparent btn-lg px-0",children:e.jsx(Ie,{className:"w-16 h-16"})}),e.jsxs("ul",{tabIndex:0,className:"menu menu-lg dropdown-content mt-3 z-[1] p-2 bg-base-100 shadow",children:[e.jsx("li",{children:e.jsx(E,{to:"leaderboard",children:"Leaderboard"})}),e.jsx("li",{children:e.jsx(E,{to:"models",children:"Models"})}),e.jsx("li",{children:e.jsx(E,{to:"scenarios",children:"Scenarios"})}),e.jsx("li",{children:e.jsx(E,{to:"runs",className:"whitespace-nowrap",children:"Predictions"})}),e.jsx("li",{children:e.jsx(E,{to:"https://github.com/stanford-crfm/helm",children:"GitHub"})})]})]})}),e.jsxs("div",{className:"flex-1 items-center",children:[e.jsx(E,{to:"https://crfm.stanford.edu/",className:"w-24",children:e.jsx("img",{src:De,className:"object-contain"})}),e.jsx(E,{to:"/",className:"mx-2 
w-32",children:e.jsx("img",{src:He,className:"object-contain"})}),e.jsx(ze,{})]}),e.jsx("div",{className:"flex-none hidden md:block",children:e.jsxs("ul",{className:"flex flex-row gap-6 px-1",children:[e.jsx("li",{children:e.jsx(E,{to:"leaderboard",children:"Leaderboard"})}),e.jsx("li",{children:e.jsx(E,{to:"models",children:"Models"})}),e.jsx("li",{children:e.jsx(E,{to:"scenarios",children:"Scenarios"})}),e.jsx("li",{children:e.jsx(E,{to:"runs",children:"Predictions"})}),e.jsx("li",{children:e.jsx(E,{to:"https://github.com/stanford-crfm/helm",children:"GitHub"})}),e.jsx("li",{className:"hidden lg:flex",children:e.jsx(dt,{})})]})})]})}function ut(){return e.jsxs("nav",{className:"navbar h-24 px-8 md:px-12 bg-base-100 max-w[1500]px",children:[e.jsx("div",{children:e.jsx("div",{className:"dropdown md:hidden mr-4",children:e.jsx("label",{tabIndex:0,className:"btn btn-ghost hover:bg-transparent btn-lg px-0",children:e.jsx(Ie,{className:"w-16 h-16"})})})}),e.jsxs("div",{className:"flex-1 items-center",children:[e.jsx(E,{to:"https://crfm.stanford.edu/",className:"w-24",children:e.jsx("img",{src:De,className:"object-contain"})}),e.jsx(E,{to:"/",className:"mx-2 w-32",children:e.jsx("img",{src:He,className:"object-contain"})}),e.jsx(ze,{})]})]})}function ht(){return e.jsxs(e.Fragment,{children:[window.PROJECT_ID==="global"?e.jsx(ut,{}):e.jsx(mt,{}),e.jsx("main",{className:"p-8 pt-0",children:e.jsx("div",{className:"mx-auto max-w-[1500]px",children:e.jsx(Ms,{})})})]})}async function H(s){try{return await(await fetch(P(`${W()}/schema.json`),{signal:s})).json()}catch(t){return console.log(t),{adapter:[],metric_groups:[],metrics:[],models:[],perturbations:[],run_groups:[]}}}function xt({href:s,children:t}){return e.jsx("a",{href:s,className:"link link-primary link-hover",target:"_blank",rel:"noreferrer",children:t})}function q({value:s}){return e.jsx("span",{children:e.jsx(ks,{components:{a:xt},children:s})})}function I({title:s,subtitle:t,markdown:r=!1}){return 
e.jsxs("header",{className:"m-4 ml-0",children:[e.jsx("h1",{className:"text-4xl",children:s}),r&&t!==void 0?e.jsx("h2",{className:"mt-2 text-neutral",children:e.jsx(q,{value:t})}):t!==void 0&&e.jsx("h2",{className:"mt-2 text-neutral",children:t})]})}const ft={open:"green",limited:"yellow",closed:"red"},pt={open:"Open",limited:"Limited",closed:"Closed"};function jt({level:s}){return e.jsx(X,{color:ft[s],children:pt[s]})}function B(){return e.jsx("div",{className:"w-full",children:e.jsx("div",{className:"block mx-auto my-24 loading loading-spinner loading-lg"})})}function gt(){const[s,t]=a.useState([]);a.useEffect(()=>{const i=new AbortController;async function m(){const x=await H(i.signal);t(x.models)}return m(),()=>i.abort()},[]);const[r,n,l]=s.reduce((i,m)=>{switch(m.access){case"open":i[0]+=1;break;case"limited":i[1]+=1;break;case"closed":i[2]+=1;break}return i},[0,0,0]),o=Object.values(s.reduce((i,m)=>{const x=m.creator_organization;return i[x]===void 0?(i[x]={name:x,models:1},i):(i[x].models+=1,i)},{}));return s.length===0?e.jsx(B,{}):e.jsxs(e.Fragment,{children:[e.jsx(I,{title:"Models"}),e.jsxs("div",{className:"overflow-x-auto mt-12",children:[e.jsxs("table",{className:"table",children:[e.jsx("thead",{children:e.jsxs("tr",{children:[e.jsx("th",{children:"Creator"}),e.jsx("th",{children:"Model"}),e.jsx("th",{children:"Description"}),e.jsx("th",{children:"Access"})]})}),e.jsx("tbody",{children:s.map(i=>e.jsxs("tr",{children:[e.jsx("td",{className:"text-lg",children:i.creator_organization}),e.jsxs("td",{children:[e.jsx("span",{className:"text-xl",children:i.display_name}),e.jsx("br",{}),e.jsx("span",{children:i.name})]}),e.jsx("td",{children:e.jsx(q,{value:i.description})}),e.jsx("td",{children:e.jsx(jt,{level:i.access})})]}))})]}),e.jsx(I,{title:"Analysis"}),e.jsxs("div",{className:"grid md:grid-cols-3 grid-cols-1 gap-8",children:[e.jsxs(K,{className:"flex flex-col justify-between",children:[e.jsx(ce,{children:"Models"}),e.jsx(Te,{className:"text-6xl 
md:!text-[96px]",children:s.length}),e.jsx(Ps,{values:[r,n,l],colors:["green","yellow","red"]}),e.jsx(Me,{categories:["Open","Limited","Closed"],colors:["green","yellow","red"]})]}),e.jsxs(K,{className:"md:col-span-2",children:[e.jsx(ce,{children:"Creator Organizations"}),e.jsxs("div",{className:"flex justify-between mt-4",children:[e.jsx(_s,{data:o,category:"models",index:"name",variant:"pie",className:"basis-5/12"}),e.jsx(Me,{categories:o.map(i=>i.name),className:"basis-7/12"})]})]})]})]})]})}function te({to:s,children:t,inTable:r=!1,title:n=""}){return r?e.jsx(E,{className:"link link-hover",to:s,title:n,children:t}):e.jsx(E,{className:"link link-primary link-hover",to:s,children:t})}function vt(){const[s,t]=a.useState([]);a.useEffect(()=>{const n=new AbortController;async function l(){const o=await H(n.signal);t(o.run_groups.filter(i=>!i.todo&&i.taxonomy&&!i.display_name.includes("CLEVA")))}return l(),()=>n.abort()},[]);const r=Object.values(s.reduce((n,l)=>{var i;const o=((i=l.taxonomy)==null?void 0:i.task)||"Unknown";return n[o]===void 0?(n[o]={name:o,value:1},n):(n[o].value+=1,n)},{}));return s.length===0?e.jsx(B,{}):(console.log(s),e.jsxs(e.Fragment,{children:[e.jsx(I,{title:"Scenarios",subtitle:"A scenario represents a use case and consists of a dataset of instances."}),e.jsxs("div",{className:"overflow-x-auto mt-12",children:[e.jsxs("table",{className:"table",children:[e.jsx("thead",{children:e.jsxs("tr",{children:[e.jsx("th",{children:"Scenario"}),e.jsx("th",{children:"Task"}),e.jsx("th",{children:"What"}),e.jsx("th",{children:"Who"}),e.jsx("th",{children:"When"}),e.jsx("th",{children:"Language"}),e.jsx("th",{children:"Description"})]})}),e.jsx("tbody",{children:s.map(n=>{var l,o,i,m,x;return e.jsxs("tr",{children:[e.jsxs("td",{children:[e.jsx(te,{to:`/groups/${n.name}`,children:e.jsx("span",{className:"text-lg",children:n.display_name})}),e.jsx("span",{className:"block",children:n.name})]}),e.jsx("td",{children:((l=n.taxonomy)==null?void 
0:l.task)||""}),e.jsx("td",{children:((o=n.taxonomy)==null?void 0:o.what)||""}),e.jsx("td",{children:((i=n.taxonomy)==null?void 0:i.who)||""}),e.jsx("td",{children:((m=n.taxonomy)==null?void 0:m.when)||""}),e.jsx("td",{children:((x=n.taxonomy)==null?void 0:x.language)||""}),e.jsx("td",{children:e.jsx(q,{value:n.description})})]})})})]}),e.jsx(I,{title:"Analysis"}),e.jsxs("div",{className:"grid md:grid-cols-4 gap-8",children:[e.jsxs(K,{className:"flex flex-col",children:[e.jsx(ce,{children:"Total scenarios"}),e.jsx(Te,{className:"mx-auto my-6 md:mt-16 !text-[72px] md:!text-[96px]",children:s.length})]}),e.jsx(K,{className:"col-span-3",children:e.jsxs("div",{className:"grid md:grid-cols-2 gap-x-12",children:[e.jsx(ke,{data:r.slice(0,Math.floor(r.length/2))}),e.jsx(ke,{data:r.slice(Math.ceil(r.length/2))})]})})]})]})]}))}function Ge(){return P(`${W()}/groups.json`)}async function fe(s){try{return await(await fetch(Ge(),{signal:s})).json()}catch(t){return console.log(t),[]}}function re({children:s}){return e.jsx("div",{role:"navigation",className:"tabs flex-nowrap border-b-2 border-gray-2 overflow-x-auto overflow-y-hidden",children:s})}function V({active:s=!1,onClick:t=()=>{},size:r="md",children:n}){return e.jsx("div",{onClick:t,className:`whitespace-nowrap text-${r} mb-[-2px] text-md tab tab-bordered${s?" 
border-2 border-grey-500 rounded":" border-none"}`,children:n})}function bt({title:s,titleId:t,...r},n){return a.createElement("svg",Object.assign({xmlns:"http://www.w3.org/2000/svg",viewBox:"0 0 20 20",fill:"currentColor","aria-hidden":"true",ref:n,"aria-labelledby":t},r),s?a.createElement("title",{id:t},s):null,a.createElement("path",{fillRule:"evenodd",d:"M4.25 5.5a.75.75 0 00-.75.75v8.5c0 .414.336.75.75.75h8.5a.75.75 0 00.75-.75v-4a.75.75 0 011.5 0v4A2.25 2.25 0 0112.75 17h-8.5A2.25 2.25 0 012 14.75v-8.5A2.25 2.25 0 014.25 4h5a.75.75 0 010 1.5h-5z",clipRule:"evenodd"}),a.createElement("path",{fillRule:"evenodd",d:"M6.194 12.753a.75.75 0 001.06.053L16.5 4.44v2.81a.75.75 0 001.5 0v-4.5a.75.75 0 00-.75-.75h-4.5a.75.75 0 000 1.5h2.553l-9.056 8.194a.75.75 0 00-.053 1.06z",clipRule:"evenodd"}))}const wt=a.forwardRef(bt),Ae=wt;function G(s){return Number.isNaN(Number(s))?String(s):String(Math.round(Number(s)*1e3)/1e3)}function Y({value:s,title:t,hideIcon:r}){if(typeof s.value=="string"&&s.value.includes("⚠")&&(s.value=s.value.replace("⚠","")),s.value===void 0)return"-";if(s.run_spec_names){const n=(()=>{if(s.run_spec_names.length==1)return"/runs/"+s.run_spec_names[0];if(s.run_spec_names.length>1){const l="/runs/?q="+s.run_spec_names.map(i=>`^${i}$`).join("|");return encodeURI(l)}})();return n?e.jsx(te,{to:n,inTable:!0,title:t,children:e.jsxs("div",{className:"flex items-center",children:[G(s.value),!r&&e.jsx(Ae,{className:"w-3 h-3 ml-1 opacity-30"})]})}):t?e.jsx("a",{title:t,children:G(s.value)}):e.jsx(e.Fragment,{children:G(s.value)})}return s.href?e.jsx(te,{to:s.href,inTable:!0,title:t,children:e.jsxs("div",{className:"flex items-center",children:[G(s.value),!r&&e.jsx(Ae,{className:"w-3 h-3 ml-1 opacity-30"})]})}):s.markdown?e.jsx(q,{value:String(s.value)}):t?e.jsx("a",{title:t,children:G(s.value)}):e.jsx(e.Fragment,{children:G(s.value)})}function qe({groupsTables:s,activeGroup:t,ignoreHref:r=!1,sortable:n=!0,sortFirstMetric:l=!0}){const[o,i]=a.useState(l?1:void 
0),[m,x]=a.useState({...s[t]}),[N,S]=a.useState(1);a.useEffect(()=>{x({...s[t]})},[t,s]);const k=b=>{let w=N;o===b?w=w*-1:w=1,i(b),S(w),x(f=>{const v={...f};return v.rows.sort((R,u)=>{var p,c;const y=(p=R[b])==null?void 0:p.value,d=(c=u[b])==null?void 0:c.value;return y!==void 0&&d===void 0?-1:d!==void 0&&y===void 0?1:typeof y=="number"&&typeof d=="number"?(y-d)*w:typeof y=="string"&&typeof d=="string"?w===1?y.localeCompare(d):d.localeCompare(y):0}),v})};return a.useEffect(()=>{l&&o&&k(o)},[l,o]),e.jsx("div",{children:e.jsxs("table",{className:"rounded-lg shadow-md table",children:[e.jsx("thead",{children:e.jsx("tr",{children:m.header.map((b,w)=>e.jsx("th",{className:`${w===o?"bg-gray-100 ":"bg-white"} ${w===0?"left-0 z-10":""} whitespace-nowrap sticky top-0`,children:e.jsxs("div",{className:"flex gap-2 items-center",children:[e.jsx("span",{children:b.value}),n?e.jsx("button",{className:"link",onClick:()=>k(w),children:e.jsx(Fe,{className:"w-6 h-6"})}):null]})},`${t}-${w}`))})}),e.jsx("tbody",{children:m.rows.map((b,w)=>e.jsx("tr",{children:b.map((f,v)=>e.jsx("td",{className:`${v==0?"text-lg sticky left-0":""} ${o===v?"bg-gray-100":"bg-white"}`,children:e.jsx("div",{className:f&&f.style&&f.style["font-weight"]&&f.style["font-weight"]==="bold"?"font-bold":"",children:e.jsx(Y,{ignoreHref:r&&v===0,value:f})})},`${t}-${v}`))},`${t}-${w}`))})]})})}function yt(){const[s,t]=a.useState(0),[r,n]=a.useState([]),[l,o]=a.useState([]);return a.useEffect(()=>{const i=new AbortController;async function m(){const x=await fe(i.signal);o(x),n(x.map(N=>N.title))}return m(),()=>i.abort()},[]),l.length===0?e.jsx(B,{}):e.jsxs(e.Fragment,{children:[e.jsxs("div",{className:"flex justify-between",children:[e.jsx(I,{title:"Results",subtitle:"Groupings of the processes, methods, and metrics involved in evaluating models, particularly in the context of natural language understanding and question answering.",className:"mb-16"}),e.jsxs("a",{className:"flex link-primary space-between 
items-center self-end link link-hover block",href:Ge(),download:"true",target:"_blank",children:[e.jsx(Oe,{className:"w-6 h-6 mr-2"})," JSON"]})]}),e.jsx("div",{children:e.jsx(re,{children:r.map((i,m)=>e.jsx(V,{onClick:()=>t(m),active:s===m,size:"lg",children:i},m))})}),e.jsx("div",{className:"mt-8",children:e.jsx(qe,{sortable:!1,groupsTables:l,activeGroup:s})})]})}async function pe(s,t){try{return await(await fetch(P(`${W()}/groups/${s}.json`),{signal:t})).json()}catch(r){return console.log(r),[]}}async function je(s){try{return await(await fetch(P(`${W()}/groups_metadata.json`),{signal:s})).json()}catch(t){return console.log(t),{}}}function Nt(){const{groupName:s}=Pe(),[t,r]=a.useState([]),[n,l]=a.useState(),[o,i]=a.useState(!0),[m,x]=a.useState(0);return a.useEffect(()=>{const N=new AbortController;async function S(){if(s===void 0)return;const[k,b]=await Promise.all([pe(s,N.signal),je(N.signal)]);r(k),l(b[s]),i(!1)}return S(),()=>N.abort()},[s]),o||n===void 0?e.jsx(B,{}):t.length===0?e.jsxs(e.Fragment,{children:[e.jsx(I,{title:n.display_name,subtitle:n.description,markdown:!0,className:"mr-8"}),e.jsx("div",{className:"divider"}),e.jsx("p",{className:"text-center mt-8",children:"Group currently has no results."})]}):e.jsxs(e.Fragment,{children:[e.jsx("div",{className:"flex flex-row justify-between",children:e.jsx(I,{title:n.display_name,subtitle:n.description,markdown:!0,className:"mr-8 mb-16"})}),e.jsx("div",{className:"overflow-x-auto",children:t.length>1?e.jsx(re,{children:t.map((N,S)=>e.jsx(V,{active:S===m,onClick:()=>x(S),children:N.title},S))}):null}),e.jsx(qe,{groupsTables:t,activeGroup:m,ignoreHref:!0})]})}async function Je(s){try{return await(await fetch(P(`${W()}/run_specs.json`),{signal:s})).json()}catch(t){return console.log(t),[]}}function me({currentPage:s,totalPages:t,onNextPage:r,onPrevPage:n,className:l}){let o="join";return l!==void 0&&(o=`join ${l}`),e.jsxs("div",{className:o,children:[e.jsx("button",{onClick:n,className:"join-item 
btn",children:"«"}),e.jsxs("button",{className:"join-item btn",children:["Page ",s," of ",t]}),e.jsx("button",{onClick:r,className:"join-item btn",children:"»"})]})}const le=100;function St(){const[s,t]=_e(),[r,n]=a.useState([]),[l,o]=a.useState(Number(s.get("page")||1)),[i,m]=a.useState(1),[x,N]=a.useState([]),[S,k]=a.useState(!0),[b,w]=a.useState(s.get("q")||"");a.useEffect(()=>{const u=new AbortController;async function y(){const d=await Je(u.signal);n(d),f(b,d)}return y(),()=>u.abort()},[b]),a.useEffect(()=>{f(b,r)},[r,b]);function f(u,y){const d=S?new RegExp(u):null,p=y.filter(c=>d?d.test(c.name):c.name.includes(u));N(p),m(Math.ceil(p.length/le))}const v=u=>{u.preventDefault();const d=u.target.q.value;w(d),t({q:d,page:"1"}),f(d,r)},R=x.slice((l-1)*le,l*le);return r.length===0?e.jsx(B,{}):e.jsxs(e.Fragment,{children:[e.jsx(I,{title:"Predictions",subtitle:"All benchmark predictions"}),e.jsxs("form",{className:"flex mb-8",onSubmit:v,children:[e.jsxs("div",{className:"form-control",children:[e.jsx("input",{type:"text",name:"q",placeholder:"Search",className:"input input-bordered",value:b,onChange:u=>w(u.target.value)}),e.jsxs("label",{className:"label",children:[e.jsxs("span",{className:"label-text-alt flex item-center",children:[e.jsx("input",{type:"checkbox",className:"toggle toggle-xs",checked:S,onChange:()=>k(!S)}),e.jsx("span",{className:"ml-2",children:"Regex"})]}),e.jsx("span",{className:"label-text-alt",children:`${x.length} results`})]})]}),e.jsx("div",{className:"form-control ml-4",children:e.jsx("button",{className:"btn",children:e.jsx(ot,{className:"w-6 h-6"})})})]}),e.jsx("div",{className:"overflow-x-auto",children:e.jsxs("table",{className:"table",children:[e.jsx("thead",{children:e.jsxs("tr",{children:[e.jsx("th",{children:"Run"}),e.jsx("th",{children:"Model"}),e.jsx("th",{children:"Groups"}),e.jsx("th",{children:"Adapter method"}),e.jsx("th",{children:"Subject / 
Task"})]})}),e.jsx("tbody",{children:R.map((u,y)=>e.jsxs("tr",{children:[e.jsx("td",{children:e.jsx(te,{to:`/runs/${u.name}`,children:u.name})}),e.jsx("td",{children:u.adapter_spec.model}),e.jsx("td",{children:u.groups.join(", ")}),e.jsx("td",{children:u.adapter_spec.method}),e.jsx("td",{children:u.scenario_spec.args.subject||u.scenario_spec.args.task||"-"})]},`${u.name}-${y}`))})]})}),i>0?e.jsx(me,{className:"flex justify-center my-8",onNextPage:()=>{const u=Math.min(l+1,i);o(u),s.set("page",String(u)),t(s)},onPrevPage:()=>{const u=Math.max(l-1,1);o(u),s.set("page",String(u)),t(s)},currentPage:l,totalPages:i}):e.jsx("div",{className:"my-8 text-center",children:"No results"})]})}function F(){return window.SUITE!==void 0?window.SUITE:void 0}async function Et(s,t,r){try{return await(await fetch(P(`/runs/${r||F()}/${s}/instances.json`),{signal:t})).json()}catch(n){return console.log(n),[]}}async function Rt(s,t,r){try{return await(await fetch(P(`/runs/${r||F()}/${s}/stats.json`),{signal:t})).json()}catch(n){return console.log(n),[]}}async function Mt(s,t,r){try{return await(await fetch(P(`/runs/${r||F()}/${s}/display_requests.json`),{signal:t})).json()}catch(n){return console.log(n),[]}}async function kt(s,t,r){try{return await(await fetch(P(`/runs/${r||F()}/${s}/display_predictions.json`),{signal:t})).json()}catch(n){return console.log(n),[]}}async function Lt(s,t,r){try{return await(await fetch(P(`/runs/${r||F()}/${s}/scenario.json`),{signal:t})).json()}catch(n){console.log(n);return}}function At(s,t){return P(`/runs/${t||F()}/${s}/run_spec.json`)}function Ct(s,t){return P(`/runs/${t||F()}/${s}/scenario_state.json`)}function ie({value:s}){return e.jsx(X,{icon:Gs,color:"green",children:s})}function oe({value:s}){return e.jsx(X,{icon:Ws,color:"red",children:s})}function Pt({stats:s}){if(s.quasi_exact_match!==void 0)return s.quasi_exact_match?e.jsx(ie,{value:`quasi exact match: ${s.quasi_exact_match}`}):e.jsx(oe,{value:`quasi exact match: 
${s.quasi_exact_match}`});if(s.toxic_frac!==void 0)return s.toxic_frac>0?e.jsx(oe,{value:`toxic frac: ${s.toxic_frac}`}):e.jsx(ie,{value:`toxic frac: ${s.toxic_frac}`});if(s.exact_match!==void 0)return s.exact_match>0?e.jsx(ie,{value:`exact match: ${s.exact_match}`}):e.jsx(oe,{value:`exact match: ${s.exact_match}`})}function J({value:s}){const[t,r]=a.useState(!1),[n,l]=a.useState(!1);return e.jsxs(e.Fragment,{children:[e.jsxs("div",{onMouseOver:()=>r(!0),onMouseOut:()=>r(!1),className:"relative",children:[e.jsx("div",{className:"bg-base-200 p-2 block overflow-auto w-full max-h-[36rem] mb-2 whitespace-pre-wrap",children:s}),t?e.jsx("button",{className:"bg-white absolute p-2 leading-none height-fit min-h-none right-1 bottom-1 shadow",onClick:()=>l(!0),children:e.jsx(st,{fill:"black",color:"black",className:"text w-4 h-4"})}):null]}),e.jsx("dialog",{open:n,className:"modal p-16 bg-opacity-80 bg-white",onClick:()=>l(!1),children:e.jsx("div",{className:"modal-box max-w-none p-4 whitespace-pre-wrap bg-base-200",children:s})})]})}function We({mediaObject:s}){if(s.content_type.includes("image")){if(s.location===void 0)return null;const t=P(s.location.replace("benchmark_output/","").replace("prod_env/","../"));return e.jsxs("div",{children:[e.jsx("img",{src:t}),e.jsx("br",{})]})}else return s.text&&s.content_type&&s.content_type==="text/plain"&&s.text.length>1?e.jsxs("div",{children:[s.text,e.jsx("br",{}),e.jsx("br",{})]}):e.jsx("div",{})}function Ze({multimediaObject:s}){return e.jsx("div",{children:s.media_objects.map(t=>e.jsx(We,{mediaObject:t}))})}function _t(s){return Array.isArray(s)?s.length==0?"[]":`[${s.map(t=>String(t).replace(/\n/,"\\n")).join(", ")}]`:String(s)}function Tt({request:s}){return e.jsxs("div",{children:[s.request.prompt.length>0?e.jsxs("div",{children:[e.jsxs("h3",{className:"block text text-gray-400",children:["Prompt (",s.request.prompt.length," 
Chars)"]}),e.jsx(J,{value:s.request.prompt})]}):s.request.multimodal_prompt?e.jsxs("div",{children:[e.jsx("h3",{className:"block text text-gray-400",children:"Prompt"}),e.jsx(Ze,{multimediaObject:s.request.multimodal_prompt})]}):e.jsx("h3",{className:"block text text-gray-400",children:"Empty Prompt"}),e.jsx(ue,{children:Object.keys(s.request).filter(t=>t!=="prompt").map((t,r)=>e.jsxs(he,{children:[e.jsxs("span",{children:[t,":"]}),s.request&&s.request[t]?e.jsx("span",{children:_t(s.request[t])}):"null"]},r+1))})]})}function $t({predictionAnnotations:s}){return e.jsx("div",{children:s&&s!==void 0?Object.entries(s).map(([t,r])=>e.jsxs("div",{children:[e.jsx("h3",{children:e.jsx("strong",{children:t})}),r.map((n,l)=>e.jsxs("div",{children:[n.error&&e.jsxs("div",{children:[e.jsx("h3",{className:"ml-1",children:"Error"}),e.jsx(J,{value:n.error})," "]}),n.text&&e.jsxs("div",{children:[e.jsx("h3",{className:"ml-1",children:"Text"}),e.jsx(J,{value:n.text})," "]}),n.media_object&&e.jsx(We,{mediaObject:n.media_object})]},l))]},t)):null})}function Ut({predictions:s,requests:t,metricFieldMap:r}){return s.length<1?null:e.jsx("div",{children:e.jsx("div",{className:"flex flex-wrap justify-start items-start",children:s.map((n,l)=>e.jsxs("div",{className:"w-full",children:[s.length>1?e.jsxs("h2",{children:["Trial ",l]}):null,e.jsx("div",{className:"mt-2 w-full",children:n.base64_images&&n.base64_images.length>0?e.jsxs(e.Fragment,{children:[e.jsx("h3",{className:"mr-4",children:"Prediction image"}),n.base64_images.map(o=>e.jsx("img",{src:"data:image;base64,"+o,alt:"Base64 Image"}))]}):e.jsxs(e.Fragment,{children:[e.jsxs("h3",{children:[e.jsx("span",{className:"mr-4",children:"Prediction raw text"}),e.jsx(Pt,{stats:n.stats})]}),e.jsx(J,{value:n.predicted_text}),n.mapped_output?e.jsxs(e.Fragment,{children:[e.jsx("h3",{children:"Prediction mapped 
output"}),e.jsx(J,{value:String(n.mapped_output)})]}):null]})}),e.jsx($t,{predictionAnnotations:n.annotations}),e.jsx("h3",{children:"Metrics"}),e.jsx(ue,{children:Object.keys(n.stats).map((o,i)=>e.jsxs(he,{children:[r[o]?e.jsx("span",{title:r[o].description,children:r[o].display_name}):e.jsx("span",{children:o}),e.jsx("span",{children:String(n.stats[o])})]},i))}),e.jsxs("details",{className:"collapse collapse-arrow border rounded-md bg-white",children:[e.jsx("summary",{className:"collapse-title",children:"Request details"}),e.jsx("div",{className:"collapse-content",children:e.jsx(Tt,{request:t[l]})})]})]},l))})})}const It="correct";function Dt({references:s}){return e.jsxs("span",{children:[e.jsx("h3",{children:"References"}),e.jsx("ul",{children:s.map((t,r)=>e.jsxs("li",{className:"bg-base-200 p-2 block overflow-auto w-full max-h-72 mb-2 whitespace-pre-wrap",children:[t.output.text,t.tags.map(n=>e.jsx(X,{className:"mx-2",color:n===It?"green":void 0,children:n}))]},r))})]})}function Ht({instance:s,requests:t,predictions:r,metricFieldMap:n}){return e.jsxs("div",{className:"border p-4",children:[e.jsx("h3",{className:"text-xl mb-4",children:`Instance id: ${s.id} [split: ${s.split}]`}),e.jsx("h3",{children:"Input"}),s.input.multimedia_content!==void 0?e.jsx(Ze,{multimediaObject:s.input.multimedia_content}):s.input.text.includes('<br><img src="data:image;base64')?e.jsx("div",{dangerouslySetInnerHTML:{__html:s.input.text}}):e.jsx(J,{value:s.input.text}),e.jsx("div",{children:s.references&&s.references.length>0?e.jsx(Dt,{references:s.references}):null}),e.jsx("div",{children:r&&t?e.jsx(Ut,{predictions:r,requests:t,metricFieldMap:n}):null})]})}function Ot({stat:s,metricFieldMap:t}){const r=`${s.name.split!==void 0?` on ${s.name.split}`:""}${s.name.sub_split!==void 0?`/${s.name.sub_split}`:""}${s.name.perturbation!==void 0?` with ${s.name.perturbation.name}`:" original"}`;return 
t[s.name.name]?e.jsxs("span",{title:t[s.name.name].description,children:[e.jsx("strong",{children:t[s.name.name].display_name||s.name.name}),r]}):e.jsxs("span",{children:[e.jsx("strong",{children:s.name.name}),r]})}function Ke(){return window.RELEASE!==void 0?window.RELEASE:void 0}async function Bt(s){try{return await(await fetch(P(`/releases/${Ke()}/runs_to_run_suites.json`),{signal:s})).json()}catch(t){return console.log(t),{}}}function Ft(s,t){return Ke()?s[t]:window.SUITE}const ee=10,se=50;function zt(){const{runName:s}=Pe(),[t,r]=_e(),[n,l]=a.useState(0),[o,i]=a.useState(),[m,x]=a.useState(),[N,S]=a.useState([]),[k,b]=a.useState([]),[w,f]=a.useState(),[v,R]=a.useState(),[u,y]=a.useState(1),[d,p]=a.useState(1),[c,h]=a.useState(1),[j,_]=a.useState(1),[A,T]=a.useState(),[C,$]=a.useState(),[Z,Q]=a.useState({}),[be,ps]=a.useState({}),[we,js]=a.useState("");if(a.useEffect(()=>{const g=new AbortController;async function O(){const U=g.signal;if(s===void 0)return()=>g.abort();const z=window.SUITE?window.SUITE:Ft(await Bt(U),s);x(z);const[ye,Ne,Se,bs,ws,ys]=await Promise.all([Je(U),Et(s,U,z),Rt(s,U,z),Lt(s,U,z),kt(s,U,z),Mt(s,U,z)]);i(ye.find(M=>M.name===s)),S(Ne);const Ee=Math.ceil(Ne.length/ee),Ns=Number(t.get("instancesPage")||1);p(Ee),y(Math.max(Math.min(Ns,Ee),1)),b(Se),$(bs);const Re=Math.floor(Se.length/se),Ss=Number(t.get("metricsPage")||1);_(Re),h(Math.max(Math.min(Ss,Re),1)),f(ws.reduce((M,L)=>(M[L.instance_id]===void 0&&(M[L.instance_id]=[]),M[L.instance_id].push(L),M),{})),R(ys.reduce((M,L)=>(M[L.instance_id]===void 0&&(M[L.instance_id]=[]),M[L.instance_id].push(L),M),{}));const ae=await H(U);ps(ae.metrics.reduce((M,L)=>(M[L.name]=L,M),{})),Q(ae.adapter.reduce((M,L)=>(M[L.name]=L,M),{})),T(ae.models.find(M=>{var L;return M.name===((L=ye.find(Es=>Es.name===s))==null?void 0:L.adapter_spec.model)}))}return O(),()=>g.abort()},[s,t]),o===void 0||w===void 0||v===void 0||C===void 0)return e.jsx(B,{});const 
gs=N.slice((u-1)*ee,(u-1)*ee+ee),vs=k.slice((c-1)*se,(c-1)*se+se);return e.jsxs(e.Fragment,{children:[e.jsx("div",{className:"flex justify-between gap-8 mb-12",children:e.jsxs("div",{children:[e.jsxs("h1",{className:"text-3xl flex items-center",children:[C.name,e.jsx("a",{href:"/#/groups/"+C.name,children:e.jsx(Xs,{className:"w-6 h-6 ml-2"})})]}),e.jsx("h3",{className:"text-xl",children:e.jsx(q,{value:C.description})}),e.jsx("h1",{className:"text-3xl mt-2",children:o.adapter_spec.model}),e.jsx("h3",{className:"text-xl",children:e.jsx(q,{value:(A==null?void 0:A.description)||""})}),e.jsx("div",{className:"mt-2 flex gap-2",children:C.tags.map(g=>e.jsx(X,{size:"xs",color:"gray",children:e.jsx("span",{className:"text text-md",children:g})}))})]})}),e.jsxs(K,{children:[e.jsxs("div",{className:"flex justify-between",children:[e.jsx("h3",{className:"text-lg mb-1",children:"Adapter Specification"}),e.jsxs("div",{className:"flex gap-2",children:[e.jsx(Oe,{className:"w-6 h-6 mr-1 text text-primary"}),e.jsx("a",{className:"link link-primary link-hover",href:At(o.name,m),download:"true",target:"_blank",children:"Spec JSON"}),e.jsx("a",{className:"link link-primary link-hover",href:Ct(o.name,m),download:"true",target:"_blank",children:"Full JSON"})]})]}),e.jsx("div",{children:e.jsx(ue,{className:"grid md:grid-cols-2 lg:grid-cols-3 gap-x-8",children:Object.entries(o.adapter_spec).map(([g,O],U)=>e.jsxs(he,{className:U<3?"!border-0":"",children:[e.jsx("strong",{className:"mr-1",title:Z[g]?Z[g].description:void 0,children:`${g}: `}),e.jsx("span",{className:"overflow-x-auto",children:O})]}))})})]}),e.jsx("div",{className:"mt-16 mb-8",children:e.jsxs(re,{children:[e.jsx(V,{size:"lg",active:n===0,onClick:()=>l(0),children:"Instances + Predictions"}),e.jsx(V,{size:"lg",active:n===1,onClick:()=>l(1),children:"All metrics"})]})}),n===0?e.jsxs(e.Fragment,{children:[e.jsx("div",{className:"grid 
gap-8",children:gs.map((g,O)=>e.jsx(Ht,{instance:g,requests:v[g.id],predictions:w[g.id],metricFieldMap:be},`${g.id}-${O}`))}),e.jsx(me,{className:"flex justify-center my-8",onNextPage:()=>{const g=Math.min(u+1,d);y(g),t.set("instancesPage",String(g)),r(t)},onPrevPage:()=>{const g=Math.max(u-1,1);y(g),t.set("instancesPage",String(g)),r(t)},currentPage:u,totalPages:d})]}):e.jsxs("div",{children:[e.jsx("div",{className:"flex justify-start my-4",children:e.jsx("input",{type:"text",className:"input input-bordered w-full max-w-xs",placeholder:"Search for a metric",onChange:g=>js(g.target.value)})}),e.jsx("div",{className:"overflow-x-auto",children:e.jsxs("table",{className:"table",children:[e.jsx("thead",{children:e.jsx("tr",{children:Object.keys(k[0]).map(g=>e.jsx("th",{children:g},g))})}),e.jsx("tbody",{children:vs.filter(g=>!we||g.name.name.toLowerCase().includes(we.toLowerCase())).map(g=>e.jsx("tr",{children:Object.entries(g).map(([O,U])=>O==="name"?e.jsx("td",{children:e.jsx(Ot,{stat:g,metricFieldMap:be})},O):e.jsx("td",{children:U}))}))})]})}),e.jsx(me,{className:"flex justify-center my-8",onNextPage:()=>{const g=Math.min(c+1,j);h(g),t.set("metricsPage",String(g)),r(t)},onPrevPage:()=>{const g=Math.max(c-1,1);h(g),t.set("metricsPage",String(g)),r(t)},currentPage:c,totalPages:j})]})]})}function Gt({groupsTables:s,activeGroup:t,sortable:r=!0,sortFirstMetric:n=!0}){const[l,o]=a.useState(n?1:void 0),[i,m]=a.useState({...s[t]}),[x,N]=a.useState(1);function S(d){return d.length>30?d.substring(0,27)+"...":d}const k=d=>d.value==="Model/adapter"?"Model":d.value.includes("-book")?S(d.value.replace("-book","")):S(d.value),[b,w]=a.useState(void 0);a.useEffect(()=>{const d=new AbortController;async function p(){const c=await H(d.signal);w(c)}return p(),()=>d.abort()},[]);const f=d=>{if(b){const p=b.models.find(c=>c.display_name===d);if(p){let c=p.description;return c.includes("/")&&(c=c.replace("/","_")),c}}return""},v=d=>{if(b){const 
p=b.models.find(c=>c.display_name===d);if(p){let c=p.name;return c.includes("/")&&(c=c.replace("/","_")),c}}return""};function R(d){const p=d.lastIndexOf(" - ");return p===-1?d:d.substring(0,p)+"*"+d.substring(p+1)}const u=d=>{const c=R(d).split("*")[0].trim();if(b){const h=b.run_groups.find(j=>j.display_name===c||j.short_display_name===c);if(h)return h.name}return""};a.useEffect(()=>{m({...s[t]})},[t,s]);const y=d=>{let p=x;l===d?p=p*-1:p=1,o(d),N(p),m(c=>{const h={...c};return h.rows.sort((j,_)=>{var C,$;const A=(C=j[d])==null?void 0:C.value,T=($=_[d])==null?void 0:$.value;return A!==void 0&&T===void 0?-1:T!==void 0&&A===void 0?1:typeof A=="number"&&typeof T=="number"?(A-T)*p:typeof A=="string"&&typeof T=="string"?p===1?A.localeCompare(T):T.localeCompare(A):0}),h})};return a.useEffect(()=>{n&&l&&y(l)},[n,l]),e.jsx(e.Fragment,{children:e.jsx("div",{children:e.jsx("div",{children:e.jsxs("table",{className:"rounded-lg shadow-md table",children:[e.jsx("thead",{children:e.jsx("tr",{children:i.header.map((d,p)=>e.jsx("th",{className:`${p===l?"bg-gray-100":"bg-white"} ${p===0?"left-0 z-10":""} ${d.description?"underline decoration-dashed decoration-gray-300 ":""} whitespace-nowrap px-4 sticky top-0`,title:d.description?d.description:"",children:e.jsxs("div",{className:"flex justify-between items-center min-w-48 w-48 max-w-48 text-wrap",children:[e.jsx("span",{className:"inline-block w-full break-words",children:k(d)}),r?e.jsx("button",{className:"link",onClick:()=>y(p),children:e.jsx(Fe,{className:"w-6 h-6"})}):null]})},`${t}-${p}`))})}),e.jsx("tbody",{children:i.rows.map((d,p)=>e.jsx("tr",{children:d.map((c,h)=>e.jsx("td",{className:`${h===0?"text-lg sticky left-0":""} ${p%2===0?"bg-gray-50":"bg-white"}`,children:h==1?e.jsx("div",{className:`${c&&c.style&&c.style["font-weight"]&&c.style["font-weight"]==="bold"?"font-bold":""}`,children:e.jsx(Y,{value:{...c,href:"/runs/?q="+v(String(d[0].value))},title:`Click value to see all predictions for: 
${v(String(d[0].value))}`})}):e.jsx("div",{className:`${c&&c.style&&c.style["font-weight"]&&c.style["font-weight"]==="bold"?"font-bold":""} ${h===0?"underline decoration-dashed decoration-gray-300":""}`,children:e.jsx(Y,{value:{...c},title:String(d[0].value)===c.value?f(String(d[0].value)):`Click value to see predictions for ${u(k(i.header[h]))}: ${v(String(d[0].value))}`})})},`${t}-${h}`))},`${t}-${p}`))})]})})})})}function qt(){const[s,t]=a.useState([]),[r,n]=a.useState(),[l,o]=a.useState([]),[i,m]=a.useState(),[x,N]=a.useState(!0),[S,k]=a.useState(0);function b(f,v){console.log(f,v);const R=f.find(u=>u.title===v);return R??f[0]}function w(f,v){n(b(f,v))}return a.useEffect(()=>{const f=new AbortController;async function v(){const R=await fe(f.signal),u=[];if(R.forEach(c=>{c.rows.forEach(h=>{u.push({title:String(h[0].value),name:h[0].href.replace("?group=","")})})}),t(u),u.length===0)throw new Error("Could not find any groups!");const y=r?r.name:u[0].name,[d,p]=await Promise.all([pe(y,f.signal),je(f.signal)]);o(d),m(p[y]),N(!1)}return v(),()=>f.abort()},[r]),x||i===void 0?e.jsx(B,{}):l.length===0?e.jsxs(e.Fragment,{children:[e.jsx(I,{title:i.display_name,subtitle:i.description,markdown:!0}),e.jsx("div",{className:"divider"}),e.jsx("p",{className:"text-center mt-8",children:"Group currently has no results."})]}):e.jsx(e.Fragment,{children:e.jsxs(e.Fragment,{children:[e.jsxs("div",{className:"flex flex-row justify-between",children:[e.jsx(I,{title:"HELM Leaderboard",subtitle:"The HELM leaderboard shows how the various models perform across different scenarios and metrics.",markdown:!0}),e.jsxs("div",{className:"w-64 pt-8",children:[e.jsx("label",{htmlFor:"group",className:"block text-sm font-medium text-gray-700",children:"Select a group:"}),e.jsx("select",{id:"group",name:"group",value:r?r.title:s[0].title,onChange:f=>w(s,f.target.value),className:"mt-1 block w-full pl-3 pr-10 py-2 text-base border-gray-300 focus:outline-none focus:ring focus:border-blue-300 
rounded-md",children:s.map((f,v)=>e.jsx("option",{value:f.title,children:f.title},v))})]})]}),e.jsx("div",{className:"overflow-x-auto",children:l.length>1?e.jsx(re,{children:l.map((f,v)=>e.jsx(V,{active:v===S,onClick:()=>k(v),children:f.title},v))}):null}),e.jsx(Gt,{groupsTables:l,activeGroup:S,ignoreHref:!0})]})})}const Jt=""+new URL("instruct-flowchart-48854f7c.svg",import.meta.url).href,Wt=""+new URL("instruct-graph-0a57d7d2.svg",import.meta.url).href;function Zt(){return e.jsxs("div",{className:"container mx-auto px-16",children:[e.jsx("h1",{className:"text-3xl mt-16 font-bold text-center",children:"HELM Instruct: A Multidimensional Instruction Following Evaluation Framework with Absolute Ratings"}),e.jsxs("div",{className:"flex flex-col sm:flex-row justify-center gap-2 sm:gap-8 md:gap-32 my-8",children:[e.jsx("a",{className:"px-10 btn rounded-md",href:"https://crfm.stanford.edu/2024/02/18/helm-instruct.html",children:"Blog Post"}),e.jsx("a",{className:"px-10 btn rounded-md",href:"https://github.com/stanford-crfm/helm",children:"Github"})]}),e.jsxs("p",{children:["We introduce ",e.jsx("em",{children:"HELM Instruct"}),", a multidimensional evaluation framework for instruction-following LLMs with absolute ratings. The framework takes an instruction, a model, an evaluator, and a criterion to generate a score. In our study, we use HELM Instruct to compare 4 instruction-following models on 7 scenarios based on 4 Human/LM evaluators and 5 criteria. 
Check out the blog post for more details."]}),e.jsxs("div",{className:"grid my-16 grid-cols-1 md:mx-32 md:grid-cols-2 md:gap-2",children:[e.jsx("img",{src:Jt,alt:"Evaluation flowchart",className:"mx-auto block",sizes:"100vw"}),e.jsx("img",{src:Wt,alt:"7 scenarios, 4 models, 4 evaluators and 5 criteria",className:"mx-auto block",sizes:"100vw"})]}),e.jsxs("table",{className:"rounded-lg shadow-md table",children:[e.jsx("thead",{children:e.jsxs("tr",{children:[e.jsx("th",{children:"Model"}),e.jsx("th",{children:"Average"}),e.jsx("th",{children:"Helpfulness"}),e.jsx("th",{children:"Understandability"}),e.jsx("th",{children:"Completeness"}),e.jsx("th",{children:"Conciseness"}),e.jsx("th",{children:"Harmlessness"})]})}),e.jsxs("tbody",{children:[e.jsxs("tr",{children:[e.jsx("td",{children:"openai_gpt-4-0314"}),e.jsx("td",{children:"4.63"}),e.jsx("td",{children:"4.42"}),e.jsx("td",{children:"4.85"}),e.jsx("td",{children:"4.50"}),e.jsx("td",{children:"4.42"}),e.jsx("td",{children:"4.95"})]}),e.jsxs("tr",{children:[e.jsx("td",{children:"openai_gpt-3.5-turbo-0613"}),e.jsx("td",{children:"4.60"}),e.jsx("td",{children:"4.34"}),e.jsx("td",{children:"4.86"}),e.jsx("td",{children:"4.42"}),e.jsx("td",{children:"4.41"}),e.jsx("td",{children:"4.97"})]}),e.jsxs("tr",{children:[e.jsx("td",{children:"anthropic_claude-v1.3"}),e.jsx("td",{children:"4.56"}),e.jsx("td",{children:"4.25"}),e.jsx("td",{children:"4.87"}),e.jsx("td",{children:"4.32"}),e.jsx("td",{children:"4.40"}),e.jsx("td",{children:"4.97"})]}),e.jsxs("tr",{children:[e.jsx("td",{children:"cohere_command-xlarge-beta"}),e.jsx("td",{children:"4.31"}),e.jsx("td",{children:"3.90"}),e.jsx("td",{children:"4.73"}),e.jsx("td",{children:"3.88"}),e.jsx("td",{children:"4.31"}),e.jsx("td",{children:"4.72"})]})]})]})]})}function ge({models:s}){return e.jsxs("section",{children:[e.jsxs("h3",{className:"text-3xl",children:[s.length," models"]}),e.jsx("ul",{children:s.map((t,r)=>t.todo?e.jsxs("li",{className:"text-slate-300 
mt-1",children:[t.creator_organization," / ",t.display_name]},r):e.jsx(E,{to:"models",children:e.jsxs("li",{className:"text-black mt-1",children:[t.creator_organization," / ",t.display_name]},r)}))})]})}function ve({runGroups:s}){const t=new Map(s.filter(l=>l.metric_groups!==void 0&&(l.subgroups===void 0||l.subgroups.length===0)).map(l=>[l.name,l])),r=new Set,n=[];return s.forEach(l=>{const o=l.subgroups?l.subgroups:[],i=[];o.forEach(m=>{const x=t.get(m);x&&(i.push(x),r.add(x.name))}),i.length>0&&n.push([l,i])}),e.jsxs("section",{children:[e.jsxs("h3",{className:"text-3xl",children:[r.size," scenarios"]}),e.jsx("ul",{children:n.map(([l,o])=>e.jsxs("li",{className:"my-3",children:[e.jsx(E,{className:"text-black",to:"groups/"+l.name,children:e.jsx("h2",{children:l.display_name})}),e.jsx("ul",{className:"list-disc list-inside",children:o.map(i=>i.todo?e.jsx("li",{className:`${i.todo?"ml-4 text-slate-300":"ml-4"}`,children:i.display_name},i.name):e.jsx(E,{className:"text-black",to:"groups/"+i.name,children:e.jsx("li",{className:`${i.todo?"ml-4 text-slate-300":"ml-4"}`,children:i.display_name},i.name)}))})]},l.name))})]})}const Ve=""+new URL("helmhero-28e90f4d.png",import.meta.url).href;function Kt({groupsTables:s,activeGroup:t,sortFirstMetric:r=!0,filteredCols:n=[],modelsToFilter:l=[],numModelsToAutoFilter:o=0}){const[i,m]=a.useState(r?1:void 0),[x,N]=a.useState({...s[t]}),[S,k]=a.useState(1),[b,w]=a.useState(l);function f(c){return c.length>30?c.substring(0,27)+"...":c}const v=c=>c.value==="Model/adapter"?"Model":c.value.includes("-book")?f(c.value.replace("-book","")):f(c.value),[R,u]=a.useState(void 0);a.useEffect(()=>{const c=new AbortController;async function h(){const j=await H(c.signal);u(j)}return h(),()=>c.abort()},[]);const y=c=>{if(R){const h=R.models.find(j=>j.display_name===c);if(h){let j=h.description;return j.includes("/")&&(j=j.replace("/","_")),j}}return""},d=c=>{if(R){const h=R.models.find(j=>j.display_name===c);if(h){let j=h.name;return 
j.includes("/")&&(j=j.replace("/","_")),j}}return""};a.useEffect(()=>{if(N({...s[t]}),o){const _=s[0].rows.sort((A,T)=>Number(T[1].value)-Number(A[1].value)).slice(0,o).map(A=>String(A[0].value));w(_)}},[t,s,o]);const p=c=>{let h=S;i===c?h=h*-1:h=1,m(c),k(h),N(j=>{const _={...j};return _.rows.sort((A,T)=>{var Z,Q;const C=(Z=A[c])==null?void 0:Z.value,$=(Q=T[c])==null?void 0:Q.value;return C!==void 0&&$===void 0?-1:$!==void 0&&C===void 0?1:typeof C=="number"&&typeof $=="number"?(C-$)*h:typeof C=="string"&&typeof $=="string"?h===1?C.localeCompare($):$.localeCompare(C):0}),_})};return a.useEffect(()=>{r&&i&&p(i)},[r,i]),e.jsx(e.Fragment,{children:e.jsx("div",{className:"rounded-2xl overflow-hidden border-2 bg-white p-1 mx-2 my-0",style:{overflow:"auto",justifyContent:"space-between"},children:e.jsx("div",{className:"overflow-x-auto",children:e.jsxs("table",{className:"table w-full",children:[e.jsx("thead",{children:e.jsx("tr",{children:x.header.filter((c,h)=>n.length===0||n.includes(h)).map((c,h)=>e.jsx("th",{className:`${h===i?"bg-gray-100":""} ${c.description?"underline decoration-dashed":""} whitespace-nowrap px-4 `,title:c.description?c.description:"",children:e.jsx("div",{className:"flex gap-2 items-center",children:e.jsx("span",{children:v(c)})})},`${t}-${h}`))})}),e.jsx("tbody",{children:x.rows.filter(c=>b.includes(String(c[0].value))).map((c,h)=>e.jsx("tr",{className:`${h%2===0?"bg-gray-50":""}`,children:c.filter((j,_)=>n.length===0||n.includes(_)).map((j,_)=>e.jsx("td",{className:`${_===0?"text-lg":""}`,children:e.jsx("div",{className:j&&j.style&&j.style["font-weight"]&&j.style["font-weight"]==="bold"?"font-bold":"",children:_===0?e.jsx(Y,{value:{...j},title:y(String(c[0].value)),hideIcon:!0}):e.jsx(Y,{value:{...j,href:"/runs/?q="+d(String(c[0].value))},title:`Click value to see all predictions for: ${d(String(c[0].value))}`})})},`${t}-${_}`))},`${t}-${h}`))})]})})})})}function 
Ye({numModelsToAutoFilter:s=6}){const[t,r]=a.useState([]),[n,l]=a.useState([]),[o,i]=a.useState(),[m,x]=a.useState(!0),N=0;return console.log(t),a.useEffect(()=>{const S=new AbortController;async function k(){const b=await fe(S.signal),w=[];if(b.forEach(u=>{u.rows.forEach(y=>{w.push({title:String(y[0].value),name:y[0].href.replace("?group=","")})})}),r(w),w.length===0)throw new Error("Could not find any groups!");const f=w[0].name,[v,R]=await Promise.all([pe(f,S.signal),je(S.signal)]);l(v),i(R[f]),x(!1)}return k(),()=>S.abort()},[]),m||o===void 0?e.jsx(B,{}):n.length===0?e.jsxs(e.Fragment,{children:[e.jsx(I,{title:o.display_name,subtitle:o.description,markdown:!0}),e.jsx("div",{className:"divider"}),e.jsx("p",{className:"text-center mt-8",children:"Group currently has no results."})]}):e.jsx(e.Fragment,{children:e.jsx(e.Fragment,{children:e.jsx(Kt,{groupsTables:n,activeGroup:N,numModelsToAutoFilter:s,filteredCols:[0,1]})})})}function Vt(){return e.jsxs("div",{className:"flex flex-col px-4 sm:px-6 py-100 sm:py-10 sm:mb-96 md:mb-96 lg:mb-0 xl:mb-0 2xl:mb-0",children:[e.jsx("div",{className:"flex flex-col text-center mb-10 justify-start",children:e.jsx("h1",{className:"text-3xl sm:text-4xl mb-3 sm:mb-4 mx-2 mt-2",children:e.jsx("strong",{children:"A holistic framework for evaluating foundation models."})})}),e.jsxs("div",{className:"flex flex-col md:flex-col lg:flex-row lg:justify-center",style:{height:"525px",transform:"scale(0.9)"},children:[e.jsx("div",{className:"w-full lg:w-1/2 flex justify-center mb-4 lg:mb-0 h-full py-10",children:e.jsx("img",{src:Ve,alt:"HELM Hero",className:"object-cover h-full",style:{maxWidth:"100%"}})}),e.jsx("div",{className:"w-full lg:w-1/2 flex justify-center h-full py-10",children:e.jsxs("div",{className:"py-2 pb-6 rounded-3xl bg-gray-100 h-full",style:{maxWidth:"100%"},children:[e.jsx(Ye,{}),e.jsx("div",{className:"flex justify-end",children:e.jsx(E,{to:"leaderboard",children:e.jsx("button",{className:"px-4 mx-3 mt-1 btn bg-white 
rounded-md",children:e.jsx("span",{children:"See More"})})})})]})})]})]})}const Xe=""+new URL("ai21-0eb91ec3.png",import.meta.url).href,Qe=""+new URL("aleph-alpha-7ce10034.png",import.meta.url).href,es=""+new URL("anthropic-70d8bc39.png",import.meta.url).href,ss=""+new URL("bigscience-7f0400c0.png",import.meta.url).href,ts=""+new URL("cohere-3550c6cb.png",import.meta.url).href,ns=""+new URL("eleutherai-b9451114.png",import.meta.url).href,rs=""+new URL("google-06d997ad.png",import.meta.url).href,as=""+new URL("meta-5580e9f1.png",import.meta.url).href,ls=""+new URL("microsoft-f5ee5016.png",import.meta.url).href,is=""+new URL("mistral-18e1be23.png",import.meta.url).href,os=""+new URL("nvidia-86fa75c1.png",import.meta.url).href,cs=""+new URL("openai-3f8653e4.png",import.meta.url).href,ds=""+new URL("tii-24de195c.png",import.meta.url).href,ms=""+new URL("together-a665a35b.png",import.meta.url).href,us=""+new URL("tsinghua-keg-97d4b395.png",import.meta.url).href,hs="",xs=""+new URL("yandex-38e09d70.png",import.meta.url).href,fs=""+new URL("01-694cb9b7.png",import.meta.url).href,Yt=[Xe,Qe,es,ss,ts,ns,rs,as,ls,is,os,cs,ds,ms,us,hs,xs,fs];function Ce(){const[s,t]=a.useState(void 0);return a.useEffect(()=>{const r=new AbortController;async function n(){const l=await H(r.signal);t(l)}return n(),()=>r.abort()},[]),s?e.jsxs(e.Fragment,{children:[e.jsx(Vt,{}),e.jsxs("div",{className:"mx-auto text-lg px-16",children:[e.jsxs("div",{className:"container mb-12 mx-auto text-lg px-16",children:[e.jsxs("div",{className:"flex flex-col sm:flex-row justify-center mt-10 mb-10 flex gap-2 sm:gap-8 md:gap-32",children:[" ",e.jsx("h1",{className:"text-4xl mx-4 mt-40",children:e.jsx("strong",{children:"Our Partners"})})]}),e.jsx("ol",{className:"my-8 flex flex-col gap-32",children:e.jsx("li",{children:e.jsx("div",{className:"flex flex-wrap justify-center max-w-[1100px] mx-auto w-auto",children:Yt.map((r,n)=>e.jsx("div",{className:"w-24 h-24 flex items-center 
m-6",children:e.jsx("img",{src:r,alt:"Logo",className:"mx-auto block",sizes:"100vw"})},n))})})})]}),e.jsx("div",{className:"container mx-auto",children:e.jsxs("div",{className:"grid grid-cols-1 sm:grid-cols-2 gap-8",children:[e.jsx(ge,{models:s.models}),e.jsx(ve,{runGroups:s.run_groups})]})})]})]}):null}function Xt(){return e.jsxs("div",{className:"container mx-auto px-16",children:[e.jsx("h1",{className:"text-3xl mt-16 my-8 font-bold text-center",children:"Massive Multitask Language Understanding (MMLU) on HELM"}),e.jsxs("div",{className:"flex flex-row md:flex-row items-center gap-8",children:[e.jsxs("div",{className:"flex-1 text-xl",children:[e.jsxs("p",{children:[e.jsx("strong",{children:"Massive Multitask Language Understanding (MMLU)"})," ",e.jsx("a",{href:"https://arxiv.org/pdf/2009.03300.pdf",className:"link",children:"(Hendrycks et al, 2020)"})," ","is a multiple-choice question answering test that covers 57 tasks including elementary mathematics, US history, computer science, law, and more. We publish evaluation results from evaluating various models on MMLU using HELM. 
Our evaluation results include the following:"]}),e.jsxs("ul",{className:"my-2 list-disc list-inside",children:[e.jsx("li",{children:"Simple, standardized prompts"}),e.jsx("li",{children:"Accuracy breakdown for each of the 57 subjects"}),e.jsx("li",{children:"Full transparency of all raw prompts and predictions"})]}),e.jsx("div",{className:"flex flex-row justify-center mt-8",children:e.jsx("a",{className:"px-10 btn rounded-md",href:"#/leaderboard",children:"Full Leaderboard"})})]}),e.jsx("div",{className:"flex-1",children:e.jsx(Ye,{numModelsToAutoFilter:10})})]})]})}const Qt=""+new URL("heim-logo-3e5e3aa4.png",import.meta.url).href;function en({metricFieldMap:s,metricGroups:t}){const r=new Set,n=[];return t.forEach(l=>{const o=[];l.metrics.forEach(i=>{const m=s[i.name];m&&(o.push(m),r.add(m.name))}),o.length>0&&n.push([l,o])}),e.jsxs("section",{children:[e.jsxs("h3",{className:"text-3xl",children:[r.size," metrics"]}),e.jsx("ul",{children:n.map(([l,o])=>e.jsxs("li",{className:"my-3",children:[e.jsx("h4",{children:l.display_name}),e.jsx("ul",{className:"list-disc list-inside",children:o.map(i=>e.jsx("li",{className:"ml-4",children:i.display_name},i.name))})]},l.name))})]})}function sn(){const[s,t]=a.useState(void 0);a.useEffect(()=>{const n=new AbortController;async function l(){const o=await H(n.signal);t(o)}return l(),()=>n.abort()},[]);const r=s?s.metrics.reduce((n,l)=>(n[l.name]=l,n),{}):void 0;return e.jsxs("div",{className:"container mx-auto px-16 text-base",children:[e.jsx("div",{className:"container max-w-screen-lg mx-auto",children:e.jsx("img",{className:"mx-auto w-96",src:Qt,alt:"HEIM Logo"})}),e.jsx("h1",{className:"text-3xl my-8 font-bold text-center",children:"Holistic Evaluation of Text-To-Image Models"}),e.jsxs("div",{className:"flex flex-col sm:flex-row justify-center gap-2 sm:gap-8 md:gap-32 my-8",children:[e.jsx("a",{className:"px-10 btn rounded-md",href:"https://arxiv.org/abs/2311.04287",children:"Paper"}),e.jsx("a",{className:"px-10 btn 
rounded-md",href:"https://github.com/stanford-crfm/helm",children:"Github"})]}),e.jsxs("p",{className:"my-2",children:["Significant effort has recently been made in developing text-to-image generation models, which take textual prompts as input and generate images. As these models are widely used in real-world applications, there is an urgent need to comprehensively understand their capabilities and risks. However, existing evaluations primarily focus on image-text alignment and image quality. To address this limitation, we introduce a new benchmark,"," ",e.jsx("strong",{children:"Holistic Evaluation of Text-To-Image Models (HEIM)"}),"."]}),e.jsx("p",{className:"my-2",children:"We identify 12 different aspects that are important in real-world model deployment, including:"}),e.jsxs("ul",{className:"my-2 list-disc list-inside unreset",children:[e.jsx("li",{children:"image-text alignment"}),e.jsx("li",{children:"image quality"}),e.jsx("li",{children:"aesthetics"}),e.jsx("li",{children:"originality"}),e.jsx("li",{children:"reasoning"}),e.jsx("li",{children:"knowledge"}),e.jsx("li",{children:"bias"}),e.jsx("li",{children:"toxicity"}),e.jsx("li",{children:"fairness"}),e.jsx("li",{children:"robustness"}),e.jsx("li",{children:"multilinguality"}),e.jsx("li",{children:"efficiency"})]}),e.jsx("p",{className:"my-2",children:"By curating scenarios encompassing these aspects, we evaluate state-of-the-art text-to-image models using this benchmark. Unlike previous evaluations that focused on alignment and quality, HEIM significantly improves coverage by evaluating all models across all aspects. 
Our results reveal that no single model excels in all aspects, with different models demonstrating strengths in different aspects."}),e.jsx("p",{className:"my-2",children:"For full transparency, this website contains all the prompts, generated images and the results for the automated and human evaluation metrics."}),e.jsx("p",{className:"my-2",children:"Inspired by HELM, we decompose the model evaluation into four key components: aspect, scenario, adaptation, and metric:"}),e.jsx("div",{className:"container max-w-screen-lg mx-auto my-8",children:e.jsx("img",{src:"https://crfm.stanford.edu/heim/latest/images/heim-main.png",alt:"HEIM scenarios, prompts, images and metrics"})}),s&&r?e.jsxs("div",{className:"grid grid-cols-1 md:grid-cols-3 gap-8",children:[e.jsx(ge,{models:s.models}),e.jsx(ve,{runGroups:s.run_groups}),e.jsx(en,{metricFieldMap:r,metricGroups:s.metric_groups})]}):null]})}const tn=""+new URL("vhelm-framework-cde7618a.png",import.meta.url).href,nn=""+new URL("vhelm-model-6d812526.png",import.meta.url).href;function rn(){const[s,t]=a.useState(void 0);return a.useEffect(()=>{const r=new AbortController;async function n(){const l=await H(r.signal);t(l)}return n(),()=>r.abort()},[]),e.jsxs("div",{className:"container mx-auto px-16",children:[e.jsx("h1",{className:"text-3xl mt-16 my-8 font-bold text-center",children:"The First Steps to Holistic Evaluation of Vision-Language Models"}),e.jsxs("p",{className:"my-4",children:["To better understand VLMs, we introduce the first version of"," ",e.jsx("em",{children:"Holistic Evaluation of Vision-Language Models (VHELM)"})," by extending the ",e.jsx("a",{href:"https://arxiv.org/abs/2211.09110",children:"HELM"})," ","framework with the necessary adaptation methods to assess the performance of 6 prominent VLMs on 3 standard VLM benchmarks."]}),e.jsx("p",{className:"my-4 font-bold",children:"This is ongoing work to achieve holistic evaluation for vision-language models, so please stay tuned!"}),e.jsx("img",{src:tn,alt:"An 
image of a helm and the text 'This helm is a' is sent to a Vision-Language Model, which produces the text 'wheel for steering a ship...'",className:"mx-auto lg:max-w-3xl block my-8"}),e.jsx("img",{src:nn,alt:"An example of an evaluation for an Aspect (Knowledge) - a Scenario (MMMU) undergoes Adaptation (multimodal multiple choice) for a Model (GPT-4 Vision), then Metrics (Exact match) are computed",className:"mx-auto lg:max-w-3xl block my-8"}),s===void 0?null:e.jsxs("div",{className:"grid grid-cols-1 sm:grid-cols-2 gap-8",children:[e.jsx(ge,{models:s.models}),e.jsx(ve,{runGroups:s.run_groups})]})]})}const an=({id:s,title:t,text:r})=>(t.includes("HE")||(t="HELM "+t),e.jsx("div",{className:"max-w-sm rounded overflow-hidden bg-gray-100 hover:scale-105 transition-transform duration-300",children:e.jsx("a",{href:xe(void 0,s),children:e.jsxs("div",{className:"px-6 py-4",children:[e.jsxs("div",{className:"font-bold text-xl mb-2",children:[e.jsx("div",{className:"py-3",children:e.jsx("svg",{fill:"#000000",width:"20px",height:"20px",viewBox:"0 0 24 24",xmlns:"http://www.w3.org/2000/svg",children:e.jsx("path",{d:"M22,7H16.333V4a1,1,0,0,0-1-1H8.667a1,1,0,0,0-1,1v7H2a1,1,0,0,0-1,1v8a1,1,0,0,0,1,1H22a1,1,0,0,0,1-1V8A1,1,0,0,0,22,7ZM7.667,19H3V13H7.667Zm6.666,0H9.667V5h4.666ZM21,19H16.333V9H21Z"})})}),t+" →"]}),e.jsx("p",{className:"text-gray-700 text-base",children:r})]})})}));function ln(){const[s,t]=a.useState();return a.useEffect(()=>{fetch("https://raw.githubusercontent.com/stanford-crfm/helm/main/helm-frontend/project_metadata.json").then(r=>r.json()).then(r=>{t(r)}).catch(r=>{console.error("Error fetching JSON:",r)})},[]),e.jsx("div",{className:"p-10 mb-20",children:e.jsx("div",{className:"grid grid-cols-3 gap-4",children:s&&s.map((r,n)=>r.id==="home"?null:e.jsx(an,{id:r.id,title:r.title,text:r.description},n))})})}function on(){return e.jsxs("div",{className:"flex flex-col md:flex-row px-6 py-36",children:[e.jsxs("div",{className:"flex-1 p-4 flex flex-col 
justify-center",children:[e.jsx("div",{className:"flex justify-start",children:e.jsxs("div",{children:[e.jsx("h1",{className:"text-4xl mb-4 mx-4 mt-2",children:e.jsx("strong",{children:"A reproducible and transparent framework for evaluating foundation models."})}),e.jsx("h3",{className:`text-xl
|
|
10
|
+
mb-4 mx-4 mt-2`,children:"Find leaderboards with many scenarios, metrics, and models with support for multimodality and model-graded evaluation."})]})}),e.jsxs("div",{className:"flex flex-col md:flex-row justify-start mt-6 ml-4",children:[e.jsx("button",{className:"px-6 btn btn-grey rounded-md mb-4 md:mb-0",onClick:()=>window.scrollTo({top:760,behavior:"smooth"}),children:e.jsx("div",{children:"Leaderboards ↓"})}),e.jsx("button",{className:"px-6 btn btn-grey rounded-md md:ml-4",children:e.jsx("a",{href:"https://github.com/stanford-crfm/helm",children:"Github"})})]})]}),e.jsx("div",{className:"mx-4 mt-6 md:mt-0 md:w-1/3",children:e.jsx("img",{src:Ve,alt:"HELM Hero",className:"object-cover w-full h-full"})})]})}const cn=[Xe,Qe,es,ss,ts,ns,rs,as,ls,is,os,cs,ds,ms,us,hs,xs,fs];function dn(){const[s,t]=a.useState(void 0);return a.useEffect(()=>{const r=new AbortController;async function n(){const l=await H(r.signal);t(l)}return n(),()=>r.abort()},[]),s?e.jsxs(e.Fragment,{children:[e.jsx(on,{}),e.jsxs("div",{className:"container mt-30 mx-auto text-lg",children:[e.jsx("div",{className:"flex flex-col sm:flex-row justify-center mb-10 flex sm:gap-8 md:gap-32",children:e.jsx("h1",{className:"text-4xl mx-4 ",children:e.jsx("strong",{children:"HELM Leaderboards"})})}),e.jsx("div",{className:"flex flex-col sm:flex-row flex sm:gap-8 md:gap-32",children:e.jsx("body",{children:"HELM leaderboards leverage the HELM framework and target particular domains and/or capabilities. 
Leaderboards range from real world applications and specific domains to ones focused on multimodal capabilities and model-evaluations."})})]}),e.jsx(ln,{}),e.jsx("div",{className:"mx-auto text-lg px-16",children:e.jsxs("div",{className:"container mb-12 mx-auto text-lg px-16",children:[e.jsxs("div",{className:"flex flex-col sm:flex-row justify-center mt-10 mb-10 flex gap-2 sm:gap-8 md:gap-32",children:[" ",e.jsx("h1",{className:"text-4xl mx-4 mt-40",children:e.jsx("strong",{children:"Our Partners"})})]}),e.jsx("ol",{className:"my-8 flex flex-col gap-32",children:e.jsx("li",{children:e.jsx("div",{className:"flex flex-wrap justify-center max-w-[1100px] mx-auto w-auto",children:cn.map((r,n)=>e.jsx("div",{className:"w-24 h-24 flex items-center m-6",children:e.jsx("img",{src:r,alt:"Logo",className:"mx-auto block",sizes:"100vw"})},n))})})})]})})]}):null}function mn(){return window.PROJECT_ID==="lite"?e.jsx(Ce,{}):window.PROJECT_ID==="instruct"?e.jsx(Zt,{}):window.PROJECT_ID==="heim"?e.jsx(sn,{}):window.PROJECT_ID==="mmlu"?e.jsx(Xt,{}):window.PROJECT_ID==="vhelm"?e.jsx(rn,{}):window.PROJECT_ID==="home"?e.jsx(dn,{}):e.jsx(Ce,{})}function un(){return e.jsx(Ls,{children:e.jsx(As,{children:e.jsxs(D,{path:"/",element:e.jsx(ht,{}),children:[e.jsx(D,{index:!0,element:e.jsx(mn,{})}),e.jsx(D,{path:"leaderboard",element:e.jsx(qt,{})}),e.jsx(D,{path:"models",element:e.jsx(gt,{})}),e.jsx(D,{path:"scenarios",element:e.jsx(vt,{})}),e.jsx(D,{path:"groups",element:e.jsx(yt,{})}),e.jsx(D,{path:"groups/:groupName",element:e.jsx(Nt,{})}),e.jsx(D,{path:"runs",element:e.jsx(St,{})}),e.jsx(D,{path:"runs/:runName",element:e.jsx(zt,{})})]})})})}de.createRoot(document.getElementById("root")).render(e.jsx(Cs.StrictMode,{children:e.jsx(un,{})}));
|