crfm-helm 0.4.0__py3-none-any.whl → 0.5.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of crfm-helm might be problematic. Click here for more details.
- {crfm_helm-0.4.0.dist-info → crfm_helm-0.5.1.dist-info}/METADATA +138 -31
- crfm_helm-0.5.1.dist-info/RECORD +654 -0
- {crfm_helm-0.4.0.dist-info → crfm_helm-0.5.1.dist-info}/WHEEL +1 -1
- helm/benchmark/adaptation/adapter_spec.py +31 -3
- helm/benchmark/adaptation/adapters/adapter.py +2 -2
- helm/benchmark/adaptation/adapters/adapter_factory.py +24 -27
- helm/benchmark/adaptation/adapters/generation_adapter.py +1 -0
- helm/benchmark/adaptation/adapters/in_context_learning_adapter.py +20 -4
- helm/benchmark/adaptation/adapters/language_modeling_adapter.py +2 -3
- helm/benchmark/adaptation/adapters/multimodal/in_context_learning_multimodal_adapter.py +1 -0
- helm/benchmark/adaptation/adapters/multimodal/multimodal_prompt.py +7 -0
- helm/benchmark/adaptation/adapters/multimodal/multiple_choice_joint_multimodal_adapter.py +104 -0
- helm/benchmark/adaptation/adapters/multimodal/test_in_context_learning_multimodal_adapter.py +2 -1
- helm/benchmark/adaptation/adapters/multimodal/test_multimodal_prompt.py +2 -0
- helm/benchmark/adaptation/adapters/test_adapter.py +2 -1
- helm/benchmark/adaptation/adapters/test_generation_adapter.py +32 -8
- helm/benchmark/adaptation/adapters/test_language_modeling_adapter.py +7 -19
- helm/benchmark/adaptation/adapters/test_multiple_choice_joint_adapter.py +60 -6
- helm/benchmark/adaptation/common_adapter_specs.py +376 -0
- helm/benchmark/adaptation/request_state.py +6 -1
- helm/benchmark/adaptation/scenario_state.py +6 -2
- helm/benchmark/annotation/annotator.py +43 -0
- helm/benchmark/annotation/annotator_factory.py +61 -0
- helm/benchmark/annotation/image2structure/image_compiler_annotator.py +88 -0
- helm/benchmark/annotation/image2structure/latex_compiler_annotator.py +59 -0
- helm/benchmark/annotation/image2structure/lilypond_compiler_annotator.py +84 -0
- helm/benchmark/annotation/image2structure/webpage_compiler_annotator.py +132 -0
- helm/benchmark/annotation/test_annotator_factory.py +26 -0
- helm/benchmark/annotation/test_dummy_annotator.py +44 -0
- helm/benchmark/annotation_executor.py +124 -0
- helm/benchmark/augmentations/data_augmenter.py +0 -2
- helm/benchmark/augmentations/gender_perturbation.py +1 -1
- helm/benchmark/augmentations/perturbation.py +25 -3
- helm/benchmark/augmentations/perturbation_description.py +1 -1
- helm/benchmark/augmentations/suffix_perturbation.py +29 -0
- helm/benchmark/augmentations/test_perturbation.py +41 -7
- helm/benchmark/augmentations/translate_perturbation.py +30 -0
- helm/benchmark/config_registry.py +7 -1
- helm/benchmark/executor.py +46 -16
- helm/benchmark/huggingface_registration.py +20 -7
- helm/benchmark/metrics/basic_metrics.py +169 -664
- helm/benchmark/metrics/bbq_metrics.py +3 -4
- helm/benchmark/metrics/bias_metrics.py +6 -6
- helm/benchmark/metrics/classification_metrics.py +11 -8
- helm/benchmark/metrics/cleva_accuracy_metrics.py +8 -5
- helm/benchmark/metrics/cleva_harms_metrics.py +2 -2
- helm/benchmark/metrics/code_metrics_helper.py +0 -2
- helm/benchmark/metrics/common_metric_specs.py +167 -0
- helm/benchmark/metrics/decodingtrust_fairness_metrics.py +72 -0
- helm/benchmark/metrics/decodingtrust_ood_knowledge_metrics.py +66 -0
- helm/benchmark/metrics/decodingtrust_privacy_metrics.py +101 -0
- helm/benchmark/metrics/decodingtrust_stereotype_bias_metrics.py +202 -0
- helm/benchmark/metrics/disinformation_metrics.py +4 -110
- helm/benchmark/metrics/dry_run_metrics.py +2 -2
- helm/benchmark/metrics/efficiency_metrics.py +213 -0
- helm/benchmark/metrics/evaluate_instances_metric.py +59 -0
- helm/benchmark/metrics/evaluate_reference_metrics.py +392 -0
- helm/benchmark/metrics/image_generation/aesthetics_metrics.py +54 -0
- helm/benchmark/metrics/image_generation/aesthetics_scorer.py +66 -0
- helm/benchmark/metrics/image_generation/clip_score_metrics.py +73 -0
- helm/benchmark/metrics/image_generation/denoised_runtime_metric.py +42 -0
- helm/benchmark/metrics/image_generation/detection_metrics.py +57 -0
- helm/benchmark/metrics/image_generation/detectors/base_detector.py +8 -0
- helm/benchmark/metrics/image_generation/detectors/vitdet.py +178 -0
- helm/benchmark/metrics/image_generation/efficiency_metrics.py +41 -0
- helm/benchmark/metrics/image_generation/fidelity_metrics.py +168 -0
- helm/benchmark/metrics/image_generation/fractal_dimension/__init__.py +0 -0
- helm/benchmark/metrics/image_generation/fractal_dimension/fractal_dimension_util.py +63 -0
- helm/benchmark/metrics/image_generation/fractal_dimension/test_fractal_dimension_util.py +33 -0
- helm/benchmark/metrics/image_generation/fractal_dimension_metric.py +50 -0
- helm/benchmark/metrics/image_generation/gender_metrics.py +58 -0
- helm/benchmark/metrics/image_generation/image_critique_metrics.py +284 -0
- helm/benchmark/metrics/image_generation/lpips_metrics.py +82 -0
- helm/benchmark/metrics/image_generation/multi_scale_ssim_metrics.py +82 -0
- helm/benchmark/metrics/image_generation/nsfw_detector.py +96 -0
- helm/benchmark/metrics/image_generation/nsfw_metrics.py +103 -0
- helm/benchmark/metrics/image_generation/nudity_metrics.py +38 -0
- helm/benchmark/metrics/image_generation/photorealism_critique_metrics.py +153 -0
- helm/benchmark/metrics/image_generation/psnr_metrics.py +78 -0
- helm/benchmark/metrics/image_generation/q16/__init__.py +0 -0
- helm/benchmark/metrics/image_generation/q16/q16_toxicity_detector.py +90 -0
- helm/benchmark/metrics/image_generation/q16/test_q16.py +18 -0
- helm/benchmark/metrics/image_generation/q16_toxicity_metrics.py +48 -0
- helm/benchmark/metrics/image_generation/skin_tone_metrics.py +164 -0
- helm/benchmark/metrics/image_generation/uiqi_metrics.py +92 -0
- helm/benchmark/metrics/image_generation/watermark/__init__.py +0 -0
- helm/benchmark/metrics/image_generation/watermark/test_watermark_detector.py +16 -0
- helm/benchmark/metrics/image_generation/watermark/watermark_detector.py +87 -0
- helm/benchmark/metrics/image_generation/watermark_metrics.py +48 -0
- helm/benchmark/metrics/instruction_following_critique_metrics.py +3 -1
- helm/benchmark/metrics/language_modeling_metrics.py +99 -0
- helm/benchmark/metrics/machine_translation_metrics.py +89 -0
- helm/benchmark/metrics/metric.py +93 -172
- helm/benchmark/metrics/metric_name.py +0 -1
- helm/benchmark/metrics/metric_service.py +16 -0
- helm/benchmark/metrics/paraphrase_generation_metrics.py +3 -4
- helm/benchmark/metrics/ranking_metrics.py +2 -2
- helm/benchmark/metrics/reference_metric.py +148 -0
- helm/benchmark/metrics/summac/model_summac.py +0 -2
- helm/benchmark/metrics/summarization_metrics.py +2 -2
- helm/benchmark/metrics/test_classification_metrics.py +8 -5
- helm/benchmark/metrics/test_disinformation_metrics.py +78 -0
- helm/benchmark/metrics/{test_basic_metrics.py → test_evaluate_reference_metrics.py} +5 -1
- helm/benchmark/metrics/test_metric.py +2 -2
- helm/benchmark/metrics/tokens/gooseai_token_cost_estimator.py +10 -2
- helm/benchmark/metrics/toxicity_metrics.py +1 -1
- helm/benchmark/metrics/toxicity_utils.py +23 -0
- helm/benchmark/metrics/unitxt_metrics.py +81 -0
- helm/benchmark/metrics/vision_language/__init__.py +0 -0
- helm/benchmark/metrics/vision_language/emd_utils.py +341 -0
- helm/benchmark/metrics/vision_language/image_metrics.py +575 -0
- helm/benchmark/metrics/vision_language/image_utils.py +100 -0
- helm/benchmark/model_deployment_registry.py +74 -0
- helm/benchmark/model_metadata_registry.py +41 -1
- helm/benchmark/multi_gpu_runner.py +133 -0
- helm/benchmark/presentation/create_plots.py +8 -7
- helm/benchmark/presentation/run_display.py +26 -10
- helm/benchmark/presentation/schema.py +15 -40
- helm/benchmark/presentation/summarize.py +119 -79
- helm/benchmark/presentation/table.py +8 -8
- helm/benchmark/presentation/test_contamination.py +2 -2
- helm/benchmark/presentation/test_run_entry.py +1 -2
- helm/benchmark/presentation/test_summarize.py +3 -3
- helm/benchmark/run.py +54 -26
- helm/benchmark/run_expander.py +205 -35
- helm/benchmark/run_spec.py +93 -0
- helm/benchmark/run_spec_factory.py +163 -0
- helm/benchmark/run_specs/__init__.py +0 -0
- helm/benchmark/run_specs/classic_run_specs.py +1510 -0
- helm/benchmark/run_specs/cleva_run_specs.py +277 -0
- helm/benchmark/run_specs/decodingtrust_run_specs.py +314 -0
- helm/benchmark/run_specs/heim_run_specs.py +623 -0
- helm/benchmark/run_specs/instruction_following_run_specs.py +129 -0
- helm/benchmark/run_specs/lite_run_specs.py +307 -0
- helm/benchmark/run_specs/simple_run_specs.py +104 -0
- helm/benchmark/run_specs/unitxt_run_specs.py +42 -0
- helm/benchmark/run_specs/vlm_run_specs.py +757 -0
- helm/benchmark/runner.py +51 -57
- helm/benchmark/runner_config_registry.py +21 -0
- helm/benchmark/scenarios/bbq_scenario.py +1 -1
- helm/benchmark/scenarios/bold_scenario.py +2 -2
- helm/benchmark/scenarios/code_scenario.py +1 -0
- helm/benchmark/scenarios/decodingtrust_adv_demonstration_scenario.py +169 -0
- helm/benchmark/scenarios/decodingtrust_adv_robustness_scenario.py +121 -0
- helm/benchmark/scenarios/decodingtrust_fairness_scenario.py +77 -0
- helm/benchmark/scenarios/decodingtrust_machine_ethics_scenario.py +324 -0
- helm/benchmark/scenarios/decodingtrust_ood_robustness_scenario.py +204 -0
- helm/benchmark/scenarios/decodingtrust_privacy_scenario.py +559 -0
- helm/benchmark/scenarios/decodingtrust_stereotype_bias_scenario.py +67 -0
- helm/benchmark/scenarios/decodingtrust_toxicity_prompts_scenario.py +78 -0
- helm/benchmark/scenarios/dialogue_scenarios.py +0 -1
- helm/benchmark/scenarios/image_generation/__init__.py +0 -0
- helm/benchmark/scenarios/image_generation/common_syntactic_processes_scenario.py +105 -0
- helm/benchmark/scenarios/image_generation/cub200_scenario.py +95 -0
- helm/benchmark/scenarios/image_generation/daily_dalle_scenario.py +124 -0
- helm/benchmark/scenarios/image_generation/demographic_stereotypes_scenario.py +82 -0
- helm/benchmark/scenarios/image_generation/detection_scenario.py +83 -0
- helm/benchmark/scenarios/image_generation/draw_bench_scenario.py +74 -0
- helm/benchmark/scenarios/image_generation/i2p_scenario.py +57 -0
- helm/benchmark/scenarios/image_generation/landing_page_scenario.py +46 -0
- helm/benchmark/scenarios/image_generation/logos_scenario.py +223 -0
- helm/benchmark/scenarios/image_generation/magazine_cover_scenario.py +91 -0
- helm/benchmark/scenarios/image_generation/mental_disorders_scenario.py +46 -0
- helm/benchmark/scenarios/image_generation/mscoco_scenario.py +91 -0
- helm/benchmark/scenarios/image_generation/paint_skills_scenario.py +72 -0
- helm/benchmark/scenarios/image_generation/parti_prompts_scenario.py +94 -0
- helm/benchmark/scenarios/image_generation/radiology_scenario.py +42 -0
- helm/benchmark/scenarios/image_generation/relational_understanding_scenario.py +52 -0
- helm/benchmark/scenarios/image_generation/time_most_significant_historical_figures_scenario.py +124 -0
- helm/benchmark/scenarios/image_generation/winoground_scenario.py +62 -0
- helm/benchmark/scenarios/imdb_scenario.py +0 -1
- helm/benchmark/scenarios/legalbench_scenario.py +6 -2
- helm/benchmark/scenarios/live_qa_scenario.py +94 -0
- helm/benchmark/scenarios/lm_entry_scenario.py +185 -0
- helm/benchmark/scenarios/math_scenario.py +19 -2
- helm/benchmark/scenarios/medication_qa_scenario.py +60 -0
- helm/benchmark/scenarios/numeracy_scenario.py +1 -1
- helm/benchmark/scenarios/opinions_qa_scenario.py +0 -4
- helm/benchmark/scenarios/scenario.py +4 -0
- helm/benchmark/scenarios/simple_scenarios.py +122 -1
- helm/benchmark/scenarios/test_math_scenario.py +6 -0
- helm/benchmark/scenarios/test_scenario.py +6 -3
- helm/benchmark/scenarios/test_simple_scenarios.py +50 -0
- helm/benchmark/scenarios/thai_exam_scenario.py +135 -0
- helm/benchmark/scenarios/unitxt_scenario.py +56 -0
- helm/benchmark/scenarios/verifiability_judgment_scenario.py +3 -1
- helm/benchmark/scenarios/vicuna_scenario.py +1 -1
- helm/benchmark/scenarios/vision_language/a_okvqa_scenario.py +83 -0
- helm/benchmark/scenarios/vision_language/bingo_scenario.py +103 -0
- helm/benchmark/scenarios/vision_language/crossmodal_3600_scenario.py +134 -0
- helm/benchmark/scenarios/vision_language/flickr30k_scenario.py +74 -0
- helm/benchmark/scenarios/vision_language/gqa_scenario.py +91 -0
- helm/benchmark/scenarios/vision_language/hateful_memes_scenario.py +94 -0
- helm/benchmark/scenarios/vision_language/heim_human_eval_scenario.py +113 -0
- helm/benchmark/scenarios/vision_language/image2structure/__init__.py +0 -0
- helm/benchmark/scenarios/vision_language/image2structure/chart2csv_scenario.py +55 -0
- helm/benchmark/scenarios/vision_language/image2structure/image2structure_scenario.py +214 -0
- helm/benchmark/scenarios/vision_language/image2structure/latex_scenario.py +25 -0
- helm/benchmark/scenarios/vision_language/image2structure/musicsheet_scenario.py +20 -0
- helm/benchmark/scenarios/vision_language/image2structure/utils_latex.py +347 -0
- helm/benchmark/scenarios/vision_language/image2structure/webpage/__init__.py +0 -0
- helm/benchmark/scenarios/vision_language/image2structure/webpage/driver.py +84 -0
- helm/benchmark/scenarios/vision_language/image2structure/webpage/jekyll_server.py +182 -0
- helm/benchmark/scenarios/vision_language/image2structure/webpage/utils.py +31 -0
- helm/benchmark/scenarios/vision_language/image2structure/webpage_scenario.py +225 -0
- helm/benchmark/scenarios/vision_language/math_vista_scenario.py +117 -0
- helm/benchmark/scenarios/vision_language/mementos_scenario.py +124 -0
- helm/benchmark/scenarios/vision_language/mm_safety_bench_scenario.py +103 -0
- helm/benchmark/scenarios/vision_language/mme_scenario.py +145 -0
- helm/benchmark/scenarios/vision_language/mmmu_scenario.py +187 -0
- helm/benchmark/scenarios/vision_language/mscoco_captioning_scenario.py +92 -0
- helm/benchmark/scenarios/vision_language/mscoco_categorization_scenario.py +117 -0
- helm/benchmark/scenarios/vision_language/multipanelvqa_scenario.py +169 -0
- helm/benchmark/scenarios/vision_language/originality_scenario.py +35 -0
- helm/benchmark/scenarios/vision_language/pairs_scenario.py +246 -0
- helm/benchmark/scenarios/vision_language/pope_scenario.py +104 -0
- helm/benchmark/scenarios/vision_language/seed_bench_scenario.py +129 -0
- helm/benchmark/scenarios/vision_language/unicorn_scenario.py +108 -0
- helm/benchmark/scenarios/vision_language/viz_wiz_scenario.py +3 -4
- helm/benchmark/scenarios/vision_language/vqa_scenario.py +5 -3
- helm/benchmark/scenarios/wmt_14_scenario.py +1 -1
- helm/benchmark/server.py +24 -1
- helm/benchmark/slurm_runner.py +70 -49
- helm/benchmark/static/benchmarking.js +1 -1
- helm/benchmark/static/schema_classic.yaml +258 -1066
- helm/benchmark/static/schema_image2structure.yaml +304 -0
- helm/benchmark/static/schema_instruction_following.yaml +210 -0
- helm/benchmark/static/schema_lite.yaml +2 -227
- helm/benchmark/static/schema_mmlu.yaml +1507 -0
- helm/benchmark/static/schema_unitxt.yaml +428 -0
- helm/benchmark/static/schema_vhelm_lite.yaml +164 -0
- helm/benchmark/static/schema_vlm.yaml +823 -0
- helm/benchmark/static_build/assets/01-694cb9b7.png +0 -0
- helm/benchmark/static_build/assets/ai21-0eb91ec3.png +0 -0
- helm/benchmark/static_build/assets/aleph-alpha-7ce10034.png +0 -0
- helm/benchmark/static_build/assets/anthropic-70d8bc39.png +0 -0
- helm/benchmark/static_build/assets/bigscience-7f0400c0.png +0 -0
- helm/benchmark/static_build/assets/cohere-3550c6cb.png +0 -0
- helm/benchmark/static_build/assets/crfm-logo-74391ab8.png +0 -0
- helm/benchmark/static_build/assets/eleutherai-b9451114.png +0 -0
- helm/benchmark/static_build/assets/google-06d997ad.png +0 -0
- helm/benchmark/static_build/assets/heim-logo-3e5e3aa4.png +0 -0
- helm/benchmark/static_build/assets/helm-logo-simple-2ed5400b.png +0 -0
- helm/benchmark/static_build/assets/helmhero-28e90f4d.png +0 -0
- helm/benchmark/static_build/assets/index-737eef9e.js +10 -0
- helm/benchmark/static_build/assets/index-878a1094.css +1 -0
- helm/benchmark/static_build/assets/meta-5580e9f1.png +0 -0
- helm/benchmark/static_build/assets/microsoft-f5ee5016.png +0 -0
- helm/benchmark/static_build/assets/mistral-18e1be23.png +0 -0
- helm/benchmark/static_build/assets/nvidia-86fa75c1.png +0 -0
- helm/benchmark/static_build/assets/openai-3f8653e4.png +0 -0
- helm/benchmark/static_build/assets/react-d4a0b69b.js +85 -0
- helm/benchmark/static_build/assets/recharts-6d337683.js +97 -0
- helm/benchmark/static_build/assets/tii-24de195c.png +0 -0
- helm/benchmark/static_build/assets/together-a665a35b.png +0 -0
- helm/benchmark/static_build/assets/tremor-54a99cc4.js +10 -0
- helm/benchmark/static_build/assets/tsinghua-keg-97d4b395.png +0 -0
- helm/benchmark/static_build/assets/vhelm-framework-cde7618a.png +0 -0
- helm/benchmark/static_build/assets/vhelm-model-6d812526.png +0 -0
- helm/benchmark/static_build/assets/yandex-38e09d70.png +0 -0
- helm/benchmark/static_build/config.js +4 -0
- helm/benchmark/static_build/index.html +20 -0
- helm/benchmark/test_data_preprocessor.py +3 -3
- helm/benchmark/test_run_expander.py +1 -1
- helm/benchmark/window_services/ai21_window_service.py +22 -33
- helm/benchmark/window_services/cohere_window_service.py +1 -63
- helm/benchmark/window_services/default_window_service.py +2 -44
- helm/benchmark/window_services/encoder_decoder_window_service.py +0 -11
- helm/benchmark/window_services/ice_window_service.py +0 -34
- helm/benchmark/window_services/image_generation/__init__.py +0 -0
- helm/benchmark/window_services/image_generation/clip_window_service.py +15 -0
- helm/benchmark/window_services/image_generation/lexica_search_window_service.py +9 -0
- helm/benchmark/window_services/image_generation/openai_dalle_window_service.py +9 -0
- helm/benchmark/window_services/image_generation/test_clip_window_service.py +29 -0
- helm/benchmark/window_services/image_generation/test_openai_dalle_window_service.py +30 -0
- helm/benchmark/window_services/local_window_service.py +21 -4
- helm/benchmark/window_services/test_anthropic_window_service.py +2 -1
- helm/benchmark/window_services/test_bloom_window_service.py +2 -1
- helm/benchmark/window_services/test_cohere_window_service.py +2 -1
- helm/benchmark/window_services/test_flan_t5_window_service.py +2 -1
- helm/benchmark/window_services/test_gpt2_window_service.py +2 -2
- helm/benchmark/window_services/test_gpt4_window_service.py +2 -1
- helm/benchmark/window_services/test_gptj_window_service.py +3 -2
- helm/benchmark/window_services/test_gptneox_window_service.py +3 -2
- helm/benchmark/window_services/test_ice_window_service.py +2 -1
- helm/benchmark/window_services/test_openai_window_service.py +2 -1
- helm/benchmark/window_services/test_opt_window_service.py +3 -2
- helm/benchmark/window_services/test_palmyra_window_service.py +2 -1
- helm/benchmark/window_services/test_t0pp_window_service.py +2 -1
- helm/benchmark/window_services/test_t511b_window_service.py +2 -1
- helm/benchmark/window_services/test_ul2_window_service.py +2 -1
- helm/benchmark/window_services/test_utils.py +3 -2
- helm/benchmark/window_services/test_yalm_window_service.py +2 -1
- helm/benchmark/window_services/window_service.py +42 -0
- helm/benchmark/window_services/window_service_factory.py +4 -1
- helm/benchmark/window_services/yalm_window_service.py +0 -27
- helm/clients/__init__.py +0 -0
- helm/{proxy/clients → clients}/ai21_client.py +3 -9
- helm/clients/aleph_alpha_client.py +112 -0
- helm/{proxy/clients → clients}/anthropic_client.py +233 -18
- helm/{proxy/clients → clients}/auto_client.py +59 -31
- helm/clients/bedrock_client.py +128 -0
- helm/clients/bedrock_utils.py +72 -0
- helm/{proxy/clients → clients}/client.py +65 -7
- helm/clients/clip_score_client.py +49 -0
- helm/clients/clip_scorers/__init__.py +0 -0
- helm/clients/clip_scorers/base_clip_scorer.py +18 -0
- helm/clients/clip_scorers/clip_scorer.py +50 -0
- helm/clients/clip_scorers/multilingual_clip_scorer.py +50 -0
- helm/{proxy/clients → clients}/cohere_client.py +4 -11
- helm/clients/gcs_client.py +82 -0
- helm/{proxy/clients → clients}/google_client.py +5 -5
- helm/clients/google_translate_client.py +35 -0
- helm/{proxy/clients → clients}/http_model_client.py +5 -7
- helm/{proxy/clients → clients}/huggingface_client.py +43 -64
- helm/clients/image_generation/__init__.py +0 -0
- helm/clients/image_generation/adobe_vision_client.py +78 -0
- helm/clients/image_generation/aleph_alpha_image_generation_client.py +98 -0
- helm/clients/image_generation/cogview2/__init__.py +0 -0
- helm/clients/image_generation/cogview2/coglm_strategy.py +96 -0
- helm/clients/image_generation/cogview2/coglm_utils.py +82 -0
- helm/clients/image_generation/cogview2/sr_pipeline/__init__.py +15 -0
- helm/clients/image_generation/cogview2/sr_pipeline/direct_sr.py +96 -0
- helm/clients/image_generation/cogview2/sr_pipeline/dsr_model.py +254 -0
- helm/clients/image_generation/cogview2/sr_pipeline/dsr_sampling.py +190 -0
- helm/clients/image_generation/cogview2/sr_pipeline/iterative_sr.py +141 -0
- helm/clients/image_generation/cogview2/sr_pipeline/itersr_model.py +269 -0
- helm/clients/image_generation/cogview2/sr_pipeline/itersr_sampling.py +120 -0
- helm/clients/image_generation/cogview2/sr_pipeline/sr_group.py +42 -0
- helm/clients/image_generation/cogview2_client.py +191 -0
- helm/clients/image_generation/dalle2_client.py +192 -0
- helm/clients/image_generation/dalle3_client.py +108 -0
- helm/clients/image_generation/dalle_mini/__init__.py +3 -0
- helm/clients/image_generation/dalle_mini/data.py +442 -0
- helm/clients/image_generation/dalle_mini/model/__init__.py +5 -0
- helm/clients/image_generation/dalle_mini/model/configuration.py +175 -0
- helm/clients/image_generation/dalle_mini/model/modeling.py +1834 -0
- helm/clients/image_generation/dalle_mini/model/partitions.py +84 -0
- helm/clients/image_generation/dalle_mini/model/processor.py +63 -0
- helm/clients/image_generation/dalle_mini/model/text.py +251 -0
- helm/clients/image_generation/dalle_mini/model/tokenizer.py +9 -0
- helm/clients/image_generation/dalle_mini/model/utils.py +29 -0
- helm/clients/image_generation/dalle_mini/vqgan_jax/__init__.py +1 -0
- helm/clients/image_generation/dalle_mini/vqgan_jax/configuration_vqgan.py +40 -0
- helm/clients/image_generation/dalle_mini/vqgan_jax/convert_pt_model_to_jax.py +107 -0
- helm/clients/image_generation/dalle_mini/vqgan_jax/modeling_flax_vqgan.py +610 -0
- helm/clients/image_generation/dalle_mini_client.py +190 -0
- helm/clients/image_generation/deep_floyd_client.py +78 -0
- helm/clients/image_generation/huggingface_diffusers_client.py +249 -0
- helm/clients/image_generation/image_generation_client_utils.py +9 -0
- helm/clients/image_generation/lexica_client.py +86 -0
- helm/clients/image_generation/mindalle/__init__.py +0 -0
- helm/clients/image_generation/mindalle/models/__init__.py +216 -0
- helm/clients/image_generation/mindalle/models/stage1/__init__.py +0 -0
- helm/clients/image_generation/mindalle/models/stage1/layers.py +312 -0
- helm/clients/image_generation/mindalle/models/stage1/vqgan.py +103 -0
- helm/clients/image_generation/mindalle/models/stage2/__init__.py +0 -0
- helm/clients/image_generation/mindalle/models/stage2/layers.py +144 -0
- helm/clients/image_generation/mindalle/models/stage2/transformer.py +268 -0
- helm/clients/image_generation/mindalle/models/tokenizer.py +30 -0
- helm/clients/image_generation/mindalle/utils/__init__.py +3 -0
- helm/clients/image_generation/mindalle/utils/config.py +129 -0
- helm/clients/image_generation/mindalle/utils/sampling.py +149 -0
- helm/clients/image_generation/mindalle/utils/utils.py +89 -0
- helm/clients/image_generation/mindalle_client.py +115 -0
- helm/clients/image_generation/nudity_check_client.py +64 -0
- helm/clients/image_generation/together_image_generation_client.py +111 -0
- helm/{proxy/clients → clients}/lit_gpt_client.py +4 -4
- helm/{proxy/clients → clients}/megatron_client.py +5 -5
- helm/clients/mistral_client.py +134 -0
- helm/clients/moderation_api_client.py +109 -0
- helm/clients/open_lm_client.py +43 -0
- helm/clients/openai_client.py +301 -0
- helm/{proxy/clients → clients}/palmyra_client.py +6 -8
- helm/{proxy/clients → clients}/perspective_api_client.py +7 -8
- helm/clients/simple_client.py +64 -0
- helm/{proxy/clients → clients}/test_auto_client.py +13 -15
- helm/clients/test_client.py +100 -0
- helm/{proxy/clients → clients}/test_huggingface_client.py +15 -16
- helm/clients/test_simple_client.py +19 -0
- helm/{proxy/clients → clients}/test_together_client.py +20 -8
- helm/{proxy/clients → clients}/together_client.py +104 -73
- helm/clients/vertexai_client.py +400 -0
- helm/clients/vision_language/__init__.py +0 -0
- helm/clients/vision_language/huggingface_vision2seq_client.py +145 -0
- helm/clients/vision_language/huggingface_vlm_client.py +111 -0
- helm/{proxy/clients → clients}/vision_language/idefics_client.py +54 -49
- helm/clients/vision_language/open_flamingo/__init__.py +2 -0
- helm/clients/vision_language/open_flamingo/src/__init__.py +0 -0
- helm/clients/vision_language/open_flamingo/src/factory.py +147 -0
- helm/clients/vision_language/open_flamingo/src/flamingo.py +337 -0
- helm/clients/vision_language/open_flamingo/src/flamingo_lm.py +155 -0
- helm/clients/vision_language/open_flamingo/src/helpers.py +267 -0
- helm/clients/vision_language/open_flamingo/src/utils.py +47 -0
- helm/clients/vision_language/open_flamingo_client.py +155 -0
- helm/clients/vision_language/qwen_vlm_client.py +171 -0
- helm/clients/vllm_client.py +46 -0
- helm/common/cache.py +16 -4
- helm/common/cache_backend_config.py +47 -0
- helm/common/clip_score_request.py +41 -0
- helm/common/file_caches/__init__.py +0 -0
- helm/common/file_caches/file_cache.py +16 -0
- helm/common/file_caches/local_file_cache.py +61 -0
- helm/common/file_caches/test_local_file_cache.py +25 -0
- helm/common/file_upload_request.py +27 -0
- helm/common/general.py +1 -1
- helm/common/image_generation_parameters.py +25 -0
- helm/common/images_utils.py +33 -3
- helm/common/key_value_store.py +35 -4
- helm/common/media_object.py +13 -0
- helm/common/moderations_api_request.py +71 -0
- helm/common/mongo_key_value_store.py +3 -3
- helm/common/multimodal_request_utils.py +31 -0
- helm/common/nudity_check_request.py +29 -0
- helm/common/request.py +15 -17
- helm/common/test_general.py +6 -0
- helm/common/tokenization_request.py +1 -1
- helm/config/model_deployments.yaml +1159 -538
- helm/config/model_metadata.yaml +868 -41
- helm/config/tokenizer_configs.yaml +149 -43
- helm/proxy/accounts.py +31 -4
- helm/proxy/critique/mechanical_turk_critique_importer.py +3 -0
- helm/proxy/critique/model_critique_client.py +8 -6
- helm/proxy/example_queries.py +29 -17
- helm/proxy/server.py +70 -5
- helm/proxy/services/remote_service.py +31 -0
- helm/proxy/services/server_service.py +96 -16
- helm/proxy/services/service.py +30 -0
- helm/proxy/services/test_remote_service.py +4 -3
- helm/proxy/services/test_service.py +0 -12
- helm/proxy/test_accounts.py +32 -0
- helm/proxy/token_counters/auto_token_counter.py +37 -37
- helm/proxy/token_counters/test_auto_token_counter.py +164 -0
- helm/proxy/token_counters/token_counter.py +3 -5
- helm/tokenizers/__init__.py +0 -0
- helm/{proxy/tokenizers → tokenizers}/ai21_tokenizer.py +3 -3
- helm/{proxy/tokenizers → tokenizers}/anthropic_tokenizer.py +1 -1
- helm/{proxy/tokenizers → tokenizers}/auto_tokenizer.py +6 -9
- helm/{proxy/tokenizers → tokenizers}/cohere_tokenizer.py +1 -1
- helm/{proxy/tokenizers → tokenizers}/http_model_tokenizer.py +3 -3
- helm/{proxy/tokenizers → tokenizers}/huggingface_tokenizer.py +7 -26
- helm/tokenizers/simple_tokenizer.py +33 -0
- helm/{proxy/tokenizers → tokenizers}/test_anthropic_tokenizer.py +1 -1
- helm/{proxy/tokenizers → tokenizers}/test_huggingface_tokenizer.py +3 -0
- helm/tokenizers/test_simple_tokenizer.py +33 -0
- helm/{proxy/tokenizers → tokenizers}/vertexai_tokenizer.py +1 -1
- helm/{proxy/tokenizers → tokenizers}/yalm_tokenizer.py +5 -3
- helm/tokenizers/yalm_tokenizer_data/__init__.py +0 -0
- helm/tokenizers/yalm_tokenizer_data/voc_100b.sp +0 -0
- helm/{proxy/tokenizers → tokenizers}/yalm_tokenizer_data/yalm_tokenizer.py +1 -1
- crfm_helm-0.4.0.dist-info/RECORD +0 -397
- helm/benchmark/run_specs.py +0 -2762
- helm/benchmark/test_model_deployment_definition.py +0 -92
- helm/benchmark/test_model_properties.py +0 -1570
- helm/benchmark/vlm_run_specs.py +0 -97
- helm/benchmark/window_services/flan_t5_window_service.py +0 -29
- helm/benchmark/window_services/gpt2_window_service.py +0 -32
- helm/benchmark/window_services/huggingface_window_service.py +0 -60
- helm/benchmark/window_services/t0pp_window_service.py +0 -35
- helm/benchmark/window_services/t511b_window_service.py +0 -30
- helm/benchmark/window_services/test_mt_nlg_window_service.py +0 -48
- helm/benchmark/window_services/ul2_window_service.py +0 -30
- helm/benchmark/window_services/wider_ai21_window_service.py +0 -24
- helm/common/cache_utils.py +0 -14
- helm/proxy/clients/aleph_alpha_client.py +0 -95
- helm/proxy/clients/goose_ai_client.py +0 -99
- helm/proxy/clients/microsoft_client.py +0 -180
- helm/proxy/clients/openai_client.py +0 -206
- helm/proxy/clients/simple_client.py +0 -60
- helm/proxy/clients/test_client.py +0 -49
- helm/proxy/clients/vertexai_client.py +0 -115
- helm/proxy/token_counters/ai21_token_counter.py +0 -20
- helm/proxy/token_counters/cohere_token_counter.py +0 -13
- helm/proxy/token_counters/free_token_counter.py +0 -12
- helm/proxy/token_counters/gooseai_token_counter.py +0 -24
- helm/proxy/token_counters/openai_token_counter.py +0 -22
- helm/proxy/token_counters/test_ai21_token_counter.py +0 -88
- helm/proxy/token_counters/test_openai_token_counter.py +0 -81
- helm/proxy/tokenizers/simple_tokenizer.py +0 -32
- {crfm_helm-0.4.0.dist-info → crfm_helm-0.5.1.dist-info}/LICENSE +0 -0
- {crfm_helm-0.4.0.dist-info → crfm_helm-0.5.1.dist-info}/entry_points.txt +0 -0
- {crfm_helm-0.4.0.dist-info → crfm_helm-0.5.1.dist-info}/top_level.txt +0 -0
- /helm/{proxy/clients → benchmark/annotation}/__init__.py +0 -0
- /helm/{proxy/clients/vision_language → benchmark/annotation/image2structure}/__init__.py +0 -0
- /helm/{proxy/tokenizers → benchmark/metrics/image_generation}/__init__.py +0 -0
- /helm/{proxy/tokenizers/yalm_tokenizer_data → benchmark/metrics/image_generation/detectors}/__init__.py +0 -0
- /helm/{proxy/clients → clients}/ai21_utils.py +0 -0
- /helm/{proxy/clients → clients}/cohere_utils.py +0 -0
- /helm/{proxy/clients → clients}/lit_gpt_generate.py +0 -0
- /helm/{proxy/clients → clients}/toxicity_classifier_client.py +0 -0
- /helm/{proxy/tokenizers → tokenizers}/aleph_alpha_tokenizer.py +0 -0
- /helm/{proxy/tokenizers → tokenizers}/caching_tokenizer.py +0 -0
- /helm/{proxy/tokenizers → tokenizers}/ice_tokenizer.py +0 -0
- /helm/{proxy/tokenizers → tokenizers}/lit_gpt_tokenizer.py +0 -0
- /helm/{proxy/tokenizers → tokenizers}/test_ice_tokenizer.py +0 -0
- /helm/{proxy/tokenizers → tokenizers}/test_yalm_tokenizer.py +0 -0
- /helm/{proxy/tokenizers → tokenizers}/tiktoken_tokenizer.py +0 -0
- /helm/{proxy/tokenizers → tokenizers}/tokenizer.py +0 -0
- /helm/{proxy/tokenizers → tokenizers}/yalm_tokenizer_data/test_yalm_tokenizer.py +0 -0
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
@@ -0,0 +1,10 @@
|
|
|
1
|
+
import{r as a,a as Rs,L as E,O as Ms,d as ks,u as Pe,f as _e,H as Ls,h as As,i as D,R as Cs}from"./react-d4a0b69b.js";import{g as X,b as K,m as ce,s as Te,a as Ps,d as Me,y as _s,c as ke,e as ue,l as he}from"./tremor-54a99cc4.js";import"./recharts-6d337683.js";(function(){const t=document.createElement("link").relList;if(t&&t.supports&&t.supports("modulepreload"))return;for(const l of document.querySelectorAll('link[rel="modulepreload"]'))n(l);new MutationObserver(l=>{for(const o of l)if(o.type==="childList")for(const i of o.addedNodes)i.tagName==="LINK"&&i.rel==="modulepreload"&&n(i)}).observe(document,{childList:!0,subtree:!0});function r(l){const o={};return l.integrity&&(o.integrity=l.integrity),l.referrerPolicy&&(o.referrerPolicy=l.referrerPolicy),l.crossOrigin==="use-credentials"?o.credentials="include":l.crossOrigin==="anonymous"?o.credentials="omit":o.credentials="same-origin",o}function n(l){if(l.ep)return;l.ep=!0;const o=r(l);fetch(l.href,o)}})();var $e={exports:{}},ne={};/**
|
|
2
|
+
* @license React
|
|
3
|
+
* react-jsx-runtime.production.min.js
|
|
4
|
+
*
|
|
5
|
+
* Copyright (c) Facebook, Inc. and its affiliates.
|
|
6
|
+
*
|
|
7
|
+
* This source code is licensed under the MIT license found in the
|
|
8
|
+
* LICENSE file in the root directory of this source tree.
|
|
9
|
+
*/var Ts=a,$s=Symbol.for("react.element"),Us=Symbol.for("react.fragment"),Is=Object.prototype.hasOwnProperty,Ds=Ts.__SECRET_INTERNALS_DO_NOT_USE_OR_YOU_WILL_BE_FIRED.ReactCurrentOwner,Hs={key:!0,ref:!0,__self:!0,__source:!0};function Ue(s,t,r){var n,l={},o=null,i=null;r!==void 0&&(o=""+r),t.key!==void 0&&(o=""+t.key),t.ref!==void 0&&(i=t.ref);for(n in t)Is.call(t,n)&&!Hs.hasOwnProperty(n)&&(l[n]=t[n]);if(s&&s.defaultProps)for(n in t=s.defaultProps,t)l[n]===void 0&&(l[n]=t[n]);return{$$typeof:$s,type:s,key:o,ref:i,props:l,_owner:Ds.current}}ne.Fragment=Us;ne.jsx=Ue;ne.jsxs=Ue;$e.exports=ne;var e=$e.exports,de={},Le=Rs;de.createRoot=Le.createRoot,de.hydrateRoot=Le.hydrateRoot;function Os({title:s,titleId:t,...r},n){return a.createElement("svg",Object.assign({xmlns:"http://www.w3.org/2000/svg",fill:"none",viewBox:"0 0 24 24",strokeWidth:1.5,stroke:"currentColor","aria-hidden":"true",ref:n,"aria-labelledby":t},r),s?a.createElement("title",{id:t},s):null,a.createElement("path",{strokeLinecap:"round",strokeLinejoin:"round",d:"M3.75 6.75h16.5M3.75 12h16.5m-16.5 5.25h16.5"}))}const Bs=a.forwardRef(Os),Ie=Bs;function Fs({title:s,titleId:t,...r},n){return a.createElement("svg",Object.assign({xmlns:"http://www.w3.org/2000/svg",fill:"none",viewBox:"0 0 24 24",strokeWidth:1.5,stroke:"currentColor","aria-hidden":"true",ref:n,"aria-labelledby":t},r),s?a.createElement("title",{id:t},s):null,a.createElement("path",{strokeLinecap:"round",strokeLinejoin:"round",d:"M9 12.75L11.25 15 15 9.75M21 12a9 9 0 11-18 0 9 9 0 0118 0z"}))}const zs=a.forwardRef(Fs),Gs=zs;function qs({title:s,titleId:t,...r},n){return a.createElement("svg",Object.assign({xmlns:"http://www.w3.org/2000/svg",fill:"none",viewBox:"0 0 24 24",strokeWidth:1.5,stroke:"currentColor","aria-hidden":"true",ref:n,"aria-labelledby":t},r),s?a.createElement("title",{id:t},s):null,a.createElement("path",{strokeLinecap:"round",strokeLinejoin:"round",d:"M9.75 9.75l4.5 4.5m0-4.5l-4.5 4.5M21 12a9 9 0 11-18 0 9 9 0 0118 0z"}))}const Js=a.forwardRef(qs),Ws=Js,De=""+new URL("crfm-logo-74391ab8.png",import.meta.url).href,He=""+new URL("helm-logo-simple-2ed5400b.png",import.meta.url).href;function Zs({title:s,titleId:t,...r},n){return a.createElement("svg",Object.assign({xmlns:"http://www.w3.org/2000/svg",viewBox:"0 0 24 24",fill:"currentColor","aria-hidden":"true",ref:n,"aria-labelledby":t},r),s?a.createElement("title",{id:t},s):null,a.createElement("path",{fillRule:"evenodd",d:"M12 2.25a.75.75 0 01.75.75v11.69l3.22-3.22a.75.75 0 111.06 1.06l-4.5 4.5a.75.75 0 01-1.06 0l-4.5-4.5a.75.75 0 111.06-1.06l3.22 3.22V3a.75.75 0 01.75-.75zm-9 13.5a.75.75 0 01.75.75v2.25a1.5 1.5 0 001.5 1.5h13.5a1.5 1.5 0 001.5-1.5V16.5a.75.75 0 011.5 0v2.25a3 3 0 01-3 3H5.25a3 3 0 01-3-3V16.5a.75.75 0 01.75-.75z",clipRule:"evenodd"}))}const Ks=a.forwardRef(Zs),Oe=Ks;function Vs({title:s,titleId:t,...r},n){return a.createElement("svg",Object.assign({xmlns:"http://www.w3.org/2000/svg",viewBox:"0 0 24 24",fill:"currentColor","aria-hidden":"true",ref:n,"aria-labelledby":t},r),s?a.createElement("title",{id:t},s):null,a.createElement("path",{fillRule:"evenodd",d:"M15.75 2.25H21a.75.75 0 01.75.75v5.25a.75.75 0 01-1.5 0V4.81L8.03 17.03a.75.75 0 01-1.06-1.06L19.19 3.75h-3.44a.75.75 0 010-1.5zm-10.5 4.5a1.5 1.5 0 00-1.5 1.5v10.5a1.5 1.5 0 001.5 1.5h10.5a1.5 1.5 0 001.5-1.5V10.5a.75.75 0 011.5 0v8.25a3 3 0 01-3 3H5.25a3 3 0 01-3-3V8.25a3 3 0 013-3h8.25a.75.75 0 010 1.5H5.25z",clipRule:"evenodd"}))}const Ys=a.forwardRef(Vs),Xs=Ys;function Qs({title:s,titleId:t,...r},n){return a.createElement("svg",Object.assign({xmlns:"http://www.w3.org/2000/svg",viewBox:"0 0 24 24",fill:"currentColor","aria-hidden":"true",ref:n,"aria-labelledby":t},r),s?a.createElement("title",{id:t},s):null,a.createElement("path",{fillRule:"evenodd",d:"M15 3.75a.75.75 0 01.75-.75h4.5a.75.75 0 01.75.75v4.5a.75.75 0 01-1.5 0V5.56l-3.97 3.97a.75.75 0 11-1.06-1.06l3.97-3.97h-2.69a.75.75 0 01-.75-.75zm-12 0A.75.75 0 013.75 3h4.5a.75.75 0 010 1.5H5.56l3.97 3.97a.75.75 0 01-1.06 1.06L4.5 5.56v2.69a.75.75 0 01-1.5 0v-4.5zm11.47 11.78a.75.75 0 111.06-1.06l3.97 3.97v-2.69a.75.75 0 011.5 0v4.5a.75.75 0 01-.75.75h-4.5a.75.75 0 010-1.5h2.69l-3.97-3.97zm-4.94-1.06a.75.75 0 010 1.06L5.56 19.5h2.69a.75.75 0 010 1.5h-4.5a.75.75 0 01-.75-.75v-4.5a.75.75 0 011.5 0v2.69l3.97-3.97a.75.75 0 011.06 0z",clipRule:"evenodd"}))}const et=a.forwardRef(Qs),st=et;function tt({title:s,titleId:t,...r},n){return a.createElement("svg",Object.assign({xmlns:"http://www.w3.org/2000/svg",viewBox:"0 0 24 24",fill:"currentColor","aria-hidden":"true",ref:n,"aria-labelledby":t},r),s?a.createElement("title",{id:t},s):null,a.createElement("path",{fillRule:"evenodd",d:"M12.53 16.28a.75.75 0 01-1.06 0l-7.5-7.5a.75.75 0 011.06-1.06L12 14.69l6.97-6.97a.75.75 0 111.06 1.06l-7.5 7.5z",clipRule:"evenodd"}))}const nt=a.forwardRef(tt),Be=nt;function rt({title:s,titleId:t,...r},n){return a.createElement("svg",Object.assign({xmlns:"http://www.w3.org/2000/svg",viewBox:"0 0 24 24",fill:"currentColor","aria-hidden":"true",ref:n,"aria-labelledby":t},r),s?a.createElement("title",{id:t},s):null,a.createElement("path",{fillRule:"evenodd",d:"M11.47 4.72a.75.75 0 011.06 0l3.75 3.75a.75.75 0 01-1.06 1.06L12 6.31 8.78 9.53a.75.75 0 01-1.06-1.06l3.75-3.75zm-3.75 9.75a.75.75 0 011.06 0L12 17.69l3.22-3.22a.75.75 0 111.06 1.06l-3.75 3.75a.75.75 0 01-1.06 0l-3.75-3.75a.75.75 0 010-1.06z",clipRule:"evenodd"}))}const at=a.forwardRef(rt),Fe=at;function lt({title:s,titleId:t,...r},n){return a.createElement("svg",Object.assign({xmlns:"http://www.w3.org/2000/svg",viewBox:"0 0 24 24",fill:"currentColor","aria-hidden":"true",ref:n,"aria-labelledby":t},r),s?a.createElement("title",{id:t},s):null,a.createElement("path",{fillRule:"evenodd",d:"M10.5 3.75a6.75 6.75 0 100 13.5 6.75 6.75 0 000-13.5zM2.25 10.5a8.25 8.25 0 1114.59 5.28l4.69 4.69a.75.75 0 11-1.06 1.06l-4.69-4.69A8.25 8.25 0 012.25 10.5z",clipRule:"evenodd"}))}const it=a.forwardRef(lt),ot=it;function xe(s,t){return t?s?`https://crfm.stanford.edu/helm/${t}/${s}/`:`https://crfm.stanford.edu/helm/${t}/latest/`:"#"}function ze(){const[s,t]=a.useState([]),[r,n]=a.useState();return a.useEffect(()=>{fetch("https://raw.githubusercontent.com/stanford-crfm/helm/main/helm-frontend/project_metadata.json").then(l=>l.json()).then(l=>{if(t(l),window.PROJECT_ID){const o=l.find(i=>i.id===window.PROJECT_ID);n(o)}else{const o=l.find(i=>i.id==="lite");n(o)}}).catch(l=>{console.error("Error fetching JSON:",l)})},[]),r===void 0||r.title===void 0?null:e.jsxs("div",{className:"dropdown",children:[e.jsxs("div",{tabIndex:0,role:"button",className:"btn normal-case bg-white font-bold p-2 border-0 text-lg block whitespace-nowrap","aria-haspopup":"true","aria-controls":"menu",children:[r.title," ",e.jsx(Be,{fill:"black",color:"black",className:"text w-4 h-4 inline"})]}),e.jsx("ul",{tabIndex:0,className:"-translate-x-36 dropdown-content z-[1] menu p-1 shadow-lg bg-base-100 rounded-box w-max text-base",role:"menu",children:s.map((l,o)=>e.jsx("li",{children:e.jsxs("a",{href:xe(void 0,l.id),className:"block",role:"menuitem",children:[e.jsx("strong",{className:r.title===l.title?"underline":"",children:l.title}),": ",l.description]})},o))})]})}function P(s){return`${window.BENCHMARK_OUTPUT_BASE_URL.replace(/\/$/,"")}/${s.replace(/^\//,"")}`}function W(){return window.RELEASE?`/releases/${window.RELEASE}`:`/runs/${window.SUITE}`}async function ct(s){try{return await(await fetch(P(`${W()}/summary.json`),{signal:s})).json()}catch(t){return console.log(t),{release:void 0,suites:void 0,suite:void 0,date:""}}}function dt(){const[s,t]=a.useState({release:void 0,suites:void 0,suite:void 0,date:""}),[r,n]=a.useState();a.useEffect(()=>{fetch("https://raw.githubusercontent.com/stanford-crfm/helm/main/helm-frontend/project_metadata.json").then(m=>m.json()).then(m=>{if(window.PROJECT_ID){const x=m.find(N=>N.id===window.PROJECT_ID);n(x)}else{const x=m.find(N=>N.id==="lite");n(x)}}).catch(m=>{console.error("Error fetching JSON:",m)})},[]);function l(){return r!==void 0&&r.releases!==void 0?r.releases:["v1.0.0"]}a.useEffect(()=>{const m=new AbortController;async function x(){const N=await ct(m.signal);t(N)}return x(),()=>m.abort()},[]);const o=l();if(!s.release&&!s.suite)return null;const i=`Release ${s.release||s.suite} (${s.date})`;return o.length<=1?e.jsx("div",{children:i}):e.jsxs("div",{className:"dropdown",children:[e.jsxs("div",{tabIndex:0,role:"button",className:"normal-case bg-white border-0 block whitespace-nowrap","aria-haspopup":"true","aria-controls":"menu",children:[i," ",e.jsx(Be,{fill:"black",color:"black",className:"inline text w-4 h-4"})]}),e.jsx("ul",{tabIndex:0,className:"dropdown-content z-[1] menu p-1 shadow-lg bg-base-100 rounded-box w-max text-base",role:"menu",children:o.map(m=>e.jsx("li",{children:e.jsx("a",{href:xe(m,r?r.id:"lite"),className:"block",role:"menuitem",children:m})}))})]})}function mt(){return e.jsxs("nav",{className:"navbar h-24 px-8 md:px-12 bg-base-100 max-w[1500]px",children:[e.jsx("div",{children:e.jsxs("div",{className:"dropdown md:hidden mr-4",children:[e.jsx("label",{tabIndex:0,className:"btn btn-ghost hover:bg-transparent btn-lg px-0",children:e.jsx(Ie,{className:"w-16 h-16"})}),e.jsxs("ul",{tabIndex:0,className:"menu menu-lg dropdown-content mt-3 z-[1] p-2 bg-base-100 shadow",children:[e.jsx("li",{children:e.jsx(E,{to:"leaderboard",children:"Leaderboard"})}),e.jsx("li",{children:e.jsx(E,{to:"models",children:"Models"})}),e.jsx("li",{children:e.jsx(E,{to:"scenarios",children:"Scenarios"})}),e.jsx("li",{children:e.jsx(E,{to:"runs",className:"whitespace-nowrap",children:"Predictions"})}),e.jsx("li",{children:e.jsx(E,{to:"https://github.com/stanford-crfm/helm",children:"GitHub"})})]})]})}),e.jsxs("div",{className:"flex-1 items-center",children:[e.jsx(E,{to:"https://crfm.stanford.edu/",className:"w-24",children:e.jsx("img",{src:De,className:"object-contain"})}),e.jsx(E,{to:"/",className:"mx-2 w-32",children:e.jsx("img",{src:He,className:"object-contain"})}),e.jsx(ze,{})]}),e.jsx("div",{className:"flex-none hidden md:block",children:e.jsxs("ul",{className:"flex flex-row gap-6 px-1",children:[e.jsx("li",{children:e.jsx(E,{to:"leaderboard",children:"Leaderboard"})}),e.jsx("li",{children:e.jsx(E,{to:"models",children:"Models"})}),e.jsx("li",{children:e.jsx(E,{to:"scenarios",children:"Scenarios"})}),e.jsx("li",{children:e.jsx(E,{to:"runs",children:"Predictions"})}),e.jsx("li",{children:e.jsx(E,{to:"https://github.com/stanford-crfm/helm",children:"GitHub"})}),e.jsx("li",{className:"hidden lg:flex",children:e.jsx(dt,{})})]})})]})}function ut(){return e.jsxs("nav",{className:"navbar h-24 px-8 md:px-12 bg-base-100 max-w[1500]px",children:[e.jsx("div",{children:e.jsx("div",{className:"dropdown md:hidden mr-4",children:e.jsx("label",{tabIndex:0,className:"btn btn-ghost hover:bg-transparent btn-lg px-0",children:e.jsx(Ie,{className:"w-16 h-16"})})})}),e.jsxs("div",{className:"flex-1 items-center",children:[e.jsx(E,{to:"https://crfm.stanford.edu/",className:"w-24",children:e.jsx("img",{src:De,className:"object-contain"})}),e.jsx(E,{to:"/",className:"mx-2 w-32",children:e.jsx("img",{src:He,className:"object-contain"})}),e.jsx(ze,{})]})]})}function ht(){return e.jsxs(e.Fragment,{children:[window.PROJECT_ID==="global"?e.jsx(ut,{}):e.jsx(mt,{}),e.jsx("main",{className:"p-8 pt-0",children:e.jsx("div",{className:"mx-auto max-w-[1500]px",children:e.jsx(Ms,{})})})]})}async function H(s){try{return await(await fetch(P(`${W()}/schema.json`),{signal:s})).json()}catch(t){return console.log(t),{adapter:[],metric_groups:[],metrics:[],models:[],perturbations:[],run_groups:[]}}}function xt({href:s,children:t}){return e.jsx("a",{href:s,className:"link link-primary link-hover",target:"_blank",rel:"noreferrer",children:t})}function q({value:s}){return e.jsx("span",{children:e.jsx(ks,{components:{a:xt},children:s})})}function I({title:s,subtitle:t,markdown:r=!1}){return e.jsxs("header",{className:"m-4 ml-0",children:[e.jsx("h1",{className:"text-4xl",children:s}),r&&t!==void 0?e.jsx("h2",{className:"mt-2 text-neutral",children:e.jsx(q,{value:t})}):t!==void 0&&e.jsx("h2",{className:"mt-2 text-neutral",children:t})]})}const ft={open:"green",limited:"yellow",closed:"red"},pt={open:"Open",limited:"Limited",closed:"Closed"};function jt({level:s}){return e.jsx(X,{color:ft[s],children:pt[s]})}function B(){return e.jsx("div",{className:"w-full",children:e.jsx("div",{className:"block mx-auto my-24 loading loading-spinner loading-lg"})})}function gt(){const[s,t]=a.useState([]);a.useEffect(()=>{const i=new AbortController;async function m(){const x=await H(i.signal);t(x.models)}return m(),()=>i.abort()},[]);const[r,n,l]=s.reduce((i,m)=>{switch(m.access){case"open":i[0]+=1;break;case"limited":i[1]+=1;break;case"closed":i[2]+=1;break}return i},[0,0,0]),o=Object.values(s.reduce((i,m)=>{const x=m.creator_organization;return i[x]===void 0?(i[x]={name:x,models:1},i):(i[x].models+=1,i)},{}));return s.length===0?e.jsx(B,{}):e.jsxs(e.Fragment,{children:[e.jsx(I,{title:"Models"}),e.jsxs("div",{className:"overflow-x-auto mt-12",children:[e.jsxs("table",{className:"table",children:[e.jsx("thead",{children:e.jsxs("tr",{children:[e.jsx("th",{children:"Creator"}),e.jsx("th",{children:"Model"}),e.jsx("th",{children:"Description"}),e.jsx("th",{children:"Access"})]})}),e.jsx("tbody",{children:s.map(i=>e.jsxs("tr",{children:[e.jsx("td",{className:"text-lg",children:i.creator_organization}),e.jsxs("td",{children:[e.jsx("span",{className:"text-xl",children:i.display_name}),e.jsx("br",{}),e.jsx("span",{children:i.name})]}),e.jsx("td",{children:e.jsx(q,{value:i.description})}),e.jsx("td",{children:e.jsx(jt,{level:i.access})})]}))})]}),e.jsx(I,{title:"Analysis"}),e.jsxs("div",{className:"grid md:grid-cols-3 grid-cols-1 gap-8",children:[e.jsxs(K,{className:"flex flex-col justify-between",children:[e.jsx(ce,{children:"Models"}),e.jsx(Te,{className:"text-6xl md:!text-[96px]",children:s.length}),e.jsx(Ps,{values:[r,n,l],colors:["green","yellow","red"]}),e.jsx(Me,{categories:["Open","Limited","Closed"],colors:["green","yellow","red"]})]}),e.jsxs(K,{className:"md:col-span-2",children:[e.jsx(ce,{children:"Creator Organizations"}),e.jsxs("div",{className:"flex justify-between mt-4",children:[e.jsx(_s,{data:o,category:"models",index:"name",variant:"pie",className:"basis-5/12"}),e.jsx(Me,{categories:o.map(i=>i.name),className:"basis-7/12"})]})]})]})]})]})}function te({to:s,children:t,inTable:r=!1,title:n=""}){return r?e.jsx(E,{className:"link link-hover",to:s,title:n,children:t}):e.jsx(E,{className:"link link-primary link-hover",to:s,children:t})}function vt(){const[s,t]=a.useState([]);a.useEffect(()=>{const n=new AbortController;async function l(){const o=await H(n.signal);t(o.run_groups.filter(i=>!i.todo&&i.taxonomy&&!i.display_name.includes("CLEVA")))}return l(),()=>n.abort()},[]);const r=Object.values(s.reduce((n,l)=>{var i;const o=((i=l.taxonomy)==null?void 0:i.task)||"Unknown";return n[o]===void 0?(n[o]={name:o,value:1},n):(n[o].value+=1,n)},{}));return s.length===0?e.jsx(B,{}):(console.log(s),e.jsxs(e.Fragment,{children:[e.jsx(I,{title:"Scenarios",subtitle:"A scenario represents a use case and consists of a dataset of instances."}),e.jsxs("div",{className:"overflow-x-auto mt-12",children:[e.jsxs("table",{className:"table",children:[e.jsx("thead",{children:e.jsxs("tr",{children:[e.jsx("th",{children:"Scenario"}),e.jsx("th",{children:"Task"}),e.jsx("th",{children:"What"}),e.jsx("th",{children:"Who"}),e.jsx("th",{children:"When"}),e.jsx("th",{children:"Language"}),e.jsx("th",{children:"Description"})]})}),e.jsx("tbody",{children:s.map(n=>{var l,o,i,m,x;return e.jsxs("tr",{children:[e.jsxs("td",{children:[e.jsx(te,{to:`/groups/${n.name}`,children:e.jsx("span",{className:"text-lg",children:n.display_name})}),e.jsx("span",{className:"block",children:n.name})]}),e.jsx("td",{children:((l=n.taxonomy)==null?void 0:l.task)||""}),e.jsx("td",{children:((o=n.taxonomy)==null?void 0:o.what)||""}),e.jsx("td",{children:((i=n.taxonomy)==null?void 0:i.who)||""}),e.jsx("td",{children:((m=n.taxonomy)==null?void 0:m.when)||""}),e.jsx("td",{children:((x=n.taxonomy)==null?void 0:x.language)||""}),e.jsx("td",{children:e.jsx(q,{value:n.description})})]})})})]}),e.jsx(I,{title:"Analysis"}),e.jsxs("div",{className:"grid md:grid-cols-4 gap-8",children:[e.jsxs(K,{className:"flex flex-col",children:[e.jsx(ce,{children:"Total scenarios"}),e.jsx(Te,{className:"mx-auto my-6 md:mt-16 !text-[72px] md:!text-[96px]",children:s.length})]}),e.jsx(K,{className:"col-span-3",children:e.jsxs("div",{className:"grid md:grid-cols-2 gap-x-12",children:[e.jsx(ke,{data:r.slice(0,Math.floor(r.length/2))}),e.jsx(ke,{data:r.slice(Math.ceil(r.length/2))})]})})]})]})]}))}function Ge(){return P(`${W()}/groups.json`)}async function fe(s){try{return await(await fetch(Ge(),{signal:s})).json()}catch(t){return console.log(t),[]}}function re({children:s}){return e.jsx("div",{role:"navigation",className:"tabs flex-nowrap border-b-2 border-gray-2 overflow-x-auto overflow-y-hidden",children:s})}function V({active:s=!1,onClick:t=()=>{},size:r="md",children:n}){return e.jsx("div",{onClick:t,className:`whitespace-nowrap text-${r} mb-[-2px] text-md tab tab-bordered${s?" border-2 border-grey-500 rounded":" border-none"}`,children:n})}function bt({title:s,titleId:t,...r},n){return a.createElement("svg",Object.assign({xmlns:"http://www.w3.org/2000/svg",viewBox:"0 0 20 20",fill:"currentColor","aria-hidden":"true",ref:n,"aria-labelledby":t},r),s?a.createElement("title",{id:t},s):null,a.createElement("path",{fillRule:"evenodd",d:"M4.25 5.5a.75.75 0 00-.75.75v8.5c0 .414.336.75.75.75h8.5a.75.75 0 00.75-.75v-4a.75.75 0 011.5 0v4A2.25 2.25 0 0112.75 17h-8.5A2.25 2.25 0 012 14.75v-8.5A2.25 2.25 0 014.25 4h5a.75.75 0 010 1.5h-5z",clipRule:"evenodd"}),a.createElement("path",{fillRule:"evenodd",d:"M6.194 12.753a.75.75 0 001.06.053L16.5 4.44v2.81a.75.75 0 001.5 0v-4.5a.75.75 0 00-.75-.75h-4.5a.75.75 0 000 1.5h2.553l-9.056 8.194a.75.75 0 00-.053 1.06z",clipRule:"evenodd"}))}const wt=a.forwardRef(bt),Ae=wt;function G(s){return Number.isNaN(Number(s))?String(s):String(Math.round(Number(s)*1e3)/1e3)}function Y({value:s,title:t,hideIcon:r}){if(typeof s.value=="string"&&s.value.includes("⚠")&&(s.value=s.value.replace("⚠","")),s.value===void 0)return"-";if(s.run_spec_names){const n=(()=>{if(s.run_spec_names.length==1)return"/runs/"+s.run_spec_names[0];if(s.run_spec_names.length>1){const l="/runs/?q="+s.run_spec_names.map(i=>`^${i}$`).join("|");return encodeURI(l)}})();return n?e.jsx(te,{to:n,inTable:!0,title:t,children:e.jsxs("div",{className:"flex items-center",children:[G(s.value),!r&&e.jsx(Ae,{className:"w-3 h-3 ml-1 opacity-30"})]})}):t?e.jsx("a",{title:t,children:G(s.value)}):e.jsx(e.Fragment,{children:G(s.value)})}return s.href?e.jsx(te,{to:s.href,inTable:!0,title:t,children:e.jsxs("div",{className:"flex items-center",children:[G(s.value),!r&&e.jsx(Ae,{className:"w-3 h-3 ml-1 opacity-30"})]})}):s.markdown?e.jsx(q,{value:String(s.value)}):t?e.jsx("a",{title:t,children:G(s.value)}):e.jsx(e.Fragment,{children:G(s.value)})}function qe({groupsTables:s,activeGroup:t,ignoreHref:r=!1,sortable:n=!0,sortFirstMetric:l=!0}){const[o,i]=a.useState(l?1:void 0),[m,x]=a.useState({...s[t]}),[N,S]=a.useState(1);a.useEffect(()=>{x({...s[t]})},[t,s]);const k=b=>{let w=N;o===b?w=w*-1:w=1,i(b),S(w),x(f=>{const v={...f};return v.rows.sort((R,u)=>{var p,c;const y=(p=R[b])==null?void 0:p.value,d=(c=u[b])==null?void 0:c.value;return y!==void 0&&d===void 0?-1:d!==void 0&&y===void 0?1:typeof y=="number"&&typeof d=="number"?(y-d)*w:typeof y=="string"&&typeof d=="string"?w===1?y.localeCompare(d):d.localeCompare(y):0}),v})};return a.useEffect(()=>{l&&o&&k(o)},[l,o]),e.jsx("div",{children:e.jsxs("table",{className:"rounded-lg shadow-md table",children:[e.jsx("thead",{children:e.jsx("tr",{children:m.header.map((b,w)=>e.jsx("th",{className:`${w===o?"bg-gray-100 ":"bg-white"} ${w===0?"left-0 z-10":""} whitespace-nowrap sticky top-0`,children:e.jsxs("div",{className:"flex gap-2 items-center",children:[e.jsx("span",{children:b.value}),n?e.jsx("button",{className:"link",onClick:()=>k(w),children:e.jsx(Fe,{className:"w-6 h-6"})}):null]})},`${t}-${w}`))})}),e.jsx("tbody",{children:m.rows.map((b,w)=>e.jsx("tr",{children:b.map((f,v)=>e.jsx("td",{className:`${v==0?"text-lg sticky left-0":""} ${o===v?"bg-gray-100":"bg-white"}`,children:e.jsx("div",{className:f&&f.style&&f.style["font-weight"]&&f.style["font-weight"]==="bold"?"font-bold":"",children:e.jsx(Y,{ignoreHref:r&&v===0,value:f})})},`${t}-${v}`))},`${t}-${w}`))})]})})}function yt(){const[s,t]=a.useState(0),[r,n]=a.useState([]),[l,o]=a.useState([]);return a.useEffect(()=>{const i=new AbortController;async function m(){const x=await fe(i.signal);o(x),n(x.map(N=>N.title))}return m(),()=>i.abort()},[]),l.length===0?e.jsx(B,{}):e.jsxs(e.Fragment,{children:[e.jsxs("div",{className:"flex justify-between",children:[e.jsx(I,{title:"Results",subtitle:"Groupings of the processes, methods, and metrics involved in evaluating models, particularly in the context of natural language understanding and question answering.",className:"mb-16"}),e.jsxs("a",{className:"flex link-primary space-between items-center self-end link link-hover block",href:Ge(),download:"true",target:"_blank",children:[e.jsx(Oe,{className:"w-6 h-6 mr-2"})," JSON"]})]}),e.jsx("div",{children:e.jsx(re,{children:r.map((i,m)=>e.jsx(V,{onClick:()=>t(m),active:s===m,size:"lg",children:i},m))})}),e.jsx("div",{className:"mt-8",children:e.jsx(qe,{sortable:!1,groupsTables:l,activeGroup:s})})]})}async function pe(s,t){try{return await(await fetch(P(`${W()}/groups/${s}.json`),{signal:t})).json()}catch(r){return console.log(r),[]}}async function je(s){try{return await(await fetch(P(`${W()}/groups_metadata.json`),{signal:s})).json()}catch(t){return console.log(t),{}}}function Nt(){const{groupName:s}=Pe(),[t,r]=a.useState([]),[n,l]=a.useState(),[o,i]=a.useState(!0),[m,x]=a.useState(0);return a.useEffect(()=>{const N=new AbortController;async function S(){if(s===void 0)return;const[k,b]=await Promise.all([pe(s,N.signal),je(N.signal)]);r(k),l(b[s]),i(!1)}return S(),()=>N.abort()},[s]),o||n===void 0?e.jsx(B,{}):t.length===0?e.jsxs(e.Fragment,{children:[e.jsx(I,{title:n.display_name,subtitle:n.description,markdown:!0,className:"mr-8"}),e.jsx("div",{className:"divider"}),e.jsx("p",{className:"text-center mt-8",children:"Group currently has no results."})]}):e.jsxs(e.Fragment,{children:[e.jsx("div",{className:"flex flex-row justify-between",children:e.jsx(I,{title:n.display_name,subtitle:n.description,markdown:!0,className:"mr-8 mb-16"})}),e.jsx("div",{className:"overflow-x-auto",children:t.length>1?e.jsx(re,{children:t.map((N,S)=>e.jsx(V,{active:S===m,onClick:()=>x(S),children:N.title},S))}):null}),e.jsx(qe,{groupsTables:t,activeGroup:m,ignoreHref:!0})]})}async function Je(s){try{return await(await fetch(P(`${W()}/run_specs.json`),{signal:s})).json()}catch(t){return console.log(t),[]}}function me({currentPage:s,totalPages:t,onNextPage:r,onPrevPage:n,className:l}){let o="join";return l!==void 0&&(o=`join ${l}`),e.jsxs("div",{className:o,children:[e.jsx("button",{onClick:n,className:"join-item btn",children:"«"}),e.jsxs("button",{className:"join-item btn",children:["Page ",s," of ",t]}),e.jsx("button",{onClick:r,className:"join-item btn",children:"»"})]})}const le=100;function St(){const[s,t]=_e(),[r,n]=a.useState([]),[l,o]=a.useState(Number(s.get("page")||1)),[i,m]=a.useState(1),[x,N]=a.useState([]),[S,k]=a.useState(!0),[b,w]=a.useState(s.get("q")||"");a.useEffect(()=>{const u=new AbortController;async function y(){const d=await Je(u.signal);n(d),f(b,d)}return y(),()=>u.abort()},[b]),a.useEffect(()=>{f(b,r)},[r,b]);function f(u,y){const d=S?new RegExp(u):null,p=y.filter(c=>d?d.test(c.name):c.name.includes(u));N(p),m(Math.ceil(p.length/le))}const v=u=>{u.preventDefault();const d=u.target.q.value;w(d),t({q:d,page:"1"}),f(d,r)},R=x.slice((l-1)*le,l*le);return r.length===0?e.jsx(B,{}):e.jsxs(e.Fragment,{children:[e.jsx(I,{title:"Predictions",subtitle:"All benchmark predictions"}),e.jsxs("form",{className:"flex mb-8",onSubmit:v,children:[e.jsxs("div",{className:"form-control",children:[e.jsx("input",{type:"text",name:"q",placeholder:"Search",className:"input input-bordered",value:b,onChange:u=>w(u.target.value)}),e.jsxs("label",{className:"label",children:[e.jsxs("span",{className:"label-text-alt flex item-center",children:[e.jsx("input",{type:"checkbox",className:"toggle toggle-xs",checked:S,onChange:()=>k(!S)}),e.jsx("span",{className:"ml-2",children:"Regex"})]}),e.jsx("span",{className:"label-text-alt",children:`${x.length} results`})]})]}),e.jsx("div",{className:"form-control ml-4",children:e.jsx("button",{className:"btn",children:e.jsx(ot,{className:"w-6 h-6"})})})]}),e.jsx("div",{className:"overflow-x-auto",children:e.jsxs("table",{className:"table",children:[e.jsx("thead",{children:e.jsxs("tr",{children:[e.jsx("th",{children:"Run"}),e.jsx("th",{children:"Model"}),e.jsx("th",{children:"Groups"}),e.jsx("th",{children:"Adapter method"}),e.jsx("th",{children:"Subject / Task"})]})}),e.jsx("tbody",{children:R.map((u,y)=>e.jsxs("tr",{children:[e.jsx("td",{children:e.jsx(te,{to:`/runs/${u.name}`,children:u.name})}),e.jsx("td",{children:u.adapter_spec.model}),e.jsx("td",{children:u.groups.join(", ")}),e.jsx("td",{children:u.adapter_spec.method}),e.jsx("td",{children:u.scenario_spec.args.subject||u.scenario_spec.args.task||"-"})]},`${u.name}-${y}`))})]})}),i>0?e.jsx(me,{className:"flex justify-center my-8",onNextPage:()=>{const u=Math.min(l+1,i);o(u),s.set("page",String(u)),t(s)},onPrevPage:()=>{const u=Math.max(l-1,1);o(u),s.set("page",String(u)),t(s)},currentPage:l,totalPages:i}):e.jsx("div",{className:"my-8 text-center",children:"No results"})]})}function F(){return window.SUITE!==void 0?window.SUITE:void 0}async function Et(s,t,r){try{return await(await fetch(P(`/runs/${r||F()}/${s}/instances.json`),{signal:t})).json()}catch(n){return console.log(n),[]}}async function Rt(s,t,r){try{return await(await fetch(P(`/runs/${r||F()}/${s}/stats.json`),{signal:t})).json()}catch(n){return console.log(n),[]}}async function Mt(s,t,r){try{return await(await fetch(P(`/runs/${r||F()}/${s}/display_requests.json`),{signal:t})).json()}catch(n){return console.log(n),[]}}async function kt(s,t,r){try{return await(await fetch(P(`/runs/${r||F()}/${s}/display_predictions.json`),{signal:t})).json()}catch(n){return console.log(n),[]}}async function Lt(s,t,r){try{return await(await fetch(P(`/runs/${r||F()}/${s}/scenario.json`),{signal:t})).json()}catch(n){console.log(n);return}}function At(s,t){return P(`/runs/${t||F()}/${s}/run_spec.json`)}function Ct(s,t){return P(`/runs/${t||F()}/${s}/scenario_state.json`)}function ie({value:s}){return e.jsx(X,{icon:Gs,color:"green",children:s})}function oe({value:s}){return e.jsx(X,{icon:Ws,color:"red",children:s})}function Pt({stats:s}){if(s.quasi_exact_match!==void 0)return s.quasi_exact_match?e.jsx(ie,{value:`quasi exact match: ${s.quasi_exact_match}`}):e.jsx(oe,{value:`quasi exact match: ${s.quasi_exact_match}`});if(s.toxic_frac!==void 0)return s.toxic_frac>0?e.jsx(oe,{value:`toxic frac: ${s.toxic_frac}`}):e.jsx(ie,{value:`toxic frac: ${s.toxic_frac}`});if(s.exact_match!==void 0)return s.exact_match>0?e.jsx(ie,{value:`exact match: ${s.exact_match}`}):e.jsx(oe,{value:`exact match: ${s.exact_match}`})}function J({value:s}){const[t,r]=a.useState(!1),[n,l]=a.useState(!1);return e.jsxs(e.Fragment,{children:[e.jsxs("div",{onMouseOver:()=>r(!0),onMouseOut:()=>r(!1),className:"relative",children:[e.jsx("div",{className:"bg-base-200 p-2 block overflow-auto w-full max-h-[36rem] mb-2 whitespace-pre-wrap",children:s}),t?e.jsx("button",{className:"bg-white absolute p-2 leading-none height-fit min-h-none right-1 bottom-1 shadow",onClick:()=>l(!0),children:e.jsx(st,{fill:"black",color:"black",className:"text w-4 h-4"})}):null]}),e.jsx("dialog",{open:n,className:"modal p-16 bg-opacity-80 bg-white",onClick:()=>l(!1),children:e.jsx("div",{className:"modal-box max-w-none p-4 whitespace-pre-wrap bg-base-200",children:s})})]})}function We({mediaObject:s}){if(s.content_type.includes("image")){if(s.location===void 0)return null;const t=P(s.location.replace("benchmark_output/","").replace("prod_env/","../"));return e.jsxs("div",{children:[e.jsx("img",{src:t}),e.jsx("br",{})]})}else return s.text&&s.content_type&&s.content_type==="text/plain"&&s.text.length>1?e.jsxs("div",{children:[s.text,e.jsx("br",{}),e.jsx("br",{})]}):e.jsx("div",{})}function Ze({multimediaObject:s}){return e.jsx("div",{children:s.media_objects.map(t=>e.jsx(We,{mediaObject:t}))})}function _t(s){return Array.isArray(s)?s.length==0?"[]":`[${s.map(t=>String(t).replace(/\n/,"\\n")).join(", ")}]`:String(s)}function Tt({request:s}){return e.jsxs("div",{children:[s.request.prompt.length>0?e.jsxs("div",{children:[e.jsxs("h3",{className:"block text text-gray-400",children:["Prompt (",s.request.prompt.length," Chars)"]}),e.jsx(J,{value:s.request.prompt})]}):s.request.multimodal_prompt?e.jsxs("div",{children:[e.jsx("h3",{className:"block text text-gray-400",children:"Prompt"}),e.jsx(Ze,{multimediaObject:s.request.multimodal_prompt})]}):e.jsx("h3",{className:"block text text-gray-400",children:"Empty Prompt"}),e.jsx(ue,{children:Object.keys(s.request).filter(t=>t!=="prompt").map((t,r)=>e.jsxs(he,{children:[e.jsxs("span",{children:[t,":"]}),s.request&&s.request[t]?e.jsx("span",{children:_t(s.request[t])}):"null"]},r+1))})]})}function $t({predictionAnnotations:s}){return e.jsx("div",{children:s&&s!==void 0?Object.entries(s).map(([t,r])=>e.jsxs("div",{children:[e.jsx("h3",{children:e.jsx("strong",{children:t})}),r.map((n,l)=>e.jsxs("div",{children:[n.error&&e.jsxs("div",{children:[e.jsx("h3",{className:"ml-1",children:"Error"}),e.jsx(J,{value:n.error})," "]}),n.text&&e.jsxs("div",{children:[e.jsx("h3",{className:"ml-1",children:"Text"}),e.jsx(J,{value:n.text})," "]}),n.media_object&&e.jsx(We,{mediaObject:n.media_object})]},l))]},t)):null})}function Ut({predictions:s,requests:t,metricFieldMap:r}){return s.length<1?null:e.jsx("div",{children:e.jsx("div",{className:"flex flex-wrap justify-start items-start",children:s.map((n,l)=>e.jsxs("div",{className:"w-full",children:[s.length>1?e.jsxs("h2",{children:["Trial ",l]}):null,e.jsx("div",{className:"mt-2 w-full",children:n.base64_images&&n.base64_images.length>0?e.jsxs(e.Fragment,{children:[e.jsx("h3",{className:"mr-4",children:"Prediction image"}),n.base64_images.map(o=>e.jsx("img",{src:"data:image;base64,"+o,alt:"Base64 Image"}))]}):e.jsxs(e.Fragment,{children:[e.jsxs("h3",{children:[e.jsx("span",{className:"mr-4",children:"Prediction raw text"}),e.jsx(Pt,{stats:n.stats})]}),e.jsx(J,{value:n.predicted_text}),n.mapped_output?e.jsxs(e.Fragment,{children:[e.jsx("h3",{children:"Prediction mapped output"}),e.jsx(J,{value:String(n.mapped_output)})]}):null]})}),e.jsx($t,{predictionAnnotations:n.annotations}),e.jsx("h3",{children:"Metrics"}),e.jsx(ue,{children:Object.keys(n.stats).map((o,i)=>e.jsxs(he,{children:[r[o]?e.jsx("span",{title:r[o].description,children:r[o].display_name}):e.jsx("span",{children:o}),e.jsx("span",{children:String(n.stats[o])})]},i))}),e.jsxs("details",{className:"collapse collapse-arrow border rounded-md bg-white",children:[e.jsx("summary",{className:"collapse-title",children:"Request details"}),e.jsx("div",{className:"collapse-content",children:e.jsx(Tt,{request:t[l]})})]})]},l))})})}const It="correct";function Dt({references:s}){return e.jsxs("span",{children:[e.jsx("h3",{children:"References"}),e.jsx("ul",{children:s.map((t,r)=>e.jsxs("li",{className:"bg-base-200 p-2 block overflow-auto w-full max-h-72 mb-2 whitespace-pre-wrap",children:[t.output.text,t.tags.map(n=>e.jsx(X,{className:"mx-2",color:n===It?"green":void 0,children:n}))]},r))})]})}function Ht({instance:s,requests:t,predictions:r,metricFieldMap:n}){return e.jsxs("div",{className:"border p-4",children:[e.jsx("h3",{className:"text-xl mb-4",children:`Instance id: ${s.id} [split: ${s.split}]`}),e.jsx("h3",{children:"Input"}),s.input.multimedia_content!==void 0?e.jsx(Ze,{multimediaObject:s.input.multimedia_content}):s.input.text.includes('<br><img src="data:image;base64')?e.jsx("div",{dangerouslySetInnerHTML:{__html:s.input.text}}):e.jsx(J,{value:s.input.text}),e.jsx("div",{children:s.references&&s.references.length>0?e.jsx(Dt,{references:s.references}):null}),e.jsx("div",{children:r&&t?e.jsx(Ut,{predictions:r,requests:t,metricFieldMap:n}):null})]})}function Ot({stat:s,metricFieldMap:t}){const r=`${s.name.split!==void 0?` on ${s.name.split}`:""}${s.name.sub_split!==void 0?`/${s.name.sub_split}`:""}${s.name.perturbation!==void 0?` with ${s.name.perturbation.name}`:" original"}`;return t[s.name.name]?e.jsxs("span",{title:t[s.name.name].description,children:[e.jsx("strong",{children:t[s.name.name].display_name||s.name.name}),r]}):e.jsxs("span",{children:[e.jsx("strong",{children:s.name.name}),r]})}function Ke(){return window.RELEASE!==void 0?window.RELEASE:void 0}async function Bt(s){try{return await(await fetch(P(`/releases/${Ke()}/runs_to_run_suites.json`),{signal:s})).json()}catch(t){return console.log(t),{}}}function Ft(s,t){return Ke()?s[t]:window.SUITE}const ee=10,se=50;function zt(){const{runName:s}=Pe(),[t,r]=_e(),[n,l]=a.useState(0),[o,i]=a.useState(),[m,x]=a.useState(),[N,S]=a.useState([]),[k,b]=a.useState([]),[w,f]=a.useState(),[v,R]=a.useState(),[u,y]=a.useState(1),[d,p]=a.useState(1),[c,h]=a.useState(1),[j,_]=a.useState(1),[A,T]=a.useState(),[C,$]=a.useState(),[Z,Q]=a.useState({}),[be,ps]=a.useState({}),[we,js]=a.useState("");if(a.useEffect(()=>{const g=new AbortController;async function O(){const U=g.signal;if(s===void 0)return()=>g.abort();const z=window.SUITE?window.SUITE:Ft(await Bt(U),s);x(z);const[ye,Ne,Se,bs,ws,ys]=await Promise.all([Je(U),Et(s,U,z),Rt(s,U,z),Lt(s,U,z),kt(s,U,z),Mt(s,U,z)]);i(ye.find(M=>M.name===s)),S(Ne);const Ee=Math.ceil(Ne.length/ee),Ns=Number(t.get("instancesPage")||1);p(Ee),y(Math.max(Math.min(Ns,Ee),1)),b(Se),$(bs);const Re=Math.floor(Se.length/se),Ss=Number(t.get("metricsPage")||1);_(Re),h(Math.max(Math.min(Ss,Re),1)),f(ws.reduce((M,L)=>(M[L.instance_id]===void 0&&(M[L.instance_id]=[]),M[L.instance_id].push(L),M),{})),R(ys.reduce((M,L)=>(M[L.instance_id]===void 0&&(M[L.instance_id]=[]),M[L.instance_id].push(L),M),{}));const ae=await H(U);ps(ae.metrics.reduce((M,L)=>(M[L.name]=L,M),{})),Q(ae.adapter.reduce((M,L)=>(M[L.name]=L,M),{})),T(ae.models.find(M=>{var L;return M.name===((L=ye.find(Es=>Es.name===s))==null?void 0:L.adapter_spec.model)}))}return O(),()=>g.abort()},[s,t]),o===void 0||w===void 0||v===void 0||C===void 0)return e.jsx(B,{});const gs=N.slice((u-1)*ee,(u-1)*ee+ee),vs=k.slice((c-1)*se,(c-1)*se+se);return e.jsxs(e.Fragment,{children:[e.jsx("div",{className:"flex justify-between gap-8 mb-12",children:e.jsxs("div",{children:[e.jsxs("h1",{className:"text-3xl flex items-center",children:[C.name,e.jsx("a",{href:"/#/groups/"+C.name,children:e.jsx(Xs,{className:"w-6 h-6 ml-2"})})]}),e.jsx("h3",{className:"text-xl",children:e.jsx(q,{value:C.description})}),e.jsx("h1",{className:"text-3xl mt-2",children:o.adapter_spec.model}),e.jsx("h3",{className:"text-xl",children:e.jsx(q,{value:(A==null?void 0:A.description)||""})}),e.jsx("div",{className:"mt-2 flex gap-2",children:C.tags.map(g=>e.jsx(X,{size:"xs",color:"gray",children:e.jsx("span",{className:"text text-md",children:g})}))})]})}),e.jsxs(K,{children:[e.jsxs("div",{className:"flex justify-between",children:[e.jsx("h3",{className:"text-lg mb-1",children:"Adapter Specification"}),e.jsxs("div",{className:"flex gap-2",children:[e.jsx(Oe,{className:"w-6 h-6 mr-1 text text-primary"}),e.jsx("a",{className:"link link-primary link-hover",href:At(o.name,m),download:"true",target:"_blank",children:"Spec JSON"}),e.jsx("a",{className:"link link-primary link-hover",href:Ct(o.name,m),download:"true",target:"_blank",children:"Full JSON"})]})]}),e.jsx("div",{children:e.jsx(ue,{className:"grid md:grid-cols-2 lg:grid-cols-3 gap-x-8",children:Object.entries(o.adapter_spec).map(([g,O],U)=>e.jsxs(he,{className:U<3?"!border-0":"",children:[e.jsx("strong",{className:"mr-1",title:Z[g]?Z[g].description:void 0,children:`${g}: `}),e.jsx("span",{className:"overflow-x-auto",children:O})]}))})})]}),e.jsx("div",{className:"mt-16 mb-8",children:e.jsxs(re,{children:[e.jsx(V,{size:"lg",active:n===0,onClick:()=>l(0),children:"Instances + Predictions"}),e.jsx(V,{size:"lg",active:n===1,onClick:()=>l(1),children:"All metrics"})]})}),n===0?e.jsxs(e.Fragment,{children:[e.jsx("div",{className:"grid gap-8",children:gs.map((g,O)=>e.jsx(Ht,{instance:g,requests:v[g.id],predictions:w[g.id],metricFieldMap:be},`${g.id}-${O}`))}),e.jsx(me,{className:"flex justify-center my-8",onNextPage:()=>{const g=Math.min(u+1,d);y(g),t.set("instancesPage",String(g)),r(t)},onPrevPage:()=>{const g=Math.max(u-1,1);y(g),t.set("instancesPage",String(g)),r(t)},currentPage:u,totalPages:d})]}):e.jsxs("div",{children:[e.jsx("div",{className:"flex justify-start my-4",children:e.jsx("input",{type:"text",className:"input input-bordered w-full max-w-xs",placeholder:"Search for a metric",onChange:g=>js(g.target.value)})}),e.jsx("div",{className:"overflow-x-auto",children:e.jsxs("table",{className:"table",children:[e.jsx("thead",{children:e.jsx("tr",{children:Object.keys(k[0]).map(g=>e.jsx("th",{children:g},g))})}),e.jsx("tbody",{children:vs.filter(g=>!we||g.name.name.toLowerCase().includes(we.toLowerCase())).map(g=>e.jsx("tr",{children:Object.entries(g).map(([O,U])=>O==="name"?e.jsx("td",{children:e.jsx(Ot,{stat:g,metricFieldMap:be})},O):e.jsx("td",{children:U}))}))})]})}),e.jsx(me,{className:"flex justify-center my-8",onNextPage:()=>{const g=Math.min(c+1,j);h(g),t.set("metricsPage",String(g)),r(t)},onPrevPage:()=>{const g=Math.max(c-1,1);h(g),t.set("metricsPage",String(g)),r(t)},currentPage:c,totalPages:j})]})]})}function Gt({groupsTables:s,activeGroup:t,sortable:r=!0,sortFirstMetric:n=!0}){const[l,o]=a.useState(n?1:void 0),[i,m]=a.useState({...s[t]}),[x,N]=a.useState(1);function S(d){return d.length>30?d.substring(0,27)+"...":d}const k=d=>d.value==="Model/adapter"?"Model":d.value.includes("-book")?S(d.value.replace("-book","")):S(d.value),[b,w]=a.useState(void 0);a.useEffect(()=>{const d=new AbortController;async function p(){const c=await H(d.signal);w(c)}return p(),()=>d.abort()},[]);const f=d=>{if(b){const p=b.models.find(c=>c.display_name===d);if(p){let c=p.description;return c.includes("/")&&(c=c.replace("/","_")),c}}return""},v=d=>{if(b){const p=b.models.find(c=>c.display_name===d);if(p){let c=p.name;return c.includes("/")&&(c=c.replace("/","_")),c}}return""};function R(d){const p=d.lastIndexOf(" - ");return p===-1?d:d.substring(0,p)+"*"+d.substring(p+1)}const u=d=>{const c=R(d).split("*")[0].trim();if(b){const h=b.run_groups.find(j=>j.display_name===c||j.short_display_name===c);if(h)return h.name}return""};a.useEffect(()=>{m({...s[t]})},[t,s]);const y=d=>{let p=x;l===d?p=p*-1:p=1,o(d),N(p),m(c=>{const h={...c};return h.rows.sort((j,_)=>{var C,$;const A=(C=j[d])==null?void 0:C.value,T=($=_[d])==null?void 0:$.value;return A!==void 0&&T===void 0?-1:T!==void 0&&A===void 0?1:typeof A=="number"&&typeof T=="number"?(A-T)*p:typeof A=="string"&&typeof T=="string"?p===1?A.localeCompare(T):T.localeCompare(A):0}),h})};return a.useEffect(()=>{n&&l&&y(l)},[n,l]),e.jsx(e.Fragment,{children:e.jsx("div",{children:e.jsx("div",{children:e.jsxs("table",{className:"rounded-lg shadow-md table",children:[e.jsx("thead",{children:e.jsx("tr",{children:i.header.map((d,p)=>e.jsx("th",{className:`${p===l?"bg-gray-100":"bg-white"} ${p===0?"left-0 z-10":""} ${d.description?"underline decoration-dashed decoration-gray-300 ":""} whitespace-nowrap px-4 sticky top-0`,title:d.description?d.description:"",children:e.jsxs("div",{className:"flex justify-between items-center min-w-48 w-48 max-w-48 text-wrap",children:[e.jsx("span",{className:"inline-block w-full break-words",children:k(d)}),r?e.jsx("button",{className:"link",onClick:()=>y(p),children:e.jsx(Fe,{className:"w-6 h-6"})}):null]})},`${t}-${p}`))})}),e.jsx("tbody",{children:i.rows.map((d,p)=>e.jsx("tr",{children:d.map((c,h)=>e.jsx("td",{className:`${h===0?"text-lg sticky left-0":""} ${p%2===0?"bg-gray-50":"bg-white"}`,children:h==1?e.jsx("div",{className:`${c&&c.style&&c.style["font-weight"]&&c.style["font-weight"]==="bold"?"font-bold":""}`,children:e.jsx(Y,{value:{...c,href:"/runs/?q="+v(String(d[0].value))},title:`Click value to see all predictions for: ${v(String(d[0].value))}`})}):e.jsx("div",{className:`${c&&c.style&&c.style["font-weight"]&&c.style["font-weight"]==="bold"?"font-bold":""} ${h===0?"underline decoration-dashed decoration-gray-300":""}`,children:e.jsx(Y,{value:{...c},title:String(d[0].value)===c.value?f(String(d[0].value)):`Click value to see predictions for ${u(k(i.header[h]))}: ${v(String(d[0].value))}`})})},`${t}-${h}`))},`${t}-${p}`))})]})})})})}function qt(){const[s,t]=a.useState([]),[r,n]=a.useState(),[l,o]=a.useState([]),[i,m]=a.useState(),[x,N]=a.useState(!0),[S,k]=a.useState(0);function b(f,v){console.log(f,v);const R=f.find(u=>u.title===v);return R??f[0]}function w(f,v){n(b(f,v))}return a.useEffect(()=>{const f=new AbortController;async function v(){const R=await fe(f.signal),u=[];if(R.forEach(c=>{c.rows.forEach(h=>{u.push({title:String(h[0].value),name:h[0].href.replace("?group=","")})})}),t(u),u.length===0)throw new Error("Could not find any groups!");const y=r?r.name:u[0].name,[d,p]=await Promise.all([pe(y,f.signal),je(f.signal)]);o(d),m(p[y]),N(!1)}return v(),()=>f.abort()},[r]),x||i===void 0?e.jsx(B,{}):l.length===0?e.jsxs(e.Fragment,{children:[e.jsx(I,{title:i.display_name,subtitle:i.description,markdown:!0}),e.jsx("div",{className:"divider"}),e.jsx("p",{className:"text-center mt-8",children:"Group currently has no results."})]}):e.jsx(e.Fragment,{children:e.jsxs(e.Fragment,{children:[e.jsxs("div",{className:"flex flex-row justify-between",children:[e.jsx(I,{title:"HELM Leaderboard",subtitle:"The HELM leaderboard shows how the various models perform across different scenarios and metrics.",markdown:!0}),e.jsxs("div",{className:"w-64 pt-8",children:[e.jsx("label",{htmlFor:"group",className:"block text-sm font-medium text-gray-700",children:"Select a group:"}),e.jsx("select",{id:"group",name:"group",value:r?r.title:s[0].title,onChange:f=>w(s,f.target.value),className:"mt-1 block w-full pl-3 pr-10 py-2 text-base border-gray-300 focus:outline-none focus:ring focus:border-blue-300 rounded-md",children:s.map((f,v)=>e.jsx("option",{value:f.title,children:f.title},v))})]})]}),e.jsx("div",{className:"overflow-x-auto",children:l.length>1?e.jsx(re,{children:l.map((f,v)=>e.jsx(V,{active:v===S,onClick:()=>k(v),children:f.title},v))}):null}),e.jsx(Gt,{groupsTables:l,activeGroup:S,ignoreHref:!0})]})})}const Jt=""+new URL("instruct-flowchart-48854f7c.svg",import.meta.url).href,Wt=""+new URL("instruct-graph-0a57d7d2.svg",import.meta.url).href;function Zt(){return e.jsxs("div",{className:"container mx-auto px-16",children:[e.jsx("h1",{className:"text-3xl mt-16 font-bold text-center",children:"HELM Instruct: A Multidimensional Instruction Following Evaluation Framework with Absolute Ratings"}),e.jsxs("div",{className:"flex flex-col sm:flex-row justify-center gap-2 sm:gap-8 md:gap-32 my-8",children:[e.jsx("a",{className:"px-10 btn rounded-md",href:"https://crfm.stanford.edu/2024/02/18/helm-instruct.html",children:"Blog Post"}),e.jsx("a",{className:"px-10 btn rounded-md",href:"https://github.com/stanford-crfm/helm",children:"Github"})]}),e.jsxs("p",{children:["We introduce ",e.jsx("em",{children:"HELM Instruct"}),", a multidimensional evaluation framework for instruction-following LLMs with absolute ratings. The framework takes an instruction, a model, an evaluator, and a criterion to generate a score. In our study, we use HELM Instruct to compare 4 instruction-following models on 7 scenarios based on 4 Human/LM evaluators and 5 criteria. Check out the blog post for more details."]}),e.jsxs("div",{className:"grid my-16 grid-cols-1 md:mx-32 md:grid-cols-2 md:gap-2",children:[e.jsx("img",{src:Jt,alt:"Evaluation flowchart",className:"mx-auto block",sizes:"100vw"}),e.jsx("img",{src:Wt,alt:"7 scenarios, 4 models, 4 evaluators and 5 criteria",className:"mx-auto block",sizes:"100vw"})]}),e.jsxs("table",{className:"rounded-lg shadow-md table",children:[e.jsx("thead",{children:e.jsxs("tr",{children:[e.jsx("th",{children:"Model"}),e.jsx("th",{children:"Average"}),e.jsx("th",{children:"Helpfulness"}),e.jsx("th",{children:"Understandability"}),e.jsx("th",{children:"Completeness"}),e.jsx("th",{children:"Conciseness"}),e.jsx("th",{children:"Harmlessness"})]})}),e.jsxs("tbody",{children:[e.jsxs("tr",{children:[e.jsx("td",{children:"openai_gpt-4-0314"}),e.jsx("td",{children:"4.63"}),e.jsx("td",{children:"4.42"}),e.jsx("td",{children:"4.85"}),e.jsx("td",{children:"4.50"}),e.jsx("td",{children:"4.42"}),e.jsx("td",{children:"4.95"})]}),e.jsxs("tr",{children:[e.jsx("td",{children:"openai_gpt-3.5-turbo-0613"}),e.jsx("td",{children:"4.60"}),e.jsx("td",{children:"4.34"}),e.jsx("td",{children:"4.86"}),e.jsx("td",{children:"4.42"}),e.jsx("td",{children:"4.41"}),e.jsx("td",{children:"4.97"})]}),e.jsxs("tr",{children:[e.jsx("td",{children:"anthropic_claude-v1.3"}),e.jsx("td",{children:"4.56"}),e.jsx("td",{children:"4.25"}),e.jsx("td",{children:"4.87"}),e.jsx("td",{children:"4.32"}),e.jsx("td",{children:"4.40"}),e.jsx("td",{children:"4.97"})]}),e.jsxs("tr",{children:[e.jsx("td",{children:"cohere_command-xlarge-beta"}),e.jsx("td",{children:"4.31"}),e.jsx("td",{children:"3.90"}),e.jsx("td",{children:"4.73"}),e.jsx("td",{children:"3.88"}),e.jsx("td",{children:"4.31"}),e.jsx("td",{children:"4.72"})]})]})]})]})}function ge({models:s}){return e.jsxs("section",{children:[e.jsxs("h3",{className:"text-3xl",children:[s.length," models"]}),e.jsx("ul",{children:s.map((t,r)=>t.todo?e.jsxs("li",{className:"text-slate-300 mt-1",children:[t.creator_organization," / ",t.display_name]},r):e.jsx(E,{to:"models",children:e.jsxs("li",{className:"text-black mt-1",children:[t.creator_organization," / ",t.display_name]},r)}))})]})}function ve({runGroups:s}){const t=new Map(s.filter(l=>l.metric_groups!==void 0&&(l.subgroups===void 0||l.subgroups.length===0)).map(l=>[l.name,l])),r=new Set,n=[];return s.forEach(l=>{const o=l.subgroups?l.subgroups:[],i=[];o.forEach(m=>{const x=t.get(m);x&&(i.push(x),r.add(x.name))}),i.length>0&&n.push([l,i])}),e.jsxs("section",{children:[e.jsxs("h3",{className:"text-3xl",children:[r.size," scenarios"]}),e.jsx("ul",{children:n.map(([l,o])=>e.jsxs("li",{className:"my-3",children:[e.jsx(E,{className:"text-black",to:"groups/"+l.name,children:e.jsx("h2",{children:l.display_name})}),e.jsx("ul",{className:"list-disc list-inside",children:o.map(i=>i.todo?e.jsx("li",{className:`${i.todo?"ml-4 text-slate-300":"ml-4"}`,children:i.display_name},i.name):e.jsx(E,{className:"text-black",to:"groups/"+i.name,children:e.jsx("li",{className:`${i.todo?"ml-4 text-slate-300":"ml-4"}`,children:i.display_name},i.name)}))})]},l.name))})]})}const Ve=""+new URL("helmhero-28e90f4d.png",import.meta.url).href;function Kt({groupsTables:s,activeGroup:t,sortFirstMetric:r=!0,filteredCols:n=[],modelsToFilter:l=[],numModelsToAutoFilter:o=0}){const[i,m]=a.useState(r?1:void 0),[x,N]=a.useState({...s[t]}),[S,k]=a.useState(1),[b,w]=a.useState(l);function f(c){return c.length>30?c.substring(0,27)+"...":c}const v=c=>c.value==="Model/adapter"?"Model":c.value.includes("-book")?f(c.value.replace("-book","")):f(c.value),[R,u]=a.useState(void 0);a.useEffect(()=>{const c=new AbortController;async function h(){const j=await H(c.signal);u(j)}return h(),()=>c.abort()},[]);const y=c=>{if(R){const h=R.models.find(j=>j.display_name===c);if(h){let j=h.description;return j.includes("/")&&(j=j.replace("/","_")),j}}return""},d=c=>{if(R){const h=R.models.find(j=>j.display_name===c);if(h){let j=h.name;return j.includes("/")&&(j=j.replace("/","_")),j}}return""};a.useEffect(()=>{if(N({...s[t]}),o){const _=s[0].rows.sort((A,T)=>Number(T[1].value)-Number(A[1].value)).slice(0,o).map(A=>String(A[0].value));w(_)}},[t,s,o]);const p=c=>{let h=S;i===c?h=h*-1:h=1,m(c),k(h),N(j=>{const _={...j};return _.rows.sort((A,T)=>{var Z,Q;const C=(Z=A[c])==null?void 0:Z.value,$=(Q=T[c])==null?void 0:Q.value;return C!==void 0&&$===void 0?-1:$!==void 0&&C===void 0?1:typeof C=="number"&&typeof $=="number"?(C-$)*h:typeof C=="string"&&typeof $=="string"?h===1?C.localeCompare($):$.localeCompare(C):0}),_})};return a.useEffect(()=>{r&&i&&p(i)},[r,i]),e.jsx(e.Fragment,{children:e.jsx("div",{className:"rounded-2xl overflow-hidden border-2 bg-white p-1 mx-2 my-0",style:{overflow:"auto",justifyContent:"space-between"},children:e.jsx("div",{className:"overflow-x-auto",children:e.jsxs("table",{className:"table w-full",children:[e.jsx("thead",{children:e.jsx("tr",{children:x.header.filter((c,h)=>n.length===0||n.includes(h)).map((c,h)=>e.jsx("th",{className:`${h===i?"bg-gray-100":""} ${c.description?"underline decoration-dashed":""} whitespace-nowrap px-4 `,title:c.description?c.description:"",children:e.jsx("div",{className:"flex gap-2 items-center",children:e.jsx("span",{children:v(c)})})},`${t}-${h}`))})}),e.jsx("tbody",{children:x.rows.filter(c=>b.includes(String(c[0].value))).map((c,h)=>e.jsx("tr",{className:`${h%2===0?"bg-gray-50":""}`,children:c.filter((j,_)=>n.length===0||n.includes(_)).map((j,_)=>e.jsx("td",{className:`${_===0?"text-lg":""}`,children:e.jsx("div",{className:j&&j.style&&j.style["font-weight"]&&j.style["font-weight"]==="bold"?"font-bold":"",children:_===0?e.jsx(Y,{value:{...j},title:y(String(c[0].value)),hideIcon:!0}):e.jsx(Y,{value:{...j,href:"/runs/?q="+d(String(c[0].value))},title:`Click value to see all predictions for: ${d(String(c[0].value))}`})})},`${t}-${_}`))},`${t}-${h}`))})]})})})})}function Ye({numModelsToAutoFilter:s=6}){const[t,r]=a.useState([]),[n,l]=a.useState([]),[o,i]=a.useState(),[m,x]=a.useState(!0),N=0;return console.log(t),a.useEffect(()=>{const S=new AbortController;async function k(){const b=await fe(S.signal),w=[];if(b.forEach(u=>{u.rows.forEach(y=>{w.push({title:String(y[0].value),name:y[0].href.replace("?group=","")})})}),r(w),w.length===0)throw new Error("Could not find any groups!");const f=w[0].name,[v,R]=await Promise.all([pe(f,S.signal),je(S.signal)]);l(v),i(R[f]),x(!1)}return k(),()=>S.abort()},[]),m||o===void 0?e.jsx(B,{}):n.length===0?e.jsxs(e.Fragment,{children:[e.jsx(I,{title:o.display_name,subtitle:o.description,markdown:!0}),e.jsx("div",{className:"divider"}),e.jsx("p",{className:"text-center mt-8",children:"Group currently has no results."})]}):e.jsx(e.Fragment,{children:e.jsx(e.Fragment,{children:e.jsx(Kt,{groupsTables:n,activeGroup:N,numModelsToAutoFilter:s,filteredCols:[0,1]})})})}function Vt(){return e.jsxs("div",{className:"flex flex-col px-4 sm:px-6 py-100 sm:py-10 sm:mb-96 md:mb-96 lg:mb-0 xl:mb-0 2xl:mb-0",children:[e.jsx("div",{className:"flex flex-col text-center mb-10 justify-start",children:e.jsx("h1",{className:"text-3xl sm:text-4xl mb-3 sm:mb-4 mx-2 mt-2",children:e.jsx("strong",{children:"A holistic framework for evaluating foundation models."})})}),e.jsxs("div",{className:"flex flex-col md:flex-col lg:flex-row lg:justify-center",style:{height:"525px",transform:"scale(0.9)"},children:[e.jsx("div",{className:"w-full lg:w-1/2 flex justify-center mb-4 lg:mb-0 h-full py-10",children:e.jsx("img",{src:Ve,alt:"HELM Hero",className:"object-cover h-full",style:{maxWidth:"100%"}})}),e.jsx("div",{className:"w-full lg:w-1/2 flex justify-center h-full py-10",children:e.jsxs("div",{className:"py-2 pb-6 rounded-3xl bg-gray-100 h-full",style:{maxWidth:"100%"},children:[e.jsx(Ye,{}),e.jsx("div",{className:"flex justify-end",children:e.jsx(E,{to:"leaderboard",children:e.jsx("button",{className:"px-4 mx-3 mt-1 btn bg-white rounded-md",children:e.jsx("span",{children:"See More"})})})})]})})]})]})}const Xe=""+new URL("ai21-0eb91ec3.png",import.meta.url).href,Qe=""+new URL("aleph-alpha-7ce10034.png",import.meta.url).href,es=""+new URL("anthropic-70d8bc39.png",import.meta.url).href,ss=""+new URL("bigscience-7f0400c0.png",import.meta.url).href,ts=""+new URL("cohere-3550c6cb.png",import.meta.url).href,ns=""+new URL("eleutherai-b9451114.png",import.meta.url).href,rs=""+new URL("google-06d997ad.png",import.meta.url).href,as=""+new URL("meta-5580e9f1.png",import.meta.url).href,ls=""+new URL("microsoft-f5ee5016.png",import.meta.url).href,is=""+new URL("mistral-18e1be23.png",import.meta.url).href,os=""+new URL("nvidia-86fa75c1.png",import.meta.url).href,cs=""+new URL("openai-3f8653e4.png",import.meta.url).href,ds=""+new URL("tii-24de195c.png",import.meta.url).href,ms=""+new URL("together-a665a35b.png",import.meta.url).href,us=""+new URL("tsinghua-keg-97d4b395.png",import.meta.url).href,hs="data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAASYAAABfCAYAAABFnmpnAAAACXBIWXMAAAsTAAALEwEAmpwYAAAAAXNSR0IArs4c6QAAAARnQU1BAACxjwv8YQUAAAlhSURBVHgB7d3xddtGEgbw7+np/8gVeFLBqYOwg/gqOKaC+CqIU0GcCqxUcOnAvAruUoEnFZxcwUUjEDYMA8QCmN0dgN/vPUggAHFpejGcXSyWQON/T8v/HZa3SOdR3vvEsg5O5dnyZuD5xfH5lywfzu/FL0/L8fx61njj+Lr63js9d87lPYZFfb1DDpVf0wd8rpOvMLNO3p5/Pz4td1jvm8TjPMoygvIU8ch5OXS2nZ6Wn8+/iUoTfK6Tr8/bHtDUSZ3645vz78kDE71IPM4rMKUS+FFswwHNJ9Y71AngRH1HNJmUZVEXY0AbmP6Ej5eJxwl8iPNxKR6xLUc0AUpAFINlUP/BhTrpnTGlZkKeGVNqMPSi2B4BgxPFIrhQJ70DkyQet+Wm3NYyppagqQil33uiMfK0/GtoRxuYPE+2lIov8PNtwjFeWdV/sW2Cpn1PFMU9Bq50e2dMJuXKnOentqCcrWZLXUd8efWOqLYf0TuPc2RMJTOYVAIfin34CURxWKLyurshR8YkCceUzphSjknhdfWytgPY10Sx/AOdOnnT2aHwced0TKqp7MuzrD005VpHEMVh5+mr9kE3MHmddKU7v6d4BibFfnwPoli+a1e6gekP+EjpPyrZlJvaP8eeMqZ7EMVyaFdyZExTt6UIfAnK2fpwgS77cGBwokgE56QlRx/TVMYk8DWVfQn87CljMgxMFI3Yj1p9TJ5KBSbF/jAwUTTPdbLGVTmBv0tZmlegVOxP6jQ1RKU8B6bbzgaFj7vzMpaBCcryCkwfUc7v+Pr9a/uEBH6YMVGqUnXy+cMyR2Bqn3wsMOVo6tlo87HBj16jzBXl/IrxCd4e0AxG81BjkOU/Z5R7hN+/9Tc0712KEn2JD0/Lv7EdrzF8jgmaG3G9PuTEftz2NnrNZGlX5sYCRY6TQS7s21tTziqIjUHy+HfVCExzrmwe4EcRazZPC0oP2D59Wn5AM7+SB7EfNwOFuD35iBz3yd0t3DeHIgb78Jhzcl9SIzDR/lh9VDjKFZhKBIrU5xT4iDRUwGswrCl9QzXtk1ez9KtxTMarg1cW7ltq7OTyDIIfEIeCKBbX4UZ7yZjGRpsL/ESaWSBS9kZkXOtkrsA0Nj4mV59G7vIURFRMPzB5RT2ZuT1XeXsNTKUn9iMq6WU/MHld7ZGR7bkypruZ2+cqObgyBZtyFI3CUa6MaSwgCPLIXZ4iFl7mp2gEfj4OBSaP4JQ7gxnyMmN5ilj2erWRyDzeDG2Ej5yBYsiLxNewhCIWZkwUjWudHApMCh9DgUKQjwxs83qzovXpCPxE6z+jbfobfDyfa0OByWu8jgxsyznK+C5x2xLRmjtelcCwI5083MOH2o+cTbkIE8YJfEQaXCnwu7nV6yosXbcDnFsnOZtyOQNFankeb1a0jMLzyyqZLZEHr6lpzPN9oDkzJknc5qXfTPSK4JGyCgtKR/jxvBmYro+dY7/At04+n2+3Yzsc9G8Tyd2063e2C3zU6By2ibeGZgv0fg9PIEpzGtiWo06OBiavjKkfKHIHplwZk6K8HP/hQ06gWgQ+fYV2IpdokgvyU1wITAof/UAhyOtu4vFSe+2HOYF9TDX9BJ/+QsF+/h9P7crNyAGK9XIFitTyBD4U+/QbiGL5VCfHApNHBJaJx976zZ8tN+VyU+xjvmnaD2vCndoHY4HJ62pNjkBxSbfDXeBjj82dn0EUy9+7D3JmTKYbKEoEphcjZa+xt0GI9tVQDyCKwz4otbshZx+T6U5CVmLSe+ms73Fw5VrWhn8NojgsKL3pb8ydMUlnvUTG5N10VOyHZUpHEMVgMca+/PTN0M7bkT9S+OgGB0F+MrK+1B4yphOaT6UTiGI4ofmSTB07oGRgYsZU3gMG2u9EFVmW9Dsm6mTuply3X6nkVTmBj0izCixxRDNlyzuUyViJpti9dZN18lJg8ghO7VUyQRlteRzD9KUjmu+WfwWiGI5Py3uMzON0c+EPPQJTmzEJymjLY2D6mr0ndnPwPYhiEDTBSfo7LgUmj/E7awLEksDYlifwscfBlRacSjSriVJYXXzX33gpMHlM97EmUCwZfS69ctdS7I+AY5kolgN6My1cCkyK9dr71wTzLc3Y2jLX8upnW8KuWjyguayaY+T59yCa5wGf66TC3xczLdxeOFDhw66ULQkUel4E83wDn6acoh7LaLpXBAXNQDSvKUzvz0uOoEf79EPvscC3Th7QxInROb9bXtnCt1gemJa8BivP4/aXSP1LiuYqxq/wcwDRcoqmTp7g59CulMiYLCgtCRQWGKyfae5VpD33L719Wn6ED8+vgKL5TvCpY7W/F9AG8B7gw85168YoFpiWBIulfTxL+7T6Ig6uVCxr3g7hsIG67IbqB2yfZ3fApwRmqinnNWGcYD7FsuAo2PftKF5NTA4ZIA9WHxU+pF25wXSha1kUnHsSPGJdxsTANE1AFIu0K1OBSbHe0mbc0vJdv0OdiMqbCkwe/SwHzKe937nLu/QaiKiwEk25JRnTR8fyl2LGRFRJiabcEnr+XWv0NQceElVUImNa4nFkvUb5RFTYVGCqlTnoyHqN8omosKgZk3bWawx03PrMlUSbFrWP6bHya2BTjqiiqcBkFOVpZ52d30TX4dO5nhKYanc+M3vJg+8rRTMrMCnK0t7jGtnLNWRMDEwUVkpgKt0RrL3HHC5AdGUiNuX688soylIQUVVbaMqNbStZPhEVFDEwPSZuy6X2jIBEV48Z09c4VICosoh9TDqwrWQWw45vospSA1PJk3WoLEU5CiKq6jbxOAsWHtPVptDEbbkwY6IS7EtHBX7eYkd1NzUwWb+LoIzand8fQJTfq/Pi5R12FJhSmnKmVB+PztyeA2cWIKosNTApytCZ20uVT0QFRQtMY5lZqRRVQUTVpQamCIHh0j4vHFxJFEC0jOlx4T4vCiKqbksZ0x+oWz4RFbKVzm/DjInoSqQGJqPI7zFw+URUyJzAVDtjKVE+B1cSBTAnMJXo46mdMV3T4EpmhxSNtiuRMiZduX+taztRGZgorEh9TDqxP/eJxHmYiIKI1sc0VX7O18DBlURBzAlMuTMKTTgmZ2BSEFEIkTKmlI7nnMGRfS5EQWypj8nkbG6xj4koiNSJ4lon5JswThOOseDxHfJYmzEpyvjTqayp+xIV6ynW8Xod7XOtpdgOz/duisLHp8TjLzZXn0YvVQJMAAAAAElFTkSuQmCC",xs=""+new URL("yandex-38e09d70.png",import.meta.url).href,fs=""+new URL("01-694cb9b7.png",import.meta.url).href,Yt=[Xe,Qe,es,ss,ts,ns,rs,as,ls,is,os,cs,ds,ms,us,hs,xs,fs];function Ce(){const[s,t]=a.useState(void 0);return a.useEffect(()=>{const r=new AbortController;async function n(){const l=await H(r.signal);t(l)}return n(),()=>r.abort()},[]),s?e.jsxs(e.Fragment,{children:[e.jsx(Vt,{}),e.jsxs("div",{className:"mx-auto text-lg px-16",children:[e.jsxs("div",{className:"container mb-12 mx-auto text-lg px-16",children:[e.jsxs("div",{className:"flex flex-col sm:flex-row justify-center mt-10 mb-10 flex gap-2 sm:gap-8 md:gap-32",children:[" ",e.jsx("h1",{className:"text-4xl mx-4 mt-40",children:e.jsx("strong",{children:"Our Partners"})})]}),e.jsx("ol",{className:"my-8 flex flex-col gap-32",children:e.jsx("li",{children:e.jsx("div",{className:"flex flex-wrap justify-center max-w-[1100px] mx-auto w-auto",children:Yt.map((r,n)=>e.jsx("div",{className:"w-24 h-24 flex items-center m-6",children:e.jsx("img",{src:r,alt:"Logo",className:"mx-auto block",sizes:"100vw"})},n))})})})]}),e.jsx("div",{className:"container mx-auto",children:e.jsxs("div",{className:"grid grid-cols-1 sm:grid-cols-2 gap-8",children:[e.jsx(ge,{models:s.models}),e.jsx(ve,{runGroups:s.run_groups})]})})]})]}):null}function Xt(){return e.jsxs("div",{className:"container mx-auto px-16",children:[e.jsx("h1",{className:"text-3xl mt-16 my-8 font-bold text-center",children:"Massive Multitask Language Understanding (MMLU) on HELM"}),e.jsxs("div",{className:"flex flex-row md:flex-row items-center gap-8",children:[e.jsxs("div",{className:"flex-1 text-xl",children:[e.jsxs("p",{children:[e.jsx("strong",{children:"Massive Multitask Language Understanding (MMLU)"})," ",e.jsx("a",{href:"https://arxiv.org/pdf/2009.03300.pdf",className:"link",children:"(Hendrycks et al, 2020)"})," ","is a multiple-choice question answering test that covers 57 tasks including elementary mathematics, US history, computer science, law, and more. We publish evaluation results from evaluating various models on MMLU using HELM. Our evaluation results include the following:"]}),e.jsxs("ul",{className:"my-2 list-disc list-inside",children:[e.jsx("li",{children:"Simple, standardized prompts"}),e.jsx("li",{children:"Accuracy breakdown for each of the 57 subjects"}),e.jsx("li",{children:"Full transparency of all raw prompts and predictions"})]}),e.jsx("div",{className:"flex flex-row justify-center mt-8",children:e.jsx("a",{className:"px-10 btn rounded-md",href:"#/leaderboard",children:"Full Leaderboard"})})]}),e.jsx("div",{className:"flex-1",children:e.jsx(Ye,{numModelsToAutoFilter:10})})]})]})}const Qt=""+new URL("heim-logo-3e5e3aa4.png",import.meta.url).href;function en({metricFieldMap:s,metricGroups:t}){const r=new Set,n=[];return t.forEach(l=>{const o=[];l.metrics.forEach(i=>{const m=s[i.name];m&&(o.push(m),r.add(m.name))}),o.length>0&&n.push([l,o])}),e.jsxs("section",{children:[e.jsxs("h3",{className:"text-3xl",children:[r.size," metrics"]}),e.jsx("ul",{children:n.map(([l,o])=>e.jsxs("li",{className:"my-3",children:[e.jsx("h4",{children:l.display_name}),e.jsx("ul",{className:"list-disc list-inside",children:o.map(i=>e.jsx("li",{className:"ml-4",children:i.display_name},i.name))})]},l.name))})]})}function sn(){const[s,t]=a.useState(void 0);a.useEffect(()=>{const n=new AbortController;async function l(){const o=await H(n.signal);t(o)}return l(),()=>n.abort()},[]);const r=s?s.metrics.reduce((n,l)=>(n[l.name]=l,n),{}):void 0;return e.jsxs("div",{className:"container mx-auto px-16 text-base",children:[e.jsx("div",{className:"container max-w-screen-lg mx-auto",children:e.jsx("img",{className:"mx-auto w-96",src:Qt,alt:"HEIM Logo"})}),e.jsx("h1",{className:"text-3xl my-8 font-bold text-center",children:"Holistic Evaluation of Text-To-Image Models"}),e.jsxs("div",{className:"flex flex-col sm:flex-row justify-center gap-2 sm:gap-8 md:gap-32 my-8",children:[e.jsx("a",{className:"px-10 btn rounded-md",href:"https://arxiv.org/abs/2311.04287",children:"Paper"}),e.jsx("a",{className:"px-10 btn rounded-md",href:"https://github.com/stanford-crfm/helm",children:"Github"})]}),e.jsxs("p",{className:"my-2",children:["Significant effort has recently been made in developing text-to-image generation models, which take textual prompts as input and generate images. As these models are widely used in real-world applications, there is an urgent need to comprehensively understand their capabilities and risks. However, existing evaluations primarily focus on image-text alignment and image quality. To address this limitation, we introduce a new benchmark,"," ",e.jsx("strong",{children:"Holistic Evaluation of Text-To-Image Models (HEIM)"}),"."]}),e.jsx("p",{className:"my-2",children:"We identify 12 different aspects that are important in real-world model deployment, including:"}),e.jsxs("ul",{className:"my-2 list-disc list-inside unreset",children:[e.jsx("li",{children:"image-text alignment"}),e.jsx("li",{children:"image quality"}),e.jsx("li",{children:"aesthetics"}),e.jsx("li",{children:"originality"}),e.jsx("li",{children:"reasoning"}),e.jsx("li",{children:"knowledge"}),e.jsx("li",{children:"bias"}),e.jsx("li",{children:"toxicity"}),e.jsx("li",{children:"fairness"}),e.jsx("li",{children:"robustness"}),e.jsx("li",{children:"multilinguality"}),e.jsx("li",{children:"efficiency"})]}),e.jsx("p",{className:"my-2",children:"By curating scenarios encompassing these aspects, we evaluate state-of-the-art text-to-image models using this benchmark. Unlike previous evaluations that focused on alignment and quality, HEIM significantly improves coverage by evaluating all models across all aspects. Our results reveal that no single model excels in all aspects, with different models demonstrating strengths in different aspects."}),e.jsx("p",{className:"my-2",children:"For full transparency, this website contains all the prompts, generated images and the results for the automated and human evaluation metrics."}),e.jsx("p",{className:"my-2",children:"Inspired by HELM, we decompose the model evaluation into four key components: aspect, scenario, adaptation, and metric:"}),e.jsx("div",{className:"container max-w-screen-lg mx-auto my-8",children:e.jsx("img",{src:"https://crfm.stanford.edu/heim/latest/images/heim-main.png",alt:"HEIM scenarios, prompts, images and metrics"})}),s&&r?e.jsxs("div",{className:"grid grid-cols-1 md:grid-cols-3 gap-8",children:[e.jsx(ge,{models:s.models}),e.jsx(ve,{runGroups:s.run_groups}),e.jsx(en,{metricFieldMap:r,metricGroups:s.metric_groups})]}):null]})}const tn=""+new URL("vhelm-framework-cde7618a.png",import.meta.url).href,nn=""+new URL("vhelm-model-6d812526.png",import.meta.url).href;function rn(){const[s,t]=a.useState(void 0);return a.useEffect(()=>{const r=new AbortController;async function n(){const l=await H(r.signal);t(l)}return n(),()=>r.abort()},[]),e.jsxs("div",{className:"container mx-auto px-16",children:[e.jsx("h1",{className:"text-3xl mt-16 my-8 font-bold text-center",children:"The First Steps to Holistic Evaluation of Vision-Language Models"}),e.jsxs("p",{className:"my-4",children:["To better understand VLMs, we introduce the first version of"," ",e.jsx("em",{children:"Holistic Evaluation of Vision-Language Models (VHELM)"})," by extending the ",e.jsx("a",{href:"https://arxiv.org/abs/2211.09110",children:"HELM"})," ","framework with the necessary adaptation methods to assess the performance of 6 prominent VLMs on 3 standard VLM benchmarks."]}),e.jsx("p",{className:"my-4 font-bold",children:"This is ongoing work to achieve holistic evaluation for vision-language models, so please stay tuned!"}),e.jsx("img",{src:tn,alt:"An image of a helm and the text 'This helm is a' is sent to a Vision-Language Model, which produces the text 'wheel for steering a ship...'",className:"mx-auto lg:max-w-3xl block my-8"}),e.jsx("img",{src:nn,alt:"An example of an evaluation for an Aspect (Knowledge) - a Scenario (MMMU) undergoes Adaptation (multimodal multiple choice) for a Model (GPT-4 Vision), then Metrics (Exact match) are computed",className:"mx-auto lg:max-w-3xl block my-8"}),s===void 0?null:e.jsxs("div",{className:"grid grid-cols-1 sm:grid-cols-2 gap-8",children:[e.jsx(ge,{models:s.models}),e.jsx(ve,{runGroups:s.run_groups})]})]})}const an=({id:s,title:t,text:r})=>(t.includes("HE")||(t="HELM "+t),e.jsx("div",{className:"max-w-sm rounded overflow-hidden bg-gray-100 hover:scale-105 transition-transform duration-300",children:e.jsx("a",{href:xe(void 0,s),children:e.jsxs("div",{className:"px-6 py-4",children:[e.jsxs("div",{className:"font-bold text-xl mb-2",children:[e.jsx("div",{className:"py-3",children:e.jsx("svg",{fill:"#000000",width:"20px",height:"20px",viewBox:"0 0 24 24",xmlns:"http://www.w3.org/2000/svg",children:e.jsx("path",{d:"M22,7H16.333V4a1,1,0,0,0-1-1H8.667a1,1,0,0,0-1,1v7H2a1,1,0,0,0-1,1v8a1,1,0,0,0,1,1H22a1,1,0,0,0,1-1V8A1,1,0,0,0,22,7ZM7.667,19H3V13H7.667Zm6.666,0H9.667V5h4.666ZM21,19H16.333V9H21Z"})})}),t+" →"]}),e.jsx("p",{className:"text-gray-700 text-base",children:r})]})})}));function ln(){const[s,t]=a.useState();return a.useEffect(()=>{fetch("https://raw.githubusercontent.com/stanford-crfm/helm/main/helm-frontend/project_metadata.json").then(r=>r.json()).then(r=>{t(r)}).catch(r=>{console.error("Error fetching JSON:",r)})},[]),e.jsx("div",{className:"p-10 mb-20",children:e.jsx("div",{className:"grid grid-cols-3 gap-4",children:s&&s.map((r,n)=>r.id==="home"?null:e.jsx(an,{id:r.id,title:r.title,text:r.description},n))})})}function on(){return e.jsxs("div",{className:"flex flex-col md:flex-row px-6 py-36",children:[e.jsxs("div",{className:"flex-1 p-4 flex flex-col justify-center",children:[e.jsx("div",{className:"flex justify-start",children:e.jsxs("div",{children:[e.jsx("h1",{className:"text-4xl mb-4 mx-4 mt-2",children:e.jsx("strong",{children:"A reproducible and transparent framework for evaluating foundation models."})}),e.jsx("h3",{className:`text-xl
|
|
10
|
+
mb-4 mx-4 mt-2`,children:"Find leaderboards with many scenarios, metrics, and models with support for multimodality and model-graded evaluation."})]})}),e.jsxs("div",{className:"flex flex-col md:flex-row justify-start mt-6 ml-4",children:[e.jsx("button",{className:"px-6 btn btn-grey rounded-md mb-4 md:mb-0",onClick:()=>window.scrollTo({top:760,behavior:"smooth"}),children:e.jsx("div",{children:"Leaderboards ↓"})}),e.jsx("button",{className:"px-6 btn btn-grey rounded-md md:ml-4",children:e.jsx("a",{href:"https://github.com/stanford-crfm/helm",children:"Github"})})]})]}),e.jsx("div",{className:"mx-4 mt-6 md:mt-0 md:w-1/3",children:e.jsx("img",{src:Ve,alt:"HELM Hero",className:"object-cover w-full h-full"})})]})}const cn=[Xe,Qe,es,ss,ts,ns,rs,as,ls,is,os,cs,ds,ms,us,hs,xs,fs];function dn(){const[s,t]=a.useState(void 0);return a.useEffect(()=>{const r=new AbortController;async function n(){const l=await H(r.signal);t(l)}return n(),()=>r.abort()},[]),s?e.jsxs(e.Fragment,{children:[e.jsx(on,{}),e.jsxs("div",{className:"container mt-30 mx-auto text-lg",children:[e.jsx("div",{className:"flex flex-col sm:flex-row justify-center mb-10 flex sm:gap-8 md:gap-32",children:e.jsx("h1",{className:"text-4xl mx-4 ",children:e.jsx("strong",{children:"HELM Leaderboards"})})}),e.jsx("div",{className:"flex flex-col sm:flex-row flex sm:gap-8 md:gap-32",children:e.jsx("body",{children:"HELM leaderboards leverage the HELM framework and target particular domains and/or capabilities. Leaderboards range from real world applications and specific domains to ones focused on multimodal capabilities and model-evaluations."})})]}),e.jsx(ln,{}),e.jsx("div",{className:"mx-auto text-lg px-16",children:e.jsxs("div",{className:"container mb-12 mx-auto text-lg px-16",children:[e.jsxs("div",{className:"flex flex-col sm:flex-row justify-center mt-10 mb-10 flex gap-2 sm:gap-8 md:gap-32",children:[" ",e.jsx("h1",{className:"text-4xl mx-4 mt-40",children:e.jsx("strong",{children:"Our Partners"})})]}),e.jsx("ol",{className:"my-8 flex flex-col gap-32",children:e.jsx("li",{children:e.jsx("div",{className:"flex flex-wrap justify-center max-w-[1100px] mx-auto w-auto",children:cn.map((r,n)=>e.jsx("div",{className:"w-24 h-24 flex items-center m-6",children:e.jsx("img",{src:r,alt:"Logo",className:"mx-auto block",sizes:"100vw"})},n))})})})]})})]}):null}function mn(){return window.PROJECT_ID==="lite"?e.jsx(Ce,{}):window.PROJECT_ID==="instruct"?e.jsx(Zt,{}):window.PROJECT_ID==="heim"?e.jsx(sn,{}):window.PROJECT_ID==="mmlu"?e.jsx(Xt,{}):window.PROJECT_ID==="vhelm"?e.jsx(rn,{}):window.PROJECT_ID==="home"?e.jsx(dn,{}):e.jsx(Ce,{})}function un(){return e.jsx(Ls,{children:e.jsx(As,{children:e.jsxs(D,{path:"/",element:e.jsx(ht,{}),children:[e.jsx(D,{index:!0,element:e.jsx(mn,{})}),e.jsx(D,{path:"leaderboard",element:e.jsx(qt,{})}),e.jsx(D,{path:"models",element:e.jsx(gt,{})}),e.jsx(D,{path:"scenarios",element:e.jsx(vt,{})}),e.jsx(D,{path:"groups",element:e.jsx(yt,{})}),e.jsx(D,{path:"groups/:groupName",element:e.jsx(Nt,{})}),e.jsx(D,{path:"runs",element:e.jsx(St,{})}),e.jsx(D,{path:"runs/:runName",element:e.jsx(zt,{})})]})})})}de.createRoot(document.getElementById("root")).render(e.jsx(Cs.StrictMode,{children:e.jsx(un,{})}));
|